0d491f5056087fa1fe6b484adcad0b5843e797b2
[platform/kernel/linux-rpi.git] / net / smc / af_smc.c
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - IPv6 support postponed
11  *    - support for alternate links postponed
12  *    - partial support for non-blocking sockets only
13  *    - support for urgent data postponed
14  *
15  *  Copyright IBM Corp. 2016
16  *
17  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
18  *              based on prototype from Frank Blaschka
19  */
20
21 #define KMSG_COMPONENT "smc"
22 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
23
24 #include <linux/module.h>
25 #include <linux/socket.h>
26 #include <linux/workqueue.h>
27 #include <linux/in.h>
28 #include <linux/sched/signal.h>
29
30 #include <net/sock.h>
31 #include <net/tcp.h>
32 #include <net/smc.h>
33
34 #include "smc.h"
35 #include "smc_clc.h"
36 #include "smc_llc.h"
37 #include "smc_cdc.h"
38 #include "smc_core.h"
39 #include "smc_ib.h"
40 #include "smc_pnet.h"
41 #include "smc_tx.h"
42 #include "smc_rx.h"
43 #include "smc_close.h"
44
/* serializes creation of new link groups so that concurrent connects
 * can reuse a link group that is just being set up
 */
static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

/* global list of all established link groups, guarded by .lock */
struct smc_lgr_list smc_lgr_list = {		/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};
53
54 static void smc_tcp_listen_work(struct work_struct *);
55
56 static void smc_set_keepalive(struct sock *sk, int val)
57 {
58         struct smc_sock *smc = smc_sk(sk);
59
60         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
61 }
62
/* hash table holding all AF_INET SMC sockets, guarded by .lock */
static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};
66
67 int smc_hash_sk(struct sock *sk)
68 {
69         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
70         struct hlist_head *head;
71
72         head = &h->ht;
73
74         write_lock_bh(&h->lock);
75         sk_add_node(sk, head);
76         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
77         write_unlock_bh(&h->lock);
78
79         return 0;
80 }
81 EXPORT_SYMBOL_GPL(smc_hash_sk);
82
83 void smc_unhash_sk(struct sock *sk)
84 {
85         struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
86
87         write_lock_bh(&h->lock);
88         if (sk_del_node_init(sk))
89                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
90         write_unlock_bh(&h->lock);
91 }
92 EXPORT_SYMBOL_GPL(smc_unhash_sk);
93
/* protocol definition for AF_SMC sockets of the AF_INET address type */
struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	/* socks may be freed and reused via RCU; see smc_sock_alloc() */
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);
105
/* release() handler for AF_SMC sockets: terminate the SMC connection
 * (or, for fallback sockets, only the internal TCP socket), detach the
 * sock from its struct socket and drop the final reference.
 * Returns 0, or the error code of smc_close_active().
 */
static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		/* fallback socks never go through smc_close_active(),
		 * so the "passive closing" reference is dropped here
		 */
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}
151
152 static void smc_destruct(struct sock *sk)
153 {
154         if (sk->sk_state != SMC_CLOSED)
155                 return;
156         if (!sock_flag(sk, SOCK_DEAD))
157                 return;
158
159         sk_refcnt_debug_dec(sk);
160 }
161
/* Allocate and initialize a new SMC sock in SMC_INIT state and attach
 * it to @sock (may be NULL for socks created on behalf of a listener).
 * Returns the new sock (refcount 1, already hashed) or NULL on
 * allocation failure.
 */
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
{
	struct smc_sock *smc;
	struct sock *sk;

	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = SMCPROTO_SMC;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}
184
/* bind() handler for AF_SMC sockets: after replicating the inet_bind()
 * address checks, the bind is delegated to the internal CLC (TCP)
 * socket. Returns 0 on success or a negative error code.
 */
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if ((addr->sin_family != AF_INET) &&
	    ((addr->sin_family != AF_UNSPEC) ||
	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	/* forward SO_REUSEADDR and bind on the internal TCP socket */
	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}
222
/* Copy socket settings from @osk to @nsk that are not controlled via
 * setsockopt; of the sk_flags only the bits selected by @mask are
 * transferred (bits outside @mask keep @nsk's value... cleared first,
 * then OR-ed in from @osk).
 */
static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control via setsockopt for */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}
241
/* sk_flags that must be mirrored on the internal CLC (TCP) socket */
#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}
263
/* sk_flags the SMC socket inherits back from the CLC (TCP) socket */
#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
273
/* Client side of the CONFIRM LINK handshake done for the first
 * connection over a new link: wait for the server's CONFIRM LINK
 * request, bring the QP to RTS, register the connection's rmb and
 * send the CONFIRM LINK response.
 * Returns 0 on success, an SMC_CLC_DECL_* reason code on failure, or
 * the result of waiting for the peer's decline message on timeout.
 */
static int smc_clnt_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		/* timeout or interrupt: the peer is expected to decline */
		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_rc)
		return SMC_CLC_DECL_RMBE_EC;

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	return 0;
}
318
/* store the peer's connection parameters from a received CLC
 * accept/confirm message in the SMC connection
 */
static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}
327
/* store the peer's link parameters (QP number, GID, MAC, PSN, QP MTU)
 * from a received CLC accept/confirm message in the SMC link
 */
static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}
337
/* Unlink a link group from the global list so it is no longer picked
 * for new connections, e.g. after a failed first-contact handshake.
 */
static void smc_lgr_forget(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);
}
346
/* setup for RDMA connection of client:
 * run the CLC handshake with the peer, create or reuse a link group
 * and connection, register buffers, and (for first contact) confirm
 * the new link. Falls back to plain TCP if the peer or the local
 * configuration does not support SMC.
 * Returns a negative error code, or on success the local_contact
 * value (>= 0).
 */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
		/* peer has not signalled SMC-capability */
		smc->use_fallback = true;
		goto out_connected;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	/* hold smc_create_lgr_pending until the link group is usable */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, smcibdev, ibport, &aclc.lcl,
					srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(smc);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	if (smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return rc;
}
507
/* connect() handler for AF_SMC sockets: connect the internal CLC (TCP)
 * socket first, then try to establish the SMC/RDMA connection; on any
 * SMC-specific failure the socket transparently falls back to TCP.
 * Returns 0 on success (including fallback) or a negative error code.
 */
static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET)
		goto out_err;

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	/* announce SMC capability in the SYN of the internal TCP socket */
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}
553
/* Accept one connection on the internal CLC (TCP) listen socket of
 * @lsmc and wrap it in a freshly allocated SMC sock (*new_smc).
 * Called with the listen sock lock held; the lock is dropped around
 * the blocking kernel_accept() and re-acquired afterwards.
 * Returns 0 on success or a negative error code (*new_smc is then
 * NULL).
 */
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if  (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		/* accept failed or the listener was closed meanwhile:
		 * dispose of the pre-allocated child sock
		 */
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}
591
592 /* add a just created sock to the accept queue of the listen sock as
593  * candidate for a following socket accept call from user space
594  */
595 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
596 {
597         struct smc_sock *par = smc_sk(parent);
598
599         sock_hold(sk); /* sock_put in smc_accept_unlink () */
600         spin_lock(&par->accept_q_lock);
601         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
602         spin_unlock(&par->accept_q_lock);
603         sk_acceptq_added(parent);
604 }
605
606 /* remove a socket from the accept queue of its parental listening socket */
607 static void smc_accept_unlink(struct sock *sk)
608 {
609         struct smc_sock *par = smc_sk(sk)->listen_smc;
610
611         spin_lock(&par->accept_q_lock);
612         list_del_init(&smc_sk(sk)->accept_q);
613         spin_unlock(&par->accept_q_lock);
614         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
615         sock_put(sk); /* sock_hold in smc_accept_enqueue */
616 }
617
/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space.
 * Children that were already closed while queued are disposed of and
 * skipped. Returns the dequeued sock or NULL if the queue is empty.
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			/* child closed while waiting: free and skip it */
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}
646
/* clean up for a created but never accepted sock: close it actively
 * (waiting for the peer if lingering is configured), release the
 * internal CLC socket and drop the final reference.
 */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		/* clear the pointer before releasing, so concurrent
		 * readers never see a stale clcsock
		 */
		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}
679
/* Server side of the CONFIRM LINK handshake done for the first
 * connection over a new link: register the connection's rmb, send the
 * CONFIRM LINK request and wait for the client's response.
 * Returns 0 on success, an SMC_CLC_DECL_* reason code on failure, or
 * the result of waiting for the peer's decline message on timeout.
 */
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		struct smc_clc_msg_decline dclc;

		/* timeout or interrupt: the peer is expected to decline */
		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	if (link->llc_confirm_resp_rc)
		return SMC_CLC_DECL_RMBE_EC;

	return 0;
}
719
/* setup for RDMA connection of server:
 * worker that runs the server-side CLC handshake for one accepted
 * child socket, creates or reuses a link group and connection, and
 * finally enqueues the child on the listener's accept queue (with
 * fallback to plain TCP or SMC_CLOSED state on failure).
 */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *smcibdev;
	u8 buf[SMC_CLC_MAX_LEN];
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0;
	__be32 subnet;
	u8 prefix_len;
	u8 ibport;

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		goto out_connected;
	}

	/* do inband token exchange -
	 *wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* determine subnet and mask from internal TCP socket */
	rc = smc_clc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* the proposed prefix must match the subnet of the TCP socket */
	pclc = (struct smc_clc_msg_proposal *)&buf;
	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
	if (pclc_prfx->outgoing_subnet != subnet ||
	    pclc_prfx->prefix_len != prefix_len) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, smcibdev, ibport, &pclc->lcl,
					0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		goto decline_rdma_unlock;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err_unlock;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err_unlock;
	if (reason_code > 0)
		goto decline_rdma_unlock;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0)
			/* peer is not aware of a problem */
			goto out_err_unlock;
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	smc_tx_init(new_smc);
	mutex_unlock(&smc_create_lgr_pending);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
out_err:
	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}
907
/* smc_tcp_listen_work() - listen worker: accept incoming TCP connections on
 * the internal CLC socket and schedule an smc_listen_work instance for each
 * @work: work item embedded in the listening smc_sock
 */
static void smc_tcp_listen_work(struct work_struct *work)
{
        struct smc_sock *lsmc = container_of(work, struct smc_sock,
                                             tcp_listen_work);
        struct sock *lsk = &lsmc->sk;
        struct smc_sock *new_smc;
        int rc = 0;

        lock_sock(lsk);
        while (lsk->sk_state == SMC_LISTEN) {
                rc = smc_clcsock_accept(lsmc, &new_smc);
                if (rc)
                        goto out;
                if (!new_smc)
                        continue;

                new_smc->listen_smc = lsmc;
                new_smc->use_fallback = false; /* assume rdma capability first*/
                sock_hold(lsk); /* sock_put in smc_listen_work */
                INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
                smc_copy_sock_settings_to_smc(new_smc);
                sock_hold(&new_smc->sk); /* sock_put in passive closing */
                if (!schedule_work(&new_smc->smc_listen_work))
                        sock_put(&new_smc->sk); /* work already queued */
        }

out:
        /* done listening: release the CLC socket so no further TCP
         * connections can arrive
         */
        if (lsmc->clcsock) {
                sock_release(lsmc->clcsock);
                lsmc->clcsock = NULL;
        }
        release_sock(lsk);
        /* no more listening, wake up smc_close_wait_listen_clcsock and
         * accept
         */
        lsk->sk_state_change(lsk);
        sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}
946
/* smc_listen() - move an SMC socket into listening state
 * @sock:    socket to listen on
 * @backlog: maximum length of the pending-connection queue
 *
 * Puts the internal CLC/TCP socket into listening mode and starts the
 * worker that accepts incoming TCP connections on behalf of SMC.
 * Returns 0 on success or a negative error code.
 */
static int smc_listen(struct socket *sock, int backlog)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc;

        smc = smc_sk(sk);
        lock_sock(sk);

        rc = -EINVAL;
        if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
                goto out;

        rc = 0;
        if (sk->sk_state == SMC_LISTEN) {
                /* already listening; only adjust the backlog limit */
                sk->sk_max_ack_backlog = backlog;
                goto out;
        }
        /* some socket options are handled in core, so we could not apply
         * them to the clc socket -- copy smc socket options to clc socket
         */
        smc_copy_sock_settings_to_clc(smc);
        /* advertise SMC capability in the TCP handshake of the clc socket */
        tcp_sk(smc->clcsock->sk)->syn_smc = 1;

        rc = kernel_listen(smc->clcsock, backlog);
        if (rc)
                goto out;
        sk->sk_max_ack_backlog = backlog;
        sk->sk_ack_backlog = 0;
        sk->sk_state = SMC_LISTEN;
        INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
        sock_hold(sk); /* sock_hold in tcp_listen_worker */
        if (!schedule_work(&smc->tcp_listen_work))
                sock_put(sk); /* work already queued; drop extra reference */

out:
        release_sock(sk);
        return rc;
}
986
/* smc_accept() - accept a connection on a listening SMC socket
 * @sock:     the listening socket
 * @new_sock: socket that receives the accepted connection
 * @flags:    file flags; O_NONBLOCK is honored
 * @kern:     true if called from kernel space
 *
 * Waits (bounded by the socket receive timeout unless O_NONBLOCK is set)
 * until the listen worker has queued a child socket.  Returns 0 on
 * success or a negative error code.
 */
static int smc_accept(struct socket *sock, struct socket *new_sock,
                      int flags, bool kern)
{
        struct sock *sk = sock->sk, *nsk;
        DECLARE_WAITQUEUE(wait, current);
        struct smc_sock *lsmc;
        long timeo;
        int rc = 0;

        lsmc = smc_sk(sk);
        sock_hold(sk); /* sock_put below */
        lock_sock(sk);

        if (lsmc->sk.sk_state != SMC_LISTEN) {
                rc = -EINVAL;
                goto out;
        }

        /* Wait for an incoming connection */
        timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
        add_wait_queue_exclusive(sk_sleep(sk), &wait);
        while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (!timeo) {
                        /* non-blocking accept or timeout elapsed */
                        rc = -EAGAIN;
                        break;
                }
                release_sock(sk); /* drop the lock while sleeping */
                timeo = schedule_timeout(timeo);
                /* wakeup by sk_data_ready in smc_listen_work() */
                sched_annotate_sleep();
                lock_sock(sk);
                if (signal_pending(current)) {
                        rc = sock_intr_errno(timeo);
                        break;
                }
        }
        set_current_state(TASK_RUNNING);
        remove_wait_queue(sk_sleep(sk), &wait);

        if (!rc)
                rc = sock_error(nsk); /* report a deferred error, if any */

out:
        release_sock(sk);
        sock_put(sk); /* sock_hold above */
        return rc;
}
1035
1036 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1037                        int peer)
1038 {
1039         struct smc_sock *smc;
1040
1041         if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1042             (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1043                 return -ENOTCONN;
1044
1045         smc = smc_sk(sock->sk);
1046
1047         return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1048 }
1049
1050 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1051 {
1052         struct sock *sk = sock->sk;
1053         struct smc_sock *smc;
1054         int rc = -EPIPE;
1055
1056         smc = smc_sk(sk);
1057         lock_sock(sk);
1058         if ((sk->sk_state != SMC_ACTIVE) &&
1059             (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1060             (sk->sk_state != SMC_INIT))
1061                 goto out;
1062         if (smc->use_fallback)
1063                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1064         else
1065                 rc = smc_tx_sendmsg(smc, msg, len);
1066 out:
1067         release_sock(sk);
1068         return rc;
1069 }
1070
1071 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1072                        int flags)
1073 {
1074         struct sock *sk = sock->sk;
1075         struct smc_sock *smc;
1076         int rc = -ENOTCONN;
1077
1078         smc = smc_sk(sk);
1079         lock_sock(sk);
1080         if ((sk->sk_state == SMC_INIT) ||
1081             (sk->sk_state == SMC_LISTEN) ||
1082             (sk->sk_state == SMC_CLOSED))
1083                 goto out;
1084
1085         if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1086                 rc = 0;
1087                 goto out;
1088         }
1089
1090         if (smc->use_fallback)
1091                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1092         else
1093                 rc = smc_rx_recvmsg(smc, msg, len, flags);
1094
1095 out:
1096         release_sock(sk);
1097         return rc;
1098 }
1099
1100 static __poll_t smc_accept_poll(struct sock *parent)
1101 {
1102         struct smc_sock *isk = smc_sk(parent);
1103         __poll_t mask = 0;
1104
1105         spin_lock(&isk->accept_q_lock);
1106         if (!list_empty(&isk->accept_q))
1107                 mask = EPOLLIN | EPOLLRDNORM;
1108         spin_unlock(&isk->accept_q_lock);
1109
1110         return mask;
1111 }
1112
/* smc_poll() - poll callback for SMC sockets
 * @file: file the poll originates from
 * @sock: SMC socket to poll
 * @wait: poll table to register with
 *
 * Before the SMC handshake is done (SMC_INIT) or in TCP fallback mode
 * the poll is delegated to the internal CLC socket; a writable CLC
 * socket in SMC_INIT means a non-blocking connect can proceed, so the
 * SMC handshake is completed here.  Returns the EPOLL* event mask.
 */
static __poll_t smc_poll(struct file *file, struct socket *sock,
                             poll_table *wait)
{
        struct sock *sk = sock->sk;
        __poll_t mask = 0;
        struct smc_sock *smc;
        int rc;

        if (!sk)
                return EPOLLNVAL;

        smc = smc_sk(sock->sk);
        sock_hold(sk); /* keep sk alive while temporarily unlocked below */
        lock_sock(sk);
        if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
                /* delegate to CLC child sock */
                release_sock(sk);
                mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
                /* if non-blocking connect finished ... */
                lock_sock(sk);
                if ((sk->sk_state == SMC_INIT) && (mask & EPOLLOUT)) {
                        sk->sk_err = smc->clcsock->sk->sk_err;
                        if (sk->sk_err) {
                                mask |= EPOLLERR;
                        } else {
                                /* finish the SMC handshake in poll context */
                                rc = smc_connect_rdma(smc);
                                if (rc < 0)
                                        mask |= EPOLLERR;
                                /* success cases including fallback */
                                mask |= EPOLLOUT | EPOLLWRNORM;
                        }
                }
        } else {
                if (sk->sk_state != SMC_CLOSED) {
                        /* register with the wait queue without the lock */
                        release_sock(sk);
                        sock_poll_wait(file, sk_sleep(sk), wait);
                        lock_sock(sk);
                }
                if (sk->sk_err)
                        mask |= EPOLLERR;
                if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
                    (sk->sk_state == SMC_CLOSED))
                        mask |= EPOLLHUP;
                if (sk->sk_state == SMC_LISTEN) {
                        /* woken up by sk_data_ready in smc_listen_work() */
                        mask = smc_accept_poll(sk);
                } else {
                        if (atomic_read(&smc->conn.sndbuf_space) ||
                            sk->sk_shutdown & SEND_SHUTDOWN) {
                                mask |= EPOLLOUT | EPOLLWRNORM;
                        } else {
                                /* no send buffer space: arm write-space
                                 * notification
                                 */
                                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                        }
                        if (atomic_read(&smc->conn.bytes_to_rcv))
                                mask |= EPOLLIN | EPOLLRDNORM;
                        if (sk->sk_shutdown & RCV_SHUTDOWN)
                                mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
                        if (sk->sk_state == SMC_APPCLOSEWAIT1)
                                mask |= EPOLLIN;
                }

        }
        release_sock(sk);
        sock_put(sk);

        return mask;
}
1181
/* smc_shutdown() - shut down part or all of an SMC connection
 * @sock: socket to shut down
 * @how:  SHUT_RD, SHUT_WR or SHUT_RDWR
 *
 * In fallback mode the shutdown is forwarded to the CLC/TCP socket.
 * Otherwise the SMC close protocol runs first and the CLC socket is
 * shut down afterwards.  Returns 0 or a negative error code.
 */
static int smc_shutdown(struct socket *sock, int how)
{
        struct sock *sk = sock->sk;
        struct smc_sock *smc;
        int rc = -EINVAL;
        int rc1 = 0;

        smc = smc_sk(sk);

        if ((how < SHUT_RD) || (how > SHUT_RDWR))
                return rc;

        lock_sock(sk);

        rc = -ENOTCONN;
        if ((sk->sk_state != SMC_LISTEN) &&
            (sk->sk_state != SMC_ACTIVE) &&
            (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
            (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
            (sk->sk_state != SMC_APPCLOSEWAIT1) &&
            (sk->sk_state != SMC_APPCLOSEWAIT2) &&
            (sk->sk_state != SMC_APPFINCLOSEWAIT))
                goto out;
        if (smc->use_fallback) {
                /* TCP carries the connection; mirror its shutdown state */
                rc = kernel_sock_shutdown(smc->clcsock, how);
                sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
                if (sk->sk_shutdown == SHUTDOWN_MASK)
                        sk->sk_state = SMC_CLOSED;
                goto out;
        }
        switch (how) {
        case SHUT_RDWR:         /* shutdown in both directions */
                rc = smc_close_active(smc);
                break;
        case SHUT_WR:
                rc = smc_close_shutdown_write(smc);
                break;
        case SHUT_RD:
                if (sk->sk_state == SMC_LISTEN)
                        rc = smc_close_active(smc);
                else
                        rc = 0;
                        /* nothing more to do because peer is not involved */
                break;
        }
        rc1 = kernel_sock_shutdown(smc->clcsock, how);
        /* map sock_shutdown_cmd constants to sk_shutdown value range */
        sk->sk_shutdown |= how + 1;

out:
        release_sock(sk);
        return rc ? rc : rc1; /* SMC close error takes precedence */
}
1235
1236 static int smc_setsockopt(struct socket *sock, int level, int optname,
1237                           char __user *optval, unsigned int optlen)
1238 {
1239         struct sock *sk = sock->sk;
1240         struct smc_sock *smc;
1241
1242         smc = smc_sk(sk);
1243
1244         /* generic setsockopts reaching us here always apply to the
1245          * CLC socket
1246          */
1247         return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1248                                              optval, optlen);
1249 }
1250
1251 static int smc_getsockopt(struct socket *sock, int level, int optname,
1252                           char __user *optval, int __user *optlen)
1253 {
1254         struct smc_sock *smc;
1255
1256         smc = smc_sk(sock->sk);
1257         /* socket options apply to the CLC socket */
1258         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1259                                              optval, optlen);
1260 }
1261
1262 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1263                      unsigned long arg)
1264 {
1265         struct smc_sock *smc;
1266
1267         smc = smc_sk(sock->sk);
1268         if (smc->use_fallback)
1269                 return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1270         else
1271                 return sock_no_ioctl(sock, cmd, arg);
1272 }
1273
1274 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1275                             int offset, size_t size, int flags)
1276 {
1277         struct sock *sk = sock->sk;
1278         struct smc_sock *smc;
1279         int rc = -EPIPE;
1280
1281         smc = smc_sk(sk);
1282         lock_sock(sk);
1283         if (sk->sk_state != SMC_ACTIVE)
1284                 goto out;
1285         if (smc->use_fallback)
1286                 rc = kernel_sendpage(smc->clcsock, page, offset,
1287                                      size, flags);
1288         else
1289                 rc = sock_no_sendpage(sock, page, offset, size, flags);
1290
1291 out:
1292         release_sock(sk);
1293         return rc;
1294 }
1295
1296 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1297                                struct pipe_inode_info *pipe, size_t len,
1298                                     unsigned int flags)
1299 {
1300         struct sock *sk = sock->sk;
1301         struct smc_sock *smc;
1302         int rc = -ENOTCONN;
1303
1304         smc = smc_sk(sk);
1305         lock_sock(sk);
1306         if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1307                 goto out;
1308         if (smc->use_fallback) {
1309                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1310                                                     pipe, len, flags);
1311         } else {
1312                 rc = -EOPNOTSUPP;
1313         }
1314 out:
1315         release_sock(sk);
1316         return rc;
1317 }
1318
/* must look like tcp: proto_ops for SMC sockets so that applications see
 * familiar SOCK_STREAM semantics; unsupported operations use the generic
 * sock_no_* helpers
 */
static const struct proto_ops smc_sock_ops = {
        .family         = PF_SMC,
        .owner          = THIS_MODULE,
        .release        = smc_release,
        .bind           = smc_bind,
        .connect        = smc_connect,
        .socketpair     = sock_no_socketpair,
        .accept         = smc_accept,
        .getname        = smc_getname,
        .poll           = smc_poll,
        .ioctl          = smc_ioctl,
        .listen         = smc_listen,
        .shutdown       = smc_shutdown,
        .setsockopt     = smc_setsockopt,
        .getsockopt     = smc_getsockopt,
        .sendmsg        = smc_sendmsg,
        .recvmsg        = smc_recvmsg,
        .mmap           = sock_no_mmap,
        .sendpage       = smc_sendpage,
        .splice_read    = smc_splice_read,
};
1341
1342 static int smc_create(struct net *net, struct socket *sock, int protocol,
1343                       int kern)
1344 {
1345         struct smc_sock *smc;
1346         struct sock *sk;
1347         int rc;
1348
1349         rc = -ESOCKTNOSUPPORT;
1350         if (sock->type != SOCK_STREAM)
1351                 goto out;
1352
1353         rc = -EPROTONOSUPPORT;
1354         if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
1355                 goto out;
1356
1357         rc = -ENOBUFS;
1358         sock->ops = &smc_sock_ops;
1359         sk = smc_sock_alloc(net, sock);
1360         if (!sk)
1361                 goto out;
1362
1363         /* create internal TCP socket for CLC handshake and fallback */
1364         smc = smc_sk(sk);
1365         smc->use_fallback = false; /* assume rdma capability first */
1366         rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
1367                               IPPROTO_TCP, &smc->clcsock);
1368         if (rc)
1369                 sk_common_release(sk);
1370         smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1371         smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1372
1373 out:
1374         return rc;
1375 }
1376
/* registration descriptor for the AF_SMC address family */
static const struct net_proto_family smc_sock_family_ops = {
        .family = PF_SMC,
        .owner  = THIS_MODULE,
        .create = smc_create,
};
1382
/* smc_init() - module initialization
 *
 * Registers the SMC building blocks in dependency order (pnet tables,
 * LLC and CDC handlers, the protocol, the socket family, and the IB
 * client), unwinding already-registered pieces on failure.  Finally
 * enables the tcp_have_smc static key so TCP advertises SMC capability.
 */
static int __init smc_init(void)
{
        int rc;

        rc = smc_pnet_init();
        if (rc)
                return rc;

        rc = smc_llc_init();
        if (rc) {
                pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
                goto out_pnet;
        }

        rc = smc_cdc_init();
        if (rc) {
                pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
                goto out_pnet;
        }

        rc = proto_register(&smc_proto, 1);
        if (rc) {
                pr_err("%s: proto_register fails with %d\n", __func__, rc);
                goto out_pnet;
        }

        rc = sock_register(&smc_sock_family_ops);
        if (rc) {
                pr_err("%s: sock_register fails with %d\n", __func__, rc);
                goto out_proto;
        }
        INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);

        rc = smc_ib_register_client();
        if (rc) {
                pr_err("%s: ib_register fails with %d\n", __func__, rc);
                goto out_sock;
        }

        static_branch_enable(&tcp_have_smc);
        return 0;

out_sock:
        sock_unregister(PF_SMC);
out_proto:
        proto_unregister(&smc_proto);
out_pnet:
        smc_pnet_exit();
        return rc;
}
1433
/* smc_exit() - module cleanup
 *
 * Detaches all remaining link groups under the list lock, frees them
 * outside the lock, then unregisters everything smc_init() set up.
 */
static void __exit smc_exit(void)
{
        struct smc_link_group *lgr, *lg;
        LIST_HEAD(lgr_freeing_list);

        /* move the link groups to a private list under the lock,
         * free them afterwards without holding it
         */
        spin_lock_bh(&smc_lgr_list.lock);
        if (!list_empty(&smc_lgr_list.list))
                list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
        spin_unlock_bh(&smc_lgr_list.lock);
        list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
                list_del_init(&lgr->list);
                smc_lgr_free(lgr); /* free link group */
        }
        static_branch_disable(&tcp_have_smc);
        smc_ib_unregister_client();
        sock_unregister(PF_SMC);
        proto_unregister(&smc_proto);
        smc_pnet_exit();
}
1453
1454 module_init(smc_init);
1455 module_exit(smc_exit);
1456
1457 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
1458 MODULE_DESCRIPTION("smc socket address family");
1459 MODULE_LICENSE("GPL");
1460 MODULE_ALIAS_NETPROTO(PF_SMC);