05c705a688e520058d77d72c46bbbe53603de486
[platform/kernel/linux-rpi.git] / net / smc / af_smc.c
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - non-blocking connect postponed
11  *    - IPv6 support postponed
12  *    - support for alternate links postponed
13  *    - partial support for non-blocking sockets only
14  *    - support for urgent data postponed
15  *
16  *  Copyright IBM Corp. 2016
17  *
18  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
19  *              based on prototype from Frank Blaschka
20  */
21
22 #define KMSG_COMPONENT "smc"
23 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24
25 #include <linux/module.h>
26 #include <linux/socket.h>
27 #include <linux/inetdevice.h>
28 #include <linux/workqueue.h>
29 #include <net/sock.h>
30 #include <net/tcp.h>
31
32 #include "smc.h"
33 #include "smc_clc.h"
34 #include "smc_ib.h"
35 #include "smc_pnet.h"
36
37 static void smc_tcp_listen_work(struct work_struct *);
38
39 static void smc_set_keepalive(struct sock *sk, int val)
40 {
41         struct smc_sock *smc = smc_sk(sk);
42
43         smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
44 }
45
46 static struct proto smc_proto = {
47         .name           = "SMC",
48         .owner          = THIS_MODULE,
49         .keepalive      = smc_set_keepalive,
50         .obj_size       = sizeof(struct smc_sock),
51         .slab_flags     = SLAB_DESTROY_BY_RCU,
52 };
53
54 static int smc_release(struct socket *sock)
55 {
56         struct sock *sk = sock->sk;
57         struct smc_sock *smc;
58
59         if (!sk)
60                 goto out;
61
62         smc = smc_sk(sk);
63         lock_sock(sk);
64
65         sk->sk_state = SMC_CLOSED;
66         if (smc->clcsock) {
67                 sock_release(smc->clcsock);
68                 smc->clcsock = NULL;
69         }
70
71         /* detach socket */
72         sock_orphan(sk);
73         sock->sk = NULL;
74         release_sock(sk);
75
76         sock_put(sk);
77 out:
78         return 0;
79 }
80
81 static void smc_destruct(struct sock *sk)
82 {
83         if (sk->sk_state != SMC_CLOSED)
84                 return;
85         if (!sock_flag(sk, SOCK_DEAD))
86                 return;
87
88         sk_refcnt_debug_dec(sk);
89 }
90
91 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
92 {
93         struct smc_sock *smc;
94         struct sock *sk;
95
96         sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
97         if (!sk)
98                 return NULL;
99
100         sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
101         sk->sk_state = SMC_INIT;
102         sk->sk_destruct = smc_destruct;
103         sk->sk_protocol = SMCPROTO_SMC;
104         smc = smc_sk(sk);
105         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
106         INIT_LIST_HEAD(&smc->accept_q);
107         spin_lock_init(&smc->accept_q_lock);
108         sk_refcnt_debug_inc(sk);
109
110         return sk;
111 }
112
113 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
114                     int addr_len)
115 {
116         struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
117         struct sock *sk = sock->sk;
118         struct smc_sock *smc;
119         int rc;
120
121         smc = smc_sk(sk);
122
123         /* replicate tests from inet_bind(), to be safe wrt. future changes */
124         rc = -EINVAL;
125         if (addr_len < sizeof(struct sockaddr_in))
126                 goto out;
127
128         rc = -EAFNOSUPPORT;
129         /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
130         if ((addr->sin_family != AF_INET) &&
131             ((addr->sin_family != AF_UNSPEC) ||
132              (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
133                 goto out;
134
135         lock_sock(sk);
136
137         /* Check if socket is already active */
138         rc = -EINVAL;
139         if (sk->sk_state != SMC_INIT)
140                 goto out_rel;
141
142         smc->clcsock->sk->sk_reuse = sk->sk_reuse;
143         rc = kernel_bind(smc->clcsock, uaddr, addr_len);
144
145 out_rel:
146         release_sock(sk);
147 out:
148         return rc;
149 }
150
151 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
152                                    unsigned long mask)
153 {
154         /* options we don't get control via setsockopt for */
155         nsk->sk_type = osk->sk_type;
156         nsk->sk_sndbuf = osk->sk_sndbuf;
157         nsk->sk_rcvbuf = osk->sk_rcvbuf;
158         nsk->sk_sndtimeo = osk->sk_sndtimeo;
159         nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
160         nsk->sk_mark = osk->sk_mark;
161         nsk->sk_priority = osk->sk_priority;
162         nsk->sk_rcvlowat = osk->sk_rcvlowat;
163         nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
164         nsk->sk_err = osk->sk_err;
165
166         nsk->sk_flags &= ~mask;
167         nsk->sk_flags |= osk->sk_flags & mask;
168 }
169
170 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
171                              (1UL << SOCK_KEEPOPEN) | \
172                              (1UL << SOCK_LINGER) | \
173                              (1UL << SOCK_BROADCAST) | \
174                              (1UL << SOCK_TIMESTAMP) | \
175                              (1UL << SOCK_DBG) | \
176                              (1UL << SOCK_RCVTSTAMP) | \
177                              (1UL << SOCK_RCVTSTAMPNS) | \
178                              (1UL << SOCK_LOCALROUTE) | \
179                              (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
180                              (1UL << SOCK_RXQ_OVFL) | \
181                              (1UL << SOCK_WIFI_STATUS) | \
182                              (1UL << SOCK_NOFCS) | \
183                              (1UL << SOCK_FILTER_LOCKED))
184 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
185  * clc socket (since smc is not called for these options from net/core)
186  */
187 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
188 {
189         smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
190 }
191
192 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
193                              (1UL << SOCK_KEEPOPEN) | \
194                              (1UL << SOCK_LINGER) | \
195                              (1UL << SOCK_DBG))
196 /* copy only settings and flags relevant for smc from clc to smc socket */
197 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
198 {
199         smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
200 }
201
202 /* determine subnet and mask of internal TCP socket */
203 int smc_netinfo_by_tcpsk(struct socket *clcsock,
204                          __be32 *subnet, u8 *prefix_len)
205 {
206         struct dst_entry *dst = sk_dst_get(clcsock->sk);
207         struct sockaddr_in addr;
208         int rc = -ENOENT;
209         int len;
210
211         if (!dst) {
212                 rc = -ENOTCONN;
213                 goto out;
214         }
215         if (!dst->dev) {
216                 rc = -ENODEV;
217                 goto out_rel;
218         }
219
220         /* get address to which the internal TCP socket is bound */
221         kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
222         /* analyze IPv4 specific data of net_device belonging to TCP socket */
223         for_ifa(dst->dev->ip_ptr) {
224                 if (ifa->ifa_address != addr.sin_addr.s_addr)
225                         continue;
226                 *prefix_len = inet_mask_len(ifa->ifa_mask);
227                 *subnet = ifa->ifa_address & ifa->ifa_mask;
228                 rc = 0;
229                 break;
230         } endfor_ifa(dst->dev->ip_ptr);
231
232 out_rel:
233         dst_release(dst);
234 out:
235         return rc;
236 }
237
238 /* setup for RDMA connection of client */
239 static int smc_connect_rdma(struct smc_sock *smc)
240 {
241         struct smc_clc_msg_accept_confirm aclc;
242         struct smc_ib_device *smcibdev;
243         int reason_code = 0;
244         int rc = 0;
245         u8 ibport;
246
247         /* IPSec connections opt out of SMC-R optimizations */
248         if (using_ipsec(smc)) {
249                 reason_code = SMC_CLC_DECL_IPSEC;
250                 goto decline_rdma;
251         }
252
253         /* PNET table look up: search active ib_device and port
254          * within same PNETID that also contains the ethernet device
255          * used for the internal TCP socket
256          */
257         smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
258         if (!smcibdev) {
259                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
260                 goto decline_rdma;
261         }
262
263         /* do inband token exchange */
264         reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
265         if (reason_code < 0) {
266                 rc = reason_code;
267                 goto out_err;
268         }
269         if (reason_code > 0) /* configuration error */
270                 goto decline_rdma;
271         /* receive SMC Accept CLC message */
272         reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
273                                        SMC_CLC_ACCEPT);
274         if (reason_code < 0) {
275                 rc = reason_code;
276                 goto out_err;
277         }
278         if (reason_code > 0)
279                 goto decline_rdma;
280
281         /* tbd in follow-on patch: more steps to setup RDMA communcication,
282          * create connection, link group, link
283          */
284
285         /* tbd in follow-on patch: more steps to setup RDMA communcication,
286          * create rmbs, map rmbs, rtoken_handling, modify_qp
287          */
288
289         rc = smc_clc_send_confirm(smc);
290         if (rc)
291                 goto out_err;
292
293         /* tbd in follow-on patch: llc_confirm */
294
295 out_connected:
296         smc_copy_sock_settings_to_clc(smc);
297         smc->sk.sk_state = SMC_ACTIVE;
298
299         return rc;
300
301 decline_rdma:
302         /* RDMA setup failed, switch back to TCP */
303         smc->use_fallback = true;
304         if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
305                 rc = smc_clc_send_decline(smc, reason_code, 0);
306                 if (rc < sizeof(struct smc_clc_msg_decline))
307                         goto out_err;
308         }
309         goto out_connected;
310
311 out_err:
312         return rc;
313 }
314
315 static int smc_connect(struct socket *sock, struct sockaddr *addr,
316                        int alen, int flags)
317 {
318         struct sock *sk = sock->sk;
319         struct smc_sock *smc;
320         int rc = -EINVAL;
321
322         smc = smc_sk(sk);
323
324         /* separate smc parameter checking to be safe */
325         if (alen < sizeof(addr->sa_family))
326                 goto out_err;
327         if (addr->sa_family != AF_INET)
328                 goto out_err;
329         smc->addr = addr;       /* needed for nonblocking connect */
330
331         lock_sock(sk);
332         switch (sk->sk_state) {
333         default:
334                 goto out;
335         case SMC_ACTIVE:
336                 rc = -EISCONN;
337                 goto out;
338         case SMC_INIT:
339                 rc = 0;
340                 break;
341         }
342
343         smc_copy_sock_settings_to_clc(smc);
344         rc = kernel_connect(smc->clcsock, addr, alen, flags);
345         if (rc)
346                 goto out;
347
348         /* setup RDMA connection */
349         rc = smc_connect_rdma(smc);
350         if (rc < 0)
351                 goto out;
352         else
353                 rc = 0; /* success cases including fallback */
354
355 out:
356         release_sock(sk);
357 out_err:
358         return rc;
359 }
360
361 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
362 {
363         struct sock *sk = &lsmc->sk;
364         struct socket *new_clcsock;
365         struct sock *new_sk;
366         int rc;
367
368         release_sock(&lsmc->sk);
369         new_sk = smc_sock_alloc(sock_net(sk), NULL);
370         if (!new_sk) {
371                 rc = -ENOMEM;
372                 lsmc->sk.sk_err = ENOMEM;
373                 *new_smc = NULL;
374                 lock_sock(&lsmc->sk);
375                 goto out;
376         }
377         *new_smc = smc_sk(new_sk);
378
379         rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
380         lock_sock(&lsmc->sk);
381         if  (rc < 0) {
382                 lsmc->sk.sk_err = -rc;
383                 new_sk->sk_state = SMC_CLOSED;
384                 sock_set_flag(new_sk, SOCK_DEAD);
385                 sock_put(new_sk);
386                 *new_smc = NULL;
387                 goto out;
388         }
389         if (lsmc->sk.sk_state == SMC_CLOSED) {
390                 if (new_clcsock)
391                         sock_release(new_clcsock);
392                 new_sk->sk_state = SMC_CLOSED;
393                 sock_set_flag(new_sk, SOCK_DEAD);
394                 sock_put(new_sk);
395                 *new_smc = NULL;
396                 goto out;
397         }
398
399         (*new_smc)->clcsock = new_clcsock;
400 out:
401         return rc;
402 }
403
404 /* add a just created sock to the accept queue of the listen sock as
405  * candidate for a following socket accept call from user space
406  */
407 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
408 {
409         struct smc_sock *par = smc_sk(parent);
410
411         sock_hold(sk);
412         spin_lock(&par->accept_q_lock);
413         list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
414         spin_unlock(&par->accept_q_lock);
415         sk_acceptq_added(parent);
416 }
417
418 /* remove a socket from the accept queue of its parental listening socket */
419 static void smc_accept_unlink(struct sock *sk)
420 {
421         struct smc_sock *par = smc_sk(sk)->listen_smc;
422
423         spin_lock(&par->accept_q_lock);
424         list_del_init(&smc_sk(sk)->accept_q);
425         spin_unlock(&par->accept_q_lock);
426         sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
427         sock_put(sk);
428 }
429
430 /* remove a sock from the accept queue to bind it to a new socket created
431  * for a socket accept call from user space
432  */
433 static struct sock *smc_accept_dequeue(struct sock *parent,
434                                        struct socket *new_sock)
435 {
436         struct smc_sock *isk, *n;
437         struct sock *new_sk;
438
439         list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
440                 new_sk = (struct sock *)isk;
441
442                 smc_accept_unlink(new_sk);
443                 if (new_sk->sk_state == SMC_CLOSED) {
444                         /* tbd in follow-on patch: close this sock */
445                         continue;
446                 }
447                 if (new_sock)
448                         sock_graft(new_sk, new_sock);
449                 return new_sk;
450         }
451         return NULL;
452 }
453
454 /* clean up for a created but never accepted sock */
455 static void smc_close_non_accepted(struct sock *sk)
456 {
457         struct smc_sock *smc = smc_sk(sk);
458
459         sock_hold(sk);
460         if (smc->clcsock) {
461                 struct socket *tcp;
462
463                 tcp = smc->clcsock;
464                 smc->clcsock = NULL;
465                 sock_release(tcp);
466         }
467         /* more closing stuff to be added with socket closing patch */
468         sock_put(sk);
469 }
470
471 /* setup for RDMA connection of server */
472 static void smc_listen_work(struct work_struct *work)
473 {
474         struct smc_sock *new_smc = container_of(work, struct smc_sock,
475                                                 smc_listen_work);
476         struct socket *newclcsock = new_smc->clcsock;
477         struct smc_sock *lsmc = new_smc->listen_smc;
478         struct smc_clc_msg_accept_confirm cclc;
479         struct sock *newsmcsk = &new_smc->sk;
480         struct smc_clc_msg_proposal pclc;
481         struct smc_ib_device *smcibdev;
482         struct sockaddr_in peeraddr;
483         int reason_code = 0;
484         int rc = 0, len;
485         __be32 subnet;
486         u8 prefix_len;
487         u8 ibport;
488
489         /* do inband token exchange -
490          *wait for and receive SMC Proposal CLC message
491          */
492         reason_code = smc_clc_wait_msg(new_smc, &pclc, sizeof(pclc),
493                                        SMC_CLC_PROPOSAL);
494         if (reason_code < 0)
495                 goto out_err;
496         if (reason_code > 0)
497                 goto decline_rdma;
498
499         /* IPSec connections opt out of SMC-R optimizations */
500         if (using_ipsec(new_smc)) {
501                 reason_code = SMC_CLC_DECL_IPSEC;
502                 goto decline_rdma;
503         }
504
505         /* PNET table look up: search active ib_device and port
506          * within same PNETID that also contains the ethernet device
507          * used for the internal TCP socket
508          */
509         smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
510         if (!smcibdev) {
511                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
512                 goto decline_rdma;
513         }
514
515         /* determine subnet and mask from internal TCP socket */
516         rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
517         if (rc) {
518                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
519                 goto decline_rdma;
520         }
521         if ((pclc.outgoing_subnet != subnet) ||
522             (pclc.prefix_len != prefix_len)) {
523                 reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
524                 goto decline_rdma;
525         }
526
527         /* get address of the peer connected to the internal TCP socket */
528         kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
529
530         /* tbd in follow-on patch: more steps to setup RDMA communcication,
531          * create connection, link_group, link
532          */
533
534         /* tbd in follow-on patch: more steps to setup RDMA communcication,
535          * create rmbs, map rmbs
536          */
537
538         rc = smc_clc_send_accept(new_smc);
539         if (rc)
540                 goto out_err;
541
542         /* receive SMC Confirm CLC message */
543         reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
544                                        SMC_CLC_CONFIRM);
545         if (reason_code < 0)
546                 goto out_err;
547         if (reason_code > 0)
548                 goto decline_rdma;
549
550         /* tbd in follow-on patch: more steps to setup RDMA communcication,
551          * rtoken_handling, modify_qp
552          */
553
554 out_connected:
555         sk_refcnt_debug_inc(newsmcsk);
556         newsmcsk->sk_state = SMC_ACTIVE;
557 enqueue:
558         lock_sock(&lsmc->sk);
559         if (lsmc->sk.sk_state == SMC_LISTEN) {
560                 smc_accept_enqueue(&lsmc->sk, newsmcsk);
561         } else { /* no longer listening */
562                 smc_close_non_accepted(newsmcsk);
563         }
564         release_sock(&lsmc->sk);
565
566         /* Wake up accept */
567         lsmc->sk.sk_data_ready(&lsmc->sk);
568         sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
569         return;
570
571 decline_rdma:
572         /* RDMA setup failed, switch back to TCP */
573         new_smc->use_fallback = true;
574         if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
575                 rc = smc_clc_send_decline(new_smc, reason_code, 0);
576                 if (rc < sizeof(struct smc_clc_msg_decline))
577                         goto out_err;
578         }
579         goto out_connected;
580
581 out_err:
582         newsmcsk->sk_state = SMC_CLOSED;
583         goto enqueue; /* queue new sock with sk_err set */
584 }
585
586 static void smc_tcp_listen_work(struct work_struct *work)
587 {
588         struct smc_sock *lsmc = container_of(work, struct smc_sock,
589                                              tcp_listen_work);
590         struct smc_sock *new_smc;
591         int rc = 0;
592
593         lock_sock(&lsmc->sk);
594         while (lsmc->sk.sk_state == SMC_LISTEN) {
595                 rc = smc_clcsock_accept(lsmc, &new_smc);
596                 if (rc)
597                         goto out;
598                 if (!new_smc)
599                         continue;
600
601                 new_smc->listen_smc = lsmc;
602                 new_smc->use_fallback = false; /* assume rdma capability first*/
603                 sock_hold(&lsmc->sk); /* sock_put in smc_listen_work */
604                 INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
605                 smc_copy_sock_settings_to_smc(new_smc);
606                 schedule_work(&new_smc->smc_listen_work);
607         }
608
609 out:
610         release_sock(&lsmc->sk);
611         lsmc->sk.sk_data_ready(&lsmc->sk); /* no more listening, wake accept */
612 }
613
614 static int smc_listen(struct socket *sock, int backlog)
615 {
616         struct sock *sk = sock->sk;
617         struct smc_sock *smc;
618         int rc;
619
620         smc = smc_sk(sk);
621         lock_sock(sk);
622
623         rc = -EINVAL;
624         if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
625                 goto out;
626
627         rc = 0;
628         if (sk->sk_state == SMC_LISTEN) {
629                 sk->sk_max_ack_backlog = backlog;
630                 goto out;
631         }
632         /* some socket options are handled in core, so we could not apply
633          * them to the clc socket -- copy smc socket options to clc socket
634          */
635         smc_copy_sock_settings_to_clc(smc);
636
637         rc = kernel_listen(smc->clcsock, backlog);
638         if (rc)
639                 goto out;
640         sk->sk_max_ack_backlog = backlog;
641         sk->sk_ack_backlog = 0;
642         sk->sk_state = SMC_LISTEN;
643         INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
644         schedule_work(&smc->tcp_listen_work);
645
646 out:
647         release_sock(sk);
648         return rc;
649 }
650
651 static int smc_accept(struct socket *sock, struct socket *new_sock,
652                       int flags)
653 {
654         struct sock *sk = sock->sk, *nsk;
655         DECLARE_WAITQUEUE(wait, current);
656         struct smc_sock *lsmc;
657         long timeo;
658         int rc = 0;
659
660         lsmc = smc_sk(sk);
661         lock_sock(sk);
662
663         if (lsmc->sk.sk_state != SMC_LISTEN) {
664                 rc = -EINVAL;
665                 goto out;
666         }
667
668         /* Wait for an incoming connection */
669         timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
670         add_wait_queue_exclusive(sk_sleep(sk), &wait);
671         while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
672                 set_current_state(TASK_INTERRUPTIBLE);
673                 if (!timeo) {
674                         rc = -EAGAIN;
675                         break;
676                 }
677                 release_sock(sk);
678                 timeo = schedule_timeout(timeo);
679                 /* wakeup by sk_data_ready in smc_listen_work() */
680                 sched_annotate_sleep();
681                 lock_sock(sk);
682                 if (signal_pending(current)) {
683                         rc = sock_intr_errno(timeo);
684                         break;
685                 }
686         }
687         set_current_state(TASK_RUNNING);
688         remove_wait_queue(sk_sleep(sk), &wait);
689
690         if (!rc)
691                 rc = sock_error(nsk);
692
693 out:
694         release_sock(sk);
695         return rc;
696 }
697
698 static int smc_getname(struct socket *sock, struct sockaddr *addr,
699                        int *len, int peer)
700 {
701         struct smc_sock *smc;
702
703         if (peer && (sock->sk->sk_state != SMC_ACTIVE))
704                 return -ENOTCONN;
705
706         smc = smc_sk(sock->sk);
707
708         return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
709 }
710
711 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
712 {
713         struct sock *sk = sock->sk;
714         struct smc_sock *smc;
715         int rc = -EPIPE;
716
717         smc = smc_sk(sk);
718         lock_sock(sk);
719         if (sk->sk_state != SMC_ACTIVE)
720                 goto out;
721         if (smc->use_fallback)
722                 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
723         else
724                 rc = sock_no_sendmsg(sock, msg, len);
725 out:
726         release_sock(sk);
727         return rc;
728 }
729
730 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
731                        int flags)
732 {
733         struct sock *sk = sock->sk;
734         struct smc_sock *smc;
735         int rc = -ENOTCONN;
736
737         smc = smc_sk(sk);
738         lock_sock(sk);
739         if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
740                 goto out;
741
742         if (smc->use_fallback)
743                 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
744         else
745                 rc = sock_no_recvmsg(sock, msg, len, flags);
746 out:
747         release_sock(sk);
748         return rc;
749 }
750
751 static unsigned int smc_accept_poll(struct sock *parent)
752 {
753         struct smc_sock *isk;
754         struct sock *sk;
755
756         lock_sock(parent);
757         list_for_each_entry(isk, &smc_sk(parent)->accept_q, accept_q) {
758                 sk = (struct sock *)isk;
759
760                 if (sk->sk_state == SMC_ACTIVE) {
761                         release_sock(parent);
762                         return POLLIN | POLLRDNORM;
763                 }
764         }
765         release_sock(parent);
766
767         return 0;
768 }
769
770 static unsigned int smc_poll(struct file *file, struct socket *sock,
771                              poll_table *wait)
772 {
773         struct sock *sk = sock->sk;
774         unsigned int mask = 0;
775         struct smc_sock *smc;
776         int rc;
777
778         smc = smc_sk(sock->sk);
779         if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
780                 /* delegate to CLC child sock */
781                 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
782                 /* if non-blocking connect finished ... */
783                 lock_sock(sk);
784                 if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
785                         sk->sk_err = smc->clcsock->sk->sk_err;
786                         if (sk->sk_err) {
787                                 mask |= POLLERR;
788                         } else {
789                                 rc = smc_connect_rdma(smc);
790                                 if (rc < 0)
791                                         mask |= POLLERR;
792                                 else
793                                         /* success cases including fallback */
794                                         mask |= POLLOUT | POLLWRNORM;
795                         }
796                 }
797                 release_sock(sk);
798         } else {
799                 sock_poll_wait(file, sk_sleep(sk), wait);
800                 if (sk->sk_state == SMC_LISTEN)
801                         /* woken up by sk_data_ready in smc_listen_work() */
802                         mask |= smc_accept_poll(sk);
803                 if (sk->sk_err)
804                         mask |= POLLERR;
805                 /* for now - to be enhanced in follow-on patch */
806         }
807
808         return mask;
809 }
810
811 static int smc_shutdown(struct socket *sock, int how)
812 {
813         struct sock *sk = sock->sk;
814         struct smc_sock *smc;
815         int rc = -EINVAL;
816
817         smc = smc_sk(sk);
818
819         if ((how < SHUT_RD) || (how > SHUT_RDWR))
820                 goto out_err;
821
822         lock_sock(sk);
823
824         rc = -ENOTCONN;
825         if (sk->sk_state == SMC_CLOSED)
826                 goto out;
827         if (smc->use_fallback) {
828                 rc = kernel_sock_shutdown(smc->clcsock, how);
829                 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
830                 if (sk->sk_shutdown == SHUTDOWN_MASK)
831                         sk->sk_state = SMC_CLOSED;
832         } else {
833                 rc = sock_no_shutdown(sock, how);
834         }
835
836 out:
837         release_sock(sk);
838
839 out_err:
840         return rc;
841 }
842
843 static int smc_setsockopt(struct socket *sock, int level, int optname,
844                           char __user *optval, unsigned int optlen)
845 {
846         struct sock *sk = sock->sk;
847         struct smc_sock *smc;
848
849         smc = smc_sk(sk);
850
851         /* generic setsockopts reaching us here always apply to the
852          * CLC socket
853          */
854         return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
855                                              optval, optlen);
856 }
857
858 static int smc_getsockopt(struct socket *sock, int level, int optname,
859                           char __user *optval, int __user *optlen)
860 {
861         struct smc_sock *smc;
862
863         smc = smc_sk(sock->sk);
864         /* socket options apply to the CLC socket */
865         return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
866                                              optval, optlen);
867 }
868
869 static int smc_ioctl(struct socket *sock, unsigned int cmd,
870                      unsigned long arg)
871 {
872         struct smc_sock *smc;
873
874         smc = smc_sk(sock->sk);
875         if (smc->use_fallback)
876                 return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
877         else
878                 return sock_no_ioctl(sock, cmd, arg);
879 }
880
881 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
882                             int offset, size_t size, int flags)
883 {
884         struct sock *sk = sock->sk;
885         struct smc_sock *smc;
886         int rc = -EPIPE;
887
888         smc = smc_sk(sk);
889         lock_sock(sk);
890         if (sk->sk_state != SMC_ACTIVE)
891                 goto out;
892         if (smc->use_fallback)
893                 rc = kernel_sendpage(smc->clcsock, page, offset,
894                                      size, flags);
895         else
896                 rc = sock_no_sendpage(sock, page, offset, size, flags);
897
898 out:
899         release_sock(sk);
900         return rc;
901 }
902
903 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
904                                struct pipe_inode_info *pipe, size_t len,
905                                     unsigned int flags)
906 {
907         struct sock *sk = sock->sk;
908         struct smc_sock *smc;
909         int rc = -ENOTCONN;
910
911         smc = smc_sk(sk);
912         lock_sock(sk);
913         if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
914                 goto out;
915         if (smc->use_fallback) {
916                 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
917                                                     pipe, len, flags);
918         } else {
919                 rc = -EOPNOTSUPP;
920         }
921 out:
922         release_sock(sk);
923         return rc;
924 }
925
926 /* must look like tcp */
927 static const struct proto_ops smc_sock_ops = {
928         .family         = PF_SMC,
929         .owner          = THIS_MODULE,
930         .release        = smc_release,
931         .bind           = smc_bind,
932         .connect        = smc_connect,
933         .socketpair     = sock_no_socketpair,
934         .accept         = smc_accept,
935         .getname        = smc_getname,
936         .poll           = smc_poll,
937         .ioctl          = smc_ioctl,
938         .listen         = smc_listen,
939         .shutdown       = smc_shutdown,
940         .setsockopt     = smc_setsockopt,
941         .getsockopt     = smc_getsockopt,
942         .sendmsg        = smc_sendmsg,
943         .recvmsg        = smc_recvmsg,
944         .mmap           = sock_no_mmap,
945         .sendpage       = smc_sendpage,
946         .splice_read    = smc_splice_read,
947 };
948
949 static int smc_create(struct net *net, struct socket *sock, int protocol,
950                       int kern)
951 {
952         struct smc_sock *smc;
953         struct sock *sk;
954         int rc;
955
956         rc = -ESOCKTNOSUPPORT;
957         if (sock->type != SOCK_STREAM)
958                 goto out;
959
960         rc = -EPROTONOSUPPORT;
961         if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
962                 goto out;
963
964         rc = -ENOBUFS;
965         sock->ops = &smc_sock_ops;
966         sk = smc_sock_alloc(net, sock);
967         if (!sk)
968                 goto out;
969
970         /* create internal TCP socket for CLC handshake and fallback */
971         smc = smc_sk(sk);
972         smc->use_fallback = false; /* assume rdma capability first */
973         rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
974                               IPPROTO_TCP, &smc->clcsock);
975         if (rc)
976                 sk_common_release(sk);
977
978 out:
979         return rc;
980 }
981
982 static const struct net_proto_family smc_sock_family_ops = {
983         .family = PF_SMC,
984         .owner  = THIS_MODULE,
985         .create = smc_create,
986 };
987
988 static int __init smc_init(void)
989 {
990         int rc;
991
992         rc = smc_pnet_init();
993         if (rc)
994                 return rc;
995
996         rc = proto_register(&smc_proto, 1);
997         if (rc) {
998                 pr_err("%s: proto_register fails with %d\n", __func__, rc);
999                 goto out_pnet;
1000         }
1001
1002         rc = sock_register(&smc_sock_family_ops);
1003         if (rc) {
1004                 pr_err("%s: sock_register fails with %d\n", __func__, rc);
1005                 goto out_proto;
1006         }
1007
1008         rc = smc_ib_register_client();
1009         if (rc) {
1010                 pr_err("%s: ib_register fails with %d\n", __func__, rc);
1011                 goto out_sock;
1012         }
1013
1014         return 0;
1015
1016 out_sock:
1017         sock_unregister(PF_SMC);
1018 out_proto:
1019         proto_unregister(&smc_proto);
1020 out_pnet:
1021         smc_pnet_exit();
1022         return rc;
1023 }
1024
1025 static void __exit smc_exit(void)
1026 {
1027         smc_ib_unregister_client();
1028         sock_unregister(PF_SMC);
1029         proto_unregister(&smc_proto);
1030         smc_pnet_exit();
1031 }
1032
1033 module_init(smc_init);
1034 module_exit(smc_exit);
1035
1036 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
1037 MODULE_DESCRIPTION("smc socket address family");
1038 MODULE_LICENSE("GPL");
1039 MODULE_ALIAS_NETPROTO(PF_SMC);