2 * Shared Memory Communications over RDMA (SMC-R) and RoCE
4 * AF_SMC protocol family socket handler keeping the AF_INET sock address type
5 * applies to SOCK_STREAM sockets only
6 * offers an alternative communication option for TCP-protocol sockets
7 * applicable with RoCE-cards only
9 * Copyright IBM Corp. 2016
11 * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com>
12 * based on prototype from Frank Blaschka
/* Log prefix setup: pr_fmt() places the component name, a colon and a
 * space in front of the format string handed to it.
 */
15 #define KMSG_COMPONENT "smc"
16 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
18 #include <linux/module.h>
19 #include <linux/socket.h>
/* Forward a keepalive setting to the internal CLC socket.
 * @sk:  SMC sock whose clcsock receives the setting
 * @val: value handed unchanged to the keepalive handler found in the
 *       sk_prot of the clcsock
 */
24 static void smc_set_keepalive(struct sock *sk, int val)
26 struct smc_sock *smc = smc_sk(sk);
28 smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
/* Protocol descriptor passed to sk_alloc() and proto_register() in this
 * file. The visible initializers set the keepalive handler, size each
 * object as a struct smc_sock, and request SLAB_DESTROY_BY_RCU.
 * NOTE(review): further initializers may exist on lines not shown here.
 */
31 static struct proto smc_proto = {
34 .keepalive = smc_set_keepalive,
35 .obj_size = sizeof(struct smc_sock),
36 .slab_flags = SLAB_DESTROY_BY_RCU,
/* Release handler of smc_sock_ops.
 * The visible statements mark the sock as SMC_CLOSED and release the
 * internal CLC socket with sock_release().
 * NOTE(review): guards, locking and the return value are on lines not
 * shown in this extract -- confirm against the full function.
 */
39 static int smc_release(struct socket *sock)
41 struct sock *sk = sock->sk;
50 sk->sk_state = SMC_CLOSED;
52 sock_release(smc->clcsock);
/* Destructor installed as sk->sk_destruct by smc_sock_alloc().
 * It tests whether the sock state differs from SMC_CLOSED and whether
 * SOCK_DEAD is unset, then drops the refcount debug counter.
 * NOTE(review): what follows each test is not visible here; presumably
 * an early return -- confirm.
 */
66 static void smc_destruct(struct sock *sk)
68 if (sk->sk_state != SMC_CLOSED)
70 if (!sock_flag(sk, SOCK_DEAD))
73 sk_refcnt_debug_dec(sk);
/* Allocate and initialise a sock for the PF_SMC family.
 * @net:  network namespace handed to sk_alloc()
 * @sock: socket handed to sock_init_data(); smc_clcsock_accept() passes
 *        NULL here
 *
 * The new sock is allocated with GFP_KERNEL from smc_proto, starts in
 * state SMC_INIT, uses smc_destruct() as destructor and SMCPROTO_SMC as
 * protocol.
 * NOTE(review): the allocation failure check and the return statement
 * are not visible in this extract.
 */
76 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
81 sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
85 sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
86 sk->sk_state = SMC_INIT;
87 sk->sk_destruct = smc_destruct;
88 sk->sk_protocol = SMCPROTO_SMC;
89 sk_refcnt_debug_inc(sk);
/* Bind handler: validates the address and binds the internal CLC socket.
 * @sock:  SMC socket to bind
 * @uaddr: address, interpreted as struct sockaddr_in
 *
 * Visible checks, in order: the address length must cover a struct
 * sockaddr_in; the family must be AF_INET, or AF_UNSPEC combined with
 * INADDR_ANY; the sock must still be in state SMC_INIT.
 * The result of kernel_bind() on the clcsock is stored in rc.
 * NOTE(review): the error codes chosen for failed checks and the final
 * return are on lines not shown in this extract.
 */
96 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
99 struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
100 struct sock *sk = sock->sk;
101 struct smc_sock *smc;
106 /* replicate tests from inet_bind(), to be safe wrt. future changes */
108 if (addr_len < sizeof(struct sockaddr_in))
112 /* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
113 if ((addr->sin_family != AF_INET) &&
114 ((addr->sin_family != AF_UNSPEC) ||
115 (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
120 /* Check if socket is already active */
122 if (sk->sk_state != SMC_INIT)
/* carry the reuse setting over to the clc socket before binding it */
125 smc->clcsock->sk->sk_reuse = sk->sk_reuse;
126 rc = kernel_bind(smc->clcsock, uaddr, addr_len);
/* Copy a fixed set of sock fields and a masked subset of flags.
 * @nsk: destination sock
 * @osk: source sock
 *
 * Copies type, send and receive buffer sizes, send and receive timeouts,
 * mark, priority, receive low-water mark, bound device and pending error
 * from osk to nsk. Flag bits selected by mask are replaced in nsk by the
 * corresponding bits of osk; all other flag bits of nsk are kept.
 * NOTE(review): the declaration of mask sits on a signature line that is
 * not visible in this extract.
 */
134 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
137 /* options we don't get control via setsockopt for */
138 nsk->sk_type = osk->sk_type;
139 nsk->sk_sndbuf = osk->sk_sndbuf;
140 nsk->sk_rcvbuf = osk->sk_rcvbuf;
141 nsk->sk_sndtimeo = osk->sk_sndtimeo;
142 nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
143 nsk->sk_mark = osk->sk_mark;
144 nsk->sk_priority = osk->sk_priority;
145 nsk->sk_rcvlowat = osk->sk_rcvlowat;
146 nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
147 nsk->sk_err = osk->sk_err;
/* clear the masked bits first, then take them over from osk */
149 nsk->sk_flags &= ~mask;
150 nsk->sk_flags |= osk->sk_flags & mask;
/* Bit mask of sock flags taken over from the smc sock to the clc sock;
 * used as the mask argument in smc_copy_sock_settings_to_clc().
 * Each entry is the bit position of one SOCK_* flag.
 */
153 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
154 (1UL << SOCK_KEEPOPEN) | \
155 (1UL << SOCK_LINGER) | \
156 (1UL << SOCK_BROADCAST) | \
157 (1UL << SOCK_TIMESTAMP) | \
158 (1UL << SOCK_DBG) | \
159 (1UL << SOCK_RCVTSTAMP) | \
160 (1UL << SOCK_RCVTSTAMPNS) | \
161 (1UL << SOCK_LOCALROUTE) | \
162 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
163 (1UL << SOCK_RXQ_OVFL) | \
164 (1UL << SOCK_WIFI_STATUS) | \
165 (1UL << SOCK_NOFCS) | \
166 (1UL << SOCK_FILTER_LOCKED))
167 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
168 * clc socket (since smc is not called for these options from net/core)
/* Thin wrapper around smc_copy_sock_settings(): destination is the sock
 * of the clcsock, source is the smc sock, and the flag mask is
 * SK_FLAGS_SMC_TO_CLC.
 */
170 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
172 smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
/* Bit mask of sock flags taken over from the clc sock to the smc sock;
 * used as the mask argument in smc_copy_sock_settings_to_smc().
 * NOTE(review): the last visible line still ends in a continuation, so
 * the closing part of this definition appears to be missing from this
 * extract -- restore it from the full file.
 */
175 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
176 (1UL << SOCK_KEEPOPEN) | \
177 (1UL << SOCK_LINGER) | \
179 /* copy only settings and flags relevant for smc from clc to smc socket: calls smc_copy_sock_settings() with the smc sock as destination, the sock of the clcsock as source and SK_FLAGS_CLC_TO_SMC as flag mask */
180 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
182 smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
/* Connect handler: connects the internal CLC socket.
 * @sock: SMC socket to connect
 * @addr: peer address; its family is compared against AF_INET
 *
 * Visible steps: check that alen covers sa_family, check the family,
 * dispatch on the sock state, copy the socket settings to the clc
 * socket, call kernel_connect() on it, then mark the sock SMC_ACTIVE
 * and enable the TCP fallback.
 * NOTE(review): the case labels of the switch, the error codes and the
 * handling of the kernel_connect() result are on lines not shown here.
 */
185 static int smc_connect(struct socket *sock, struct sockaddr *addr,
188 struct sock *sk = sock->sk;
189 struct smc_sock *smc;
194 /* separate smc parameter checking to be safe */
195 if (alen < sizeof(addr->sa_family))
197 if (addr->sa_family != AF_INET)
201 switch (sk->sk_state) {
212 smc_copy_sock_settings_to_clc(smc);
213 rc = kernel_connect(smc->clcsock, addr, alen, flags);
217 sk->sk_state = SMC_ACTIVE;
219 /* always use TCP fallback as transport mechanism for now;
220 * This will change once RDMA transport is implemented
222 smc->use_fallback = true;
/* Accept a connection on the CLC socket of a listening SMC socket.
 * @lsmc:    listening SMC socket
 * @new_smc: receives the newly allocated SMC socket
 *
 * A new sock is allocated in the namespace of the listener without an
 * attached struct socket, kernel_accept() is called on the clcsock of
 * the listener with flags 0, and the accepted socket becomes the
 * clcsock of the new SMC socket.
 * NOTE(review): the ENOMEM assignment presumably belongs to the failed
 * allocation path; the conditions and returns are not visible here.
 */
230 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
232 struct sock *sk = &lsmc->sk;
233 struct socket *new_clcsock;
237 new_sk = smc_sock_alloc(sock_net(sk), NULL);
240 lsmc->sk.sk_err = ENOMEM;
244 *new_smc = smc_sk(new_sk);
246 rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
253 (*new_smc)->clcsock = new_clcsock;
/* Listen handler: puts the internal CLC socket into listening mode.
 * @sock:    SMC socket
 * @backlog: stored in sk_max_ack_backlog and passed to kernel_listen()
 *
 * The state test accepts only SMC_INIT and SMC_LISTEN. For a sock that
 * is already listening the backlog is updated. Otherwise the settings
 * are copied to the clc socket, kernel_listen() is called on it, and
 * the sock records the backlog, a zero ack backlog and state SMC_LISTEN.
 * NOTE(review): error codes, branch ends and the return are on lines
 * not shown in this extract.
 */
258 static int smc_listen(struct socket *sock, int backlog)
260 struct sock *sk = sock->sk;
261 struct smc_sock *smc;
268 if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
272 if (sk->sk_state == SMC_LISTEN) {
273 sk->sk_max_ack_backlog = backlog;
276 /* some socket options are handled in core, so we could not apply
277 * them to the clc socket -- copy smc socket options to clc socket
279 smc_copy_sock_settings_to_clc(smc);
281 rc = kernel_listen(smc->clcsock, backlog);
284 sk->sk_max_ack_backlog = backlog;
285 sk->sk_ack_backlog = 0;
286 sk->sk_state = SMC_LISTEN;
/* Accept handler: accepts on the CLC socket and wires up a new SMC sock.
 * @sock:     listening SMC socket
 * @new_sock: socket the accepted sock is grafted onto
 *
 * Visible steps: test for state SMC_LISTEN, call smc_clcsock_accept(),
 * graft the new sock onto new_sock, mark it SMC_ACTIVE, copy settings
 * from its clc socket and enable the TCP fallback.
 * NOTE(review): error handling and the return are not visible here.
 */
293 static int smc_accept(struct socket *sock, struct socket *new_sock,
296 struct smc_sock *new_smc;
297 struct sock *sk = sock->sk;
298 struct smc_sock *lsmc;
304 if (lsmc->sk.sk_state != SMC_LISTEN) {
309 rc = smc_clcsock_accept(lsmc, &new_smc);
312 sock_graft(&new_smc->sk, new_sock);
313 new_smc->sk.sk_state = SMC_ACTIVE;
315 smc_copy_sock_settings_to_smc(new_smc);
317 /* always use TCP fallback as transport mechanism for now;
318 * This will change once RDMA transport is implemented
320 new_smc->use_fallback = true;
/* Getname handler: delegates to the getname operation of the clcsock
 * and returns its result.
 * A peer query on a sock that is not SMC_ACTIVE is tested for first.
 * NOTE(review): the outcome of that test is on a line not shown here.
 */
327 static int smc_getname(struct socket *sock, struct sockaddr *addr,
330 struct smc_sock *smc;
332 if (peer && (sock->sk->sk_state != SMC_ACTIVE))
335 smc = smc_sk(sock->sk);
337 return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
/* Sendmsg handler.
 * The sock state is tested against SMC_ACTIVE. With use_fallback set the
 * message goes to the sendmsg operation of the clcsock; the other
 * visible call is sock_no_sendmsg().
 * NOTE(review): the else keyword, the error code and the return are on
 * lines not shown in this extract.
 */
340 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
342 struct sock *sk = sock->sk;
343 struct smc_sock *smc;
348 if (sk->sk_state != SMC_ACTIVE)
350 if (smc->use_fallback)
351 rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
353 rc = sock_no_sendmsg(sock, msg, len);
/* Recvmsg handler.
 * The state test lets SMC_ACTIVE and SMC_CLOSED through. With
 * use_fallback set the call goes to the recvmsg operation of the
 * clcsock; the other visible call is sock_no_recvmsg().
 * NOTE(review): the else keyword, the error code and the return are on
 * lines not shown in this extract.
 */
359 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
362 struct sock *sk = sock->sk;
363 struct smc_sock *smc;
368 if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
371 if (smc->use_fallback)
372 rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
374 rc = sock_no_recvmsg(sock, msg, len, flags);
/* Poll handler.
 * For states SMC_INIT and SMC_LISTEN the poll operation of the clcsock
 * supplies the mask. If the sock is still SMC_INIT and the mask reports
 * POLLOUT, the sock becomes SMC_ACTIVE with the TCP fallback enabled.
 * The other visible call is sock_no_poll().
 * NOTE(review): the first condition continues on a line not shown here,
 * so further cases may reach the clcsock poll; the return is not
 * visible either.
 */
380 static unsigned int smc_poll(struct file *file, struct socket *sock,
383 struct sock *sk = sock->sk;
384 unsigned int mask = 0;
385 struct smc_sock *smc;
387 smc = smc_sk(sock->sk);
388 if ((sk->sk_state == SMC_INIT) || (sk->sk_state == SMC_LISTEN) ||
390 mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
391 /* if non-blocking connect finished ... */
393 if ((sk->sk_state == SMC_INIT) && (mask & POLLOUT)) {
394 sk->sk_state = SMC_ACTIVE;
395 /* always use TCP fallback as transport mechanism;
396 * This will change once RDMA transport is implemented
398 smc->use_fallback = true;
402 mask = sock_no_poll(file, sock, wait);
/* Shutdown handler.
 * @sock: SMC socket
 * @how:  tested against the range SHUT_RD to SHUT_RDWR
 *
 * A sock already in state SMC_CLOSED is tested for separately. With
 * use_fallback set, kernel_sock_shutdown() runs on the clcsock and its
 * sk_shutdown value is mirrored into the smc sock; once that value
 * equals SHUTDOWN_MASK the sock is marked SMC_CLOSED. The other visible
 * call is sock_no_shutdown().
 * NOTE(review): error codes, the else keyword and the return are on
 * lines not shown in this extract.
 */
408 static int smc_shutdown(struct socket *sock, int how)
410 struct sock *sk = sock->sk;
411 struct smc_sock *smc;
416 if ((how < SHUT_RD) || (how > SHUT_RDWR))
422 if (sk->sk_state == SMC_CLOSED)
424 if (smc->use_fallback) {
425 rc = kernel_sock_shutdown(smc->clcsock, how);
426 sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
427 if (sk->sk_shutdown == SHUTDOWN_MASK)
428 sk->sk_state = SMC_CLOSED;
430 rc = sock_no_shutdown(sock, how);
/* Setsockopt handler: hands level, optname and the user buffer to the
 * setsockopt operation of the clcsock and returns its result.
 */
440 static int smc_setsockopt(struct socket *sock, int level, int optname,
441 char __user *optval, unsigned int optlen)
443 struct sock *sk = sock->sk;
444 struct smc_sock *smc;
448 /* generic setsockopts reaching us here always apply to the
451 return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
/* Getsockopt handler: hands level, optname and the user buffers to the
 * getsockopt operation of the clcsock and returns its result.
 */
455 static int smc_getsockopt(struct socket *sock, int level, int optname,
456 char __user *optval, int __user *optlen)
458 struct smc_sock *smc;
460 smc = smc_sk(sock->sk);
461 /* socket options apply to the CLC socket */
462 return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
/* Ioctl handler: with use_fallback set the request goes to the ioctl
 * operation of the clcsock; the remaining visible path returns the
 * result of sock_no_ioctl().
 */
466 static int smc_ioctl(struct socket *sock, unsigned int cmd,
469 struct smc_sock *smc;
471 smc = smc_sk(sock->sk);
472 if (smc->use_fallback)
473 return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
475 return sock_no_ioctl(sock, cmd, arg);
/* Sendpage handler.
 * The sock state is tested against SMC_ACTIVE. With use_fallback set the
 * page is sent through kernel_sendpage() on the clcsock; the other
 * visible call is sock_no_sendpage().
 * NOTE(review): the else keyword, the error code and the return are on
 * lines not shown in this extract.
 */
478 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
479 int offset, size_t size, int flags)
481 struct sock *sk = sock->sk;
482 struct smc_sock *smc;
487 if (sk->sk_state != SMC_ACTIVE)
489 if (smc->use_fallback)
490 rc = kernel_sendpage(smc->clcsock, page, offset,
493 rc = sock_no_sendpage(sock, page, offset, size, flags);
/* Splice-read handler.
 * The state test lets SMC_ACTIVE and SMC_CLOSED through. With
 * use_fallback set the call goes to the splice_read operation of the
 * clcsock.
 * NOTE(review): the non-fallback path and the return are on lines not
 * shown in this extract.
 */
500 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
501 struct pipe_inode_info *pipe, size_t len,
504 struct sock *sk = sock->sk;
505 struct smc_sock *smc;
510 if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
512 if (smc->use_fallback) {
513 rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
523 /* must look like tcp: socket operation table installed by smc_create(); socketpair and mmap use the generic sock_no_* stubs, all other visible entries point at handlers of this file */
524 static const struct proto_ops smc_sock_ops = {
526 .owner = THIS_MODULE,
527 .release = smc_release,
529 .connect = smc_connect,
530 .socketpair = sock_no_socketpair,
531 .accept = smc_accept,
532 .getname = smc_getname,
535 .listen = smc_listen,
536 .shutdown = smc_shutdown,
537 .setsockopt = smc_setsockopt,
538 .getsockopt = smc_getsockopt,
539 .sendmsg = smc_sendmsg,
540 .recvmsg = smc_recvmsg,
541 .mmap = sock_no_mmap,
542 .sendpage = smc_sendpage,
543 .splice_read = smc_splice_read,
/* Create handler of the PF_SMC family.
 * @net:      namespace for both the SMC sock and the internal TCP socket
 * @sock:     socket being created; must be of type SOCK_STREAM
 * @protocol: must be IPPROTO_IP or IPPROTO_TCP
 *
 * rc is preset to -ESOCKTNOSUPPORT before the type test and to
 * -EPROTONOSUPPORT before the protocol test. After that the operation
 * table is installed, the sock is allocated and a kernel TCP socket is
 * created and stored as clcsock.
 * NOTE(review): sk_common_release() presumably runs only when creating
 * the TCP socket fails; the conditions and the return are not visible.
 */
546 static int smc_create(struct net *net, struct socket *sock, int protocol,
549 struct smc_sock *smc;
553 rc = -ESOCKTNOSUPPORT;
554 if (sock->type != SOCK_STREAM)
557 rc = -EPROTONOSUPPORT;
558 if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
562 sock->ops = &smc_sock_ops;
563 sk = smc_sock_alloc(net, sock);
567 /* create internal TCP socket for CLC handshake and fallback */
569 rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
570 IPPROTO_TCP, &smc->clcsock);
572 sk_common_release(sk);
/* Family descriptor registered by smc_init() through sock_register();
 * smc_create() is its create callback.
 * NOTE(review): the family member is on a line not shown in this extract.
 */
578 static const struct net_proto_family smc_sock_family_ops = {
580 .owner = THIS_MODULE,
581 .create = smc_create,
/* Module init: registers smc_proto, then the socket family.
 * Each failing registration is logged with the function name and rc.
 * NOTE(review): proto_unregister() presumably undoes the first step
 * when sock_register() fails; the conditions, labels and return are on
 * lines not shown in this extract.
 */
584 static int __init smc_init(void)
588 rc = proto_register(&smc_proto, 1);
590 pr_err("%s: proto_register fails with %d\n", __func__, rc);
594 rc = sock_register(&smc_sock_family_ops);
596 pr_err("%s: sock_register fails with %d\n", __func__, rc);
603 proto_unregister(&smc_proto);
/* Module exit: unregisters the PF_SMC family first, then smc_proto,
 * the reverse of the registration order used in smc_init().
 */
608 static void __exit smc_exit(void)
610 sock_unregister(PF_SMC);
611 proto_unregister(&smc_proto);
/* Module entry and exit points, followed by module metadata and the
 * alias for protocol family PF_SMC.
 */
614 module_init(smc_init);
615 module_exit(smc_exit);
617 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
618 MODULE_DESCRIPTION("smc socket address family");
619 MODULE_LICENSE("GPL");
620 MODULE_ALIAS_NETPROTO(PF_SMC);