1 /* Evaluate MSG_ZEROCOPY
3 * Send traffic between two processes over one of the supported
9 * - SOCK_DGRAM with UDP_CORK
11 * - SOCK_RAW with IP_HDRINCL
20 * Start this program on two connected hosts, one in send mode and
21 * the other with option '-r' to put it in receiver mode.
23 * If zerocopy mode ('-z') is enabled, the sender will verify that
24 * the kernel queues completions on the error queue for all zerocopy
 * transfers.
 */
30 #include <arpa/inet.h>
34 #include <linux/errqueue.h>
35 #include <linux/if_packet.h>
36 #include <linux/ipv6.h>
37 #include <linux/socket.h>
38 #include <linux/sockios.h>
39 #include <net/ethernet.h>
41 #include <netinet/ip.h>
42 #include <netinet/ip6.h>
43 #include <netinet/tcp.h>
44 #include <netinet/udp.h>
52 #include <sys/ioctl.h>
53 #include <sys/socket.h>
56 #include <sys/types.h>
59 #include <linux/rds.h>
/* Local definitions of the zerocopy constants this program uses, so it
 * does not depend on the installed headers providing them.
 */
61 #ifndef SO_EE_ORIGIN_ZEROCOPY
62 #define SO_EE_ORIGIN_ZEROCOPY 5
/* socket option set in do_setup_tx() */
66 #define SO_ZEROCOPY 60
69 #ifndef SO_EE_CODE_ZEROCOPY_COPIED
70 #define SO_EE_CODE_ZEROCOPY_COPIED 1
/* sendmsg() flag added in do_sendmsg() for zerocopy sends */
74 #define MSG_ZEROCOPY 0x4000000
/* Run-time configuration and counters shared by the helpers below.
 * Most cfg_* values are assigned in parse_opts(); the initializers
 * given here are the defaults.
 */
78 static bool cfg_cork_mixed; /* when corking, alternate copy and zerocopy frags */
79 static int cfg_cpu = -1; /* default: pin to last cpu */
80 static int cfg_family = PF_UNSPEC; /* PF_INET or PF_INET6 once parsed */
81 static int cfg_ifindex = 1; /* interface index used for PF_PACKET sends */
82 static int cfg_payload_len; /* payload bytes per message */
83 static int cfg_port = 8000; /* port stored in the socket addresses */
85 static int cfg_runtime_ms = 4200; /* how long the tx/rx loops run, in ms */
86 static int cfg_verbose; /* nonzero: extra stderr output; >= 2: more */
87 static int cfg_waittime_ms = 500; /* poll timeout and final completion wait */
88 static bool cfg_zerocopy; /* send with MSG_ZEROCOPY */
90 static socklen_t cfg_alen; /* address length passed to bind()/connect() */
91 static struct sockaddr_storage cfg_dst_addr;
92 static struct sockaddr_storage cfg_src_addr;
/* filled with a repeating a..z pattern in do_test() */
94 static char payload[IP_MAXPACKET];
/* completions counts notifications received; expected_completions counts
 * the zerocopy sends that should each produce one
 */
95 static long packets, bytes, completions, expected_completions;
/* -1 until the first completion, then the zerocopy status it reported */
96 static int zerocopied = -1;
/* value the low end of the next completion range is compared against */
97 static uint32_t next_completion;
/* Return the current gettimeofday() time expressed in milliseconds. */
99 static unsigned long gettimeofday_ms(void)
103 gettimeofday(&tv, NULL);
/* seconds are scaled up and microseconds scaled down to milliseconds */
104 return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
/* Checksum helper over num_words 16-bit words starting at start.
 *
 * setup_iph() stores the result in the IPv4 header checksum field.
 * NOTE(review): presumably the loop accumulates start[i] into sum and
 * the folded sum is returned inverted -- confirm against the full body.
 */
107 static uint16_t get_ip_csum(const uint16_t *start, int num_words)
109 unsigned long sum = 0;
112 for (i = 0; i < num_words; i++)
/* fold: add the bits above the low 16 back into the low 16 bits */
116 sum = (sum & 0xFFFF) + (sum >> 16);
/* Pin the caller to a cpu with sched_setaffinity().
 *
 * Failing to pin is only reported on stderr, it is not fatal. When
 * pinning succeeds and verbose output is enabled, the cpu is printed.
 */
121 static int do_setcpu(int cpu)
/* pid 0 applies the mask to the calling thread */
127 if (sched_setaffinity(0, sizeof(mask), &mask))
128 fprintf(stderr, "cpu: unable to pin, may increase variance.\n");
129 else if (cfg_verbose)
/* cpu is an int, so it is printed with a signed conversion */
130 fprintf(stderr, "cpu: %d\n", cpu);
/* Set an int-valued socket option on fd.
 *
 * If setsockopt() fails the program exits, reporting the level, the
 * option name and the value that was being set.
 */
135 static void do_setsockopt(int fd, int level, int optname, int val)
137 if (setsockopt(fd, level, optname, &val, sizeof(val)))
138 error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
/* Poll fd for at most cfg_waittime_ms milliseconds.
 *
 * Returns nonzero only when poll() reported activity and at least one
 * of the requested events is set in the returned revents.
 */
141 static int do_poll(int fd, int events)
150 ret = poll(&pfd, 1, cfg_waittime_ms);
152 error(1, errno, "poll");
/* a zero ret (timeout) yields 0 regardless of revents */
154 return ret && (pfd.revents & events);
/* Accept a single connection on a listening socket.
 *
 * The error() calls exit the program for a failed accept() and for a
 * failure to close the listening socket.
 */
157 static int do_accept(int fd)
/* fda is the listening socket; fd is reused for the accepted connection */
161 fd = accept(fda, NULL, NULL);
163 error(1, errno, "accept");
165 error(1, errno, "close listen sock");
/* Write an RDS zerocopy cookie control message into msg's control buffer.
 *
 * msg->msg_control must already point at a buffer large enough for one
 * cmsg carrying a uint32_t cookie; a NULL buffer is fatal.
 */
170 static void add_zcopy_cookie(struct msghdr *msg, uint32_t cookie)
174 if (!msg->msg_control)
/* no system call failed here, so errno carries no information */
175 error(1, 0, "NULL cookie");
176 cm = (void *)msg->msg_control;
177 cm->cmsg_len = CMSG_LEN(sizeof(cookie));
178 cm->cmsg_level = SOL_RDS;
179 cm->cmsg_type = RDS_CMSG_ZCOPY_COOKIE;
/* copy rather than store through a cast pointer */
180 memcpy(CMSG_DATA(cm), &cookie, sizeof(cookie));
/* Send one message on fd with a non-blocking sendmsg(), optionally
 * using MSG_ZEROCOPY.
 *
 * For PF_RDS zerocopy sends, a cookie control message backed by a local
 * buffer is attached before the call and detached again afterwards.
 * Every zerocopy send with a nonzero return is counted in
 * expected_completions.
 */
183 static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain)
185 int ret, len, i, flags;
/* static: the cookie value keeps increasing across calls */
186 static uint32_t cookie;
187 char ckbuf[CMSG_SPACE(sizeof(cookie))];
/* total number of bytes described by the iovec array */
190 for (i = 0; i < msg->msg_iovlen; i++)
191 len += msg->msg_iov[i].iov_len;
193 flags = MSG_DONTWAIT;
195 flags |= MSG_ZEROCOPY;
196 if (domain == PF_RDS) {
/* clear the control buffer itself, including cmsg padding, rather
 * than the msg_control pointer that is overwritten just below
 */
197 memset(ckbuf, 0, sizeof(ckbuf));
198 msg->msg_controllen = CMSG_SPACE(sizeof(cookie));
199 msg->msg_control = (struct cmsghdr *)ckbuf;
200 add_zcopy_cookie(msg, ++cookie);
204 ret = sendmsg(fd, msg, flags);
205 if (ret == -1 && errno == EAGAIN)
208 error(1, errno, "send");
209 if (cfg_verbose && ret != len)
/* ret and len are ints, so they are printed with signed conversions */
210 fprintf(stderr, "send: ret=%d != %d\n", ret, len);
215 if (do_zerocopy && ret)
216 expected_completions++;
/* ckbuf lives on this function's stack: do not leave it referenced */
218 if (do_zerocopy && domain == PF_RDS) {
219 msg->msg_control = NULL;
220 msg->msg_controllen = 0;
/* Send one payload as cfg_cork separate do_sendmsg() calls issued while
 * UDP_CORK is set on the socket; the option is cleared again at the end.
 *
 * NOTE(review): divides by cfg_cork, so this must only be called when
 * cfg_cork is nonzero -- confirm the guard at the call site.
 */
226 static void do_sendmsg_corked(int fd, struct msghdr *msg)
228 bool do_zerocopy = cfg_zerocopy;
229 int i, payload_len, extra_len;
231 /* split up the packet. for non-multiple, make first buffer longer */
232 payload_len = cfg_payload_len / cfg_cork;
/* remainder that does not divide evenly over the cfg_cork pieces */
233 extra_len = cfg_payload_len - (cfg_cork * payload_len);
235 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
237 for (i = 0; i < cfg_cork; i++) {
239 /* in mixed-frags mode, alternate zerocopy and copy frags
240 * start with non-zerocopy, to ensure attach later works
243 do_zerocopy = (i & 1);
245 msg->msg_iov[0].iov_len = payload_len + extra_len;
248 do_sendmsg(fd, msg, do_zerocopy,
249 (cfg_dst_addr.ss_family == AF_INET ?
250 PF_INET : PF_INET6));
253 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
/* Fill in an IPv4 header for a packet carrying payload_len bytes.
 *
 * Addresses come from cfg_src_addr and cfg_dst_addr, the protocol is
 * IPPROTO_EGP and the header checksum is computed last, after the other
 * fields are in place. do_tx() uses the return value as the length of
 * the header iovec.
 */
256 static int setup_iph(struct iphdr *iph, uint16_t payload_len)
258 struct sockaddr_in *daddr = (void *) &cfg_dst_addr;
259 struct sockaddr_in *saddr = (void *) &cfg_src_addr;
261 memset(iph, 0, sizeof(*iph));
267 iph->saddr = saddr->sin_addr.s_addr;
268 iph->daddr = daddr->sin_addr.s_addr;
269 iph->protocol = IPPROTO_EGP;
/* total length covers this header plus the payload, in network order */
270 iph->tot_len = htons(sizeof(*iph) + payload_len);
/* ihl counts 32-bit words; doubling it gives the 16-bit word count */
271 iph->check = get_ip_csum((void *) iph, iph->ihl << 1);
/* Fill in an IPv6 header for a packet carrying payload_len bytes.
 *
 * Addresses come from cfg_src_addr and cfg_dst_addr and the next header
 * is IPPROTO_EGP. Returns the size of the IPv6 header.
 */
276 static int setup_ip6h(struct ipv6hdr *ip6h, uint16_t payload_len)
278 struct sockaddr_in6 *daddr = (void *) &cfg_dst_addr;
279 struct sockaddr_in6 *saddr = (void *) &cfg_src_addr;
281 memset(ip6h, 0, sizeof(*ip6h));
/* unlike the IPv4 tot_len, this length does not include the header */
284 ip6h->payload_len = htons(payload_len);
285 ip6h->nexthdr = IPPROTO_EGP;
287 ip6h->saddr = saddr->sin6_addr;
288 ip6h->daddr = daddr->sin6_addr;
290 return sizeof(*ip6h);
/* Initialise *sockaddr as an IPv4 or IPv6 socket address.
 *
 * The address structure is zeroed, its family and port (cfg_port, in
 * network byte order) are set and str_addr is parsed with inet_pton().
 * A string that does not parse, or an unsupported domain, is fatal.
 */
294 static void setup_sockaddr(int domain, const char *str_addr,
295 struct sockaddr_storage *sockaddr)
/* two typed views of the same caller-provided storage */
297 struct sockaddr_in6 *addr6 = (void *) sockaddr;
298 struct sockaddr_in *addr4 = (void *) sockaddr;
302 memset(addr4, 0, sizeof(*addr4));
303 addr4->sin_family = AF_INET;
304 addr4->sin_port = htons(cfg_port);
306 inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1)
307 error(1, 0, "ipv4 parse error: %s", str_addr);
310 memset(addr6, 0, sizeof(*addr6));
311 addr6->sin6_family = AF_INET6;
312 addr6->sin6_port = htons(cfg_port);
314 inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1)
315 error(1, 0, "ipv6 parse error: %s", str_addr);
318 error(1, 0, "illegal domain");
/* Create and configure the transmit socket.
 *
 * The send buffer is set to 1 << 21 bytes and SO_ZEROCOPY is set on the
 * socket. Sockets other than PF_PACKET and PF_RDS are connected to
 * cfg_dst_addr; PF_RDS sockets are bound to cfg_src_addr instead.
 * Any failing call is fatal.
 */
322 static int do_setup_tx(int domain, int type, int protocol)
326 fd = socket(domain, type, protocol);
328 error(1, errno, "socket t");
330 do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
332 do_setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, 1);
/* packet and rds sockets pass the destination per message in do_tx() */
334 if (domain != PF_PACKET && domain != PF_RDS)
335 if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
336 error(1, errno, "connect");
338 if (domain == PF_RDS) {
339 if (bind(fd, (void *) &cfg_src_addr, cfg_alen))
340 error(1, errno, "bind");
/* Check one batch of RDS zerocopy completion cookies.
 *
 * A batch reporting more than RDS_MAX_ZCOOKIES entries is fatal. With
 * verbosity of 2 or more each cookie value is printed. The caller adds
 * the return value to the completions counter.
 */
346 static uint32_t do_process_zerocopy_cookies(struct rds_zcopy_cookies *ck)
350 if (ck->num > RDS_MAX_ZCOOKIES)
351 error(1, 0, "Returned %d cookies, max expected %d\n",
352 ck->num, RDS_MAX_ZCOOKIES);
353 for (i = 0; i < ck->num; i++)
354 if (cfg_verbose >= 2)
355 fprintf(stderr, "%d\n", ck->cookies[i]);
/* Read RDS zerocopy completions from fd with a non-blocking recvmsg().
 *
 * Only control data is requested. Every RDS_CMSG_ZCOPY_COMPLETION
 * control message is handed to do_process_zerocopy_cookies() and its
 * result added to completions; other control messages are reported and
 * ignored. A truncated control buffer is fatal.
 */
359 static bool do_recvmsg_completion(int fd)
/* room for exactly one cookie batch */
361 char cmsgbuf[CMSG_SPACE(sizeof(struct rds_zcopy_cookies))];
362 struct rds_zcopy_cookies *ck;
363 struct cmsghdr *cmsg;
367 memset(&msg, 0, sizeof(msg));
368 msg.msg_control = cmsgbuf;
369 msg.msg_controllen = sizeof(cmsgbuf);
371 if (recvmsg(fd, &msg, MSG_DONTWAIT))
374 if (msg.msg_flags & MSG_CTRUNC)
/* recvmsg() did not fail here, so errno carries no information */
375 error(1, 0, "recvmsg notification: truncated");
377 for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
378 if (cmsg->cmsg_level == SOL_RDS &&
379 cmsg->cmsg_type == RDS_CMSG_ZCOPY_COMPLETION) {
381 ck = (struct rds_zcopy_cookies *)CMSG_DATA(cmsg);
382 completions += do_process_zerocopy_cookies(ck);
/* status 0: error() only prints and then returns */
386 error(0, 0, "ignoring cmsg at level %d type %d\n",
387 cmsg->cmsg_level, cmsg->cmsg_type);
/* Read one zerocopy completion notification from fd's error queue.
 *
 * PF_RDS sockets are delegated to do_recvmsg_completion(). For other
 * sockets the first control message must be an IP_RECVERR,
 * IPV6_RECVERR or PACKET_TX_TIMESTAMP message whose extended error has
 * origin SO_EE_ORIGIN_ZEROCOPY and a zero error code; anything else,
 * including a truncated control buffer, is fatal. The reported range is
 * compared with next_completion and its size is added to completions.
 */
392 static bool do_recv_completion(int fd, int domain)
394 struct sock_extended_err *serr;
395 struct msghdr msg = {};
397 uint32_t hi, lo, range;
401 if (domain == PF_RDS)
402 return do_recvmsg_completion(fd);
404 msg.msg_control = control;
405 msg.msg_controllen = sizeof(control);
407 ret = recvmsg(fd, &msg, MSG_ERRQUEUE);
408 if (ret == -1 && errno == EAGAIN)
411 error(1, errno, "recvmsg notification");
412 if (msg.msg_flags & MSG_CTRUNC)
/* recvmsg() did not fail here, so errno carries no information */
413 error(1, 0, "recvmsg notification: truncated");
415 cm = CMSG_FIRSTHDR(&msg);
417 error(1, 0, "cmsg: no cmsg");
418 if (!((cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR) ||
419 (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_RECVERR) ||
420 (cm->cmsg_level == SOL_PACKET && cm->cmsg_type == PACKET_TX_TIMESTAMP)))
421 error(1, 0, "serr: wrong type: %d.%d",
422 cm->cmsg_level, cm->cmsg_type);
424 serr = (void *) CMSG_DATA(cm);
426 if (serr->ee_origin != SO_EE_ORIGIN_ZEROCOPY)
427 error(1, 0, "serr: wrong origin: %u", serr->ee_origin);
428 if (serr->ee_errno != 0)
429 error(1, 0, "serr: wrong error code: %u", serr->ee_errno);
435 /* Detect notification gaps. These should not happen often, if at all.
436 * Gaps can occur due to drops, reordering and retransmissions.
438 if (lo != next_completion)
439 fprintf(stderr, "gap: %u..%u does not append to %u\n",
440 lo, hi, next_completion);
441 next_completion = hi + 1;
443 zerocopy = !(serr->ee_code & SO_EE_CODE_ZEROCOPY_COPIED);
444 if (zerocopied == -1)
445 zerocopied = zerocopy;
446 else if (zerocopied != zerocopy) {
447 fprintf(stderr, "serr: inconsistent\n");
448 zerocopied = zerocopy;
451 if (cfg_verbose >= 2)
452 fprintf(stderr, "completed: %u (h=%u l=%u)\n",
455 completions += range;
459 /* Read all outstanding messages on the errqueue */
/* Calls do_recv_completion() repeatedly until it returns false. */
460 static void do_recv_completions(int fd, int domain)
462 while (do_recv_completion(fd, domain)) {}
465 /* Wait for all remaining completions on the errqueue */
/* Polls and drains completions until every expected one has arrived
 * or cfg_waittime_ms has elapsed, then reports any that are still
 * missing. RDS reports completions as readable data (POLLIN); the
 * other socket types report them on the error queue (POLLERR).
 */
466 static void do_recv_remaining_completions(int fd, int domain)
/* same type as gettimeofday_ms() so the comparison below is not mixed */
468 unsigned long tstop = gettimeofday_ms() + cfg_waittime_ms;
470 while (completions < expected_completions &&
471 gettimeofday_ms() < tstop) {
472 if (do_poll(fd, domain == PF_RDS ? POLLIN : POLLERR))
473 do_recv_completions(fd, domain);
476 if (completions < expected_completions)
/* both counters are signed long */
477 fprintf(stderr, "missing notifications: %ld < %ld\n",
478 completions, expected_completions);
/* Sender main loop.
 *
 * Builds the message once and then sends it repeatedly until
 * cfg_runtime_ms has elapsed, draining completion notifications along
 * the way, and finally waits for the remaining completions and prints
 * the totals.
 *
 * The iovec array is laid out back to front: iov[2] is the payload,
 * iov[1] the network header (packet sockets and raw sockets with
 * IPPROTO_RAW) and iov[0] the link-layer header (PF_PACKET/SOCK_RAW).
 * msg_iov points at the last msg_iovlen entries of that array.
 */
481 static void do_tx(int domain, int type, int protocol)
483 struct iovec iov[3] = { {0} };
484 struct sockaddr_ll laddr;
485 struct msghdr msg = {0};
494 fd = do_setup_tx(domain, type, protocol);
496 if (domain == PF_PACKET) {
497 uint16_t proto = cfg_family == PF_INET ? ETH_P_IP : ETH_P_IPV6;
499 /* sock_raw passes ll header as data */
500 if (type == SOCK_RAW) {
501 memset(eth.h_dest, 0x06, ETH_ALEN);
502 memset(eth.h_source, 0x02, ETH_ALEN);
503 eth.h_proto = htons(proto);
/* address of the ethernet header filled in just above */
504 iov[0].iov_base = &eth;
505 iov[0].iov_len = sizeof(eth);
509 /* both sock_raw and sock_dgram expect name */
510 memset(&laddr, 0, sizeof(laddr));
511 laddr.sll_family = AF_PACKET;
512 laddr.sll_ifindex = cfg_ifindex;
513 laddr.sll_protocol = htons(proto);
514 laddr.sll_halen = ETH_ALEN;
516 memset(laddr.sll_addr, 0x06, ETH_ALEN);
518 msg.msg_name = &laddr;
519 msg.msg_namelen = sizeof(laddr);
522 /* packet and raw sockets with hdrincl must pass network header */
523 if (domain == PF_PACKET || protocol == IPPROTO_RAW) {
524 if (cfg_family == PF_INET)
525 iov[1].iov_len = setup_iph(&nh.iph, cfg_payload_len);
527 iov[1].iov_len = setup_ip6h(&nh.ip6h, cfg_payload_len);
529 iov[1].iov_base = (void *) &nh;
/* rds sockets are not connected, so name the destination per message */
533 if (domain == PF_RDS) {
534 msg.msg_name = &cfg_dst_addr;
535 msg.msg_namelen = (cfg_dst_addr.ss_family == AF_INET ?
536 sizeof(struct sockaddr_in) :
537 sizeof(struct sockaddr_in6));
540 iov[2].iov_base = payload;
541 iov[2].iov_len = cfg_payload_len;
543 msg.msg_iov = &iov[3 - msg.msg_iovlen];
545 tstop = gettimeofday_ms() + cfg_runtime_ms;
548 do_sendmsg_corked(fd, &msg);
550 do_sendmsg(fd, &msg, cfg_zerocopy, domain);
/* while the socket is not writable, drain completions */
552 while (!do_poll(fd, POLLOUT)) {
554 do_recv_completions(fd, domain);
557 } while (gettimeofday_ms() < tstop);
560 do_recv_remaining_completions(fd, domain);
563 error(1, errno, "close");
/* the counters are signed long */
565 fprintf(stderr, "tx=%ld (%ld MB) txc=%ld zc=%c\n",
566 packets, bytes >> 20, completions,
567 zerocopied == 1 ? 'y' : 'n');
/* Create and configure the receive socket.
 *
 * PF_PACKET and SOCK_RAW with IPPROTO_RAW are rejected. The socket gets
 * a receive buffer of 1 << 21 bytes, a receive low-water mark of
 * 1 << 16 bytes and SO_REUSEPORT, and is bound to cfg_dst_addr.
 * Any failing call is fatal.
 */
570 static int do_setup_rx(int domain, int type, int protocol)
574 /* If tx over PF_PACKET, rx over PF_INET(6)/SOCK_RAW,
575 * to recv the only copy of the packet, not a clone
577 if (domain == PF_PACKET)
578 error(1, 0, "Use PF_INET/SOCK_RAW to read");
580 if (type == SOCK_RAW && protocol == IPPROTO_RAW)
581 error(1, 0, "IPPROTO_RAW: not supported on Rx");
583 fd = socket(domain, type, protocol);
585 error(1, errno, "socket r");
587 do_setsockopt(fd, SOL_SOCKET, SO_RCVBUF, 1 << 21);
588 do_setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, 1 << 16);
589 do_setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, 1);
591 if (bind(fd, (void *) &cfg_dst_addr, cfg_alen))
592 error(1, errno, "bind");
594 if (type == SOCK_STREAM) {
596 error(1, errno, "listen");
603 /* Flush all outstanding bytes for the tcp receive queue */
/* Uses a non-blocking recv() of up to 1 << 21 bytes with no buffer. */
604 static void do_flush_tcp(int fd)
608 /* MSG_TRUNC flushes up to len bytes */
/* NULL buffer: the flushed bytes are not copied anywhere */
609 ret = recv(fd, NULL, 1 << 21, MSG_TRUNC | MSG_DONTWAIT);
610 if (ret == -1 && errno == EAGAIN)
613 error(1, errno, "flush");
621 /* Flush all outstanding datagrams. Verify first few bytes of each. */
/* A datagram whose length differs from cfg_payload_len, or whose
 * leading bytes differ from the payload pattern, is fatal.
 */
622 static void do_flush_datagram(int fd, int type)
627 /* MSG_TRUNC will return full datagram length */
628 ret = recv(fd, buf, sizeof(buf), MSG_DONTWAIT | MSG_TRUNC);
629 if (ret == -1 && errno == EAGAIN)
632 /* raw ipv4 return with header, raw ipv6 without */
633 if (cfg_family == PF_INET && type == SOCK_RAW) {
634 off += sizeof(struct iphdr);
635 ret -= sizeof(struct iphdr);
639 error(1, errno, "recv");
640 if (ret != cfg_payload_len)
641 error(1, 0, "recv: ret=%u != %u", ret, cfg_payload_len);
/* only the part that fits in buf after the header can be compared */
642 if (ret > sizeof(buf) - off)
643 ret = sizeof(buf) - off;
644 if (memcmp(buf + off, payload, ret))
645 error(1, 0, "recv: data mismatch");
648 bytes += cfg_payload_len;
/* Receiver main loop.
 *
 * Sets up the receive socket and keeps flushing incoming data until
 * cfg_runtime_ms plus an extra 400 ms have elapsed, then prints the
 * totals.
 */
651 static void do_rx(int domain, int type, int protocol)
/* the deadline below extends past cfg_runtime_ms by this much */
653 const int cfg_receiver_wait_ms = 400;
657 fd = do_setup_rx(domain, type, protocol);
659 tstop = gettimeofday_ms() + cfg_runtime_ms + cfg_receiver_wait_ms;
661 if (type == SOCK_STREAM)
664 do_flush_datagram(fd, type);
668 } while (gettimeofday_ms() < tstop);
671 error(1, errno, "close");
/* the counters are signed long */
673 fprintf(stderr, "rx=%ld (%ld MB)\n", packets, bytes >> 20);
/* Run one test for the given socket domain, type and protocol.
 *
 * Corking is rejected for PF_PACKET and for anything that is not
 * SOCK_DGRAM. The payload buffer is filled with a repeating a..z
 * pattern before do_rx() or do_tx() is called.
 */
676 static void do_test(int domain, int type, int protocol)
680 if (cfg_cork && (domain == PF_PACKET || type != SOCK_DGRAM))
681 error(1, 0, "can only cork udp sockets");
/* do_flush_datagram() compares received data against this pattern */
685 for (i = 0; i < IP_MAXPACKET; i++)
686 payload[i] = 'a' + (i % 26);
689 do_rx(domain, type, protocol);
691 do_tx(domain, type, protocol);
/* Print the usage line for this program and exit with status 1. */
694 static void usage(const char *filepath)
696 error(1, 0, "Usage: %s [options] <test>", filepath);
/* Parse the command line into the cfg_* globals and validate them.
 *
 * The payload length defaults to the largest value that still leaves
 * room in payload[] for an IPv6 header, a TCP header and 40 bytes of
 * TCP options. The last argument is taken as the test name; for "rds"
 * the address options are mandatory. An oversized payload, or mixed
 * corking without both zerocopy and corking, is fatal.
 *
 * NOTE(review): the numeric options are converted with strtol() and
 * strtoul() without an end pointer, so malformed numbers are not
 * detected.
 */
699 static void parse_opts(int argc, char **argv)
701 const int max_payload_len = sizeof(payload) -
702 sizeof(struct ipv6hdr) -
703 sizeof(struct tcphdr) -
704 40 /* max tcp options */;
706 char *daddr = NULL, *saddr = NULL;
709 cfg_payload_len = max_payload_len;
711 while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
/* the address family may only be chosen once */
714 if (cfg_family != PF_UNSPEC)
715 error(1, 0, "Pass one of -4 or -6");
716 cfg_family = PF_INET;
717 cfg_alen = sizeof(struct sockaddr_in);
720 if (cfg_family != PF_UNSPEC)
721 error(1, 0, "Pass one of -4 or -6");
722 cfg_family = PF_INET6;
723 cfg_alen = sizeof(struct sockaddr_in6);
726 cfg_cork = strtol(optarg, NULL, 0);
729 cfg_cpu = strtol(optarg, NULL, 0);
/* the interface is given by name and stored as an index */
735 cfg_ifindex = if_nametoindex(optarg);
736 if (cfg_ifindex == 0)
737 error(1, errno, "invalid iface: %s", optarg);
740 cfg_cork_mixed = true;
743 cfg_port = strtoul(optarg, NULL, 0);
749 cfg_payload_len = strtoul(optarg, NULL, 0);
/* argument is in seconds; 200 ms are added on top */
755 cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
766 cfg_test = argv[argc - 1];
767 if (strcmp(cfg_test, "rds") == 0) {
769 error(1, 0, "-D <server addr> required for PF_RDS\n");
770 if (!cfg_rx && !saddr)
771 error(1, 0, "-S <client addr> required for PF_RDS\n");
773 setup_sockaddr(cfg_family, daddr, &cfg_dst_addr);
774 setup_sockaddr(cfg_family, saddr, &cfg_src_addr);
776 if (cfg_payload_len > max_payload_len)
777 error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
778 if (cfg_cork_mixed && (!cfg_zerocopy || !cfg_cork))
779 error(1, 0, "-m: cork_mixed requires corking and zerocopy");
/* exactly one non-option argument, the test name, must remain */
781 if (optind != argc - 1)
/* Parse the options, then dispatch on the test name given as the last
 * argument: each name selects the socket domain, type and protocol
 * passed to do_test(). An unrecognised name is fatal.
 */
785 int main(int argc, char **argv)
787 const char *cfg_test;
789 parse_opts(argc, argv);
791 cfg_test = argv[argc - 1];
793 if (!strcmp(cfg_test, "packet"))
794 do_test(PF_PACKET, SOCK_RAW, 0);
795 else if (!strcmp(cfg_test, "packet_dgram"))
796 do_test(PF_PACKET, SOCK_DGRAM, 0);
/* the ip-level tests use the family chosen on the command line */
797 else if (!strcmp(cfg_test, "raw"))
798 do_test(cfg_family, SOCK_RAW, IPPROTO_EGP);
799 else if (!strcmp(cfg_test, "raw_hdrincl"))
800 do_test(cfg_family, SOCK_RAW, IPPROTO_RAW);
801 else if (!strcmp(cfg_test, "tcp"))
802 do_test(cfg_family, SOCK_STREAM, 0);
803 else if (!strcmp(cfg_test, "udp"))
804 do_test(cfg_family, SOCK_DGRAM, 0);
805 else if (!strcmp(cfg_test, "rds"))
806 do_test(PF_RDS, SOCK_SEQPACKET, 0);
808 error(1, 0, "unknown cfg_test %s", cfg_test);