1 /* SPDX-License-Identifier: MIT */
2 /* based on linux-kernel/tools/testing/selftests/net/msg_zerocopy.c */
15 #include <arpa/inet.h>
16 #include <linux/errqueue.h>
17 #include <linux/if_packet.h>
18 #include <linux/io_uring.h>
19 #include <linux/ipv6.h>
20 #include <linux/socket.h>
21 #include <linux/sockios.h>
22 #include <net/ethernet.h>
24 #include <netinet/in.h>
25 #include <netinet/ip.h>
26 #include <netinet/ip6.h>
27 #include <netinet/tcp.h>
28 #include <netinet/udp.h>
29 #include <sys/ioctl.h>
31 #include <sys/resource.h>
32 #include <sys/socket.h>
35 #include <sys/types.h>
/* cqe->user_data tag identifying notification completions (vs. send CQEs). */
41 #define NOTIF_TAG 0xfffffffULL
/*
 * Benchmark configuration, filled in from the command line by parse_opts().
 * NOTE(review): this dump elides lines (stale per-line numbers skip), so
 * some cfg_* globals and the MODE_*/ZC_TAG/NONZC_TAG definitions are not
 * visible here.
 */
52 static bool cfg_flush = false;
53 static bool cfg_cork = false;
54 static int cfg_mode = MODE_ZC_FIXED;
55 static int cfg_nr_reqs = 8;
56 static int cfg_family = PF_UNSPEC;
57 static int cfg_payload_len;
58 static int cfg_port = 8000;
59 static int cfg_runtime_ms = 4200;
61 static socklen_t cfg_alen;
62 static struct sockaddr_storage cfg_dst_addr;
/* Send payload; page-aligned so it can be registered as a fixed buffer. */
64 static char payload[IP_MAXPACKET] __attribute__((aligned(4096)));
/*
 * Minimal liburing-style ring bookkeeping structures.
 * NOTE(review): the struct headers ("struct io_uring_sq {", "struct
 * io_uring_cq {", "struct io_uring {") and several fields are elided in
 * this dump; the pointers below reference the kernel's mmap'ed SQ/CQ ring
 * fields (head/tail/mask/entries live in shared memory — see io_uring_mmap).
 */
70 unsigned *ring_entries;
79 unsigned *ring_entries;
80 struct io_uring_cqe *cqes;
87 unsigned *kring_entries;
91 struct io_uring_sqe *sqes;
102 unsigned *kring_mask;
103 unsigned *kring_entries;
105 struct io_uring_cqe *cqes;
111 struct io_uring_sq sq;
112 struct io_uring_cq cq;
/*
 * Raw io_uring syscall numbers, for libcs whose headers predate them.
 * Alpha uses its own syscall numbering (535..537); every other arch uses
 * the generic 425..427.
 * NOTE(review): the matching #endif lines are elided in this dump.
 */
117 # ifndef __NR_io_uring_setup
118 # define __NR_io_uring_setup 535
120 # ifndef __NR_io_uring_enter
121 # define __NR_io_uring_enter 536
123 # ifndef __NR_io_uring_register
124 # define __NR_io_uring_register 537
126 #else /* !__alpha__ */
127 # ifndef __NR_io_uring_setup
128 # define __NR_io_uring_setup 425
130 # ifndef __NR_io_uring_enter
131 # define __NR_io_uring_enter 426
133 # ifndef __NR_io_uring_register
134 # define __NR_io_uring_register 427
/*
 * Ring head/tail ordering: x86 is strongly ordered, so a compiler barrier
 * suffices; other architectures need a full memory fence.
 */
138 #if defined(__x86_64) || defined(__i386__)
139 #define read_barrier() __asm__ __volatile__("":::"memory")
140 #define write_barrier() __asm__ __volatile__("":::"memory")
143 #define read_barrier() __sync_synchronize()
144 #define write_barrier() __sync_synchronize()
/*
 * Create an io_uring instance with @entries SQ slots; @p is filled in by
 * the kernel with ring offsets. Returns the ring fd, or -1 with errno set.
 */
147 static int io_uring_setup(unsigned int entries, struct io_uring_params *p)
149 return syscall(__NR_io_uring_setup, entries, p);
/*
 * Submit @to_submit SQEs and/or wait for @min_complete completions.
 * The trailing _NSIG / 8 is the kernel-expected sigset size argument.
 * Returns number consumed, or -1 with errno set.
 */
152 static int io_uring_enter(int fd, unsigned int to_submit,
153 unsigned int min_complete,
154 unsigned int flags, sigset_t *sig)
156 return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
157 flags, sig, _NSIG / 8);
/*
 * Register @iovecs as fixed buffers with the ring so sends can reference
 * them by index (IORING_RECVSEND_FIXED_BUF). Returns 0 or negative errno.
 */
160 static int io_uring_register_buffers(struct io_uring *ring,
161 const struct iovec *iovecs,
166 ret = syscall(__NR_io_uring_register, ring->ring_fd,
167 IORING_REGISTER_BUFFERS, iovecs, nr_iovecs);
168 return (ret < 0) ? -errno : ret;
/*
 * Register zero-copy notification @slots with the ring.
 * NOTE(review): IORING_REGISTER_NOTIFIERS / notification slots belong to a
 * pre-release kernel ABI that was later reworked — verify against the
 * target kernel headers. Returns 0 or negative errno.
 */
171 static int io_uring_register_notifications(struct io_uring *ring,
173 struct io_uring_notification_slot *slots)
176 struct io_uring_notification_register r = {
178 .data = (unsigned long)slots,
181 ret = syscall(__NR_io_uring_register, ring->ring_fd,
182 IORING_REGISTER_NOTIFIERS, &r, sizeof(r));
183 return (ret < 0) ? -errno : ret;
/*
 * Map the three shared ring regions (SQ ring, SQE array, CQ ring) into
 * userspace and wire up the sq/cq bookkeeping pointers using the offsets
 * the kernel returned in @p. On failure, unwinds earlier mappings.
 * NOTE(review): the error-return lines are elided in this dump.
 */
186 static int io_uring_mmap(int fd, struct io_uring_params *p,
187 struct io_uring_sq *sq, struct io_uring_cq *cq)
/* SQ ring: header/indices plus the SQE index array at sq_off.array. */
193 sq->ring_sz = p->sq_off.array + p->sq_entries * sizeof(unsigned);
194 ptr = mmap(0, sq->ring_sz, PROT_READ | PROT_WRITE,
195 MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQ_RING);
196 if (ptr == MAP_FAILED)
198 sq->khead = ptr + p->sq_off.head;
199 sq->ktail = ptr + p->sq_off.tail;
200 sq->kring_mask = ptr + p->sq_off.ring_mask;
201 sq->kring_entries = ptr + p->sq_off.ring_entries;
202 sq->kflags = ptr + p->sq_off.flags;
203 sq->kdropped = ptr + p->sq_off.dropped;
204 sq->array = ptr + p->sq_off.array;
/* SQE array: the actual submission queue entries, mapped separately. */
206 size = p->sq_entries * sizeof(struct io_uring_sqe);
207 sq->sqes = mmap(0, size, PROT_READ | PROT_WRITE,
208 MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_SQES);
209 if (sq->sqes == MAP_FAILED) {
212 munmap(sq->khead, sq->ring_sz);
/* CQ ring: header/indices with the CQE array embedded at cq_off.cqes. */
216 cq->ring_sz = p->cq_off.cqes + p->cq_entries * sizeof(struct io_uring_cqe);
217 ptr = mmap(0, cq->ring_sz, PROT_READ | PROT_WRITE,
218 MAP_SHARED | MAP_POPULATE, fd, IORING_OFF_CQ_RING);
219 if (ptr == MAP_FAILED) {
221 munmap(sq->sqes, p->sq_entries * sizeof(struct io_uring_sqe));
224 cq->khead = ptr + p->cq_off.head;
225 cq->ktail = ptr + p->cq_off.tail;
226 cq->kring_mask = ptr + p->cq_off.ring_mask;
227 cq->kring_entries = ptr + p->cq_off.ring_entries;
228 cq->koverflow = ptr + p->cq_off.overflow;
229 cq->cqes = ptr + p->cq_off.cqes;
/*
 * Set up a ring with @entries slots and mmap its shared regions.
 * Zeroes both *ring and the params first so kernel-filled offsets start
 * clean. Returns 0 or a negative error (error paths elided in this dump).
 */
233 static int io_uring_queue_init(unsigned entries, struct io_uring *ring,
236 struct io_uring_params p;
239 memset(ring, 0, sizeof(*ring));
240 memset(&p, 0, sizeof(p));
243 fd = io_uring_setup(entries, &p);
246 ret = io_uring_mmap(fd, &p, &ring->sq, &ring->cq);
/*
 * Flush locally queued SQEs (sqe_head..sqe_tail) into the kernel-visible
 * SQ ring, then io_uring_enter() to submit them.
 * NOTE(review): the memory-barrier and ktail-store lines between the copy
 * loop and the enter call are elided in this dump — the original orders
 * the array writes before publishing the new tail.
 * Returns number submitted or negative errno.
 */
254 static int io_uring_submit(struct io_uring *ring)
256 struct io_uring_sq *sq = &ring->sq;
257 const unsigned mask = *sq->kring_mask;
258 unsigned ktail, submitted, to_submit;
/* Kernel already has unconsumed entries pending. */
262 if (*sq->khead != *sq->ktail) {
263 submitted = *sq->kring_entries;
/* Nothing queued locally. */
266 if (sq->sqe_head == sq->sqe_tail)
/* Copy local SQE indices into the shared index array. */
270 to_submit = sq->sqe_tail - sq->sqe_head;
271 for (submitted = 0; submitted < to_submit; submitted++) {
273 sq->array[ktail++ & mask] = sq->sqe_head++ & mask;
278 if (*sq->ktail != ktail) {
284 ret = io_uring_enter(ring->ring_fd, submitted, 0,
285 IORING_ENTER_GETEVENTS, NULL);
286 return ret < 0 ? -errno : ret;
/*
 * Fill @sqe for a plain (copying) IORING_OP_SEND of @len bytes at @buf.
 * NOTE(review): the fd/len assignment lines are elided in this dump.
 */
289 static inline void io_uring_prep_send(struct io_uring_sqe *sqe, int sockfd,
290 const void *buf, size_t len, int flags)
292 memset(sqe, 0, sizeof(*sqe));
293 sqe->opcode = (__u8) IORING_OP_SEND;
295 sqe->addr = (unsigned long) buf;
297 sqe->msg_flags = (__u32) flags;
/*
 * Fill @sqe for a zero-copy send: reuse the plain-send prep, then switch
 * the opcode to IORING_OP_SENDZC_NOTIF, attach the notification slot and
 * pass zero-copy flags via the (repurposed) ioprio field.
 */
300 static inline void io_uring_prep_sendzc(struct io_uring_sqe *sqe, int sockfd,
301 const void *buf, size_t len, int flags,
302 unsigned slot_idx, unsigned zc_flags)
304 io_uring_prep_send(sqe, sockfd, buf, len, flags);
305 sqe->opcode = (__u8) IORING_OP_SENDZC_NOTIF;
306 sqe->notification_idx = slot_idx;
307 sqe->ioprio = zc_flags;
/*
 * Reserve the next free SQE from the local queue, or bail when the ring is
 * full (sqe_tail would lap sqe_head by more than kring_entries).
 * NOTE(review): the full-ring return line is elided in this dump.
 */
310 static struct io_uring_sqe *io_uring_get_sqe(struct io_uring *ring)
312 struct io_uring_sq *sq = &ring->sq;
314 if (sq->sqe_tail + 1 - sq->sqe_head > *sq->kring_entries)
316 return &sq->sqes[sq->sqe_tail++ & *sq->kring_mask];
/*
 * Wait for one completion: return an already-posted CQE if head != tail,
 * otherwise block in io_uring_enter() for at least one event.
 * NOTE(review): the surrounding loop/read_barrier/return lines are elided
 * in this dump.
 */
319 static int io_uring_wait_cqe(struct io_uring *ring, struct io_uring_cqe **cqe_ptr)
321 struct io_uring_cq *cq = &ring->cq;
322 const unsigned mask = *cq->kring_mask;
323 unsigned head = *cq->khead;
329 if (head != *cq->ktail) {
330 *cqe_ptr = &cq->cqes[head & mask];
333 ret = io_uring_enter(ring->ring_fd, 0, 1,
334 IORING_ENTER_GETEVENTS, NULL);
/* Consume one CQE by advancing the shared CQ head past it. */
342 static inline void io_uring_cqe_seen(struct io_uring *ring)
344 *(&ring->cq)->khead += 1;
/* Wall-clock time in milliseconds, used for the benchmark deadline. */
348 static unsigned long gettimeofday_ms(void)
352 gettimeofday(&tv, NULL);
353 return (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
/* setsockopt() wrapper for int-valued options; exits via error(3) on failure. */
356 static void do_setsockopt(int fd, int level, int optname, int val)
358 if (setsockopt(fd, level, optname, &val, sizeof(val)))
359 error(1, errno, "setsockopt %d.%d: %d", level, optname, val);
/*
 * Create the TX socket, bump SO_SNDBUF to 2 MiB, and connect it to
 * cfg_dst_addr. Returns the connected fd; exits on any failure.
 */
362 static int do_setup_tx(int domain, int type, int protocol)
366 fd = socket(domain, type, protocol);
368 error(1, errno, "socket t");
370 do_setsockopt(fd, SOL_SOCKET, SO_SNDBUF, 1 << 21);
372 if (connect(fd, (void *) &cfg_dst_addr, cfg_alen))
373 error(1, errno, "connect");
/*
 * Main transmit loop: submits cfg_nr_reqs send requests per batch over
 * io_uring (zero-copy or plain, per cfg_mode), reaps their completions,
 * and accounts packets/bytes until the cfg_runtime_ms deadline expires,
 * then prints a throughput summary to stderr.
 * NOTE(review): many interior lines (loop braces, counters, the outer
 * do { } opener) are elided in this dump — follow the stale line numbers.
 */
377 static void do_tx(int domain, int type, int protocol)
379 struct io_uring_notification_slot b[1] = {{.tag = NOTIF_TAG}};
380 struct io_uring_sqe *sqe;
381 struct io_uring_cqe *cqe;
382 unsigned long packets = 0, bytes = 0;
383 struct io_uring ring;
389 fd = do_setup_tx(domain, type, protocol);
391 ret = io_uring_queue_init(512, &ring, 0);
393 error(1, ret, "io_uring: queue init");
/* One notification slot, tagged so its CQEs are distinguishable from sends. */
395 ret = io_uring_register_notifications(&ring, 1, b);
397 error(1, ret, "io_uring: tx ctx registration");
399 iov.iov_base = payload;
400 iov.iov_len = cfg_payload_len;
/* Register the payload as fixed buffer 0 for MODE_ZC_FIXED sends. */
402 ret = io_uring_register_buffers(&ring, &iov, 1);
404 error(1, ret, "io_uring: buffer registration");
406 tstop = gettimeofday_ms() + cfg_runtime_ms;
/* With -c, cork the UDP socket around each batch. */
409 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 1);
411 for (i = 0; i < cfg_nr_reqs; i++) {
412 unsigned zc_flags = 0;
413 unsigned buf_idx = 0;
414 unsigned slot_idx = 0;
415 unsigned mode = cfg_mode;
416 unsigned msg_flags = 0;
/* MODE_MIXED alternates per request; selection line elided in this dump. */
418 if (cfg_mode == MODE_MIXED)
421 sqe = io_uring_get_sqe(&ring);
423 if (mode == MODE_NONZC) {
424 io_uring_prep_send(sqe, fd, payload,
425 cfg_payload_len, msg_flags);
426 sqe->user_data = NONZC_TAG;
/* -f: ask the kernel to flush the notification slot after this send. */
429 zc_flags |= IORING_RECVSEND_NOTIF_FLUSH;
432 io_uring_prep_sendzc(sqe, fd, payload,
434 msg_flags, slot_idx, zc_flags);
435 if (mode == MODE_ZC_FIXED) {
436 sqe->ioprio |= IORING_RECVSEND_FIXED_BUF;
437 sqe->buf_index = buf_idx;
439 sqe->user_data = ZC_TAG;
443 ret = io_uring_submit(&ring);
444 if (ret != cfg_nr_reqs)
445 error(1, ret, "submit");
/* Reap one CQE per submitted request; notification CQEs arrive on top. */
447 for (i = 0; i < cfg_nr_reqs; i++) {
448 ret = io_uring_wait_cqe(&ring, &cqe);
450 error(1, ret, "wait cqe");
452 if (cqe->user_data == NOTIF_TAG) {
455 } else if (cqe->user_data != NONZC_TAG &&
456 cqe->user_data != ZC_TAG) {
457 error(1, cqe->res, "invalid user_data");
458 } else if (cqe->res <= 0 && cqe->res != -EAGAIN) {
459 error(1, cqe->res, "send failed");
465 /* failed requests don't flush */
468 cqe->user_data == ZC_TAG)
471 io_uring_cqe_seen(&ring);
474 do_setsockopt(fd, IPPROTO_UDP, UDP_CORK, 0);
475 } while (gettimeofday_ms() < tstop);
478 error(1, errno, "close");
480 fprintf(stderr, "tx=%lu (MB=%lu), tx/s=%lu (MB/s=%lu)\n",
481 packets, bytes >> 20,
482 packets / (cfg_runtime_ms / 1000),
483 (bytes >> 20) / (cfg_runtime_ms / 1000));
/* NOTE(review): presumably drains a final flushed-notification CQE — confirm. */
486 ret = io_uring_wait_cqe(&ring, &cqe);
488 error(1, ret, "wait cqe");
489 io_uring_cqe_seen(&ring);
/* Fill the payload with a repeating 'a'..'z' pattern, then run the TX loop. */
494 static void do_test(int domain, int type, int protocol)
498 for (i = 0; i < IP_MAXPACKET; i++)
499 payload[i] = 'a' + (i % 26);
500 do_tx(domain, type, protocol);
/* Print usage to stderr and exit(1) via error(3). */
503 static void usage(const char *filepath)
505 error(1, 0, "Usage: %s [-f] [-n<N>] [-z0] [-s<payload size>] "
506 "(-4|-6) [-t<time s>] -D<dst_ip> udp", filepath);
/*
 * Parse command-line options into the cfg_* globals and build the
 * destination sockaddr. Exits via error(3)/usage() on invalid input.
 * NOTE(review): several case labels, `break`s, and the -D/-f handlers are
 * elided in this dump (stale line numbers skip).
 */
509 static void parse_opts(int argc, char **argv)
/* Leave headroom for IPv6 + TCP headers and max TCP options in the payload. */
511 const int max_payload_len = sizeof(payload) -
512 sizeof(struct ipv6hdr) -
513 sizeof(struct tcphdr) -
514 40 /* max tcp options */;
515 struct sockaddr_in6 *addr6 = (void *) &cfg_dst_addr;
516 struct sockaddr_in *addr4 = (void *) &cfg_dst_addr;
522 cfg_payload_len = max_payload_len;
524 while ((c = getopt(argc, argv, "46D:p:s:t:n:fc:m:")) != -1) {
527 if (cfg_family != PF_UNSPEC)
528 error(1, 0, "Pass one of -4 or -6");
529 cfg_family = PF_INET;
530 cfg_alen = sizeof(struct sockaddr_in);
533 if (cfg_family != PF_UNSPEC)
534 error(1, 0, "Pass one of -4 or -6");
535 cfg_family = PF_INET6;
536 cfg_alen = sizeof(struct sockaddr_in6);
542 cfg_port = strtoul(optarg, NULL, 0);
545 cfg_payload_len = strtoul(optarg, NULL, 0);
/* -t takes seconds; +200ms grace is added to the runtime. */
548 cfg_runtime_ms = 200 + strtoul(optarg, NULL, 10) * 1000;
551 cfg_nr_reqs = strtoul(optarg, NULL, 0);
557 cfg_cork = strtol(optarg, NULL, 0);
560 cfg_mode = strtol(optarg, NULL, 0);
/* Build the destination address from -D (daddr) and -p (port). */
565 switch (cfg_family) {
567 memset(addr4, 0, sizeof(*addr4));
568 addr4->sin_family = AF_INET;
569 addr4->sin_port = htons(cfg_port);
571 inet_pton(AF_INET, daddr, &(addr4->sin_addr)) != 1)
572 error(1, 0, "ipv4 parse error: %s", daddr);
575 memset(addr6, 0, sizeof(*addr6));
576 addr6->sin6_family = AF_INET6;
577 addr6->sin6_port = htons(cfg_port);
579 inet_pton(AF_INET6, daddr, &(addr6->sin6_addr)) != 1)
580 error(1, 0, "ipv6 parse error: %s", daddr);
583 error(1, 0, "illegal domain");
586 if (cfg_payload_len > max_payload_len)
587 error(1, 0, "-s: payload exceeds max (%d)", max_payload_len);
588 if (cfg_mode == MODE_NONZC && cfg_flush)
589 error(1, 0, "-f: only zerocopy modes support notifications");
/* Exactly one positional argument (the transport name) must remain. */
590 if (optind != argc - 1)
/*
 * Entry point: the last argv element selects the transport ("tcp" or
 * "udp"); everything before it is parsed by parse_opts(), which also
 * enforces that exactly one positional argument remains.
 */
594 int main(int argc, char **argv)
596 const char *cfg_test = argv[argc - 1];
598 parse_opts(argc, argv);
600 if (!strcmp(cfg_test, "tcp"))
601 do_test(cfg_family, SOCK_STREAM, 0);
602 else if (!strcmp(cfg_test, "udp"))
603 do_test(cfg_family, SOCK_DGRAM, 0);
605 error(1, 0, "unknown cfg_test %s", cfg_test);
610 int main(int argc, char **argv)