selftests/bpf: Fix erroneous bitmask operation
[platform/kernel/linux-rpi.git] / tools / testing / selftests / bpf / progs / xdp_synproxy_kern.c
1 // SPDX-License-Identifier: LGPL-2.1 OR BSD-2-Clause
2 /* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
3
4 #include "vmlinux.h"
5
6 #include <bpf/bpf_helpers.h>
7 #include <bpf/bpf_endian.h>
8 #include <asm/errno.h>
9
/* Return codes used by the tc entry point (see syncookie_tc()). */
10 #define TC_ACT_OK 0
11 #define TC_ACT_SHOT 2
12
/* Used by tcp_ns_to_ts() to convert the nanosecond clock into TCP_TS_HZ ticks. */
13 #define NSEC_PER_SEC 1000000000L
14
/* MAC address length and the EtherTypes that tcp_dissect() dispatches on. */
15 #define ETH_ALEN 6
16 #define ETH_P_IP 0x0800
17 #define ETH_P_IPV6 0x86DD
18
/* The 32-bit TCP header word that tcp_gen_synack() writes TCP_FLAG_* into. */
19 #define tcp_flag_word(tp) (((union tcp_word_hdr *)(tp))->words[3])
20
/* Masks applied to iphdr->frag_off in tcp_lookup(). */
21 #define IP_DF 0x4000
22 #define IP_MF 0x2000
23 #define IP_OFFSET 0x1fff
24
/* Compared against ipv6hdr->nexthdr in tcp_dissect(). */
25 #define NEXTHDR_TCP 6
26
/* TCP option kind bytes parsed by tscookie_tcpopt_parse() and emitted by
 * tcp_mkoptions().
 */
27 #define TCPOPT_NOP 1
28 #define TCPOPT_EOL 0
29 #define TCPOPT_MSS 2
30 #define TCPOPT_WINDOW 3
31 #define TCPOPT_SACK_PERM 4
32 #define TCPOPT_TIMESTAMP 8
33
/* Expected length byte for each of the option kinds above. */
34 #define TCPOLEN_MSS 4
35 #define TCPOLEN_WINDOW 3
36 #define TCPOLEN_SACK_PERM 2
37 #define TCPOLEN_TIMESTAMP 10
38
/* Timestamp tick rate and the option bits that tscookie_init() packs into the
 * low TSBITS bits of the timestamp value; tcp_mkoptions() and
 * tcp_gen_synack() read the same bits back.
 */
39 #define TCP_TS_HZ 1000
40 #define TS_OPT_WSCALE_MASK 0xf
41 #define TS_OPT_SACK (1 << 4)
42 #define TS_OPT_ECN (1 << 5)
43 #define TSBITS 6
44 #define TSMASK (((__u32)1 << TSBITS) - 1)
/* Upper clamp applied to a parsed window-scale value. */
45 #define TCP_MAX_WSCALE 14U
46
/* Header sizes (bytes) that syncookie_part1()/syncookie_part2() bounds-check
 * and grow the packet to.
 */
47 #define IPV4_MAXLEN 60
48 #define TCP_MAXLEN 60
49
/* Fallbacks used by values_get_tcpipopts() when slot 0 of the values map is
 * missing or zero.
 */
50 #define DEFAULT_MSS4 1460
51 #define DEFAULT_MSS6 1440
52 #define DEFAULT_WSCALE 7
53 #define DEFAULT_TTL 64
/* Number of entries in the allowed_ports map. */
54 #define MAX_ALLOWED_PORTS 8
55
/* Exchange two lvalues of the same type through a typeof() temporary.
 * Both arguments are evaluated more than once.
 */
56 #define swap(a, b) \
57         do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
58
/* Read a value of the given type from ptr through a packed single-member
 * struct, so the load does not assume natural alignment.
 */
59 #define __get_unaligned_t(type, ptr) ({                                         \
60         const struct { type x; } __attribute__((__packed__)) *__pptr = (typeof(__pptr))(ptr); \
61         __pptr->x;                                                              \
62 })
63
/* Unaligned read whose type is taken from the pointee of ptr. */
64 #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr))
65
/* Two-slot array map.
 * Key 0: packed SYNACK parameters read by values_get_tcpipopts():
 *        bits 0-15 IPv4 MSS, bits 16-19 window scale, bits 24-31 TTL,
 *        bits 32-47 IPv6 MSS. A zero value selects the DEFAULT_* constants.
 * Key 1: counter incremented by values_inc_synacks().
 */
66 struct {
67         __uint(type, BPF_MAP_TYPE_ARRAY);
68         __type(key, __u32);
69         __type(value, __u64);
70         __uint(max_entries, 2);
71 } values SEC(".maps");
72
/* List of destination ports scanned by check_port_allowed(). Entries are
 * compared against the port in host byte order; a value of 0 terminates the
 * list.
 */
73 struct {
74         __uint(type, BPF_MAP_TYPE_ARRAY);
75         __type(key, __u32);
76         __type(value, __u16);
77         __uint(max_entries, MAX_ALLOWED_PORTS);
78 } allowed_ports SEC(".maps");
79
80 /* Some symbols defined in net/netfilter/nf_conntrack_bpf.c are unavailable in
81  * vmlinux.h if CONFIG_NF_CONNTRACK=m, so they are redefined locally.
82  */
83
/* Options block passed to bpf_xdp_ct_lookup()/bpf_skb_ct_lookup(). */
84 struct bpf_ct_opts___local {
	/* tcp_lookup() sets this to BPF_F_CURRENT_NETNS. */
85         s32 netns_id;
	/* tcp_lookup() reads this after a lookup returned NULL. */
86         s32 error;
	/* tcp_lookup() sets this to IPPROTO_TCP. */
87         u8 l4proto;
88         u8 dir;
89         u8 reserved[2];
90 } __attribute__((preserve_access_index));
91
/* netns_id value used by tcp_lookup() for the conntrack lookups below. */
92 #define BPF_F_CURRENT_NETNS (-1)
93
/* Conntrack lookup used by tcp_lookup() when running as an XDP program. */
94 extern struct nf_conn *bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx,
95                                          struct bpf_sock_tuple *bpf_tuple,
96                                          __u32 len_tuple,
97                                          struct bpf_ct_opts___local *opts,
98                                          __u32 len_opts) __ksym;
99
/* Conntrack lookup used by tcp_lookup() when running as a tc program. */
100 extern struct nf_conn *bpf_skb_ct_lookup(struct __sk_buff *skb_ctx,
101                                          struct bpf_sock_tuple *bpf_tuple,
102                                          u32 len_tuple,
103                                          struct bpf_ct_opts___local *opts,
104                                          u32 len_opts) __ksym;
105
/* Called by tcp_lookup() on every non-NULL result of the lookups above. */
106 extern void bpf_ct_release(struct nf_conn *ct) __ksym;
107
108 static __always_inline void swap_eth_addr(__u8 *a, __u8 *b)
109 {
110         __u8 tmp[ETH_ALEN];
111
112         __builtin_memcpy(tmp, a, ETH_ALEN);
113         __builtin_memcpy(a, b, ETH_ALEN);
114         __builtin_memcpy(b, tmp, ETH_ALEN);
115 }
116
/* Fold a 32-bit one's-complement accumulator down to 16 bits and return its
 * complement. Two folding rounds are needed because the first one can carry
 * into bit 16.
 */
static __always_inline __u16 csum_fold(__u32 csum)
{
	__u32 folded = (csum >> 16) + (csum & 0xffff);

	folded = (folded >> 16) + (folded & 0xffff);
	return (__u16)~folded;
}
123
124 static __always_inline __u16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
125                                                __u32 len, __u8 proto,
126                                                __u32 csum)
127 {
128         __u64 s = csum;
129
130         s += (__u32)saddr;
131         s += (__u32)daddr;
132 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
133         s += proto + len;
134 #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
135         s += (proto + len) << 8;
136 #else
137 #error Unknown endian
138 #endif
139         s = (s & 0xffffffff) + (s >> 32);
140         s = (s & 0xffffffff) + (s >> 32);
141
142         return csum_fold((__u32)s);
143 }
144
145 static __always_inline __u16 csum_ipv6_magic(const struct in6_addr *saddr,
146                                              const struct in6_addr *daddr,
147                                              __u32 len, __u8 proto, __u32 csum)
148 {
149         __u64 sum = csum;
150         int i;
151
152 #pragma unroll
153         for (i = 0; i < 4; i++)
154                 sum += (__u32)saddr->in6_u.u6_addr32[i];
155
156 #pragma unroll
157         for (i = 0; i < 4; i++)
158                 sum += (__u32)daddr->in6_u.u6_addr32[i];
159
160         /* Don't combine additions to avoid 32-bit overflow. */
161         sum += bpf_htonl(len);
162         sum += bpf_htonl(proto);
163
164         sum = (sum & 0xffffffff) + (sum >> 32);
165         sum = (sum & 0xffffffff) + (sum >> 32);
166
167         return csum_fold((__u32)sum);
168 }
169
170 static __always_inline __u64 tcp_clock_ns(void)
171 {
172         return bpf_ktime_get_ns();
173 }
174
175 static __always_inline __u32 tcp_ns_to_ts(__u64 ns)
176 {
177         return ns / (NSEC_PER_SEC / TCP_TS_HZ);
178 }
179
180 static __always_inline __u32 tcp_time_stamp_raw(void)
181 {
182         return tcp_ns_to_ts(tcp_clock_ns());
183 }
184
/* State shared between tscookie_init() and the bpf_loop() callback that walks
 * the TCP options.
 */
185 struct tcpopt_context {
	/* Next option byte to parse; advanced by tscookie_tcpopt_parse(). */
186         __u8 *ptr;
	/* End of the TCP header as declared by the header length (tcp_len). */
187         __u8 *end;
	/* End of packet data, used for bounds checks before each read. */
188         void *data_end;
	/* Where the peer's timestamp value is stored when a timestamp option is found. */
189         __be32 *tsecr;
	/* Parsed window scale; tscookie_init() starts it at TS_OPT_WSCALE_MASK. */
190         __u8 wscale;
	/* Set when a well-formed timestamp option was seen. */
191         bool option_timestamp;
	/* Set when a well-formed SACK-permitted option was seen. */
192         bool option_sack;
193 };
194
/* Parse the single TCP option at ctx->ptr and advance ctx->ptr past it.
 *
 * Returns 0 when parsing may continue with the next option and 1 when it must
 * stop: the end of the header or of the packet was reached, an EOL option was
 * found, or the option length is invalid (< 2 or running past ctx->end).
 *
 * Window-scale, timestamp and SACK-permitted options update ctx; every other
 * option kind is skipped using its length byte.
 */
195 static int tscookie_tcpopt_parse(struct tcpopt_context *ctx)
196 {
197         __u8 opcode, opsize;
198
199         if (ctx->ptr >= ctx->end)
200                 return 1;
201         if (ctx->ptr >= ctx->data_end)
202                 return 1;
203
204         opcode = ctx->ptr[0];
205
206         if (opcode == TCPOPT_EOL)
207                 return 1;
	/* NOP is a single byte with no length field. */
208         if (opcode == TCPOPT_NOP) {
209                 ++ctx->ptr;
210                 return 0;
211         }
212
213         if (ctx->ptr + 1 >= ctx->end)
214                 return 1;
215         if (ctx->ptr + 1 >= ctx->data_end)
216                 return 1;
217         opsize = ctx->ptr[1];
	/* The length covers the kind and length bytes themselves. */
218         if (opsize < 2)
219                 return 1;
220
221         if (ctx->ptr + opsize > ctx->end)
222                 return 1;
223
224         switch (opcode) {
225         case TCPOPT_WINDOW:
226                 if (opsize == TCPOLEN_WINDOW && ctx->ptr + TCPOLEN_WINDOW <= ctx->data_end)
227                         ctx->wscale = ctx->ptr[2] < TCP_MAX_WSCALE ? ctx->ptr[2] : TCP_MAX_WSCALE;
228                 break;
229         case TCPOPT_TIMESTAMP:
230                 if (opsize == TCPOLEN_TIMESTAMP && ctx->ptr + TCPOLEN_TIMESTAMP <= ctx->data_end) {
231                         ctx->option_timestamp = true;
232                         /* Client's tsval becomes our tsecr. */
233                         *ctx->tsecr = get_unaligned((__be32 *)(ctx->ptr + 2));
234                 }
235                 break;
236         case TCPOPT_SACK_PERM:
237                 if (opsize == TCPOLEN_SACK_PERM)
238                         ctx->option_sack = true;
239                 break;
240         }
241
242         ctx->ptr += opsize;
243
244         return 0;
245 }
246
247 static int tscookie_tcpopt_parse_batch(__u32 index, void *context)
248 {
249         int i;
250
251         for (i = 0; i < 7; i++)
252                 if (tscookie_tcpopt_parse(context))
253                         return 1;
254         return 0;
255 }
256
257 static __always_inline bool tscookie_init(struct tcphdr *tcp_header,
258                                           __u16 tcp_len, __be32 *tsval,
259                                           __be32 *tsecr, void *data_end)
260 {
261         struct tcpopt_context loop_ctx = {
262                 .ptr = (__u8 *)(tcp_header + 1),
263                 .end = (__u8 *)tcp_header + tcp_len,
264                 .data_end = data_end,
265                 .tsecr = tsecr,
266                 .wscale = TS_OPT_WSCALE_MASK,
267                 .option_timestamp = false,
268                 .option_sack = false,
269         };
270         u32 cookie;
271
272         bpf_loop(6, tscookie_tcpopt_parse_batch, &loop_ctx, 0);
273
274         if (!loop_ctx.option_timestamp)
275                 return false;
276
277         cookie = tcp_time_stamp_raw() & ~TSMASK;
278         cookie |= loop_ctx.wscale & TS_OPT_WSCALE_MASK;
279         if (loop_ctx.option_sack)
280                 cookie |= TS_OPT_SACK;
281         if (tcp_header->ece && tcp_header->cwr)
282                 cookie |= TS_OPT_ECN;
283         *tsval = bpf_htonl(cookie);
284
285         return true;
286 }
287
288 static __always_inline void values_get_tcpipopts(__u16 *mss, __u8 *wscale,
289                                                  __u8 *ttl, bool ipv6)
290 {
291         __u32 key = 0;
292         __u64 *value;
293
294         value = bpf_map_lookup_elem(&values, &key);
295         if (value && *value != 0) {
296                 if (ipv6)
297                         *mss = (*value >> 32) & 0xffff;
298                 else
299                         *mss = *value & 0xffff;
300                 *wscale = (*value >> 16) & 0xf;
301                 *ttl = (*value >> 24) & 0xff;
302                 return;
303         }
304
305         *mss = ipv6 ? DEFAULT_MSS6 : DEFAULT_MSS4;
306         *wscale = DEFAULT_WSCALE;
307         *ttl = DEFAULT_TTL;
308 }
309
310 static __always_inline void values_inc_synacks(void)
311 {
312         __u32 key = 1;
313         __u64 *value;
314
315         value = bpf_map_lookup_elem(&values, &key);
316         if (value)
317                 __sync_fetch_and_add(value, 1);
318 }
319
320 static __always_inline bool check_port_allowed(__u16 port)
321 {
322         __u32 i;
323
324         for (i = 0; i < MAX_ALLOWED_PORTS; i++) {
325                 __u32 key = i;
326                 __u16 *value;
327
328                 value = bpf_map_lookup_elem(&allowed_ports, &key);
329
330                 if (!value)
331                         break;
332                 /* 0 is a terminator value. Check it first to avoid matching on
333                  * a forbidden port == 0 and returning true.
334                  */
335                 if (*value == 0)
336                         break;
337
338                 if (*value == port)
339                         return true;
340         }
341
342         return false;
343 }
344
/* Pointers into the packet filled in by tcp_dissect() and refreshed by
 * syncookie_part2(). tcp_dissect() sets exactly one of ipv4/ipv6 to non-NULL
 * when it reaches a TCP header.
 */
345 struct header_pointers {
346         struct ethhdr *eth;
347         struct iphdr *ipv4;
348         struct ipv6hdr *ipv6;
349         struct tcphdr *tcp;
	/* TCP header length in bytes (doff * 4). */
350         __u16 tcp_len;
351 };
352
/* Locate the Ethernet, IP and TCP headers of a packet and record them in hdr.
 *
 * Returns:
 *   XDP_TX   - a TCP header was found; hdr is fully populated.
 *   XDP_PASS - the packet is not IPv4/IPv6 or does not carry TCP directly.
 *   XDP_DROP - a header is truncated or has an invalid version/length.
 *
 * hdr is only fully initialised when XDP_TX is returned.
 */
353 static __always_inline int tcp_dissect(void *data, void *data_end,
354                                        struct header_pointers *hdr)
355 {
356         hdr->eth = data;
357         if (hdr->eth + 1 > data_end)
358                 return XDP_DROP;
359
360         switch (bpf_ntohs(hdr->eth->h_proto)) {
361         case ETH_P_IP:
362                 hdr->ipv6 = NULL;
363
364                 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
365                 if (hdr->ipv4 + 1 > data_end)
366                         return XDP_DROP;
367                 if (hdr->ipv4->ihl * 4 < sizeof(*hdr->ipv4))
368                         return XDP_DROP;
369                 if (hdr->ipv4->version != 4)
370                         return XDP_DROP;
371
372                 if (hdr->ipv4->protocol != IPPROTO_TCP)
373                         return XDP_PASS;
374
		/* ihl accounts for IPv4 options, so TCP may start past the fixed header. */
375                 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
376                 break;
377         case ETH_P_IPV6:
378                 hdr->ipv4 = NULL;
379
380                 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
381                 if (hdr->ipv6 + 1 > data_end)
382                         return XDP_DROP;
383                 if (hdr->ipv6->version != 6)
384                         return XDP_DROP;
385
386                 /* XXX: Extension headers are not supported and could circumvent
387                  * XDP SYN flood protection.
388                  */
389                 if (hdr->ipv6->nexthdr != NEXTHDR_TCP)
390                         return XDP_PASS;
391
392                 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
393                 break;
394         default:
395                 /* XXX: VLANs will circumvent XDP SYN flood protection. */
396                 return XDP_PASS;
397         }
398
399         if (hdr->tcp + 1 > data_end)
400                 return XDP_DROP;
401         hdr->tcp_len = hdr->tcp->doff * 4;
402         if (hdr->tcp_len < sizeof(*hdr->tcp))
403                 return XDP_DROP;
404
405         return XDP_TX;
406 }
407
408 static __always_inline int tcp_lookup(void *ctx, struct header_pointers *hdr, bool xdp)
409 {
410         struct bpf_ct_opts___local ct_lookup_opts = {
411                 .netns_id = BPF_F_CURRENT_NETNS,
412                 .l4proto = IPPROTO_TCP,
413         };
414         struct bpf_sock_tuple tup = {};
415         struct nf_conn *ct;
416         __u32 tup_size;
417
418         if (hdr->ipv4) {
419                 /* TCP doesn't normally use fragments, and XDP can't reassemble
420                  * them.
421                  */
422                 if ((hdr->ipv4->frag_off & bpf_htons(IP_DF | IP_MF | IP_OFFSET)) != bpf_htons(IP_DF))
423                         return XDP_DROP;
424
425                 tup.ipv4.saddr = hdr->ipv4->saddr;
426                 tup.ipv4.daddr = hdr->ipv4->daddr;
427                 tup.ipv4.sport = hdr->tcp->source;
428                 tup.ipv4.dport = hdr->tcp->dest;
429                 tup_size = sizeof(tup.ipv4);
430         } else if (hdr->ipv6) {
431                 __builtin_memcpy(tup.ipv6.saddr, &hdr->ipv6->saddr, sizeof(tup.ipv6.saddr));
432                 __builtin_memcpy(tup.ipv6.daddr, &hdr->ipv6->daddr, sizeof(tup.ipv6.daddr));
433                 tup.ipv6.sport = hdr->tcp->source;
434                 tup.ipv6.dport = hdr->tcp->dest;
435                 tup_size = sizeof(tup.ipv6);
436         } else {
437                 /* The verifier can't track that either ipv4 or ipv6 is not
438                  * NULL.
439                  */
440                 return XDP_ABORTED;
441         }
442         if (xdp)
443                 ct = bpf_xdp_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
444         else
445                 ct = bpf_skb_ct_lookup(ctx, &tup, tup_size, &ct_lookup_opts, sizeof(ct_lookup_opts));
446         if (ct) {
447                 unsigned long status = ct->status;
448
449                 bpf_ct_release(ct);
450                 if (status & IPS_CONFIRMED)
451                         return XDP_PASS;
452         } else if (ct_lookup_opts.error != -ENOENT) {
453                 return XDP_ABORTED;
454         }
455
456         /* error == -ENOENT || !(status & IPS_CONFIRMED) */
457         return XDP_TX;
458 }
459
460 static __always_inline __u8 tcp_mkoptions(__be32 *buf, __be32 *tsopt, __u16 mss,
461                                           __u8 wscale)
462 {
463         __be32 *start = buf;
464
465         *buf++ = bpf_htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss);
466
467         if (!tsopt)
468                 return buf - start;
469
470         if (tsopt[0] & bpf_htonl(1 << 4))
471                 *buf++ = bpf_htonl((TCPOPT_SACK_PERM << 24) |
472                                    (TCPOLEN_SACK_PERM << 16) |
473                                    (TCPOPT_TIMESTAMP << 8) |
474                                    TCPOLEN_TIMESTAMP);
475         else
476                 *buf++ = bpf_htonl((TCPOPT_NOP << 24) |
477                                    (TCPOPT_NOP << 16) |
478                                    (TCPOPT_TIMESTAMP << 8) |
479                                    TCPOLEN_TIMESTAMP);
480         *buf++ = tsopt[0];
481         *buf++ = tsopt[1];
482
483         if ((tsopt[0] & bpf_htonl(0xf)) != bpf_htonl(0xf))
484                 *buf++ = bpf_htonl((TCPOPT_NOP << 24) |
485                                    (TCPOPT_WINDOW << 16) |
486                                    (TCPOLEN_WINDOW << 8) |
487                                    wscale);
488
489         return buf - start;
490 }
491
492 static __always_inline void tcp_gen_synack(struct tcphdr *tcp_header,
493                                            __u32 cookie, __be32 *tsopt,
494                                            __u16 mss, __u8 wscale)
495 {
496         void *tcp_options;
497
498         tcp_flag_word(tcp_header) = TCP_FLAG_SYN | TCP_FLAG_ACK;
499         if (tsopt && (tsopt[0] & bpf_htonl(1 << 5)))
500                 tcp_flag_word(tcp_header) |= TCP_FLAG_ECE;
501         tcp_header->doff = 5; /* doff is part of tcp_flag_word. */
502         swap(tcp_header->source, tcp_header->dest);
503         tcp_header->ack_seq = bpf_htonl(bpf_ntohl(tcp_header->seq) + 1);
504         tcp_header->seq = bpf_htonl(cookie);
505         tcp_header->window = 0;
506         tcp_header->urg_ptr = 0;
507         tcp_header->check = 0; /* Calculate checksum later. */
508
509         tcp_options = (void *)(tcp_header + 1);
510         tcp_header->doff += tcp_mkoptions(tcp_options, tsopt, mss, wscale);
511 }
512
513 static __always_inline void tcpv4_gen_synack(struct header_pointers *hdr,
514                                              __u32 cookie, __be32 *tsopt)
515 {
516         __u8 wscale;
517         __u16 mss;
518         __u8 ttl;
519
520         values_get_tcpipopts(&mss, &wscale, &ttl, false);
521
522         swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
523
524         swap(hdr->ipv4->saddr, hdr->ipv4->daddr);
525         hdr->ipv4->check = 0; /* Calculate checksum later. */
526         hdr->ipv4->tos = 0;
527         hdr->ipv4->id = 0;
528         hdr->ipv4->ttl = ttl;
529
530         tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
531
532         hdr->tcp_len = hdr->tcp->doff * 4;
533         hdr->ipv4->tot_len = bpf_htons(sizeof(*hdr->ipv4) + hdr->tcp_len);
534 }
535
536 static __always_inline void tcpv6_gen_synack(struct header_pointers *hdr,
537                                              __u32 cookie, __be32 *tsopt)
538 {
539         __u8 wscale;
540         __u16 mss;
541         __u8 ttl;
542
543         values_get_tcpipopts(&mss, &wscale, &ttl, true);
544
545         swap_eth_addr(hdr->eth->h_source, hdr->eth->h_dest);
546
547         swap(hdr->ipv6->saddr, hdr->ipv6->daddr);
548         *(__be32 *)hdr->ipv6 = bpf_htonl(0x60000000);
549         hdr->ipv6->hop_limit = ttl;
550
551         tcp_gen_synack(hdr->tcp, cookie, tsopt, mss, wscale);
552
553         hdr->tcp_len = hdr->tcp->doff * 4;
554         hdr->ipv6->payload_len = bpf_htons(hdr->tcp_len);
555 }
556
/* Answer a SYN with a SYN-cookie SYNACK built in place in the same packet.
 *
 * Steps: reject FIN/RST and ports not in allowed_ports, verify the incoming
 * checksums, generate the cookie, derive the timestamp cookie from the SYN's
 * options, rewrite the headers, recompute the checksums and trim the packet
 * to the new length.
 *
 * Returns:
 *   XDP_TX      - the packet now holds the SYNACK and should be sent back.
 *   XDP_DROP    - the SYN was rejected (flags, port or checksum).
 *   XDP_ABORTED - a helper failed, the packet is too short, or hdr has
 *                 neither an IPv4 nor an IPv6 header pointer.
 */
557 static __always_inline int syncookie_handle_syn(struct header_pointers *hdr,
558                                                 void *ctx,
559                                                 void *data, void *data_end,
560                                                 bool xdp)
561 {
562         __u32 old_pkt_size, new_pkt_size;
563         /* Unlike clang 10, clang 11 and 12 generate code that doesn't pass the
564          * BPF verifier if tsopt is not volatile. Volatile forces it to store
565          * the pointer value and use it directly, otherwise tcp_mkoptions is
566          * (mis)compiled like this:
567          *   if (!tsopt)
568          *       return buf - start;
569          *   reg = stored_return_value_of_tscookie_init;
570          *   if (reg)
571          *       tsopt = tsopt_buf;
572          *   else
573          *       tsopt = NULL;
574          *   ...
575          *   *buf++ = tsopt[1];
576          * It creates a dead branch where tsopt is assigned NULL, but the
577          * verifier can't prove it's dead and blocks the program.
578          */
579         __be32 * volatile tsopt = NULL;
580         __be32 tsopt_buf[2] = {};
581         __u16 ip_len;
582         __u32 cookie;
583         __s64 value;
584
585         /* Checksum is not yet verified, but both checksum failure and TCP
586          * header checks return XDP_DROP, so the order doesn't matter.
587          */
588         if (hdr->tcp->fin || hdr->tcp->rst)
589                 return XDP_DROP;
590
591         /* Issue SYN cookies on allowed ports, drop SYN packets on blocked
592          * ports.
593          */
594         if (!check_port_allowed(bpf_ntohs(hdr->tcp->dest)))
595                 return XDP_DROP;
596
597         if (hdr->ipv4) {
598                 /* Check the IPv4 and TCP checksums before creating a SYNACK. */
599                 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, hdr->ipv4->ihl * 4, 0);
600                 if (value < 0)
601                         return XDP_ABORTED;
602                 if (csum_fold(value) != 0)
603                         return XDP_DROP; /* Bad IPv4 checksum. */
604
605                 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
606                 if (value < 0)
607                         return XDP_ABORTED;
608                 if (csum_tcpudp_magic(hdr->ipv4->saddr, hdr->ipv4->daddr,
609                                       hdr->tcp_len, IPPROTO_TCP, value) != 0)
610                         return XDP_DROP; /* Bad TCP checksum. */
611
		/* The generated SYNACK carries an IPv4 header without options. */
612                 ip_len = sizeof(*hdr->ipv4);
613
614                 value = bpf_tcp_raw_gen_syncookie_ipv4(hdr->ipv4, hdr->tcp,
615                                                        hdr->tcp_len);
616         } else if (hdr->ipv6) {
617                 /* Check the TCP checksum before creating a SYNACK. */
618                 value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
619                 if (value < 0)
620                         return XDP_ABORTED;
621                 if (csum_ipv6_magic(&hdr->ipv6->saddr, &hdr->ipv6->daddr,
622                                     hdr->tcp_len, IPPROTO_TCP, value) != 0)
623                         return XDP_DROP; /* Bad TCP checksum. */
624
625                 ip_len = sizeof(*hdr->ipv6);
626
627                 value = bpf_tcp_raw_gen_syncookie_ipv6(hdr->ipv6, hdr->tcp,
628                                                        hdr->tcp_len);
629         } else {
630                 return XDP_ABORTED;
631         }
632
633         if (value < 0)
634                 return XDP_ABORTED;
635         cookie = (__u32)value;
636
	/* tsopt stays NULL when the SYN had no usable timestamp option. */
637         if (tscookie_init((void *)hdr->tcp, hdr->tcp_len,
638                           &tsopt_buf[0], &tsopt_buf[1], data_end))
639                 tsopt = tsopt_buf;
640
641         /* Check that there is enough space for a SYNACK. It also covers
642          * the check that the destination of the __builtin_memmove below
643          * doesn't overflow.
644          */
645         if (data + sizeof(*hdr->eth) + ip_len + TCP_MAXLEN > data_end)
646                 return XDP_ABORTED;
647
648         if (hdr->ipv4) {
		/* Discard IPv4 options: move the fixed TCP header so that it
		 * directly follows a minimal IPv4 header.
		 */
649                 if (hdr->ipv4->ihl * 4 > sizeof(*hdr->ipv4)) {
650                         struct tcphdr *new_tcp_header;
651
652                         new_tcp_header = data + sizeof(*hdr->eth) + sizeof(*hdr->ipv4);
653                         __builtin_memmove(new_tcp_header, hdr->tcp, sizeof(*hdr->tcp));
654                         hdr->tcp = new_tcp_header;
655
656                         hdr->ipv4->ihl = sizeof(*hdr->ipv4) / 4;
657                 }
658
659                 tcpv4_gen_synack(hdr, cookie, tsopt);
660         } else if (hdr->ipv6) {
661                 tcpv6_gen_synack(hdr, cookie, tsopt);
662         } else {
663                 return XDP_ABORTED;
664         }
665
666         /* Recalculate checksums. */
667         hdr->tcp->check = 0;
668         value = bpf_csum_diff(0, 0, (void *)hdr->tcp, hdr->tcp_len, 0);
669         if (value < 0)
670                 return XDP_ABORTED;
671         if (hdr->ipv4) {
672                 hdr->tcp->check = csum_tcpudp_magic(hdr->ipv4->saddr,
673                                                     hdr->ipv4->daddr,
674                                                     hdr->tcp_len,
675                                                     IPPROTO_TCP,
676                                                     value);
677
678                 hdr->ipv4->check = 0;
679                 value = bpf_csum_diff(0, 0, (void *)hdr->ipv4, sizeof(*hdr->ipv4), 0);
680                 if (value < 0)
681                         return XDP_ABORTED;
682                 hdr->ipv4->check = csum_fold(value);
683         } else if (hdr->ipv6) {
684                 hdr->tcp->check = csum_ipv6_magic(&hdr->ipv6->saddr,
685                                                   &hdr->ipv6->daddr,
686                                                   hdr->tcp_len,
687                                                   IPPROTO_TCP,
688                                                   value);
689         } else {
690                 return XDP_ABORTED;
691         }
692
693         /* Set the new packet size. */
694         old_pkt_size = data_end - data;
695         new_pkt_size = sizeof(*hdr->eth) + ip_len + hdr->tcp->doff * 4;
	/* bpf_xdp_adjust_tail() takes a delta, bpf_skb_change_tail() an absolute length. */
696         if (xdp) {
697                 if (bpf_xdp_adjust_tail(ctx, new_pkt_size - old_pkt_size))
698                         return XDP_ABORTED;
699         } else {
700                 if (bpf_skb_change_tail(ctx, new_pkt_size, 0))
701                         return XDP_ABORTED;
702         }
703
704         values_inc_synacks();
705
706         return XDP_TX;
707 }
708
709 static __always_inline int syncookie_handle_ack(struct header_pointers *hdr)
710 {
711         int err;
712
713         if (hdr->tcp->rst)
714                 return XDP_DROP;
715
716         if (hdr->ipv4)
717                 err = bpf_tcp_raw_check_syncookie_ipv4(hdr->ipv4, hdr->tcp);
718         else if (hdr->ipv6)
719                 err = bpf_tcp_raw_check_syncookie_ipv6(hdr->ipv6, hdr->tcp);
720         else
721                 return XDP_ABORTED;
722         if (err)
723                 return XDP_DROP;
724
725         return XDP_PASS;
726 }
727
/* First stage of packet processing, run before the packet is resized.
 *
 * Dissects the headers, consults conntrack, accepts only packets with exactly
 * one of SYN and ACK set, and then grows the packet tail by
 * TCP_MAXLEN - hdr->tcp_len bytes.
 *
 * Returns XDP_TX when the caller should go on to syncookie_part2() after
 * re-reading the packet pointers; any other value is the final verdict.
 */
728 static __always_inline int syncookie_part1(void *ctx, void *data, void *data_end,
729                                            struct header_pointers *hdr, bool xdp)
730 {
731         int ret;
732
733         ret = tcp_dissect(data, data_end, hdr);
734         if (ret != XDP_TX)
735                 return ret;
736
737         ret = tcp_lookup(ctx, hdr, xdp);
738         if (ret != XDP_TX)
739                 return ret;
740
741         /* Packet is TCP and doesn't belong to an established connection. */
742
	/* Drop packets that have both or neither of SYN and ACK set. */
743         if ((hdr->tcp->syn ^ hdr->tcp->ack) != 1)
744                 return XDP_DROP;
745
746         /* Grow the TCP header to TCP_MAXLEN to be able to pass any hdr->tcp_len
747          * to bpf_tcp_raw_gen_syncookie_ipv{4,6} and pass the verifier.
748          */
749         if (xdp) {
750                 if (bpf_xdp_adjust_tail(ctx, TCP_MAXLEN - hdr->tcp_len))
751                         return XDP_ABORTED;
752         } else {
753                 /* Without volatile the verifier throws this error:
754                  * R9 32-bit pointer arithmetic prohibited
755                  */
756                 volatile u64 old_len = data_end - data;
757
758                 if (bpf_skb_change_tail(ctx, old_len + TCP_MAXLEN - hdr->tcp_len, 0))
759                         return XDP_ABORTED;
760         }
761
762         return XDP_TX;
763 }
764
/* Second stage of packet processing, run after syncookie_part1() resized the
 * packet.
 *
 * Recomputes the header pointers in hdr from the fresh data/data_end, using
 * hdr->ipv4 / hdr->ipv6 from the first stage only to select the address
 * family, bounds-checks IPV4_MAXLEN and TCP_MAXLEN bytes, and dispatches to
 * the SYN or ACK handler.
 *
 * Returns the handler's verdict, or XDP_ABORTED when a bounds or length check
 * fails.
 */
765 static __always_inline int syncookie_part2(void *ctx, void *data, void *data_end,
766                                            struct header_pointers *hdr, bool xdp)
767 {
768         if (hdr->ipv4) {
769                 hdr->eth = data;
770                 hdr->ipv4 = (void *)hdr->eth + sizeof(*hdr->eth);
771                 /* IPV4_MAXLEN is needed when calculating checksum.
772                  * At least sizeof(struct iphdr) is needed here to access ihl.
773                  */
774                 if ((void *)hdr->ipv4 + IPV4_MAXLEN > data_end)
775                         return XDP_ABORTED;
776                 hdr->tcp = (void *)hdr->ipv4 + hdr->ipv4->ihl * 4;
777         } else if (hdr->ipv6) {
778                 hdr->eth = data;
779                 hdr->ipv6 = (void *)hdr->eth + sizeof(*hdr->eth);
780                 hdr->tcp = (void *)hdr->ipv6 + sizeof(*hdr->ipv6);
781         } else {
782                 return XDP_ABORTED;
783         }
784
785         if ((void *)hdr->tcp + TCP_MAXLEN > data_end)
786                 return XDP_ABORTED;
787
788         /* We run out of registers, tcp_len gets spilled to the stack, and the
789          * verifier forgets its min and max values checked above in tcp_dissect.
790          */
791         hdr->tcp_len = hdr->tcp->doff * 4;
792         if (hdr->tcp_len < sizeof(*hdr->tcp))
793                 return XDP_ABORTED;
794
795         return hdr->tcp->syn ? syncookie_handle_syn(hdr, ctx, data, data_end, xdp) :
796                                syncookie_handle_ack(hdr);
797 }
798
799 SEC("xdp")
800 int syncookie_xdp(struct xdp_md *ctx)
801 {
802         void *data_end = (void *)(long)ctx->data_end;
803         void *data = (void *)(long)ctx->data;
804         struct header_pointers hdr;
805         int ret;
806
807         ret = syncookie_part1(ctx, data, data_end, &hdr, true);
808         if (ret != XDP_TX)
809                 return ret;
810
811         data_end = (void *)(long)ctx->data_end;
812         data = (void *)(long)ctx->data;
813
814         return syncookie_part2(ctx, data, data_end, &hdr, true);
815 }
816
817 SEC("tc")
818 int syncookie_tc(struct __sk_buff *skb)
819 {
820         void *data_end = (void *)(long)skb->data_end;
821         void *data = (void *)(long)skb->data;
822         struct header_pointers hdr;
823         int ret;
824
825         ret = syncookie_part1(skb, data, data_end, &hdr, false);
826         if (ret != XDP_TX)
827                 return ret == XDP_PASS ? TC_ACT_OK : TC_ACT_SHOT;
828
829         data_end = (void *)(long)skb->data_end;
830         data = (void *)(long)skb->data;
831
832         ret = syncookie_part2(skb, data, data_end, &hdr, false);
833         switch (ret) {
834         case XDP_PASS:
835                 return TC_ACT_OK;
836         case XDP_TX:
837                 return bpf_redirect(skb->ifindex, 0);
838         default:
839                 return TC_ACT_SHOT;
840         }
841 }
842
/* License string placed in the "license" ELF section of the object. */
843 char _license[] SEC("license") = "GPL";