// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

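/* Release a raw-addressing (RDA) buffer: drop the outstanding page-ref bias
 * and unmap/free the DMA page backing this slot.
 */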
static void gve_rx_free_buffer(struct device *dev,
                               struct gve_rx_slot_page_info *page_info,
                               union gve_rx_data_slot *data_slot)
{
        dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
                                      GVE_DATA_SLOT_ADDR_PAGE_MASK);

        page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
        gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

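/* Release all posted buffers for a ring: free the pages directly in raw
 * addressing mode, or drop the bias references and return the QPL (plus the
 * copy pool pages) in QPL mode.
 */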
static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        u32 slots = rx->mask + 1;
        int i;

        if (rx->data.raw_addressing) {
                for (i = 0; i < slots; i++)
                        gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
                                           &rx->data.data_ring[i]);
        } else {
                for (i = 0; i < slots; i++)
                        page_ref_sub(rx->data.page_info[i].page,
                                     rx->data.page_info[i].pagecnt_bias - 1);
                gve_unassign_qpl(priv, rx->data.qpl->id);
                rx->data.qpl = NULL;

                for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
                        page_ref_sub(rx->qpl_copy_pool[i].page,
                                     rx->qpl_copy_pool[i].pagecnt_bias - 1);
                        put_page(rx->qpl_copy_pool[i].page);
                }
        }
        kvfree(rx->data.page_info);
        rx->data.page_info = NULL;
}

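/* Tear down one RX ring: detach it from its notify block and free the
 * descriptor ring, queue resources, posted buffers and copy pool.
 */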
static void gve_rx_free_ring(struct gve_priv *priv, int idx)
{
        struct gve_rx_ring *rx = &priv->rx[idx];
        struct device *dev = &priv->pdev->dev;
        u32 slots = rx->mask + 1;
        size_t bytes;

        gve_rx_remove_from_block(priv, idx);

        bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
        dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
        rx->desc.desc_ring = NULL;

        dma_free_coherent(dev, sizeof(*rx->q_resources),
                          rx->q_resources, rx->q_resources_bus);
        rx->q_resources = NULL;

        gve_rx_unfill_pages(priv, rx);

        bytes = sizeof(*rx->data.data_ring) * slots;
        dma_free_coherent(dev, bytes, rx->data.data_ring,
                          rx->data.data_bus);
        rx->data.data_ring = NULL;

        kvfree(rx->qpl_copy_pool);
        rx->qpl_copy_pool = NULL;

        netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

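/* Initialize a slot's page info and write its DMA address into the data slot.
 * The page reference count is inflated up front so later recycling checks can
 * compare page_count() against pagecnt_bias instead of taking new references.
 */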
static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
                             dma_addr_t addr, struct page *page, __be64 *slot_addr)
{
        page_info->page = page;
        page_info->page_offset = 0;
        page_info->page_address = page_address(page);
        *slot_addr = cpu_to_be64(addr);
        /* The page already has 1 ref */
        page_ref_add(page, INT_MAX - 1);
        page_info->pagecnt_bias = INT_MAX;
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
                               struct gve_rx_slot_page_info *page_info,
                               union gve_rx_data_slot *data_slot)
{
        struct page *page;
        dma_addr_t dma;
        int err;

        err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
                             GFP_ATOMIC);
        if (err)
                return err;

        gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
        return 0;
}

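/* Post the initial set of buffers for a ring. Returns the number of slots
 * filled on success, or a negative errno on failure.
 */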
static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
{
        struct gve_priv *priv = rx->gve;
        u32 slots;
        int err;
        int i;
        int j;

        /* Allocate one page per Rx queue slot. Each page is split into two
         * packet buffers; when possible we "page flip" between the two.
         */
        slots = rx->mask + 1;

        rx->data.page_info = kvzalloc(slots *
                                      sizeof(*rx->data.page_info), GFP_KERNEL);
        if (!rx->data.page_info)
                return -ENOMEM;

        if (!rx->data.raw_addressing) {
                rx->data.qpl = gve_assign_rx_qpl(priv, rx->q_num);
                if (!rx->data.qpl) {
                        kvfree(rx->data.page_info);
                        rx->data.page_info = NULL;
                        return -ENOMEM;
                }
        }
        for (i = 0; i < slots; i++) {
                if (!rx->data.raw_addressing) {
                        struct page *page = rx->data.qpl->pages[i];
                        dma_addr_t addr = i * PAGE_SIZE;

                        gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
                                            &rx->data.data_ring[i].qpl_offset);
                        continue;
                }
                err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i],
                                          &rx->data.data_ring[i]);
                if (err)
                        goto alloc_err_rda;
        }

        if (!rx->data.raw_addressing) {
                for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
                        struct page *page = alloc_page(GFP_KERNEL);

                        if (!page) {
                                err = -ENOMEM;
                                goto alloc_err_qpl;
                        }

                        rx->qpl_copy_pool[j].page = page;
                        rx->qpl_copy_pool[j].page_offset = 0;
                        rx->qpl_copy_pool[j].page_address = page_address(page);

                        /* The page already has 1 ref. */
                        page_ref_add(page, INT_MAX - 1);
                        rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
                }
        }

        return slots;

alloc_err_qpl:
        /* Fully free the copy pool pages. */
        while (j--) {
                page_ref_sub(rx->qpl_copy_pool[j].page,
                             rx->qpl_copy_pool[j].pagecnt_bias - 1);
                put_page(rx->qpl_copy_pool[j].page);
        }

        /* Do not fully free QPL pages - only remove the bias added in this
         * function with gve_setup_rx_buffer.
         */
        while (i--)
                page_ref_sub(rx->data.page_info[i].page,
                             rx->data.page_info[i].pagecnt_bias - 1);

        gve_unassign_qpl(priv, rx->data.qpl->id);
        rx->data.qpl = NULL;

        return err;

alloc_err_rda:
        while (i--)
                gve_rx_free_buffer(&priv->pdev->dev,
                                   &rx->data.page_info[i],
                                   &rx->data.data_ring[i]);
        return err;
}

static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
{
        ctx->skb_head = NULL;
        ctx->skb_tail = NULL;
        ctx->total_size = 0;
        ctx->frag_cnt = 0;
        ctx->drop_pkt = false;
}

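/* Allocate and initialize a single RX ring: data ring, copy pool, posted
 * buffers, queue resources and descriptor ring. On failure everything
 * allocated so far is unwound via the abort labels.
 */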
static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
{
        struct gve_rx_ring *rx = &priv->rx[idx];
        struct device *hdev = &priv->pdev->dev;
        u32 slots, npages;
        int filled_pages;
        size_t bytes;
        int err;

        netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
        /* Make sure everything is zeroed to start with */
        memset(rx, 0, sizeof(*rx));

        rx->gve = priv;
        rx->q_num = idx;

        slots = priv->rx_data_slot_cnt;
        rx->mask = slots - 1;
        rx->data.raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT;

        /* alloc rx data ring */
        bytes = sizeof(*rx->data.data_ring) * slots;
        rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
                                                &rx->data.data_bus,
                                                GFP_KERNEL);
        if (!rx->data.data_ring)
                return -ENOMEM;

        rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
        rx->qpl_copy_pool_head = 0;
        rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
                                     sizeof(rx->qpl_copy_pool[0]),
                                     GFP_KERNEL);

        if (!rx->qpl_copy_pool) {
                err = -ENOMEM;
                goto abort_with_slots;
        }

        filled_pages = gve_prefill_rx_pages(rx);
        if (filled_pages < 0) {
                err = -ENOMEM;
                goto abort_with_copy_pool;
        }
        rx->fill_cnt = filled_pages;
        /* Ensure data ring slots (packet buffers) are visible. */
        dma_wmb();

        /* Alloc gve_queue_resources */
        rx->q_resources =
                dma_alloc_coherent(hdev,
                                   sizeof(*rx->q_resources),
                                   &rx->q_resources_bus,
                                   GFP_KERNEL);
        if (!rx->q_resources) {
                err = -ENOMEM;
                goto abort_filled;
        }
        netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
                  (unsigned long)rx->data.data_bus);

        /* alloc rx desc ring */
        bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
        npages = bytes / PAGE_SIZE;
        if (npages * PAGE_SIZE != bytes) {
                err = -EIO;
                goto abort_with_q_resources;
        }

        rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
                                                GFP_KERNEL);
        if (!rx->desc.desc_ring) {
                err = -ENOMEM;
                goto abort_with_q_resources;
        }
        rx->cnt = 0;
        rx->db_threshold = priv->rx_desc_cnt / 2;
        rx->desc.seqno = 1;

        /* Allocating half-page buffers allows page-flipping which is faster
         * than copying or allocating new pages.
         */
        rx->packet_buffer_size = PAGE_SIZE / 2;
        gve_rx_ctx_clear(&rx->ctx);
        gve_rx_add_to_block(priv, idx);

        return 0;

abort_with_q_resources:
        dma_free_coherent(hdev, sizeof(*rx->q_resources),
                          rx->q_resources, rx->q_resources_bus);
        rx->q_resources = NULL;
abort_filled:
        gve_rx_unfill_pages(priv, rx);
abort_with_copy_pool:
        kvfree(rx->qpl_copy_pool);
        rx->qpl_copy_pool = NULL;
abort_with_slots:
        bytes = sizeof(*rx->data.data_ring) * slots;
        dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
        rx->data.data_ring = NULL;

        return err;
}

int gve_rx_alloc_rings(struct gve_priv *priv)
{
        int err = 0;
        int i;

        for (i = 0; i < priv->rx_cfg.num_queues; i++) {
                err = gve_rx_alloc_ring(priv, i);
                if (err) {
                        netif_err(priv, drv, priv->dev,
                                  "Failed to alloc rx ring=%d: err=%d\n",
                                  i, err);
                        break;
                }
        }
        /* Free any rings that were already allocated if there was an error */
        if (err) {
                int j;

                for (j = 0; j < i; j++)
                        gve_rx_free_ring(priv, j);
        }
        return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv)
{
        int i;

        for (i = 0; i < priv->rx_cfg.num_queues; i++)
                gve_rx_free_ring(priv, i);
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

        iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
        if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
                return PKT_HASH_TYPE_L4;
        if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
                return PKT_HASH_TYPE_L3;
        return PKT_HASH_TYPE_L2;
}

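/* Attach a received buffer to the per-packet skb as a page fragment, starting
 * a new frag_list skb when the current one runs out of frag slots.
 * Returns the head skb for the packet, or NULL on allocation failure.
 */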
static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
                                        struct gve_rx_slot_page_info *page_info,
                                        u16 packet_buffer_size, u16 len,
                                        struct gve_rx_ctx *ctx)
{
        u32 offset = page_info->page_offset + page_info->pad;
        struct sk_buff *skb = ctx->skb_tail;
        int num_frags = 0;

        if (!skb) {
                skb = napi_get_frags(napi);
                if (unlikely(!skb))
                        return NULL;

                ctx->skb_head = skb;
                ctx->skb_tail = skb;
        } else {
                num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
                if (num_frags == MAX_SKB_FRAGS) {
                        skb = napi_alloc_skb(napi, 0);
                        if (!skb)
                                return NULL;

                        // We will never chain more than two SKBs: 2 * 16 * 2k > 64k
                        // which is why we do not need to chain by using skb->next
                        skb_shinfo(ctx->skb_tail)->frag_list = skb;

                        ctx->skb_tail = skb;
                        num_frags = 0;
                }
        }

        if (skb != ctx->skb_head) {
                ctx->skb_head->len += len;
                ctx->skb_head->data_len += len;
                ctx->skb_head->truesize += packet_buffer_size;
        }
        skb_add_rx_frag(skb, num_frags, page_info->page,
                        offset, len, packet_buffer_size);

        return ctx->skb_head;
}

static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
        const __be64 offset = cpu_to_be64(PAGE_SIZE / 2);

        /* "flip" to other packet buffer on this page */
        page_info->page_offset ^= PAGE_SIZE / 2;
        *(slot_addr) ^= offset;
}

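/* Returns 1 if the page is no longer referenced by the stack and can be
 * recycled, 0 if it is still in use, or -1 if the reference counting is
 * broken (page count dropped below the bias).
 */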
static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
        int pagecount = page_count(page_info->page);

        /* This page is not being used by any SKBs - reuse */
        if (pagecount == page_info->pagecnt_bias)
                return 1;
        /* This page is still being used by an SKB - we can't reuse */
        else if (pagecount > page_info->pagecnt_bias)
                return 0;
        WARN(pagecount < page_info->pagecnt_bias,
             "Pagecount should never be less than the bias.");
        return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
                      struct gve_rx_slot_page_info *page_info, u16 len,
                      struct napi_struct *napi,
                      union gve_rx_data_slot *data_slot,
                      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
{
        struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);

        if (!skb)
                return NULL;

        /* Optimistically stop the kernel from freeing the page.
         * We will check again in refill to determine if we need to alloc a
         * new page.
         */
        gve_dec_pagecnt_bias(page_info);

        return skb;
}

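/* QPL slow path: copy the received fragment into a page from the ring's copy
 * pool so the registered QPL buffer can be returned to the device. Falls back
 * to allocating a fresh page if the least recently used pool page is still
 * held by the stack.
 */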
static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
                                           struct gve_rx_slot_page_info *page_info,
                                           u16 len, struct napi_struct *napi)
{
        u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
        void *src = page_info->page_address + page_info->page_offset;
        struct gve_rx_slot_page_info *copy_page_info;
        struct gve_rx_ctx *ctx = &rx->ctx;
        bool alloc_page = false;
        struct sk_buff *skb;
        void *dst;

        copy_page_info = &rx->qpl_copy_pool[pool_idx];
        if (!copy_page_info->can_flip) {
                int recycle = gve_rx_can_recycle_buffer(copy_page_info);

                if (unlikely(recycle < 0)) {
                        gve_schedule_reset(rx->gve);
                        return NULL;
                }
                alloc_page = !recycle;
        }

        if (alloc_page) {
                struct gve_rx_slot_page_info alloc_page_info;
                struct page *page;

                /* The least recently used page turned out to be
                 * still in use by the kernel. Ignoring it and moving
                 * on alleviates head-of-line blocking.
                 */
                rx->qpl_copy_pool_head++;

                page = alloc_page(GFP_ATOMIC);
                if (!page)
                        return NULL;

                alloc_page_info.page = page;
                alloc_page_info.page_offset = 0;
                alloc_page_info.page_address = page_address(page);
                alloc_page_info.pad = page_info->pad;

                memcpy(alloc_page_info.page_address, src, page_info->pad + len);
                skb = gve_rx_add_frags(napi, &alloc_page_info,
                                       rx->packet_buffer_size,
                                       len, ctx);

                u64_stats_update_begin(&rx->statss);
                rx->rx_frag_copy_cnt++;
                rx->rx_frag_alloc_cnt++;
                u64_stats_update_end(&rx->statss);

                return skb;
        }

        dst = copy_page_info->page_address + copy_page_info->page_offset;
        memcpy(dst, src, page_info->pad + len);
        copy_page_info->pad = page_info->pad;

        skb = gve_rx_add_frags(napi, copy_page_info,
                               rx->packet_buffer_size, len, ctx);
        if (unlikely(!skb))
                return NULL;

        gve_dec_pagecnt_bias(copy_page_info);
        copy_page_info->page_offset += rx->packet_buffer_size;
        copy_page_info->page_offset &= (PAGE_SIZE - 1);

        if (copy_page_info->can_flip) {
                /* We have used both halves of this copy page, it
                 * is time for it to go to the back of the queue.
                 */
                copy_page_info->can_flip = false;
                rx->qpl_copy_pool_head++;
                prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
        } else {
                copy_page_info->can_flip = true;
        }

        u64_stats_update_begin(&rx->statss);
        rx->rx_frag_copy_cnt++;
        u64_stats_update_end(&rx->statss);

        return skb;
}

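/* QPL receive path: either attach the registered buffer directly and flip to
 * the other half of the page, or copy the data out through the copy pool when
 * the buffer cannot be recycled.
 */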
static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
           struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
           u16 len, struct napi_struct *napi,
           union gve_rx_data_slot *data_slot)
{
        struct gve_rx_ctx *ctx = &rx->ctx;
        struct sk_buff *skb;

        /* If raw_addressing mode is not enabled, gvnic can only receive into
         * registered segments. If the buffer can't be recycled, our only
         * choice is to copy the data out of it so that we can return it to the
         * device.
         */
        if (page_info->can_flip) {
                skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx);
                /* No point in recycling if we didn't get the skb */
                if (skb) {
                        /* Make sure that the page isn't freed. */
                        gve_dec_pagecnt_bias(page_info);
                        gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
                }
        } else {
                skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
        }
        return skb;
}

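/* Build an skb for a received fragment: small single-fragment packets are
 * copied outright (rx_copybreak), otherwise the buffer is attached via the
 * raw-addressing or QPL path depending on the queue format.
 */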
static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
                                  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
                                  u16 len, union gve_rx_data_slot *data_slot,
                                  bool is_only_frag)
{
        struct net_device *netdev = priv->dev;
        struct gve_rx_ctx *ctx = &rx->ctx;
        struct sk_buff *skb = NULL;

        if (len <= priv->rx_copybreak && is_only_frag) {
                /* Just copy small packets */
                skb = gve_rx_copy(netdev, napi, page_info, len);
                if (skb) {
                        u64_stats_update_begin(&rx->statss);
                        rx->rx_copied_pkt++;
                        rx->rx_frag_copy_cnt++;
                        rx->rx_copybreak_pkt++;
                        u64_stats_update_end(&rx->statss);
                }
        } else {
                int recycle = gve_rx_can_recycle_buffer(page_info);

                if (unlikely(recycle < 0)) {
                        gve_schedule_reset(priv);
                        return NULL;
                }
                page_info->can_flip = recycle;
                if (page_info->can_flip) {
                        u64_stats_update_begin(&rx->statss);
                        rx->rx_frag_flip_cnt++;
                        u64_stats_update_end(&rx->statss);
                }

                if (rx->data.raw_addressing) {
                        skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
                                                    page_info, len, napi,
                                                    data_slot,
                                                    rx->packet_buffer_size, ctx);
                } else {
                        skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
                                         page_info, len, napi, data_slot);
                }
        }
        return skb;
}

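/* XDP_REDIRECT helper for AF_XDP: copy the packet into a buffer allocated
 * from the queue's XSK buffer pool before redirecting.
 */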
static int gve_xsk_pool_redirect(struct net_device *dev,
                                 struct gve_rx_ring *rx,
                                 void *data, int len,
                                 struct bpf_prog *xdp_prog)
{
        struct xdp_buff *xdp;
        int err;

        if (rx->xsk_pool->frame_len < len)
                return -E2BIG;
        xdp = xsk_buff_alloc(rx->xsk_pool);
        if (!xdp) {
                u64_stats_update_begin(&rx->statss);
                rx->xdp_alloc_fails++;
                u64_stats_update_end(&rx->statss);
                return -ENOMEM;
        }
        xdp->data_end = xdp->data + len;
        memcpy(xdp->data, data, len);
        err = xdp_do_redirect(dev, xdp, xdp_prog);
        if (err)
                xsk_buff_free(xdp);
        return err;
}

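/* Handle XDP_REDIRECT: since the RX buffer may be recycled, the frame is
 * copied into a freshly allocated page fragment (or an XSK buffer) before
 * being handed to xdp_do_redirect().
 */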
static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
                            struct xdp_buff *orig, struct bpf_prog *xdp_prog)
{
        int total_len, len = orig->data_end - orig->data;
        int headroom = XDP_PACKET_HEADROOM;
        struct xdp_buff new;
        void *frame;
        int err;

        if (rx->xsk_pool)
                return gve_xsk_pool_redirect(dev, rx, orig->data,
                                             len, xdp_prog);

        total_len = headroom + SKB_DATA_ALIGN(len) +
                SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
        if (!frame) {
                u64_stats_update_begin(&rx->statss);
                rx->xdp_alloc_fails++;
                u64_stats_update_end(&rx->statss);
                return -ENOMEM;
        }
        xdp_init_buff(&new, total_len, &rx->xdp_rxq);
        xdp_prepare_buff(&new, frame, headroom, len, false);
        memcpy(new.data, orig->data, len);

        err = xdp_do_redirect(dev, &new, xdp_prog);
        if (err)
                page_frag_free(frame);

        return err;
}

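/* Act on the verdict of the XDP program: transmit on the paired XDP TX queue,
 * redirect, or drop, and record the action in the per-ring statistics.
 */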
static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
                         struct xdp_buff *xdp, struct bpf_prog *xprog,
                         int xdp_act)
{
        struct gve_tx_ring *tx;
        int tx_qid;
        int err;

        switch (xdp_act) {
        case XDP_ABORTED:
        case XDP_DROP:
        default:
                break;
        case XDP_TX:
                tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
                tx = &priv->tx[tx_qid];
                spin_lock(&tx->xdp_lock);
                err = gve_xdp_xmit_one(priv, tx, xdp->data,
                                       xdp->data_end - xdp->data, NULL);
                spin_unlock(&tx->xdp_lock);

                if (unlikely(err)) {
                        u64_stats_update_begin(&rx->statss);
                        rx->xdp_tx_errors++;
                        u64_stats_update_end(&rx->statss);
                }
                break;
        case XDP_REDIRECT:
                err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);

                if (unlikely(err)) {
                        u64_stats_update_begin(&rx->statss);
                        rx->xdp_redirect_errors++;
                        u64_stats_update_end(&rx->statss);
                }
                break;
        }
        u64_stats_update_begin(&rx->statss);
        if ((u32)xdp_act < GVE_XDP_ACTIONS)
                rx->xdp_actions[xdp_act]++;
        u64_stats_update_end(&rx->statss);
}

#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
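/* Process one RX descriptor/fragment: run XDP on single-fragment packets,
 * build or extend the skb for the current packet, and hand the completed
 * packet to GRO on the last fragment.
 */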
static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
                   struct gve_rx_desc *desc, u32 idx,
                   struct gve_rx_cnts *cnts)
{
        bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
        struct gve_rx_slot_page_info *page_info;
        u16 frag_size = be16_to_cpu(desc->len);
        struct gve_rx_ctx *ctx = &rx->ctx;
        union gve_rx_data_slot *data_slot;
        struct gve_priv *priv = rx->gve;
        struct sk_buff *skb = NULL;
        struct bpf_prog *xprog;
        struct xdp_buff xdp;
        dma_addr_t page_bus;
        void *va;

        u16 len = frag_size;
        struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
        bool is_first_frag = ctx->frag_cnt == 0;

        bool is_only_frag = is_first_frag && is_last_frag;

        if (unlikely(ctx->drop_pkt))
                goto finish_frag;

        if (desc->flags_seq & GVE_RXF_ERR) {
                ctx->drop_pkt = true;
                cnts->desc_err_pkt_cnt++;
                napi_free_frags(napi);
                goto finish_frag;
        }

        if (unlikely(frag_size > rx->packet_buffer_size)) {
                netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
                            frag_size, rx->packet_buffer_size);
                ctx->drop_pkt = true;
                napi_free_frags(napi);
                gve_schedule_reset(rx->gve);
                goto finish_frag;
        }

        /* Prefetch two packet buffers ahead; we will need them soon. */
        page_info = &rx->data.page_info[(idx + 2) & rx->mask];
        va = page_info->page_address + page_info->page_offset;
        prefetch(page_info->page); /* Kernel page struct. */
        prefetch(va);              /* Packet header. */
        prefetch(va + 64);         /* Next cacheline too. */

        page_info = &rx->data.page_info[idx];
        data_slot = &rx->data.data_ring[idx];
        page_bus = (rx->data.raw_addressing) ?
                be64_to_cpu(data_slot->addr) - page_info->page_offset :
                rx->data.qpl->page_buses[idx];
        dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
                                PAGE_SIZE, DMA_FROM_DEVICE);
        page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
        len -= page_info->pad;
        frag_size -= page_info->pad;

        xprog = READ_ONCE(priv->xdp_prog);
        if (xprog && is_only_frag) {
                void *old_data;
                int xdp_act;

                xdp_init_buff(&xdp, rx->packet_buffer_size, &rx->xdp_rxq);
                xdp_prepare_buff(&xdp, page_info->page_address +
                                 page_info->page_offset, GVE_RX_PAD,
                                 len, false);
                old_data = xdp.data;
                xdp_act = bpf_prog_run_xdp(xprog, &xdp);
                if (xdp_act != XDP_PASS) {
                        gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
                        ctx->total_size += frag_size;
                        goto finish_ok_pkt;
                }

                page_info->pad += xdp.data - old_data;
                len = xdp.data_end - xdp.data;

                u64_stats_update_begin(&rx->statss);
                rx->xdp_actions[XDP_PASS]++;
                u64_stats_update_end(&rx->statss);
        }

        skb = gve_rx_skb(priv, rx, page_info, napi, len,
                         data_slot, is_only_frag);
        if (!skb) {
                u64_stats_update_begin(&rx->statss);
                rx->rx_skb_alloc_fail++;
                u64_stats_update_end(&rx->statss);

                napi_free_frags(napi);
                ctx->drop_pkt = true;
                goto finish_frag;
        }
        ctx->total_size += frag_size;

        if (is_first_frag) {
                if (likely(feat & NETIF_F_RXCSUM)) {
                        /* NIC passes up the partial sum */
                        if (desc->csum)
                                skb->ip_summed = CHECKSUM_COMPLETE;
                        else
                                skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = csum_unfold(desc->csum);
                }

                /* parse flags & pass relevant info up */
                if (likely(feat & NETIF_F_RXHASH) &&
                    gve_needs_rss(desc->flags_seq))
                        skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
                                     gve_rss_type(desc->flags_seq));
        }

        if (is_last_frag) {
                skb_record_rx_queue(skb, rx->q_num);
                if (skb_is_nonlinear(skb))
                        napi_gro_frags(napi);
                else
                        napi_gro_receive(napi, skb);
                goto finish_ok_pkt;
        }

        goto finish_frag;

finish_ok_pkt:
        cnts->ok_pkt_bytes += ctx->total_size;
        cnts->ok_pkt_cnt++;
finish_frag:
        ctx->frag_cnt++;
        if (is_last_frag) {
                cnts->total_pkt_cnt++;
                cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
                gve_rx_ctx_clear(ctx);
        }
}

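/* Returns true if the next descriptor in the ring has been written by the
 * device, i.e. there is RX work pending.
 */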
bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
        struct gve_rx_desc *desc;
        __be16 flags_seq;
        u32 next_idx;

        next_idx = rx->cnt & rx->mask;
        desc = rx->desc.desc_ring + next_idx;

        flags_seq = desc->flags_seq;

        return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}

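/* Repost buffers to the device in raw addressing mode: flip to the free half
 * of a page when possible, otherwise reuse the page or allocate a new one.
 * Returns false if a page's refcount check failed.
 */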
static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        int refill_target = rx->mask + 1;
        u32 fill_cnt = rx->fill_cnt;

        while (fill_cnt - rx->cnt < refill_target) {
                struct gve_rx_slot_page_info *page_info;
                u32 idx = fill_cnt & rx->mask;

                page_info = &rx->data.page_info[idx];
                if (page_info->can_flip) {
                        /* The other half of the page is free because it was
                         * free when we processed the descriptor. Flip to it.
                         */
                        union gve_rx_data_slot *data_slot =
                                                &rx->data.data_ring[idx];

                        gve_rx_flip_buff(page_info, &data_slot->addr);
                        page_info->can_flip = 0;
                } else {
                        /* It is possible that the networking stack has already
                         * finished processing all outstanding packets in the buffer
                         * and it can be reused.
                         * Flipping is unnecessary here - if the networking stack still
                         * owns half the page it is impossible to tell which half. Either
                         * the whole page is free or it needs to be replaced.
                         */
                        int recycle = gve_rx_can_recycle_buffer(page_info);

                        if (recycle < 0) {
                                if (!rx->data.raw_addressing)
                                        gve_schedule_reset(priv);
                                return false;
                        }
                        if (!recycle) {
                                /* We can't reuse the buffer - alloc a new one */
                                union gve_rx_data_slot *data_slot =
                                                &rx->data.data_ring[idx];
                                struct device *dev = &priv->pdev->dev;
                                gve_rx_free_buffer(dev, page_info, data_slot);
                                page_info->page = NULL;
                                if (gve_rx_alloc_buffer(priv, dev, page_info,
                                                        data_slot)) {
                                        u64_stats_update_begin(&rx->statss);
                                        rx->rx_buf_alloc_fail++;
                                        u64_stats_update_end(&rx->statss);
                                        break;
                                }
                        }
                }
                fill_cnt++;
        }
        rx->fill_cnt = fill_cnt;
        return true;
}

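/* Poll work for one ring: process descriptors up to the NAPI budget, flush
 * pending XDP transmits and redirects, restock buffers and ring the doorbell.
 * Returns the number of packets completed, or the full budget when the queue
 * must be rescheduled to finish refilling buffers.
 */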
static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
                             netdev_features_t feat)
{
        u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
        u64 xdp_txs = rx->xdp_actions[XDP_TX];
        struct gve_rx_ctx *ctx = &rx->ctx;
        struct gve_priv *priv = rx->gve;
        struct gve_rx_cnts cnts = {0};
        struct gve_rx_desc *next_desc;
        u32 idx = rx->cnt & rx->mask;
        u32 work_done = 0;

        struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];

        // Exceed budget only if (and until) the in-flight packet is consumed.
        while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
               (work_done < budget || ctx->frag_cnt)) {
                next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
                prefetch(next_desc);

                gve_rx(rx, feat, desc, idx, &cnts);

                rx->cnt++;
                idx = rx->cnt & rx->mask;
                desc = &rx->desc.desc_ring[idx];
                rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
                work_done++;
        }

        // The device will only send whole packets.
        if (unlikely(ctx->frag_cnt)) {
                struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;

                napi_free_frags(napi);
                gve_rx_ctx_clear(&rx->ctx);
                netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
                            GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
                gve_schedule_reset(rx->gve);
        }

        if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
                return 0;

        if (work_done) {
                u64_stats_update_begin(&rx->statss);
                rx->rpackets += cnts.ok_pkt_cnt;
                rx->rbytes += cnts.ok_pkt_bytes;
                rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
                rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
                u64_stats_update_end(&rx->statss);
        }

        if (xdp_txs != rx->xdp_actions[XDP_TX])
                gve_xdp_tx_flush(priv, rx->q_num);

        if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
                xdp_do_flush();

        /* restock ring slots */
        if (!rx->data.raw_addressing) {
                /* In QPL mode buffers are refilled as the descriptors are processed */
                rx->fill_cnt += work_done;
        } else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
                /* In raw addressing mode buffers are only refilled if the number
                 * of available buffers falls below a threshold.
                 */
                if (!gve_rx_refill_buffers(priv, rx))
                        return 0;

                /* If we were not able to completely refill buffers, we'll want
                 * to schedule this queue for work again to refill buffers.
                 */
                if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
                        gve_rx_write_doorbell(priv, rx);
                        return budget;
                }
        }

        gve_rx_write_doorbell(priv, rx);
        return cnts.total_pkt_cnt;
}

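/* NAPI poll entry point for a GQI RX queue; a zero budget means unbounded
 * cleanup. Returns the amount of work done.
 */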
int gve_rx_poll(struct gve_notify_block *block, int budget)
{
        struct gve_rx_ring *rx = block->rx;
        netdev_features_t feat;
        int work_done = 0;

        feat = block->napi.dev->features;

        /* If budget is 0, do all the work */
        if (budget == 0)
                budget = INT_MAX;

        if (budget > 0)
                work_done = gve_clean_rx_done(rx, budget, feat);

        return work_done;
}