// SPDX-License-Identifier: (GPL-2.0 OR MIT)
/* Google virtual Ethernet (gve) driver
 *
 * Copyright (C) 2015-2021 Google, Inc.
 */

#include "gve.h"
#include "gve_adminq.h"
#include "gve_utils.h"
#include <linux/etherdevice.h>
#include <linux/filter.h>
#include <net/xdp.h>
#include <net/xdp_sock_drv.h>

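/* Release a raw-addressing (RDA) buffer: drop the outstanding page-ref bias
 * and unmap/free the DMA page backing this slot.
 */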
static void gve_rx_free_buffer(struct device *dev,
                               struct gve_rx_slot_page_info *page_info,
                               union gve_rx_data_slot *data_slot)
{
        dma_addr_t dma = (dma_addr_t)(be64_to_cpu(data_slot->addr) &
                                      GVE_DATA_SLOT_ADDR_PAGE_MASK);

        page_ref_sub(page_info->page, page_info->pagecnt_bias - 1);
        gve_free_page(dev, page_info->page, dma, DMA_FROM_DEVICE);
}

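/* Release all posted buffers for a ring: free the pages directly in raw
 * addressing mode, or drop the bias references and return the QPL (plus the
 * copy pool pages) in QPL mode.
 */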
static void gve_rx_unfill_pages(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        u32 slots = rx->mask + 1;
        int i;

        if (rx->data.raw_addressing) {
                for (i = 0; i < slots; i++)
                        gve_rx_free_buffer(&priv->pdev->dev, &rx->data.page_info[i],
                                           &rx->data.data_ring[i]);
        } else {
                for (i = 0; i < slots; i++)
                        page_ref_sub(rx->data.page_info[i].page,
                                     rx->data.page_info[i].pagecnt_bias - 1);
                gve_unassign_qpl(priv, rx->data.qpl->id);
                rx->data.qpl = NULL;

                for (i = 0; i < rx->qpl_copy_pool_mask + 1; i++) {
                        page_ref_sub(rx->qpl_copy_pool[i].page,
                                     rx->qpl_copy_pool[i].pagecnt_bias - 1);
                        put_page(rx->qpl_copy_pool[i].page);
                }
        }
        kvfree(rx->data.page_info);
        rx->data.page_info = NULL;
}

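/* Tear down one RX ring: detach it from its notify block and free the
 * descriptor ring, queue resources, posted buffers and copy pool.
 */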
static void gve_rx_free_ring(struct gve_priv *priv, int idx)
{
        struct gve_rx_ring *rx = &priv->rx[idx];
        struct device *dev = &priv->pdev->dev;
        u32 slots = rx->mask + 1;
        size_t bytes;

        gve_rx_remove_from_block(priv, idx);

        bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
        dma_free_coherent(dev, bytes, rx->desc.desc_ring, rx->desc.bus);
        rx->desc.desc_ring = NULL;

        dma_free_coherent(dev, sizeof(*rx->q_resources),
                          rx->q_resources, rx->q_resources_bus);
        rx->q_resources = NULL;

        gve_rx_unfill_pages(priv, rx);

        bytes = sizeof(*rx->data.data_ring) * slots;
        dma_free_coherent(dev, bytes, rx->data.data_ring,
                          rx->data.data_bus);
        rx->data.data_ring = NULL;

        kvfree(rx->qpl_copy_pool);
        rx->qpl_copy_pool = NULL;

        netif_dbg(priv, drv, priv->dev, "freed rx ring %d\n", idx);
}

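/* Initialize a slot's page info and write its DMA address into the data slot.
 * The page reference count is inflated up front so later recycling checks can
 * compare page_count() against pagecnt_bias instead of taking new references.
 */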
static void gve_setup_rx_buffer(struct gve_rx_slot_page_info *page_info,
                             dma_addr_t addr, struct page *page, __be64 *slot_addr)
{
        page_info->page = page;
        page_info->page_offset = 0;
        page_info->page_address = page_address(page);
        *slot_addr = cpu_to_be64(addr);
        /* The page already has 1 ref */
        page_ref_add(page, INT_MAX - 1);
        page_info->pagecnt_bias = INT_MAX;
}

static int gve_rx_alloc_buffer(struct gve_priv *priv, struct device *dev,
                               struct gve_rx_slot_page_info *page_info,
                               union gve_rx_data_slot *data_slot)
{
        struct page *page;
        dma_addr_t dma;
        int err;

        err = gve_alloc_page(priv, dev, &page, &dma, DMA_FROM_DEVICE,
                             GFP_ATOMIC);
        if (err)
                return err;

        gve_setup_rx_buffer(page_info, dma, page, &data_slot->addr);
        return 0;
}

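/* Post the initial set of buffers for a ring. Returns the number of slots
 * filled on success, or a negative errno on failure.
 */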
static int gve_prefill_rx_pages(struct gve_rx_ring *rx)
{
        struct gve_priv *priv = rx->gve;
        u32 slots;
        int err;
        int i;
        int j;

        /* Allocate one page per Rx queue slot. Each page is split into two
         * packet buffers; when possible we "page flip" between the two.
         */
        slots = rx->mask + 1;

        rx->data.page_info = kvzalloc(slots *
                                      sizeof(*rx->data.page_info), GFP_KERNEL);
        if (!rx->data.page_info)
                return -ENOMEM;

        if (!rx->data.raw_addressing) {
                rx->data.qpl = gve_assign_rx_qpl(priv, rx->q_num);
                if (!rx->data.qpl) {
                        kvfree(rx->data.page_info);
                        rx->data.page_info = NULL;
                        return -ENOMEM;
                }
        }
        for (i = 0; i < slots; i++) {
                if (!rx->data.raw_addressing) {
                        struct page *page = rx->data.qpl->pages[i];
                        dma_addr_t addr = i * PAGE_SIZE;

                        gve_setup_rx_buffer(&rx->data.page_info[i], addr, page,
                                            &rx->data.data_ring[i].qpl_offset);
                        continue;
                }
                err = gve_rx_alloc_buffer(priv, &priv->pdev->dev, &rx->data.page_info[i],
                                          &rx->data.data_ring[i]);
                if (err)
                        goto alloc_err_rda;
        }

        if (!rx->data.raw_addressing) {
                for (j = 0; j < rx->qpl_copy_pool_mask + 1; j++) {
                        struct page *page = alloc_page(GFP_KERNEL);

                        if (!page) {
                                err = -ENOMEM;
                                goto alloc_err_qpl;
                        }

                        rx->qpl_copy_pool[j].page = page;
                        rx->qpl_copy_pool[j].page_offset = 0;
                        rx->qpl_copy_pool[j].page_address = page_address(page);

                        /* The page already has 1 ref. */
                        page_ref_add(page, INT_MAX - 1);
                        rx->qpl_copy_pool[j].pagecnt_bias = INT_MAX;
                }
        }

        return slots;

alloc_err_qpl:
        /* Fully free the copy pool pages. */
        while (j--) {
                page_ref_sub(rx->qpl_copy_pool[j].page,
                             rx->qpl_copy_pool[j].pagecnt_bias - 1);
                put_page(rx->qpl_copy_pool[j].page);
        }

        /* Do not fully free QPL pages - only remove the bias added in this
         * function with gve_setup_rx_buffer.
         */
        while (i--)
                page_ref_sub(rx->data.page_info[i].page,
                             rx->data.page_info[i].pagecnt_bias - 1);

        gve_unassign_qpl(priv, rx->data.qpl->id);
        rx->data.qpl = NULL;

        return err;

alloc_err_rda:
        while (i--)
                gve_rx_free_buffer(&priv->pdev->dev,
                                   &rx->data.page_info[i],
                                   &rx->data.data_ring[i]);
        return err;
}

static void gve_rx_ctx_clear(struct gve_rx_ctx *ctx)
{
        ctx->skb_head = NULL;
        ctx->skb_tail = NULL;
        ctx->total_size = 0;
        ctx->frag_cnt = 0;
        ctx->drop_pkt = false;
}

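/* Allocate and initialize a single RX ring: data ring, copy pool, posted
 * buffers, queue resources and descriptor ring. On failure everything
 * allocated so far is unwound via the abort labels.
 */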
static int gve_rx_alloc_ring(struct gve_priv *priv, int idx)
{
        struct gve_rx_ring *rx = &priv->rx[idx];
        struct device *hdev = &priv->pdev->dev;
        u32 slots, npages;
        int filled_pages;
        size_t bytes;
        int err;

        netif_dbg(priv, drv, priv->dev, "allocating rx ring\n");
        /* Make sure everything is zeroed to start with */
        memset(rx, 0, sizeof(*rx));

        rx->gve = priv;
        rx->q_num = idx;

        slots = priv->rx_data_slot_cnt;
        rx->mask = slots - 1;
        rx->data.raw_addressing = priv->queue_format == GVE_GQI_RDA_FORMAT;

        /* alloc rx data ring */
        bytes = sizeof(*rx->data.data_ring) * slots;
        rx->data.data_ring = dma_alloc_coherent(hdev, bytes,
                                                &rx->data.data_bus,
                                                GFP_KERNEL);
        if (!rx->data.data_ring)
                return -ENOMEM;

        rx->qpl_copy_pool_mask = min_t(u32, U32_MAX, slots * 2) - 1;
        rx->qpl_copy_pool_head = 0;
        rx->qpl_copy_pool = kvcalloc(rx->qpl_copy_pool_mask + 1,
                                     sizeof(rx->qpl_copy_pool[0]),
                                     GFP_KERNEL);

        if (!rx->qpl_copy_pool) {
                err = -ENOMEM;
                goto abort_with_slots;
        }

        filled_pages = gve_prefill_rx_pages(rx);
        if (filled_pages < 0) {
                err = -ENOMEM;
                goto abort_with_copy_pool;
        }
        rx->fill_cnt = filled_pages;
        /* Ensure data ring slots (packet buffers) are visible. */
        dma_wmb();

        /* Alloc gve_queue_resources */
        rx->q_resources =
                dma_alloc_coherent(hdev,
                                   sizeof(*rx->q_resources),
                                   &rx->q_resources_bus,
                                   GFP_KERNEL);
        if (!rx->q_resources) {
                err = -ENOMEM;
                goto abort_filled;
        }
        netif_dbg(priv, drv, priv->dev, "rx[%d]->data.data_bus=%lx\n", idx,
                  (unsigned long)rx->data.data_bus);

        /* alloc rx desc ring */
        bytes = sizeof(struct gve_rx_desc) * priv->rx_desc_cnt;
        npages = bytes / PAGE_SIZE;
        if (npages * PAGE_SIZE != bytes) {
                err = -EIO;
                goto abort_with_q_resources;
        }

        rx->desc.desc_ring = dma_alloc_coherent(hdev, bytes, &rx->desc.bus,
                                                GFP_KERNEL);
        if (!rx->desc.desc_ring) {
                err = -ENOMEM;
                goto abort_with_q_resources;
        }
        rx->cnt = 0;
        rx->db_threshold = priv->rx_desc_cnt / 2;
        rx->desc.seqno = 1;

        /* Allocating half-page buffers allows page-flipping which is faster
         * than copying or allocating new pages.
         */
        rx->packet_buffer_size = PAGE_SIZE / 2;
        gve_rx_ctx_clear(&rx->ctx);
        gve_rx_add_to_block(priv, idx);

        return 0;

abort_with_q_resources:
        dma_free_coherent(hdev, sizeof(*rx->q_resources),
                          rx->q_resources, rx->q_resources_bus);
        rx->q_resources = NULL;
abort_filled:
        gve_rx_unfill_pages(priv, rx);
abort_with_copy_pool:
        kvfree(rx->qpl_copy_pool);
        rx->qpl_copy_pool = NULL;
abort_with_slots:
        bytes = sizeof(*rx->data.data_ring) * slots;
        dma_free_coherent(hdev, bytes, rx->data.data_ring, rx->data.data_bus);
        rx->data.data_ring = NULL;

        return err;
}

int gve_rx_alloc_rings(struct gve_priv *priv)
{
        int err = 0;
        int i;

        for (i = 0; i < priv->rx_cfg.num_queues; i++) {
                err = gve_rx_alloc_ring(priv, i);
                if (err) {
                        netif_err(priv, drv, priv->dev,
                                  "Failed to alloc rx ring=%d: err=%d\n",
                                  i, err);
                        break;
                }
        }
        /* Free any rings that were already allocated if there was an error */
        if (err) {
                int j;

                for (j = 0; j < i; j++)
                        gve_rx_free_ring(priv, j);
        }
        return err;
}

void gve_rx_free_rings_gqi(struct gve_priv *priv)
{
        int i;

        for (i = 0; i < priv->rx_cfg.num_queues; i++)
                gve_rx_free_ring(priv, i);
}

void gve_rx_write_doorbell(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        u32 db_idx = be32_to_cpu(rx->q_resources->db_index);

        iowrite32be(rx->fill_cnt, &priv->db_bar2[db_idx]);
}

static enum pkt_hash_types gve_rss_type(__be16 pkt_flags)
{
        if (likely(pkt_flags & (GVE_RXF_TCP | GVE_RXF_UDP)))
                return PKT_HASH_TYPE_L4;
        if (pkt_flags & (GVE_RXF_IPV4 | GVE_RXF_IPV6))
                return PKT_HASH_TYPE_L3;
        return PKT_HASH_TYPE_L2;
}

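/* Attach a received buffer to the per-packet skb as a page fragment, starting
 * a new frag_list skb when the current one runs out of frag slots.
 * Returns the head skb for the packet, or NULL on allocation failure.
 */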
static struct sk_buff *gve_rx_add_frags(struct napi_struct *napi,
                                        struct gve_rx_slot_page_info *page_info,
                                        u16 packet_buffer_size, u16 len,
                                        struct gve_rx_ctx *ctx)
{
        u32 offset = page_info->page_offset + page_info->pad;
        struct sk_buff *skb = ctx->skb_tail;
        int num_frags = 0;

        if (!skb) {
                skb = napi_get_frags(napi);
                if (unlikely(!skb))
                        return NULL;

                ctx->skb_head = skb;
                ctx->skb_tail = skb;
        } else {
                num_frags = skb_shinfo(ctx->skb_tail)->nr_frags;
                if (num_frags == MAX_SKB_FRAGS) {
                        skb = napi_alloc_skb(napi, 0);
                        if (!skb)
                                return NULL;

                        // We will never chain more than two SKBs: 2 * 16 * 2k > 64k
                        // which is why we do not need to chain by using skb->next
                        skb_shinfo(ctx->skb_tail)->frag_list = skb;

                        ctx->skb_tail = skb;
                        num_frags = 0;
                }
        }

        if (skb != ctx->skb_head) {
                ctx->skb_head->len += len;
                ctx->skb_head->data_len += len;
                ctx->skb_head->truesize += packet_buffer_size;
        }
        skb_add_rx_frag(skb, num_frags, page_info->page,
                        offset, len, packet_buffer_size);

        return ctx->skb_head;
}

static void gve_rx_flip_buff(struct gve_rx_slot_page_info *page_info, __be64 *slot_addr)
{
        const __be64 offset = cpu_to_be64(PAGE_SIZE / 2);

        /* "flip" to other packet buffer on this page */
        page_info->page_offset ^= PAGE_SIZE / 2;
        *(slot_addr) ^= offset;
}

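/* Returns 1 if the page is no longer referenced by the stack and can be
 * recycled, 0 if it is still in use, or -1 if the reference counting is
 * broken (page count dropped below the bias).
 */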
static int gve_rx_can_recycle_buffer(struct gve_rx_slot_page_info *page_info)
{
        int pagecount = page_count(page_info->page);

        /* This page is not being used by any SKBs - reuse */
        if (pagecount == page_info->pagecnt_bias)
                return 1;
        /* This page is still being used by an SKB - we can't reuse */
        else if (pagecount > page_info->pagecnt_bias)
                return 0;
        WARN(pagecount < page_info->pagecnt_bias,
             "Pagecount should never be less than the bias.");
        return -1;
}

static struct sk_buff *
gve_rx_raw_addressing(struct device *dev, struct net_device *netdev,
                      struct gve_rx_slot_page_info *page_info, u16 len,
                      struct napi_struct *napi,
                      union gve_rx_data_slot *data_slot,
                      u16 packet_buffer_size, struct gve_rx_ctx *ctx)
{
        struct sk_buff *skb = gve_rx_add_frags(napi, page_info, packet_buffer_size, len, ctx);

        if (!skb)
                return NULL;

        /* Optimistically stop the kernel from freeing the page.
         * We will check again in refill to determine if we need to alloc a
         * new page.
         */
        gve_dec_pagecnt_bias(page_info);

        return skb;
}

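/* QPL slow path: copy the received fragment into a page from the ring's copy
 * pool so the registered QPL buffer can be returned to the device. Falls back
 * to allocating a fresh page if the least recently used pool page is still
 * held by the stack.
 */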
static struct sk_buff *gve_rx_copy_to_pool(struct gve_rx_ring *rx,
                                           struct gve_rx_slot_page_info *page_info,
                                           u16 len, struct napi_struct *napi)
{
        u32 pool_idx = rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask;
        void *src = page_info->page_address + page_info->page_offset;
        struct gve_rx_slot_page_info *copy_page_info;
        struct gve_rx_ctx *ctx = &rx->ctx;
        bool alloc_page = false;
        struct sk_buff *skb;
        void *dst;

        copy_page_info = &rx->qpl_copy_pool[pool_idx];
        if (!copy_page_info->can_flip) {
                int recycle = gve_rx_can_recycle_buffer(copy_page_info);

                if (unlikely(recycle < 0)) {
                        gve_schedule_reset(rx->gve);
                        return NULL;
                }
                alloc_page = !recycle;
        }

        if (alloc_page) {
                struct gve_rx_slot_page_info alloc_page_info;
                struct page *page;

                /* The least recently used page turned out to be
                 * still in use by the kernel. Ignoring it and moving
                 * on alleviates head-of-line blocking.
                 */
                rx->qpl_copy_pool_head++;

                page = alloc_page(GFP_ATOMIC);
                if (!page)
                        return NULL;

                alloc_page_info.page = page;
                alloc_page_info.page_offset = 0;
                alloc_page_info.page_address = page_address(page);
                alloc_page_info.pad = page_info->pad;

                memcpy(alloc_page_info.page_address, src, page_info->pad + len);
                skb = gve_rx_add_frags(napi, &alloc_page_info,
                                       rx->packet_buffer_size,
                                       len, ctx);

                u64_stats_update_begin(&rx->statss);
                rx->rx_frag_copy_cnt++;
                rx->rx_frag_alloc_cnt++;
                u64_stats_update_end(&rx->statss);

                return skb;
        }

        dst = copy_page_info->page_address + copy_page_info->page_offset;
        memcpy(dst, src, page_info->pad + len);
        copy_page_info->pad = page_info->pad;

        skb = gve_rx_add_frags(napi, copy_page_info,
                               rx->packet_buffer_size, len, ctx);
        if (unlikely(!skb))
                return NULL;

        gve_dec_pagecnt_bias(copy_page_info);
        copy_page_info->page_offset += rx->packet_buffer_size;
        copy_page_info->page_offset &= (PAGE_SIZE - 1);

        if (copy_page_info->can_flip) {
                /* We have used both halves of this copy page, it
                 * is time for it to go to the back of the queue.
                 */
                copy_page_info->can_flip = false;
                rx->qpl_copy_pool_head++;
                prefetch(rx->qpl_copy_pool[rx->qpl_copy_pool_head & rx->qpl_copy_pool_mask].page);
        } else {
                copy_page_info->can_flip = true;
        }

        u64_stats_update_begin(&rx->statss);
        rx->rx_frag_copy_cnt++;
        u64_stats_update_end(&rx->statss);

        return skb;
}

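/* QPL receive path: either attach the registered buffer directly and flip to
 * the other half of the page, or copy the data out through the copy pool when
 * the buffer cannot be recycled.
 */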
static struct sk_buff *
gve_rx_qpl(struct device *dev, struct net_device *netdev,
           struct gve_rx_ring *rx, struct gve_rx_slot_page_info *page_info,
           u16 len, struct napi_struct *napi,
           union gve_rx_data_slot *data_slot)
{
        struct gve_rx_ctx *ctx = &rx->ctx;
        struct sk_buff *skb;

        /* If raw_addressing mode is not enabled, gvnic can only receive into
         * registered segments. If the buffer can't be recycled, our only
         * choice is to copy the data out of it so that we can return it to the
         * device.
         */
        if (page_info->can_flip) {
                skb = gve_rx_add_frags(napi, page_info, rx->packet_buffer_size, len, ctx);
                /* No point in recycling if we didn't get the skb */
                if (skb) {
                        /* Make sure that the page isn't freed. */
                        gve_dec_pagecnt_bias(page_info);
                        gve_rx_flip_buff(page_info, &data_slot->qpl_offset);
                }
        } else {
                skb = gve_rx_copy_to_pool(rx, page_info, len, napi);
        }
        return skb;
}

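/* Build an skb for a received fragment: small single-fragment packets are
 * copied outright (rx_copybreak), otherwise the buffer is attached via the
 * raw-addressing or QPL path depending on the queue format.
 */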
static struct sk_buff *gve_rx_skb(struct gve_priv *priv, struct gve_rx_ring *rx,
                                  struct gve_rx_slot_page_info *page_info, struct napi_struct *napi,
                                  u16 len, union gve_rx_data_slot *data_slot,
                                  bool is_only_frag)
{
        struct net_device *netdev = priv->dev;
        struct gve_rx_ctx *ctx = &rx->ctx;
        struct sk_buff *skb = NULL;

        if (len <= priv->rx_copybreak && is_only_frag) {
                /* Just copy small packets */
                skb = gve_rx_copy(netdev, napi, page_info, len);
                if (skb) {
                        u64_stats_update_begin(&rx->statss);
                        rx->rx_copied_pkt++;
                        rx->rx_frag_copy_cnt++;
                        rx->rx_copybreak_pkt++;
                        u64_stats_update_end(&rx->statss);
                }
        } else {
                int recycle = gve_rx_can_recycle_buffer(page_info);

                if (unlikely(recycle < 0)) {
                        gve_schedule_reset(priv);
                        return NULL;
                }
                page_info->can_flip = recycle;
                if (page_info->can_flip) {
                        u64_stats_update_begin(&rx->statss);
                        rx->rx_frag_flip_cnt++;
                        u64_stats_update_end(&rx->statss);
                }

                if (rx->data.raw_addressing) {
                        skb = gve_rx_raw_addressing(&priv->pdev->dev, netdev,
                                                    page_info, len, napi,
                                                    data_slot,
                                                    rx->packet_buffer_size, ctx);
                } else {
                        skb = gve_rx_qpl(&priv->pdev->dev, netdev, rx,
                                         page_info, len, napi, data_slot);
                }
        }
        return skb;
}

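/* XDP_REDIRECT helper for AF_XDP: copy the packet into a buffer allocated
 * from the queue's XSK buffer pool before redirecting.
 */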
static int gve_xsk_pool_redirect(struct net_device *dev,
                                 struct gve_rx_ring *rx,
                                 void *data, int len,
                                 struct bpf_prog *xdp_prog)
{
        struct xdp_buff *xdp;
        int err;

        if (rx->xsk_pool->frame_len < len)
                return -E2BIG;
        xdp = xsk_buff_alloc(rx->xsk_pool);
        if (!xdp) {
                u64_stats_update_begin(&rx->statss);
                rx->xdp_alloc_fails++;
                u64_stats_update_end(&rx->statss);
                return -ENOMEM;
        }
        xdp->data_end = xdp->data + len;
        memcpy(xdp->data, data, len);
        err = xdp_do_redirect(dev, xdp, xdp_prog);
        if (err)
                xsk_buff_free(xdp);
        return err;
}

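/* Handle XDP_REDIRECT: since the RX buffer may be recycled, the frame is
 * copied into a freshly allocated page fragment (or an XSK buffer) before
 * being handed to xdp_do_redirect().
 */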
static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
                            struct xdp_buff *orig, struct bpf_prog *xdp_prog)
{
        int total_len, len = orig->data_end - orig->data;
        int headroom = XDP_PACKET_HEADROOM;
        struct xdp_buff new;
        void *frame;
        int err;

        if (rx->xsk_pool)
                return gve_xsk_pool_redirect(dev, rx, orig->data,
                                             len, xdp_prog);

        total_len = headroom + SKB_DATA_ALIGN(len) +
                SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
        if (!frame) {
                u64_stats_update_begin(&rx->statss);
                rx->xdp_alloc_fails++;
                u64_stats_update_end(&rx->statss);
                return -ENOMEM;
        }
        xdp_init_buff(&new, total_len, &rx->xdp_rxq);
        xdp_prepare_buff(&new, frame, headroom, len, false);
        memcpy(new.data, orig->data, len);

        err = xdp_do_redirect(dev, &new, xdp_prog);
        if (err)
                page_frag_free(frame);

        return err;
}

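/* Act on the verdict of the XDP program: transmit on the paired XDP TX queue,
 * redirect, or drop, and record the action in the per-ring statistics.
 */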
static void gve_xdp_done(struct gve_priv *priv, struct gve_rx_ring *rx,
                         struct xdp_buff *xdp, struct bpf_prog *xprog,
                         int xdp_act)
{
        struct gve_tx_ring *tx;
        int tx_qid;
        int err;

        switch (xdp_act) {
        case XDP_ABORTED:
        case XDP_DROP:
        default:
                break;
        case XDP_TX:
                tx_qid = gve_xdp_tx_queue_id(priv, rx->q_num);
                tx = &priv->tx[tx_qid];
                spin_lock(&tx->xdp_lock);
                err = gve_xdp_xmit_one(priv, tx, xdp->data,
                                       xdp->data_end - xdp->data, NULL);
                spin_unlock(&tx->xdp_lock);

                if (unlikely(err)) {
                        u64_stats_update_begin(&rx->statss);
                        rx->xdp_tx_errors++;
                        u64_stats_update_end(&rx->statss);
                }
                break;
        case XDP_REDIRECT:
                err = gve_xdp_redirect(priv->dev, rx, xdp, xprog);

                if (unlikely(err)) {
                        u64_stats_update_begin(&rx->statss);
                        rx->xdp_redirect_errors++;
                        u64_stats_update_end(&rx->statss);
                }
                break;
        }
        u64_stats_update_begin(&rx->statss);
        if ((u32)xdp_act < GVE_XDP_ACTIONS)
                rx->xdp_actions[xdp_act]++;
        u64_stats_update_end(&rx->statss);
}

#define GVE_PKTCONT_BIT_IS_SET(x) (GVE_RXF_PKT_CONT & (x))
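/* Process one RX descriptor/fragment: run XDP on single-fragment packets,
 * build or extend the skb for the current packet, and hand the completed
 * packet to GRO on the last fragment.
 */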
static void gve_rx(struct gve_rx_ring *rx, netdev_features_t feat,
                   struct gve_rx_desc *desc, u32 idx,
                   struct gve_rx_cnts *cnts)
{
        bool is_last_frag = !GVE_PKTCONT_BIT_IS_SET(desc->flags_seq);
        struct gve_rx_slot_page_info *page_info;
        u16 frag_size = be16_to_cpu(desc->len);
        struct gve_rx_ctx *ctx = &rx->ctx;
        union gve_rx_data_slot *data_slot;
        struct gve_priv *priv = rx->gve;
        struct sk_buff *skb = NULL;
        struct bpf_prog *xprog;
        struct xdp_buff xdp;
        dma_addr_t page_bus;
        void *va;

        u16 len = frag_size;
        struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;
        bool is_first_frag = ctx->frag_cnt == 0;

        bool is_only_frag = is_first_frag && is_last_frag;

        if (unlikely(ctx->drop_pkt))
                goto finish_frag;

        if (desc->flags_seq & GVE_RXF_ERR) {
                ctx->drop_pkt = true;
                cnts->desc_err_pkt_cnt++;
                napi_free_frags(napi);
                goto finish_frag;
        }

        if (unlikely(frag_size > rx->packet_buffer_size)) {
                netdev_warn(priv->dev, "Unexpected frag size %d, can't exceed %d, scheduling reset",
                            frag_size, rx->packet_buffer_size);
                ctx->drop_pkt = true;
                napi_free_frags(napi);
                gve_schedule_reset(rx->gve);
                goto finish_frag;
        }

        /* Prefetch two packet buffers ahead; we will need them soon. */
        page_info = &rx->data.page_info[(idx + 2) & rx->mask];
        va = page_info->page_address + page_info->page_offset;
        prefetch(page_info->page); /* Kernel page struct. */
        prefetch(va);              /* Packet header. */
        prefetch(va + 64);         /* Next cacheline too. */

        page_info = &rx->data.page_info[idx];
        data_slot = &rx->data.data_ring[idx];
        page_bus = (rx->data.raw_addressing) ?
                be64_to_cpu(data_slot->addr) - page_info->page_offset :
                rx->data.qpl->page_buses[idx];
        dma_sync_single_for_cpu(&priv->pdev->dev, page_bus,
                                PAGE_SIZE, DMA_FROM_DEVICE);
        page_info->pad = is_first_frag ? GVE_RX_PAD : 0;
        len -= page_info->pad;
        frag_size -= page_info->pad;

        xprog = READ_ONCE(priv->xdp_prog);
        if (xprog && is_only_frag) {
                void *old_data;
                int xdp_act;

                xdp_init_buff(&xdp, rx->packet_buffer_size, &rx->xdp_rxq);
                xdp_prepare_buff(&xdp, page_info->page_address +
                                 page_info->page_offset, GVE_RX_PAD,
                                 len, false);
                old_data = xdp.data;
                xdp_act = bpf_prog_run_xdp(xprog, &xdp);
                if (xdp_act != XDP_PASS) {
                        gve_xdp_done(priv, rx, &xdp, xprog, xdp_act);
                        ctx->total_size += frag_size;
                        goto finish_ok_pkt;
                }

                page_info->pad += xdp.data - old_data;
                len = xdp.data_end - xdp.data;

                u64_stats_update_begin(&rx->statss);
                rx->xdp_actions[XDP_PASS]++;
                u64_stats_update_end(&rx->statss);
        }

        skb = gve_rx_skb(priv, rx, page_info, napi, len,
                         data_slot, is_only_frag);
        if (!skb) {
                u64_stats_update_begin(&rx->statss);
                rx->rx_skb_alloc_fail++;
                u64_stats_update_end(&rx->statss);

                napi_free_frags(napi);
                ctx->drop_pkt = true;
                goto finish_frag;
        }
        ctx->total_size += frag_size;

        if (is_first_frag) {
                if (likely(feat & NETIF_F_RXCSUM)) {
                        /* NIC passes up the partial sum */
                        if (desc->csum)
                                skb->ip_summed = CHECKSUM_COMPLETE;
                        else
                                skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = csum_unfold(desc->csum);
                }

                /* parse flags & pass relevant info up */
                if (likely(feat & NETIF_F_RXHASH) &&
                    gve_needs_rss(desc->flags_seq))
                        skb_set_hash(skb, be32_to_cpu(desc->rss_hash),
                                     gve_rss_type(desc->flags_seq));
        }

        if (is_last_frag) {
                skb_record_rx_queue(skb, rx->q_num);
                if (skb_is_nonlinear(skb))
                        napi_gro_frags(napi);
                else
                        napi_gro_receive(napi, skb);
                goto finish_ok_pkt;
        }

        goto finish_frag;

finish_ok_pkt:
        cnts->ok_pkt_bytes += ctx->total_size;
        cnts->ok_pkt_cnt++;
finish_frag:
        ctx->frag_cnt++;
        if (is_last_frag) {
                cnts->total_pkt_cnt++;
                cnts->cont_pkt_cnt += (ctx->frag_cnt > 1);
                gve_rx_ctx_clear(ctx);
        }
}

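/* Returns true if the next descriptor in the ring has been written by the
 * device, i.e. there is RX work pending.
 */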
bool gve_rx_work_pending(struct gve_rx_ring *rx)
{
        struct gve_rx_desc *desc;
        __be16 flags_seq;
        u32 next_idx;

        next_idx = rx->cnt & rx->mask;
        desc = rx->desc.desc_ring + next_idx;

        flags_seq = desc->flags_seq;

        return (GVE_SEQNO(flags_seq) == rx->desc.seqno);
}

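/* Repost buffers to the device in raw addressing mode: flip to the free half
 * of a page when possible, otherwise reuse the page or allocate a new one.
 * Returns false if a page's refcount check failed.
 */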
static bool gve_rx_refill_buffers(struct gve_priv *priv, struct gve_rx_ring *rx)
{
        int refill_target = rx->mask + 1;
        u32 fill_cnt = rx->fill_cnt;

        while (fill_cnt - rx->cnt < refill_target) {
                struct gve_rx_slot_page_info *page_info;
                u32 idx = fill_cnt & rx->mask;

                page_info = &rx->data.page_info[idx];
                if (page_info->can_flip) {
                        /* The other half of the page is free because it was
                         * free when we processed the descriptor. Flip to it.
                         */
                        union gve_rx_data_slot *data_slot =
                                                &rx->data.data_ring[idx];

                        gve_rx_flip_buff(page_info, &data_slot->addr);
                        page_info->can_flip = 0;
                } else {
                        /* It is possible that the networking stack has already
                         * finished processing all outstanding packets in the buffer
                         * and it can be reused.
                         * Flipping is unnecessary here - if the networking stack still
                         * owns half the page it is impossible to tell which half. Either
                         * the whole page is free or it needs to be replaced.
                         */
                        int recycle = gve_rx_can_recycle_buffer(page_info);

                        if (recycle < 0) {
                                if (!rx->data.raw_addressing)
                                        gve_schedule_reset(priv);
                                return false;
                        }
                        if (!recycle) {
                                /* We can't reuse the buffer - alloc a new one */
                                union gve_rx_data_slot *data_slot =
                                                &rx->data.data_ring[idx];
                                struct device *dev = &priv->pdev->dev;
                                gve_rx_free_buffer(dev, page_info, data_slot);
                                page_info->page = NULL;
                                if (gve_rx_alloc_buffer(priv, dev, page_info,
                                                        data_slot)) {
                                        u64_stats_update_begin(&rx->statss);
                                        rx->rx_buf_alloc_fail++;
                                        u64_stats_update_end(&rx->statss);
                                        break;
                                }
                        }
                }
                fill_cnt++;
        }
        rx->fill_cnt = fill_cnt;
        return true;
}

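/* Poll work for one ring: process descriptors up to the NAPI budget, flush
 * pending XDP transmits and redirects, restock buffers and ring the doorbell.
 * Returns the number of packets completed, or the full budget when the queue
 * must be rescheduled to finish refilling buffers.
 */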
static int gve_clean_rx_done(struct gve_rx_ring *rx, int budget,
                             netdev_features_t feat)
{
        u64 xdp_redirects = rx->xdp_actions[XDP_REDIRECT];
        u64 xdp_txs = rx->xdp_actions[XDP_TX];
        struct gve_rx_ctx *ctx = &rx->ctx;
        struct gve_priv *priv = rx->gve;
        struct gve_rx_cnts cnts = {0};
        struct gve_rx_desc *next_desc;
        u32 idx = rx->cnt & rx->mask;
        u32 work_done = 0;

        struct gve_rx_desc *desc = &rx->desc.desc_ring[idx];

        // Exceed budget only if (and until) the in-flight packet is consumed.
        while ((GVE_SEQNO(desc->flags_seq) == rx->desc.seqno) &&
               (work_done < budget || ctx->frag_cnt)) {
                next_desc = &rx->desc.desc_ring[(idx + 1) & rx->mask];
                prefetch(next_desc);

                gve_rx(rx, feat, desc, idx, &cnts);

                rx->cnt++;
                idx = rx->cnt & rx->mask;
                desc = &rx->desc.desc_ring[idx];
                rx->desc.seqno = gve_next_seqno(rx->desc.seqno);
                work_done++;
        }

        // The device will only send whole packets.
        if (unlikely(ctx->frag_cnt)) {
                struct napi_struct *napi = &priv->ntfy_blocks[rx->ntfy_id].napi;

                napi_free_frags(napi);
                gve_rx_ctx_clear(&rx->ctx);
                netdev_warn(priv->dev, "Unexpected seq number %d with incomplete packet, expected %d, scheduling reset",
                            GVE_SEQNO(desc->flags_seq), rx->desc.seqno);
                gve_schedule_reset(rx->gve);
        }

        if (!work_done && rx->fill_cnt - rx->cnt > rx->db_threshold)
                return 0;

        if (work_done) {
                u64_stats_update_begin(&rx->statss);
                rx->rpackets += cnts.ok_pkt_cnt;
                rx->rbytes += cnts.ok_pkt_bytes;
                rx->rx_cont_packet_cnt += cnts.cont_pkt_cnt;
                rx->rx_desc_err_dropped_pkt += cnts.desc_err_pkt_cnt;
                u64_stats_update_end(&rx->statss);
        }

        if (xdp_txs != rx->xdp_actions[XDP_TX])
                gve_xdp_tx_flush(priv, rx->q_num);

        if (xdp_redirects != rx->xdp_actions[XDP_REDIRECT])
                xdp_do_flush();

        /* restock ring slots */
        if (!rx->data.raw_addressing) {
                /* In QPL mode buffers are refilled as the descriptors are processed */
                rx->fill_cnt += work_done;
        } else if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
                /* In raw addressing mode buffers are only refilled if the number
                 * of available buffers falls below a threshold.
                 */
                if (!gve_rx_refill_buffers(priv, rx))
                        return 0;

                /* If we were not able to completely refill buffers, we'll want
                 * to schedule this queue for work again to refill buffers.
                 */
                if (rx->fill_cnt - rx->cnt <= rx->db_threshold) {
                        gve_rx_write_doorbell(priv, rx);
                        return budget;
                }
        }

        gve_rx_write_doorbell(priv, rx);
        return cnts.total_pkt_cnt;
}

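/* NAPI poll entry point for a GQI RX queue; a zero budget means unbounded
 * cleanup. Returns the amount of work done.
 */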
int gve_rx_poll(struct gve_notify_block *block, int budget)
{
        struct gve_rx_ring *rx = block->rx;
        netdev_features_t feat;
        int work_done = 0;

        feat = block->napi.dev->features;

        /* If budget is 0, do all the work */
        if (budget == 0)
                budget = INT_MAX;

        if (budget > 0)
                work_done = gve_clean_rx_done(rx, budget, feat);

        return work_done;
}