xsk: i40e: ice: ixgbe: mlx5: Test for dma_need_sync earlier for better performance
author Magnus Karlsson <magnus.karlsson@intel.com>
Fri, 28 Aug 2020 08:26:24 +0000 (10:26 +0200)
committer Daniel Borkmann <daniel@iogearbox.net>
Mon, 31 Aug 2020 19:15:04 +0000 (21:15 +0200)
Test for dma_need_sync earlier to increase
performance. xsk_buff_dma_sync_for_cpu() takes an xdp_buff as
parameter and from that the xsk_buff_pool reference is dug out. Perf
shows that this dereference causes a lot of cache misses. But as the
buffer pool is now passed down to the driver at zero-copy initialization
time, we might as well use this pointer directly instead of going via
the xsk_buff, and we can do the test already in
xsk_buff_dma_sync_for_cpu() instead of in xp_dma_sync_for_cpu(). This
gets rid of these cache misses.

Throughput increases by 3% for the xdpsock l2fwd sample application
on my machine.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Björn Töpel <bjorn.topel@intel.com>
Link: https://lore.kernel.org/bpf/1598603189-32145-11-git-send-email-magnus.karlsson@intel.com
drivers/net/ethernet/intel/i40e/i40e_xsk.c
drivers/net/ethernet/intel/ice/ice_xsk.c
drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c
drivers/net/ethernet/mellanox/mlx5/core/en/xsk/rx.c
include/net/xdp_sock_drv.h
include/net/xsk_buff_pool.h

index 95b9a7e280fa48f5231db3d43ba64ab3790b1c67..2a1153d8957baa9705bb9af487ed66c3b0c408b1 100644 (file)
@@ -314,7 +314,7 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 
                bi = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
                (*bi)->data_end = (*bi)->data + size;
-               xsk_buff_dma_sync_for_cpu(*bi);
+               xsk_buff_dma_sync_for_cpu(*bi, rx_ring->xsk_pool);
 
                xdp_res = i40e_run_xdp_zc(rx_ring, *bi);
                if (xdp_res) {
index dffef377c5cb832794a3d7f96f13466d2b1a4a2d..797886524054cb3c5866636fbfb4cbbd9af8aece 100644 (file)
@@ -595,7 +595,7 @@ int ice_clean_rx_irq_zc(struct ice_ring *rx_ring, int budget)
 
                rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
                rx_buf->xdp->data_end = rx_buf->xdp->data + size;
-               xsk_buff_dma_sync_for_cpu(rx_buf->xdp);
+               xsk_buff_dma_sync_for_cpu(rx_buf->xdp, rx_ring->xsk_pool);
 
                xdp_res = ice_run_xdp_zc(rx_ring, rx_buf->xdp);
                if (xdp_res) {
index 6af34da45b8d19d7245426c00ecf3366a1e1d5ee..3771857cf887c4f833e3c987ffb55d28303c7ae5 100644 (file)
@@ -287,7 +287,7 @@ int ixgbe_clean_rx_irq_zc(struct ixgbe_q_vector *q_vector,
                }
 
                bi->xdp->data_end = bi->xdp->data + size;
-               xsk_buff_dma_sync_for_cpu(bi->xdp);
+               xsk_buff_dma_sync_for_cpu(bi->xdp, rx_ring->xsk_pool);
                xdp_res = ixgbe_run_xdp_zc(adapter, rx_ring, bi->xdp);
 
                if (xdp_res) {
index a33a1f762c70db7d2722984b159e436410fa3e45..902ce77d01781d34bf80da216aa785b5938ddec4 100644 (file)
@@ -48,7 +48,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq,
 
        xdp->data_end = xdp->data + cqe_bcnt32;
        xdp_set_data_meta_invalid(xdp);
-       xsk_buff_dma_sync_for_cpu(xdp);
+       xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
        prefetch(xdp->data);
 
        rcu_read_lock();
@@ -99,7 +99,7 @@ struct sk_buff *mlx5e_xsk_skb_from_cqe_linear(struct mlx5e_rq *rq,
 
        xdp->data_end = xdp->data + cqe_bcnt;
        xdp_set_data_meta_invalid(xdp);
-       xsk_buff_dma_sync_for_cpu(xdp);
+       xsk_buff_dma_sync_for_cpu(xdp, rq->xsk_pool);
        prefetch(xdp->data);
 
        if (unlikely(get_cqe_opcode(cqe) != MLX5_CQE_RESP_SEND)) {
index a7c7d2eff860c08db6ce629cbe85d7967179cbf2..5b1ee8a9976d2763ccfec32124efea0bad7426f6 100644 (file)
@@ -99,10 +99,13 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
        return xp_raw_get_data(pool, addr);
 }
 
-static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
 {
        struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp);
 
+       if (!pool->dma_need_sync)
+               return;
+
        xp_dma_sync_for_cpu(xskb);
 }
 
@@ -222,7 +225,7 @@ static inline void *xsk_buff_raw_get_data(struct xsk_buff_pool *pool, u64 addr)
        return NULL;
 }
 
-static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp)
+static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp, struct xsk_buff_pool *pool)
 {
 }
 
index 38d03a64c9eae1d72214d21528553234c989b53a..907537dddcac67b72815c39a0e35282d86e14ee2 100644 (file)
@@ -114,9 +114,6 @@ static inline dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb)
 void xp_dma_sync_for_cpu_slow(struct xdp_buff_xsk *xskb);
 static inline void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb)
 {
-       if (!xskb->pool->dma_need_sync)
-               return;
-
        xp_dma_sync_for_cpu_slow(xskb);
 }