bnx2x: Add TPA, Broadcoms HW LRO
authorVladislav Zolotarov <vladz@broadcom.com>
Tue, 24 Jun 2008 03:34:36 +0000 (20:34 -0700)
committerDavid S. Miller <davem@davemloft.net>
Tue, 24 Jun 2008 03:34:36 +0000 (20:34 -0700)
The TPA stands for Transparent Packet Aggregation. When enabled, the FW
aggregate in-order TCP packets according to the 4-tuple match and sends
1 big packet to the driver. This packet is stored on an SGL in which
each SGE is 1 page. The FW also implements a timeout algorithm and it
honors all TCP flag, including the push flag as a trigger to halt
aggregation.

After receiving Ben Hutchings comments, we also added ethtool support,
so now, thanks to Ben's patch, when forwarding is enabled, our
aggregation is turned off using the LRO flags.

Signed-off-by: Vladislav Zolotarov <vladz@broadcom.com>
Signed-off-by: Eilon Greenstein <eilong@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/bnx2x.h
drivers/net/bnx2x_main.c

index f7d73d6..4bf4f7b 100644 (file)
 #define is_multi(bp)           (bp->num_queues > 1)
 
 
+/* fast path */
 
-#define bnx2x_sp_check(bp, var) ((bp->slowpath) ? (&bp->slowpath->var) : NULL)
 struct sw_rx_bd {
        struct sk_buff  *skb;
        DECLARE_PCI_UNMAP_ADDR(mapping)
@@ -144,6 +144,52 @@ struct sw_tx_bd {
        u16             first_bd;
 };
 
+struct sw_rx_page {
+       struct page     *page;
+       DECLARE_PCI_UNMAP_ADDR(mapping)
+};
+
+
+/* MC hsi */
+#define BCM_PAGE_SHIFT                 12
+#define BCM_PAGE_SIZE                  (1 << BCM_PAGE_SHIFT)
+#define BCM_PAGE_MASK                  (~(BCM_PAGE_SIZE - 1))
+#define BCM_PAGE_ALIGN(addr)   (((addr) + BCM_PAGE_SIZE - 1) & BCM_PAGE_MASK)
+
+#define PAGES_PER_SGE_SHIFT            0
+#define PAGES_PER_SGE                  (1 << PAGES_PER_SGE_SHIFT)
+
+/* SGE ring related macros */
+#define NUM_RX_SGE_PAGES               2
+#define RX_SGE_CNT             (BCM_PAGE_SIZE / sizeof(struct eth_rx_sge))
+#define MAX_RX_SGE_CNT                 (RX_SGE_CNT - 2)
+/* RX_SGE_CNT is promissed to be a power of 2 */
+#define RX_SGE_MASK                    (RX_SGE_CNT - 1)
+#define NUM_RX_SGE                     (RX_SGE_CNT * NUM_RX_SGE_PAGES)
+#define MAX_RX_SGE                     (NUM_RX_SGE - 1)
+#define NEXT_SGE_IDX(x)                ((((x) & RX_SGE_MASK) == \
+                                 (MAX_RX_SGE_CNT - 1)) ? (x) + 3 : (x) + 1)
+#define RX_SGE(x)                      ((x) & MAX_RX_SGE)
+
+/* SGE producer mask related macros */
+/* Number of bits in one sge_mask array element */
+#define RX_SGE_MASK_ELEM_SZ            64
+#define RX_SGE_MASK_ELEM_SHIFT         6
+#define RX_SGE_MASK_ELEM_MASK          ((u64)RX_SGE_MASK_ELEM_SZ - 1)
+
+/* Creates a bitmask of all ones in less significant bits.
+   idx - index of the most significant bit in the created mask */
+#define RX_SGE_ONES_MASK(idx) \
+               (((u64)0x1 << (((idx) & RX_SGE_MASK_ELEM_MASK) + 1)) - 1)
+#define RX_SGE_MASK_ELEM_ONE_MASK      ((u64)(~0))
+
+/* Number of u64 elements in SGE mask array */
+#define RX_SGE_MASK_LEN                        ((NUM_RX_SGE_PAGES * RX_SGE_CNT) / \
+                                        RX_SGE_MASK_ELEM_SZ)
+#define RX_SGE_MASK_LEN_MASK           (RX_SGE_MASK_LEN - 1)
+#define NEXT_SGE_MASK_ELEM(el)         (((el) + 1) & RX_SGE_MASK_LEN_MASK)
+
+
 struct bnx2x_fastpath {
 
        struct napi_struct      napi;
@@ -159,7 +205,8 @@ struct bnx2x_fastpath {
        struct eth_tx_bd        *tx_desc_ring;
        dma_addr_t              tx_desc_mapping;
 
-       struct sw_rx_bd         *rx_buf_ring;
+       struct sw_rx_bd         *rx_buf_ring;   /* BDs mappings ring */
+       struct sw_rx_page       *rx_page_ring;  /* SGE pages mappings ring */
 
        struct eth_rx_bd        *rx_desc_ring;
        dma_addr_t              rx_desc_mapping;
@@ -167,6 +214,12 @@ struct bnx2x_fastpath {
        union eth_rx_cqe        *rx_comp_ring;
        dma_addr_t              rx_comp_mapping;
 
+       /* SGE ring */
+       struct eth_rx_sge       *rx_sge_ring;
+       dma_addr_t              rx_sge_mapping;
+
+       u64                     sge_mask[RX_SGE_MASK_LEN];
+
        int                     state;
 #define BNX2X_FP_STATE_CLOSED          0
 #define BNX2X_FP_STATE_IRQ             0x80000
@@ -197,27 +250,152 @@ struct bnx2x_fastpath {
        u16                     rx_bd_cons;
        u16                     rx_comp_prod;
        u16                     rx_comp_cons;
+       u16                     rx_sge_prod;
+       /* The last maximal completed SGE */
+       u16                     last_max_sge;
        u16                     *rx_cons_sb;
+       u16                     *rx_bd_cons_sb;
 
        unsigned long           tx_pkt,
                                rx_pkt,
-                               rx_calls;
+                               rx_calls,
+                               rx_alloc_failed;
+       /* TPA related */
+       struct sw_rx_bd         tpa_pool[ETH_MAX_AGGREGATION_QUEUES_E1H];
+       u8                      tpa_state[ETH_MAX_AGGREGATION_QUEUES_E1H];
+#define BNX2X_TPA_START                        1
+#define BNX2X_TPA_STOP                 2
+       u8                      disable_tpa;
+#ifdef BNX2X_STOP_ON_ERROR
+       u64                     tpa_queue_used;
+#endif
 
        struct bnx2x            *bp; /* parent */
 };
 
 #define bnx2x_fp(bp, nr, var)          (bp->fp[nr].var)
+
+
+/* MC hsi */
+#define MAX_FETCH_BD                   13      /* HW max BDs per packet */
+#define RX_COPY_THRESH                 92
+
+#define NUM_TX_RINGS                   16
+#define TX_DESC_CNT            (BCM_PAGE_SIZE / sizeof(struct eth_tx_bd))
+#define MAX_TX_DESC_CNT                        (TX_DESC_CNT - 1)
+#define NUM_TX_BD                      (TX_DESC_CNT * NUM_TX_RINGS)
+#define MAX_TX_BD                      (NUM_TX_BD - 1)
+#define MAX_TX_AVAIL                   (MAX_TX_DESC_CNT * NUM_TX_RINGS - 2)
+#define NEXT_TX_IDX(x)         ((((x) & MAX_TX_DESC_CNT) == \
+                                 (MAX_TX_DESC_CNT - 1)) ? (x) + 2 : (x) + 1)
+#define TX_BD(x)                       ((x) & MAX_TX_BD)
+#define TX_BD_POFF(x)                  ((x) & MAX_TX_DESC_CNT)
+
+/* The RX BD ring is special, each bd is 8 bytes but the last one is 16 */
+#define NUM_RX_RINGS                   8
+#define RX_DESC_CNT            (BCM_PAGE_SIZE / sizeof(struct eth_rx_bd))
+#define MAX_RX_DESC_CNT                        (RX_DESC_CNT - 2)
+#define RX_DESC_MASK                   (RX_DESC_CNT - 1)
+#define NUM_RX_BD                      (RX_DESC_CNT * NUM_RX_RINGS)
+#define MAX_RX_BD                      (NUM_RX_BD - 1)
+#define MAX_RX_AVAIL                   (MAX_RX_DESC_CNT * NUM_RX_RINGS - 2)
+#define NEXT_RX_IDX(x)         ((((x) & RX_DESC_MASK) == \
+                                 (MAX_RX_DESC_CNT - 1)) ? (x) + 3 : (x) + 1)
+#define RX_BD(x)                       ((x) & MAX_RX_BD)
+
+/* As long as CQE is 4 times bigger than BD entry we have to allocate
+   4 times more pages for CQ ring in order to keep it balanced with
+   BD ring */
+#define NUM_RCQ_RINGS                  (NUM_RX_RINGS * 4)
+#define RCQ_DESC_CNT           (BCM_PAGE_SIZE / sizeof(union eth_rx_cqe))
+#define MAX_RCQ_DESC_CNT               (RCQ_DESC_CNT - 1)
+#define NUM_RCQ_BD                     (RCQ_DESC_CNT * NUM_RCQ_RINGS)
+#define MAX_RCQ_BD                     (NUM_RCQ_BD - 1)
+#define MAX_RCQ_AVAIL                  (MAX_RCQ_DESC_CNT * NUM_RCQ_RINGS - 2)
+#define NEXT_RCQ_IDX(x)                ((((x) & MAX_RCQ_DESC_CNT) == \
+                                 (MAX_RCQ_DESC_CNT - 1)) ? (x) + 2 : (x) + 1)
+#define RCQ_BD(x)                      ((x) & MAX_RCQ_BD)
+
+
 /* This is needed for determening of last_max */
 #define SUB_S16(a, b)                  (s16)((s16)(a) - (s16)(b))
 
+#define __SGE_MASK_SET_BIT(el, bit) \
+       do { \
+               el = ((el) | ((u64)0x1 << (bit))); \
+       } while (0)
+
+#define __SGE_MASK_CLEAR_BIT(el, bit) \
+       do { \
+               el = ((el) & (~((u64)0x1 << (bit)))); \
+       } while (0)
+
+#define SGE_MASK_SET_BIT(fp, idx) \
+       __SGE_MASK_SET_BIT(fp->sge_mask[(idx) >> RX_SGE_MASK_ELEM_SHIFT], \
+                          ((idx) & RX_SGE_MASK_ELEM_MASK))
+
+#define SGE_MASK_CLEAR_BIT(fp, idx) \
+       __SGE_MASK_CLEAR_BIT(fp->sge_mask[(idx) >> RX_SGE_MASK_ELEM_SHIFT], \
+                            ((idx) & RX_SGE_MASK_ELEM_MASK))
+
+
+/* used on a CID received from the HW */
+#define SW_CID(x)                      (le32_to_cpu(x) & \
+                                        (COMMON_RAMROD_ETH_RX_CQE_CID >> 7))
+#define CQE_CMD(x)                     (le32_to_cpu(x) >> \
+                                       COMMON_RAMROD_ETH_RX_CQE_CMD_ID_SHIFT)
+
 #define BD_UNMAP_ADDR(bd)              HILO_U64(le32_to_cpu((bd)->addr_hi), \
                                                 le32_to_cpu((bd)->addr_lo))
 #define BD_UNMAP_LEN(bd)               (le16_to_cpu((bd)->nbytes))
 
+
+#define DPM_TRIGER_TYPE                        0x40
+#define DOORBELL(bp, cid, val) \
+       do { \
+               writel((u32)val, (bp)->doorbells + (BCM_PAGE_SIZE * cid) + \
+                      DPM_TRIGER_TYPE); \
+       } while (0)
+
+
+/* TX CSUM helpers */
+#define SKB_CS_OFF(skb)                (offsetof(struct tcphdr, check) - \
+                                skb->csum_offset)
+#define SKB_CS(skb)            (*(u16 *)(skb_transport_header(skb) + \
+                                         skb->csum_offset))
+
+#define pbd_tcp_flags(skb)     (ntohl(tcp_flag_word(tcp_hdr(skb)))>>16 & 0xff)
+
+#define XMIT_PLAIN                     0
+#define XMIT_CSUM_V4                   0x1
+#define XMIT_CSUM_V6                   0x2
+#define XMIT_CSUM_TCP                  0x4
+#define XMIT_GSO_V4                    0x8
+#define XMIT_GSO_V6                    0x10
+
+#define XMIT_CSUM                      (XMIT_CSUM_V4 | XMIT_CSUM_V6)
+#define XMIT_GSO                       (XMIT_GSO_V4 | XMIT_GSO_V6)
+
+
 /* stuff added to make the code fit 80Col */
 
 #define CQE_TYPE(cqe_fp_flags) ((cqe_fp_flags) & ETH_FAST_PATH_RX_CQE_TYPE)
 
+#define TPA_TYPE_START                 ETH_FAST_PATH_RX_CQE_START_FLG
+#define TPA_TYPE_END                   ETH_FAST_PATH_RX_CQE_END_FLG
+#define TPA_TYPE(cqe_fp_flags)         ((cqe_fp_flags) & \
+                                        (TPA_TYPE_START | TPA_TYPE_END))
+
+#define BNX2X_RX_SUM_OK(cqe) \
+                       (!(cqe->fast_path_cqe.status_flags & \
+                        (ETH_FAST_PATH_RX_CQE_IP_XSUM_NO_VALIDATION_FLG | \
+                         ETH_FAST_PATH_RX_CQE_L4_XSUM_NO_VALIDATION_FLG)))
+
+#define BNX2X_RX_SUM_FIX(cqe) \
+                       ((le16_to_cpu(cqe->fast_path_cqe.pars_flags.flags) & \
+                         PARSING_FLAGS_OVER_ETHERNET_PROTOCOL) == \
+                        (1 << PARSING_FLAGS_OVER_ETHERNET_PROTOCOL_SHIFT))
+
 #define ETH_RX_ERROR_FALGS     (ETH_FAST_PATH_RX_CQE_PHY_DECODE_ERR_FLG | \
                                 ETH_FAST_PATH_RX_CQE_IP_BAD_XSUM_FLG | \
                                 ETH_FAST_PATH_RX_CQE_L4_BAD_XSUM_FLG)
@@ -247,6 +425,9 @@ struct bnx2x_fastpath {
 #define BNX2X_TX_SB_INDEX \
        (&fp->status_blk->c_status_block.index_values[C_SB_ETH_TX_CQ_INDEX])
 
+
+/* end of fast path */
+
 /* common */
 
 struct bnx2x_common {
@@ -546,7 +727,7 @@ struct bnx2x {
        struct pci_dev          *pdev;
 
        atomic_t                intr_sem;
-       struct msix_entry       msix_table[MAX_CONTEXT+1];
+       struct msix_entry       msix_table[MAX_CONTEXT+1];
 
        int                     tx_ring_size;
 
@@ -604,6 +785,7 @@ struct bnx2x {
 #define USING_DAC_FLAG                 0x10
 #define USING_MSIX_FLAG                        0x20
 #define ASF_ENABLE_FLAG                        0x40
+#define TPA_ENABLE_FLAG                        0x80
 #define NO_MCP_FLAG                    0x100
 #define BP_NOMCP(bp)                   (bp->flags & NO_MCP_FLAG)
 
@@ -725,76 +907,6 @@ void bnx2x_write_dmae(struct bnx2x *bp, dma_addr_t dma_addr, u32 dst_addr,
                      u32 len32);
 int bnx2x_set_gpio(struct bnx2x *bp, int gpio_num, u32 mode);
 
-
-/* MC hsi */
-#define RX_COPY_THRESH                 92
-#define BCM_PAGE_SHIFT                 12
-#define BCM_PAGE_SIZE                  (1 << BCM_PAGE_SHIFT)
-#define BCM_PAGE_MASK                  (~(BCM_PAGE_SIZE - 1))
-#define BCM_PAGE_ALIGN(addr)   (((addr) + BCM_PAGE_SIZE - 1) & BCM_PAGE_MASK)
-
-#define NUM_TX_RINGS                   16
-#define TX_DESC_CNT            (BCM_PAGE_SIZE / sizeof(struct eth_tx_bd))
-#define MAX_TX_DESC_CNT                (TX_DESC_CNT - 1)
-#define NUM_TX_BD                      (TX_DESC_CNT * NUM_TX_RINGS)
-#define MAX_TX_BD                      (NUM_TX_BD - 1)
-#define MAX_TX_AVAIL                   (MAX_TX_DESC_CNT * NUM_TX_RINGS - 2)
-#define NEXT_TX_IDX(x)         ((((x) & MAX_TX_DESC_CNT) == \
-                                (MAX_TX_DESC_CNT - 1)) ? (x) + 2 : (x) + 1)
-#define TX_BD(x)                       ((x) & MAX_TX_BD)
-#define TX_BD_POFF(x)                  ((x) & MAX_TX_DESC_CNT)
-
-/* The RX BD ring is special, each bd is 8 bytes but the last one is 16 */
-#define NUM_RX_RINGS                   8
-#define RX_DESC_CNT            (BCM_PAGE_SIZE / sizeof(struct eth_rx_bd))
-#define MAX_RX_DESC_CNT                (RX_DESC_CNT - 2)
-#define RX_DESC_MASK                   (RX_DESC_CNT - 1)
-#define NUM_RX_BD                      (RX_DESC_CNT * NUM_RX_RINGS)
-#define MAX_RX_BD                      (NUM_RX_BD - 1)
-#define MAX_RX_AVAIL                   (MAX_RX_DESC_CNT * NUM_RX_RINGS - 2)
-#define NEXT_RX_IDX(x)         ((((x) & RX_DESC_MASK) == \
-                                (MAX_RX_DESC_CNT - 1)) ? (x) + 3 : (x) + 1)
-#define RX_BD(x)                       ((x) & MAX_RX_BD)
-
-#define NUM_RCQ_RINGS                  (NUM_RX_RINGS * 2)
-#define RCQ_DESC_CNT           (BCM_PAGE_SIZE / sizeof(union eth_rx_cqe))
-#define MAX_RCQ_DESC_CNT               (RCQ_DESC_CNT - 1)
-#define NUM_RCQ_BD                     (RCQ_DESC_CNT * NUM_RCQ_RINGS)
-#define MAX_RCQ_BD                     (NUM_RCQ_BD - 1)
-#define MAX_RCQ_AVAIL                  (MAX_RCQ_DESC_CNT * NUM_RCQ_RINGS - 2)
-#define NEXT_RCQ_IDX(x)        ((((x) & MAX_RCQ_DESC_CNT) == \
-                                (MAX_RCQ_DESC_CNT - 1)) ? (x) + 2 : (x) + 1)
-#define RCQ_BD(x)                      ((x) & MAX_RCQ_BD)
-
-
-/* used on a CID received from the HW */
-#define SW_CID(x)                      (le32_to_cpu(x) & \
-                                        (COMMON_RAMROD_ETH_RX_CQE_CID >> 1))
-#define CQE_CMD(x)                     (le32_to_cpu(x) >> \
-                                       COMMON_RAMROD_ETH_RX_CQE_CMD_ID_SHIFT)
-
-#define STROM_ASSERT_ARRAY_SIZE        50
-
-
-
-/* must be used on a CID before placing it on a HW ring */
-#define HW_CID(bp, x)          ((BP_PORT(bp) << 23) | (BP_E1HVN(bp) << 17) | x)
-
-#define SP_DESC_CNT            (BCM_PAGE_SIZE / sizeof(struct eth_spe))
-#define MAX_SP_DESC_CNT                (SP_DESC_CNT - 1)
-
-
-#define BNX2X_BTR                      3
-#define MAX_SPQ_PENDING                8
-
-
-#define DPM_TRIGER_TYPE                0x40
-#define DOORBELL(bp, cid, val) \
-       do { \
-               writel((u32)val, (bp)->doorbells + (BCM_PAGE_SIZE * cid) + \
-                      DPM_TRIGER_TYPE); \
-       } while (0)
-
 static inline u32 reg_poll(struct bnx2x *bp, u32 reg, u32 expected, int ms,
                           int wait)
 {
@@ -874,14 +986,20 @@ static inline u32 reg_poll(struct bnx2x *bp, u32 reg, u32 expected, int ms,
 #define BNX2X_LOOPBACK_FAILED          (BNX2X_MAC_LOOPBACK_FAILED | \
                                         BNX2X_PHY_LOOPBACK_FAILED)
 
-#define pbd_tcp_flags(skb)     (ntohl(tcp_flag_word(tcp_hdr(skb)))>>16 & 0xff)
+
+#define STROM_ASSERT_ARRAY_SIZE                50
+
 
 /* must be used on a CID before placing it on a HW ring */
+#define HW_CID(bp, x)          ((BP_PORT(bp) << 23) | (BP_E1HVN(bp) << 17) | x)
+
+#define SP_DESC_CNT            (BCM_PAGE_SIZE / sizeof(struct eth_spe))
+#define MAX_SP_DESC_CNT                        (SP_DESC_CNT - 1)
+
+
+#define BNX2X_BTR                      3
+#define MAX_SPQ_PENDING                        8
 
-#define BNX2X_RX_SUM_OK(cqe) \
-                       (!(cqe->fast_path_cqe.status_flags & \
-                        (ETH_FAST_PATH_RX_CQE_IP_XSUM_NO_VALIDATION_FLG | \
-                         ETH_FAST_PATH_RX_CQE_L4_XSUM_NO_VALIDATION_FLG)))
 
 /* CMNG constants
    derived from lab experiments, and not from system spec calculations !!! */
index ccfe33c..fabde55 100644 (file)
@@ -79,6 +79,7 @@ MODULE_VERSION(DRV_MODULE_VERSION);
 static int use_inta;
 static int poll;
 static int debug;
+static int disable_tpa;
 static int nomcp;
 static int load_count[3]; /* 0-common, 1-port0, 2-port1 */
 static int use_multi;
@@ -86,6 +87,7 @@ static int use_multi;
 module_param(use_inta, int, 0);
 module_param(poll, int, 0);
 module_param(debug, int, 0);
+module_param(disable_tpa, int, 0);
 module_param(nomcp, int, 0);
 MODULE_PARM_DESC(use_inta, "use INT#A instead of MSI-X");
 MODULE_PARM_DESC(poll, "use polling (for debug)");
@@ -512,13 +514,16 @@ static void bnx2x_panic_dump(struct bnx2x *bp)
                          i, fp->tx_pkt_prod, fp->tx_pkt_cons, fp->tx_bd_prod,
                          fp->tx_bd_cons, le16_to_cpu(*fp->tx_cons_sb));
                BNX2X_ERR("          rx_comp_prod(%x)  rx_comp_cons(%x)"
-                         "  *rx_cons_sb(%x)\n",
+                         "  *rx_cons_sb(%x)  *rx_bd_cons_sb(%x)"
+                         "  rx_sge_prod(%x)  last_max_sge(%x)\n",
                          fp->rx_comp_prod, fp->rx_comp_cons,
-                         le16_to_cpu(*fp->rx_cons_sb));
+                         le16_to_cpu(*fp->rx_cons_sb),
+                         le16_to_cpu(*fp->rx_bd_cons_sb),
+                         fp->rx_sge_prod, fp->last_max_sge);
                BNX2X_ERR("          fp_c_idx(%x)  fp_u_idx(%x)"
-                         "  bd data(%x,%x)\n",
+                         "  bd data(%x,%x)  rx_alloc_failed(%lx)\n",
                          fp->fp_c_idx, fp->fp_u_idx, hw_prods->packets_prod,
-                         hw_prods->bds_prod);
+                         hw_prods->bds_prod, fp->rx_alloc_failed);
 
                start = TX_BD(le16_to_cpu(*fp->tx_cons_sb) - 10);
                end = TX_BD(le16_to_cpu(*fp->tx_cons_sb) + 245);
@@ -548,6 +553,16 @@ static void bnx2x_panic_dump(struct bnx2x *bp)
                                  j, rx_bd[1], rx_bd[0], sw_bd->skb);
                }
 
+               start = 0;
+               end = RX_SGE_CNT*NUM_RX_SGE_PAGES;
+               for (j = start; j < end; j++) {
+                       u32 *rx_sge = (u32 *)&fp->rx_sge_ring[j];
+                       struct sw_rx_page *sw_page = &fp->rx_page_ring[j];
+
+                       BNX2X_ERR("rx_sge[%x]=[%x:%x]  sw_page=[%p]\n",
+                                 j, rx_sge[1], rx_sge[0], sw_page->page);
+               }
+
                start = RCQ_BD(fp->rx_comp_cons - 10);
                end = RCQ_BD(fp->rx_comp_cons + 503);
                for (j = start; j < end; j++) {
@@ -963,6 +978,62 @@ static void bnx2x_sp_event(struct bnx2x_fastpath *fp,
        mb(); /* force bnx2x_wait_ramrod() to see the change */
 }
 
+static inline void bnx2x_free_rx_sge(struct bnx2x *bp,
+                                    struct bnx2x_fastpath *fp, u16 index)
+{
+       struct sw_rx_page *sw_buf = &fp->rx_page_ring[index];
+       struct page *page = sw_buf->page;
+       struct eth_rx_sge *sge = &fp->rx_sge_ring[index];
+
+       /* Skip "next page" elements */
+       if (!page)
+               return;
+
+       pci_unmap_page(bp->pdev, pci_unmap_addr(sw_buf, mapping),
+                      BCM_PAGE_SIZE*PAGES_PER_SGE, PCI_DMA_FROMDEVICE);
+       __free_pages(page, PAGES_PER_SGE_SHIFT);
+
+       sw_buf->page = NULL;
+       sge->addr_hi = 0;
+       sge->addr_lo = 0;
+}
+
+static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
+                                          struct bnx2x_fastpath *fp, int last)
+{
+       int i;
+
+       for (i = 0; i < last; i++)
+               bnx2x_free_rx_sge(bp, fp, i);
+}
+
+static inline int bnx2x_alloc_rx_sge(struct bnx2x *bp,
+                                    struct bnx2x_fastpath *fp, u16 index)
+{
+       struct page *page = alloc_pages(GFP_ATOMIC, PAGES_PER_SGE_SHIFT);
+       struct sw_rx_page *sw_buf = &fp->rx_page_ring[index];
+       struct eth_rx_sge *sge = &fp->rx_sge_ring[index];
+       dma_addr_t mapping;
+
+       if (unlikely(page == NULL))
+               return -ENOMEM;
+
+       mapping = pci_map_page(bp->pdev, page, 0, BCM_PAGE_SIZE*PAGES_PER_SGE,
+                              PCI_DMA_FROMDEVICE);
+       if (unlikely(dma_mapping_error(mapping))) {
+               __free_pages(page, PAGES_PER_SGE_SHIFT);
+               return -ENOMEM;
+       }
+
+       sw_buf->page = page;
+       pci_unmap_addr_set(sw_buf, mapping, mapping);
+
+       sge->addr_hi = cpu_to_le32(U64_HI(mapping));
+       sge->addr_lo = cpu_to_le32(U64_LO(mapping));
+
+       return 0;
+}
+
 static inline int bnx2x_alloc_rx_skb(struct bnx2x *bp,
                                     struct bnx2x_fastpath *fp, u16 index)
 {
@@ -1016,12 +1087,310 @@ static void bnx2x_reuse_rx_skb(struct bnx2x_fastpath *fp,
        *prod_bd = *cons_bd;
 }
 
+static inline void bnx2x_update_last_max_sge(struct bnx2x_fastpath *fp,
+                                            u16 idx)
+{
+       u16 last_max = fp->last_max_sge;
+
+       if (SUB_S16(idx, last_max) > 0)
+               fp->last_max_sge = idx;
+}
+
+static void bnx2x_clear_sge_mask_next_elems(struct bnx2x_fastpath *fp)
+{
+       int i, j;
+
+       for (i = 1; i <= NUM_RX_SGE_PAGES; i++) {
+               int idx = RX_SGE_CNT * i - 1;
+
+               for (j = 0; j < 2; j++) {
+                       SGE_MASK_CLEAR_BIT(fp, idx);
+                       idx--;
+               }
+       }
+}
+
+static void bnx2x_update_sge_prod(struct bnx2x_fastpath *fp,
+                                 struct eth_fast_path_rx_cqe *fp_cqe)
+{
+       struct bnx2x *bp = fp->bp;
+       u16 sge_len = BCM_PAGE_ALIGN(le16_to_cpu(fp_cqe->pkt_len) -
+                                    le16_to_cpu(fp_cqe->len_on_bd)) >>
+                     BCM_PAGE_SHIFT;
+       u16 last_max, last_elem, first_elem;
+       u16 delta = 0;
+       u16 i;
+
+       if (!sge_len)
+               return;
+
+       /* First mark all used pages */
+       for (i = 0; i < sge_len; i++)
+               SGE_MASK_CLEAR_BIT(fp, RX_SGE(le16_to_cpu(fp_cqe->sgl[i])));
+
+       DP(NETIF_MSG_RX_STATUS, "fp_cqe->sgl[%d] = %d\n",
+          sge_len - 1, le16_to_cpu(fp_cqe->sgl[sge_len - 1]));
+
+       /* Here we assume that the last SGE index is the biggest */
+       prefetch((void *)(fp->sge_mask));
+       bnx2x_update_last_max_sge(fp, le16_to_cpu(fp_cqe->sgl[sge_len - 1]));
+
+       last_max = RX_SGE(fp->last_max_sge);
+       last_elem = last_max >> RX_SGE_MASK_ELEM_SHIFT;
+       first_elem = RX_SGE(fp->rx_sge_prod) >> RX_SGE_MASK_ELEM_SHIFT;
+
+       /* If ring is not full */
+       if (last_elem + 1 != first_elem)
+               last_elem++;
+
+       /* Now update the prod */
+       for (i = first_elem; i != last_elem; i = NEXT_SGE_MASK_ELEM(i)) {
+               if (likely(fp->sge_mask[i]))
+                       break;
+
+               fp->sge_mask[i] = RX_SGE_MASK_ELEM_ONE_MASK;
+               delta += RX_SGE_MASK_ELEM_SZ;
+       }
+
+       if (delta > 0) {
+               fp->rx_sge_prod += delta;
+               /* clear page-end entries */
+               bnx2x_clear_sge_mask_next_elems(fp);
+       }
+
+       DP(NETIF_MSG_RX_STATUS,
+          "fp->last_max_sge = %d  fp->rx_sge_prod = %d\n",
+          fp->last_max_sge, fp->rx_sge_prod);
+}
+
+static inline void bnx2x_init_sge_ring_bit_mask(struct bnx2x_fastpath *fp)
+{
+       /* Set the mask to all 1-s: it's faster to compare to 0 than to 0xf-s */
+       memset(fp->sge_mask, 0xff,
+              (NUM_RX_SGE >> RX_SGE_MASK_ELEM_SHIFT)*sizeof(u64));
+
+       /* Clear the two last indeces in the page to 1:
+          these are the indeces that correspond to the "next" element,
+          hence will never be indicated and should be removed from
+          the calculations. */
+       bnx2x_clear_sge_mask_next_elems(fp);
+}
+
+static void bnx2x_tpa_start(struct bnx2x_fastpath *fp, u16 queue,
+                           struct sk_buff *skb, u16 cons, u16 prod)
+{
+       struct bnx2x *bp = fp->bp;
+       struct sw_rx_bd *cons_rx_buf = &fp->rx_buf_ring[cons];
+       struct sw_rx_bd *prod_rx_buf = &fp->rx_buf_ring[prod];
+       struct eth_rx_bd *prod_bd = &fp->rx_desc_ring[prod];
+       dma_addr_t mapping;
+
+       /* move empty skb from pool to prod and map it */
+       prod_rx_buf->skb = fp->tpa_pool[queue].skb;
+       mapping = pci_map_single(bp->pdev, fp->tpa_pool[queue].skb->data,
+                                bp->rx_buf_use_size, PCI_DMA_FROMDEVICE);
+       pci_unmap_addr_set(prod_rx_buf, mapping, mapping);
+
+       /* move partial skb from cons to pool (don't unmap yet) */
+       fp->tpa_pool[queue] = *cons_rx_buf;
+
+       /* mark bin state as start - print error if current state != stop */
+       if (fp->tpa_state[queue] != BNX2X_TPA_STOP)
+               BNX2X_ERR("start of bin not in stop [%d]\n", queue);
+
+       fp->tpa_state[queue] = BNX2X_TPA_START;
+
+       /* point prod_bd to new skb */
+       prod_bd->addr_hi = cpu_to_le32(U64_HI(mapping));
+       prod_bd->addr_lo = cpu_to_le32(U64_LO(mapping));
+
+#ifdef BNX2X_STOP_ON_ERROR
+       fp->tpa_queue_used |= (1 << queue);
+#ifdef __powerpc64__
+       DP(NETIF_MSG_RX_STATUS, "fp->tpa_queue_used = 0x%lx\n",
+#else
+       DP(NETIF_MSG_RX_STATUS, "fp->tpa_queue_used = 0x%llx\n",
+#endif
+          fp->tpa_queue_used);
+#endif
+}
+
+static int bnx2x_fill_frag_skb(struct bnx2x *bp, struct bnx2x_fastpath *fp,
+                              struct sk_buff *skb,
+                              struct eth_fast_path_rx_cqe *fp_cqe,
+                              u16 cqe_idx)
+{
+       struct sw_rx_page *rx_pg, old_rx_pg;
+       struct page *sge;
+       u16 len_on_bd = le16_to_cpu(fp_cqe->len_on_bd);
+       u32 i, frag_len, frag_size, pages;
+       int err;
+       int j;
+
+       frag_size = le16_to_cpu(fp_cqe->pkt_len) - len_on_bd;
+       pages = BCM_PAGE_ALIGN(frag_size) >> BCM_PAGE_SHIFT;
+
+       /* This is needed in order to enable forwarding support */
+       if (frag_size)
+               skb_shinfo(skb)->gso_size = min((u32)BCM_PAGE_SIZE,
+                                              max(frag_size, (u32)len_on_bd));
+
+#ifdef BNX2X_STOP_ON_ERROR
+       if (pages > 8*PAGES_PER_SGE) {
+               BNX2X_ERR("SGL length is too long: %d. CQE index is %d\n",
+                         pages, cqe_idx);
+               BNX2X_ERR("fp_cqe->pkt_len = %d  fp_cqe->len_on_bd = %d\n",
+                         fp_cqe->pkt_len, len_on_bd);
+               bnx2x_panic();
+               return -EINVAL;
+       }
+#endif
+
+       /* Run through the SGL and compose the fragmented skb */
+       for (i = 0, j = 0; i < pages; i += PAGES_PER_SGE, j++) {
+               u16 sge_idx = RX_SGE(le16_to_cpu(fp_cqe->sgl[j]));
+
+               /* FW gives the indices of the SGE as if the ring is an array
+                  (meaning that "next" element will consume 2 indices) */
+               frag_len = min(frag_size, (u32)(BCM_PAGE_SIZE*PAGES_PER_SGE));
+               rx_pg = &fp->rx_page_ring[sge_idx];
+               sge = rx_pg->page;
+               old_rx_pg = *rx_pg;
+
+               /* If we fail to allocate a substitute page, we simply stop
+                  where we are and drop the whole packet */
+               err = bnx2x_alloc_rx_sge(bp, fp, sge_idx);
+               if (unlikely(err)) {
+                       fp->rx_alloc_failed++;
+                       return err;
+               }
+
+               /* Unmap the page as we r going to pass it to the stack */
+               pci_unmap_page(bp->pdev, pci_unmap_addr(&old_rx_pg, mapping),
+                             BCM_PAGE_SIZE*PAGES_PER_SGE, PCI_DMA_FROMDEVICE);
+
+               /* Add one frag and update the appropriate fields in the skb */
+               skb_fill_page_desc(skb, j, old_rx_pg.page, 0, frag_len);
+
+               skb->data_len += frag_len;
+               skb->truesize += frag_len;
+               skb->len += frag_len;
+
+               frag_size -= frag_len;
+       }
+
+       return 0;
+}
+
+static void bnx2x_tpa_stop(struct bnx2x *bp, struct bnx2x_fastpath *fp,
+                          u16 queue, int pad, int len, union eth_rx_cqe *cqe,
+                          u16 cqe_idx)
+{
+       struct sw_rx_bd *rx_buf = &fp->tpa_pool[queue];
+       struct sk_buff *skb = rx_buf->skb;
+       /* alloc new skb */
+       struct sk_buff *new_skb = netdev_alloc_skb(bp->dev, bp->rx_buf_size);
+
+       /* Unmap skb in the pool anyway, as we are going to change
+          pool entry status to BNX2X_TPA_STOP even if new skb allocation
+          fails. */
+       pci_unmap_single(bp->pdev, pci_unmap_addr(rx_buf, mapping),
+                        bp->rx_buf_use_size, PCI_DMA_FROMDEVICE);
+
+       /* if alloc failed drop the packet and keep the buffer in the bin */
+       if (likely(new_skb)) {
+
+               prefetch(skb);
+               prefetch(((char *)(skb)) + 128);
+
+               /* else fix ip xsum and give it to the stack */
+               /* (no need to map the new skb) */
+#ifdef BNX2X_STOP_ON_ERROR
+               if (pad + len > bp->rx_buf_size) {
+                       BNX2X_ERR("skb_put is about to fail...  "
+                                 "pad %d  len %d  rx_buf_size %d\n",
+                                 pad, len, bp->rx_buf_size);
+                       bnx2x_panic();
+                       return;
+               }
+#endif
+
+               skb_reserve(skb, pad);
+               skb_put(skb, len);
+
+               skb->protocol = eth_type_trans(skb, bp->dev);
+               skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+               {
+                       struct iphdr *iph;
+
+                       iph = (struct iphdr *)skb->data;
+                       iph->check = 0;
+                       iph->check = ip_fast_csum((u8 *)iph, iph->ihl);
+               }
+
+               if (!bnx2x_fill_frag_skb(bp, fp, skb,
+                                        &cqe->fast_path_cqe, cqe_idx)) {
+#ifdef BCM_VLAN
+                       if ((bp->vlgrp != NULL) &&
+                           (le16_to_cpu(cqe->fast_path_cqe.pars_flags.flags) &
+                            PARSING_FLAGS_VLAN))
+                               vlan_hwaccel_receive_skb(skb, bp->vlgrp,
+                                               le16_to_cpu(cqe->fast_path_cqe.
+                                                           vlan_tag));
+                       else
+#endif
+                               netif_receive_skb(skb);
+               } else {
+                       DP(NETIF_MSG_RX_STATUS, "Failed to allocate new pages"
+                          " - dropping packet!\n");
+                       dev_kfree_skb(skb);
+               }
+
+               bp->dev->last_rx = jiffies;
+
+               /* put new skb in bin */
+               fp->tpa_pool[queue].skb = new_skb;
+
+       } else {
+               DP(NETIF_MSG_RX_STATUS,
+                  "Failed to allocate new skb - dropping packet!\n");
+               fp->rx_alloc_failed++;
+       }
+
+       fp->tpa_state[queue] = BNX2X_TPA_STOP;
+}
+
+static inline void bnx2x_update_rx_prod(struct bnx2x *bp,
+                                       struct bnx2x_fastpath *fp,
+                                       u16 bd_prod, u16 rx_comp_prod,
+                                       u16 rx_sge_prod)
+{
+       struct tstorm_eth_rx_producers rx_prods = {0};
+       int i;
+
+       /* Update producers */
+       rx_prods.bd_prod = bd_prod;
+       rx_prods.cqe_prod = rx_comp_prod;
+       rx_prods.sge_prod = rx_sge_prod;
+
+       for (i = 0; i < sizeof(struct tstorm_eth_rx_producers)/4; i++)
+               REG_WR(bp, BAR_TSTRORM_INTMEM +
+                      TSTORM_RX_PRODS_OFFSET(BP_PORT(bp), FP_CL_ID(fp)) + i*4,
+                      ((u32 *)&rx_prods)[i]);
+
+       DP(NETIF_MSG_RX_STATUS,
+          "Wrote: bd_prod %u  cqe_prod %u  sge_prod %u\n",
+          bd_prod, rx_comp_prod, rx_sge_prod);
+}
+
 static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
 {
        struct bnx2x *bp = fp->bp;
        u16 bd_cons, bd_prod, bd_prod_fw, comp_ring_cons;
        u16 hw_comp_cons, sw_comp_cons, sw_comp_prod;
        int rx_pkt = 0;
+       u16 queue;
 
 #ifdef BNX2X_STOP_ON_ERROR
        if (unlikely(bp->panic))
@@ -1082,6 +1451,49 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
                        len = le16_to_cpu(cqe->fast_path_cqe.pkt_len);
                        pad = cqe->fast_path_cqe.placement_offset;
 
+                       /* If CQE is marked both TPA_START and TPA_END
+                          it is a non-TPA CQE */
+                       if ((!fp->disable_tpa) &&
+                           (TPA_TYPE(cqe_fp_flags) !=
+                                       (TPA_TYPE_START | TPA_TYPE_END))) {
+                               queue = cqe->fast_path_cqe.queue_index;
+
+                               if (TPA_TYPE(cqe_fp_flags) == TPA_TYPE_START) {
+                                       DP(NETIF_MSG_RX_STATUS,
+                                          "calling tpa_start on queue %d\n",
+                                          queue);
+
+                                       bnx2x_tpa_start(fp, queue, skb,
+                                                       bd_cons, bd_prod);
+                                       goto next_rx;
+                               }
+
+                               if (TPA_TYPE(cqe_fp_flags) == TPA_TYPE_END) {
+                                       DP(NETIF_MSG_RX_STATUS,
+                                          "calling tpa_stop on queue %d\n",
+                                          queue);
+
+                                       if (!BNX2X_RX_SUM_FIX(cqe))
+                                               BNX2X_ERR("STOP on none TCP "
+                                                         "data\n");
+
+                                       /* This is a size of the linear data
+                                          on this skb */
+                                       len = le16_to_cpu(cqe->fast_path_cqe.
+                                                               len_on_bd);
+                                       bnx2x_tpa_stop(bp, fp, queue, pad,
+                                                   len, cqe, comp_ring_cons);
+#ifdef BNX2X_STOP_ON_ERROR
+                                       if (bp->panic)
+                                               return -EINVAL;
+#endif
+
+                                       bnx2x_update_sge_prod(fp,
+                                                       &cqe->fast_path_cqe);
+                                       goto next_cqe;
+                               }
+                       }
+
                        pci_dma_sync_single_for_device(bp->pdev,
                                        pci_unmap_addr(rx_buf, mapping),
                                                       pad + RX_COPY_THRESH,
@@ -1112,7 +1524,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
                                        DP(NETIF_MSG_RX_ERR,
                                           "ERROR  packet dropped "
                                           "because of alloc failure\n");
-                                       /* TBD count this as a drop? */
+                                       fp->rx_alloc_failed++;
                                        goto reuse_rx;
                                }
 
@@ -1138,6 +1550,7 @@ static int bnx2x_rx_int(struct bnx2x_fastpath *fp, int budget)
                                DP(NETIF_MSG_RX_ERR,
                                   "ERROR  packet dropped because "
                                   "of alloc failure\n");
+                               fp->rx_alloc_failed++;
 reuse_rx:
                                bnx2x_reuse_rx_skb(fp, skb, bd_cons, bd_prod);
                                goto next_rx;
@@ -1184,11 +1597,9 @@ next_cqe:
        fp->rx_comp_cons = sw_comp_cons;
        fp->rx_comp_prod = sw_comp_prod;
 
-       REG_WR(bp, BAR_TSTRORM_INTMEM +
-               TSTORM_RX_PRODS_OFFSET(BP_PORT(bp), FP_CL_ID(fp)),
-               sw_comp_prod);
-
-
+       /* Update producers */
+       bnx2x_update_rx_prod(bp, fp, bd_prod_fw, sw_comp_prod,
+                            fp->rx_sge_prod);
        mmiowb(); /* keep prod updates ordered */
 
        fp->rx_pkt += rx_pkt;
@@ -2745,10 +3156,10 @@ static void bnx2x_stats_pmf_update(struct bnx2x *bp)
        dmae->opcode = (opcode | DMAE_CMD_C_DST_PCI);
        dmae->src_addr_lo = (bp->port.port_stx >> 2) + DMAE_LEN32_RD_MAX;
        dmae->src_addr_hi = 0;
-       dmae->dst_addr_lo = U64_LO(bnx2x_sp_mapping(bp, port_stats)
-                                  DMAE_LEN32_RD_MAX * 4);
-       dmae->dst_addr_hi = U64_HI(bnx2x_sp_mapping(bp, port_stats)
-                                  DMAE_LEN32_RD_MAX * 4);
+       dmae->dst_addr_lo = U64_LO(bnx2x_sp_mapping(bp, port_stats) +
+                                  DMAE_LEN32_RD_MAX * 4);
+       dmae->dst_addr_hi = U64_HI(bnx2x_sp_mapping(bp, port_stats) +
+                                  DMAE_LEN32_RD_MAX * 4);
        dmae->len = (sizeof(struct host_port_stats) >> 2) - DMAE_LEN32_RD_MAX;
        dmae->comp_addr_lo = U64_LO(bnx2x_sp_mapping(bp, stats_comp));
        dmae->comp_addr_hi = U64_HI(bnx2x_sp_mapping(bp, stats_comp));
@@ -3365,11 +3776,12 @@ static void bnx2x_stats_update(struct bnx2x *bp)
                printk(KERN_DEBUG "  tx avail (%4x)  tx hc idx (%x)"
                                  "  tx pkt (%lx)\n",
                       bnx2x_tx_avail(bp->fp),
-                      *bp->fp->tx_cons_sb, nstats->tx_packets);
+                      le16_to_cpu(*bp->fp->tx_cons_sb), nstats->tx_packets);
                printk(KERN_DEBUG "  rx usage (%4x)  rx hc idx (%x)"
                                  "  rx pkt (%lx)\n",
-                      (u16)(*bp->fp->rx_cons_sb - bp->fp->rx_comp_cons),
-                      *bp->fp->rx_cons_sb, nstats->rx_packets);
+                      (u16)(le16_to_cpu(*bp->fp->rx_cons_sb) -
+                            bp->fp->rx_comp_cons),
+                      le16_to_cpu(*bp->fp->rx_cons_sb), nstats->rx_packets);
                printk(KERN_DEBUG "  %s (Xoff events %u)  brb drops %u\n",
                       netif_queue_stopped(bp->dev)? "Xoff" : "Xon",
                       estats->driver_xoff, estats->brb_drop_lo);
@@ -3623,6 +4035,8 @@ static void bnx2x_init_sb(struct bnx2x *bp, int sb_id,
        REG_WR(bp, BAR_CSTRORM_INTMEM +
               ((CSTORM_SB_HOST_SB_ADDR_OFFSET(port, sb_id)) + 4),
               U64_HI(section));
+       REG_WR8(bp, BAR_CSTRORM_INTMEM + FP_CSB_FUNC_OFF +
+               CSTORM_SB_HOST_STATUS_BLOCK_OFFSET(port, sb_id), func);
 
        for (index = 0; index < HC_CSTORM_SB_NUM_INDICES; index++)
                REG_WR16(bp, BAR_CSTRORM_INTMEM +
@@ -3814,22 +4228,94 @@ static void bnx2x_update_coalesce(struct bnx2x *bp)
        }
 }
 
+static inline void bnx2x_free_tpa_pool(struct bnx2x *bp,
+                                      struct bnx2x_fastpath *fp, int last)
+{
+       int i;
+
+       for (i = 0; i < last; i++) {
+               struct sw_rx_bd *rx_buf = &(fp->tpa_pool[i]);
+               struct sk_buff *skb = rx_buf->skb;
+
+               if (skb == NULL) {
+                       DP(NETIF_MSG_IFDOWN, "tpa bin %d empty on free\n", i);
+                       continue;
+               }
+
+               if (fp->tpa_state[i] == BNX2X_TPA_START)
+                       pci_unmap_single(bp->pdev,
+                                        pci_unmap_addr(rx_buf, mapping),
+                                        bp->rx_buf_use_size,
+                                        PCI_DMA_FROMDEVICE);
+
+               dev_kfree_skb(skb);
+               rx_buf->skb = NULL;
+       }
+}
+
 static void bnx2x_init_rx_rings(struct bnx2x *bp)
 {
-       u16 ring_prod;
+       int func = BP_FUNC(bp);
+       u16 ring_prod, cqe_ring_prod = 0;
        int i, j;
 
        bp->rx_buf_use_size = bp->dev->mtu;
-
        bp->rx_buf_use_size += bp->rx_offset + ETH_OVREHEAD;
        bp->rx_buf_size = bp->rx_buf_use_size + 64;
 
+       if (bp->flags & TPA_ENABLE_FLAG) {
+               DP(NETIF_MSG_IFUP,
+                  "rx_buf_use_size %d  rx_buf_size %d  effective_mtu %d\n",
+                  bp->rx_buf_use_size, bp->rx_buf_size,
+                  bp->dev->mtu + ETH_OVREHEAD);
+
+               for_each_queue(bp, j) {
+                       for (i = 0; i < ETH_MAX_AGGREGATION_QUEUES_E1H; i++) {
+                               struct bnx2x_fastpath *fp = &bp->fp[j];
+
+                               fp->tpa_pool[i].skb =
+                                  netdev_alloc_skb(bp->dev, bp->rx_buf_size);
+                               if (!fp->tpa_pool[i].skb) {
+                                       BNX2X_ERR("Failed to allocate TPA "
+                                                 "skb pool for queue[%d] - "
+                                                 "disabling TPA on this "
+                                                 "queue!\n", j);
+                                       bnx2x_free_tpa_pool(bp, fp, i);
+                                       fp->disable_tpa = 1;
+                                       break;
+                               }
+                               pci_unmap_addr_set((struct sw_rx_bd *)
+                                                       &bp->fp->tpa_pool[i],
+                                                  mapping, 0);
+                               fp->tpa_state[i] = BNX2X_TPA_STOP;
+                       }
+               }
+       }
+
        for_each_queue(bp, j) {
                struct bnx2x_fastpath *fp = &bp->fp[j];
 
                fp->rx_bd_cons = 0;
                fp->rx_cons_sb = BNX2X_RX_SB_INDEX;
+               fp->rx_bd_cons_sb = BNX2X_RX_SB_BD_INDEX;
+
+               /* "next page" elements initialization */
+               /* SGE ring */
+               for (i = 1; i <= NUM_RX_SGE_PAGES; i++) {
+                       struct eth_rx_sge *sge;
+
+                       sge = &fp->rx_sge_ring[RX_SGE_CNT * i - 2];
+                       sge->addr_hi =
+                               cpu_to_le32(U64_HI(fp->rx_sge_mapping +
+                                       BCM_PAGE_SIZE*(i % NUM_RX_SGE_PAGES)));
+                       sge->addr_lo =
+                               cpu_to_le32(U64_LO(fp->rx_sge_mapping +
+                                       BCM_PAGE_SIZE*(i % NUM_RX_SGE_PAGES)));
+               }
+
+               bnx2x_init_sge_ring_bit_mask(fp);
 
+               /* RX BD ring */
                for (i = 1; i <= NUM_RX_RINGS; i++) {
                        struct eth_rx_bd *rx_bd;
 
@@ -3856,35 +4342,61 @@ static void bnx2x_init_rx_rings(struct bnx2x *bp)
                                           BCM_PAGE_SIZE*(i % NUM_RCQ_RINGS)));
                }
 
-               /* rx completion queue */
-               fp->rx_comp_cons = ring_prod = 0;
+               /* Allocate SGEs and initialize the ring elements */
+               for (i = 0, ring_prod = 0;
+                    i < MAX_RX_SGE_CNT*NUM_RX_SGE_PAGES; i++) {
 
+                       if (bnx2x_alloc_rx_sge(bp, fp, ring_prod) < 0) {
+                               BNX2X_ERR("was only able to allocate "
+                                         "%d rx sges\n", i);
+                               BNX2X_ERR("disabling TPA for queue[%d]\n", j);
+                               /* Cleanup already allocated elements */
+                               bnx2x_free_rx_sge_range(bp, fp, ring_prod);
+                               bnx2x_free_tpa_pool(bp, fp,
+                                             ETH_MAX_AGGREGATION_QUEUES_E1H);
+                               fp->disable_tpa = 1;
+                               ring_prod = 0;
+                               break;
+                       }
+                       ring_prod = NEXT_SGE_IDX(ring_prod);
+               }
+               fp->rx_sge_prod = ring_prod;
+
+               /* Allocate BDs and initialize BD ring */
+               fp->rx_comp_cons = fp->rx_alloc_failed = 0;
+               cqe_ring_prod = ring_prod = 0;
                for (i = 0; i < bp->rx_ring_size; i++) {
                        if (bnx2x_alloc_rx_skb(bp, fp, ring_prod) < 0) {
                                BNX2X_ERR("was only able to allocate "
                                          "%d rx skbs\n", i);
+                               fp->rx_alloc_failed++;
                                break;
                        }
                        ring_prod = NEXT_RX_IDX(ring_prod);
+                       cqe_ring_prod = NEXT_RCQ_IDX(cqe_ring_prod);
                        BUG_TRAP(ring_prod > i);
                }
 
-               fp->rx_bd_prod = fp->rx_comp_prod = ring_prod;
+               fp->rx_bd_prod = ring_prod;
+               /* must not have more available CQEs than BDs */
+               fp->rx_comp_prod = min((u16)(NUM_RCQ_RINGS*RCQ_DESC_CNT),
+                                      cqe_ring_prod);
                fp->rx_pkt = fp->rx_calls = 0;
 
-               /* Warning! this will generate an interrupt (to the TSTORM) */
-               /* must only be done when chip is initialized */
-               REG_WR(bp, BAR_TSTRORM_INTMEM +
-                      TSTORM_RX_PRODS_OFFSET(BP_PORT(bp), FP_CL_ID(fp)),
-                       ring_prod);
+               /* Warning!
+                * this will generate an interrupt (to the TSTORM)
+                * must only be done after chip is initialized
+                */
+               bnx2x_update_rx_prod(bp, fp, ring_prod, fp->rx_comp_prod,
+                                    fp->rx_sge_prod);
                if (j != 0)
                        continue;
 
                REG_WR(bp, BAR_USTRORM_INTMEM +
-                      USTORM_MEM_WORKAROUND_ADDRESS_OFFSET(BP_PORT(bp)),
+                      USTORM_MEM_WORKAROUND_ADDRESS_OFFSET(func),
                       U64_LO(fp->rx_comp_mapping));
                REG_WR(bp, BAR_USTRORM_INTMEM +
-                      USTORM_MEM_WORKAROUND_ADDRESS_OFFSET(BP_PORT(bp)) + 4,
+                      USTORM_MEM_WORKAROUND_ADDRESS_OFFSET(func) + 4,
                       U64_HI(fp->rx_comp_mapping));
        }
 }
@@ -3972,6 +4484,18 @@ static void bnx2x_init_context(struct bnx2x *bp)
                                                U64_HI(fp->rx_desc_mapping);
                context->ustorm_st_context.common.bd_page_base_lo =
                                                U64_LO(fp->rx_desc_mapping);
+               if (!fp->disable_tpa) {
+                       context->ustorm_st_context.common.flags |=
+                               (USTORM_ETH_ST_CONTEXT_CONFIG_ENABLE_TPA |
+                                USTORM_ETH_ST_CONTEXT_CONFIG_ENABLE_SGE_RING);
+                       context->ustorm_st_context.common.sge_buff_size =
+                                       (u16)(BCM_PAGE_SIZE*PAGES_PER_SGE);
+                       context->ustorm_st_context.common.sge_page_base_hi =
+                                               U64_HI(fp->rx_sge_mapping);
+                       context->ustorm_st_context.common.sge_page_base_lo =
+                                               U64_LO(fp->rx_sge_mapping);
+               }
+
                context->cstorm_st_context.sb_index_number =
                                                HC_INDEX_C_ETH_TX_CQ_CONS;
                context->cstorm_st_context.status_block_id = sb_id;
@@ -4022,6 +4546,18 @@ static void bnx2x_set_client_config(struct bnx2x *bp)
        }
 #endif
 
+       if (bp->flags & TPA_ENABLE_FLAG) {
+               tstorm_client.max_sges_for_packet =
+                       BCM_PAGE_ALIGN(tstorm_client.mtu) >> BCM_PAGE_SHIFT;
+               tstorm_client.max_sges_for_packet =
+                       ((tstorm_client.max_sges_for_packet +
+                         PAGES_PER_SGE - 1) & (~(PAGES_PER_SGE - 1))) >>
+                       PAGES_PER_SGE_SHIFT;
+
+               tstorm_client.config_flags |=
+                               TSTORM_ETH_CLIENT_CONFIG_ENABLE_SGE_RING;
+       }
+
        for_each_queue(bp, i) {
                REG_WR(bp, BAR_TSTRORM_INTMEM +
                       TSTORM_CLIENT_CONFIG_OFFSET(port, bp->fp[i].cl_id),
@@ -4136,8 +4672,8 @@ static void bnx2x_init_internal(struct bnx2x *bp)
                REG_WR8(bp, BAR_USTRORM_INTMEM + USTORM_FUNCTION_MODE_OFFSET,
                        IS_E1HMF(bp));
 
-               REG_WR16(bp, BAR_XSTRORM_INTMEM +
-                        XSTORM_E1HOV_OFFSET(func), bp->e1hov);
+               REG_WR16(bp, BAR_XSTRORM_INTMEM + XSTORM_E1HOV_OFFSET(func),
+                        bp->e1hov);
        }
 
        /* Zero this manualy as its initialization is
@@ -4145,6 +4681,25 @@ static void bnx2x_init_internal(struct bnx2x *bp)
        for (i = 0; i < USTORM_AGG_DATA_SIZE >> 2; i++)
                REG_WR(bp, BAR_USTRORM_INTMEM +
                       USTORM_AGG_DATA_OFFSET + 4*i, 0);
+
+       for_each_queue(bp, i) {
+               struct bnx2x_fastpath *fp = &bp->fp[i];
+               u16 max_agg_size;
+
+               REG_WR(bp, BAR_USTRORM_INTMEM +
+                      USTORM_CQE_PAGE_BASE_OFFSET(port, FP_CL_ID(fp)),
+                      U64_LO(fp->rx_comp_mapping));
+               REG_WR(bp, BAR_USTRORM_INTMEM +
+                      USTORM_CQE_PAGE_BASE_OFFSET(port, FP_CL_ID(fp)) + 4,
+                      U64_HI(fp->rx_comp_mapping));
+
+               max_agg_size = min((u32)(bp->rx_buf_use_size +
+                                        8*BCM_PAGE_SIZE*PAGES_PER_SGE),
+                                  (u32)0xffff);
+               REG_WR16(bp, BAR_USTRORM_INTMEM +
+                        USTORM_MAX_AGG_SIZE_OFFSET(port, FP_CL_ID(fp)),
+                        max_agg_size);
+       }
 }
 
 static void bnx2x_nic_init(struct bnx2x *bp)
@@ -4767,6 +5322,17 @@ static int bnx2x_init_common(struct bnx2x *bp)
 
        enable_blocks_attention(bp);
 
+       if (bp->flags & TPA_ENABLE_FLAG) {
+               struct tstorm_eth_tpa_exist tmp = {0};
+
+               tmp.tpa_exist = 1;
+
+               REG_WR(bp, BAR_TSTRORM_INTMEM + TSTORM_TPA_EXIST_OFFSET,
+                      ((u32 *)&tmp)[0]);
+               REG_WR(bp, BAR_TSTRORM_INTMEM + TSTORM_TPA_EXIST_OFFSET + 4,
+                      ((u32 *)&tmp)[1]);
+       }
+
        return 0;
 }
 
@@ -5145,8 +5711,12 @@ static void bnx2x_free_mem(struct bnx2x *bp)
                               bnx2x_fp(bp, i, rx_comp_mapping),
                               sizeof(struct eth_fast_path_rx_cqe) *
                               NUM_RCQ_BD);
-       }
 
+               /* SGE ring */
+               BNX2X_PCI_FREE(bnx2x_fp(bp, i, rx_sge_ring),
+                              bnx2x_fp(bp, i, rx_sge_mapping),
+                              BCM_PAGE_SIZE * NUM_RX_SGE_PAGES);
+       }
        /* end of fastpath */
 
        BNX2X_PCI_FREE(bp->def_status_blk, bp->def_status_blk_mapping,
@@ -5161,7 +5731,7 @@ static void bnx2x_free_mem(struct bnx2x *bp)
        BNX2X_PCI_FREE(bp->timers, bp->timers_mapping, 8*1024);
        BNX2X_PCI_FREE(bp->qm, bp->qm_mapping, 128*1024);
 #endif
-       BNX2X_PCI_FREE(bp->spq, bp->spq_mapping, PAGE_SIZE);
+       BNX2X_PCI_FREE(bp->spq, bp->spq_mapping, BCM_PAGE_SIZE);
 
 #undef BNX2X_PCI_FREE
 #undef BNX2X_KFREE
@@ -5223,6 +5793,12 @@ static int bnx2x_alloc_mem(struct bnx2x *bp)
                                sizeof(struct eth_fast_path_rx_cqe) *
                                NUM_RCQ_BD);
 
+               /* SGE ring */
+               BNX2X_ALLOC(bnx2x_fp(bp, i, rx_page_ring),
+                               sizeof(struct sw_rx_page) * NUM_RX_SGE);
+               BNX2X_PCI_ALLOC(bnx2x_fp(bp, i, rx_sge_ring),
+                               &bnx2x_fp(bp, i, rx_sge_mapping),
+                               BCM_PAGE_SIZE * NUM_RX_SGE_PAGES);
        }
        /* end of fastpath */
 
@@ -5313,6 +5889,9 @@ static void bnx2x_free_rx_skbs(struct bnx2x *bp)
                        rx_buf->skb = NULL;
                        dev_kfree_skb(skb);
                }
+               if (!fp->disable_tpa)
+                       bnx2x_free_tpa_pool(bp, fp,
+                                           ETH_MAX_AGGREGATION_QUEUES_E1H);
        }
 }
 
@@ -5664,6 +6243,10 @@ static int bnx2x_nic_load(struct bnx2x *bp, int load_mode)
        if (bnx2x_alloc_mem(bp))
                return -ENOMEM;
 
+       for_each_queue(bp, i)
+               bnx2x_fp(bp, i, disable_tpa) =
+                                       ((bp->flags & TPA_ENABLE_FLAG) == 0);
+
        /* Disable interrupt handling until HW is initialized */
        atomic_set(&bp->intr_sem, 1);
 
@@ -5792,6 +6375,11 @@ load_int_disable:
        /* Release IRQs */
        bnx2x_free_irq(bp);
 
+       /* Free SKBs, SGEs, TPA pool and driver internals */
+       bnx2x_free_skbs(bp);
+       for_each_queue(bp, i)
+               bnx2x_free_rx_sge_range(bp, bp->fp + i,
+                                       RX_SGE_CNT*NUM_RX_SGE_PAGES);
 load_error:
        bnx2x_free_mem(bp);
 
@@ -6090,8 +6678,11 @@ unload_error:
        if (!BP_NOMCP(bp))
                bnx2x_fw_command(bp, DRV_MSG_CODE_UNLOAD_DONE);
 
-       /* Free SKBs and driver internals */
+       /* Free SKBs, SGEs, TPA pool and driver internals */
        bnx2x_free_skbs(bp);
+       for_each_queue(bp, i)
+               bnx2x_free_rx_sge_range(bp, bp->fp + i,
+                                       RX_SGE_CNT*NUM_RX_SGE_PAGES);
        bnx2x_free_mem(bp);
 
        bp->state = BNX2X_STATE_CLOSED;
@@ -6767,6 +7358,16 @@ static int __devinit bnx2x_init_bp(struct bnx2x *bp)
                printk(KERN_ERR PFX
                       "MCP disabled, must load devices in order!\n");
 
+       /* Set TPA flags */
+       if (disable_tpa) {
+               bp->flags &= ~TPA_ENABLE_FLAG;
+               bp->dev->features &= ~NETIF_F_LRO;
+       } else {
+               bp->flags |= TPA_ENABLE_FLAG;
+               bp->dev->features |= NETIF_F_LRO;
+       }
+
+
        bp->tx_ring_size = MAX_TX_AVAIL;
        bp->rx_ring_size = MAX_RX_AVAIL;
 
@@ -7556,6 +8157,33 @@ static int bnx2x_set_coalesce(struct net_device *dev,
        return 0;
 }
 
+static int bnx2x_set_flags(struct net_device *dev, u32 data)
+{
+       struct bnx2x *bp = netdev_priv(dev);
+       int changed = 0;
+       int rc = 0;
+
+       if (data & ETH_FLAG_LRO) {
+               if (!(dev->features & NETIF_F_LRO)) {
+                       dev->features |= NETIF_F_LRO;
+                       bp->flags |= TPA_ENABLE_FLAG;
+                       changed = 1;
+               }
+
+       } else if (dev->features & NETIF_F_LRO) {
+               dev->features &= ~NETIF_F_LRO;
+               bp->flags &= ~TPA_ENABLE_FLAG;
+               changed = 1;
+       }
+
+       if (changed && netif_running(dev)) {
+               bnx2x_nic_unload(bp, UNLOAD_NORMAL);
+               rc = bnx2x_nic_load(bp, LOAD_NORMAL);
+       }
+
+       return rc;
+}
+
 static void bnx2x_get_ringparam(struct net_device *dev,
                                struct ethtool_ringparam *ering)
 {
@@ -7896,35 +8524,37 @@ static int bnx2x_phys_id(struct net_device *dev, u32 data)
 }
 
 static struct ethtool_ops bnx2x_ethtool_ops = {
-       .get_settings           = bnx2x_get_settings,
-       .set_settings           = bnx2x_set_settings,
-       .get_drvinfo            = bnx2x_get_drvinfo,
+       .get_settings           = bnx2x_get_settings,
+       .set_settings           = bnx2x_set_settings,
+       .get_drvinfo            = bnx2x_get_drvinfo,
        .get_wol                = bnx2x_get_wol,
        .set_wol                = bnx2x_set_wol,
-       .get_msglevel           = bnx2x_get_msglevel,
-       .set_msglevel           = bnx2x_set_msglevel,
-       .nway_reset             = bnx2x_nway_reset,
-       .get_link               = ethtool_op_get_link,
-       .get_eeprom_len         = bnx2x_get_eeprom_len,
-       .get_eeprom             = bnx2x_get_eeprom,
-       .set_eeprom             = bnx2x_set_eeprom,
-       .get_coalesce           = bnx2x_get_coalesce,
-       .set_coalesce           = bnx2x_set_coalesce,
-       .get_ringparam          = bnx2x_get_ringparam,
-       .set_ringparam          = bnx2x_set_ringparam,
-       .get_pauseparam         = bnx2x_get_pauseparam,
-       .set_pauseparam         = bnx2x_set_pauseparam,
-       .get_rx_csum            = bnx2x_get_rx_csum,
-       .set_rx_csum            = bnx2x_set_rx_csum,
-       .get_tx_csum            = ethtool_op_get_tx_csum,
+       .get_msglevel           = bnx2x_get_msglevel,
+       .set_msglevel           = bnx2x_set_msglevel,
+       .nway_reset             = bnx2x_nway_reset,
+       .get_link               = ethtool_op_get_link,
+       .get_eeprom_len         = bnx2x_get_eeprom_len,
+       .get_eeprom             = bnx2x_get_eeprom,
+       .set_eeprom             = bnx2x_set_eeprom,
+       .get_coalesce           = bnx2x_get_coalesce,
+       .set_coalesce           = bnx2x_set_coalesce,
+       .get_ringparam          = bnx2x_get_ringparam,
+       .set_ringparam          = bnx2x_set_ringparam,
+       .get_pauseparam         = bnx2x_get_pauseparam,
+       .set_pauseparam         = bnx2x_set_pauseparam,
+       .get_rx_csum            = bnx2x_get_rx_csum,
+       .set_rx_csum            = bnx2x_set_rx_csum,
+       .get_tx_csum            = ethtool_op_get_tx_csum,
        .set_tx_csum            = ethtool_op_set_tx_csum,
-       .get_sg                 = ethtool_op_get_sg,
-       .set_sg                 = ethtool_op_set_sg,
+       .set_flags              = bnx2x_set_flags,
+       .get_flags              = ethtool_op_get_flags,
+       .get_sg                 = ethtool_op_get_sg,
+       .set_sg                 = ethtool_op_set_sg,
        .get_tso                = ethtool_op_get_tso,
        .set_tso                = bnx2x_set_tso,
        .self_test_count        = bnx2x_self_test_count,
-       .self_test              = bnx2x_self_test,
-       .get_strings            = bnx2x_get_strings,
+       .self_test              = bnx2x_self_test,
+       .get_strings            = bnx2x_get_strings,
        .phys_id                = bnx2x_phys_id,
        .get_stats_count        = bnx2x_get_stats_count,
        .get_ethtool_stats      = bnx2x_get_ethtool_stats,