af_packet: TX_RING support for TPACKET_V3
author: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Tue, 3 Jan 2017 14:31:47 +0000 (06:31 -0800)
committer: David S. Miller <davem@davemloft.net>
Tue, 3 Jan 2017 16:00:27 +0000 (11:00 -0500)
Although TPACKET_V3 Rx has some benefits over TPACKET_V2 Rx, *_v3
does not currently have TX_RING support. As a result an application
that wants the best perf for Tx and Rx (e.g. to handle request/response
transactions) ends up needing 2 sockets, one with *_v2 for Tx and
another with *_v3 for Rx.

This patch enables TPACKET_V2 compatible Tx features in TPACKET_V3
so that an application can use a single descriptor to get the benefits
of _v3 RX_RING and _v2 TX_RING. An application may do a block-send by
first filling up multiple frames in the Tx ring and then triggering a
transmit. This patch only supports fixed-size Tx frames for TPACKET_V3,
and requires that tp_next_offset must be zero.

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Documentation/networking/packet_mmap.txt
net/packet/af_packet.c

index daa015a..f3b9e50 100644 (file)
@@ -565,7 +565,7 @@ TPACKET_V1 --> TPACKET_V2:
                   (void *)hdr + TPACKET_ALIGN(sizeof(struct tpacket_hdr))
 
 TPACKET_V2 --> TPACKET_V3:
-       - Flexible buffer implementation:
+       - Flexible buffer implementation for RX_RING:
                1. Blocks can be configured with non-static frame-size
                2. Read/poll is at a block-level (as opposed to packet-level)
                3. Added poll timeout to avoid indefinite user-space wait
@@ -574,7 +574,12 @@ TPACKET_V2 --> TPACKET_V3:
                        4.1 block::timeout
                        4.2 tpkt_hdr::sk_rxhash
        - RX Hash data available in user space
-       - Currently only RX_RING available
+       - TX_RING semantics are conceptually similar to TPACKET_V2;
+         use tpacket3_hdr instead of tpacket2_hdr, and TPACKET3_HDRLEN
+         instead of TPACKET2_HDRLEN. In the current implementation,
+         the tp_next_offset field in the tpacket3_hdr MUST be set to
+         zero, indicating that the ring does not hold variable sized frames.
+         Packets with non-zero values of tp_next_offset will be dropped.
 
 -------------------------------------------------------------------------------
 + AF_PACKET fanout mode
index b9e1a13..7e39087 100644 (file)
@@ -409,6 +409,9 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                break;
        case TPACKET_V3:
+               h.h3->tp_status = status;
+               flush_dcache_page(pgv_to_page(&h.h3->tp_status));
+               break;
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
@@ -432,6 +435,8 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
                flush_dcache_page(pgv_to_page(&h.h2->tp_status));
                return h.h2->tp_status;
        case TPACKET_V3:
+               flush_dcache_page(pgv_to_page(&h.h3->tp_status));
+               return h.h3->tp_status;
        default:
                WARN(1, "TPACKET version not supported.\n");
                BUG();
@@ -2497,6 +2502,13 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
        ph.raw = frame;
 
        switch (po->tp_version) {
+       case TPACKET_V3:
+               if (ph.h3->tp_next_offset != 0) {
+                       pr_warn_once("variable sized slot not supported");
+                       return -EINVAL;
+               }
+               tp_len = ph.h3->tp_len;
+               break;
        case TPACKET_V2:
                tp_len = ph.h2->tp_len;
                break;
@@ -2516,6 +2528,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
                off_max = po->tx_ring.frame_size - tp_len;
                if (po->sk.sk_type == SOCK_DGRAM) {
                        switch (po->tp_version) {
+                       case TPACKET_V3:
+                               off = ph.h3->tp_net;
+                               break;
                        case TPACKET_V2:
                                off = ph.h2->tp_net;
                                break;
@@ -2525,6 +2540,9 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
                        }
                } else {
                        switch (po->tp_version) {
+                       case TPACKET_V3:
+                               off = ph.h3->tp_mac;
+                               break;
                        case TPACKET_V2:
                                off = ph.h2->tp_mac;
                                break;
@@ -4113,11 +4131,6 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
        struct tpacket_req *req = &req_u->req;
 
        lock_sock(sk);
-       /* Opening a Tx-ring is NOT supported in TPACKET_V3 */
-       if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
-               net_warn_ratelimited("Tx-ring is not supported.\n");
-               goto out;
-       }
 
        rb = tx_ring ? &po->tx_ring : &po->rx_ring;
        rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
@@ -4177,11 +4190,19 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                        goto out;
                switch (po->tp_version) {
                case TPACKET_V3:
-               /* Transmit path is not supported. We checked
-                * it above but just being paranoid
-                */
-                       if (!tx_ring)
+                       /* Block transmit is not supported yet */
+                       if (!tx_ring) {
                                init_prb_bdqc(po, rb, pg_vec, req_u);
+                       } else {
+                               struct tpacket_req3 *req3 = &req_u->req3;
+
+                               if (req3->tp_retire_blk_tov ||
+                                   req3->tp_sizeof_priv ||
+                                   req3->tp_feature_req_word) {
+                                       err = -EINVAL;
+                                       goto out;
+                               }
+                       }
                        break;
                default:
                        break;