tcp: reflect tos value received in SYN to the socket
author: Wei Wang <weiwan@google.com>
Thu, 10 Sep 2020 00:50:48 +0000 (17:50 -0700)
committer: David S. Miller <davem@davemloft.net>
Thu, 10 Sep 2020 20:15:40 +0000 (13:15 -0700)
This commit adds a new TCP feature that reflects the ToS value received
in the SYN: the value is sent back on the SYN-ACK and is then set as the
ToS value (traffic class for IPv6) of the established socket. This
provides a way to give all traffic in a connection the same traffic
class/QoS level as the incoming SYN request. It could be useful in data
centers to provide equivalent QoS according to the incoming request.
The ECN bits are masked out (~INET_ECN_MASK) of the value used on the
SYN-ACK and of the value stored in the new socket.

This feature is guarded by /proc/sys/net/ipv4/tcp_reflect_tos (0 or 1),
and is turned off by default.

Signed-off-by: Wei Wang <weiwan@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
include/net/netns/ipv4.h
net/ipv4/sysctl_net_ipv4.c
net/ipv4/tcp_ipv4.c
net/ipv6/tcp_ipv6.c

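diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 9e36738c1fe164fa75cb6c4a0802773925f73b9a..8e4fcac4df72f7f4188df410e90e3706998dc738 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h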
@@ -183,6 +183,7 @@ struct netns_ipv4 {
        unsigned int sysctl_tcp_fastopen_blackhole_timeout;
        atomic_t tfo_active_disable_times;
        unsigned long tfo_active_disable_stamp;
+       int sysctl_tcp_reflect_tos;
 
        int sysctl_udp_wmem_min;
        int sysctl_udp_rmem_min;
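diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 54023a46db04720ac343549113dadd7fa6da9082..3e5f4f2e705e84937112aed32df60b6a9d7e8127 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c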
@@ -1329,6 +1329,15 @@ static struct ctl_table ipv4_net_table[] = {
                .extra1         = SYSCTL_ZERO,
                .extra2         = &comp_sack_nr_max,
        },
+       {
+               .procname       = "tcp_reflect_tos",
+               .data           = &init_net.ipv4.sysctl_tcp_reflect_tos,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ZERO,
+               .extra2         = SYSCTL_ONE,
+       },
        {
                .procname       = "udp_rmem_min",
                .data           = &init_net.ipv4.sysctl_udp_rmem_min,
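diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index c4c7ad4c8b5a5aacf9837d8947da3c8cfc473105..ace48b2790ffaf41bf3a84f0d3b4496b760ac30a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c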
@@ -972,6 +972,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
        struct flowi4 fl4;
        int err = -1;
        struct sk_buff *skb;
+       u8 tos;
 
        /* First, grab a route. */
        if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
@@ -979,6 +980,9 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
 
        skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
 
+       tos = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
+                       tcp_rsk(req)->syn_tos : inet_sk(sk)->tos;
+
        if (skb) {
                __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
 
@@ -986,7 +990,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
                err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
                                            ireq->ir_rmt_addr,
                                            rcu_dereference(ireq->ireq_opt),
-                                           inet_sk(sk)->tos);
+                                           tos & ~INET_ECN_MASK);
                rcu_read_unlock();
                err = net_xmit_eval(err);
        }
@@ -1531,6 +1535,10 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
                inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
        newinet->inet_id = prandom_u32();
 
+       /* Set ToS of the new socket based upon the value of incoming SYN. */
+       if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
+               newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
        if (!dst) {
                dst = inet_csk_route_child_sock(sk, newsk, req);
                if (!dst)
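diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 04efa3ee80ef42fa04fd1c672e5d91bfc3f98ea3..862058dce6d047f5c5e78a67284dff8403c22312 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c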
@@ -510,6 +510,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
        struct flowi6 *fl6 = &fl->u.ip6;
        struct sk_buff *skb;
        int err = -ENOMEM;
+       u8 tclass;
 
        /* First, grab a route. */
        if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
@@ -528,9 +529,12 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 
                rcu_read_lock();
                opt = ireq->ipv6_opt;
+               tclass = sock_net(sk)->ipv4.sysctl_tcp_reflect_tos ?
+                               tcp_rsk(req)->syn_tos : np->tclass;
                if (!opt)
                        opt = rcu_dereference(np->opt);
-               err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass,
+               err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt,
+                              tclass & ~INET_ECN_MASK,
                               sk->sk_priority);
                rcu_read_unlock();
                err = net_xmit_eval(err);
@@ -1310,6 +1314,10 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
        if (np->repflow)
                newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb));
 
+       /* Set ToS of the new socket based upon the value of incoming SYN. */
+       if (sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)
+               newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
+
        /* Clone native IPv6 options from listening socket (if any)
 
           Yes, keeping reference count would be much more clever,