}
#endif
-static int packet_direct_xmit(struct sk_buff *skb)
+static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb)
{
+ if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS))
+ return dev_queue_xmit(skb);
+
#ifdef CONFIG_NETFILTER_EGRESS
if (nf_hook_egress_active()) {
skb = nf_hook_direct_egress(skb);
RCU_INIT_POINTER(po->cached_dev, NULL);
}
-static bool packet_use_direct_xmit(const struct packet_sock *po)
-{
- /* Paired with WRITE_ONCE() in packet_setsockopt() */
- return READ_ONCE(po->xmit) == packet_direct_xmit;
-}
-
static u16 packet_pick_tx_queue(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
{
struct packet_sock *po = pkt_sk(sk);
- if (!po->running) {
+ if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
if (po->fanout)
__fanout_link(sk, po);
else
dev_add_pack(&po->prot_hook);
sock_hold(sk);
- po->running = 1;
+ packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1);
}
}
lockdep_assert_held_once(&po->bind_lock);
- po->running = 0;
+ packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0);
if (po->fanout)
__fanout_unlink(sk, po);
{
struct packet_sock *po = pkt_sk(sk);
- if (po->running)
+ if (packet_sock_flag(po, PACKET_SOCK_RUNNING))
__unregister_prot_hook(sk, sync);
}
static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
- int pressure, ret;
+ bool pressure;
+ int ret;
ret = __packet_rcv_has_room(po, skb);
pressure = ret != ROOM_NORMAL;
- if (READ_ONCE(po->pressure) != pressure)
- WRITE_ONCE(po->pressure, pressure);
+ if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure)
+ packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure);
return ret;
}
static void packet_rcv_try_clear_pressure(struct packet_sock *po)
{
- if (READ_ONCE(po->pressure) &&
+ if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) &&
__packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
- WRITE_ONCE(po->pressure, 0);
+ packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false);
}
static void packet_sock_destruct(struct sock *sk)
i = j = min_t(int, po->rollover->sock, num - 1);
do {
po_next = pkt_sk(rcu_dereference(f->arr[i]));
- if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
+ if (po_next != po_skip &&
+ !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) &&
packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
if (i != j)
po->rollover->sock = i;
err = -EINVAL;
spin_lock(&po->bind_lock);
- if (po->running &&
+ if (packet_sock_flag(po, PACKET_SOCK_RUNNING) &&
match->type == type &&
match->prot_hook.type == po->prot_hook.type &&
match->prot_hook.dev == po->prot_hook.dev) {
goto retry;
}
- if (!dev_validate_header(dev, skb->data, len)) {
+ if (!dev_validate_header(dev, skb->data, len) || !skb->len) {
err = -EINVAL;
goto out_unlock;
}
}
static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
- size_t *len)
+ size_t *len, int vnet_hdr_sz)
{
- struct virtio_net_hdr vnet_hdr;
+ struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 };
- if (*len < sizeof(vnet_hdr))
+ if (*len < vnet_hdr_sz)
return -EINVAL;
- *len -= sizeof(vnet_hdr);
+ *len -= vnet_hdr_sz;
- if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
+ if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0))
return -EINVAL;
- return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
+ return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz);
}
/*
__u32 ts_status;
bool is_drop_n_account = false;
unsigned int slot_id = 0;
- bool do_vnet = false;
+ int vnet_hdr_sz = 0;
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
* We may add members to them until current aligned size without forcing
netoff = TPACKET_ALIGN(po->tp_hdrlen +
(maclen < 16 ? 16 : maclen)) +
po->tp_reserve;
- if (po->has_vnet_hdr) {
- netoff += sizeof(struct virtio_net_hdr);
- do_vnet = true;
- }
+ vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
+ if (vnet_hdr_sz)
+ netoff += vnet_hdr_sz;
macoff = netoff - maclen;
}
if (netoff > USHRT_MAX) {
snaplen = po->rx_ring.frame_size - macoff;
if ((int)snaplen < 0) {
snaplen = 0;
- do_vnet = false;
+ vnet_hdr_sz = 0;
}
}
} else if (unlikely(macoff + snaplen >
if (unlikely((int)snaplen < 0)) {
snaplen = 0;
macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
- do_vnet = false;
+ vnet_hdr_sz = 0;
}
}
spin_lock(&sk->sk_receive_queue.lock);
__set_bit(slot_id, po->rx_ring.rx_owner_map);
}
- if (do_vnet &&
+ if (vnet_hdr_sz &&
virtio_net_hdr_from_skb(skb, h.raw + macoff -
sizeof(struct virtio_net_hdr),
vio_le(), true, 0)) {
}
static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
- struct virtio_net_hdr *vnet_hdr)
+ struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz)
{
- if (*len < sizeof(*vnet_hdr))
+ int ret;
+
+ if (*len < vnet_hdr_sz)
return -EINVAL;
- *len -= sizeof(*vnet_hdr);
+ *len -= vnet_hdr_sz;
if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
return -EFAULT;
- return __packet_snd_vnet_parse(vnet_hdr, *len);
+ ret = __packet_snd_vnet_parse(vnet_hdr, *len);
+ if (ret)
+ return ret;
+
+ /* move iter to point to the start of mac header */
+ if (vnet_hdr_sz != sizeof(struct virtio_net_hdr))
+ iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr));
+
+ return 0;
}
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
nr_frags = skb_shinfo(skb)->nr_frags;
if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
- pr_err("Packet exceed the number of skb frags(%lu)\n",
- MAX_SKB_FRAGS);
+ pr_err("Packet exceed the number of skb frags(%u)\n",
+ (unsigned int)MAX_SKB_FRAGS);
return -EFAULT;
}
void *ph;
DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
+ int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
unsigned char *addr = NULL;
int tp_len, size_max;
void *data;
size_max = po->tx_ring.frame_size
- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
- if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
+ if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz)
size_max = dev->mtu + reserve + VLAN_HLEN;
reinit_completion(&po->skb_completion);
status = TP_STATUS_SEND_REQUEST;
hlen = LL_RESERVED_SPACE(dev);
tlen = dev->needed_tailroom;
- if (po->has_vnet_hdr) {
+ if (vnet_hdr_sz) {
vnet_hdr = data;
- data += sizeof(*vnet_hdr);
- tp_len -= sizeof(*vnet_hdr);
+ data += vnet_hdr_sz;
+ tp_len -= vnet_hdr_sz;
if (tp_len < 0 ||
__packet_snd_vnet_parse(vnet_hdr, tp_len)) {
tp_len = -EINVAL;
addr, hlen, copylen, &sockc);
if (likely(tp_len >= 0) &&
tp_len > dev->mtu + reserve &&
- !po->has_vnet_hdr &&
+ !vnet_hdr_sz &&
!packet_extra_vlan_len_allowed(dev, skb))
tp_len = -EMSGSIZE;
if (unlikely(tp_len < 0)) {
tpacket_error:
- if (po->tp_loss) {
+ if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) {
__packet_set_status(po, ph,
TP_STATUS_AVAILABLE);
packet_increment_head(&po->tx_ring);
}
}
- if (po->has_vnet_hdr) {
+ if (vnet_hdr_sz) {
if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
tp_len = -EINVAL;
goto tpacket_error;
packet_inc_pending(&po->tx_ring);
status = TP_STATUS_SEND_REQUEST;
- /* Paired with WRITE_ONCE() in packet_setsockopt() */
- err = READ_ONCE(po->xmit)(skb);
+ err = packet_xmit(po, skb);
if (unlikely(err != 0)) {
if (err > 0)
err = net_xmit_errno(err);
struct virtio_net_hdr vnet_hdr = { 0 };
int offset = 0;
struct packet_sock *po = pkt_sk(sk);
- bool has_vnet_hdr = false;
+ int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz);
int hlen, tlen, linear;
int extra_len = 0;
if (sock->type == SOCK_RAW)
reserve = dev->hard_header_len;
- if (po->has_vnet_hdr) {
- err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
+ if (vnet_hdr_sz) {
+ err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz);
if (err)
goto out_unlock;
- has_vnet_hdr = true;
}
if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
packet_parse_headers(skb, sock);
- if (has_vnet_hdr) {
+ if (vnet_hdr_sz) {
err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
if (err)
goto out_free;
- len += sizeof(vnet_hdr);
+ len += vnet_hdr_sz;
virtio_net_hdr_set_proto(skb, &vnet_hdr);
}
- /* Paired with WRITE_ONCE() in packet_setsockopt() */
- err = READ_ONCE(po->xmit)(skb);
+ err = packet_xmit(po, skb);
+
if (unlikely(err != 0)) {
if (err > 0)
err = net_xmit_errno(err);
if (need_rehook) {
dev_hold(dev);
- if (po->running) {
+ if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
rcu_read_unlock();
/* prevents packet_notifier() from calling
* register_prot_hook()
dev->ifindex);
}
- BUG_ON(po->running);
+ BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING));
WRITE_ONCE(po->num, proto);
po->prot_hook.type = proto;
init_completion(&po->skb_completion);
sk->sk_family = PF_PACKET;
po->num = proto;
- po->xmit = dev_queue_xmit;
err = packet_alloc_pending(po);
if (err)
struct sock *sk = sock->sk;
struct sk_buff *skb;
int copied, err;
- int vnet_hdr_len = 0;
+ int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz);
unsigned int origlen = 0;
err = -EINVAL;
packet_rcv_try_clear_pressure(pkt_sk(sk));
- if (pkt_sk(sk)->has_vnet_hdr) {
- err = packet_rcv_vnet(msg, skb, &len);
+ if (vnet_hdr_len) {
+ err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len);
if (err)
goto out_free;
- vnet_hdr_len = sizeof(struct virtio_net_hdr);
}
/* You lose any data beyond the buffer you gave. If it worries
if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
ret = -EBUSY;
} else {
- po->tp_loss = !!val;
+ packet_sock_flag_set(po, PACKET_SOCK_TP_LOSS, val);
ret = 0;
}
release_sock(sk);
return 0;
}
case PACKET_VNET_HDR:
+ case PACKET_VNET_HDR_SZ:
{
- int val;
+ int val, hdr_len;
if (sock->type != SOCK_RAW)
return -EINVAL;
if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
+ if (optname == PACKET_VNET_HDR_SZ) {
+ if (val && val != sizeof(struct virtio_net_hdr) &&
+ val != sizeof(struct virtio_net_hdr_mrg_rxbuf))
+ return -EINVAL;
+ hdr_len = val;
+ } else {
+ hdr_len = val ? sizeof(struct virtio_net_hdr) : 0;
+ }
lock_sock(sk);
if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
ret = -EBUSY;
} else {
- po->has_vnet_hdr = !!val;
+ WRITE_ONCE(po->vnet_hdr_sz, hdr_len);
ret = 0;
}
release_sock(sk);
if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
- /* Paired with all lockless reads of po->xmit */
- WRITE_ONCE(po->xmit, val ? packet_direct_xmit : dev_queue_xmit);
+ packet_sock_flag_set(po, PACKET_SOCK_QDISC_BYPASS, val);
return 0;
}
default:
val = packet_sock_flag(po, PACKET_SOCK_ORIGDEV);
break;
case PACKET_VNET_HDR:
- val = po->has_vnet_hdr;
+ val = !!READ_ONCE(po->vnet_hdr_sz);
+ break;
+ case PACKET_VNET_HDR_SZ:
+ val = READ_ONCE(po->vnet_hdr_sz);
break;
case PACKET_VERSION:
val = po->tp_version;
val = po->tp_reserve;
break;
case PACKET_LOSS:
- val = po->tp_loss;
+ val = packet_sock_flag(po, PACKET_SOCK_TP_LOSS);
break;
case PACKET_TIMESTAMP:
val = READ_ONCE(po->tp_tstamp);
val = packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF);
break;
case PACKET_QDISC_BYPASS:
- val = packet_use_direct_xmit(po);
+ val = packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS);
break;
default:
return -ENOPROTOOPT;
case NETDEV_DOWN:
if (dev->ifindex == po->ifindex) {
spin_lock(&po->bind_lock);
- if (po->running) {
+ if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) {
__unregister_prot_hook(sk, false);
sk->sk_err = ENETDOWN;
if (!sock_flag(sk, SOCK_DEAD))
/* Detach socket from network */
spin_lock(&po->bind_lock);
- was_running = po->running;
+ was_running = packet_sock_flag(po, PACKET_SOCK_RUNNING);
num = po->num;
if (was_running) {
WRITE_ONCE(po->num, 0);
s->sk_type,
ntohs(READ_ONCE(po->num)),
READ_ONCE(po->ifindex),
- po->running,
+ packet_sock_flag(po, PACKET_SOCK_RUNNING),
atomic_read(&s->sk_rmem_alloc),
from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
sock_i_ino(s));