1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* RxRPC packet transmission
4 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
5 * Written by David Howells (dhowells@redhat.com) */
/* Format-string prefix: pr_fmt(fmt) expands to KBUILD_MODNAME, then ": ",
 * then fmt.  Presumably picked up by the pr_warn() calls in this file -
 * confirm against the printk headers.
 */
8 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
10 #include <linux/net.h>
11 #include <linux/gfp.h>
12 #include <linux/skbuff.h>
13 #include <linux/export.h>
15 #include <net/af_rxrpc.h>
17 #include "ar-internal.h"
/* Prototype for the routine do_udp_sendmsg() calls when the destination
 * address family is AF_INET6.
 */
19 extern int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len);
/*
 * do_udp_sendmsg - hand a prepared message to the UDP layer.
 * @socket: socket whose ->sk is used for the transmission
 * @msg: message to send; ->msg_name is read as the destination sockaddr
 * @len: length passed through unchanged to the send routine
 *
 * When CONFIG_AF_RXRPC_IPV6 is enabled and the destination family is
 * AF_INET6, the message is passed to udpv6_sendmsg(); in every other case it
 * is passed to udp_sendmsg().  The result of the chosen routine is returned.
 *
 * NOTE(review): an AF_INET6 destination on a socket whose family is not
 * AF_INET6 triggers the pr_warn() below.  The statement following that
 * warning is not visible in this copy of the file, so the value returned in
 * that case must be confirmed against the complete source.
 */
21 static ssize_t do_udp_sendmsg(struct socket *socket, struct msghdr *msg, size_t len)
23 struct sockaddr *sa = msg->msg_name;
24 struct sock *sk = socket->sk;
26 if (IS_ENABLED(CONFIG_AF_RXRPC_IPV6)) {
27 if (sa->sa_family == AF_INET6) {
28 if (sk->sk_family != AF_INET6) {
29 pr_warn("AF_INET6 address on AF_INET socket\n");
32 return udpv6_sendmsg(sk, msg, len);
/* Not an AF_INET6 destination, or IPv6 support is not configured. */
35 return udp_sendmsg(sk, msg, len);
/* Packet image built by rxrpc_send_abort_packet(): a wire header which that
 * function accompanies with an abort code stored in network byte order
 * (pkt.abort_code).
 *
 * NOTE(review): only the header member is visible in this copy; the
 * abort_code member and the closing brace must be confirmed against the
 * complete source.
 */
38 struct rxrpc_abort_buffer {
39 struct rxrpc_wire_header whdr;
/* Payload sent after the wire header by rxrpc_send_keepalive().  That
 * function uses sizeof() on this array for the length, and sizeof an empty
 * string literal is 1, so the payload is a single NUL byte.
 */
43 static const char rxrpc_keepalive_string[] = "";
46 /* Increase Tx backoff on transmission failure and clear it on success. */
/*
 * rxrpc_tx_backoff - update a call's transmission backoff counter.
 * @call: call whose ->tx_backoff is updated
 * @ret: result of the transmission attempt
 *
 * Takes a snapshot of ->tx_backoff with READ_ONCE() and then stores either
 * the snapshot plus one or zero, using WRITE_ONCE() for both stores.
 *
 * NOTE(review): the conditions that select between the two stores are not
 * visible in this copy.  Going by the comment preceding this function, the
 * increment is for a failed transmission and the reset for a successful one;
 * the exact test on @ret and any upper bound on the counter must be
 * confirmed against the complete source.
 */
48 static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret)
51 u16 tx_backoff = READ_ONCE(call->tx_backoff);
54 WRITE_ONCE(call->tx_backoff, tx_backoff + 1);
56 WRITE_ONCE(call->tx_backoff, 0);
61 /* Arrange for a keepalive ping a certain time after we last transmitted. This
62 * lets the far side know we're still interested in this call and helps keep
63 * the route through any intervening firewall open.
65 * Receiving a response to the ping will prevent the ->expect_rx_by timer from expiring. */
/*
 * rxrpc_set_keepalive - schedule the next keepalive for a call.
 * @call: call whose ->keepalive_at is set and whose timer may be reduced
 *
 * Derives an interval of one sixth of ->next_rx_timo, publishes the result
 * in ->keepalive_at with WRITE_ONCE() and passes it, together with the
 * current jiffies value, to rxrpc_reduce_call_timer() with the reason
 * rxrpc_timer_set_for_keepalive.
 *
 * NOTE(review): as visible here keepalive_at holds only the interval, not
 * an absolute jiffies time.  A line between the declaration and the
 * WRITE_ONCE() is missing from this copy (the embedded numbering skips 72);
 * presumably it adds `now` - confirm against the complete source.
 */
68 static void rxrpc_set_keepalive(struct rxrpc_call *call)
70 unsigned long now = jiffies, keepalive_at = call->next_rx_timo / 6;
73 WRITE_ONCE(call->keepalive_at, keepalive_at);
74 rxrpc_reduce_call_timer(call, keepalive_at, now,
75 rxrpc_timer_set_for_keepalive);
79 /* Fill out an ACK packet. */
/*
 * rxrpc_fill_out_ack - populate the variable part of the ACK held in @txb.
 * @conn: connection; its peer supplies the MTU figures for the trailer
 * @call: call whose receive-window state is being reported
 * @txb: transmit buffer whose ->ack header and ->acks array are written
 *
 * Zeroes ->ackr_nr_unacked and ->ackr_nr_consumed and clears
 * RXRPC_CALL_RX_IS_IDLE, then reports the span from ->ackr_window up to
 * ->ackr_wtop: firstPacket is set to the window base and nAcks to the
 * difference between the two.  If the top is ahead of the base, entries are
 * copied from ->ackr_sack_table (indexed modulo RXRPC_SACK_SIZE) into
 * txb->acks.  If the top is neither ahead of nor behind the base, an ACK
 * whose reason is RXRPC_ACK_DELAY is relabelled RXRPC_ACK_IDLE.  Finally a
 * struct rxrpc_ackinfo (rxMTU, maxMTU, rwind, jumbo_max, each converted with
 * htonl()) is copied to ackp.
 *
 * Returns nAcks + 3 + sizeof(struct rxrpc_ackinfo).
 *
 * NOTE(review): the fourth parameter (the caller passes &rwind), the
 * declarations of mtu, jmax, rsize and ackp, the remainder of the wrapped
 * copy and whatever positions ackp are not visible in this copy.  The
 * meaning of the constant 3 in the return value is likewise not shown here
 * (presumably bytes placed between the ack table and the trailer).  Confirm
 * all of these against the complete source.
 */
81 static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn,
82 struct rxrpc_call *call,
83 struct rxrpc_txbuf *txb,
86 struct rxrpc_ackinfo ackinfo;
87 unsigned int qsize, sack, wrap, to;
88 rxrpc_seq_t window, wtop;
/* Reset the unacked/consumed counts and the Rx-idle flag. */
93 call->ackr_nr_unacked = 0;
94 atomic_set(&call->ackr_nr_consumed, 0);
95 rxrpc_inc_stat(call->rxnet, stat_tx_ack_fill);
96 clear_bit(RXRPC_CALL_RX_IS_IDLE, &call->flags);
/* Read the window bounds once; nAcks is the distance between them. */
98 window = call->ackr_window;
99 wtop = call->ackr_wtop;
100 sack = call->ackr_sack_base % RXRPC_SACK_SIZE;
101 txb->ack.firstPacket = htonl(window);
102 txb->ack.nAcks = wtop - window;
104 if (after(wtop, window)) {
105 wrap = RXRPC_SACK_SIZE - sack;
106 to = min_t(unsigned int, txb->ack.nAcks, RXRPC_SACK_SIZE);
/* One copy suffices if the run does not pass the end of the table;
 * otherwise the part up to the end of the table is copied first and the
 * copy continues from the start of the table.
 */
108 if (sack + txb->ack.nAcks <= RXRPC_SACK_SIZE) {
109 memcpy(txb->acks, call->ackr_sack_table + sack, txb->ack.nAcks);
111 memcpy(txb->acks, call->ackr_sack_table + sack, wrap);
112 memcpy(txb->acks + wrap, call->ackr_sack_table,
117 } else if (before(wtop, window)) {
118 pr_warn("ack window backward %x %x", window, wtop);
119 } else if (txb->ack.reason == RXRPC_ACK_DELAY) {
120 txb->ack.reason = RXRPC_ACK_IDLE;
/* Values for the ackinfo trailer: peer interface MTU less header size, the
 * jumbo limit, and the receive window size less the queue size (not below
 * zero).
 */
123 mtu = conn->peer->if_mtu;
124 mtu -= conn->peer->hdrsize;
125 jmax = rxrpc_rx_jumbo_max;
126 qsize = (window - 1) - call->rx_consumed;
127 rsize = max_t(int, call->rx_winsize - qsize, 0);
129 ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
130 ackinfo.maxMTU = htonl(mtu);
131 ackinfo.rwind = htonl(rsize);
132 ackinfo.jumbo_max = htonl(jmax);
137 memcpy(ackp, &ackinfo, sizeof(ackinfo));
138 return txb->ack.nAcks + 3 + sizeof(ackinfo);
142 /* Record the beginning of an RTT probe. */
/*
 * rxrpc_begin_rtt_probe - claim an RTT probe slot for an outgoing packet.
 * @call: call owning the ->rtt_avail bitmap and the per-slot arrays
 * @serial: serial number of the packet being sent
 * @why: trace reason recorded when a slot is claimed
 *
 * Looks for a set bit within RXRPC_CALL_RTT_AVAIL_MASK in ->rtt_avail, takes
 * the lowest one with __ffs() and claims it with test_and_clear_bit().  The
 * serial number and the current real time are stored in the slot, a write
 * barrier is issued, and only then is the slot's pending bit (slot index
 * plus RXRPC_CALL_RTT_PEND_SHIFT) set.  The claim is traced with @why; the
 * last trace call in the body uses rxrpc_rtt_tx_no_slot instead.
 *
 * NOTE(review): the declaration of rtt_slot, the statements taken when no
 * bit is available or when the test_and_clear_bit() fails, and both return
 * statements are not visible in this copy.  Callers initialise their slot
 * variable to -1 and rxrpc_cancel_rtt_probe() ignores -1, so -1 presumably
 * means "no slot" - confirm against the complete source.
 */
144 static int rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial,
145 enum rxrpc_rtt_tx_trace why)
147 unsigned long avail = call->rtt_avail;
150 if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK))
153 rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK);
154 if (!test_and_clear_bit(rtt_slot, &call->rtt_avail))
157 call->rtt_serial[rtt_slot] = serial;
158 call->rtt_sent_at[rtt_slot] = ktime_get_real();
159 smp_wmb(); /* Write data before avail bit */
160 set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
162 trace_rxrpc_rtt_tx(call, why, rtt_slot, serial);
166 trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial);
171 /* Cancel an RTT probe. */
/*
 * rxrpc_cancel_rtt_probe - give back an RTT probe slot.
 * @call: call owning the ->rtt_avail bitmap
 * @serial: serial number of the packet the probe belonged to (only traced)
 * @rtt_slot: slot index obtained from rxrpc_begin_rtt_probe(), or -1
 *
 * For any slot other than -1: clears the slot's pending bit (slot index plus
 * RXRPC_CALL_RTT_PEND_SHIFT), issues a write barrier, sets the slot's own
 * bit in ->rtt_avail again and emits an rxrpc_rtt_tx_cancel trace.  None of
 * the visible statements runs for -1.
 */
173 static void rxrpc_cancel_rtt_probe(struct rxrpc_call *call,
174 rxrpc_serial_t serial, int rtt_slot)
176 if (rtt_slot != -1) {
177 clear_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail);
178 smp_wmb(); /* Clear pending bit before setting slot */
179 set_bit(rtt_slot, &call->rtt_avail);
180 trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_cancel, rtt_slot, serial);
185 /* Transmit an ACK packet. */
/*
 * rxrpc_send_ack_packet - complete and transmit the ACK held in @txb.
 * @call: call the ACK belongs to; supplies the peer address and Rx state
 * @txb: transmit buffer holding the wire header and ACK header
 *
 * Visible sequence:
 *  - RXRPC_CALL_DISCONNECTED is tested first;
 *  - the destination is the call's peer transport address, with no control
 *    data;
 *  - an ACK whose reason is RXRPC_ACK_PING gets RXRPC_REQUEST_ACK set in its
 *    wire flags;
 *  - rxrpc_fill_out_ack() writes the ack table and trailer and returns their
 *    length, n;
 *  - a single kvec starts at the wire header and covers
 *    sizeof(wire) + sizeof(ack) + n bytes;
 *  - a new serial number is taken from conn->serial for each send;
 *  - a PING additionally begins an RTT probe for that serial;
 *  - previousPacket is loaded from ->rx_highest_seq just before sending;
 *  - after do_udp_sendmsg() the peer's ->last_tx_at is refreshed, the outcome
 *    is traced at rxrpc_tx_point_call_ack, and ->rtt_last_req is refreshed
 *    when RXRPC_REQUEST_ACK is set in the wire flags;
 *  - the result is fed to rxrpc_tx_backoff();
 *  - if the call is not complete the keepalive is rearmed.
 *
 * NOTE(review): the declarations of msg, iov, len, n and rwind, the
 * assignment of conn, the tests on ret that select between the failure and
 * success traces and that guard rxrpc_cancel_rtt_probe(), and every return
 * statement are not visible in this copy - confirm against the complete
 * source.
 */
187 int rxrpc_send_ack_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
189 struct rxrpc_connection *conn;
192 rxrpc_serial_t serial;
194 int ret, rtt_slot = -1;
197 if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
202 msg.msg_name = &call->peer->srx.transport;
203 msg.msg_namelen = call->peer->srx.transport_len;
204 msg.msg_control = NULL;
205 msg.msg_controllen = 0;
208 if (txb->ack.reason == RXRPC_ACK_PING)
209 txb->wire.flags |= RXRPC_REQUEST_ACK;
211 n = rxrpc_fill_out_ack(conn, call, txb, &rwind);
215 iov[0].iov_base = &txb->wire;
216 iov[0].iov_len = sizeof(txb->wire) + sizeof(txb->ack) + n;
217 len = iov[0].iov_len;
219 serial = atomic_inc_return(&conn->serial);
220 txb->wire.serial = htonl(serial);
221 trace_rxrpc_tx_ack(call->debug_id, serial,
222 ntohl(txb->ack.firstPacket),
223 ntohl(txb->ack.serial), txb->ack.reason, txb->ack.nAcks,
226 if (txb->ack.reason == RXRPC_ACK_PING)
227 rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping);
229 rxrpc_inc_stat(call->rxnet, stat_tx_ack_send);
231 /* Grab the highest received seq as late as possible */
232 txb->ack.previousPacket = htonl(call->rx_highest_seq);
234 iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len);
235 ret = do_udp_sendmsg(conn->local->socket, &msg, len);
236 call->peer->last_tx_at = ktime_get_seconds();
238 trace_rxrpc_tx_fail(call->debug_id, serial, ret,
239 rxrpc_tx_point_call_ack);
241 trace_rxrpc_tx_packet(call->debug_id, &txb->wire,
242 rxrpc_tx_point_call_ack);
243 if (txb->wire.flags & RXRPC_REQUEST_ACK)
244 call->peer->rtt_last_req = ktime_get_real();
246 rxrpc_tx_backoff(call, ret);
248 if (!__rxrpc_call_is_complete(call)) {
250 rxrpc_cancel_rtt_probe(call, serial, rtt_slot);
251 rxrpc_set_keepalive(call);
258 /* Send an ABORT call packet. */
/*
 * rxrpc_send_abort_packet - transmit a call-level ABORT for @call.
 * @call: call being aborted; supplies the peer address, cid, call number,
 *        security index, destination service ID and abort code
 *
 * Two situations are tested before anything is built: a client call whose
 * RXRPC_CALL_TX_ALL_ACKED flag is set (the rationale is given in the comment
 * inside the body) and a call flagged RXRPC_CALL_DISCONNECTED.  Otherwise a
 * local struct rxrpc_abort_buffer is filled in with fields converted to
 * network byte order, given a new serial number taken from conn->serial and
 * sent to the call's peer as a single kvec through do_udp_sendmsg().  The
 * peer's ->last_tx_at is then refreshed, the outcome is traced at
 * rxrpc_tx_point_call_abort and the result is fed to rxrpc_tx_backoff().
 *
 * NOTE(review): the values returned for the two early tests and at the end
 * of the function, the declarations of msg, iov and ret, the assignment of
 * conn, and the test on ret that selects between the failure and success
 * traces are not visible in this copy - confirm against the complete source.
 */
260 int rxrpc_send_abort_packet(struct rxrpc_call *call)
262 struct rxrpc_connection *conn;
263 struct rxrpc_abort_buffer pkt;
266 rxrpc_serial_t serial;
269 /* Don't bother sending aborts for a client call once the server has
270 * hard-ACK'd all of its request data. After that point, we're not
271 * going to stop the operation proceeding, and whilst we might limit
272 * the reply, it's not worth it if we can send a new call on the same
273 * channel instead, thereby closing off this call.
275 if (rxrpc_is_client_call(call) &&
276 test_bit(RXRPC_CALL_TX_ALL_ACKED, &call->flags))
279 if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags))
284 msg.msg_name = &call->peer->srx.transport;
285 msg.msg_namelen = call->peer->srx.transport_len;
286 msg.msg_control = NULL;
287 msg.msg_controllen = 0;
290 pkt.whdr.epoch = htonl(conn->proto.epoch);
291 pkt.whdr.cid = htonl(call->cid);
292 pkt.whdr.callNumber = htonl(call->call_id);
294 pkt.whdr.type = RXRPC_PACKET_TYPE_ABORT;
295 pkt.whdr.flags = conn->out_clientflag;
296 pkt.whdr.userStatus = 0;
297 pkt.whdr.securityIndex = call->security_ix;
299 pkt.whdr.serviceId = htons(call->dest_srx.srx_service);
300 pkt.abort_code = htonl(call->abort_code);
302 iov[0].iov_base = &pkt;
303 iov[0].iov_len = sizeof(pkt);
305 serial = atomic_inc_return(&conn->serial);
306 pkt.whdr.serial = htonl(serial);
308 iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, sizeof(pkt));
309 ret = do_udp_sendmsg(conn->local->socket, &msg, sizeof(pkt));
310 conn->peer->last_tx_at = ktime_get_seconds();
312 trace_rxrpc_tx_fail(call->debug_id, serial, ret,
313 rxrpc_tx_point_call_abort);
315 trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr,
316 rxrpc_tx_point_call_abort);
317 rxrpc_tx_backoff(call, ret);
322 /* Send a packet through the transport endpoint. */
/*
 * rxrpc_send_data_packet - transmit one DATA packet held in @txb.
 * @call: call the packet belongs to
 * @txb: transmit buffer; ->wire is the header, ->len the payload length and
 *       ->seq the sequence number
 *
 * Visible sequence:
 *  - every transmission takes a new serial number from conn->serial;
 *  - while RXRPC_CONN_PROBING_FOR_UPGRADE is set on the connection (together
 *    with a second condition not visible here) the header's userStatus is
 *    set to RXRPC_USERSTATUS_SERVICE_UPGRADE;
 *  - a reason for requesting an ACK is picked by the if/else chain in the
 *    order written; the reason is counted and traced, and RXRPC_REQUEST_ACK
 *    is set for every reason except rxrpc_reqack_no_srv_last;
 *  - with CONFIG_AF_RXRPC_INJECT_LOSS, one pass in eight of a counter named
 *    lose takes a separate traced path;
 *  - ->tx_transmitted is advanced from seq - 1 to seq with cmpxchg();
 *  - a payload of at least peer->maxdata bytes goes straight to the
 *    fragmentable path; otherwise the packet is timestamped, an RTT probe is
 *    begun if an ACK is requested, and it is sent via do_udp_sendmsg();
 *  - a result of -EMSGSIZE also leads to the fragmentable path;
 *  - afterwards ->tx_last_sent, ->rtt_last_req, the lost-ACK timer (only
 *    when peer->rtt_count > 1), the first-time ->expect_rx_by timer and the
 *    keepalive are updated, or the call is completed with
 *    RXRPC_CALL_LOCAL_ERROR if RXRPC_CALL_BEGAN_RX_TIMER had not yet been
 *    set;
 *  - on the fragmentable path ip_sock_set_mtu_discover() is called on the
 *    local socket before and after the send.
 *
 * NOTE(review): the declarations of msg, iov, len and lose, the labels other
 * than dont_set_request_ack, the tests on ret, the case labels of the
 * switch, the arguments given to ip_sock_set_mtu_discover() and all return
 * statements are not visible in this copy - confirm against the complete
 * source.
 */
324 int rxrpc_send_data_packet(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
326 enum rxrpc_req_ack_trace why;
327 struct rxrpc_connection *conn = call->conn;
330 rxrpc_serial_t serial;
332 int ret, rtt_slot = -1;
334 _enter("%x,{%d}", txb->seq, txb->len);
336 /* Each transmission of a Tx packet needs a new serial number */
337 serial = atomic_inc_return(&conn->serial);
338 txb->wire.serial = htonl(serial);
340 if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) &&
342 txb->wire.userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE;
/* A single kvec starts at the wire header and extends txb->len bytes
 * beyond it.
 */
344 iov[0].iov_base = &txb->wire;
345 iov[0].iov_len = sizeof(txb->wire) + txb->len;
346 len = iov[0].iov_len;
347 iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len);
349 msg.msg_name = &call->peer->srx.transport;
350 msg.msg_namelen = call->peer->srx.transport_len;
351 msg.msg_control = NULL;
352 msg.msg_controllen = 0;
355 /* If our RTT cache needs working on, request an ACK. Also request
356 * ACKs if a DATA packet appears to have been lost.
358 * However, we mustn't request an ACK on the last reply packet of a
359 * service call, lest OpenAFS incorrectly send us an ACK with some
360 * soft-ACKs in it and then never follow up with a proper hard ACK.
362 if (txb->wire.flags & RXRPC_REQUEST_ACK)
363 why = rxrpc_reqack_already_on;
364 else if (test_bit(RXRPC_TXBUF_LAST, &txb->flags) && rxrpc_sending_to_client(txb))
365 why = rxrpc_reqack_no_srv_last;
366 else if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events))
367 why = rxrpc_reqack_ack_lost;
368 else if (test_bit(RXRPC_TXBUF_RESENT, &txb->flags))
369 why = rxrpc_reqack_retrans;
370 else if (call->cong_mode == RXRPC_CALL_SLOW_START && call->cong_cwnd <= 2)
371 why = rxrpc_reqack_slow_start;
372 else if (call->tx_winsize <= 2)
373 why = rxrpc_reqack_small_txwin;
374 else if (call->peer->rtt_count < 3 && txb->seq & 1)
375 why = rxrpc_reqack_more_rtt;
376 else if (ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real()))
377 why = rxrpc_reqack_old_rtt;
379 goto dont_set_request_ack;
381 rxrpc_inc_stat(call->rxnet, stat_why_req_ack[why]);
382 trace_rxrpc_req_ack(call->debug_id, txb->seq, why);
383 if (why != rxrpc_reqack_no_srv_last)
384 txb->wire.flags |= RXRPC_REQUEST_ACK;
385 dont_set_request_ack:
387 if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) {
389 if ((lose++ & 7) == 7) {
391 trace_rxrpc_tx_data(call, txb->seq, serial,
393 test_bit(RXRPC_TXBUF_RESENT, &txb->flags),
399 trace_rxrpc_tx_data(call, txb->seq, serial, txb->wire.flags,
400 test_bit(RXRPC_TXBUF_RESENT, &txb->flags), false);
402 /* Track what we've attempted to transmit at least once so that the
403 * retransmission algorithm doesn't try to resend what we haven't sent
404 * yet. However, this can race as we can receive an ACK before we get
405 * to this point. But, OTOH, if we won't get an ACK mentioning this
406 * packet unless the far side received it (though it could have
407 * discarded it anyway and NAK'd it).
409 cmpxchg(&call->tx_transmitted, txb->seq - 1, txb->seq);
411 /* send the packet with the don't fragment bit set if we currently
412 * think it's small enough */
413 if (txb->len >= call->peer->maxdata)
414 goto send_fragmentable;
416 txb->last_sent = ktime_get_real();
417 if (txb->wire.flags & RXRPC_REQUEST_ACK)
418 rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data);
420 /* send the packet by UDP
421 * - returns -EMSGSIZE if UDP would have to fragment the packet
422 * to go out of the interface
423 * - in which case, we'll have processed the ICMP error
424 * message and update the peer record
426 rxrpc_inc_stat(call->rxnet, stat_tx_data_send);
427 ret = do_udp_sendmsg(conn->local->socket, &msg, len);
428 conn->peer->last_tx_at = ktime_get_seconds();
431 rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail);
432 rxrpc_cancel_rtt_probe(call, serial, rtt_slot);
433 trace_rxrpc_tx_fail(call->debug_id, serial, ret,
434 rxrpc_tx_point_call_data_nofrag);
436 trace_rxrpc_tx_packet(call->debug_id, &txb->wire,
437 rxrpc_tx_point_call_data_nofrag);
440 rxrpc_tx_backoff(call, ret);
441 if (ret == -EMSGSIZE)
442 goto send_fragmentable;
446 call->tx_last_sent = txb->last_sent;
447 if (txb->wire.flags & RXRPC_REQUEST_ACK) {
448 call->peer->rtt_last_req = txb->last_sent;
449 if (call->peer->rtt_count > 1) {
450 unsigned long nowj = jiffies, ack_lost_at;
452 ack_lost_at = rxrpc_get_rto_backoff(call->peer, false);
454 WRITE_ONCE(call->ack_lost_at, ack_lost_at);
455 rxrpc_reduce_call_timer(call, ack_lost_at, nowj,
456 rxrpc_timer_set_for_lost_ack);
461 !test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER,
463 unsigned long nowj = jiffies, expect_rx_by;
465 expect_rx_by = nowj + call->next_rx_timo;
466 WRITE_ONCE(call->expect_rx_by, expect_rx_by);
467 rxrpc_reduce_call_timer(call, expect_rx_by, nowj,
468 rxrpc_timer_set_for_normal);
471 rxrpc_set_keepalive(call);
473 /* Cancel the call if the initial transmission fails,
474 * particularly if that's due to network routing issues that
475 * aren't going away anytime soon. The layer above can arrange
476 * the retransmission.
478 if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags))
479 rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
483 _leave(" = %d [%u]", ret, call->peer->maxdata);
487 /* attempt to send this message with fragmentation enabled */
488 _debug("send fragment");
490 txb->last_sent = ktime_get_real();
491 if (txb->wire.flags & RXRPC_REQUEST_ACK)
492 rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data);
/* ip_sock_set_mtu_discover() is called on the local socket both before and
 * after the send below.
 */
494 switch (conn->local->srx.transport.family) {
497 ip_sock_set_mtu_discover(conn->local->socket->sk,
499 rxrpc_inc_stat(call->rxnet, stat_tx_data_send_frag);
500 ret = do_udp_sendmsg(conn->local->socket, &msg, len);
501 conn->peer->last_tx_at = ktime_get_seconds();
503 ip_sock_set_mtu_discover(conn->local->socket->sk,
512 rxrpc_inc_stat(call->rxnet, stat_tx_data_send_fail);
513 rxrpc_cancel_rtt_probe(call, serial, rtt_slot);
514 trace_rxrpc_tx_fail(call->debug_id, serial, ret,
515 rxrpc_tx_point_call_data_frag);
517 trace_rxrpc_tx_packet(call->debug_id, &txb->wire,
518 rxrpc_tx_point_call_data_frag);
520 rxrpc_tx_backoff(call, ret);
525 /* Transmit a connection-level abort. */
/*
 * rxrpc_send_conn_abort - transmit an ABORT for a whole connection.
 * @conn: connection being aborted; supplies the peer address, epoch, cid,
 *        client flag, security index, service ID and abort code
 *
 * Builds a local wire header of type RXRPC_PACKET_TYPE_ABORT, converts
 * conn->abort_code with htonl() into a separate word, and sends the two as
 * a two-element kvec to the connection's peer through do_udp_sendmsg().
 * The serial number is taken from conn->serial.  A failure is traced and
 * logged with _debug(); the success trace and the refresh of the peer's
 * ->last_tx_at follow.
 *
 * NOTE(review): the declarations of msg, iov, word, len, serial and ret,
 * several header field assignments, and the test on ret (including whether
 * the failure path returns before the final two statements) are not visible
 * in this copy - confirm against the complete source.
 */
527 void rxrpc_send_conn_abort(struct rxrpc_connection *conn)
529 struct rxrpc_wire_header whdr;
537 msg.msg_name = &conn->peer->srx.transport;
538 msg.msg_namelen = conn->peer->srx.transport_len;
539 msg.msg_control = NULL;
540 msg.msg_controllen = 0;
543 whdr.epoch = htonl(conn->proto.epoch);
544 whdr.cid = htonl(conn->proto.cid);
547 whdr.type = RXRPC_PACKET_TYPE_ABORT;
548 whdr.flags = conn->out_clientflag;
550 whdr.securityIndex = conn->security_ix;
552 whdr.serviceId = htons(conn->service_id);
554 word = htonl(conn->abort_code);
556 iov[0].iov_base = &whdr;
557 iov[0].iov_len = sizeof(whdr);
558 iov[1].iov_base = &word;
559 iov[1].iov_len = sizeof(word);
561 len = iov[0].iov_len + iov[1].iov_len;
563 serial = atomic_inc_return(&conn->serial);
564 whdr.serial = htonl(serial);
566 iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len);
567 ret = do_udp_sendmsg(conn->local->socket, &msg, len);
569 trace_rxrpc_tx_fail(conn->debug_id, serial, ret,
570 rxrpc_tx_point_conn_abort);
571 _debug("sendmsg failed: %d", ret);
575 trace_rxrpc_tx_packet(conn->debug_id, &whdr, rxrpc_tx_point_conn_abort);
577 conn->peer->last_tx_at = ktime_get_seconds();
581 /* Reject a packet through the local endpoint. */
/*
 * rxrpc_reject_packet - answer a received packet with BUSY or ABORT.
 * @local: local endpoint whose socket is used for the reply
 * @skb: the received packet; its rxrpc private data supplies the header
 *       fields echoed back, and skb->priority supplies the abort code
 *
 * The reply header is zeroed and its type chosen per reject mark:
 * RXRPC_SKB_MARK_REJECT_BUSY gives RXRPC_PACKET_TYPE_BUSY, and
 * RXRPC_SKB_MARK_REJECT_ABORT gives RXRPC_PACKET_TYPE_ABORT with
 * htonl(skb->priority) as the code and a size covering header plus code.
 * If rxrpc_extract_addr_from_skb() returns 0, the epoch, cid, call number
 * and service ID are copied from the received header, the flags are reduced
 * to the inverse of the received RXRPC_CLIENT_INITIATED bit, and the reply
 * is sent to the extracted address via do_udp_sendmsg().  The outcome is
 * traced at rxrpc_tx_point_reject with a serial of 0.
 *
 * NOTE(review): the declarations of msg, iov, code, size, ioc and ret, the
 * switch expression, the size/ioc values for the BUSY case and any default
 * case, and the test on ret are not visible in this copy - confirm against
 * the complete source.
 */
583 void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb)
585 struct rxrpc_wire_header whdr;
586 struct sockaddr_rxrpc srx;
587 struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
594 rxrpc_see_skb(skb, rxrpc_skb_see_reject);
596 iov[0].iov_base = &whdr;
597 iov[0].iov_len = sizeof(whdr);
598 iov[1].iov_base = &code;
599 iov[1].iov_len = sizeof(code);
601 msg.msg_name = &srx.transport;
602 msg.msg_control = NULL;
603 msg.msg_controllen = 0;
606 memset(&whdr, 0, sizeof(whdr));
609 case RXRPC_SKB_MARK_REJECT_BUSY:
610 whdr.type = RXRPC_PACKET_TYPE_BUSY;
614 case RXRPC_SKB_MARK_REJECT_ABORT:
615 whdr.type = RXRPC_PACKET_TYPE_ABORT;
616 code = htonl(skb->priority);
617 size = sizeof(whdr) + sizeof(code);
624 if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) {
625 msg.msg_namelen = srx.transport_len;
627 whdr.epoch = htonl(sp->hdr.epoch);
628 whdr.cid = htonl(sp->hdr.cid);
629 whdr.callNumber = htonl(sp->hdr.callNumber);
630 whdr.serviceId = htons(sp->hdr.serviceId);
/* Keep only the client-initiated bit, inverted relative to the request. */
631 whdr.flags = sp->hdr.flags;
632 whdr.flags ^= RXRPC_CLIENT_INITIATED;
633 whdr.flags &= RXRPC_CLIENT_INITIATED;
635 iov_iter_kvec(&msg.msg_iter, WRITE, iov, ioc, size);
636 ret = do_udp_sendmsg(local->socket, &msg, size);
638 trace_rxrpc_tx_fail(local->debug_id, 0, ret,
639 rxrpc_tx_point_reject);
641 trace_rxrpc_tx_packet(local->debug_id, &whdr,
642 rxrpc_tx_point_reject);
647 /* Send a VERSION reply to a peer as a keepalive. */
/*
 * rxrpc_send_keepalive - send a VERSION packet to @peer as a keepalive.
 * @peer: destination; also supplies the local endpoint whose socket and
 *        network-namespace epoch are used
 *
 * Builds a local wire header of type RXRPC_PACKET_TYPE_VERSION with
 * RXRPC_LAST_PACKET as its only flag and a security index of 0, appends
 * rxrpc_keepalive_string (sizeof() bytes) as a second kvec and sends both to
 * the peer's transport address through do_udp_sendmsg().  The outcome is
 * traced at rxrpc_tx_point_version_keepalive with a serial of 0, and the
 * peer's ->last_tx_at is refreshed.
 *
 * NOTE(review): the declarations of msg, iov, len and ret, the remaining
 * header field assignments, and the test on ret that selects between the
 * two traces are not visible in this copy - confirm against the complete
 * source.
 */
649 void rxrpc_send_keepalive(struct rxrpc_peer *peer)
651 struct rxrpc_wire_header whdr;
659 msg.msg_name = &peer->srx.transport;
660 msg.msg_namelen = peer->srx.transport_len;
661 msg.msg_control = NULL;
662 msg.msg_controllen = 0;
665 whdr.epoch = htonl(peer->local->rxnet->epoch);
670 whdr.type = RXRPC_PACKET_TYPE_VERSION; /* Not client-initiated */
671 whdr.flags = RXRPC_LAST_PACKET;
673 whdr.securityIndex = 0;
677 iov[0].iov_base = &whdr;
678 iov[0].iov_len = sizeof(whdr);
679 iov[1].iov_base = (char *)rxrpc_keepalive_string;
680 iov[1].iov_len = sizeof(rxrpc_keepalive_string);
682 len = iov[0].iov_len + iov[1].iov_len;
684 iov_iter_kvec(&msg.msg_iter, WRITE, iov, 2, len);
685 ret = do_udp_sendmsg(peer->local->socket, &msg, len);
687 trace_rxrpc_tx_fail(peer->debug_id, 0, ret,
688 rxrpc_tx_point_version_keepalive);
690 trace_rxrpc_tx_packet(peer->debug_id, &whdr,
691 rxrpc_tx_point_version_keepalive);
693 peer->last_tx_at = ktime_get_seconds();
698 /* Schedule an instant Tx resend. */
/*
 * rxrpc_instant_resend - react to a packet that needs resending at once.
 * @call: call the packet belongs to
 * @txb: the packet's transmit buffer (not referenced by the visible lines)
 *
 * Acts only when the call is not already complete.
 *
 * NOTE(review): the statement guarded by the test below is not visible in
 * this copy, so what "scheduling" actually does must be confirmed against
 * the complete source.
 */
700 static inline void rxrpc_instant_resend(struct rxrpc_call *call,
701 struct rxrpc_txbuf *txb)
703 if (!__rxrpc_call_is_complete(call))
708 /* Transmit one packet. */
/*
 * rxrpc_transmit_one - send one DATA packet and arrange what happens next.
 * @call: call the packet belongs to
 * @txb: transmit buffer handed to rxrpc_send_data_packet()
 *
 * Depending on the result of rxrpc_send_data_packet(), the visible code
 * either completes the call with RXRPC_CALL_LOCAL_ERROR, requests an instant
 * resend via rxrpc_instant_resend(), or sets ->resend_at to the current
 * jiffies plus the peer's ->rto_j and reduces the call timer to that time
 * with the reason rxrpc_timer_set_for_send.
 *
 * NOTE(review): the declaration of ret and the tests that choose between
 * these three outcomes are not visible in this copy - confirm against the
 * complete source.
 */
710 void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb)
714 ret = rxrpc_send_data_packet(call, txb);
720 rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR,
724 _debug("need instant resend %d", ret);
725 rxrpc_instant_resend(call, txb);
728 unsigned long now = jiffies;
729 unsigned long resend_at = now + call->peer->rto_j;
731 WRITE_ONCE(call->resend_at, resend_at);
732 rxrpc_reduce_call_timer(call, resend_at, now,
733 rxrpc_timer_set_for_send);