// SPDX-License-Identifier: GPL-2.0-only
/*
 * VMware vSockets Driver
 *
 * Copyright (C) 2009-2013 VMware, Inc. All rights reserved.
 */

#include <linux/types.h>
#include <linux/socket.h>
#include <linux/stddef.h>
#include <net/sock.h>

#include "vmci_transport_notify.h"

#define PKT_FIELD(vsk, field_name) (vmci_trans(vsk)->notify.pkt.field_name)

static bool vmci_transport_notify_waiting_write(struct vsock_sock *vsk)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	bool retval;
	u64 notify_limit;

	if (!PKT_FIELD(vsk, peer_waiting_write))
		return false;

#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	/* When the sender blocks, we take that as a sign that the sender is
	 * faster than the receiver. To reduce the transmit rate of the sender,
	 * we delay the sending of the read notification by decreasing the
	 * write_notify_window. The notification is delayed until the number of
	 * bytes used in the queue drops below the write_notify_window.
	 */

	if (!PKT_FIELD(vsk, peer_waiting_write_detected)) {
		PKT_FIELD(vsk, peer_waiting_write_detected) = true;
		if (PKT_FIELD(vsk, write_notify_window) < PAGE_SIZE) {
			PKT_FIELD(vsk, write_notify_window) =
			    PKT_FIELD(vsk, write_notify_min_window);
		} else {
			PKT_FIELD(vsk, write_notify_window) -= PAGE_SIZE;
			if (PKT_FIELD(vsk, write_notify_window) <
			    PKT_FIELD(vsk, write_notify_min_window))
				PKT_FIELD(vsk, write_notify_window) =
				    PKT_FIELD(vsk, write_notify_min_window);
		}
	}
	notify_limit = vmci_trans(vsk)->consume_size -
		PKT_FIELD(vsk, write_notify_window);
#else
	notify_limit = 0;
#endif

	/* For now we ignore the wait information and just see if the free
	 * space exceeds the notify limit. Note that improving this function
	 * to be more intelligent will not require a protocol change and will
	 * retain compatibility between endpoints with mixed versions of this
	 * function.
	 *
	 * The notify_limit is used to delay notifications in the case where
	 * flow control is enabled. Below the test is expressed in terms of
	 * free space in the queue: if free_space > ConsumeSize -
	 * write_notify_window then notify. An alternate way of expressing this
	 * is to rewrite the expression to use the data ready in the receive
	 * queue: if write_notify_window > bufferReady then notify, as
	 * free_space == ConsumeSize - bufferReady.
	 */
	retval = vmci_qpair_consume_free_space(vmci_trans(vsk)->qpair) >
		notify_limit;
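
	/* Worked example with hypothetical numbers: for a 64 KiB consume
	 * queue whose window has been shrunk to 8 KiB by repeated sender
	 * blocking, notify_limit is 56 KiB. The READ notification is then
	 * held back until more than 56 KiB of the queue is free, i.e. until
	 * fewer than 8 KiB of unread bytes remain.
	 */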
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	if (retval) {
		/* Once we notify the peer, we reset the detected flag so the
		 * next wait will again cause a decrease in the window size.
		 */
		PKT_FIELD(vsk, peer_waiting_write_detected) = false;
	}
#endif
	return retval;
#else
	return true;
#endif
}

static bool vmci_transport_notify_waiting_read(struct vsock_sock *vsk)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	if (!PKT_FIELD(vsk, peer_waiting_read))
		return false;

	/* For now we ignore the wait information and just see if there is any
	 * data for our peer to read. Note that improving this function to be
	 * more intelligent will not require a protocol change and will retain
	 * compatibility between endpoints with mixed versions of this
	 * function.
	 */
	return vmci_qpair_produce_buf_ready(vmci_trans(vsk)->qpair) > 0;
#else
	return true;
#endif
}

static void
vmci_transport_handle_waiting_read(struct sock *sk,
				   struct vmci_transport_packet *pkt,
				   bool bottom_half,
				   struct sockaddr_vm *dst,
				   struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

	PKT_FIELD(vsk, peer_waiting_read) = true;
	memcpy(&PKT_FIELD(vsk, peer_waiting_read_info), &pkt->u.wait,
	       sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));

	if (vmci_transport_notify_waiting_read(vsk)) {
		bool sent;

		if (bottom_half)
			sent = vmci_transport_send_wrote_bh(dst, src) > 0;
		else
			sent = vmci_transport_send_wrote(sk) > 0;

		if (sent)
			PKT_FIELD(vsk, peer_waiting_read) = false;
	}
#endif
}

static void
vmci_transport_handle_waiting_write(struct sock *sk,
				    struct vmci_transport_packet *pkt,
				    bool bottom_half,
				    struct sockaddr_vm *dst,
				    struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);

	PKT_FIELD(vsk, peer_waiting_write) = true;
	memcpy(&PKT_FIELD(vsk, peer_waiting_write_info), &pkt->u.wait,
	       sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));

	if (vmci_transport_notify_waiting_write(vsk)) {
		bool sent;

		if (bottom_half)
			sent = vmci_transport_send_read_bh(dst, src) > 0;
		else
			sent = vmci_transport_send_read(sk) > 0;

		if (sent)
			PKT_FIELD(vsk, peer_waiting_write) = false;
	}
#endif
}

static void
vmci_transport_handle_read(struct sock *sk,
			   struct vmci_transport_packet *pkt,
			   bool bottom_half,
			   struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;

	vsk = vsock_sk(sk);
	PKT_FIELD(vsk, sent_waiting_write) = false;
#endif

	sk->sk_write_space(sk);
}

static bool send_waiting_read(struct sock *sk, u64 room_needed)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;
	struct vmci_transport_waiting_info waiting_info;
	u64 tail;
	u64 head;
	u64 room_left;
	bool ret;

	vsk = vsock_sk(sk);

	if (PKT_FIELD(vsk, sent_waiting_read))
		return true;

	if (PKT_FIELD(vsk, write_notify_window) <
	    vmci_trans(vsk)->consume_size)
		PKT_FIELD(vsk, write_notify_window) =
		    min(PKT_FIELD(vsk, write_notify_window) + PAGE_SIZE,
			vmci_trans(vsk)->consume_size);

	vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair, &tail, &head);
	room_left = vmci_trans(vsk)->consume_size - head;
	if (room_needed >= room_left) {
		waiting_info.offset = room_needed - room_left;
		waiting_info.generation =
		    PKT_FIELD(vsk, consume_q_generation) + 1;
	} else {
		waiting_info.offset = head + room_needed;
		waiting_info.generation = PKT_FIELD(vsk, consume_q_generation);
	}
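
	/* Illustration with hypothetical numbers: consume_size 4096, head at
	 * 3072 and room_needed 2048 gives room_left 1024, so the wait target
	 * lies 1024 bytes past the ring end: offset 1024 in the next
	 * generation of the consume queue.
	 */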

	ret = vmci_transport_send_waiting_read(sk, &waiting_info) > 0;
	if (ret)
		PKT_FIELD(vsk, sent_waiting_read) = true;

	return ret;
#else
	return true;
#endif
}

static bool send_waiting_write(struct sock *sk, u64 room_needed)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk;
	struct vmci_transport_waiting_info waiting_info;
	u64 tail;
	u64 head;
	u64 room_left;
	bool ret;

	vsk = vsock_sk(sk);

	if (PKT_FIELD(vsk, sent_waiting_write))
		return true;

	vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair, &tail, &head);
	room_left = vmci_trans(vsk)->produce_size - tail;
	if (room_needed + 1 >= room_left) {
		/* Wraps around to current generation. */
		waiting_info.offset = room_needed + 1 - room_left;
		waiting_info.generation = PKT_FIELD(vsk, produce_q_generation);
	} else {
		waiting_info.offset = tail + room_needed + 1;
		waiting_info.generation =
		    PKT_FIELD(vsk, produce_q_generation) - 1;
	}
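
	/* Reading of the generation math (hypothetical numbers): with
	 * produce_size 4096 and tail at 4000, asking for 256 bytes of room
	 * wraps (257 >= 96), so the peer watches offset 161 of the lap the
	 * producer is on now; without a wrap, the consumer draining a full
	 * queue is still one lap behind, hence produce_q_generation - 1.
	 */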

	ret = vmci_transport_send_waiting_write(sk, &waiting_info) > 0;
	if (ret)
		PKT_FIELD(vsk, sent_waiting_write) = true;

	return ret;
#else
	return true;
#endif
}

static int vmci_transport_send_read_notification(struct sock *sk)
{
	struct vsock_sock *vsk;
	bool sent_read;
	unsigned int retries;
	int err;

	vsk = vsock_sk(sk);
	sent_read = false;
	retries = 0;
	err = 0;

	if (vmci_transport_notify_waiting_write(vsk)) {
		/* Notify the peer that we have read, retrying the send on
		 * failure up to our maximum value. XXX For now we just log
		 * the failure, but later we should schedule a work item to
		 * handle the resend until it succeeds. That would require
		 * keeping track of work items in the vsk and cleaning them up
		 * upon socket close.
		 */
		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
		       !sent_read &&
		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			err = vmci_transport_send_read(sk);
			if (err >= 0)
				sent_read = true;

			retries++;
		}

		if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS)
			pr_err("%p unable to send read notify to peer\n", sk);
		else
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
			PKT_FIELD(vsk, peer_waiting_write) = false;
#endif

	}
	return err;
}

static void
vmci_transport_handle_wrote(struct sock *sk,
			    struct vmci_transport_packet *pkt,
			    bool bottom_half,
			    struct sockaddr_vm *dst, struct sockaddr_vm *src)
{
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, sent_waiting_read) = false;
#endif
	vsock_data_ready(sk);
}

static void vmci_transport_notify_pkt_socket_init(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = PAGE_SIZE;
	PKT_FIELD(vsk, write_notify_min_window) = PAGE_SIZE;
	PKT_FIELD(vsk, peer_waiting_read) = false;
	PKT_FIELD(vsk, peer_waiting_write) = false;
	PKT_FIELD(vsk, peer_waiting_write_detected) = false;
	PKT_FIELD(vsk, sent_waiting_read) = false;
	PKT_FIELD(vsk, sent_waiting_write) = false;
	PKT_FIELD(vsk, produce_q_generation) = 0;
	PKT_FIELD(vsk, consume_q_generation) = 0;

	memset(&PKT_FIELD(vsk, peer_waiting_read_info), 0,
	       sizeof(PKT_FIELD(vsk, peer_waiting_read_info)));
	memset(&PKT_FIELD(vsk, peer_waiting_write_info), 0,
	       sizeof(PKT_FIELD(vsk, peer_waiting_write_info)));
}

static void vmci_transport_notify_pkt_socket_destruct(struct vsock_sock *vsk)
{
}

static int
vmci_transport_notify_pkt_poll_in(struct sock *sk,
				  size_t target, bool *data_ready_now)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	if (vsock_stream_has_data(vsk) >= target) {
		*data_ready_now = true;
	} else {
		/* We can't read right now because there is not enough data
		 * in the queue. Ask for notifications when there is something
		 * to read.
		 */
		if (sk->sk_state == TCP_ESTABLISHED) {
			if (!send_waiting_read(sk, 1))
				return -1;

		}
		*data_ready_now = false;
	}

	return 0;
}

static int
vmci_transport_notify_pkt_poll_out(struct sock *sk,
				   size_t target, bool *space_avail_now)
{
	s64 produce_q_free_space;
	struct vsock_sock *vsk = vsock_sk(sk);

	produce_q_free_space = vsock_stream_has_space(vsk);
	if (produce_q_free_space > 0) {
		*space_avail_now = true;
		return 0;
	} else if (produce_q_free_space == 0) {
		/* This is a connected socket but we can't currently send data.
		 * Notify the peer that we are waiting if the queue is full. We
		 * only send a waiting write if the queue is full because
		 * otherwise we end up in an infinite WAITING_WRITE, READ,
		 * WAITING_WRITE, READ, etc. loop. Treat failing to send the
		 * notification as a socket error, passing that back through
		 * the select.
		 */
		if (!send_waiting_write(sk, 1))
			return -1;

		*space_avail_now = false;
	}

	return 0;
}

static int
vmci_transport_notify_pkt_recv_init(
			struct sock *sk,
			size_t target,
			struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
	data->consume_head = 0;
	data->produce_tail = 0;
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	data->notify_on_block = false;

	if (PKT_FIELD(vsk, write_notify_min_window) < target + 1) {
		PKT_FIELD(vsk, write_notify_min_window) = target + 1;
		if (PKT_FIELD(vsk, write_notify_window) <
		    PKT_FIELD(vsk, write_notify_min_window)) {
			/* If the current window is smaller than the new
			 * minimal window size, we need to reevaluate whether
			 * we need to notify the sender. If the number of ready
			 * bytes is smaller than the new window, we need to
			 * send a notification to the sender before we block.
			 */
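			/* For example (hypothetical numbers): a recv() with a
			 * target of 8192 bytes on a socket still at its
			 * initial PAGE_SIZE window raises the minimum window
			 * to 8193 and sets notify_on_block, so the sender is
			 * notified before we sleep.
			 */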

			PKT_FIELD(vsk, write_notify_window) =
			    PKT_FIELD(vsk, write_notify_min_window);
			data->notify_on_block = true;
		}
	}
#endif
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_recv_pre_block(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	int err = 0;

	/* Notify our peer that we are waiting for data to read. */
	if (!send_waiting_read(sk, target)) {
		err = -EHOSTUNREACH;
		return err;
	}
#ifdef VSOCK_OPTIMIZATION_FLOW_CONTROL
	if (data->notify_on_block) {
		err = vmci_transport_send_read_notification(sk);
		if (err < 0)
			return err;

		data->notify_on_block = false;
	}
#endif

	return err;
}

static int
vmci_transport_notify_pkt_recv_pre_dequeue(
				struct sock *sk,
				size_t target,
				struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	/* Now consume up to len bytes from the queue. Note that since we have
	 * the socket locked we should copy at least ready bytes.
	 */
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	vmci_qpair_get_consume_indexes(vmci_trans(vsk)->qpair,
				       &data->produce_tail,
				       &data->consume_head);
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_recv_post_dequeue(
				struct sock *sk,
				size_t target,
				ssize_t copied,
				bool data_read,
				struct vmci_transport_recv_notify_data *data)
{
	struct vsock_sock *vsk;
	int err;

	vsk = vsock_sk(sk);
	err = 0;

	if (data_read) {
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
		/* Detect a wrap-around to maintain queue generation. Note
		 * that this is safe since we hold the socket lock across the
		 * two queue pair operations.
		 */
		if (copied >=
		    vmci_trans(vsk)->consume_size - data->consume_head)
			PKT_FIELD(vsk, consume_q_generation)++;
#endif
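
		/* e.g. (hypothetical numbers): with consume_size 4096 and
		 * consume_head sampled at 3072 before the dequeue, any read
		 * of 1024 bytes or more must have passed the end of the
		 * ring, so the consume generation is bumped.
		 */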

		err = vmci_transport_send_read_notification(sk);
		if (err < 0)
			return err;

	}
	return err;
}

static int
vmci_transport_notify_pkt_send_init(
			struct sock *sk,
			struct vmci_transport_send_notify_data *data)
{
#ifdef VSOCK_OPTIMIZATION_WAITING_NOTIFY
	data->consume_head = 0;
	data->produce_tail = 0;
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_send_pre_block(
				struct sock *sk,
				struct vmci_transport_send_notify_data *data)
{
	/* Notify our peer that we are waiting for room to write. */
	if (!send_waiting_write(sk, 1))
		return -EHOSTUNREACH;

	return 0;
}

static int
vmci_transport_notify_pkt_send_pre_enqueue(
				struct sock *sk,
				struct vmci_transport_send_notify_data *data)
{
	struct vsock_sock *vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	vmci_qpair_get_produce_indexes(vmci_trans(vsk)->qpair,
				       &data->produce_tail,
				       &data->consume_head);
#endif

	return 0;
}

static int
vmci_transport_notify_pkt_send_post_enqueue(
				struct sock *sk,
				ssize_t written,
				struct vmci_transport_send_notify_data *data)
{
	int err = 0;
	struct vsock_sock *vsk;
	bool sent_wrote = false;
	int retries = 0;

	vsk = vsock_sk(sk);

#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
	/* Detect a wrap-around to maintain queue generation. Note that this
	 * is safe since we hold the socket lock across the two queue pair
	 * operations.
	 */
	if (written >= vmci_trans(vsk)->produce_size - data->produce_tail)
		PKT_FIELD(vsk, produce_q_generation)++;
#endif

	if (vmci_transport_notify_waiting_read(vsk)) {
		/* Notify the peer that we have written, retrying the send on
		 * failure up to our maximum value. See the XXX comment for the
		 * corresponding piece of code in StreamRecvmsg() for potential
		 * improvements.
		 */
		while (!(vsk->peer_shutdown & RCV_SHUTDOWN) &&
		       !sent_wrote &&
		       retries < VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			err = vmci_transport_send_wrote(sk);
			if (err >= 0)
				sent_wrote = true;

			retries++;
		}

		if (retries >= VMCI_TRANSPORT_MAX_DGRAM_RESENDS) {
			pr_err("%p unable to send wrote notify to peer\n", sk);
			return err;
		} else {
#if defined(VSOCK_OPTIMIZATION_WAITING_NOTIFY)
			PKT_FIELD(vsk, peer_waiting_read) = false;
#endif
		}
	}
	return err;
}

static void
vmci_transport_notify_pkt_handle_pkt(
			struct sock *sk,
			struct vmci_transport_packet *pkt,
			bool bottom_half,
			struct sockaddr_vm *dst,
			struct sockaddr_vm *src, bool *pkt_processed)
{
	bool processed = false;

	switch (pkt->type) {
	case VMCI_TRANSPORT_PACKET_TYPE_WROTE:
		vmci_transport_handle_wrote(sk, pkt, bottom_half, dst, src);
		processed = true;
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_READ:
		vmci_transport_handle_read(sk, pkt, bottom_half, dst, src);
		processed = true;
		break;
	case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
		vmci_transport_handle_waiting_write(sk, pkt, bottom_half,
						    dst, src);
		processed = true;
		break;

	case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
		vmci_transport_handle_waiting_read(sk, pkt, bottom_half,
						   dst, src);
		processed = true;
		break;
	}

	if (pkt_processed)
		*pkt_processed = processed;
}

static void vmci_transport_notify_pkt_process_request(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
	if (vmci_trans(vsk)->consume_size <
		PKT_FIELD(vsk, write_notify_min_window))
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;
}

static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
{
	struct vsock_sock *vsk = vsock_sk(sk);

	PKT_FIELD(vsk, write_notify_window) = vmci_trans(vsk)->consume_size;
	if (vmci_trans(vsk)->consume_size <
		PKT_FIELD(vsk, write_notify_min_window))
		PKT_FIELD(vsk, write_notify_min_window) =
			vmci_trans(vsk)->consume_size;
}

/* Socket control packet based operations. */
const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
	.socket_init = vmci_transport_notify_pkt_socket_init,
	.socket_destruct = vmci_transport_notify_pkt_socket_destruct,
	.poll_in = vmci_transport_notify_pkt_poll_in,
	.poll_out = vmci_transport_notify_pkt_poll_out,
	.handle_notify_pkt = vmci_transport_notify_pkt_handle_pkt,
	.recv_init = vmci_transport_notify_pkt_recv_init,
	.recv_pre_block = vmci_transport_notify_pkt_recv_pre_block,
	.recv_pre_dequeue = vmci_transport_notify_pkt_recv_pre_dequeue,
	.recv_post_dequeue = vmci_transport_notify_pkt_recv_post_dequeue,
	.send_init = vmci_transport_notify_pkt_send_init,
	.send_pre_block = vmci_transport_notify_pkt_send_pre_block,
	.send_pre_enqueue = vmci_transport_notify_pkt_send_pre_enqueue,
	.send_post_enqueue = vmci_transport_notify_pkt_send_post_enqueue,
	.process_request = vmci_transport_notify_pkt_process_request,
	.process_negotiate = vmci_transport_notify_pkt_process_negotiate,
};