drivers/block/drbd/drbd_receiver.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3    drbd_receiver.c
   4
   5    This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
   6
   7    Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
   8    Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
   9    Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
  10
  11  */
  12
  13
  14 #include <linux/module.h>
  15
  16 #include <linux/uaccess.h>
  17 #include <net/sock.h>
  18
  19 #include <linux/drbd.h>
  20 #include <linux/fs.h>
  21 #include <linux/file.h>
  22 #include <linux/in.h>
  23 #include <linux/mm.h>
  24 #include <linux/memcontrol.h>
  25 #include <linux/mm_inline.h>
  26 #include <linux/slab.h>
  27 #include <uapi/linux/sched/types.h>
  28 #include <linux/sched/signal.h>
  29 #include <linux/pkt_sched.h>
  30 #define __KERNEL_SYSCALLS__
  31 #include <linux/unistd.h>
  32 #include <linux/vmalloc.h>
  33 #include <linux/random.h>
  34 #include <linux/string.h>
  35 #include <linux/scatterlist.h>
  36 #include <linux/part_stat.h>
  37 #include "drbd_int.h"
  38 #include "drbd_protocol.h"
  39 #include "drbd_req.h"
  40 #include "drbd_vli.h"
  41
   /* Bitwise OR of the DRBD_FF_* feature flags listed here.
    * NOTE(review): presumably the feature set this node offers during the
    * feature handshake (see drbd_do_features) -- confirm against its users. */
   42 #define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES)
  43
   /*
    * Header information of one received packet. receive_first_packet()
    * lets decode_header() fill one in and then returns its ->cmd.
    */
   44 struct packet_info {
   45         enum drbd_packet cmd;   /* command found in the packet header */
   46         unsigned int size;      /* NOTE(review): presumably the payload length -- confirm */
   47         unsigned int vnr;       /* NOTE(review): presumably the volume number -- confirm */
   48         void *data;             /* NOTE(review): presumably points at the payload -- confirm */
   49 };
  50
   /* Result type of drbd_may_finish_epoch().
    * NOTE(review): the values presumably tell the caller what became of the
    * epoch (still in use / freed / reused) -- confirm at the function. */
   51 enum finish_epoch {
   52         FE_STILL_LIVE,
   53         FE_DESTROYED,
   54         FE_RECYCLED,
   55 };
  56
  57 static int drbd_do_features(struct drbd_connection *connection);
  58 static int drbd_do_auth(struct drbd_connection *connection);
  59 static int drbd_disconnected(struct drbd_peer_device *);
  60 static void conn_wait_active_ee_empty(struct drbd_connection *connection);
  61 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
  62 static int e_end_block(struct drbd_work *, int);
  63
  64
   /* Allocation flags for the opportunistic alloc_page() calls in
    * __drbd_alloc_pages(): only __GFP_HIGHMEM and __GFP_NOWARN are set. */
   65 #define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
  66
   67 /*
   68  * Some helper functions to deal with singly linked page lists,
   69  * page->private being our "next" pointer.
   70  */
  71
  72 /* If at least n pages are linked at head, get n pages off.
  73  * Otherwise, don't modify head, and return NULL.
  74  * Locking is the responsibility of the caller.
  75  */
  76 static struct page *page_chain_del(struct page **head, int n)
  77 {
  78         struct page *page;
  79         struct page *tmp;
  80
  81         BUG_ON(!n);
  82         BUG_ON(!head);
  83
  84         page = *head;
  85
  86         if (!page)
  87                 return NULL;
  88
  89         while (page) {
  90                 tmp = page_chain_next(page);
  91                 if (--n == 0)
  92                         break; /* found sufficient pages */
  93                 if (tmp == NULL)
  94                         /* insufficient pages, don't use any of them. */
  95                         return NULL;
  96                 page = tmp;
  97         }
  98
  99         /* add end of list marker for the returned list */
 100         set_page_private(page, 0);
 101         /* actual return value, and adjustment of head */
 102         page = *head;
 103         *head = tmp;
 104         return page;
 105 }
 106
 107 /* may be used outside of locks to find the tail of a (usually short)
 108  * "private" page chain, before adding it back to a global chain head
 109  * with page_chain_add() under a spinlock. */
 110 static struct page *page_chain_tail(struct page *page, int *len)
 111 {
 112         struct page *tmp;
 113         int i = 1;
 114         while ((tmp = page_chain_next(page))) {
 115                 ++i;
 116                 page = tmp;
 117         }
 118         if (len)
 119                 *len = i;
 120         return page;
 121 }
 122
 123 static int page_chain_free(struct page *page)
 124 {
 125         struct page *tmp;
 126         int i = 0;
 127         page_chain_for_each_safe(page, tmp) {
 128                 put_page(page);
 129                 ++i;
 130         }
 131         return i;
 132 }
 133
 134 static void page_chain_add(struct page **head,
 135                 struct page *chain_first, struct page *chain_last)
 136 {
 137 #if 1
 138         struct page *tmp;
 139         tmp = page_chain_tail(chain_first, NULL);
 140         BUG_ON(tmp != chain_last);
 141 #endif
 142
 143         /* add chain to head */
 144         set_page_private(chain_last, (unsigned long)*head);
 145         *head = chain_first;
 146 }
 147
 148 static struct page *__drbd_alloc_pages(struct drbd_device *device,
 149                                        unsigned int number)
 150 {
 151         struct page *page = NULL;
 152         struct page *tmp = NULL;
 153         unsigned int i = 0;
 154
 155         /* Yes, testing drbd_pp_vacant outside the lock is racy.
 156          * So what. It saves a spin_lock. */
 157         if (drbd_pp_vacant >= number) {
 158                 spin_lock(&drbd_pp_lock);
 159                 page = page_chain_del(&drbd_pp_pool, number);
 160                 if (page)
 161                         drbd_pp_vacant -= number;
 162                 spin_unlock(&drbd_pp_lock);
 163                 if (page)
 164                         return page;
 165         }
 166
 167         /* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
 168          * "criss-cross" setup, that might cause write-out on some other DRBD,
 169          * which in turn might block on the other node at this very place.  */
 170         for (i = 0; i < number; i++) {
 171                 tmp = alloc_page(GFP_TRY);
 172                 if (!tmp)
 173                         break;
 174                 set_page_private(tmp, (unsigned long)page);
 175                 page = tmp;
 176         }
 177
 178         if (i == number)
 179                 return page;
 180
 181         /* Not enough pages immediately available this time.
 182          * No need to jump around here, drbd_alloc_pages will retry this
 183          * function "soon". */
 184         if (page) {
 185                 tmp = page_chain_tail(page, NULL);
 186                 spin_lock(&drbd_pp_lock);
 187                 page_chain_add(&drbd_pp_pool, page, tmp);
 188                 drbd_pp_vacant += i;
 189                 spin_unlock(&drbd_pp_lock);
 190         }
 191         return NULL;
 192 }
 193
 194 static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
 195                                            struct list_head *to_be_freed)
 196 {
 197         struct drbd_peer_request *peer_req, *tmp;
 198
 199         /* The EEs are always appended to the end of the list. Since
 200            they are sent in order over the wire, they have to finish
 201            in order. As soon as we see the first not finished we can
 202            stop to examine the list... */
 203
 204         list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
 205                 if (drbd_peer_req_has_active_page(peer_req))
 206                         break;
 207                 list_move(&peer_req->w.list, to_be_freed);
 208         }
 209 }
 210
 211 static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
 212 {
 213         LIST_HEAD(reclaimed);
 214         struct drbd_peer_request *peer_req, *t;
 215
 216         spin_lock_irq(&device->resource->req_lock);
 217         reclaim_finished_net_peer_reqs(device, &reclaimed);
 218         spin_unlock_irq(&device->resource->req_lock);
 219         list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
 220                 drbd_free_net_peer_req(device, peer_req);
 221 }
 222
 223 static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
 224 {
 225         struct drbd_peer_device *peer_device;
 226         int vnr;
 227
 228         rcu_read_lock();
 229         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
 230                 struct drbd_device *device = peer_device->device;
 231                 if (!atomic_read(&device->pp_in_use_by_net))
 232                         continue;
 233
 234                 kref_get(&device->kref);
 235                 rcu_read_unlock();
 236                 drbd_reclaim_net_peer_reqs(device);
 237                 kref_put(&device->kref, drbd_destroy_device);
 238                 rcu_read_lock();
 239         }
 240         rcu_read_unlock();
 241 }
 242
 243 /**
 244  * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
 245  * @peer_device:        DRBD device.
 246  * @number:             number of pages requested
 247  * @retry:              whether to retry, if not enough pages are available right now
 248  *
 249  * Tries to allocate number pages, first from our own page pool, then from
 250  * the kernel.
 251  * Possibly retry until DRBD frees sufficient pages somewhere else.
 252  *
 253  * If this allocation would exceed the max_buffers setting, we throttle
 254  * allocation (schedule_timeout) to give the system some room to breathe.
 255  *
 256  * We do not use max-buffers as hard limit, because it could lead to
 257  * congestion and further to a distributed deadlock during online-verify or
 258  * (checksum based) resync, if the max-buffers, socket buffer sizes and
 259  * resync-rate settings are mis-configured.
 260  *
 261  * Returns a page chain linked via page->private.
 262  */
 263 struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
 264                               bool retry)
 265 {
 266         struct drbd_device *device = peer_device->device;
 267         struct page *page = NULL;
 268         struct net_conf *nc;
 269         DEFINE_WAIT(wait);
 270         unsigned int mxb;
 271
 272         rcu_read_lock();
 273         nc = rcu_dereference(peer_device->connection->net_conf);
 274         mxb = nc ? nc->max_buffers : 1000000;
 275         rcu_read_unlock();
 276
 277         if (atomic_read(&device->pp_in_use) < mxb)
 278                 page = __drbd_alloc_pages(device, number);
 279
 280         /* Try to keep the fast path fast, but occasionally we need
 281          * to reclaim the pages we lended to the network stack. */
 282         if (page && atomic_read(&device->pp_in_use_by_net) > 512)
 283                 drbd_reclaim_net_peer_reqs(device);
 284
 285         while (page == NULL) {
 286                 prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
 287
 288                 drbd_reclaim_net_peer_reqs(device);
 289
 290                 if (atomic_read(&device->pp_in_use) < mxb) {
 291                         page = __drbd_alloc_pages(device, number);
 292                         if (page)
 293                                 break;
 294                 }
 295
 296                 if (!retry)
 297                         break;
 298
 299                 if (signal_pending(current)) {
 300                         drbd_warn(device, "drbd_alloc_pages interrupted!\n");
 301                         break;
 302                 }
 303
 304                 if (schedule_timeout(HZ/10) == 0)
 305                         mxb = UINT_MAX;
 306         }
 307         finish_wait(&drbd_pp_wait, &wait);
 308
 309         if (page)
 310                 atomic_add(number, &device->pp_in_use);
 311         return page;
 312 }
 313
 314 /* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
 315  * Is also used from inside an other spin_lock_irq(&resource->req_lock);
 316  * Either links the page chain back to the global pool,
 317  * or returns all pages to the system. */
 318 static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
 319 {
 320         atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
 321         int i;
 322
 323         if (page == NULL)
 324                 return;
 325
 326         if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count)
 327                 i = page_chain_free(page);
 328         else {
 329                 struct page *tmp;
 330                 tmp = page_chain_tail(page, &i);
 331                 spin_lock(&drbd_pp_lock);
 332                 page_chain_add(&drbd_pp_pool, page, tmp);
 333                 drbd_pp_vacant += i;
 334                 spin_unlock(&drbd_pp_lock);
 335         }
 336         i = atomic_sub_return(i, a);
 337         if (i < 0)
 338                 drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
 339                         is_net ? "pp_in_use_by_net" : "pp_in_use", i);
 340         wake_up(&drbd_pp_wait);
 341 }
 342
 343 /*
 344 You need to hold the req_lock:
 345  _drbd_wait_ee_list_empty()
 346
 347 You must not have the req_lock:
 348  drbd_free_peer_req()
 349  drbd_alloc_peer_req()
 350  drbd_free_peer_reqs()
 351  drbd_ee_fix_bhs()
 352  drbd_finish_peer_reqs()
 353  drbd_clear_done_ee()
 354  drbd_wait_ee_list_empty()
 355 */
 356
 357 /* normal: payload_size == request size (bi_size)
 358  * w_same: payload_size == logical_block_size
 359  * trim: payload_size == 0 */
 360 struct drbd_peer_request *
 361 drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
 362                     unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
 363 {
 364         struct drbd_device *device = peer_device->device;
 365         struct drbd_peer_request *peer_req;
 366         struct page *page = NULL;
 367         unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT;
 368
 369         if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
 370                 return NULL;
 371
 372         peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
 373         if (!peer_req) {
 374                 if (!(gfp_mask & __GFP_NOWARN))
 375                         drbd_err(device, "%s: allocation failed\n", __func__);
 376                 return NULL;
 377         }
 378
 379         if (nr_pages) {
 380                 page = drbd_alloc_pages(peer_device, nr_pages,
 381                                         gfpflags_allow_blocking(gfp_mask));
 382                 if (!page)
 383                         goto fail;
 384         }
 385
 386         memset(peer_req, 0, sizeof(*peer_req));
 387         INIT_LIST_HEAD(&peer_req->w.list);
 388         drbd_clear_interval(&peer_req->i);
 389         peer_req->i.size = request_size;
 390         peer_req->i.sector = sector;
 391         peer_req->submit_jif = jiffies;
 392         peer_req->peer_device = peer_device;
 393         peer_req->pages = page;
 394         /*
 395          * The block_id is opaque to the receiver.  It is not endianness
 396          * converted, and sent back to the sender unchanged.
 397          */
 398         peer_req->block_id = id;
 399
 400         return peer_req;
 401
 402  fail:
 403         mempool_free(peer_req, &drbd_ee_mempool);
 404         return NULL;
 405 }
 406
 407 void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
 408                        int is_net)
 409 {
 410         might_sleep();
 411         if (peer_req->flags & EE_HAS_DIGEST)
 412                 kfree(peer_req->digest);
 413         drbd_free_pages(device, peer_req->pages, is_net);
 414         D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
 415         D_ASSERT(device, drbd_interval_empty(&peer_req->i));
 416         if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
 417                 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
 418                 drbd_al_complete_io(device, &peer_req->i);
 419         }
 420         mempool_free(peer_req, &drbd_ee_mempool);
 421 }
 422
 423 int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
 424 {
 425         LIST_HEAD(work_list);
 426         struct drbd_peer_request *peer_req, *t;
 427         int count = 0;
 428         int is_net = list == &device->net_ee;
 429
 430         spin_lock_irq(&device->resource->req_lock);
 431         list_splice_init(list, &work_list);
 432         spin_unlock_irq(&device->resource->req_lock);
 433
 434         list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
 435                 __drbd_free_peer_req(device, peer_req, is_net);
 436                 count++;
 437         }
 438         return count;
 439 }
 440
 441 /*
 442  * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
 443  */
 444 static int drbd_finish_peer_reqs(struct drbd_device *device)
 445 {
 446         LIST_HEAD(work_list);
 447         LIST_HEAD(reclaimed);
 448         struct drbd_peer_request *peer_req, *t;
 449         int err = 0;
 450
 451         spin_lock_irq(&device->resource->req_lock);
 452         reclaim_finished_net_peer_reqs(device, &reclaimed);
 453         list_splice_init(&device->done_ee, &work_list);
 454         spin_unlock_irq(&device->resource->req_lock);
 455
 456         list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
 457                 drbd_free_net_peer_req(device, peer_req);
 458
 459         /* possible callbacks here:
 460          * e_end_block, and e_end_resync_block, e_send_superseded.
 461          * all ignore the last argument.
 462          */
 463         list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
 464                 int err2;
 465
 466                 /* list_del not necessary, next/prev members not touched */
 467                 err2 = peer_req->w.cb(&peer_req->w, !!err);
 468                 if (!err)
 469                         err = err2;
 470                 drbd_free_peer_req(device, peer_req);
 471         }
 472         wake_up(&device->ee_wait);
 473
 474         return err;
 475 }
 476
 477 static void _drbd_wait_ee_list_empty(struct drbd_device *device,
 478                                      struct list_head *head)
 479 {
 480         DEFINE_WAIT(wait);
 481
 482         /* avoids spin_lock/unlock
 483          * and calling prepare_to_wait in the fast path */
 484         while (!list_empty(head)) {
 485                 prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
 486                 spin_unlock_irq(&device->resource->req_lock);
 487                 io_schedule();
 488                 finish_wait(&device->ee_wait, &wait);
 489                 spin_lock_irq(&device->resource->req_lock);
 490         }
 491 }
 492
 493 static void drbd_wait_ee_list_empty(struct drbd_device *device,
 494                                     struct list_head *head)
 495 {
 496         spin_lock_irq(&device->resource->req_lock);
 497         _drbd_wait_ee_list_empty(device, head);
 498         spin_unlock_irq(&device->resource->req_lock);
 499 }
 500
 501 static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
 502 {
 503         struct kvec iov = {
 504                 .iov_base = buf,
 505                 .iov_len = size,
 506         };
 507         struct msghdr msg = {
 508                 .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
 509         };
 510         iov_iter_kvec(&msg.msg_iter, READ, &iov, 1, size);
 511         return sock_recvmsg(sock, &msg, msg.msg_flags);
 512 }
 513
 514 static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
 515 {
 516         int rv;
 517
 518         rv = drbd_recv_short(connection->data.socket, buf, size, 0);
 519
 520         if (rv < 0) {
 521                 if (rv == -ECONNRESET)
 522                         drbd_info(connection, "sock was reset by peer\n");
 523                 else if (rv != -ERESTARTSYS)
 524                         drbd_err(connection, "sock_recvmsg returned %d\n", rv);
 525         } else if (rv == 0) {
 526                 if (test_bit(DISCONNECT_SENT, &connection->flags)) {
 527                         long t;
 528                         rcu_read_lock();
 529                         t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
 530                         rcu_read_unlock();
 531
 532                         t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);
 533
 534                         if (t)
 535                                 goto out;
 536                 }
 537                 drbd_info(connection, "sock was shut down by peer\n");
 538         }
 539
 540         if (rv != size)
 541                 conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
 542
 543 out:
 544         return rv;
 545 }
 546
 547 static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
 548 {
 549         int err;
 550
 551         err = drbd_recv(connection, buf, size);
 552         if (err != size) {
 553                 if (err >= 0)
 554                         err = -EIO;
 555         } else
 556                 err = 0;
 557         return err;
 558 }
 559
 560 static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
 561 {
 562         int err;
 563
 564         err = drbd_recv_all(connection, buf, size);
 565         if (err && !signal_pending(current))
 566                 drbd_warn(connection, "short read (expected size %d)\n", (int)size);
 567         return err;
 568 }
 569
 570 /* quoting tcp(7):
 571  *   On individual connections, the socket buffer size must be set prior to the
 572  *   listen(2) or connect(2) calls in order to have it take effect.
 573  * This is our wrapper to do so.
 574  */
 575 static void drbd_setbufsize(struct socket *sock, unsigned int snd,
 576                 unsigned int rcv)
 577 {
 578         /* open coded SO_SNDBUF, SO_RCVBUF */
 579         if (snd) {
 580                 sock->sk->sk_sndbuf = snd;
 581                 sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 582         }
 583         if (rcv) {
 584                 sock->sk->sk_rcvbuf = rcv;
 585                 sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 586         }
 587 }
 588
 589 static struct socket *drbd_try_connect(struct drbd_connection *connection)
 590 {
 591         const char *what;
 592         struct socket *sock;
 593         struct sockaddr_in6 src_in6;
 594         struct sockaddr_in6 peer_in6;
 595         struct net_conf *nc;
 596         int err, peer_addr_len, my_addr_len;
 597         int sndbuf_size, rcvbuf_size, connect_int;
 598         int disconnect_on_error = 1;
 599
 600         rcu_read_lock();
 601         nc = rcu_dereference(connection->net_conf);
 602         if (!nc) {
 603                 rcu_read_unlock();
 604                 return NULL;
 605         }
 606         sndbuf_size = nc->sndbuf_size;
 607         rcvbuf_size = nc->rcvbuf_size;
 608         connect_int = nc->connect_int;
 609         rcu_read_unlock();
 610
 611         my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
 612         memcpy(&src_in6, &connection->my_addr, my_addr_len);
 613
 614         if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
 615                 src_in6.sin6_port = 0;
 616         else
 617                 ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
 618
 619         peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
 620         memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
 621
 622         what = "sock_create_kern";
 623         err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family,
 624                                SOCK_STREAM, IPPROTO_TCP, &sock);
 625         if (err < 0) {
 626                 sock = NULL;
 627                 goto out;
 628         }
 629
 630         sock->sk->sk_rcvtimeo =
 631         sock->sk->sk_sndtimeo = connect_int * HZ;
 632         drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
 633
 634        /* explicitly bind to the configured IP as source IP
 635         *  for the outgoing connections.
 636         *  This is needed for multihomed hosts and to be
 637         *  able to use lo: interfaces for drbd.
 638         * Make sure to use 0 as port number, so linux selects
 639         *  a free one dynamically.
 640         */
 641         what = "bind before connect";
 642         err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
 643         if (err < 0)
 644                 goto out;
 645
 646         /* connect may fail, peer not yet available.
 647          * stay C_WF_CONNECTION, don't go Disconnecting! */
 648         disconnect_on_error = 0;
 649         what = "connect";
 650         err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
 651
 652 out:
 653         if (err < 0) {
 654                 if (sock) {
 655                         sock_release(sock);
 656                         sock = NULL;
 657                 }
 658                 switch (-err) {
 659                         /* timeout, busy, signal pending */
 660                 case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
 661                 case EINTR: case ERESTARTSYS:
 662                         /* peer not (yet) available, network problem */
 663                 case ECONNREFUSED: case ENETUNREACH:
 664                 case EHOSTDOWN:    case EHOSTUNREACH:
 665                         disconnect_on_error = 0;
 666                         break;
 667                 default:
 668                         drbd_err(connection, "%s failed, err = %d\n", what, err);
 669                 }
 670                 if (disconnect_on_error)
 671                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 672         }
 673
 674         return sock;
 675 }
 676
  /* State shared between prepare_listen_socket(), the sk_state_change hook
   * drbd_incoming_connection() and drbd_wait_for_connect(). */
  677 struct accept_wait_data {
  678         struct drbd_connection *connection;
  679         struct socket *s_listen;        /* listening socket, set by prepare_listen_socket() */
  680         struct completion door_bell;    /* completed by the hook when a socket reaches TCP_ESTABLISHED */
  681         void (*original_sk_state_change)(struct sock *sk);      /* callback the hook replaced; the hook still calls it */
  682
  683 };
 684
 685 static void drbd_incoming_connection(struct sock *sk)
 686 {
 687         struct accept_wait_data *ad = sk->sk_user_data;
 688         void (*state_change)(struct sock *sk);
 689
 690         state_change = ad->original_sk_state_change;
 691         if (sk->sk_state == TCP_ESTABLISHED)
 692                 complete(&ad->door_bell);
 693         state_change(sk);
 694 }
 695
 696 static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
 697 {
 698         int err, sndbuf_size, rcvbuf_size, my_addr_len;
 699         struct sockaddr_in6 my_addr;
 700         struct socket *s_listen;
 701         struct net_conf *nc;
 702         const char *what;
 703
 704         rcu_read_lock();
 705         nc = rcu_dereference(connection->net_conf);
 706         if (!nc) {
 707                 rcu_read_unlock();
 708                 return -EIO;
 709         }
 710         sndbuf_size = nc->sndbuf_size;
 711         rcvbuf_size = nc->rcvbuf_size;
 712         rcu_read_unlock();
 713
 714         my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
 715         memcpy(&my_addr, &connection->my_addr, my_addr_len);
 716
 717         what = "sock_create_kern";
 718         err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family,
 719                                SOCK_STREAM, IPPROTO_TCP, &s_listen);
 720         if (err) {
 721                 s_listen = NULL;
 722                 goto out;
 723         }
 724
 725         s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
 726         drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
 727
 728         what = "bind before listen";
 729         err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
 730         if (err < 0)
 731                 goto out;
 732
 733         ad->s_listen = s_listen;
 734         write_lock_bh(&s_listen->sk->sk_callback_lock);
 735         ad->original_sk_state_change = s_listen->sk->sk_state_change;
 736         s_listen->sk->sk_state_change = drbd_incoming_connection;
 737         s_listen->sk->sk_user_data = ad;
 738         write_unlock_bh(&s_listen->sk->sk_callback_lock);
 739
 740         what = "listen";
 741         err = s_listen->ops->listen(s_listen, 5);
 742         if (err < 0)
 743                 goto out;
 744
 745         return 0;
 746 out:
 747         if (s_listen)
 748                 sock_release(s_listen);
 749         if (err < 0) {
 750                 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
 751                         drbd_err(connection, "%s failed, err = %d\n", what, err);
 752                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 753                 }
 754         }
 755
 756         return -EIO;
 757 }
 758
 759 static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
 760 {
 761         write_lock_bh(&sk->sk_callback_lock);
 762         sk->sk_state_change = ad->original_sk_state_change;
 763         sk->sk_user_data = NULL;
 764         write_unlock_bh(&sk->sk_callback_lock);
 765 }
 766
 767 static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
 768 {
 769         int timeo, connect_int, err = 0;
 770         struct socket *s_estab = NULL;
 771         struct net_conf *nc;
 772
 773         rcu_read_lock();
 774         nc = rcu_dereference(connection->net_conf);
 775         if (!nc) {
 776                 rcu_read_unlock();
 777                 return NULL;
 778         }
 779         connect_int = nc->connect_int;
 780         rcu_read_unlock();
 781
 782         timeo = connect_int * HZ;
 783         /* 28.5% random jitter */
 784         timeo += (prandom_u32() & 1) ? timeo / 7 : -timeo / 7;
 785
 786         err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
 787         if (err <= 0)
 788                 return NULL;
 789
 790         err = kernel_accept(ad->s_listen, &s_estab, 0);
 791         if (err < 0) {
 792                 if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
 793                         drbd_err(connection, "accept failed, err = %d\n", err);
 794                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
 795                 }
 796         }
 797
 798         if (s_estab)
 799                 unregister_state_change(s_estab->sk, ad);
 800
 801         return s_estab;
 802 }
 803
 804 static int decode_header(struct drbd_connection *, void *, struct packet_info *);
 805
 806 static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
 807                              enum drbd_packet cmd)
 808 {
 809         if (!conn_prepare_command(connection, sock))
 810                 return -EIO;
 811         return conn_send_command(connection, sock, cmd, 0, NULL, 0);
 812 }
 813
 814 static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
 815 {
 816         unsigned int header_size = drbd_header_size(connection);
 817         struct packet_info pi;
 818         struct net_conf *nc;
 819         int err;
 820
 821         rcu_read_lock();
 822         nc = rcu_dereference(connection->net_conf);
 823         if (!nc) {
 824                 rcu_read_unlock();
 825                 return -EIO;
 826         }
 827         sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
 828         rcu_read_unlock();
 829
 830         err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
 831         if (err != header_size) {
 832                 if (err >= 0)
 833                         err = -EIO;
 834                 return err;
 835         }
 836         err = decode_header(connection, connection->data.rbuf, &pi);
 837         if (err)
 838                 return err;
 839         return pi.cmd;
 840 }
 841
 842 /**
 843  * drbd_socket_okay() - Free the socket if its connection is not okay
 844  * @sock:       pointer to the pointer to the socket.
 845  */
 846 static bool drbd_socket_okay(struct socket **sock)
 847 {
 848         int rr;
 849         char tb[4];
 850
 851         if (!*sock)
 852                 return false;
 853
 854         rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
 855
 856         if (rr > 0 || rr == -EAGAIN) {
 857                 return true;
 858         } else {
 859                 sock_release(*sock);
 860                 *sock = NULL;
 861                 return false;
 862         }
 863 }
 864
 865 static bool connection_established(struct drbd_connection *connection,
 866                                    struct socket **sock1,
 867                                    struct socket **sock2)
 868 {
 869         struct net_conf *nc;
 870         int timeout;
 871         bool ok;
 872
 873         if (!*sock1 || !*sock2)
 874                 return false;
 875
 876         rcu_read_lock();
 877         nc = rcu_dereference(connection->net_conf);
 878         timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
 879         rcu_read_unlock();
 880         schedule_timeout_interruptible(timeout);
 881
 882         ok = drbd_socket_okay(sock1);
 883         ok = drbd_socket_okay(sock2) && ok;
 884
 885         return ok;
 886 }
 887
 888 /* Gets called if a connection is established, or if a new minor gets created
 889    in a connection */
 890 int drbd_connected(struct drbd_peer_device *peer_device)
 891 {
 892         struct drbd_device *device = peer_device->device;
 893         int err;
 894
 895         atomic_set(&device->packet_seq, 0);
 896         device->peer_seq = 0;
 897
 898         device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
 899                 &peer_device->connection->cstate_mutex :
 900                 &device->own_state_mutex;
 901
 902         err = drbd_send_sync_param(peer_device);
 903         if (!err)
 904                 err = drbd_send_sizes(peer_device, 0, 0);
 905         if (!err)
 906                 err = drbd_send_uuids(peer_device);
 907         if (!err)
 908                 err = drbd_send_current_state(peer_device);
 909         clear_bit(USE_DEGR_WFC_T, &device->flags);
 910         clear_bit(RESIZE_PENDING, &device->flags);
 911         atomic_set(&device->ap_in_flight, 0);
 912         mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
 913         return err;
 914 }
 915
/*
 * conn_connect() - set up the data and meta sockets and run the initial
 * handshake on them.
 *
 * return values:
 *   1 yes, we have a valid connection
 *     (strictly: the positive value returned by drbd_do_features() is
 *     passed through)
 *   0 oops, did not work out, please try again
 *  -1 peer talks different language,
 *     no point in trying again, please go standalone.
 *     Also returned via out_release_sockets: connection state at or below
 *     C_DISCONNECTING, receiver thread EXITING, or the "Logic error" case.
 *  -2 We do not have a network config...
 *     (returned when the state change to C_WF_CONNECTION is refused)
 *
 * A non-positive result of drbd_do_features() is returned unchanged.
 */
static int conn_connect(struct drbd_connection *connection)
{
        struct drbd_socket sock, msock;
        struct drbd_peer_device *peer_device;
        struct net_conf *nc;
        int vnr, timeout, h;
        bool discard_my_data, ok;
        enum drbd_state_rv rv;
        struct accept_wait_data ad = {
                .connection = connection,
                .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
        };

        clear_bit(DISCONNECT_SENT, &connection->flags);
        if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
                return -2;

        /* Temporary on-stack socket wrappers that borrow the connection's
         * send and receive buffers; the sockets themselves are only stored
         * in connection->data / connection->meta once both are established. */
        mutex_init(&sock.mutex);
        sock.sbuf = connection->data.sbuf;
        sock.rbuf = connection->data.rbuf;
        sock.socket = NULL;
        mutex_init(&msock.mutex);
        msock.sbuf = connection->meta.sbuf;
        msock.rbuf = connection->meta.rbuf;
        msock.socket = NULL;

        /* Assume that the peer only understands protocol 80 until we know better.  */
        connection->agreed_pro_version = 80;

        if (prepare_listen_socket(connection, &ad))
                return 0;

        /* Each round: try an outgoing connect, then wait for an incoming
         * one, until both sockets exist and connection_established()
         * accepts them. */
        do {
                struct socket *s;

                s = drbd_try_connect(connection);
                if (s) {
                        /* The first outgoing socket becomes the data socket,
                         * the second one the meta socket.
                         * NOTE(review): the result of send_first_packet() is
                         * ignored; presumably a broken socket is caught by
                         * drbd_socket_okay() later - confirm. */
                        if (!sock.socket) {
                                sock.socket = s;
                                send_first_packet(connection, &sock, P_INITIAL_DATA);
                        } else if (!msock.socket) {
                                /* we initiated the meta socket */
                                clear_bit(RESOLVE_CONFLICTS, &connection->flags);
                                msock.socket = s;
                                send_first_packet(connection, &msock, P_INITIAL_META);
                        } else {
                                drbd_err(connection, "Logic error in conn_connect()\n");
                                goto out_release_sockets;
                        }
                }

                if (connection_established(connection, &sock.socket, &msock.socket))
                        break;

retry:
                s = drbd_wait_for_connect(connection, &ad);
                if (s) {
                        /* The first packet of an incoming socket tells which
                         * role the peer intends for it. */
                        int fp = receive_first_packet(connection, s);
                        /* Drop our own sockets if they died in the meantime. */
                        drbd_socket_okay(&sock.socket);
                        drbd_socket_okay(&msock.socket);
                        switch (fp) {
                        case P_INITIAL_DATA:
                                if (sock.socket) {
                                        /* Both sides connected a data socket:
                                         * keep the incoming one. */
                                        drbd_warn(connection, "initial packet S crossed\n");
                                        sock_release(sock.socket);
                                        sock.socket = s;
                                        goto randomize;
                                }
                                sock.socket = s;
                                break;
                        case P_INITIAL_META:
                                /* the peer initiated the meta socket */
                                set_bit(RESOLVE_CONFLICTS, &connection->flags);
                                if (msock.socket) {
                                        /* Both sides connected a meta socket:
                                         * keep the incoming one. */
                                        drbd_warn(connection, "initial packet M crossed\n");
                                        sock_release(msock.socket);
                                        msock.socket = s;
                                        goto randomize;
                                }
                                msock.socket = s;
                                break;
                        default:
                                drbd_warn(connection, "Error receiving initial packet\n");
                                sock_release(s);
                                /* Crossed sockets jump here as well: flip a
                                 * coin between waiting for another incoming
                                 * connection right away (retry) and going on
                                 * with the rest of this loop iteration. */
randomize:
                                if (prandom_u32() & 1)
                                        goto retry;
                        }
                }

                if (connection->cstate <= C_DISCONNECTING)
                        goto out_release_sockets;
                if (signal_pending(current)) {
                        /* Discard pending signals; only give up if this
                         * receiver thread is in state EXITING. */
                        flush_signals(current);
                        smp_rmb();
                        if (get_t_state(&connection->receiver) == EXITING)
                                goto out_release_sockets;
                }

                ok = connection_established(connection, &sock.socket, &msock.socket);
        } while (!ok);

        /* Both sockets are up, the listen socket is released. */
        if (ad.s_listen)
                sock_release(ad.s_listen);

        sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
        msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */

        sock.socket->sk->sk_allocation = GFP_NOIO;
        msock.socket->sk->sk_allocation = GFP_NOIO;

        sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
        msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;

        /* NOT YET ...
         * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
         * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
         * first set it to the P_CONNECTION_FEATURES timeout,
         * which we set to 4x the configured ping_timeout. */
        rcu_read_lock();
        nc = rcu_dereference(connection->net_conf);

        sock.socket->sk->sk_sndtimeo =
        sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;

        msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
        timeout = nc->timeout * HZ / 10;
        /* Copied out here because nc is not used after rcu_read_unlock(). */
        discard_my_data = nc->discard_my_data;
        rcu_read_unlock();

        msock.socket->sk->sk_sndtimeo = timeout;

        /* we don't want delays.
         * we use TCP_CORK where appropriate, though */
        tcp_sock_set_nodelay(sock.socket->sk);
        tcp_sock_set_nodelay(msock.socket->sk);

        /* From here on the sockets are stored in the connection; none of the
         * return statements below releases them.
         * NOTE(review): presumably the caller's disconnect path does -
         * confirm. */
        connection->data.socket = sock.socket;
        connection->meta.socket = msock.socket;
        connection->last_received = jiffies;

        h = drbd_do_features(connection);
        if (h <= 0)
                return h;

        if (connection->cram_hmac_tfm) {
                /* drbd_request_state(device, NS(conn, WFAuth)); */
                /* Any other result of drbd_do_auth() continues below. */
                switch (drbd_do_auth(connection)) {
                case -1:
                        drbd_err(connection, "Authentication of peer failed\n");
                        return -1;
                case 0:
                        drbd_err(connection, "Authentication of peer failed, trying again.\n");
                        return 0;
                }
        }

        /* Handshake timeouts are replaced by the regular ones for the data socket. */
        connection->data.socket->sk->sk_sndtimeo = timeout;
        connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;

        if (drbd_send_protocol(connection) == -EOPNOTSUPP)
                return -1;

        /* Prevent a race between resync-handshake and
         * being promoted to Primary.
         *
         * Grab and release the state mutex, so we know that any current
         * drbd_set_role() is finished, and any incoming drbd_set_role
         * will see the STATE_SENT flag, and wait for it to be cleared.
         */
        idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
                mutex_lock(peer_device->device->state_mutex);

        /* avoid a race with conn_request_state( C_DISCONNECTING ) */
        spin_lock_irq(&connection->resource->req_lock);
        set_bit(STATE_SENT, &connection->flags);
        spin_unlock_irq(&connection->resource->req_lock);

        idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
                mutex_unlock(peer_device->device->state_mutex);

        rcu_read_lock();
        idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
                struct drbd_device *device = peer_device->device;
                /* Hold a device reference so the RCU read lock can be
                 * dropped while drbd_connected() sends its packets. */
                kref_get(&device->kref);
                rcu_read_unlock();

                if (discard_my_data)
                        set_bit(DISCARD_MY_DATA, &device->flags);
                else
                        clear_bit(DISCARD_MY_DATA, &device->flags);

                /* NOTE(review): the return value of drbd_connected() is not
                 * checked here. */
                drbd_connected(peer_device);
                kref_put(&device->kref, drbd_destroy_device);
                rcu_read_lock();
        }
        rcu_read_unlock();

        rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
        if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
                clear_bit(STATE_SENT, &connection->flags);
                return 0;
        }

        drbd_thread_start(&connection->ack_receiver);
        /* opencoded create_singlethread_workqueue(),
         * to be able to use format string arguments */
        connection->ack_sender =
                alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
        if (!connection->ack_sender) {
                drbd_err(connection, "Failed to create workqueue ack_sender\n");
                return 0;
        }

        mutex_lock(&connection->resource->conf_update);
        /* The discard_my_data flag is a single-shot modifier to the next
         * connection attempt, the handshake of which is now well underway.
         * No need for rcu style copying of the whole struct
         * just to clear a single value. */
        connection->net_conf->discard_my_data = 0;
        mutex_unlock(&connection->resource->conf_update);

        return h;

        /* Only reached from inside the connect loop, i.e. before the sockets
         * were handed over to the connection. */
out_release_sockets:
        if (ad.s_listen)
                sock_release(ad.s_listen);
        if (sock.socket)
                sock_release(sock.socket);
        if (msock.socket)
                sock_release(msock.socket);
        return -1;
}
1155
1156 static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
1157 {
1158         unsigned int header_size = drbd_header_size(connection);
1159
1160         if (header_size == sizeof(struct p_header100) &&
1161             *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
1162                 struct p_header100 *h = header;
1163                 if (h->pad != 0) {
1164                         drbd_err(connection, "Header padding is not zero\n");
1165                         return -EINVAL;
1166                 }
1167                 pi->vnr = be16_to_cpu(h->volume);
1168                 pi->cmd = be16_to_cpu(h->command);
1169                 pi->size = be32_to_cpu(h->length);
1170         } else if (header_size == sizeof(struct p_header95) &&
1171                    *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
1172                 struct p_header95 *h = header;
1173                 pi->cmd = be16_to_cpu(h->command);
1174                 pi->size = be32_to_cpu(h->length);
1175                 pi->vnr = 0;
1176         } else if (header_size == sizeof(struct p_header80) &&
1177                    *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
1178                 struct p_header80 *h = header;
1179                 pi->cmd = be16_to_cpu(h->command);
1180                 pi->size = be16_to_cpu(h->length);
1181                 pi->vnr = 0;
1182         } else {
1183                 drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
1184                          be32_to_cpu(*(__be32 *)header),
1185                          connection->agreed_pro_version);
1186                 return -EINVAL;
1187         }
1188         pi->data = header + header_size;
1189         return 0;
1190 }
1191
1192 static void drbd_unplug_all_devices(struct drbd_connection *connection)
1193 {
1194         if (current->plug == &connection->receiver_plug) {
1195                 blk_finish_plug(&connection->receiver_plug);
1196                 blk_start_plug(&connection->receiver_plug);
1197         } /* else: maybe just schedule() ?? */
1198 }
1199
1200 static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
1201 {
1202         void *buffer = connection->data.rbuf;
1203         int err;
1204
1205         err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
1206         if (err)
1207                 return err;
1208
1209         err = decode_header(connection, buffer, pi);
1210         connection->last_received = jiffies;
1211
1212         return err;
1213 }
1214
1215 static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, struct packet_info *pi)
1216 {
1217         void *buffer = connection->data.rbuf;
1218         unsigned int size = drbd_header_size(connection);
1219         int err;
1220
1221         err = drbd_recv_short(connection->data.socket, buffer, size, MSG_NOSIGNAL|MSG_DONTWAIT);
1222         if (err != size) {
1223                 /* If we have nothing in the receive buffer now, to reduce
1224                  * application latency, try to drain the backend queues as
1225                  * quickly as possible, and let remote TCP know what we have
1226                  * received so far. */
1227                 if (err == -EAGAIN) {
1228                         tcp_sock_set_quickack(connection->data.socket->sk, 2);
1229                         drbd_unplug_all_devices(connection);
1230                 }
1231                 if (err > 0) {
1232                         buffer += err;
1233                         size -= err;
1234                 }
1235                 err = drbd_recv_all_warn(connection, buffer, size);
1236                 if (err)
1237                         return err;
1238         }
1239
1240         err = decode_header(connection, connection->data.rbuf, pi);
1241         connection->last_received = jiffies;
1242
1243         return err;
1244 }
1245 /* This is blkdev_issue_flush, but asynchronous.
1246  * We want to submit to all component volumes in parallel,
1247  * then wait for all completions.
1248  */
/* Bookkeeping shared by all flush bios submitted by one drbd_flush() call. */
struct issue_flush_context {
        /* flush bios in flight, plus one count held by drbd_flush() itself */
        atomic_t pending;
        /* non-zero once a flush failed or could not be issued */
        int error;
        /* completed by one_flush_endio() when pending drops to zero */
        struct completion done;
};
/* Per-bio context: allocated in submit_one_flush(), freed in one_flush_endio(). */
struct one_flush_context {
        /* device whose backing device is being flushed */
        struct drbd_device *device;
        /* context shared with the other flushes of the same drbd_flush() call */
        struct issue_flush_context *ctx;
};
1258
1259 static void one_flush_endio(struct bio *bio)
1260 {
1261         struct one_flush_context *octx = bio->bi_private;
1262         struct drbd_device *device = octx->device;
1263         struct issue_flush_context *ctx = octx->ctx;
1264
1265         if (bio->bi_status) {
1266                 ctx->error = blk_status_to_errno(bio->bi_status);
1267                 drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status);
1268         }
1269         kfree(octx);
1270         bio_put(bio);
1271
1272         clear_bit(FLUSH_PENDING, &device->flags);
1273         put_ldev(device);
1274         kref_put(&device->kref, drbd_destroy_device);
1275
1276         if (atomic_dec_and_test(&ctx->pending))
1277                 complete(&ctx->done);
1278 }
1279
1280 static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
1281 {
1282         struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0,
1283                                     REQ_OP_FLUSH | REQ_PREFLUSH, GFP_NOIO);
1284         struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
1285
1286         if (!octx) {
1287                 drbd_warn(device, "Could not allocate a octx, CANNOT ISSUE FLUSH\n");
1288                 /* FIXME: what else can I do now?  disconnecting or detaching
1289                  * really does not help to improve the state of the world, either.
1290                  */
1291                 bio_put(bio);
1292
1293                 ctx->error = -ENOMEM;
1294                 put_ldev(device);
1295                 kref_put(&device->kref, drbd_destroy_device);
1296                 return;
1297         }
1298
1299         octx->device = device;
1300         octx->ctx = ctx;
1301         bio->bi_private = octx;
1302         bio->bi_end_io = one_flush_endio;
1303
1304         device->flush_jif = jiffies;
1305         set_bit(FLUSH_PENDING, &device->flags);
1306         atomic_inc(&ctx->pending);
1307         submit_bio(bio);
1308 }
1309
1310 static void drbd_flush(struct drbd_connection *connection)
1311 {
1312         if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
1313                 struct drbd_peer_device *peer_device;
1314                 struct issue_flush_context ctx;
1315                 int vnr;
1316
1317                 atomic_set(&ctx.pending, 1);
1318                 ctx.error = 0;
1319                 init_completion(&ctx.done);
1320
1321                 rcu_read_lock();
1322                 idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1323                         struct drbd_device *device = peer_device->device;
1324
1325                         if (!get_ldev(device))
1326                                 continue;
1327                         kref_get(&device->kref);
1328                         rcu_read_unlock();
1329
1330                         submit_one_flush(device, &ctx);
1331
1332                         rcu_read_lock();
1333                 }
1334                 rcu_read_unlock();
1335
1336                 /* Do we want to add a timeout,
1337                  * if disk-timeout is set? */
1338                 if (!atomic_dec_and_test(&ctx.pending))
1339                         wait_for_completion(&ctx.done);
1340
1341                 if (ctx.error) {
1342                         /* would rather check on EOPNOTSUPP, but that is not reliable.
1343                          * don't try again for ANY return value != 0
1344                          * if (rv == -EOPNOTSUPP) */
1345                         /* Any error is already reported by bio_endio callback. */
1346                         drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
1347                 }
1348         }
1349 }
1350
1351 /**
1352  * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
1353  * @connection: DRBD connection.
1354  * @epoch:      Epoch object.
1355  * @ev:         Epoch event.
1356  */
1357 static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
1358                                                struct drbd_epoch *epoch,
1359                                                enum epoch_event ev)
1360 {
1361         int epoch_size;
1362         struct drbd_epoch *next_epoch;
1363         enum finish_epoch rv = FE_STILL_LIVE;
1364
1365         spin_lock(&connection->epoch_lock);
1366         do {
1367                 next_epoch = NULL;
1368
1369                 epoch_size = atomic_read(&epoch->epoch_size);
1370
1371                 switch (ev & ~EV_CLEANUP) {
1372                 case EV_PUT:
1373                         atomic_dec(&epoch->active);
1374                         break;
1375                 case EV_GOT_BARRIER_NR:
1376                         set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
1377                         break;
1378                 case EV_BECAME_LAST:
1379                         /* nothing to do*/
1380                         break;
1381                 }
1382
1383                 if (epoch_size != 0 &&
1384                     atomic_read(&epoch->active) == 0 &&
1385                     (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
1386                         if (!(ev & EV_CLEANUP)) {
1387                                 spin_unlock(&connection->epoch_lock);
1388                                 drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
1389                                 spin_lock(&connection->epoch_lock);
1390                         }
1391 #if 0
1392                         /* FIXME: dec unacked on connection, once we have
1393                          * something to count pending connection packets in. */
1394                         if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
1395                                 dec_unacked(epoch->connection);
1396 #endif
1397
1398                         if (connection->current_epoch != epoch) {
1399                                 next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
1400                                 list_del(&epoch->list);
1401                                 ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
1402                                 connection->epochs--;
1403                                 kfree(epoch);
1404
1405                                 if (rv == FE_STILL_LIVE)
1406                                         rv = FE_DESTROYED;
1407                         } else {
1408                                 epoch->flags = 0;
1409                                 atomic_set(&epoch->epoch_size, 0);
1410                                 /* atomic_set(&epoch->active, 0); is already zero */
1411                                 if (rv == FE_STILL_LIVE)
1412                                         rv = FE_RECYCLED;
1413                         }
1414                 }
1415
1416                 if (!next_epoch)
1417                         break;
1418
1419                 epoch = next_epoch;
1420         } while (1);
1421
1422         spin_unlock(&connection->epoch_lock);
1423
1424         return rv;
1425 }
1426
1427 static enum write_ordering_e
1428 max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
1429 {
1430         struct disk_conf *dc;
1431
1432         dc = rcu_dereference(bdev->disk_conf);
1433
1434         if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
1435                 wo = WO_DRAIN_IO;
1436         if (wo == WO_DRAIN_IO && !dc->disk_drain)
1437                 wo = WO_NONE;
1438
1439         return wo;
1440 }
1441
1442 /*
1443  * drbd_bump_write_ordering() - Fall back to an other write ordering method
1444  * @wo:         Write ordering method to try.
1445  */
1446 void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
1447                               enum write_ordering_e wo)
1448 {
1449         struct drbd_device *device;
1450         enum write_ordering_e pwo;
1451         int vnr;
1452         static char *write_ordering_str[] = {
1453                 [WO_NONE] = "none",
1454                 [WO_DRAIN_IO] = "drain",
1455                 [WO_BDEV_FLUSH] = "flush",
1456         };
1457
1458         pwo = resource->write_ordering;
1459         if (wo != WO_BDEV_FLUSH)
1460                 wo = min(pwo, wo);
1461         rcu_read_lock();
1462         idr_for_each_entry(&resource->devices, device, vnr) {
1463                 if (get_ldev(device)) {
1464                         wo = max_allowed_wo(device->ldev, wo);
1465                         if (device->ldev == bdev)
1466                                 bdev = NULL;
1467                         put_ldev(device);
1468                 }
1469         }
1470
1471         if (bdev)
1472                 wo = max_allowed_wo(bdev, wo);
1473
1474         rcu_read_unlock();
1475
1476         resource->write_ordering = wo;
1477         if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
1478                 drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
1479 }
1480
1481 /*
1482  * Mapping "discard" to ZEROOUT with UNMAP does not work for us:
1483  * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it
1484  * will directly go to fallback mode, submitting normal writes, and
1485  * never even try to UNMAP.
1486  *
1487  * And dm-thin does not do this (yet), mostly because in general it has
1488  * to assume that "skip_block_zeroing" is set.  See also:
1489  * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html
1490  * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html
1491  *
1492  * We *may* ignore the discard-zeroes-data setting, if so configured.
1493  *
1494  * Assumption is that this "discard_zeroes_data=0" is only because the backend
1495  * may ignore partial unaligned discards.
1496  *
1497  * LVM/DM thin as of at least
1498  *   LVM version:     2.02.115(2)-RHEL7 (2015-01-28)
1499  *   Library version: 1.02.93-RHEL7 (2015-01-28)
1500  *   Driver version:  4.29.0
1501  * still behaves this way.
1502  *
1503  * For unaligned (wrt. alignment and granularity) or too small discards,
1504  * we zero-out the initial (and/or) trailing unaligned partial chunks,
1505  * but discard all the aligned full chunks.
1506  *
1507  * At least for LVM/DM thin, with skip_block_zeroing=false,
1508  * the result is effectively "discard_zeroes_data=1".
1509  */
1510 /* flags: EE_TRIM|EE_ZEROOUT */
1511 int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags)
1512 {
1513         struct block_device *bdev = device->ldev->backing_bdev;
1514         struct request_queue *q = bdev_get_queue(bdev);
1515         sector_t tmp, nr;
1516         unsigned int max_discard_sectors, granularity;
1517         int alignment;
1518         int err = 0;
1519
1520         if ((flags & EE_ZEROOUT) || !(flags & EE_TRIM))
1521                 goto zero_out;
1522
1523         /* Zero-sector (unknown) and one-sector granularities are the same.  */
1524         granularity = max(q->limits.discard_granularity >> 9, 1U);
1525         alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
1526
1527         max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22));
1528         max_discard_sectors -= max_discard_sectors % granularity;
1529         if (unlikely(!max_discard_sectors))
1530                 goto zero_out;
1531
1532         if (nr_sectors < granularity)
1533                 goto zero_out;
1534
1535         tmp = start;
1536         if (sector_div(tmp, granularity) != alignment) {
1537                 if (nr_sectors < 2*granularity)
1538                         goto zero_out;
1539                 /* start + gran - (start + gran - align) % gran */
1540                 tmp = start + granularity - alignment;
1541                 tmp = start + granularity - sector_div(tmp, granularity);
1542
1543                 nr = tmp - start;
1544                 /* don't flag BLKDEV_ZERO_NOUNMAP, we don't know how many
1545                  * layers are below us, some may have smaller granularity */
1546                 err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
1547                 nr_sectors -= nr;
1548                 start = tmp;
1549         }
1550         while (nr_sectors >= max_discard_sectors) {
1551                 err |= blkdev_issue_discard(bdev, start, max_discard_sectors, GFP_NOIO, 0);
1552                 nr_sectors -= max_discard_sectors;
1553                 start += max_discard_sectors;
1554         }
1555         if (nr_sectors) {
1556                 /* max_discard_sectors is unsigned int (and a multiple of
1557                  * granularity, we made sure of that above already);
1558                  * nr is < max_discard_sectors;
1559                  * I don't need sector_div here, even though nr is sector_t */
1560                 nr = nr_sectors;
1561                 nr -= (unsigned int)nr % granularity;
1562                 if (nr) {
1563                         err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0);
1564                         nr_sectors -= nr;
1565                         start += nr;
1566                 }
1567         }
1568  zero_out:
1569         if (nr_sectors) {
1570                 err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO,
1571                                 (flags & EE_TRIM) ? 0 : BLKDEV_ZERO_NOUNMAP);
1572         }
1573         return err != 0;
1574 }
1575
1576 static bool can_do_reliable_discards(struct drbd_device *device)
1577 {
1578         struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev);
1579         struct disk_conf *dc;
1580         bool can_do;
1581
1582         if (!blk_queue_discard(q))
1583                 return false;
1584
1585         rcu_read_lock();
1586         dc = rcu_dereference(device->ldev->disk_conf);
1587         can_do = dc->discard_zeroes_if_aligned;
1588         rcu_read_unlock();
1589         return can_do;
1590 }
1591
1592 static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, struct drbd_peer_request *peer_req)
1593 {
1594         /* If the backend cannot discard, or does not guarantee
1595          * read-back zeroes in discarded ranges, we fall back to
1596          * zero-out.  Unless configuration specifically requested
1597          * otherwise. */
1598         if (!can_do_reliable_discards(device))
1599                 peer_req->flags |= EE_ZEROOUT;
1600
1601         if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
1602             peer_req->i.size >> 9, peer_req->flags & (EE_ZEROOUT|EE_TRIM)))
1603                 peer_req->flags |= EE_WAS_ERROR;
1604         drbd_endio_write_sec_final(peer_req);
1605 }
1606
1607 static void drbd_issue_peer_wsame(struct drbd_device *device,
1608                                   struct drbd_peer_request *peer_req)
1609 {
1610         struct block_device *bdev = device->ldev->backing_bdev;
1611         sector_t s = peer_req->i.sector;
1612         sector_t nr = peer_req->i.size >> 9;
1613         if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages))
1614                 peer_req->flags |= EE_WAS_ERROR;
1615         drbd_endio_write_sec_final(peer_req);
1616 }
1617
1618
1619 /*
1620  * drbd_submit_peer_request()
1621  * @device:     DRBD device.
1622  * @peer_req:   peer request
1623  *
1624  * May spread the pages to multiple bios,
1625  * depending on bio_add_page restrictions.
1626  *
1627  * Returns 0 if all bios have been submitted,
1628  * -ENOMEM if we could not allocate enough bios,
1629  * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
1630  *  single page to an empty bio (which should never happen and likely indicates
1631  *  that the lower level IO stack is in some way broken). This has been observed
1632  *  on certain Xen deployments.
1633  */
1634 /* TODO allocate from our own bio_set. */
1635 int drbd_submit_peer_request(struct drbd_device *device,
1636                              struct drbd_peer_request *peer_req,
1637                              const unsigned op, const unsigned op_flags,
1638                              const int fault_type)
1639 {
1640         struct bio *bios = NULL;
1641         struct bio *bio;
1642         struct page *page = peer_req->pages;
1643         sector_t sector = peer_req->i.sector;
1644         unsigned data_size = peer_req->i.size;
1645         unsigned n_bios = 0;
1646         unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
1647
1648         /* TRIM/DISCARD: for now, always use the helper function
1649          * blkdev_issue_zeroout(..., discard=true).
1650          * It's synchronous, but it does the right thing wrt. bio splitting.
1651          * Correctness first, performance later.  Next step is to code an
1652          * asynchronous variant of the same.
1653          */
1654         if (peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) {
1655                 /* wait for all pending IO completions, before we start
1656                  * zeroing things out. */
1657                 conn_wait_active_ee_empty(peer_req->peer_device->connection);
1658                 /* add it to the active list now,
1659                  * so we can find it to present it in debugfs */
1660                 peer_req->submit_jif = jiffies;
1661                 peer_req->flags |= EE_SUBMITTED;
1662
1663                 /* If this was a resync request from receive_rs_deallocated(),
1664                  * it is already on the sync_ee list */
1665                 if (list_empty(&peer_req->w.list)) {
1666                         spin_lock_irq(&device->resource->req_lock);
1667                         list_add_tail(&peer_req->w.list, &device->active_ee);
1668                         spin_unlock_irq(&device->resource->req_lock);
1669                 }
1670
1671                 if (peer_req->flags & (EE_TRIM|EE_ZEROOUT))
1672                         drbd_issue_peer_discard_or_zero_out(device, peer_req);
1673                 else /* EE_WRITE_SAME */
1674                         drbd_issue_peer_wsame(device, peer_req);
1675                 return 0;
1676         }
1677
1678         /* In most cases, we will only need one bio.  But in case the lower
1679          * level restrictions happen to be different at this offset on this
1680          * side than those of the sending peer, we may need to submit the
1681          * request in more than one bio.
1682          *
1683          * Plain bio_alloc is good enough here, this is no DRBD internally
1684          * generated bio, but a bio allocated on behalf of the peer.
1685          */
1686 next_bio:
1687         bio = bio_alloc(device->ldev->backing_bdev, nr_pages, op | op_flags,
1688                         GFP_NOIO);
1689         /* > peer_req->i.sector, unless this is the first bio */
1690         bio->bi_iter.bi_sector = sector;
1691         bio->bi_private = peer_req;
1692         bio->bi_end_io = drbd_peer_request_endio;
1693
1694         bio->bi_next = bios;
1695         bios = bio;
1696         ++n_bios;
1697
1698         page_chain_for_each(page) {
1699                 unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
1700                 if (!bio_add_page(bio, page, len, 0))
1701                         goto next_bio;
1702                 data_size -= len;
1703                 sector += len >> 9;
1704                 --nr_pages;
1705         }
1706         D_ASSERT(device, data_size == 0);
1707         D_ASSERT(device, page == NULL);
1708
1709         atomic_set(&peer_req->pending_bios, n_bios);
1710         /* for debugfs: update timestamp, mark as submitted */
1711         peer_req->submit_jif = jiffies;
1712         peer_req->flags |= EE_SUBMITTED;
1713         do {
1714                 bio = bios;
1715                 bios = bios->bi_next;
1716                 bio->bi_next = NULL;
1717
1718                 drbd_submit_bio_noacct(device, fault_type, bio);
1719         } while (bios);
1720         return 0;
1721 }
1722
1723 static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
1724                                              struct drbd_peer_request *peer_req)
1725 {
1726         struct drbd_interval *i = &peer_req->i;
1727
1728         drbd_remove_interval(&device->write_requests, i);
1729         drbd_clear_interval(i);
1730
1731         /* Wake up any processes waiting for this peer request to complete.  */
1732         if (i->waiting)
1733                 wake_up(&device->misc_wait);
1734 }
1735
1736 static void conn_wait_active_ee_empty(struct drbd_connection *connection)
1737 {
1738         struct drbd_peer_device *peer_device;
1739         int vnr;
1740
1741         rcu_read_lock();
1742         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
1743                 struct drbd_device *device = peer_device->device;
1744
1745                 kref_get(&device->kref);
1746                 rcu_read_unlock();
1747                 drbd_wait_ee_list_empty(device, &device->active_ee);
1748                 kref_put(&device->kref, drbd_destroy_device);
1749                 rcu_read_lock();
1750         }
1751         rcu_read_unlock();
1752 }
1753
1754 static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
1755 {
1756         int rv;
1757         struct p_barrier *p = pi->data;
1758         struct drbd_epoch *epoch;
1759
1760         /* FIXME these are unacked on connection,
1761          * not a specific (peer)device.
1762          */
1763         connection->current_epoch->barrier_nr = p->barrier;
1764         connection->current_epoch->connection = connection;
1765         rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
1766
1767         /* P_BARRIER_ACK may imply that the corresponding extent is dropped from
1768          * the activity log, which means it would not be resynced in case the
1769          * R_PRIMARY crashes now.
1770          * Therefore we must send the barrier_ack after the barrier request was
1771          * completed. */
1772         switch (connection->resource->write_ordering) {
1773         case WO_NONE:
1774                 if (rv == FE_RECYCLED)
1775                         return 0;
1776
1777                 /* receiver context, in the writeout path of the other node.
1778                  * avoid potential distributed deadlock */
1779                 epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1780                 if (epoch)
1781                         break;
1782                 else
1783                         drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
1784                 fallthrough;
1785
1786         case WO_BDEV_FLUSH:
1787         case WO_DRAIN_IO:
1788                 conn_wait_active_ee_empty(connection);
1789                 drbd_flush(connection);
1790
1791                 if (atomic_read(&connection->current_epoch->epoch_size)) {
1792                         epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
1793                         if (epoch)
1794                                 break;
1795                 }
1796
1797                 return 0;
1798         default:
1799                 drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
1800                          connection->resource->write_ordering);
1801                 return -EIO;
1802         }
1803
1804         epoch->flags = 0;
1805         atomic_set(&epoch->epoch_size, 0);
1806         atomic_set(&epoch->active, 0);
1807
1808         spin_lock(&connection->epoch_lock);
1809         if (atomic_read(&connection->current_epoch->epoch_size)) {
1810                 list_add(&epoch->list, &connection->current_epoch->list);
1811                 connection->current_epoch = epoch;
1812                 connection->epochs++;
1813         } else {
1814                 /* The current_epoch got recycled while we allocated this one... */
1815                 kfree(epoch);
1816         }
1817         spin_unlock(&connection->epoch_lock);
1818
1819         return 0;
1820 }
1821
1822 /* quick wrapper in case payload size != request_size (write same) */
1823 static void drbd_csum_ee_size(struct crypto_shash *h,
1824                               struct drbd_peer_request *r, void *d,
1825                               unsigned int payload_size)
1826 {
1827         unsigned int tmp = r->i.size;
1828         r->i.size = payload_size;
1829         drbd_csum_ee(h, r, d);
1830         r->i.size = tmp;
1831 }
1832
1833 /* used from receive_RSDataReply (recv_resync_read)
1834  * and from receive_Data.
1835  * data_size: actual payload ("data in")
1836  *      for normal writes that is bi_size.
1837  *      for discards, that is zero.
1838  *      for write same, it is logical_block_size.
1839  * both trim and write same have the bi_size ("data len to be affected")
1840  * as extra argument in the packet header.
1841  */
1842 static struct drbd_peer_request *
1843 read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
1844               struct packet_info *pi) __must_hold(local)
1845 {
1846         struct drbd_device *device = peer_device->device;
1847         const sector_t capacity = get_capacity(device->vdisk);
1848         struct drbd_peer_request *peer_req;
1849         struct page *page;
1850         int digest_size, err;
1851         unsigned int data_size = pi->size, ds;
1852         void *dig_in = peer_device->connection->int_dig_in;
1853         void *dig_vv = peer_device->connection->int_dig_vv;
1854         unsigned long *data;
1855         struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
1856         struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL;
1857         struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL;
1858
1859         digest_size = 0;
1860         if (!trim && peer_device->connection->peer_integrity_tfm) {
1861                 digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
1862                 /*
1863                  * FIXME: Receive the incoming digest into the receive buffer
1864                  *        here, together with its struct p_data?
1865                  */
1866                 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
1867                 if (err)
1868                         return NULL;
1869                 data_size -= digest_size;
1870         }
1871
1872         /* assume request_size == data_size, but special case trim and wsame. */
1873         ds = data_size;
1874         if (trim) {
1875                 if (!expect(data_size == 0))
1876                         return NULL;
1877                 ds = be32_to_cpu(trim->size);
1878         } else if (zeroes) {
1879                 if (!expect(data_size == 0))
1880                         return NULL;
1881                 ds = be32_to_cpu(zeroes->size);
1882         } else if (wsame) {
1883                 if (data_size != queue_logical_block_size(device->rq_queue)) {
1884                         drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n",
1885                                 data_size, queue_logical_block_size(device->rq_queue));
1886                         return NULL;
1887                 }
1888                 if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) {
1889                         drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n",
1890                                 data_size, bdev_logical_block_size(device->ldev->backing_bdev));
1891                         return NULL;
1892                 }
1893                 ds = be32_to_cpu(wsame->size);
1894         }
1895
1896         if (!expect(IS_ALIGNED(ds, 512)))
1897                 return NULL;
1898         if (trim || wsame || zeroes) {
1899                 if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
1900                         return NULL;
1901         } else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
1902                 return NULL;
1903
1904         /* even though we trust out peer,
1905          * we sometimes have to double check. */
1906         if (sector + (ds>>9) > capacity) {
1907                 drbd_err(device, "request from peer beyond end of local disk: "
1908                         "capacity: %llus < sector: %llus + size: %u\n",
1909                         (unsigned long long)capacity,
1910                         (unsigned long long)sector, ds);
1911                 return NULL;
1912         }
1913
1914         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
1915          * "criss-cross" setup, that might cause write-out on some other DRBD,
1916          * which in turn might block on the other node at this very place.  */
1917         peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
1918         if (!peer_req)
1919                 return NULL;
1920
1921         peer_req->flags |= EE_WRITE;
1922         if (trim) {
1923                 peer_req->flags |= EE_TRIM;
1924                 return peer_req;
1925         }
1926         if (zeroes) {
1927                 peer_req->flags |= EE_ZEROOUT;
1928                 return peer_req;
1929         }
1930         if (wsame)
1931                 peer_req->flags |= EE_WRITE_SAME;
1932
1933         /* receive payload size bytes into page chain */
1934         ds = data_size;
1935         page = peer_req->pages;
1936         page_chain_for_each(page) {
1937                 unsigned len = min_t(int, ds, PAGE_SIZE);
1938                 data = kmap(page);
1939                 err = drbd_recv_all_warn(peer_device->connection, data, len);
1940                 if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
1941                         drbd_err(device, "Fault injection: Corrupting data on receive\n");
1942                         data[0] = data[0] ^ (unsigned long)-1;
1943                 }
1944                 kunmap(page);
1945                 if (err) {
1946                         drbd_free_peer_req(device, peer_req);
1947                         return NULL;
1948                 }
1949                 ds -= len;
1950         }
1951
1952         if (digest_size) {
1953                 drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
1954                 if (memcmp(dig_in, dig_vv, digest_size)) {
1955                         drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
1956                                 (unsigned long long)sector, data_size);
1957                         drbd_free_peer_req(device, peer_req);
1958                         return NULL;
1959                 }
1960         }
1961         device->recv_cnt += data_size >> 9;
1962         return peer_req;
1963 }
1964
1965 /* drbd_drain_block() just takes a data block
1966  * out of the socket input buffer, and discards it.
1967  */
1968 static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
1969 {
1970         struct page *page;
1971         int err = 0;
1972         void *data;
1973
1974         if (!data_size)
1975                 return 0;
1976
1977         page = drbd_alloc_pages(peer_device, 1, 1);
1978
1979         data = kmap(page);
1980         while (data_size) {
1981                 unsigned int len = min_t(int, data_size, PAGE_SIZE);
1982
1983                 err = drbd_recv_all_warn(peer_device->connection, data, len);
1984                 if (err)
1985                         break;
1986                 data_size -= len;
1987         }
1988         kunmap(page);
1989         drbd_free_pages(peer_device->device, page, 0);
1990         return err;
1991 }
1992
1993 static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
1994                            sector_t sector, int data_size)
1995 {
1996         struct bio_vec bvec;
1997         struct bvec_iter iter;
1998         struct bio *bio;
1999         int digest_size, err, expect;
2000         void *dig_in = peer_device->connection->int_dig_in;
2001         void *dig_vv = peer_device->connection->int_dig_vv;
2002
2003         digest_size = 0;
2004         if (peer_device->connection->peer_integrity_tfm) {
2005                 digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
2006                 err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
2007                 if (err)
2008                         return err;
2009                 data_size -= digest_size;
2010         }
2011
2012         /* optimistically update recv_cnt.  if receiving fails below,
2013          * we disconnect anyways, and counters will be reset. */
2014         peer_device->device->recv_cnt += data_size>>9;
2015
2016         bio = req->master_bio;
2017         D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
2018
2019         bio_for_each_segment(bvec, bio, iter) {
2020                 void *mapped = bvec_kmap_local(&bvec);
2021                 expect = min_t(int, data_size, bvec.bv_len);
2022                 err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
2023                 kunmap_local(mapped);
2024                 if (err)
2025                         return err;
2026                 data_size -= expect;
2027         }
2028
2029         if (digest_size) {
2030                 drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
2031                 if (memcmp(dig_in, dig_vv, digest_size)) {
2032                         drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
2033                         return -EINVAL;
2034                 }
2035         }
2036
2037         D_ASSERT(peer_device->device, data_size == 0);
2038         return 0;
2039 }
2040
2041 /*
2042  * e_end_resync_block() is called in ack_sender context via
2043  * drbd_finish_peer_reqs().
2044  */
2045 static int e_end_resync_block(struct drbd_work *w, int unused)
2046 {
2047         struct drbd_peer_request *peer_req =
2048                 container_of(w, struct drbd_peer_request, w);
2049         struct drbd_peer_device *peer_device = peer_req->peer_device;
2050         struct drbd_device *device = peer_device->device;
2051         sector_t sector = peer_req->i.sector;
2052         int err;
2053
2054         D_ASSERT(device, drbd_interval_empty(&peer_req->i));
2055
2056         if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
2057                 drbd_set_in_sync(device, sector, peer_req->i.size);
2058                 err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
2059         } else {
2060                 /* Record failure to sync */
2061                 drbd_rs_failed_io(device, sector, peer_req->i.size);
2062
2063                 err  = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
2064         }
2065         dec_unacked(device);
2066
2067         return err;
2068 }
2069
2070 static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
2071                             struct packet_info *pi) __releases(local)
2072 {
2073         struct drbd_device *device = peer_device->device;
2074         struct drbd_peer_request *peer_req;
2075
2076         peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
2077         if (!peer_req)
2078                 goto fail;
2079
2080         dec_rs_pending(device);
2081
2082         inc_unacked(device);
2083         /* corresponding dec_unacked() in e_end_resync_block()
2084          * respective _drbd_clear_done_ee */
2085
2086         peer_req->w.cb = e_end_resync_block;
2087         peer_req->submit_jif = jiffies;
2088
2089         spin_lock_irq(&device->resource->req_lock);
2090         list_add_tail(&peer_req->w.list, &device->sync_ee);
2091         spin_unlock_irq(&device->resource->req_lock);
2092
2093         atomic_add(pi->size >> 9, &device->rs_sect_ev);
2094         if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, 0,
2095                                      DRBD_FAULT_RS_WR) == 0)
2096                 return 0;
2097
2098         /* don't care for the reason here */
2099         drbd_err(device, "submit failed, triggering re-connect\n");
2100         spin_lock_irq(&device->resource->req_lock);
2101         list_del(&peer_req->w.list);
2102         spin_unlock_irq(&device->resource->req_lock);
2103
2104         drbd_free_peer_req(device, peer_req);
2105 fail:
2106         put_ldev(device);
2107         return -EIO;
2108 }
2109
2110 static struct drbd_request *
2111 find_request(struct drbd_device *device, struct rb_root *root, u64 id,
2112              sector_t sector, bool missing_ok, const char *func)
2113 {
2114         struct drbd_request *req;
2115
2116         /* Request object according to our peer */
2117         req = (struct drbd_request *)(unsigned long)id;
2118         if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
2119                 return req;
2120         if (!missing_ok) {
2121                 drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
2122                         (unsigned long)id, (unsigned long long)sector);
2123         }
2124         return NULL;
2125 }
2126
2127 static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
2128 {
2129         struct drbd_peer_device *peer_device;
2130         struct drbd_device *device;
2131         struct drbd_request *req;
2132         sector_t sector;
2133         int err;
2134         struct p_data *p = pi->data;
2135
2136         peer_device = conn_peer_device(connection, pi->vnr);
2137         if (!peer_device)
2138                 return -EIO;
2139         device = peer_device->device;
2140
2141         sector = be64_to_cpu(p->sector);
2142
2143         spin_lock_irq(&device->resource->req_lock);
2144         req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
2145         spin_unlock_irq(&device->resource->req_lock);
2146         if (unlikely(!req))
2147                 return -EIO;
2148
2149         /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
2150          * special casing it there for the various failure cases.
2151          * still no race with drbd_fail_pending_reads */
2152         err = recv_dless_read(peer_device, req, sector, pi->size);
2153         if (!err)
2154                 req_mod(req, DATA_RECEIVED);
2155         /* else: nothing. handled from drbd_disconnect...
2156          * I don't think we may complete this just yet
2157          * in case we are "on-disconnect: freeze" */
2158
2159         return err;
2160 }
2161
2162 static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
2163 {
2164         struct drbd_peer_device *peer_device;
2165         struct drbd_device *device;
2166         sector_t sector;
2167         int err;
2168         struct p_data *p = pi->data;
2169
2170         peer_device = conn_peer_device(connection, pi->vnr);
2171         if (!peer_device)
2172                 return -EIO;
2173         device = peer_device->device;
2174
2175         sector = be64_to_cpu(p->sector);
2176         D_ASSERT(device, p->block_id == ID_SYNCER);
2177
2178         if (get_ldev(device)) {
2179                 /* data is submitted to disk within recv_resync_read.
2180                  * corresponding put_ldev done below on error,
2181                  * or in drbd_peer_request_endio. */
2182                 err = recv_resync_read(peer_device, sector, pi);
2183         } else {
2184                 if (__ratelimit(&drbd_ratelimit_state))
2185                         drbd_err(device, "Can not write resync data to local disk.\n");
2186
2187                 err = drbd_drain_block(peer_device, pi->size);
2188
2189                 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
2190         }
2191
2192         atomic_add(pi->size >> 9, &device->rs_sect_in);
2193
2194         return err;
2195 }
2196
2197 static void restart_conflicting_writes(struct drbd_device *device,
2198                                        sector_t sector, int size)
2199 {
2200         struct drbd_interval *i;
2201         struct drbd_request *req;
2202
2203         drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2204                 if (!i->local)
2205                         continue;
2206                 req = container_of(i, struct drbd_request, i);
2207                 if (req->rq_state & RQ_LOCAL_PENDING ||
2208                     !(req->rq_state & RQ_POSTPONED))
2209                         continue;
2210                 /* as it is RQ_POSTPONED, this will cause it to
2211                  * be queued on the retry workqueue. */
2212                 __req_mod(req, CONFLICT_RESOLVED, NULL);
2213         }
2214 }
2215
2216 /*
2217  * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
2218  */
2219 static int e_end_block(struct drbd_work *w, int cancel)
2220 {
2221         struct drbd_peer_request *peer_req =
2222                 container_of(w, struct drbd_peer_request, w);
2223         struct drbd_peer_device *peer_device = peer_req->peer_device;
2224         struct drbd_device *device = peer_device->device;
2225         sector_t sector = peer_req->i.sector;
2226         int err = 0, pcmd;
2227
2228         if (peer_req->flags & EE_SEND_WRITE_ACK) {
2229                 if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
2230                         pcmd = (device->state.conn >= C_SYNC_SOURCE &&
2231                                 device->state.conn <= C_PAUSED_SYNC_T &&
2232                                 peer_req->flags & EE_MAY_SET_IN_SYNC) ?
2233                                 P_RS_WRITE_ACK : P_WRITE_ACK;
2234                         err = drbd_send_ack(peer_device, pcmd, peer_req);
2235                         if (pcmd == P_RS_WRITE_ACK)
2236                                 drbd_set_in_sync(device, sector, peer_req->i.size);
2237                 } else {
2238                         err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
2239                         /* we expect it to be marked out of sync anyways...
2240                          * maybe assert this?  */
2241                 }
2242                 dec_unacked(device);
2243         }
2244
2245         /* we delete from the conflict detection hash _after_ we sent out the
2246          * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
2247         if (peer_req->flags & EE_IN_INTERVAL_TREE) {
2248                 spin_lock_irq(&device->resource->req_lock);
2249                 D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
2250                 drbd_remove_epoch_entry_interval(device, peer_req);
2251                 if (peer_req->flags & EE_RESTART_REQUESTS)
2252                         restart_conflicting_writes(device, sector, peer_req->i.size);
2253                 spin_unlock_irq(&device->resource->req_lock);
2254         } else
2255                 D_ASSERT(device, drbd_interval_empty(&peer_req->i));
2256
2257         drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
2258
2259         return err;
2260 }
2261
2262 static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
2263 {
2264         struct drbd_peer_request *peer_req =
2265                 container_of(w, struct drbd_peer_request, w);
2266         struct drbd_peer_device *peer_device = peer_req->peer_device;
2267         int err;
2268
2269         err = drbd_send_ack(peer_device, ack, peer_req);
2270         dec_unacked(peer_device->device);
2271
2272         return err;
2273 }
2274
2275 static int e_send_superseded(struct drbd_work *w, int unused)
2276 {
2277         return e_send_ack(w, P_SUPERSEDED);
2278 }
2279
2280 static int e_send_retry_write(struct drbd_work *w, int unused)
2281 {
2282         struct drbd_peer_request *peer_req =
2283                 container_of(w, struct drbd_peer_request, w);
2284         struct drbd_connection *connection = peer_req->peer_device->connection;
2285
2286         return e_send_ack(w, connection->agreed_pro_version >= 100 ?
2287                              P_RETRY_WRITE : P_SUPERSEDED);
2288 }
2289
2290 static bool seq_greater(u32 a, u32 b)
2291 {
2292         /*
2293          * We assume 32-bit wrap-around here.
2294          * For 24-bit wrap-around, we would have to shift:
2295          *  a <<= 8; b <<= 8;
2296          */
2297         return (s32)a - (s32)b > 0;
2298 }
2299
2300 static u32 seq_max(u32 a, u32 b)
2301 {
2302         return seq_greater(a, b) ? a : b;
2303 }
2304
2305 static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
2306 {
2307         struct drbd_device *device = peer_device->device;
2308         unsigned int newest_peer_seq;
2309
2310         if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
2311                 spin_lock(&device->peer_seq_lock);
2312                 newest_peer_seq = seq_max(device->peer_seq, peer_seq);
2313                 device->peer_seq = newest_peer_seq;
2314                 spin_unlock(&device->peer_seq_lock);
2315                 /* wake up only if we actually changed device->peer_seq */
2316                 if (peer_seq == newest_peer_seq)
2317                         wake_up(&device->seq_wait);
2318         }
2319 }
2320
2321 static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
2322 {
2323         return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
2324 }
2325
2326 /* Does peer_req overlap any entry on sync_ee?  (maybe change sync_ee into interval trees as well?) */
2327 static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
2328 {
2329         struct drbd_peer_request *pending;
2330         bool found = false;
2331
2332         /* linear scan of sync_ee under req_lock, stopping at the first hit */
2333         spin_lock_irq(&device->resource->req_lock);
2334         list_for_each_entry(pending, &device->sync_ee, w.list) {
2335                 found = overlaps(peer_req->i.sector, peer_req->i.size,
2336                                  pending->i.sector, pending->i.size);
2337                 if (found)
2338                         break;
2339         }
2340         spin_unlock_irq(&device->resource->req_lock);
2341
2342         return found;
2343 }
2344
2345 /* Called from receive_Data.
2346  * Synchronize packets on sock with packets on msock.
2347  *
2348  * This is here so even when a P_DATA packet traveling via sock overtook an Ack
2349  * packet traveling on msock, they are still processed in the order they have
2350  * been sent.
2351  *
2352  * Note: we don't care for Ack packets overtaking P_DATA packets.
2353  *
2354  * In case peer_seq is larger than device->peer_seq number, there are
2355  * outstanding packets on the msock. We wait for them to arrive.
2356  * In case we are the logically next packet, we update device->peer_seq
2357  * ourselves. Correctly handles 32bit wrap around.
2358  *
2359  * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
2360  * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
2361  * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
2362  * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
2363  *
2364  * returns 0 if we may process the packet, -ETIMEDOUT if the wait timed out,
2365  * -ERESTARTSYS if we were interrupted (by disconnect signal). */
2366 static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
2367 {
2368         struct drbd_device *device = peer_device->device;
2369         DEFINE_WAIT(wait);
2370         long timeout;
2371         int ret = 0, tp;
2372
2373         if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
2374                 return 0; /* sequence numbers are only enforced when RESOLVE_CONFLICTS is set */
2375
2376         spin_lock(&device->peer_seq_lock);
2377         for (;;) {
2378                 if (!seq_greater(peer_seq - 1, device->peer_seq)) { /* no gap: peer_seq is at most device->peer_seq + 1 */
2379                         device->peer_seq = seq_max(device->peer_seq, peer_seq);
2380                         break;
2381                 }
2382
2383                 if (signal_pending(current)) {
2384                         ret = -ERESTARTSYS;
2385                         break;
2386                 }
2387
2388                 rcu_read_lock();
2389                 tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
2390                 rcu_read_unlock();
2391
2392                 if (!tp)
2393                         break; /* ret is still 0: the packet is processed without waiting */
2394
2395                 /* Only need to wait if two_primaries is enabled */
2396                 prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
2397                 spin_unlock(&device->peer_seq_lock); /* do not hold the lock across the sleep */
2398                 rcu_read_lock();
2399                 timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10; /* presumably ping_timeo is in tenths of a second - confirm */
2400                 rcu_read_unlock();
2401                 timeout = schedule_timeout(timeout);
2402                 spin_lock(&device->peer_seq_lock);
2403                 if (!timeout) { /* schedule_timeout() returned 0: the whole timeout elapsed */
2404                         ret = -ETIMEDOUT;
2405                         drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
2406                         break;
2407                 }
2408         }
2409         spin_unlock(&device->peer_seq_lock);
2410         finish_wait(&device->seq_wait, &wait);
2411         return ret;
2412 }
2413
2414 /* Translate DP_* data packet flags into REQ_* bio flags; see also bio_flags_to_wire().
2415  * The flags are mapped semantically, one by one, and back, because we may
2416  * replicate to other kernel versions. */
2417 static unsigned long wire_flags_to_bio_flags(u32 dpf)
2418 {
2419         unsigned long bio_flags = (dpf & DP_RW_SYNC) ? REQ_SYNC : 0;
2420         bio_flags |= (dpf & DP_FUA) ? REQ_FUA : 0;
2421         return bio_flags | ((dpf & DP_FLUSH) ? REQ_PREFLUSH : 0);
2422 }
2423
2424 static unsigned long wire_flags_to_bio_op(u32 dpf)
2425 {
2426         /*
2427          * The first matching flag wins, in the order DP_ZEROES, DP_DISCARD,
2428          * DP_WSAME; without any of them this is a plain write.
2429          */
2430         return (dpf & DP_ZEROES) ? REQ_OP_WRITE_ZEROES :
2431                (dpf & DP_DISCARD) ? REQ_OP_DISCARD :
2432                (dpf & DP_WSAME) ? REQ_OP_WRITE_SAME :
2433                REQ_OP_WRITE;
2434 }
2435 /* NEG_ACK every postponed local write request overlapping the given range; req_lock is held on entry and exit, and dropped around each completion */
2436 static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
2437                                     unsigned int size)
2438 {
2439         struct drbd_interval *i;
2440
2441     repeat:
2442         drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2443                 struct drbd_request *req;
2444                 struct bio_and_error m;
2445
2446                 if (!i->local) /* only local requests are handled here */
2447                         continue;
2448                 req = container_of(i, struct drbd_request, i);
2449                 if (!(req->rq_state & RQ_POSTPONED))
2450                         continue;
2451                 req->rq_state &= ~RQ_POSTPONED;
2452                 __req_mod(req, NEG_ACKED, &m);
2453                 spin_unlock_irq(&device->resource->req_lock); /* complete the master bio without holding req_lock */
2454                 if (m.bio)
2455                         complete_master_bio(device, &m);
2456                 spin_lock_irq(&device->resource->req_lock);
2457                 goto repeat; /* the lock was dropped, so start the overlap search over */
2458         }
2459 }
2460 /* Insert peer_req into write_requests and deal with overlapping writes; returns 0 to go ahead, -ENOENT if an ack was queued instead, else the drbd_wait_misc() error. receive_Data calls this with req_lock held. */
2461 static int handle_write_conflicts(struct drbd_device *device,
2462                                   struct drbd_peer_request *peer_req)
2463 {
2464         struct drbd_connection *connection = peer_req->peer_device->connection;
2465         bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
2466         sector_t sector = peer_req->i.sector;
2467         const unsigned int size = peer_req->i.size;
2468         struct drbd_interval *i;
2469         bool equal;
2470         int err;
2471
2472         /*
2473          * Inserting the peer request into the write_requests tree will prevent
2474          * new conflicting local requests from being added.
2475          */
2476         drbd_insert_interval(&device->write_requests, &peer_req->i);
2477
2478     repeat:
2479         drbd_for_each_overlap(i, &device->write_requests, sector, size) {
2480                 if (i == &peer_req->i) /* that is ourselves */
2481                         continue;
2482                 if (i->completed)
2483                         continue;
2484
2485                 if (!i->local) {
2486                         /*
2487                          * Our peer has sent a conflicting remote request; this
2488                          * should not happen in a two-node setup.  Wait for the
2489                          * earlier peer request to complete.
2490                          */
2491                         err = drbd_wait_misc(device, i);
2492                         if (err)
2493                                 goto out;
2494                         goto repeat; /* re-scan after waiting */
2495                 }
2496
2497                 equal = i->sector == sector && i->size == size; /* identical range on both sides: no alert is logged */
2498                 if (resolve_conflicts) {
2499                         /*
2500                          * If the peer request is fully contained within the
2501                          * overlapping request, it can be considered overwritten
2502                          * and thus superseded; otherwise, it will be retried
2503                          * once all overlapping requests have completed.
2504                          */
2505                         bool superseded = i->sector <= sector && i->sector +
2506                                        (i->size >> 9) >= sector + (size >> 9);
2507
2508                         if (!equal)
2509                                 drbd_alert(device, "Concurrent writes detected: "
2510                                                "local=%llus +%u, remote=%llus +%u, "
2511                                                "assuming %s came first\n",
2512                                           (unsigned long long)i->sector, i->size,
2513                                           (unsigned long long)sector, size,
2514                                           superseded ? "local" : "remote");
2515
2516                         peer_req->w.cb = superseded ? e_send_superseded :
2517                                                    e_send_retry_write;
2518                         list_add_tail(&peer_req->w.list, &device->done_ee); /* answered through w.cb from done_ee instead of being submitted */
2519                         queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);
2520
2521                         err = -ENOENT; /* receive_Data treats -ENOENT as "handled" and returns 0 */
2522                         goto out;
2523                 } else {
2524                         struct drbd_request *req =
2525                                 container_of(i, struct drbd_request, i);
2526
2527                         if (!equal)
2528                                 drbd_alert(device, "Concurrent writes detected: "
2529                                                "local=%llus +%u, remote=%llus +%u\n",
2530                                           (unsigned long long)i->sector, i->size,
2531                                           (unsigned long long)sector, size);
2532
2533                         if (req->rq_state & RQ_LOCAL_PENDING ||
2534                             !(req->rq_state & RQ_POSTPONED)) {
2535                                 /*
2536                                  * Wait for the node with the discard flag to
2537                                  * decide if this request has been superseded
2538                                  * or needs to be retried.
2539                                  * Requests that have been superseded will
2540                                  * disappear from the write_requests tree.
2541                                  *
2542                                  * In addition, wait for the conflicting
2543                                  * request to finish locally before submitting
2544                                  * the conflicting peer request.
2545                                  */
2546                                 err = drbd_wait_misc(device, &req->i);
2547                                 if (err) {
2548                                         _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD); /* request C_TIMEOUT for the connection */
2549                                         fail_postponed_requests(device, sector, size);
2550                                         goto out;
2551                                 }
2552                                 goto repeat;
2553                         }
2554                         /*
2555                          * Remember to restart the conflicting requests after
2556                          * the new peer request has completed.
2557                          */
2558                         peer_req->flags |= EE_RESTART_REQUESTS;
2559                 }
2560         }
2561         err = 0; /* the overlap walk finished without an unresolved conflict */
2562
2563     out:
2564         if (err) /* presumably undoes the insertion above - see drbd_remove_epoch_entry_interval() */
2565                 drbd_remove_epoch_entry_interval(device, peer_req);
2566         return err;
2567 }
2568
2569 /* mirrored write: take a write request received from the peer and submit it locally; returns 0 on success, otherwise an error code */
2570 static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
2571 {
2572         struct drbd_peer_device *peer_device;
2573         struct drbd_device *device;
2574         struct net_conf *nc;
2575         sector_t sector;
2576         struct drbd_peer_request *peer_req;
2577         struct p_data *p = pi->data;
2578         u32 peer_seq = be32_to_cpu(p->seq_num);
2579         int op, op_flags;
2580         u32 dp_flags;
2581         int err, tp;
2582
2583         peer_device = conn_peer_device(connection, pi->vnr);
2584         if (!peer_device)
2585                 return -EIO;
2586         device = peer_device->device;
2587
2588         if (!get_ldev(device)) { /* no local disk reference: send P_NEG_ACK and drain the payload */
2589                 int err2;
2590
2591                 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2592                 drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
2593                 atomic_inc(&connection->current_epoch->epoch_size);
2594                 err2 = drbd_drain_block(peer_device, pi->size);
2595                 if (!err) /* the first error wins */
2596                         err = err2;
2597                 return err;
2598         }
2599
2600         /*
2601          * Corresponding put_ldev done either below (on various errors), or in
2602          * drbd_peer_request_endio, if we successfully submit the data at the
2603          * end of this function.
2604          */
2605
2606         sector = be64_to_cpu(p->sector);
2607         peer_req = read_in_block(peer_device, p->block_id, sector, pi);
2608         if (!peer_req) {
2609                 put_ldev(device);
2610                 return -EIO;
2611         }
2612
2613         peer_req->w.cb = e_end_block;
2614         peer_req->submit_jif = jiffies;
2615         peer_req->flags |= EE_APPLICATION;
2616
2617         dp_flags = be32_to_cpu(p->dp_flags);
2618         op = wire_flags_to_bio_op(dp_flags);
2619         op_flags = wire_flags_to_bio_flags(dp_flags);
2620         if (pi->cmd == P_TRIM) {
2621                 D_ASSERT(peer_device, peer_req->i.size > 0);
2622                 D_ASSERT(peer_device, op == REQ_OP_DISCARD);
2623                 D_ASSERT(peer_device, peer_req->pages == NULL);
2624                 /* need to play safe: an older DRBD sender
2625                  * may mean zero-out while sending P_TRIM. */
2626                 if (0 == (connection->agreed_features & DRBD_FF_WZEROES))
2627                         peer_req->flags |= EE_ZEROOUT;
2628         } else if (pi->cmd == P_ZEROES) {
2629                 D_ASSERT(peer_device, peer_req->i.size > 0);
2630                 D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
2631                 D_ASSERT(peer_device, peer_req->pages == NULL);
2632                 /* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */
2633                 if (dp_flags & DP_DISCARD)
2634                         peer_req->flags |= EE_TRIM;
2635         } else if (peer_req->pages == NULL) { /* no payload pages: expected to be an empty flush */
2636                 D_ASSERT(device, peer_req->i.size == 0);
2637                 D_ASSERT(device, dp_flags & DP_FLUSH);
2638         }
2639
2640         if (dp_flags & DP_MAY_SET_IN_SYNC)
2641                 peer_req->flags |= EE_MAY_SET_IN_SYNC;
2642
2643         spin_lock(&connection->epoch_lock); /* account the request in the current epoch */
2644         peer_req->epoch = connection->current_epoch;
2645         atomic_inc(&peer_req->epoch->epoch_size);
2646         atomic_inc(&peer_req->epoch->active);
2647         spin_unlock(&connection->epoch_lock);
2648
2649         rcu_read_lock();
2650         nc = rcu_dereference(peer_device->connection->net_conf);
2651         tp = nc->two_primaries;
2652         if (peer_device->connection->agreed_pro_version < 100) { /* presumably such peers do not set the ack flags themselves - confirm */
2653                 switch (nc->wire_protocol) {
2654                 case DRBD_PROT_C:
2655                         dp_flags |= DP_SEND_WRITE_ACK;
2656                         break;
2657                 case DRBD_PROT_B:
2658                         dp_flags |= DP_SEND_RECEIVE_ACK;
2659                         break;
2660                 }
2661         }
2662         rcu_read_unlock();
2663
2664         if (dp_flags & DP_SEND_WRITE_ACK) {
2665                 peer_req->flags |= EE_SEND_WRITE_ACK;
2666                 inc_unacked(device);
2667                 /* corresponding dec_unacked() in e_end_block()
2668                  * respective _drbd_clear_done_ee */
2669         }
2670
2671         if (dp_flags & DP_SEND_RECEIVE_ACK) {
2672                 /* I really don't like it that the receiver thread
2673                  * sends on the msock, but anyways */
2674                 drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
2675         }
2676
2677         if (tp) {
2678                 /* two primaries implies protocol C */
2679                 D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
2680                 peer_req->flags |= EE_IN_INTERVAL_TREE;
2681                 err = wait_for_and_update_peer_seq(peer_device, peer_seq);
2682                 if (err)
2683                         goto out_interrupted;
2684                 spin_lock_irq(&device->resource->req_lock);
2685                 err = handle_write_conflicts(device, peer_req);
2686                 if (err) {
2687                         spin_unlock_irq(&device->resource->req_lock);
2688                         if (err == -ENOENT) { /* handle_write_conflicts() queued an ack for it: nothing to submit */
2689                                 put_ldev(device);
2690                                 return 0;
2691                         }
2692                         goto out_interrupted;
2693                 }
2694         } else {
2695                 update_peer_seq(peer_device, peer_seq);
2696                 spin_lock_irq(&device->resource->req_lock);
2697         }
2698         /* TRIM and WRITE_SAME are processed synchronously,
2699          * we wait for all pending requests, respectively wait for
2700          * active_ee to become empty in drbd_submit_peer_request();
2701          * better not add ourselves here. */
2702         if ((peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) == 0)
2703                 list_add_tail(&peer_req->w.list, &device->active_ee);
2704         spin_unlock_irq(&device->resource->req_lock); /* taken in either branch of the if (tp) above */
2705
2706         if (device->state.conn == C_SYNC_TARGET) /* wait until no entry on sync_ee overlaps this write */
2707                 wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
2708
2709         if (device->state.pdsk < D_INCONSISTENT) {
2710                 /* In case we have the only disk of the cluster, */
2711                 drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
2712                 peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
2713                 drbd_al_begin_io(device, &peer_req->i);
2714                 peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
2715         }
2716
2717         err = drbd_submit_peer_request(device, peer_req, op, op_flags,
2718                                        DRBD_FAULT_DT_WR);
2719         if (!err)
2720                 return 0;
2721
2722         /* don't care for the reason here */
2723         drbd_err(device, "submit failed, triggering re-connect\n");
2724         spin_lock_irq(&device->resource->req_lock);
2725         list_del(&peer_req->w.list);
2726         drbd_remove_epoch_entry_interval(device, peer_req);
2727         spin_unlock_irq(&device->resource->req_lock);
2728         if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) { /* balance the drbd_al_begin_io() above */
2729                 peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
2730                 drbd_al_complete_io(device, &peer_req->i);
2731         }
2732
2733 out_interrupted:
2734         drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
2735         put_ldev(device);
2736         drbd_free_peer_req(device, peer_req);
2737         return err;
2738 }
2739
2740 /* Decide whether resync should be throttled: that is the case if the lower
2741  * device seems to be busy and the current sync rate is above c_min_rate.
2742  *
2743  * "Busy" is determined with a scheme similar to MD RAID is_mddev_idle():
2744  * if the partition stats reveal "significant" (more than 64 sectors)
2745  * activity that we cannot account for with our own resync activity, the
2746  * lower device obviously is busy.
2747  *
2748  * The sync rate considered here is a short time average over only the most
2749  * recent two step marks, so that we can react faster.
2750  */
2751 bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
2752                 bool throttle_if_app_is_waiting)
2753 {
2754         struct lc_element *elem;
2755         bool has_priority = false;
2756
2757         if (!drbd_rs_c_min_rate_throttle(device))
2758                 return false;
2759         if (throttle_if_app_is_waiting)
2760                 return true;
2761
2762         /* Do not slow down if app IO is already waiting for this extent,
2763          * and our progress is necessary for application IO to complete. */
2764         spin_lock_irq(&device->al_lock);
2765         elem = lc_find(device->resync, BM_SECT_TO_EXT(sector));
2766         if (elem)
2767                 has_priority = test_bit(BME_PRIORITY,
2768                                 &lc_entry(elem, struct bm_extent, lce)->flags);
2769         spin_unlock_irq(&device->al_lock);
2770
2771         return !has_priority;
2772 }
2773 /* c_min_rate check: true if there is unaccounted activity (or ap_actlog_cnt is set) and the recent sync rate exceeds c_min_rate */
2774 bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
2775 {
2776         struct gendisk *disk = device->ldev->backing_bdev->bd_disk;
2777         unsigned long rs_left, db, dt, dbdt;
2778         unsigned int c_min_rate;
2779         int curr_events;
2780         int mark;
2781
2782         rcu_read_lock();
2783         c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
2784         rcu_read_unlock();
2785
2786         if (!c_min_rate) /* feature disabled */
2787                 return false;
2788
2789         curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
2790                         atomic_read(&device->rs_sect_ev);
2791
2792         /* ap_actlog_cnt is zero and the sector activity not accounted for by
2793          * rs_sect_ev is at most 64 beyond rs_last_events: do not throttle */
2794         if (!atomic_read(&device->ap_actlog_cnt) &&
2795             curr_events - device->rs_last_events <= 64)
2796                 return false;
2797
2798         device->rs_last_events = curr_events;
2799
2800         /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
2801          * approx. */
2802         mark = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
2803
2804         if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
2805                 rs_left = device->ov_left;
2806         else
2807                 rs_left = drbd_bm_total_weight(device) - device->rs_failed;
2808
2809         dt = ((long)jiffies - (long)device->rs_mark_time[mark]) / HZ;
2810         if (dt == 0)
2811                 dt = 1;
2812         db = device->rs_mark_left[mark] - rs_left;
2813         dbdt = Bit2KB(db/dt);
2814
2815         /* throttle only while that recent average is above the configured
2816          * c_min_rate */
2817         return dbdt > c_min_rate;
2818 }
2819 /* Handle a read-type request from the peer (P_DATA_REQUEST, P_RS_THIN_REQ, P_RS_DATA_REQUEST, P_CSUM_RS_REQUEST, P_OV_REQUEST, P_OV_REPLY): validate it and submit a local READ; returns 0 or a negative error */
2820 static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
2821 {
2822         struct drbd_peer_device *peer_device;
2823         struct drbd_device *device;
2824         sector_t sector;
2825         sector_t capacity;
2826         struct drbd_peer_request *peer_req;
2827         struct digest_info *di = NULL;
2828         int size, verb;
2829         unsigned int fault_type;
2830         struct p_block_req *p = pi->data;
2831
2832         peer_device = conn_peer_device(connection, pi->vnr);
2833         if (!peer_device)
2834                 return -EIO;
2835         device = peer_device->device;
2836         capacity = get_capacity(device->vdisk);
2837
2838         sector = be64_to_cpu(p->sector);
2839         size   = be32_to_cpu(p->blksize);
2840
2841         if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) { /* reject empty, unaligned or oversized requests */
2842                 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2843                                 (unsigned long long)sector, size);
2844                 return -EINVAL;
2845         }
2846         if (sector + (size>>9) > capacity) { /* the request must end within the device capacity */
2847                 drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
2848                                 (unsigned long long)sector, size);
2849                 return -EINVAL;
2850         }
2851
2852         if (!get_ldev_if_state(device, D_UP_TO_DATE)) { /* no D_UP_TO_DATE local disk: answer negatively, per packet type */
2853                 verb = 1;
2854                 switch (pi->cmd) {
2855                 case P_DATA_REQUEST:
2856                         drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
2857                         break;
2858                 case P_RS_THIN_REQ:
2859                 case P_RS_DATA_REQUEST:
2860                 case P_CSUM_RS_REQUEST:
2861                 case P_OV_REQUEST:
2862                         drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
2863                         break;
2864                 case P_OV_REPLY:
2865                         verb = 0; /* this case does not log the error below */
2866                         dec_rs_pending(device);
2867                         drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
2868                         break;
2869                 default:
2870                         BUG();
2871                 }
2872                 if (verb && __ratelimit(&drbd_ratelimit_state))
2873                         drbd_err(device, "Can not satisfy peer's read request, "
2874                             "no local data.\n");
2875
2876                 /* drain possibly payload */
2877                 return drbd_drain_block(peer_device, pi->size);
2878         }
2879
2880         /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
2881          * "criss-cross" setup, that might cause write-out on some other DRBD,
2882          * which in turn might block on the other node at this very place.  */
2883         peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
2884                         size, GFP_NOIO);
2885         if (!peer_req) {
2886                 put_ldev(device);
2887                 return -ENOMEM;
2888         }
2889
2890         switch (pi->cmd) { /* pick the completion callback and fault type for this packet type */
2891         case P_DATA_REQUEST:
2892                 peer_req->w.cb = w_e_end_data_req;
2893                 fault_type = DRBD_FAULT_DT_RD;
2894                 /* application IO, don't drbd_rs_begin_io */
2895                 peer_req->flags |= EE_APPLICATION;
2896                 goto submit;
2897
2898         case P_RS_THIN_REQ:
2899                 /* If at some point in the future we have a smart way to
2900                    find out if this data block is completely deallocated,
2901                    then we would do something smarter here than reading
2902                    the block... */
2903                 peer_req->flags |= EE_RS_THIN_REQ;
2904                 fallthrough;
2905         case P_RS_DATA_REQUEST:
2906                 peer_req->w.cb = w_e_end_rsdata_req;
2907                 fault_type = DRBD_FAULT_RS_RD;
2908                 /* used in the sector offset progress display */
2909                 device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2910                 break;
2911
2912         case P_OV_REPLY:
2913         case P_CSUM_RS_REQUEST:
2914                 fault_type = DRBD_FAULT_RS_RD;
2915                 di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO); /* digest_info header and the digest bytes in one allocation */
2916                 if (!di)
2917                         goto out_free_e; /* NOTE(review): peer_req is not on read_ee yet, but out_free_e does list_del() - confirm w.list is initialised by drbd_alloc_peer_req() */
2918
2919                 di->digest_size = pi->size;
2920                 di->digest = (((char *)di)+sizeof(struct digest_info));
2921
2922                 peer_req->digest = di;
2923                 peer_req->flags |= EE_HAS_DIGEST;
2924
2925                 if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
2926                         goto out_free_e;
2927
2928                 if (pi->cmd == P_CSUM_RS_REQUEST) {
2929                         D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
2930                         peer_req->w.cb = w_e_end_csum_rs_req;
2931                         /* used in the sector offset progress display */
2932                         device->bm_resync_fo = BM_SECT_TO_BIT(sector);
2933                         /* remember to report stats in drbd_resync_finished */
2934                         device->use_csums = true;
2935                 } else if (pi->cmd == P_OV_REPLY) {
2936                         /* track progress, we may need to throttle */
2937                         atomic_add(size >> 9, &device->rs_sect_in);
2938                         peer_req->w.cb = w_e_end_ov_reply;
2939                         dec_rs_pending(device);
2940                         /* drbd_rs_begin_io done when we sent this request,
2941                          * but accounting still needs to be done. */
2942                         goto submit_for_resync;
2943                 }
2944                 break;
2945
2946         case P_OV_REQUEST:
2947                 if (device->ov_start_sector == ~(sector_t)0 &&
2948                     peer_device->connection->agreed_pro_version >= 90) { /* ov_start_sector still unset: initialise verify progress tracking */
2949                         unsigned long now = jiffies;
2950                         int i;
2951                         device->ov_start_sector = sector;
2952                         device->ov_position = sector;
2953                         device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
2954                         device->rs_total = device->ov_left;
2955                         for (i = 0; i < DRBD_SYNC_MARKS; i++) {
2956                                 device->rs_mark_left[i] = device->ov_left;
2957                                 device->rs_mark_time[i] = now;
2958                         }
2959                         drbd_info(device, "Online Verify start sector: %llu\n",
2960                                         (unsigned long long)sector);
2961                 }
2962                 peer_req->w.cb = w_e_end_ov_req;
2963                 fault_type = DRBD_FAULT_RS_RD;
2964                 break;
2965
2966         default:
2967                 BUG();
2968         }
2969
2970         /* Throttle, drbd_rs_begin_io and submit should become asynchronous
2971          * wrt the receiver, but it is not as straightforward as it may seem.
2972          * Various places in the resync start and stop logic assume resync
2973          * requests are processed in order, requeuing this on the worker thread
2974          * introduces a bunch of new code for synchronization between threads.
2975          *
2976          * Unlimited throttling before drbd_rs_begin_io may stall the resync
2977          * "forever", throttling after drbd_rs_begin_io will lock that extent
2978          * for application writes for the same time.  For now, just throttle
2979          * here, where the rest of the code expects the receiver to sleep for
2980          * a while, anyways.
2981          */
2982
2983         /* Throttle before drbd_rs_begin_io, as that locks out application IO;
2984          * this defers syncer requests for some time, before letting at least
2985          * one request through.  The resync controller on the receiving side
2986          * will adapt to the incoming rate accordingly.
2987          *
2988          * We cannot throttle here if remote is Primary/SyncTarget:
2989          * we would also throttle its application reads.
2990          * In that case, throttling is done on the SyncTarget only.
2991          */
2992
2993         /* Even though this may be a resync request, we do add to "read_ee";
2994          * "sync_ee" is only used for resync WRITEs.
2995          * Add to list early, so debugfs can find this request
2996          * even if we have to sleep below. */
2997         spin_lock_irq(&device->resource->req_lock);
2998         list_add_tail(&peer_req->w.list, &device->read_ee);
2999         spin_unlock_irq(&device->resource->req_lock);
3000
3001         update_receiver_timing_details(connection, drbd_rs_should_slow_down);
3002         if (device->state.peer != R_PRIMARY
3003         && drbd_rs_should_slow_down(device, sector, false))
3004                 schedule_timeout_uninterruptible(HZ/10); /* a single sleep of HZ/10 jiffies, then carry on */
3005         update_receiver_timing_details(connection, drbd_rs_begin_io);
3006         if (drbd_rs_begin_io(device, sector))
3007                 goto out_free_e;
3008
3009 submit_for_resync:
3010         atomic_add(size >> 9, &device->rs_sect_ev);
3011
3012 submit:
3013         update_receiver_timing_details(connection, drbd_submit_peer_request);
3014         inc_unacked(device);
3015         if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0,
3016                                      fault_type) == 0)
3017                 return 0;
3018
3019         /* don't care for the reason here */
3020         drbd_err(device, "submit failed, triggering re-connect\n");
3021
3022 out_free_e:
3023         spin_lock_irq(&device->resource->req_lock);
3024         list_del(&peer_req->w.list);
3025         spin_unlock_irq(&device->resource->req_lock);
3026         /* no drbd_rs_complete_io(), we are dropping the connection anyways */
3027
3028         put_ldev(device); /* pairs with the get_ldev_if_state() above */
3029         drbd_free_peer_req(device, peer_req);
3030         return -EIO;
3031 }
3032
3033 /*
3034  * drbd_asb_recover_0p  -  Recover after split-brain with no remaining primaries
3035  */
3036 static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
3037 {
3038         struct drbd_device *device = peer_device->device;
3039         int self, peer, rv = -100;
3040         unsigned long ch_self, ch_peer;
3041         enum drbd_after_sb_p after_sb_0p;
3042
3043         self = device->ldev->md.uuid[UI_BITMAP] & 1;
3044         peer = device->p_uuid[UI_BITMAP] & 1;
3045
3046         ch_peer = device->p_uuid[UI_SIZE];
3047         ch_self = device->comm_bm_set;
3048
3049         rcu_read_lock();
3050         after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
3051         rcu_read_unlock();
3052         switch (after_sb_0p) {
3053         case ASB_CONSENSUS:
3054         case ASB_DISCARD_SECONDARY:
3055         case ASB_CALL_HELPER:
3056         case ASB_VIOLENTLY:
3057                 drbd_err(device, "Configuration error.\n");
3058                 break;
3059         case ASB_DISCONNECT:
3060                 break;
3061         case ASB_DISCARD_YOUNGER_PRI:
3062                 if (self == 0 && peer == 1) {
3063                         rv = -1;
3064                         break;
3065                 }
3066                 if (self == 1 && peer == 0) {
3067                         rv =  1;
3068                         break;
3069                 }
3070                 fallthrough;    /* to one of the other strategies */
3071         case ASB_DISCARD_OLDER_PRI:
3072                 if (self == 0 && peer == 1) {
3073                         rv = 1;
3074                         break;
3075                 }
3076                 if (self == 1 && peer == 0) {
3077                         rv = -1;
3078                         break;
3079                 }
3080                 /* Else fall through to one of the other strategies... */
3081                 drbd_warn(device, "Discard younger/older primary did not find a decision\n"
3082                      "Using discard-least-changes instead\n");
3083                 fallthrough;
3084         case ASB_DISCARD_ZERO_CHG:
3085                 if (ch_peer == 0 && ch_self == 0) {
3086                         rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
3087                                 ? -1 : 1;
3088                         break;
3089                 } else {
3090                         if (ch_peer == 0) { rv =  1; break; }
3091                         if (ch_self == 0) { rv = -1; break; }
3092                 }
3093                 if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
3094                         break;
3095                 fallthrough;
3096         case ASB_DISCARD_LEAST_CHG:
3097                 if      (ch_self < ch_peer)
3098                         rv = -1;
3099                 else if (ch_self > ch_peer)
3100                         rv =  1;
3101                 else /* ( ch_self == ch_peer ) */
3102                      /* Well, then use something else. */
3103                         rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
3104                                 ? -1 : 1;
3105                 break;
3106         case ASB_DISCARD_LOCAL:
3107                 rv = -1;
3108                 break;
3109         case ASB_DISCARD_REMOTE:
3110                 rv =  1;
3111         }
3112
3113         return rv;
3114 }
3115
3116 /*
3117  * drbd_asb_recover_1p  -  Recover after split-brain with one remaining primary
3118  */
3119 static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
3120 {
3121         struct drbd_device *device = peer_device->device;
3122         int hg, rv = -100;
3123         enum drbd_after_sb_p after_sb_1p;
3124
3125         rcu_read_lock();
3126         after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
3127         rcu_read_unlock();
3128         switch (after_sb_1p) {
3129         case ASB_DISCARD_YOUNGER_PRI:
3130         case ASB_DISCARD_OLDER_PRI:
3131         case ASB_DISCARD_LEAST_CHG:
3132         case ASB_DISCARD_LOCAL:
3133         case ASB_DISCARD_REMOTE:
3134         case ASB_DISCARD_ZERO_CHG:
3135                 drbd_err(device, "Configuration error.\n");
3136                 break;
3137         case ASB_DISCONNECT:
3138                 break;
3139         case ASB_CONSENSUS:
3140                 hg = drbd_asb_recover_0p(peer_device);
3141                 if (hg == -1 && device->state.role == R_SECONDARY)
3142                         rv = hg;
3143                 if (hg == 1  && device->state.role == R_PRIMARY)
3144                         rv = hg;
3145                 break;
3146         case ASB_VIOLENTLY:
3147                 rv = drbd_asb_recover_0p(peer_device);
3148                 break;
3149         case ASB_DISCARD_SECONDARY:
3150                 return device->state.role == R_PRIMARY ? 1 : -1;
3151         case ASB_CALL_HELPER:
3152                 hg = drbd_asb_recover_0p(peer_device);
3153                 if (hg == -1 && device->state.role == R_PRIMARY) {
3154                         enum drbd_state_rv rv2;
3155
3156                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
3157                           * we might be here in C_WF_REPORT_PARAMS which is transient.
3158                           * we do not need to wait for the after state change work either. */
3159                         rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
3160                         if (rv2 != SS_SUCCESS) {
3161                                 drbd_khelper(device, "pri-lost-after-sb");
3162                         } else {
3163                                 drbd_warn(device, "Successfully gave up primary role.\n");
3164                                 rv = hg;
3165                         }
3166                 } else
3167                         rv = hg;
3168         }
3169
3170         return rv;
3171 }
3172
3173 /*
3174  * drbd_asb_recover_2p  -  Recover after split-brain with two remaining primaries
3175  */
3176 static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
3177 {
3178         struct drbd_device *device = peer_device->device;
3179         int hg, rv = -100;
3180         enum drbd_after_sb_p after_sb_2p;
3181
3182         rcu_read_lock();
3183         after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
3184         rcu_read_unlock();
3185         switch (after_sb_2p) {
3186         case ASB_DISCARD_YOUNGER_PRI:
3187         case ASB_DISCARD_OLDER_PRI:
3188         case ASB_DISCARD_LEAST_CHG:
3189         case ASB_DISCARD_LOCAL:
3190         case ASB_DISCARD_REMOTE:
3191         case ASB_CONSENSUS:
3192         case ASB_DISCARD_SECONDARY:
3193         case ASB_DISCARD_ZERO_CHG:
3194                 drbd_err(device, "Configuration error.\n");
3195                 break;
3196         case ASB_VIOLENTLY:
3197                 rv = drbd_asb_recover_0p(peer_device);
3198                 break;
3199         case ASB_DISCONNECT:
3200                 break;
3201         case ASB_CALL_HELPER:
3202                 hg = drbd_asb_recover_0p(peer_device);
3203                 if (hg == -1) {
3204                         enum drbd_state_rv rv2;
3205
3206                          /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
3207                           * we might be here in C_WF_REPORT_PARAMS which is transient.
3208                           * we do not need to wait for the after state change work either. */
3209                         rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
3210                         if (rv2 != SS_SUCCESS) {
3211                                 drbd_khelper(device, "pri-lost-after-sb");
3212                         } else {
3213                                 drbd_warn(device, "Successfully gave up primary role.\n");
3214                                 rv = hg;
3215                         }
3216                 } else
3217                         rv = hg;
3218         }
3219
3220         return rv;
3221 }
3222
3223 static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
3224                            u64 bits, u64 flags)
3225 {
3226         if (!uuid) {
3227                 drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
3228                 return;
3229         }
3230         drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
3231              text,
3232              (unsigned long long)uuid[UI_CURRENT],
3233              (unsigned long long)uuid[UI_BITMAP],
3234              (unsigned long long)uuid[UI_HISTORY_START],
3235              (unsigned long long)uuid[UI_HISTORY_END],
3236              (unsigned long long)bits,
3237              (unsigned long long)flags);
3238 }
3239
/*
 * Return values of drbd_uuid_compare():
 *
  100   after split brain try auto recover
    2   C_SYNC_SOURCE set BitMap
    1   C_SYNC_SOURCE use BitMap
    0   no Sync
   -1   C_SYNC_TARGET use BitMap
   -2   C_SYNC_TARGET set BitMap
 -100   after split brain, disconnect
-1000   unrelated data
-1091   requires proto 91
-1096   requires proto 96
 *
 * In addition, rule 41 may return -(0x10000 | protocol | (feature flags << 8));
 * drbd_sync_handshake() decodes that value again.
 */

/*
 * drbd_uuid_compare - compare the local UUIDs (device->ldev->md.uuid) with the
 * peer's UUIDs (device->p_uuid) and derive the resync decision.
 * @device:    device whose UUID sets are compared
 * @peer_role: role of the peer; only consulted by rule 41
 * @rule_nr:   out parameter; holds the rule number most recently assigned
 *             before the function returned
 *
 * The lowest bit of every UUID is masked out before it is compared.
 * Some rules correct the local or the peer UUID set in place before returning.
 */
static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
{
        struct drbd_peer_device *const peer_device = first_peer_device(device);
        /* NOTE(review): connection is dereferenced below without a NULL check;
         * presumably callers guarantee a peer device exists -- confirm. */
        struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
        u64 self, peer;
        int i, j;

        self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
        peer = device->p_uuid[UI_CURRENT] & ~((u64)1);

        /* Rule 10: both current UUIDs are "just created": no sync. */
        *rule_nr = 10;
        if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
                return 0;

        /* Rule 20: only our current UUID is "just created" (or zero). */
        *rule_nr = 20;
        if ((self == UUID_JUST_CREATED || self == (u64)0) &&
             peer != UUID_JUST_CREATED)
                return -2;

        /* Rule 30: only the peer's current UUID is "just created" (or zero). */
        *rule_nr = 30;
        if (self != UUID_JUST_CREATED &&
            (peer == UUID_JUST_CREATED || peer == (u64)0))
                return 2;

        /* Current UUIDs are equal: rules 34 to 41 and the crash arbitration. */
        if (self == peer) {
                int rct, dc; /* roles at crash time */

                /* Only we still have a bitmap UUID: rules 34 and 36. */
                if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {

                        if (connection->agreed_pro_version < 91)
                                return -1091;

                        if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
                            (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
                                drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
                                /* Rotate our bitmap UUID into the history. */
                                drbd_uuid_move_history(device);
                                device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
                                device->ldev->md.uuid[UI_BITMAP] = 0;

                                drbd_uuid_dump(device, "self", device->ldev->md.uuid,
                                               device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
                                *rule_nr = 34;
                        } else {
                                drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
                                *rule_nr = 36;
                        }

                        return 1;
                }

                /* Only the peer still has a bitmap UUID: rules 35 and 37. */
                if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {

                        if (connection->agreed_pro_version < 91)
                                return -1091;

                        if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
                            (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
                                drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");

                                /* Rotate the peer's bitmap UUID into its history
                                 * (in our copy of the peer's UUIDs). */
                                device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
                                device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
                                device->p_uuid[UI_BITMAP] = 0UL;

                                drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
                                *rule_nr = 35;
                        } else {
                                drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
                                *rule_nr = 37;
                        }

                        return -1;
                }

                /* Common power [off|failure] */
                rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
                        (device->p_uuid[UI_FLAGS] & 2);
                /* lowest bit is set when we were primary,
                 * next bit (weight 2) is set when peer was primary */
                *rule_nr = 40;

                /* Neither has the "crashed primary" flag set,
                 * only a replication link hiccup. */
                if (rct == 0)
                        return 0;

                /* Current UUID equal and no bitmap uuid; does not necessarily
                 * mean this was a "simultaneous hard crash", maybe IO was
                 * frozen, so no UUID-bump happened.
                 * This is a protocol change, overload DRBD_FF_WSAME as flag
                 * for "new-enough" peer DRBD version. */
                if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
                        *rule_nr = 41;
                        if (!(connection->agreed_features & DRBD_FF_WSAME)) {
                                drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
                                /* Encodes the required protocol version and
                                 * feature flag into the (negative) result. */
                                return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
                        }
                        if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
                                /* At least one has the "crashed primary" bit set,
                                 * both are primary now, but neither has rotated its UUIDs?
                                 * "Can not happen." */
                                drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
                                return -100;
                        }
                        /* Exactly one side is primary now; it becomes the source. */
                        if (device->state.role == R_PRIMARY)
                                return 1;
                        return -1;
                }

                /* Both are secondary.
                 * Really looks like recovery from simultaneous hard crash.
                 * Check which had been primary before, and arbitrate. */
                switch (rct) {
                case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
                case 1: /*  self_pri && !peer_pri */ return 1;
                case 2: /* !self_pri &&  peer_pri */ return -1;
                case 3: /*  self_pri &&  peer_pri */
                        /* Both were primary: RESOLVE_CONFLICTS is the tie breaker. */
                        dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
                        return dc ? -1 : 1;
                }
        }

        /* Rule 50: our current UUID equals the peer's bitmap UUID. */
        *rule_nr = 50;
        peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
        if (self == peer)
                return -1;

        /* Rule 51: our current UUID equals the peer's first history UUID. */
        *rule_nr = 51;
        peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
        if (self == peer) {
                /* The additional condition depends on the agreed protocol
                 * version (below 96 versus 96 and later). */
                if (connection->agreed_pro_version < 96 ?
                    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
                    (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
                    peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
                        /* The last P_SYNC_UUID did not get through. Undo the last start of
                           resync as sync source modifications of the peer's UUIDs. */

                        if (connection->agreed_pro_version < 91)
                                return -1091;

                        device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
                        device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];

                        drbd_info(device, "Lost last syncUUID packet, corrected:\n");
                        drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);

                        return -1;
                }
        }

        /* Rule 60: our current UUID appears in the peer's history. */
        *rule_nr = 60;
        self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
        for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
                peer = device->p_uuid[i] & ~((u64)1);
                if (self == peer)
                        return -2;
        }

        /* Rule 70: our bitmap UUID equals the peer's current UUID. */
        *rule_nr = 70;
        self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
        peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
        if (self == peer)
                return 1;

        /* Rule 71: our first history UUID equals the peer's current UUID
         * (mirror image of rule 51). */
        *rule_nr = 71;
        self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
        if (self == peer) {
                if (connection->agreed_pro_version < 96 ?
                    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
                    (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
                    self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
                        /* The last P_SYNC_UUID did not get through. Undo the last start of
                           resync as sync source modifications of our UUIDs. */

                        if (connection->agreed_pro_version < 91)
                                return -1091;

                        __drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
                        __drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);

                        drbd_info(device, "Last syncUUID did not get through, corrected:\n");
                        drbd_uuid_dump(device, "self", device->ldev->md.uuid,
                                       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);

                        return 1;
                }
        }


        /* Rule 80: the peer's current UUID appears in our history. */
        *rule_nr = 80;
        peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
        for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
                self = device->ldev->md.uuid[i] & ~((u64)1);
                if (self == peer)
                        return 2;
        }

        /* Rule 90: both bitmap UUIDs are equal and non-zero. */
        *rule_nr = 90;
        self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
        peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
        if (self == peer && self != ((u64)0))
                return 100;

        /* Rule 100: any of our history UUIDs equals any of the peer's. */
        *rule_nr = 100;
        for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
                self = device->ldev->md.uuid[i] & ~((u64)1);
                for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
                        peer = device->p_uuid[j] & ~((u64)1);
                        if (self == peer)
                                return -100;
                }
        }

        /* No rule matched. */
        return -1000;
}
3467
3468 /* drbd_sync_handshake() returns the new conn state on success, or
3469    CONN_MASK (-1) on failure.
3470  */
3471 static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
3472                                            enum drbd_role peer_role,
3473                                            enum drbd_disk_state peer_disk) __must_hold(local)
3474 {
3475         struct drbd_device *device = peer_device->device;
3476         enum drbd_conns rv = C_MASK;
3477         enum drbd_disk_state mydisk;
3478         struct net_conf *nc;
3479         int hg, rule_nr, rr_conflict, tentative, always_asbp;
3480
3481         mydisk = device->state.disk;
3482         if (mydisk == D_NEGOTIATING)
3483                 mydisk = device->new_state_tmp.disk;
3484
3485         drbd_info(device, "drbd_sync_handshake:\n");
3486
3487         spin_lock_irq(&device->ldev->md.uuid_lock);
3488         drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
3489         drbd_uuid_dump(device, "peer", device->p_uuid,
3490                        device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
3491
3492         hg = drbd_uuid_compare(device, peer_role, &rule_nr);
3493         spin_unlock_irq(&device->ldev->md.uuid_lock);
3494
3495         drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
3496
3497         if (hg == -1000) {
3498                 drbd_alert(device, "Unrelated data, aborting!\n");
3499                 return C_MASK;
3500         }
3501         if (hg < -0x10000) {
3502                 int proto, fflags;
3503                 hg = -hg;
3504                 proto = hg & 0xff;
3505                 fflags = (hg >> 8) & 0xff;
3506                 drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n",
3507                                         proto, fflags);
3508                 return C_MASK;
3509         }
3510         if (hg < -1000) {
3511                 drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
3512                 return C_MASK;
3513         }
3514
3515         if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
3516             (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
3517                 int f = (hg == -100) || abs(hg) == 2;
3518                 hg = mydisk > D_INCONSISTENT ? 1 : -1;
3519                 if (f)
3520                         hg = hg*2;
3521                 drbd_info(device, "Becoming sync %s due to disk states.\n",
3522                      hg > 0 ? "source" : "target");
3523         }
3524
3525         if (abs(hg) == 100)
3526                 drbd_khelper(device, "initial-split-brain");
3527
3528         rcu_read_lock();
3529         nc = rcu_dereference(peer_device->connection->net_conf);
3530         always_asbp = nc->always_asbp;
3531         rr_conflict = nc->rr_conflict;
3532         tentative = nc->tentative;
3533         rcu_read_unlock();
3534
3535         if (hg == 100 || (hg == -100 && always_asbp)) {
3536                 int pcount = (device->state.role == R_PRIMARY)
3537                            + (peer_role == R_PRIMARY);
3538                 int forced = (hg == -100);
3539
3540                 switch (pcount) {
3541                 case 0:
3542                         hg = drbd_asb_recover_0p(peer_device);
3543                         break;
3544                 case 1:
3545                         hg = drbd_asb_recover_1p(peer_device);
3546                         break;
3547                 case 2:
3548                         hg = drbd_asb_recover_2p(peer_device);
3549                         break;
3550                 }
3551                 if (abs(hg) < 100) {
3552                         drbd_warn(device, "Split-Brain detected, %d primaries, "
3553                              "automatically solved. Sync from %s node\n",
3554                              pcount, (hg < 0) ? "peer" : "this");
3555                         if (forced) {
3556                                 drbd_warn(device, "Doing a full sync, since"
3557                                      " UUIDs where ambiguous.\n");
3558                                 hg = hg*2;
3559                         }
3560                 }
3561         }
3562
3563         if (hg == -100) {
3564                 if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
3565                         hg = -1;
3566                 if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
3567                         hg = 1;
3568
3569                 if (abs(hg) < 100)
3570                         drbd_warn(device, "Split-Brain detected, manually solved. "
3571                              "Sync from %s node\n",
3572                              (hg < 0) ? "peer" : "this");
3573         }
3574
3575         if (hg == -100) {
3576                 /* FIXME this log message is not correct if we end up here
3577                  * after an attempted attach on a diskless node.
3578                  * We just refuse to attach -- well, we drop the "connection"
3579                  * to that disk, in a way... */
3580                 drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
3581                 drbd_khelper(device, "split-brain");
3582                 return C_MASK;
3583         }
3584
3585         if (hg > 0 && mydisk <= D_INCONSISTENT) {
3586                 drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
3587                 return C_MASK;
3588         }
3589
3590         if (hg < 0 && /* by intention we do not use mydisk here. */
3591             device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
3592                 switch (rr_conflict) {
3593                 case ASB_CALL_HELPER:
3594                         drbd_khelper(device, "pri-lost");
3595                         fallthrough;
3596                 case ASB_DISCONNECT:
3597                         drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
3598                         return C_MASK;
3599                 case ASB_VIOLENTLY:
3600                         drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
3601                              "assumption\n");
3602                 }
3603         }
3604
3605         if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
3606                 if (hg == 0)
3607                         drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
3608                 else
3609                         drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.",
3610                                  drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
3611                                  abs(hg) >= 2 ? "full" : "bit-map based");
3612                 return C_MASK;
3613         }
3614
3615         if (abs(hg) >= 2) {
3616                 drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
3617                 if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
3618                                         BM_LOCKED_SET_ALLOWED))
3619                         return C_MASK;
3620         }
3621
3622         if (hg > 0) { /* become sync source. */
3623                 rv = C_WF_BITMAP_S;
3624         } else if (hg < 0) { /* become sync target */
3625                 rv = C_WF_BITMAP_T;
3626         } else {
3627                 rv = C_CONNECTED;
3628                 if (drbd_bm_total_weight(device)) {
3629                         drbd_info(device, "No resync, but %lu bits in bitmap!\n",
3630                              drbd_bm_total_weight(device));
3631                 }
3632         }
3633
3634         return rv;
3635 }
3636
3637 static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
3638 {
3639         /* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
3640         if (peer == ASB_DISCARD_REMOTE)
3641                 return ASB_DISCARD_LOCAL;
3642
3643         /* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
3644         if (peer == ASB_DISCARD_LOCAL)
3645                 return ASB_DISCARD_REMOTE;
3646
3647         /* everything else is valid if they are equal on both sides. */
3648         return peer;
3649 }
3650
3651 static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
3652 {
3653         struct p_protocol *p = pi->data;
3654         enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
3655         int p_proto, p_discard_my_data, p_two_primaries, cf;
3656         struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
3657         char integrity_alg[SHARED_SECRET_MAX] = "";
3658         struct crypto_shash *peer_integrity_tfm = NULL;
3659         void *int_dig_in = NULL, *int_dig_vv = NULL;
3660
3661         p_proto         = be32_to_cpu(p->protocol);
3662         p_after_sb_0p   = be32_to_cpu(p->after_sb_0p);
3663         p_after_sb_1p   = be32_to_cpu(p->after_sb_1p);
3664         p_after_sb_2p   = be32_to_cpu(p->after_sb_2p);
3665         p_two_primaries = be32_to_cpu(p->two_primaries);
3666         cf              = be32_to_cpu(p->conn_flags);
3667         p_discard_my_data = cf & CF_DISCARD_MY_DATA;
3668
3669         if (connection->agreed_pro_version >= 87) {
3670                 int err;
3671
3672                 if (pi->size > sizeof(integrity_alg))
3673                         return -EIO;
3674                 err = drbd_recv_all(connection, integrity_alg, pi->size);
3675                 if (err)
3676                         return err;
3677                 integrity_alg[SHARED_SECRET_MAX - 1] = 0;
3678         }
3679
3680         if (pi->cmd != P_PROTOCOL_UPDATE) {
3681                 clear_bit(CONN_DRY_RUN, &connection->flags);
3682
3683                 if (cf & CF_DRY_RUN)
3684                         set_bit(CONN_DRY_RUN, &connection->flags);
3685
3686                 rcu_read_lock();
3687                 nc = rcu_dereference(connection->net_conf);
3688
3689                 if (p_proto != nc->wire_protocol) {
3690                         drbd_err(connection, "incompatible %s settings\n", "protocol");
3691                         goto disconnect_rcu_unlock;
3692                 }
3693
3694                 if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
3695                         drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
3696                         goto disconnect_rcu_unlock;
3697                 }
3698
3699                 if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
3700                         drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
3701                         goto disconnect_rcu_unlock;
3702                 }
3703
3704                 if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
3705                         drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
3706                         goto disconnect_rcu_unlock;
3707                 }
3708
3709                 if (p_discard_my_data && nc->discard_my_data) {
3710                         drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
3711                         goto disconnect_rcu_unlock;
3712                 }
3713
3714                 if (p_two_primaries != nc->two_primaries) {
3715                         drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
3716                         goto disconnect_rcu_unlock;
3717                 }
3718
3719                 if (strcmp(integrity_alg, nc->integrity_alg)) {
3720                         drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
3721                         goto disconnect_rcu_unlock;
3722                 }
3723
3724                 rcu_read_unlock();
3725         }
3726
3727         if (integrity_alg[0]) {
3728                 int hash_size;
3729
3730                 /*
3731                  * We can only change the peer data integrity algorithm
3732                  * here.  Changing our own data integrity algorithm
3733                  * requires that we send a P_PROTOCOL_UPDATE packet at
3734                  * the same time; otherwise, the peer has no way to
3735                  * tell between which packets the algorithm should
3736                  * change.
3737                  */
3738
3739                 peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, 0);
3740                 if (IS_ERR(peer_integrity_tfm)) {
3741                         peer_integrity_tfm = NULL;
3742                         drbd_err(connection, "peer data-integrity-alg %s not supported\n",
3743                                  integrity_alg);
3744                         goto disconnect;
3745                 }
3746
3747                 hash_size = crypto_shash_digestsize(peer_integrity_tfm);
3748                 int_dig_in = kmalloc(hash_size, GFP_KERNEL);
3749                 int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
3750                 if (!(int_dig_in && int_dig_vv)) {
3751                         drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
3752                         goto disconnect;
3753                 }
3754         }
3755
3756         new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
3757         if (!new_net_conf)
3758                 goto disconnect;
3759
3760         mutex_lock(&connection->data.mutex);
3761         mutex_lock(&connection->resource->conf_update);
3762         old_net_conf = connection->net_conf;
3763         *new_net_conf = *old_net_conf;
3764
3765         new_net_conf->wire_protocol = p_proto;
3766         new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
3767         new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
3768         new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
3769         new_net_conf->two_primaries = p_two_primaries;
3770
3771         rcu_assign_pointer(connection->net_conf, new_net_conf);
3772         mutex_unlock(&connection->resource->conf_update);
3773         mutex_unlock(&connection->data.mutex);
3774
3775         crypto_free_shash(connection->peer_integrity_tfm);
3776         kfree(connection->int_dig_in);
3777         kfree(connection->int_dig_vv);
3778         connection->peer_integrity_tfm = peer_integrity_tfm;
3779         connection->int_dig_in = int_dig_in;
3780         connection->int_dig_vv = int_dig_vv;
3781
3782         if (strcmp(old_net_conf->integrity_alg, integrity_alg))
3783                 drbd_info(connection, "peer data-integrity-alg: %s\n",
3784                           integrity_alg[0] ? integrity_alg : "(none)");
3785
3786         synchronize_rcu();
3787         kfree(old_net_conf);
3788         return 0;
3789
3790 disconnect_rcu_unlock:
3791         rcu_read_unlock();
3792 disconnect:
3793         crypto_free_shash(peer_integrity_tfm);
3794         kfree(int_dig_in);
3795         kfree(int_dig_vv);
3796         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
3797         return -EIO;
3798 }
3799
3800 /* helper function
3801  * input: alg name, feature name
3802  * return: NULL (alg name was "")
3803  *         ERR_PTR(error) if something goes wrong
3804  *         or the crypto hash ptr, if it worked out ok. */
3805 static struct crypto_shash *drbd_crypto_alloc_digest_safe(
3806                 const struct drbd_device *device,
3807                 const char *alg, const char *name)
3808 {
3809         struct crypto_shash *tfm;
3810
3811         if (!alg[0])
3812                 return NULL;
3813
3814         tfm = crypto_alloc_shash(alg, 0, 0);
3815         if (IS_ERR(tfm)) {
3816                 drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
3817                         alg, name, PTR_ERR(tfm));
3818                 return tfm;
3819         }
3820         return tfm;
3821 }
3822
3823 static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
3824 {
3825         void *buffer = connection->data.rbuf;
3826         int size = pi->size;
3827
3828         while (size) {
3829                 int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
3830                 s = drbd_recv(connection, buffer, s);
3831                 if (s <= 0) {
3832                         if (s < 0)
3833                                 return s;
3834                         break;
3835                 }
3836                 size -= s;
3837         }
3838         if (size)
3839                 return -EIO;
3840         return 0;
3841 }
3842
3843 /*
3844  * config_unknown_volume  -  device configuration command for unknown volume
3845  *
3846  * When a device is added to an existing connection, the node on which the
3847  * device is added first will send configuration commands to its peer but the
3848  * peer will not know about the device yet.  It will warn and ignore these
3849  * commands.  Once the device is added on the second node, the second node will
3850  * send the same device configuration commands, but in the other direction.
3851  *
3852  * (We can also end up here if drbd is misconfigured.)
3853  */
3854 static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
3855 {
3856         drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
3857                   cmdname(pi->cmd), pi->vnr);
3858         return ignore_remaining_packet(connection, pi);
3859 }
3860
3861 static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
3862 {
3863         struct drbd_peer_device *peer_device;
3864         struct drbd_device *device;
3865         struct p_rs_param_95 *p;
3866         unsigned int header_size, data_size, exp_max_sz;
3867         struct crypto_shash *verify_tfm = NULL;
3868         struct crypto_shash *csums_tfm = NULL;
3869         struct net_conf *old_net_conf, *new_net_conf = NULL;
3870         struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
3871         const int apv = connection->agreed_pro_version;
3872         struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
3873         unsigned int fifo_size = 0;
3874         int err;
3875
3876         peer_device = conn_peer_device(connection, pi->vnr);
3877         if (!peer_device)
3878                 return config_unknown_volume(connection, pi);
3879         device = peer_device->device;
3880
3881         exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
3882                     : apv == 88 ? sizeof(struct p_rs_param)
3883                                         + SHARED_SECRET_MAX
3884                     : apv <= 94 ? sizeof(struct p_rs_param_89)
3885                     : /* apv >= 95 */ sizeof(struct p_rs_param_95);
3886
3887         if (pi->size > exp_max_sz) {
3888                 drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
3889                     pi->size, exp_max_sz);
3890                 return -EIO;
3891         }
3892
3893         if (apv <= 88) {
3894                 header_size = sizeof(struct p_rs_param);
3895                 data_size = pi->size - header_size;
3896         } else if (apv <= 94) {
3897                 header_size = sizeof(struct p_rs_param_89);
3898                 data_size = pi->size - header_size;
3899                 D_ASSERT(device, data_size == 0);
3900         } else {
3901                 header_size = sizeof(struct p_rs_param_95);
3902                 data_size = pi->size - header_size;
3903                 D_ASSERT(device, data_size == 0);
3904         }
3905
3906         /* initialize verify_alg and csums_alg */
3907         p = pi->data;
3908         BUILD_BUG_ON(sizeof(p->algs) != 2 * SHARED_SECRET_MAX);
3909         memset(&p->algs, 0, sizeof(p->algs));
3910
3911         err = drbd_recv_all(peer_device->connection, p, header_size);
3912         if (err)
3913                 return err;
3914
3915         mutex_lock(&connection->resource->conf_update);
3916         old_net_conf = peer_device->connection->net_conf;
3917         if (get_ldev(device)) {
3918                 new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
3919                 if (!new_disk_conf) {
3920                         put_ldev(device);
3921                         mutex_unlock(&connection->resource->conf_update);
3922                         drbd_err(device, "Allocation of new disk_conf failed\n");
3923                         return -ENOMEM;
3924                 }
3925
3926                 old_disk_conf = device->ldev->disk_conf;
3927                 *new_disk_conf = *old_disk_conf;
3928
3929                 new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
3930         }
3931
3932         if (apv >= 88) {
3933                 if (apv == 88) {
3934                         if (data_size > SHARED_SECRET_MAX || data_size == 0) {
3935                                 drbd_err(device, "verify-alg of wrong size, "
3936                                         "peer wants %u, accepting only up to %u byte\n",
3937                                         data_size, SHARED_SECRET_MAX);
3938                                 err = -EIO;
3939                                 goto reconnect;
3940                         }
3941
3942                         err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
3943                         if (err)
3944                                 goto reconnect;
3945                         /* we expect NUL terminated string */
3946                         /* but just in case someone tries to be evil */
3947                         D_ASSERT(device, p->verify_alg[data_size-1] == 0);
3948                         p->verify_alg[data_size-1] = 0;
3949
3950                 } else /* apv >= 89 */ {
3951                         /* we still expect NUL terminated strings */
3952                         /* but just in case someone tries to be evil */
3953                         D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
3954                         D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
3955                         p->verify_alg[SHARED_SECRET_MAX-1] = 0;
3956                         p->csums_alg[SHARED_SECRET_MAX-1] = 0;
3957                 }
3958
3959                 if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
3960                         if (device->state.conn == C_WF_REPORT_PARAMS) {
3961                                 drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
3962                                     old_net_conf->verify_alg, p->verify_alg);
3963                                 goto disconnect;
3964                         }
3965                         verify_tfm = drbd_crypto_alloc_digest_safe(device,
3966                                         p->verify_alg, "verify-alg");
3967                         if (IS_ERR(verify_tfm)) {
3968                                 verify_tfm = NULL;
3969                                 goto disconnect;
3970                         }
3971                 }
3972
3973                 if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
3974                         if (device->state.conn == C_WF_REPORT_PARAMS) {
3975                                 drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
3976                                     old_net_conf->csums_alg, p->csums_alg);
3977                                 goto disconnect;
3978                         }
3979                         csums_tfm = drbd_crypto_alloc_digest_safe(device,
3980                                         p->csums_alg, "csums-alg");
3981                         if (IS_ERR(csums_tfm)) {
3982                                 csums_tfm = NULL;
3983                                 goto disconnect;
3984                         }
3985                 }
3986
3987                 if (apv > 94 && new_disk_conf) {
3988                         new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
3989                         new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
3990                         new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
3991                         new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
3992
3993                         fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
3994                         if (fifo_size != device->rs_plan_s->size) {
3995                                 new_plan = fifo_alloc(fifo_size);
3996                                 if (!new_plan) {
3997                                         drbd_err(device, "kmalloc of fifo_buffer failed");
3998                                         put_ldev(device);
3999                                         goto disconnect;
4000                                 }
4001                         }
4002                 }
4003
4004                 if (verify_tfm || csums_tfm) {
4005                         new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
4006                         if (!new_net_conf)
4007                                 goto disconnect;
4008
4009                         *new_net_conf = *old_net_conf;
4010
4011                         if (verify_tfm) {
4012                                 strcpy(new_net_conf->verify_alg, p->verify_alg);
4013                                 new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
4014                                 crypto_free_shash(peer_device->connection->verify_tfm);
4015                                 peer_device->connection->verify_tfm = verify_tfm;
4016                                 drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
4017                         }
4018                         if (csums_tfm) {
4019                                 strcpy(new_net_conf->csums_alg, p->csums_alg);
4020                                 new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
4021                                 crypto_free_shash(peer_device->connection->csums_tfm);
4022                                 peer_device->connection->csums_tfm = csums_tfm;
4023                                 drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
4024                         }
4025                         rcu_assign_pointer(connection->net_conf, new_net_conf);
4026                 }
4027         }
4028
4029         if (new_disk_conf) {
4030                 rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
4031                 put_ldev(device);
4032         }
4033
4034         if (new_plan) {
4035                 old_plan = device->rs_plan_s;
4036                 rcu_assign_pointer(device->rs_plan_s, new_plan);
4037         }
4038
4039         mutex_unlock(&connection->resource->conf_update);
4040         synchronize_rcu();
4041         if (new_net_conf)
4042                 kfree(old_net_conf);
4043         kfree(old_disk_conf);
4044         kfree(old_plan);
4045
4046         return 0;
4047
4048 reconnect:
4049         if (new_disk_conf) {
4050                 put_ldev(device);
4051                 kfree(new_disk_conf);
4052         }
4053         mutex_unlock(&connection->resource->conf_update);
4054         return -EIO;
4055
4056 disconnect:
4057         kfree(new_plan);
4058         if (new_disk_conf) {
4059                 put_ldev(device);
4060                 kfree(new_disk_conf);
4061         }
4062         mutex_unlock(&connection->resource->conf_update);
4063         /* just for completeness: actually not needed,
4064          * as this is not reached if csums_tfm was ok. */
4065         crypto_free_shash(csums_tfm);
4066         /* but free the verify_tfm again, if csums_tfm did not work out */
4067         crypto_free_shash(verify_tfm);
4068         conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4069         return -EIO;
4070 }
4071
4072 /* warn if the arguments differ by more than 12.5% */
4073 static void warn_if_differ_considerably(struct drbd_device *device,
4074         const char *s, sector_t a, sector_t b)
4075 {
4076         sector_t d;
4077         if (a == 0 || b == 0)
4078                 return;
4079         d = (a > b) ? (a - b) : (b - a);
4080         if (d > (a>>3) || d > (b>>3))
4081                 drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
4082                      (unsigned long long)a, (unsigned long long)b);
4083 }
4084
4085 static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
4086 {
4087         struct drbd_peer_device *peer_device;
4088         struct drbd_device *device;
4089         struct p_sizes *p = pi->data;
4090         struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
4091         enum determine_dev_size dd = DS_UNCHANGED;
4092         sector_t p_size, p_usize, p_csize, my_usize;
4093         sector_t new_size, cur_size;
4094         int ldsc = 0; /* local disk size changed */
4095         enum dds_flags ddsf;
4096
4097         peer_device = conn_peer_device(connection, pi->vnr);
4098         if (!peer_device)
4099                 return config_unknown_volume(connection, pi);
4100         device = peer_device->device;
4101         cur_size = get_capacity(device->vdisk);
4102
4103         p_size = be64_to_cpu(p->d_size);
4104         p_usize = be64_to_cpu(p->u_size);
4105         p_csize = be64_to_cpu(p->c_size);
4106
4107         /* just store the peer's disk size for now.
4108          * we still need to figure out whether we accept that. */
4109         device->p_size = p_size;
4110
4111         if (get_ldev(device)) {
4112                 rcu_read_lock();
4113                 my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
4114                 rcu_read_unlock();
4115
4116                 warn_if_differ_considerably(device, "lower level device sizes",
4117                            p_size, drbd_get_max_capacity(device->ldev));
4118                 warn_if_differ_considerably(device, "user requested size",
4119                                             p_usize, my_usize);
4120
4121                 /* if this is the first connect, or an otherwise expected
4122                  * param exchange, choose the minimum */
4123                 if (device->state.conn == C_WF_REPORT_PARAMS)
4124                         p_usize = min_not_zero(my_usize, p_usize);
4125
4126                 /* Never shrink a device with usable data during connect,
4127                  * or "attach" on the peer.
4128                  * But allow online shrinking if we are connected. */
4129                 new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
4130                 if (new_size < cur_size &&
4131                     device->state.disk >= D_OUTDATED &&
4132                     (device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS)) {
4133                         drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
4134                                         (unsigned long long)new_size, (unsigned long long)cur_size);
4135                         conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4136                         put_ldev(device);
4137                         return -EIO;
4138                 }
4139
4140                 if (my_usize != p_usize) {
4141                         struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
4142
4143                         new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
4144                         if (!new_disk_conf) {
4145                                 put_ldev(device);
4146                                 return -ENOMEM;
4147                         }
4148
4149                         mutex_lock(&connection->resource->conf_update);
4150                         old_disk_conf = device->ldev->disk_conf;
4151                         *new_disk_conf = *old_disk_conf;
4152                         new_disk_conf->disk_size = p_usize;
4153
4154                         rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
4155                         mutex_unlock(&connection->resource->conf_update);
4156                         synchronize_rcu();
4157                         kfree(old_disk_conf);
4158
4159                         drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n",
4160                                  (unsigned long)p_usize, (unsigned long)my_usize);
4161                 }
4162
4163                 put_ldev(device);
4164         }
4165
4166         device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
4167         /* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size().
4168            In case we cleared the QUEUE_FLAG_DISCARD from our queue in
4169            drbd_reconsider_queue_parameters(), we can be sure that after
4170            drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
4171
4172         ddsf = be16_to_cpu(p->dds_flags);
4173         if (get_ldev(device)) {
4174                 drbd_reconsider_queue_parameters(device, device->ldev, o);
4175                 dd = drbd_determine_dev_size(device, ddsf, NULL);
4176                 put_ldev(device);
4177                 if (dd == DS_ERROR)
4178                         return -EIO;
4179                 drbd_md_sync(device);
4180         } else {
4181                 /*
4182                  * I am diskless, need to accept the peer's *current* size.
4183                  * I must NOT accept the peers backing disk size,
4184                  * it may have been larger than mine all along...
4185                  *
4186                  * At this point, the peer knows more about my disk, or at
4187                  * least about what we last agreed upon, than myself.
4188                  * So if his c_size is less than his d_size, the most likely
4189                  * reason is that *my* d_size was smaller last time we checked.
4190                  *
4191                  * However, if he sends a zero current size,
4192                  * take his (user-capped or) backing disk size anyways.
4193                  *
4194                  * Unless of course he does not have a disk himself.
4195                  * In which case we ignore this completely.
4196                  */
4197                 sector_t new_size = p_csize ?: p_usize ?: p_size;
4198                 drbd_reconsider_queue_parameters(device, NULL, o);
4199                 if (new_size == 0) {
4200                         /* Ignore, peer does not know nothing. */
4201                 } else if (new_size == cur_size) {
4202                         /* nothing to do */
4203                 } else if (cur_size != 0 && p_size == 0) {
4204                         drbd_warn(device, "Ignored diskless peer device size (peer:%llu != me:%llu sectors)!\n",
4205                                         (unsigned long long)new_size, (unsigned long long)cur_size);
4206                 } else if (new_size < cur_size && device->state.role == R_PRIMARY) {
4207                         drbd_err(device, "The peer's device size is too small! (%llu < %llu sectors); demote me first!\n",
4208                                         (unsigned long long)new_size, (unsigned long long)cur_size);
4209                         conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4210                         return -EIO;
4211                 } else {
4212                         /* I believe the peer, if
4213                          *  - I don't have a current size myself
4214                          *  - we agree on the size anyways
4215                          *  - I do have a current size, am Secondary,
4216                          *    and he has the only disk
4217                          *  - I do have a current size, am Primary,
4218                          *    and he has the only disk,
4219                          *    which is larger than my current size
4220                          */
4221                         drbd_set_my_capacity(device, new_size);
4222                 }
4223         }
4224
4225         if (get_ldev(device)) {
4226                 if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
4227                         device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
4228                         ldsc = 1;
4229                 }
4230
4231                 put_ldev(device);
4232         }
4233
4234         if (device->state.conn > C_WF_REPORT_PARAMS) {
4235                 if (be64_to_cpu(p->c_size) != get_capacity(device->vdisk) ||
4236                     ldsc) {
4237                         /* we have different sizes, probably peer
4238                          * needs to know my new size... */
4239                         drbd_send_sizes(peer_device, 0, ddsf);
4240                 }
4241                 if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
4242                     (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
4243                         if (device->state.pdsk >= D_INCONSISTENT &&
4244                             device->state.disk >= D_INCONSISTENT) {
4245                                 if (ddsf & DDSF_NO_RESYNC)
4246                                         drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
4247                                 else
4248                                         resync_after_online_grow(device);
4249                         } else
4250                                 set_bit(RESYNC_AFTER_NEG, &device->flags);
4251                 }
4252         }
4253
4254         return 0;
4255 }
4256
4257 static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
4258 {
4259         struct drbd_peer_device *peer_device;
4260         struct drbd_device *device;
4261         struct p_uuids *p = pi->data;
4262         u64 *p_uuid;
4263         int i, updated_uuids = 0;
4264
4265         peer_device = conn_peer_device(connection, pi->vnr);
4266         if (!peer_device)
4267                 return config_unknown_volume(connection, pi);
4268         device = peer_device->device;
4269
4270         p_uuid = kmalloc_array(UI_EXTENDED_SIZE, sizeof(*p_uuid), GFP_NOIO);
4271         if (!p_uuid)
4272                 return false;
4273
4274         for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
4275                 p_uuid[i] = be64_to_cpu(p->uuid[i]);
4276
4277         kfree(device->p_uuid);
4278         device->p_uuid = p_uuid;
4279
4280         if ((device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS) &&
4281             device->state.disk < D_INCONSISTENT &&
4282             device->state.role == R_PRIMARY &&
4283             (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
4284                 drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
4285                     (unsigned long long)device->ed_uuid);
4286                 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4287                 return -EIO;
4288         }
4289
4290         if (get_ldev(device)) {
4291                 int skip_initial_sync =
4292                         device->state.conn == C_CONNECTED &&
4293                         peer_device->connection->agreed_pro_version >= 90 &&
4294                         device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
4295                         (p_uuid[UI_FLAGS] & 8);
4296                 if (skip_initial_sync) {
4297                         drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
4298                         drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
4299                                         "clear_n_write from receive_uuids",
4300                                         BM_LOCKED_TEST_ALLOWED);
4301                         _drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
4302                         _drbd_uuid_set(device, UI_BITMAP, 0);
4303                         _drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
4304                                         CS_VERBOSE, NULL);
4305                         drbd_md_sync(device);
4306                         updated_uuids = 1;
4307                 }
4308                 put_ldev(device);
4309         } else if (device->state.disk < D_INCONSISTENT &&
4310                    device->state.role == R_PRIMARY) {
4311                 /* I am a diskless primary, the peer just created a new current UUID
4312                    for me. */
4313                 updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
4314         }
4315
4316         /* Before we test for the disk state, we should wait until an eventually
4317            ongoing cluster wide state change is finished. That is important if
4318            we are primary and are detaching from our disk. We need to see the
4319            new disk state... */
4320         mutex_lock(device->state_mutex);
4321         mutex_unlock(device->state_mutex);
4322         if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
4323                 updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
4324
4325         if (updated_uuids)
4326                 drbd_print_uuids(device, "receiver updated UUIDs to");
4327
4328         return 0;
4329 }
4330
4331 /**
4332  * convert_state() - Converts the peer's view of the cluster state to our point of view
4333  * @ps:         The state as seen by the peer.
4334  */
4335 static union drbd_state convert_state(union drbd_state ps)
4336 {
4337         union drbd_state ms;
4338
4339         static enum drbd_conns c_tab[] = {
4340                 [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
4341                 [C_CONNECTED] = C_CONNECTED,
4342
4343                 [C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
4344                 [C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
4345                 [C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
4346                 [C_VERIFY_S]       = C_VERIFY_T,
4347                 [C_MASK]   = C_MASK,
4348         };
4349
4350         ms.i = ps.i;
4351
4352         ms.conn = c_tab[ps.conn];
4353         ms.peer = ps.role;
4354         ms.role = ps.peer;
4355         ms.pdsk = ps.disk;
4356         ms.disk = ps.pdsk;
4357         ms.peer_isp = (ps.aftr_isp | ps.user_isp);
4358
4359         return ms;
4360 }
4361
4362 static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
4363 {
4364         struct drbd_peer_device *peer_device;
4365         struct drbd_device *device;
4366         struct p_req_state *p = pi->data;
4367         union drbd_state mask, val;
4368         enum drbd_state_rv rv;
4369
4370         peer_device = conn_peer_device(connection, pi->vnr);
4371         if (!peer_device)
4372                 return -EIO;
4373         device = peer_device->device;
4374
4375         mask.i = be32_to_cpu(p->mask);
4376         val.i = be32_to_cpu(p->val);
4377
4378         if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
4379             mutex_is_locked(device->state_mutex)) {
4380                 drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
4381                 return 0;
4382         }
4383
4384         mask = convert_state(mask);
4385         val = convert_state(val);
4386
4387         rv = drbd_change_state(device, CS_VERBOSE, mask, val);
4388         drbd_send_sr_reply(peer_device, rv);
4389
4390         drbd_md_sync(device);
4391
4392         return 0;
4393 }
4394
4395 static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
4396 {
4397         struct p_req_state *p = pi->data;
4398         union drbd_state mask, val;
4399         enum drbd_state_rv rv;
4400
4401         mask.i = be32_to_cpu(p->mask);
4402         val.i = be32_to_cpu(p->val);
4403
4404         if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
4405             mutex_is_locked(&connection->cstate_mutex)) {
4406                 conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
4407                 return 0;
4408         }
4409
4410         mask = convert_state(mask);
4411         val = convert_state(val);
4412
4413         rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
4414         conn_send_sr_reply(connection, rv);
4415
4416         return 0;
4417 }
4418
4419 static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
4420 {
4421         struct drbd_peer_device *peer_device;
4422         struct drbd_device *device;
4423         struct p_state *p = pi->data;
4424         union drbd_state os, ns, peer_state;
4425         enum drbd_disk_state real_peer_disk;
4426         enum chg_state_flags cs_flags;
4427         int rv;
4428
4429         peer_device = conn_peer_device(connection, pi->vnr);
4430         if (!peer_device)
4431                 return config_unknown_volume(connection, pi);
4432         device = peer_device->device;
4433
4434         peer_state.i = be32_to_cpu(p->state);
4435
4436         real_peer_disk = peer_state.disk;
4437         if (peer_state.disk == D_NEGOTIATING) {
4438                 real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
4439                 drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
4440         }
4441
4442         spin_lock_irq(&device->resource->req_lock);
4443  retry:
4444         os = ns = drbd_read_state(device);
4445         spin_unlock_irq(&device->resource->req_lock);
4446
4447         /* If some other part of the code (ack_receiver thread, timeout)
4448          * already decided to close the connection again,
4449          * we must not "re-establish" it here. */
4450         if (os.conn <= C_TEAR_DOWN)
4451                 return -ECONNRESET;
4452
4453         /* If this is the "end of sync" confirmation, usually the peer disk
4454          * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
4455          * set) resync started in PausedSyncT, or if the timing of pause-/
4456          * unpause-sync events has been "just right", the peer disk may
4457          * transition from D_CONSISTENT to D_UP_TO_DATE as well.
4458          */
4459         if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
4460             real_peer_disk == D_UP_TO_DATE &&
4461             os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
4462                 /* If we are (becoming) SyncSource, but peer is still in sync
4463                  * preparation, ignore its uptodate-ness to avoid flapping, it
4464                  * will change to inconsistent once the peer reaches active
4465                  * syncing states.
4466                  * It may have changed syncer-paused flags, however, so we
4467                  * cannot ignore this completely. */
4468                 if (peer_state.conn > C_CONNECTED &&
4469                     peer_state.conn < C_SYNC_SOURCE)
4470                         real_peer_disk = D_INCONSISTENT;
4471
4472                 /* if peer_state changes to connected at the same time,
4473                  * it explicitly notifies us that it finished resync.
4474                  * Maybe we should finish it up, too? */
4475                 else if (os.conn >= C_SYNC_SOURCE &&
4476                          peer_state.conn == C_CONNECTED) {
4477                         if (drbd_bm_total_weight(device) <= device->rs_failed)
4478                                 drbd_resync_finished(device);
4479                         return 0;
4480                 }
4481         }
4482
4483         /* explicit verify finished notification, stop sector reached. */
4484         if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
4485             peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
4486                 ov_out_of_sync_print(device);
4487                 drbd_resync_finished(device);
4488                 return 0;
4489         }
4490
4491         /* peer says his disk is inconsistent, while we think it is uptodate,
4492          * and this happens while the peer still thinks we have a sync going on,
4493          * but we think we are already done with the sync.
4494          * We ignore this to avoid flapping pdsk.
4495          * This should not happen, if the peer is a recent version of drbd. */
4496         if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
4497             os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
4498                 real_peer_disk = D_UP_TO_DATE;
4499
4500         if (ns.conn == C_WF_REPORT_PARAMS)
4501                 ns.conn = C_CONNECTED;
4502
4503         if (peer_state.conn == C_AHEAD)
4504                 ns.conn = C_BEHIND;
4505
4506         /* TODO:
4507          * if (primary and diskless and peer uuid != effective uuid)
4508          *     abort attach on peer;
4509          *
4510          * If this node does not have good data, was already connected, but
4511          * the peer did a late attach only now, trying to "negotiate" with me,
4512          * AND I am currently Primary, possibly frozen, with some specific
4513          * "effective" uuid, this should never be reached, really, because
4514          * we first send the uuids, then the current state.
4515          *
4516          * In this scenario, we already dropped the connection hard
4517          * when we received the unsuitable uuids (receive_uuids().
4518          *
4519          * Should we want to change this, that is: not drop the connection in
4520          * receive_uuids() already, then we would need to add a branch here
4521          * that aborts the attach of "unsuitable uuids" on the peer in case
4522          * this node is currently Diskless Primary.
4523          */
4524
4525         if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
4526             get_ldev_if_state(device, D_NEGOTIATING)) {
4527                 int cr; /* consider resync */
4528
4529                 /* if we established a new connection */
4530                 cr  = (os.conn < C_CONNECTED);
4531                 /* if we had an established connection
4532                  * and one of the nodes newly attaches a disk */
4533                 cr |= (os.conn == C_CONNECTED &&
4534                        (peer_state.disk == D_NEGOTIATING ||
4535                         os.disk == D_NEGOTIATING));
4536                 /* if we have both been inconsistent, and the peer has been
4537                  * forced to be UpToDate with --force */
4538                 cr |= test_bit(CONSIDER_RESYNC, &device->flags);
4539                 /* if we had been plain connected, and the admin requested to
4540                  * start a sync by "invalidate" or "invalidate-remote" */
4541                 cr |= (os.conn == C_CONNECTED &&
4542                                 (peer_state.conn >= C_STARTING_SYNC_S &&
4543                                  peer_state.conn <= C_WF_BITMAP_T));
4544
4545                 if (cr)
4546                         ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
4547
4548                 put_ldev(device);
4549                 if (ns.conn == C_MASK) {
4550                         ns.conn = C_CONNECTED;
4551                         if (device->state.disk == D_NEGOTIATING) {
4552                                 drbd_force_state(device, NS(disk, D_FAILED));
4553                         } else if (peer_state.disk == D_NEGOTIATING) {
4554                                 drbd_err(device, "Disk attach process on the peer node was aborted.\n");
4555                                 peer_state.disk = D_DISKLESS;
4556                                 real_peer_disk = D_DISKLESS;
4557                         } else {
4558                                 if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
4559                                         return -EIO;
4560                                 D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
4561                                 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4562                                 return -EIO;
4563                         }
4564                 }
4565         }
4566
4567         spin_lock_irq(&device->resource->req_lock);
4568         if (os.i != drbd_read_state(device).i)
4569                 goto retry;
4570         clear_bit(CONSIDER_RESYNC, &device->flags);
4571         ns.peer = peer_state.role;
4572         ns.pdsk = real_peer_disk;
4573         ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
4574         if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
4575                 ns.disk = device->new_state_tmp.disk;
4576         cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
4577         if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
4578             test_bit(NEW_CUR_UUID, &device->flags)) {
4579                 /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
4580                    for temporal network outages! */
4581                 spin_unlock_irq(&device->resource->req_lock);
4582                 drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
4583                 tl_clear(peer_device->connection);
4584                 drbd_uuid_new_current(device);
4585                 clear_bit(NEW_CUR_UUID, &device->flags);
4586                 conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
4587                 return -EIO;
4588         }
4589         rv = _drbd_set_state(device, ns, cs_flags, NULL);
4590         ns = drbd_read_state(device);
4591         spin_unlock_irq(&device->resource->req_lock);
4592
4593         if (rv < SS_SUCCESS) {
4594                 conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
4595                 return -EIO;
4596         }
4597
4598         if (os.conn > C_WF_REPORT_PARAMS) {
4599                 if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
4600                     peer_state.disk != D_NEGOTIATING ) {
4601                         /* we want resync, peer has not yet decided to sync... */
4602                         /* Nowadays only used when forcing a node into primary role and
4603                            setting its disk to UpToDate with that */
4604                         drbd_send_uuids(peer_device);
4605                         drbd_send_current_state(peer_device);
4606                 }
4607         }
4608
4609         clear_bit(DISCARD_MY_DATA, &device->flags);
4610
4611         drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
4612
4613         return 0;
4614 }
4615
4616 static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
4617 {
4618         struct drbd_peer_device *peer_device;
4619         struct drbd_device *device;
4620         struct p_rs_uuid *p = pi->data;
4621
4622         peer_device = conn_peer_device(connection, pi->vnr);
4623         if (!peer_device)
4624                 return -EIO;
4625         device = peer_device->device;
4626
4627         wait_event(device->misc_wait,
4628                    device->state.conn == C_WF_SYNC_UUID ||
4629                    device->state.conn == C_BEHIND ||
4630                    device->state.conn < C_CONNECTED ||
4631                    device->state.disk < D_NEGOTIATING);
4632
4633         /* D_ASSERT(device,  device->state.conn == C_WF_SYNC_UUID ); */
4634
4635         /* Here the _drbd_uuid_ functions are right, current should
4636            _not_ be rotated into the history */
4637         if (get_ldev_if_state(device, D_NEGOTIATING)) {
4638                 _drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
4639                 _drbd_uuid_set(device, UI_BITMAP, 0UL);
4640
4641                 drbd_print_uuids(device, "updated sync uuid");
4642                 drbd_start_resync(device, C_SYNC_TARGET);
4643
4644                 put_ldev(device);
4645         } else
4646                 drbd_err(device, "Ignoring SyncUUID packet!\n");
4647
4648         return 0;
4649 }
4650
4651 /*
4652  * receive_bitmap_plain
4653  *
4654  * Return 0 when done, 1 when another iteration is needed, and a negative error
4655  * code upon failure.
4656  */
4657 static int
4658 receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
4659                      unsigned long *p, struct bm_xfer_ctx *c)
4660 {
4661         unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
4662                                  drbd_header_size(peer_device->connection);
4663         unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
4664                                        c->bm_words - c->word_offset);
4665         unsigned int want = num_words * sizeof(*p);
4666         int err;
4667
4668         if (want != size) {
4669                 drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
4670                 return -EIO;
4671         }
4672         if (want == 0)
4673                 return 0;
4674         err = drbd_recv_all(peer_device->connection, p, want);
4675         if (err)
4676                 return err;
4677
4678         drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
4679
4680         c->word_offset += num_words;
4681         c->bit_offset = c->word_offset * BITS_PER_LONG;
4682         if (c->bit_offset > c->bm_bits)
4683                 c->bit_offset = c->bm_bits;
4684
4685         return 1;
4686 }
4687
4688 static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
4689 {
4690         return (enum drbd_bitmap_code)(p->encoding & 0x0f);
4691 }
4692
4693 static int dcbp_get_start(struct p_compressed_bm *p)
4694 {
4695         return (p->encoding & 0x80) != 0;
4696 }
4697
4698 static int dcbp_get_pad_bits(struct p_compressed_bm *p)
4699 {
4700         return (p->encoding >> 4) & 0x7;
4701 }
4702
4703 /*
4704  * recv_bm_rle_bits
4705  *
4706  * Return 0 when done, 1 when another iteration is needed, and a negative error
4707  * code upon failure.
4708  */
4709 static int
4710 recv_bm_rle_bits(struct drbd_peer_device *peer_device,
4711                 struct p_compressed_bm *p,
4712                  struct bm_xfer_ctx *c,
4713                  unsigned int len)
4714 {
4715         struct bitstream bs;
4716         u64 look_ahead;
4717         u64 rl;
4718         u64 tmp;
4719         unsigned long s = c->bit_offset;
4720         unsigned long e;
4721         int toggle = dcbp_get_start(p);
4722         int have;
4723         int bits;
4724
4725         bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
4726
4727         bits = bitstream_get_bits(&bs, &look_ahead, 64);
4728         if (bits < 0)
4729                 return -EIO;
4730
4731         for (have = bits; have > 0; s += rl, toggle = !toggle) {
4732                 bits = vli_decode_bits(&rl, look_ahead);
4733                 if (bits <= 0)
4734                         return -EIO;
4735
4736                 if (toggle) {
4737                         e = s + rl -1;
4738                         if (e >= c->bm_bits) {
4739                                 drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
4740                                 return -EIO;
4741                         }
4742                         _drbd_bm_set_bits(peer_device->device, s, e);
4743                 }
4744
4745                 if (have < bits) {
4746                         drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
4747                                 have, bits, look_ahead,
4748                                 (unsigned int)(bs.cur.b - p->code),
4749                                 (unsigned int)bs.buf_len);
4750                         return -EIO;
4751                 }
4752                 /* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
4753                 if (likely(bits < 64))
4754                         look_ahead >>= bits;
4755                 else
4756                         look_ahead = 0;
4757                 have -= bits;
4758
4759                 bits = bitstream_get_bits(&bs, &tmp, 64 - have);
4760                 if (bits < 0)
4761                         return -EIO;
4762                 look_ahead |= tmp << have;
4763                 have += bits;
4764         }
4765
4766         c->bit_offset = s;
4767         bm_xfer_ctx_bit_to_word_offset(c);
4768
4769         return (s != c->bm_bits);
4770 }
4771
4772 /*
4773  * decode_bitmap_c
4774  *
4775  * Return 0 when done, 1 when another iteration is needed, and a negative error
4776  * code upon failure.
4777  */
4778 static int
4779 decode_bitmap_c(struct drbd_peer_device *peer_device,
4780                 struct p_compressed_bm *p,
4781                 struct bm_xfer_ctx *c,
4782                 unsigned int len)
4783 {
4784         if (dcbp_get_code(p) == RLE_VLI_Bits)
4785                 return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
4786
4787         /* other variants had been implemented for evaluation,
4788          * but have been dropped as this one turned out to be "best"
4789          * during all our tests. */
4790
4791         drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
4792         conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
4793         return -EIO;
4794 }
4795
4796 void INFO_bm_xfer_stats(struct drbd_device *device,
4797                 const char *direction, struct bm_xfer_ctx *c)
4798 {
4799         /* what would it take to transfer it "plaintext" */
4800         unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
4801         unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
4802         unsigned int plain =
4803                 header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
4804                 c->bm_words * sizeof(unsigned long);
4805         unsigned int total = c->bytes[0] + c->bytes[1];
4806         unsigned int r;
4807
4808         /* total can not be zero. but just in case: */
4809         if (total == 0)
4810                 return;
4811
4812         /* don't report if not compressed */
4813         if (total >= plain)
4814                 return;
4815
4816         /* total < plain. check for overflow, still */
4817         r = (total > UINT_MAX/1000) ? (total / (plain/1000))
4818                                     : (1000 * total / plain);
4819
4820         if (r > 1000)
4821                 r = 1000;
4822
4823         r = 1000 - r;
4824         drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
4825              "total %u; compression: %u.%u%%\n",
4826                         direction,
4827                         c->bytes[1], c->packets[1],
4828                         c->bytes[0], c->packets[0],
4829                         total, r/10, r % 10);
4830 }
4831
4832 /* Since we are processing the bitfield from lower addresses to higher,
4833    it does not matter if the process it in 32 bit chunks or 64 bit
4834    chunks as long as it is little endian. (Understand it as byte stream,
4835    beginning with the lowest byte...) If we would use big endian
4836    we would need to process it from the highest address to the lowest,
4837    in order to be agnostic to the 32 vs 64 bits issue.
4838
4839    returns 0 on failure, 1 if we successfully received it. */
4840 static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
4841 {
4842         struct drbd_peer_device *peer_device;
4843         struct drbd_device *device;
4844         struct bm_xfer_ctx c;
4845         int err;
4846
4847         peer_device = conn_peer_device(connection, pi->vnr);
4848         if (!peer_device)
4849                 return -EIO;
4850         device = peer_device->device;
4851
4852         drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
4853         /* you are supposed to send additional out-of-sync information
4854          * if you actually set bits during this phase */
4855
4856         c = (struct bm_xfer_ctx) {
4857                 .bm_bits = drbd_bm_bits(device),
4858                 .bm_words = drbd_bm_words(device),
4859         };
4860
4861         for(;;) {
4862                 if (pi->cmd == P_BITMAP)
4863                         err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
4864                 else if (pi->cmd == P_COMPRESSED_BITMAP) {
4865                         /* MAYBE: sanity check that we speak proto >= 90,
4866                          * and the feature is enabled! */
4867                         struct p_compressed_bm *p = pi->data;
4868
4869                         if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
4870                                 drbd_err(device, "ReportCBitmap packet too large\n");
4871                                 err = -EIO;
4872                                 goto out;
4873                         }
4874                         if (pi->size <= sizeof(*p)) {
4875                                 drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
4876                                 err = -EIO;
4877                                 goto out;
4878                         }
4879                         err = drbd_recv_all(peer_device->connection, p, pi->size);
4880                         if (err)
4881                                goto out;
4882                         err = decode_bitmap_c(peer_device, p, &c, pi->size);
4883                 } else {
4884                         drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
4885                         err = -EIO;
4886                         goto out;
4887                 }
4888
4889                 c.packets[pi->cmd == P_BITMAP]++;
4890                 c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
4891
4892                 if (err <= 0) {
4893                         if (err < 0)
4894                                 goto out;
4895                         break;
4896                 }
4897                 err = drbd_recv_header(peer_device->connection, pi);
4898                 if (err)
4899                         goto out;
4900         }
4901
4902         INFO_bm_xfer_stats(device, "receive", &c);
4903
4904         if (device->state.conn == C_WF_BITMAP_T) {
4905                 enum drbd_state_rv rv;
4906
4907                 err = drbd_send_bitmap(device);
4908                 if (err)
4909                         goto out;
4910                 /* Omit CS_ORDERED with this state transition to avoid deadlocks. */
4911                 rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
4912                 D_ASSERT(device, rv == SS_SUCCESS);
4913         } else if (device->state.conn != C_WF_BITMAP_S) {
4914                 /* admin may have requested C_DISCONNECTING,
4915                  * other threads may have noticed network errors */
4916                 drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
4917                     drbd_conn_str(device->state.conn));
4918         }
4919         err = 0;
4920
4921  out:
4922         drbd_bm_unlock(device);
4923         if (!err && device->state.conn == C_WF_BITMAP_S)
4924                 drbd_start_resync(device, C_SYNC_SOURCE);
4925         return err;
4926 }
4927
4928 static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
4929 {
4930         drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
4931                  pi->cmd, pi->size);
4932
4933         return ignore_remaining_packet(connection, pi);
4934 }
4935
4936 static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
4937 {
4938         /* Make sure we've acked all the TCP data associated
4939          * with the data requests being unplugged */
4940         tcp_sock_set_quickack(connection->data.socket->sk, 2);
4941         return 0;
4942 }
4943
4944 static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
4945 {
4946         struct drbd_peer_device *peer_device;
4947         struct drbd_device *device;
4948         struct p_block_desc *p = pi->data;
4949
4950         peer_device = conn_peer_device(connection, pi->vnr);
4951         if (!peer_device)
4952                 return -EIO;
4953         device = peer_device->device;
4954
4955         switch (device->state.conn) {
4956         case C_WF_SYNC_UUID:
4957         case C_WF_BITMAP_T:
4958         case C_BEHIND:
4959                         break;
4960         default:
4961                 drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
4962                                 drbd_conn_str(device->state.conn));
4963         }
4964
4965         drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
4966
4967         return 0;
4968 }
4969
4970 static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
4971 {
4972         struct drbd_peer_device *peer_device;
4973         struct p_block_desc *p = pi->data;
4974         struct drbd_device *device;
4975         sector_t sector;
4976         int size, err = 0;
4977
4978         peer_device = conn_peer_device(connection, pi->vnr);
4979         if (!peer_device)
4980                 return -EIO;
4981         device = peer_device->device;
4982
4983         sector = be64_to_cpu(p->sector);
4984         size = be32_to_cpu(p->blksize);
4985
4986         dec_rs_pending(device);
4987
4988         if (get_ldev(device)) {
4989                 struct drbd_peer_request *peer_req;
4990                 const int op = REQ_OP_WRITE_ZEROES;
4991
4992                 peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
4993                                                size, 0, GFP_NOIO);
4994                 if (!peer_req) {
4995                         put_ldev(device);
4996                         return -ENOMEM;
4997                 }
4998
4999                 peer_req->w.cb = e_end_resync_block;
5000                 peer_req->submit_jif = jiffies;
5001                 peer_req->flags |= EE_TRIM;
5002
5003                 spin_lock_irq(&device->resource->req_lock);
5004                 list_add_tail(&peer_req->w.list, &device->sync_ee);
5005                 spin_unlock_irq(&device->resource->req_lock);
5006
5007                 atomic_add(pi->size >> 9, &device->rs_sect_ev);
5008                 err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR);
5009
5010                 if (err) {
5011                         spin_lock_irq(&device->resource->req_lock);
5012                         list_del(&peer_req->w.list);
5013                         spin_unlock_irq(&device->resource->req_lock);
5014
5015                         drbd_free_peer_req(device, peer_req);
5016                         put_ldev(device);
5017                         err = 0;
5018                         goto fail;
5019                 }
5020
5021                 inc_unacked(device);
5022
5023                 /* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
5024                    as well as drbd_rs_complete_io() */
5025         } else {
5026         fail:
5027                 drbd_rs_complete_io(device, sector);
5028                 drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
5029         }
5030
5031         atomic_add(size >> 9, &device->rs_sect_in);
5032
5033         return err;
5034 }
5035
5036 struct data_cmd {
5037         int expect_payload;
5038         unsigned int pkt_size;
5039         int (*fn)(struct drbd_connection *, struct packet_info *);
5040 };
5041
5042 static struct data_cmd drbd_cmd_handler[] = {
5043         [P_DATA]            = { 1, sizeof(struct p_data), receive_Data },
5044         [P_DATA_REPLY]      = { 1, sizeof(struct p_data), receive_DataReply },
5045         [P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
5046         [P_BARRIER]         = { 0, sizeof(struct p_barrier), receive_Barrier } ,
5047         [P_BITMAP]          = { 1, 0, receive_bitmap } ,
5048         [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
5049         [P_UNPLUG_REMOTE]   = { 0, 0, receive_UnplugRemote },
5050         [P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
5051         [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
5052         [P_SYNC_PARAM]      = { 1, 0, receive_SyncParam },
5053         [P_SYNC_PARAM89]    = { 1, 0, receive_SyncParam },
5054         [P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
5055         [P_UUIDS]           = { 0, sizeof(struct p_uuids), receive_uuids },
5056         [P_SIZES]           = { 0, sizeof(struct p_sizes), receive_sizes },
5057         [P_STATE]           = { 0, sizeof(struct p_state), receive_state },
5058         [P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
5059         [P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
5060         [P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
5061         [P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
5062         [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
5063         [P_RS_THIN_REQ]     = { 0, sizeof(struct p_block_req), receive_DataRequest },
5064         [P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
5065         [P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
5066         [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
5067         [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
5068         [P_TRIM]            = { 0, sizeof(struct p_trim), receive_Data },
5069         [P_ZEROES]          = { 0, sizeof(struct p_trim), receive_Data },
5070         [P_RS_DEALLOCATED]  = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
5071         [P_WSAME]           = { 1, sizeof(struct p_wsame), receive_Data },
5072 };
5073
5074 static void drbdd(struct drbd_connection *connection)
5075 {
5076         struct packet_info pi;
5077         size_t shs; /* sub header size */
5078         int err;
5079
5080         while (get_t_state(&connection->receiver) == RUNNING) {
5081                 struct data_cmd const *cmd;
5082
5083                 drbd_thread_current_set_cpu(&connection->receiver);
5084                 update_receiver_timing_details(connection, drbd_recv_header_maybe_unplug);
5085                 if (drbd_recv_header_maybe_unplug(connection, &pi))
5086                         goto err_out;
5087
5088                 cmd = &drbd_cmd_handler[pi.cmd];
5089                 if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
5090                         drbd_err(connection, "Unexpected data packet %s (0x%04x)",
5091                                  cmdname(pi.cmd), pi.cmd);
5092                         goto err_out;
5093                 }
5094
5095                 shs = cmd->pkt_size;
5096                 if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
5097                         shs += sizeof(struct o_qlim);
5098                 if (pi.size > shs && !cmd->expect_payload) {
5099                         drbd_err(connection, "No payload expected %s l:%d\n",
5100                                  cmdname(pi.cmd), pi.size);
5101                         goto err_out;
5102                 }
5103                 if (pi.size < shs) {
5104                         drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
5105                                  cmdname(pi.cmd), (int)shs, pi.size);
5106                         goto err_out;
5107                 }
5108
5109                 if (shs) {
5110                         update_receiver_timing_details(connection, drbd_recv_all_warn);
5111                         err = drbd_recv_all_warn(connection, pi.data, shs);
5112                         if (err)
5113                                 goto err_out;
5114                         pi.size -= shs;
5115                 }
5116
5117                 update_receiver_timing_details(connection, cmd->fn);
5118                 err = cmd->fn(connection, &pi);
5119                 if (err) {
5120                         drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
5121                                  cmdname(pi.cmd), err, pi.size);
5122                         goto err_out;
5123                 }
5124         }
5125         return;
5126
5127     err_out:
5128         conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
5129 }
5130
5131 static void conn_disconnect(struct drbd_connection *connection)
5132 {
5133         struct drbd_peer_device *peer_device;
5134         enum drbd_conns oc;
5135         int vnr;
5136
5137         if (connection->cstate == C_STANDALONE)
5138                 return;
5139
5140         /* We are about to start the cleanup after connection loss.
5141          * Make sure drbd_make_request knows about that.
5142          * Usually we should be in some network failure state already,
5143          * but just in case we are not, we fix it up here.
5144          */
5145         conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
5146
5147         /* ack_receiver does not clean up anything. it must not interfere, either */
5148         drbd_thread_stop(&connection->ack_receiver);
5149         if (connection->ack_sender) {
5150                 destroy_workqueue(connection->ack_sender);
5151                 connection->ack_sender = NULL;
5152         }
5153         drbd_free_sock(connection);
5154
5155         rcu_read_lock();
5156         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5157                 struct drbd_device *device = peer_device->device;
5158                 kref_get(&device->kref);
5159                 rcu_read_unlock();
5160                 drbd_disconnected(peer_device);
5161                 kref_put(&device->kref, drbd_destroy_device);
5162                 rcu_read_lock();
5163         }
5164         rcu_read_unlock();
5165
5166         if (!list_empty(&connection->current_epoch->list))
5167                 drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
5168         /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
5169         atomic_set(&connection->current_epoch->epoch_size, 0);
5170         connection->send.seen_any_write_yet = false;
5171
5172         drbd_info(connection, "Connection closed\n");
5173
5174         if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
5175                 conn_try_outdate_peer_async(connection);
5176
5177         spin_lock_irq(&connection->resource->req_lock);
5178         oc = connection->cstate;
5179         if (oc >= C_UNCONNECTED)
5180                 _conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
5181
5182         spin_unlock_irq(&connection->resource->req_lock);
5183
5184         if (oc == C_DISCONNECTING)
5185                 conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
5186 }
5187
5188 static int drbd_disconnected(struct drbd_peer_device *peer_device)
5189 {
5190         struct drbd_device *device = peer_device->device;
5191         unsigned int i;
5192
5193         /* wait for current activity to cease. */
5194         spin_lock_irq(&device->resource->req_lock);
5195         _drbd_wait_ee_list_empty(device, &device->active_ee);
5196         _drbd_wait_ee_list_empty(device, &device->sync_ee);
5197         _drbd_wait_ee_list_empty(device, &device->read_ee);
5198         spin_unlock_irq(&device->resource->req_lock);
5199
5200         /* We do not have data structures that would allow us to
5201          * get the rs_pending_cnt down to 0 again.
5202          *  * On C_SYNC_TARGET we do not have any data structures describing
5203          *    the pending RSDataRequest's we have sent.
5204          *  * On C_SYNC_SOURCE there is no data structure that tracks
5205          *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
5206          *  And no, it is not the sum of the reference counts in the
5207          *  resync_LRU. The resync_LRU tracks the whole operation including
5208          *  the disk-IO, while the rs_pending_cnt only tracks the blocks
5209          *  on the fly. */
5210         drbd_rs_cancel_all(device);
5211         device->rs_total = 0;
5212         device->rs_failed = 0;
5213         atomic_set(&device->rs_pending_cnt, 0);
5214         wake_up(&device->misc_wait);
5215
5216         del_timer_sync(&device->resync_timer);
5217         resync_timer_fn(&device->resync_timer);
5218
5219         /* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
5220          * w_make_resync_request etc. which may still be on the worker queue
5221          * to be "canceled" */
5222         drbd_flush_workqueue(&peer_device->connection->sender_work);
5223
5224         drbd_finish_peer_reqs(device);
5225
5226         /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
5227            might have issued a work again. The one before drbd_finish_peer_reqs() is
5228            necessary to reclain net_ee in drbd_finish_peer_reqs(). */
5229         drbd_flush_workqueue(&peer_device->connection->sender_work);
5230
5231         /* need to do it again, drbd_finish_peer_reqs() may have populated it
5232          * again via drbd_try_clear_on_disk_bm(). */
5233         drbd_rs_cancel_all(device);
5234
5235         kfree(device->p_uuid);
5236         device->p_uuid = NULL;
5237
5238         if (!drbd_suspended(device))
5239                 tl_clear(peer_device->connection);
5240
5241         drbd_md_sync(device);
5242
5243         if (get_ldev(device)) {
5244                 drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
5245                                 "write from disconnected", BM_LOCKED_CHANGE_ALLOWED);
5246                 put_ldev(device);
5247         }
5248
5249         /* tcp_close and release of sendpage pages can be deferred.  I don't
5250          * want to use SO_LINGER, because apparently it can be deferred for
5251          * more than 20 seconds (longest time I checked).
5252          *
5253          * Actually we don't care for exactly when the network stack does its
5254          * put_page(), but release our reference on these pages right here.
5255          */
5256         i = drbd_free_peer_reqs(device, &device->net_ee);
5257         if (i)
5258                 drbd_info(device, "net_ee not empty, killed %u entries\n", i);
5259         i = atomic_read(&device->pp_in_use_by_net);
5260         if (i)
5261                 drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
5262         i = atomic_read(&device->pp_in_use);
5263         if (i)
5264                 drbd_info(device, "pp_in_use = %d, expected 0\n", i);
5265
5266         D_ASSERT(device, list_empty(&device->read_ee));
5267         D_ASSERT(device, list_empty(&device->active_ee));
5268         D_ASSERT(device, list_empty(&device->sync_ee));
5269         D_ASSERT(device, list_empty(&device->done_ee));
5270
5271         return 0;
5272 }
5273
5274 /*
5275  * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
5276  * we can agree on is stored in agreed_pro_version.
5277  *
5278  * feature flags and the reserved array should be enough room for future
5279  * enhancements of the handshake protocol, and possible plugins...
5280  *
5281  * for now, they are expected to be zero, but ignored.
5282  */
5283 static int drbd_send_features(struct drbd_connection *connection)
5284 {
5285         struct drbd_socket *sock;
5286         struct p_connection_features *p;
5287
5288         sock = &connection->data;
5289         p = conn_prepare_command(connection, sock);
5290         if (!p)
5291                 return -EIO;
5292         memset(p, 0, sizeof(*p));
5293         p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
5294         p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
5295         p->feature_flags = cpu_to_be32(PRO_FEATURES);
5296         return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
5297 }
5298
5299 /*
5300  * return values:
5301  *   1 yes, we have a valid connection
5302  *   0 oops, did not work out, please try again
5303  *  -1 peer talks different language,
5304  *     no point in trying again, please go standalone.
5305  */
5306 static int drbd_do_features(struct drbd_connection *connection)
5307 {
5308         /* ASSERT current == connection->receiver ... */
5309         struct p_connection_features *p;
5310         const int expect = sizeof(struct p_connection_features);
5311         struct packet_info pi;
5312         int err;
5313
5314         err = drbd_send_features(connection);
5315         if (err)
5316                 return 0;
5317
5318         err = drbd_recv_header(connection, &pi);
5319         if (err)
5320                 return 0;
5321
5322         if (pi.cmd != P_CONNECTION_FEATURES) {
5323                 drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
5324                          cmdname(pi.cmd), pi.cmd);
5325                 return -1;
5326         }
5327
5328         if (pi.size != expect) {
5329                 drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
5330                      expect, pi.size);
5331                 return -1;
5332         }
5333
5334         p = pi.data;
5335         err = drbd_recv_all_warn(connection, p, expect);
5336         if (err)
5337                 return 0;
5338
5339         p->protocol_min = be32_to_cpu(p->protocol_min);
5340         p->protocol_max = be32_to_cpu(p->protocol_max);
5341         if (p->protocol_max == 0)
5342                 p->protocol_max = p->protocol_min;
5343
5344         if (PRO_VERSION_MAX < p->protocol_min ||
5345             PRO_VERSION_MIN > p->protocol_max)
5346                 goto incompat;
5347
5348         connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
5349         connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
5350
5351         drbd_info(connection, "Handshake successful: "
5352              "Agreed network protocol version %d\n", connection->agreed_pro_version);
5353
5354         drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n",
5355                   connection->agreed_features,
5356                   connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
5357                   connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
5358                   connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "",
5359                   connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" :
5360                   connection->agreed_features ? "" : " none");
5361
5362         return 1;
5363
5364  incompat:
5365         drbd_err(connection, "incompatible DRBD dialects: "
5366             "I support %d-%d, peer supports %d-%d\n",
5367             PRO_VERSION_MIN, PRO_VERSION_MAX,
5368             p->protocol_min, p->protocol_max);
5369         return -1;
5370 }
5371
5372 #if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
/*
 * Variant used when the kernel has no HMAC support (neither
 * CONFIG_CRYPTO_HMAC nor CONFIG_CRYPTO_HMAC_MODULE is defined):
 * authentication cannot be performed, so log why and return -1.
 */
static int drbd_do_auth(struct drbd_connection *connection)
{
	drbd_err(connection, "This kernel was built without CONFIG_CRYPTO_HMAC.\n");
	drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
	return -1;
}
5379 #else
5380 #define CHALLENGE_LEN 64
5381
5382 /* Return value:
5383         1 - auth succeeded,
5384         0 - failed, try again (network error),
5385         -1 - auth failed, don't try again.
5386 */
5387
5388 static int drbd_do_auth(struct drbd_connection *connection)
5389 {
5390         struct drbd_socket *sock;
5391         char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
5392         char *response = NULL;
5393         char *right_response = NULL;
5394         char *peers_ch = NULL;
5395         unsigned int key_len;
5396         char secret[SHARED_SECRET_MAX]; /* 64 byte */
5397         unsigned int resp_size;
5398         struct shash_desc *desc;
5399         struct packet_info pi;
5400         struct net_conf *nc;
5401         int err, rv;
5402
5403         /* FIXME: Put the challenge/response into the preallocated socket buffer.  */
5404
5405         rcu_read_lock();
5406         nc = rcu_dereference(connection->net_conf);
5407         key_len = strlen(nc->shared_secret);
5408         memcpy(secret, nc->shared_secret, key_len);
5409         rcu_read_unlock();
5410
5411         desc = kmalloc(sizeof(struct shash_desc) +
5412                        crypto_shash_descsize(connection->cram_hmac_tfm),
5413                        GFP_KERNEL);
5414         if (!desc) {
5415                 rv = -1;
5416                 goto fail;
5417         }
5418         desc->tfm = connection->cram_hmac_tfm;
5419
5420         rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
5421         if (rv) {
5422                 drbd_err(connection, "crypto_shash_setkey() failed with %d\n", rv);
5423                 rv = -1;
5424                 goto fail;
5425         }
5426
5427         get_random_bytes(my_challenge, CHALLENGE_LEN);
5428
5429         sock = &connection->data;
5430         if (!conn_prepare_command(connection, sock)) {
5431                 rv = 0;
5432                 goto fail;
5433         }
5434         rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
5435                                 my_challenge, CHALLENGE_LEN);
5436         if (!rv)
5437                 goto fail;
5438
5439         err = drbd_recv_header(connection, &pi);
5440         if (err) {
5441                 rv = 0;
5442                 goto fail;
5443         }
5444
5445         if (pi.cmd != P_AUTH_CHALLENGE) {
5446                 drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
5447                          cmdname(pi.cmd), pi.cmd);
5448                 rv = -1;
5449                 goto fail;
5450         }
5451
5452         if (pi.size > CHALLENGE_LEN * 2) {
5453                 drbd_err(connection, "expected AuthChallenge payload too big.\n");
5454                 rv = -1;
5455                 goto fail;
5456         }
5457
5458         if (pi.size < CHALLENGE_LEN) {
5459                 drbd_err(connection, "AuthChallenge payload too small.\n");
5460                 rv = -1;
5461                 goto fail;
5462         }
5463
5464         peers_ch = kmalloc(pi.size, GFP_NOIO);
5465         if (!peers_ch) {
5466                 rv = -1;
5467                 goto fail;
5468         }
5469
5470         err = drbd_recv_all_warn(connection, peers_ch, pi.size);
5471         if (err) {
5472                 rv = 0;
5473                 goto fail;
5474         }
5475
5476         if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
5477                 drbd_err(connection, "Peer presented the same challenge!\n");
5478                 rv = -1;
5479                 goto fail;
5480         }
5481
5482         resp_size = crypto_shash_digestsize(connection->cram_hmac_tfm);
5483         response = kmalloc(resp_size, GFP_NOIO);
5484         if (!response) {
5485                 rv = -1;
5486                 goto fail;
5487         }
5488
5489         rv = crypto_shash_digest(desc, peers_ch, pi.size, response);
5490         if (rv) {
5491                 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
5492                 rv = -1;
5493                 goto fail;
5494         }
5495
5496         if (!conn_prepare_command(connection, sock)) {
5497                 rv = 0;
5498                 goto fail;
5499         }
5500         rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
5501                                 response, resp_size);
5502         if (!rv)
5503                 goto fail;
5504
5505         err = drbd_recv_header(connection, &pi);
5506         if (err) {
5507                 rv = 0;
5508                 goto fail;
5509         }
5510
5511         if (pi.cmd != P_AUTH_RESPONSE) {
5512                 drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
5513                          cmdname(pi.cmd), pi.cmd);
5514                 rv = 0;
5515                 goto fail;
5516         }
5517
5518         if (pi.size != resp_size) {
5519                 drbd_err(connection, "expected AuthResponse payload of wrong size\n");
5520                 rv = 0;
5521                 goto fail;
5522         }
5523
5524         err = drbd_recv_all_warn(connection, response , resp_size);
5525         if (err) {
5526                 rv = 0;
5527                 goto fail;
5528         }
5529
5530         right_response = kmalloc(resp_size, GFP_NOIO);
5531         if (!right_response) {
5532                 rv = -1;
5533                 goto fail;
5534         }
5535
5536         rv = crypto_shash_digest(desc, my_challenge, CHALLENGE_LEN,
5537                                  right_response);
5538         if (rv) {
5539                 drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
5540                 rv = -1;
5541                 goto fail;
5542         }
5543
5544         rv = !memcmp(response, right_response, resp_size);
5545
5546         if (rv)
5547                 drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
5548                      resp_size);
5549         else
5550                 rv = -1;
5551
5552  fail:
5553         kfree(peers_ch);
5554         kfree(response);
5555         kfree(right_response);
5556         if (desc) {
5557                 shash_desc_zero(desc);
5558                 kfree(desc);
5559         }
5560
5561         return rv;
5562 }
5563 #endif
5564
5565 int drbd_receiver(struct drbd_thread *thi)
5566 {
5567         struct drbd_connection *connection = thi->connection;
5568         int h;
5569
5570         drbd_info(connection, "receiver (re)started\n");
5571
5572         do {
5573                 h = conn_connect(connection);
5574                 if (h == 0) {
5575                         conn_disconnect(connection);
5576                         schedule_timeout_interruptible(HZ);
5577                 }
5578                 if (h == -1) {
5579                         drbd_warn(connection, "Discarding network configuration.\n");
5580                         conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
5581                 }
5582         } while (h == 0);
5583
5584         if (h > 0) {
5585                 blk_start_plug(&connection->receiver_plug);
5586                 drbdd(connection);
5587                 blk_finish_plug(&connection->receiver_plug);
5588         }
5589
5590         conn_disconnect(connection);
5591
5592         drbd_info(connection, "receiver terminated\n");
5593         return 0;
5594 }
5595
5596 /* ********* acknowledge sender ******** */
5597
5598 static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
5599 {
5600         struct p_req_state_reply *p = pi->data;
5601         int retcode = be32_to_cpu(p->retcode);
5602
5603         if (retcode >= SS_SUCCESS) {
5604                 set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
5605         } else {
5606                 set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
5607                 drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
5608                          drbd_set_st_err_str(retcode), retcode);
5609         }
5610         wake_up(&connection->ping_wait);
5611
5612         return 0;
5613 }
5614
5615 static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
5616 {
5617         struct drbd_peer_device *peer_device;
5618         struct drbd_device *device;
5619         struct p_req_state_reply *p = pi->data;
5620         int retcode = be32_to_cpu(p->retcode);
5621
5622         peer_device = conn_peer_device(connection, pi->vnr);
5623         if (!peer_device)
5624                 return -EIO;
5625         device = peer_device->device;
5626
5627         if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
5628                 D_ASSERT(device, connection->agreed_pro_version < 100);
5629                 return got_conn_RqSReply(connection, pi);
5630         }
5631
5632         if (retcode >= SS_SUCCESS) {
5633                 set_bit(CL_ST_CHG_SUCCESS, &device->flags);
5634         } else {
5635                 set_bit(CL_ST_CHG_FAIL, &device->flags);
5636                 drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
5637                         drbd_set_st_err_str(retcode), retcode);
5638         }
5639         wake_up(&device->state_wait);
5640
5641         return 0;
5642 }
5643
/* Ping from the peer: answer with a ping ack and pass on the send result. */
static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
{
	int err = drbd_send_ping_ack(connection);

	return err;
}
5649
5650 static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
5651 {
5652         /* restore idle timeout */
5653         connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
5654         if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
5655                 wake_up(&connection->ping_wait);
5656
5657         return 0;
5658 }
5659
5660 static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
5661 {
5662         struct drbd_peer_device *peer_device;
5663         struct drbd_device *device;
5664         struct p_block_ack *p = pi->data;
5665         sector_t sector = be64_to_cpu(p->sector);
5666         int blksize = be32_to_cpu(p->blksize);
5667
5668         peer_device = conn_peer_device(connection, pi->vnr);
5669         if (!peer_device)
5670                 return -EIO;
5671         device = peer_device->device;
5672
5673         D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
5674
5675         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5676
5677         if (get_ldev(device)) {
5678                 drbd_rs_complete_io(device, sector);
5679                 drbd_set_in_sync(device, sector, blksize);
5680                 /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
5681                 device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
5682                 put_ldev(device);
5683         }
5684         dec_rs_pending(device);
5685         atomic_add(blksize >> 9, &device->rs_sect_in);
5686
5687         return 0;
5688 }
5689
5690 static int
5691 validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
5692                               struct rb_root *root, const char *func,
5693                               enum drbd_req_event what, bool missing_ok)
5694 {
5695         struct drbd_request *req;
5696         struct bio_and_error m;
5697
5698         spin_lock_irq(&device->resource->req_lock);
5699         req = find_request(device, root, id, sector, missing_ok, func);
5700         if (unlikely(!req)) {
5701                 spin_unlock_irq(&device->resource->req_lock);
5702                 return -EIO;
5703         }
5704         __req_mod(req, what, &m);
5705         spin_unlock_irq(&device->resource->req_lock);
5706
5707         if (m.bio)
5708                 complete_master_bio(device, &m);
5709         return 0;
5710 }
5711
5712 static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
5713 {
5714         struct drbd_peer_device *peer_device;
5715         struct drbd_device *device;
5716         struct p_block_ack *p = pi->data;
5717         sector_t sector = be64_to_cpu(p->sector);
5718         int blksize = be32_to_cpu(p->blksize);
5719         enum drbd_req_event what;
5720
5721         peer_device = conn_peer_device(connection, pi->vnr);
5722         if (!peer_device)
5723                 return -EIO;
5724         device = peer_device->device;
5725
5726         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5727
5728         if (p->block_id == ID_SYNCER) {
5729                 drbd_set_in_sync(device, sector, blksize);
5730                 dec_rs_pending(device);
5731                 return 0;
5732         }
5733         switch (pi->cmd) {
5734         case P_RS_WRITE_ACK:
5735                 what = WRITE_ACKED_BY_PEER_AND_SIS;
5736                 break;
5737         case P_WRITE_ACK:
5738                 what = WRITE_ACKED_BY_PEER;
5739                 break;
5740         case P_RECV_ACK:
5741                 what = RECV_ACKED_BY_PEER;
5742                 break;
5743         case P_SUPERSEDED:
5744                 what = CONFLICT_RESOLVED;
5745                 break;
5746         case P_RETRY_WRITE:
5747                 what = POSTPONE_WRITE;
5748                 break;
5749         default:
5750                 BUG();
5751         }
5752
5753         return validate_req_change_req_state(device, p->block_id, sector,
5754                                              &device->write_requests, __func__,
5755                                              what, false);
5756 }
5757
5758 static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
5759 {
5760         struct drbd_peer_device *peer_device;
5761         struct drbd_device *device;
5762         struct p_block_ack *p = pi->data;
5763         sector_t sector = be64_to_cpu(p->sector);
5764         int size = be32_to_cpu(p->blksize);
5765         int err;
5766
5767         peer_device = conn_peer_device(connection, pi->vnr);
5768         if (!peer_device)
5769                 return -EIO;
5770         device = peer_device->device;
5771
5772         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5773
5774         if (p->block_id == ID_SYNCER) {
5775                 dec_rs_pending(device);
5776                 drbd_rs_failed_io(device, sector, size);
5777                 return 0;
5778         }
5779
5780         err = validate_req_change_req_state(device, p->block_id, sector,
5781                                             &device->write_requests, __func__,
5782                                             NEG_ACKED, true);
5783         if (err) {
5784                 /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
5785                    The master bio might already be completed, therefore the
5786                    request is no longer in the collision hash. */
5787                 /* In Protocol B we might already have got a P_RECV_ACK
5788                    but then get a P_NEG_ACK afterwards. */
5789                 drbd_set_out_of_sync(device, sector, size);
5790         }
5791         return 0;
5792 }
5793
5794 static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
5795 {
5796         struct drbd_peer_device *peer_device;
5797         struct drbd_device *device;
5798         struct p_block_ack *p = pi->data;
5799         sector_t sector = be64_to_cpu(p->sector);
5800
5801         peer_device = conn_peer_device(connection, pi->vnr);
5802         if (!peer_device)
5803                 return -EIO;
5804         device = peer_device->device;
5805
5806         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5807
5808         drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
5809             (unsigned long long)sector, be32_to_cpu(p->blksize));
5810
5811         return validate_req_change_req_state(device, p->block_id, sector,
5812                                              &device->read_requests, __func__,
5813                                              NEG_ACKED, false);
5814 }
5815
5816 static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
5817 {
5818         struct drbd_peer_device *peer_device;
5819         struct drbd_device *device;
5820         sector_t sector;
5821         int size;
5822         struct p_block_ack *p = pi->data;
5823
5824         peer_device = conn_peer_device(connection, pi->vnr);
5825         if (!peer_device)
5826                 return -EIO;
5827         device = peer_device->device;
5828
5829         sector = be64_to_cpu(p->sector);
5830         size = be32_to_cpu(p->blksize);
5831
5832         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5833
5834         dec_rs_pending(device);
5835
5836         if (get_ldev_if_state(device, D_FAILED)) {
5837                 drbd_rs_complete_io(device, sector);
5838                 switch (pi->cmd) {
5839                 case P_NEG_RS_DREPLY:
5840                         drbd_rs_failed_io(device, sector, size);
5841                         break;
5842                 case P_RS_CANCEL:
5843                         break;
5844                 default:
5845                         BUG();
5846                 }
5847                 put_ldev(device);
5848         }
5849
5850         return 0;
5851 }
5852
5853 static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
5854 {
5855         struct p_barrier_ack *p = pi->data;
5856         struct drbd_peer_device *peer_device;
5857         int vnr;
5858
5859         tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
5860
5861         rcu_read_lock();
5862         idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
5863                 struct drbd_device *device = peer_device->device;
5864
5865                 if (device->state.conn == C_AHEAD &&
5866                     atomic_read(&device->ap_in_flight) == 0 &&
5867                     !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
5868                         device->start_resync_timer.expires = jiffies + HZ;
5869                         add_timer(&device->start_resync_timer);
5870                 }
5871         }
5872         rcu_read_unlock();
5873
5874         return 0;
5875 }
5876
5877 static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
5878 {
5879         struct drbd_peer_device *peer_device;
5880         struct drbd_device *device;
5881         struct p_block_ack *p = pi->data;
5882         struct drbd_device_work *dw;
5883         sector_t sector;
5884         int size;
5885
5886         peer_device = conn_peer_device(connection, pi->vnr);
5887         if (!peer_device)
5888                 return -EIO;
5889         device = peer_device->device;
5890
5891         sector = be64_to_cpu(p->sector);
5892         size = be32_to_cpu(p->blksize);
5893
5894         update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
5895
5896         if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
5897                 drbd_ov_out_of_sync_found(device, sector, size);
5898         else
5899                 ov_out_of_sync_print(device);
5900
5901         if (!get_ldev(device))
5902                 return 0;
5903
5904         drbd_rs_complete_io(device, sector);
5905         dec_rs_pending(device);
5906
5907         --device->ov_left;
5908
5909         /* let's advance progress step marks only for every other megabyte */
5910         if ((device->ov_left & 0x200) == 0x200)
5911                 drbd_advance_rs_marks(device, device->ov_left);
5912
5913         if (device->ov_left == 0) {
5914                 dw = kmalloc(sizeof(*dw), GFP_NOIO);
5915                 if (dw) {
5916                         dw->w.cb = w_ov_finished;
5917                         dw->device = device;
5918                         drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
5919                 } else {
5920                         drbd_err(device, "kmalloc(dw) failed.");
5921                         ov_out_of_sync_print(device);
5922                         drbd_resync_finished(device);
5923                 }
5924         }
5925         put_ldev(device);
5926         return 0;
5927 }
5928
/*
 * got_skip() - meta socket handler for packets that are received but
 * deliberately ignored.  Neither argument is looked at.
 *
 * Always returns 0.
 */
static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
{
	/* Nothing to do: the payload has already been consumed by the caller. */
	return 0;
}
5933
/*
 * One dispatch entry for a packet type arriving on the meta socket.
 * The field order matters: some users initialize entries positionally.
 */
struct meta_sock_cmd {
	/* Payload size expected after the packet header, in bytes. */
	size_t pkt_size;
	/* Handler for the packet; a non-zero return value is treated as failure. */
	int (*fn)(struct drbd_connection *, struct packet_info *);
};
5938
5939 static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
5940 {
5941         long t;
5942         struct net_conf *nc;
5943
5944         rcu_read_lock();
5945         nc = rcu_dereference(connection->net_conf);
5946         t = ping_timeout ? nc->ping_timeo : nc->ping_int;
5947         rcu_read_unlock();
5948
5949         t *= HZ;
5950         if (ping_timeout)
5951                 t /= 10;
5952
5953         connection->meta.socket->sk->sk_rcvtimeo = t;
5954 }
5955
5956 static void set_ping_timeout(struct drbd_connection *connection)
5957 {
5958         set_rcvtimeo(connection, 1);
5959 }
5960
5961 static void set_idle_timeout(struct drbd_connection *connection)
5962 {
5963         set_rcvtimeo(connection, 0);
5964 }
5965
5966 static struct meta_sock_cmd ack_receiver_tbl[] = {
5967         [P_PING]            = { 0, got_Ping },
5968         [P_PING_ACK]        = { 0, got_PingAck },
5969         [P_RECV_ACK]        = { sizeof(struct p_block_ack), got_BlockAck },
5970         [P_WRITE_ACK]       = { sizeof(struct p_block_ack), got_BlockAck },
5971         [P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
5972         [P_SUPERSEDED]   = { sizeof(struct p_block_ack), got_BlockAck },
5973         [P_NEG_ACK]         = { sizeof(struct p_block_ack), got_NegAck },
5974         [P_NEG_DREPLY]      = { sizeof(struct p_block_ack), got_NegDReply },
5975         [P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply },
5976         [P_OV_RESULT]       = { sizeof(struct p_block_ack), got_OVResult },
5977         [P_BARRIER_ACK]     = { sizeof(struct p_barrier_ack), got_BarrierAck },
5978         [P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
5979         [P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
5980         [P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
5981         [P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply },
5982         [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
5983         [P_RETRY_WRITE]     = { sizeof(struct p_block_ack), got_BlockAck },
5984 };
5985
5986 int drbd_ack_receiver(struct drbd_thread *thi)
5987 {
5988         struct drbd_connection *connection = thi->connection;
5989         struct meta_sock_cmd *cmd = NULL;
5990         struct packet_info pi;
5991         unsigned long pre_recv_jif;
5992         int rv;
5993         void *buf    = connection->meta.rbuf;
5994         int received = 0;
5995         unsigned int header_size = drbd_header_size(connection);
5996         int expect   = header_size;
5997         bool ping_timeout_active = false;
5998
5999         sched_set_fifo_low(current);
6000
6001         while (get_t_state(thi) == RUNNING) {
6002                 drbd_thread_current_set_cpu(thi);
6003
6004                 conn_reclaim_net_peer_reqs(connection);
6005
6006                 if (test_and_clear_bit(SEND_PING, &connection->flags)) {
6007                         if (drbd_send_ping(connection)) {
6008                                 drbd_err(connection, "drbd_send_ping has failed\n");
6009                                 goto reconnect;
6010                         }
6011                         set_ping_timeout(connection);
6012                         ping_timeout_active = true;
6013                 }
6014
6015                 pre_recv_jif = jiffies;
6016                 rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
6017
6018                 /* Note:
6019                  * -EINTR        (on meta) we got a signal
6020                  * -EAGAIN       (on meta) rcvtimeo expired
6021                  * -ECONNRESET   other side closed the connection
6022                  * -ERESTARTSYS  (on data) we got a signal
6023                  * rv <  0       other than above: unexpected error!
6024                  * rv == expected: full header or command
6025                  * rv <  expected: "woken" by signal during receive
6026                  * rv == 0       : "connection shut down by peer"
6027                  */
6028                 if (likely(rv > 0)) {
6029                         received += rv;
6030                         buf      += rv;
6031                 } else if (rv == 0) {
6032                         if (test_bit(DISCONNECT_SENT, &connection->flags)) {
6033                                 long t;
6034                                 rcu_read_lock();
6035                                 t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
6036                                 rcu_read_unlock();
6037
6038                                 t = wait_event_timeout(connection->ping_wait,
6039                                                        connection->cstate < C_WF_REPORT_PARAMS,
6040                                                        t);
6041                                 if (t)
6042                                         break;
6043                         }
6044                         drbd_err(connection, "meta connection shut down by peer.\n");
6045                         goto reconnect;
6046                 } else if (rv == -EAGAIN) {
6047                         /* If the data socket received something meanwhile,
6048                          * that is good enough: peer is still alive. */
6049                         if (time_after(connection->last_received, pre_recv_jif))
6050                                 continue;
6051                         if (ping_timeout_active) {
6052                                 drbd_err(connection, "PingAck did not arrive in time.\n");
6053                                 goto reconnect;
6054                         }
6055                         set_bit(SEND_PING, &connection->flags);
6056                         continue;
6057                 } else if (rv == -EINTR) {
6058                         /* maybe drbd_thread_stop(): the while condition will notice.
6059                          * maybe woken for send_ping: we'll send a ping above,
6060                          * and change the rcvtimeo */
6061                         flush_signals(current);
6062                         continue;
6063                 } else {
6064                         drbd_err(connection, "sock_recvmsg returned %d\n", rv);
6065                         goto reconnect;
6066                 }
6067
6068                 if (received == expect && cmd == NULL) {
6069                         if (decode_header(connection, connection->meta.rbuf, &pi))
6070                                 goto reconnect;
6071                         cmd = &ack_receiver_tbl[pi.cmd];
6072                         if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
6073                                 drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
6074                                          cmdname(pi.cmd), pi.cmd);
6075                                 goto disconnect;
6076                         }
6077                         expect = header_size + cmd->pkt_size;
6078                         if (pi.size != expect - header_size) {
6079                                 drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
6080                                         pi.cmd, pi.size);
6081                                 goto reconnect;
6082                         }
6083                 }
6084                 if (received == expect) {
6085                         bool err;
6086
6087                         err = cmd->fn(connection, &pi);
6088                         if (err) {
6089                                 drbd_err(connection, "%ps failed\n", cmd->fn);
6090                                 goto reconnect;
6091                         }
6092
6093                         connection->last_received = jiffies;
6094
6095                         if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
6096                                 set_idle_timeout(connection);
6097                                 ping_timeout_active = false;
6098                         }
6099
6100                         buf      = connection->meta.rbuf;
6101                         received = 0;
6102                         expect   = header_size;
6103                         cmd      = NULL;
6104                 }
6105         }
6106
6107         if (0) {
6108 reconnect:
6109                 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
6110                 conn_md_sync(connection);
6111         }
6112         if (0) {
6113 disconnect:
6114                 conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
6115         }
6116
6117         drbd_info(connection, "ack_receiver terminated\n");
6118
6119         return 0;
6120 }
6121
6122 void drbd_send_acks_wf(struct work_struct *ws)
6123 {
6124         struct drbd_peer_device *peer_device =
6125                 container_of(ws, struct drbd_peer_device, send_acks_work);
6126         struct drbd_connection *connection = peer_device->connection;
6127         struct drbd_device *device = peer_device->device;
6128         struct net_conf *nc;
6129         int tcp_cork, err;
6130
6131         rcu_read_lock();
6132         nc = rcu_dereference(connection->net_conf);
6133         tcp_cork = nc->tcp_cork;
6134         rcu_read_unlock();
6135
6136         if (tcp_cork)
6137                 tcp_sock_set_cork(connection->meta.socket->sk, true);
6138
6139         err = drbd_finish_peer_reqs(device);
6140         kref_put(&device->kref, drbd_destroy_device);
6141         /* get is in drbd_endio_write_sec_final(). That is necessary to keep the
6142            struct work_struct send_acks_work alive, which is in the peer_device object */
6143
6144         if (err) {
6145                 conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
6146                 return;
6147         }
6148
6149         if (tcp_cork)
6150                 tcp_sock_set_cork(connection->meta.socket->sk, false);
6151
6152         return;
6153 }