Merge branch 'drbd-8.4_ed6' into for-3.8-drivers-drbd-8.4_ed6
author Philipp Reisner <philipp.reisner@linbit.com>
Fri, 9 Nov 2012 13:18:43 +0000 (14:18 +0100)
committer Philipp Reisner <philipp.reisner@linbit.com>
Fri, 9 Nov 2012 13:20:23 +0000 (14:20 +0100)
drivers/block/drbd/drbd_bitmap.c
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_interval.c
drivers/block/drbd/drbd_main.c
drivers/block/drbd/drbd_nl.c
drivers/block/drbd/drbd_receiver.c
drivers/block/drbd/drbd_req.c
include/linux/idr.h

@@@ -376,12 -391,12 +391,12 @@@ static struct page **bm_realloc_pages(s
         * GFP_NOIO, as this is called while drbd IO is "suspended",
         * and during resize or attach on diskless Primary,
         * we must not block on IO to ourselves.
-        * Context is receiver thread or cqueue thread/dmsetup.  */
+        * Context is receiver thread or dmsetup. */
        bytes = sizeof(struct page *)*want;
 -      new_pages = kmalloc(bytes, GFP_NOIO);
 +      new_pages = kzalloc(bytes, GFP_NOIO);
        if (!new_pages) {
                new_pages = __vmalloc(bytes,
 -                              GFP_NOIO | __GFP_HIGHMEM,
 +                              GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
                                PAGE_KERNEL);
                if (!new_pages)
                        return NULL;
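
Both allocation paths above now return zeroed memory: kzalloc() for the common case, and __GFP_ZERO on the __vmalloc() fallback for large arrays. A minimal sketch of the resulting pattern, with an illustrative function name that is not part of the patch:

        /* Sketch only: zeroed allocation with vmalloc fallback, under GFP_NOIO
         * because the caller may run while DRBD IO is suspended and must not
         * recurse into its own device. */
        static void *noio_alloc_zeroed(size_t bytes)
        {
                void *mem = kzalloc(bytes, GFP_NOIO);

                if (!mem)       /* physically contiguous allocation failed */
                        mem = __vmalloc(bytes, GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
                                        PAGE_KERNEL);
                return mem;     /* free with kfree() or vfree(), per is_vmalloc_addr() */
        }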
@@@ -1425,13 -1478,21 +1477,21 @@@ static inline void bm_set_full_words_wi
  {
        int i;
        int bits;
 -      unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1);
+       int changed = 0;
 +      unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
        for (i = first_word; i < last_word; i++) {
                bits = hweight_long(paddr[i]);
                paddr[i] = ~0UL;
-               b->bm_set += BITS_PER_LONG - bits;
+               changed += BITS_PER_LONG - bits;
        }
 -      kunmap_atomic(paddr, KM_IRQ1);
 +      kunmap_atomic(paddr);
+       if (changed) {
+               /* We only need lazy writeout, the information is still in the
+                * remote bitmap as well, and is reconstructed during the next
+                * bitmap exchange, if lost locally due to a crash. */
+               bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
+               b->bm_set += changed;
+       }
  }
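
The rewrite above accumulates the newly set bits per word into the local `changed` instead of touching b->bm_set inside the loop, and only marks the page for lazy writeout when something actually changed. The arithmetic, assuming 64-bit longs:

        /* Overwriting a word with ~0UL sets every bit that was clear, so the
         * newly set bits are BITS_PER_LONG - hweight_long(old value).
         * Example: 0x3ffUL has hweight_long() == 10, so 64 - 10 = 54 bits
         * are added to `changed` for that word. */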
  
  /* Same thing as drbd_bm_set_bits,
@@@ -59,9 -64,8 +63,8 @@@
  
  /* module parameter, defined in drbd_main.c */
  extern unsigned int minor_count;
 -extern int disable_sendpage;
 -extern int allow_oos;
 +extern bool disable_sendpage;
 +extern bool allow_oos;
- extern unsigned int cn_idx;
  
  #ifdef CONFIG_DRBD_FAULT_INJECTION
  extern int enable_faults;
@@@ -1138,46 -1028,16 +1027,16 @@@ struct drbd_conf 
        int rs_last_events;  /* counter of read or write "events" (unit sectors)
                              * on the lower level device when we last looked. */
        int c_sync_rate; /* current resync rate after syncer throttle magic */
-       struct fifo_buffer rs_plan_s; /* correction values of resync planner */
+       struct fifo_buffer *rs_plan_s; /* correction values of resync planner (RCU, tconn->conn_update) */
        int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
-       int rs_planed;    /* resync sectors already planned */
        atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
 -      int peer_max_bio_size;
 -      int local_max_bio_size;
 +      unsigned int peer_max_bio_size;
 +      unsigned int local_max_bio_size;
  };
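
rs_plan_s is now an RCU-protected pointer, replaced under the connection-level lock named in its comment above. A minimal reader sketch, assuming the usual RCU conventions (variable names are illustrative):

        struct fifo_buffer *plan;

        rcu_read_lock();
        plan = rcu_dereference(mdev->rs_plan_s);
        /* dereference plan only inside the read-side critical section */
        rcu_read_unlock();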
  
- static inline void drbd_set_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
-       set_bit(f, &mdev->drbd_flags[0]);
- }
- static inline void drbd_clear_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
-       clear_bit(f, &mdev->drbd_flags[0]);
- }
- static inline int drbd_test_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
-       return test_bit(f, &mdev->drbd_flags[0]);
- }
- static inline int drbd_test_and_set_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
-       return test_and_set_bit(f, &mdev->drbd_flags[0]);
- }
- static inline int drbd_test_and_clear_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
-       return test_and_clear_bit(f, &mdev->drbd_flags[0]);
- }
  static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
  {
-       struct drbd_conf *mdev;
-       mdev = minor < minor_count ? minor_table[minor] : NULL;
-       return mdev;
+       return (struct drbd_conf *)idr_find(&minors, minor);
  }
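
With the fixed minor_table[] gone, devices are registered in the global `minors` idr, and minor_to_mdev() becomes a plain idr_find(), which returns the pointer stored for that id or NULL. A caller sketch (error handling is illustrative):

        struct drbd_conf *mdev = minor_to_mdev(minor);

        if (!mdev)
                return NULL;    /* no device registered under this minor */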
  
  static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
@@@ -1468,17 -1300,20 +1299,20 @@@ struct bm_extent 
  #endif
  #endif
  
- /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
-  * With a value of 8, all IO in one 128K block makes it to the same slot of the
-  * hash table. */
- #define HT_SHIFT 8
- #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
+ /* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE,
+  * so for a typical PAGE_CACHE_SIZE of 4k, that is (1<<20) bytes.
+  * Since we may live in a mixed-platform cluster,
+  * we limit ourselves to a platform-agnostic constant here for now.
+  * A follow-up commit may allow even bigger BIO sizes,
+  * once we have thought that through. */
 -#define DRBD_MAX_BIO_SIZE (1 << 20)
++#define DRBD_MAX_BIO_SIZE (1U << 20)
+ #if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+ #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+ #endif
 -#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12)       /* Works always = 4k */
 +#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12)       /* Works always = 4k */
  
- #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */
- /* Number of elements in the app_reads_hash */
- #define APP_R_HSIZE 15
 -#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* Header 80 only allows packets up to 32KiB data */
 -#define DRBD_MAX_BIO_SIZE_P95    (1 << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
++#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
++#define DRBD_MAX_BIO_SIZE_P95    (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
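
Worked out with 512-byte sectors, purely as arithmetic on the constants above:

        /* DRBD_MAX_BIO_SIZE        = 1U << 20 = 1 MiB   = 2048 sectors
         * DRBD_MAX_BIO_SIZE_SAFE   = 1U << 12 = 4 KiB   =    8 sectors
         * DRBD_MAX_SIZE_H80_PACKET = 1U << 15 = 32 KiB  =   64 sectors
         * DRBD_MAX_BIO_SIZE_P95    = 1U << 17 = 128 KiB =  256 sectors */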
  
  extern int  drbd_bm_init(struct drbd_conf *mdev);
  extern int  drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
@@@ -1575,7 -1419,8 +1418,8 @@@ extern void conn_free_crypto(struct drb
  extern int proc_details;
  
  /* drbd_req */
 -extern int drbd_make_request(struct request_queue *q, struct bio *bio);
+ extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
 +extern void drbd_make_request(struct request_queue *q, struct bio *bio);
  extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
  extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
  extern int is_valid_ar_handle(struct drbd_request *, sector_t);
index 0000000,0e53f10..89c497c
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,177 +1,207 @@@
 - * update_interval_end  -  recompute end of @node
++#include <asm/bug.h>
++#include <linux/rbtree_augmented.h>
+ #include "drbd_interval.h"
+ /**
+  * interval_end  -  return end of @node
+  */
+ static inline
+ sector_t interval_end(struct rb_node *node)
+ {
+       struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
+       return this->end;
+ }
+ /**
 -static void
 -update_interval_end(struct rb_node *node, void *__unused)
++ * compute_subtree_last  -  compute end of @node
+  *
+  * The end of an interval is the highest (start + (size >> 9)) value of this
+  * node and of its children.  Called for @node and its parents whenever the end
+  * may have changed.
+  */
 -      struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
 -      sector_t end;
++static inline sector_t
++compute_subtree_last(struct drbd_interval *node)
+ {
 -      end = this->sector + (this->size >> 9);
 -      if (node->rb_left) {
 -              sector_t left = interval_end(node->rb_left);
 -              if (left > end)
 -                      end = left;
++      sector_t max = node->sector + (node->size >> 9);
 -      if (node->rb_right) {
 -              sector_t right = interval_end(node->rb_right);
 -              if (right > end)
 -                      end = right;
++      if (node->rb.rb_left) {
++              sector_t left = interval_end(node->rb.rb_left);
++              if (left > max)
++                      max = left;
++      }
++      if (node->rb.rb_right) {
++              sector_t right = interval_end(node->rb.rb_right);
++              if (right > max)
++                      max = right;
+       }
 -      this->end = end;
++      return max;
++}
++
++static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
++{
++      while (rb != stop) {
++              struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb);
++              sector_t subtree_last = compute_subtree_last(node);
++              if (node->end == subtree_last)
++                      break;
++              node->end = subtree_last;
++              rb = rb_parent(&node->rb);
+       }
 -      rb_insert_color(&this->rb, root);
 -      rb_augment_insert(&this->rb, update_interval_end, NULL);
+ }
++static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
++{
++      struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
++      struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
++
++      new->end = old->end;
++}
++
++static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
++{
++      struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
++      struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
++
++      new->end = old->end;
++      old->end = compute_subtree_last(old);
++}
++
++static const struct rb_augment_callbacks augment_callbacks = {
++      augment_propagate,
++      augment_copy,
++      augment_rotate,
++};
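
The propagate/copy/rotate triple above is the standard boilerplate for an augmented rbtree; <linux/rbtree_augmented.h> of this kernel generation can also generate it from the compute function. A sketch assuming that macro's signature (the patch itself spells the callbacks out):

        RB_DECLARE_CALLBACKS(static, augment_callbacks,
                             struct drbd_interval, rb,
                             sector_t, end, compute_subtree_last);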
++
+ /**
+  * drbd_insert_interval  -  insert a new interval into a tree
+  */
+ bool
+ drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
+ {
+       struct rb_node **new = &root->rb_node, *parent = NULL;
+       BUG_ON(!IS_ALIGNED(this->size, 512));
+       while (*new) {
+               struct drbd_interval *here =
+                       rb_entry(*new, struct drbd_interval, rb);
+               parent = *new;
+               if (this->sector < here->sector)
+                       new = &(*new)->rb_left;
+               else if (this->sector > here->sector)
+                       new = &(*new)->rb_right;
+               else if (this < here)
+                       new = &(*new)->rb_left;
+               else if (this > here)
+                       new = &(*new)->rb_right;
+               else
+                       return false;
+       }
+       rb_link_node(&this->rb, parent, new);
 -      struct rb_node *deepest;
 -
 -      deepest = rb_augment_erase_begin(&this->rb);
 -      rb_erase(&this->rb, root);
 -      rb_augment_erase_end(deepest, update_interval_end, NULL);
++      rb_insert_augmented(&this->rb, root, &augment_callbacks);
+       return true;
+ }
+ /**
+  * drbd_contains_interval  -  check if a tree contains a given interval
+  * @sector:   start sector of @interval
+  * @interval: may not be a valid pointer
+  *
+  * Returns whether the tree contains the node @interval with start sector
+  * @sector.  Does not dereference @interval until @interval is known to be
+  * a valid object in the tree.  Returns %false if @interval is in the tree but with a different
+  * sector number.
+  */
+ bool
+ drbd_contains_interval(struct rb_root *root, sector_t sector,
+                      struct drbd_interval *interval)
+ {
+       struct rb_node *node = root->rb_node;
+       while (node) {
+               struct drbd_interval *here =
+                       rb_entry(node, struct drbd_interval, rb);
+               if (sector < here->sector)
+                       node = node->rb_left;
+               else if (sector > here->sector)
+                       node = node->rb_right;
+               else if (interval < here)
+                       node = node->rb_left;
+               else if (interval > here)
+                       node = node->rb_right;
+               else
+                       return true;
+       }
+       return false;
+ }
+ /**
+  * drbd_remove_interval  -  remove an interval from a tree
+  */
+ void
+ drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
+ {
++      rb_erase_augmented(&this->rb, root, &augment_callbacks);
+ }
+ /**
+  * drbd_find_overlap  - search for an interval overlapping with [sector, sector + size)
+  * @sector:   start sector
+  * @size:     size, aligned to 512 bytes
+  *
+  * Returns an interval overlapping with [sector, sector + size), or NULL if
+  * there is none.  When there is more than one overlapping interval in the
+  * tree, the interval with the lowest start sector is returned, and all other
+  * overlapping intervals will be on the right side of the tree, reachable with
+  * rb_next().
+  */
+ struct drbd_interval *
+ drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
+ {
+       struct rb_node *node = root->rb_node;
+       struct drbd_interval *overlap = NULL;
+       sector_t end = sector + (size >> 9);
+       BUG_ON(!IS_ALIGNED(size, 512));
+       while (node) {
+               struct drbd_interval *here =
+                       rb_entry(node, struct drbd_interval, rb);
+               if (node->rb_left &&
+                   sector < interval_end(node->rb_left)) {
+                       /* Overlap if any must be on left side */
+                       node = node->rb_left;
+               } else if (here->sector < end &&
+                          sector < here->sector + (here->size >> 9)) {
+                       overlap = here;
+                       break;
+               } else if (sector >= here->sector) {
+                       /* Overlap if any must be on right side */
+                       node = node->rb_right;
+               } else
+                       break;
+       }
+       return overlap;
+ }
+ struct drbd_interval *
+ drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
+ {
+       sector_t end = sector + (size >> 9);
+       struct rb_node *node;
+       for (;;) {
+               node = rb_next(&i->rb);
+               if (!node)
+                       return NULL;
+               i = rb_entry(node, struct drbd_interval, rb);
+               if (i->sector >= end)
+                       return NULL;
+               if (sector < i->sector + (i->size >> 9))
+                       return i;
+       }
+ }
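
drbd_find_overlap() returns the overlapping interval with the lowest start sector and guarantees any further overlaps are reachable via rb_next(), which drbd_next_overlap() walks. A caller sketch:

        struct drbd_interval *i;

        for (i = drbd_find_overlap(root, sector, size); i;
             i = drbd_next_overlap(i, sector, size)) {
                /* every i here intersects [sector, sector + (size >> 9)) */
        }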
@@@ -118,9 -105,8 +105,8 @@@ module_param(fault_devs, int, 0644)
  
  /* module parameter, defined */
  unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
 -int disable_sendpage;
 -int allow_oos;
 +bool disable_sendpage;
 +bool allow_oos;
- unsigned int cn_idx = CN_IDX_DRBD;
  int proc_details;       /* Detail level in /proc/drbd */
  
  /* Module parameter for setting the user mode helper program
@@@ -158,1609 -145,184 +145,184 @@@ DEFINE_RATELIMIT_STATE(drbd_ratelimit_s
  
  static const struct block_device_operations drbd_ops = {
        .owner =   THIS_MODULE,
-       .open =    drbd_open,
-       .release = drbd_release,
- };
- struct bio *bio_alloc_drbd(gfp_t gfp_mask)
- {
-       if (!drbd_md_io_bio_set)
-               return bio_alloc(gfp_mask, 1);
-       return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
- }
- #ifdef __CHECKER__
- /* When checking with sparse, and this is an inline function, sparse will
-    give tons of false positives. When this is a real function, sparse works.
-  */
- int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
- {
-       int io_allowed;
-       atomic_inc(&mdev->local_cnt);
-       io_allowed = (mdev->state.disk >= mins);
-       if (!io_allowed) {
-               if (atomic_dec_and_test(&mdev->local_cnt))
-                       wake_up(&mdev->misc_wait);
-       }
-       return io_allowed;
- }
- #endif
- /**
-  * DOC: The transfer log
-  *
-  * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
-  * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
-  * of the list. There is always at least one &struct drbd_tl_epoch object.
-  *
-  * Each &struct drbd_tl_epoch has a circular double linked list of requests
-  * attached.
-  */
- static int tl_init(struct drbd_conf *mdev)
- {
-       struct drbd_tl_epoch *b;
-       /* during device minor initialization, we may well use GFP_KERNEL */
-       b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
-       if (!b)
-               return 0;
-       INIT_LIST_HEAD(&b->requests);
-       INIT_LIST_HEAD(&b->w.list);
-       b->next = NULL;
-       b->br_number = 4711;
-       b->n_writes = 0;
-       b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
-       mdev->oldest_tle = b;
-       mdev->newest_tle = b;
-       INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
-       INIT_LIST_HEAD(&mdev->barrier_acked_requests);
-       mdev->tl_hash = NULL;
-       mdev->tl_hash_s = 0;
-       return 1;
- }
- static void tl_cleanup(struct drbd_conf *mdev)
- {
-       D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
-       D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
-       kfree(mdev->oldest_tle);
-       mdev->oldest_tle = NULL;
-       kfree(mdev->unused_spare_tle);
-       mdev->unused_spare_tle = NULL;
-       kfree(mdev->tl_hash);
-       mdev->tl_hash = NULL;
-       mdev->tl_hash_s = 0;
- }
- /**
-  * _tl_add_barrier() - Adds a barrier to the transfer log
-  * @mdev:     DRBD device.
-  * @new:      Barrier to be added before the current head of the TL.
-  *
-  * The caller must hold the req_lock.
-  */
- void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
- {
-       struct drbd_tl_epoch *newest_before;
-       INIT_LIST_HEAD(&new->requests);
-       INIT_LIST_HEAD(&new->w.list);
-       new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
-       new->next = NULL;
-       new->n_writes = 0;
-       newest_before = mdev->newest_tle;
-       new->br_number = newest_before->br_number+1;
-       if (mdev->newest_tle != new) {
-               mdev->newest_tle->next = new;
-               mdev->newest_tle = new;
-       }
- }
- /**
-  * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
-  * @mdev:     DRBD device.
-  * @barrier_nr:       Expected identifier of the DRBD write barrier packet.
-  * @set_size: Expected number of requests before that barrier.
-  *
-  * In case the passed barrier_nr or set_size does not match the oldest
-  * &struct drbd_tl_epoch objects this function will cause a termination
-  * of the connection.
-  */
- void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
-                      unsigned int set_size)
- {
-       struct drbd_tl_epoch *b, *nob; /* next old barrier */
-       struct list_head *le, *tle;
-       struct drbd_request *r;
-       spin_lock_irq(&mdev->req_lock);
-       b = mdev->oldest_tle;
-       /* first some paranoia code */
-       if (b == NULL) {
-               dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
-                       barrier_nr);
-               goto bail;
-       }
-       if (b->br_number != barrier_nr) {
-               dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
-                       barrier_nr, b->br_number);
-               goto bail;
-       }
-       if (b->n_writes != set_size) {
-               dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
-                       barrier_nr, set_size, b->n_writes);
-               goto bail;
-       }
-       /* Clean up list of requests processed during current epoch */
-       list_for_each_safe(le, tle, &b->requests) {
-               r = list_entry(le, struct drbd_request, tl_requests);
-               _req_mod(r, barrier_acked);
-       }
-       /* There could be requests on the list waiting for completion
-          of the write to the local disk. To avoid corruptions of
-          slab's data structures we have to remove the lists head.
-          Also there could have been a barrier ack out of sequence, overtaking
-          the write acks - which would be a bug and violating write ordering.
-          To not deadlock in case we lose connection while such requests are
-          still pending, we need some way to find them for the
-          _req_mod(connection_lost_while_pending).
-          These have been list_move'd to the out_of_sequence_requests list in
-          _req_mod(, barrier_acked) above.
-          */
-       list_splice_init(&b->requests, &mdev->barrier_acked_requests);
-       nob = b->next;
-       if (drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) {
-               _tl_add_barrier(mdev, b);
-               if (nob)
-                       mdev->oldest_tle = nob;
-               /* if nob == NULL b was the only barrier, and becomes the new
-                  barrier. Therefore mdev->oldest_tle points already to b */
-       } else {
-               D_ASSERT(nob != NULL);
-               mdev->oldest_tle = nob;
-               kfree(b);
-       }
-       spin_unlock_irq(&mdev->req_lock);
-       dec_ap_pending(mdev);
-       return;
- bail:
-       spin_unlock_irq(&mdev->req_lock);
-       drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- }
- /**
-  * _tl_restart() - Walks the transfer log, and applies an action to all requests
-  * @mdev:     DRBD device.
-  * @what:       The action/event to perform with all request objects
-  *
-  * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
-  * restart_frozen_disk_io.
-  */
- static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
- {
-       struct drbd_tl_epoch *b, *tmp, **pn;
-       struct list_head *le, *tle, carry_reads;
-       struct drbd_request *req;
-       int rv, n_writes, n_reads;
-       b = mdev->oldest_tle;
-       pn = &mdev->oldest_tle;
-       while (b) {
-               n_writes = 0;
-               n_reads = 0;
-               INIT_LIST_HEAD(&carry_reads);
-               list_for_each_safe(le, tle, &b->requests) {
-                       req = list_entry(le, struct drbd_request, tl_requests);
-                       rv = _req_mod(req, what);
-                       n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
-                       n_reads  += (rv & MR_READ) >> MR_READ_SHIFT;
-               }
-               tmp = b->next;
-               if (n_writes) {
-                       if (what == resend) {
-                               b->n_writes = n_writes;
-                               if (b->w.cb == NULL) {
-                                       b->w.cb = w_send_barrier;
-                                       inc_ap_pending(mdev);
-                                       drbd_set_flag(mdev, CREATE_BARRIER);
-                               }
-                               drbd_queue_work(&mdev->data.work, &b->w);
-                       }
-                       pn = &b->next;
-               } else {
-                       if (n_reads)
-                               list_add(&carry_reads, &b->requests);
-                       /* there could still be requests on that ring list,
-                        * in case local io is still pending */
-                       list_del(&b->requests);
-                       /* dec_ap_pending corresponding to queue_barrier.
-                        * the newest barrier may not have been queued yet,
-                        * in which case w.cb is still NULL. */
-                       if (b->w.cb != NULL)
-                               dec_ap_pending(mdev);
-                       if (b == mdev->newest_tle) {
-                               /* recycle, but reinit! */
-                               D_ASSERT(tmp == NULL);
-                               INIT_LIST_HEAD(&b->requests);
-                               list_splice(&carry_reads, &b->requests);
-                               INIT_LIST_HEAD(&b->w.list);
-                               b->w.cb = NULL;
-                               b->br_number = net_random();
-                               b->n_writes = 0;
-                               *pn = b;
-                               break;
-                       }
-                       *pn = tmp;
-                       kfree(b);
-               }
-               b = tmp;
-               list_splice(&carry_reads, &b->requests);
-       }
-       /* Actions operating on the disk state, also want to work on
-          requests that got barrier acked. */
-       list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
-               req = list_entry(le, struct drbd_request, tl_requests);
-               _req_mod(req, what);
-       }
- }
- /**
-  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
-  * @mdev:     DRBD device.
-  *
-  * This is called after the connection to the peer was lost. The storage covered
-  * by the requests on the transfer log gets marked as out of sync. Called from the
-  * receiver thread and the worker thread.
-  */
- void tl_clear(struct drbd_conf *mdev)
- {
-       spin_lock_irq(&mdev->req_lock);
-       _tl_clear(mdev);
-       spin_unlock_irq(&mdev->req_lock);
- }
- static void _tl_clear(struct drbd_conf *mdev)
- {
-       struct list_head *le, *tle;
-       struct drbd_request *r;
-       _tl_restart(mdev, connection_lost_while_pending);
-       /* we expect this list to be empty. */
-       D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
-       /* but just in case, clean it up anyways! */
-       list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
-               r = list_entry(le, struct drbd_request, tl_requests);
-               /* It would be nice to complete outside of spinlock.
-                * But this is easier for now. */
-               _req_mod(r, connection_lost_while_pending);
-       }
-       /* ensure bit indicating barrier is required is clear */
-       drbd_clear_flag(mdev, CREATE_BARRIER);
-       memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
- }
- void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
- {
-       spin_lock_irq(&mdev->req_lock);
-       _tl_restart(mdev, what);
-       spin_unlock_irq(&mdev->req_lock);
- }
- /**
-  * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
-  * @mdev:     DRBD device.
-  */
- void tl_abort_disk_io(struct drbd_conf *mdev)
- {
-       struct drbd_tl_epoch *b;
-       struct list_head *le, *tle;
-       struct drbd_request *req;
-       spin_lock_irq(&mdev->req_lock);
-       b = mdev->oldest_tle;
-       while (b) {
-               list_for_each_safe(le, tle, &b->requests) {
-                       req = list_entry(le, struct drbd_request, tl_requests);
-                       if (!(req->rq_state & RQ_LOCAL_PENDING))
-                               continue;
-                       _req_mod(req, abort_disk_io);
-               }
-               b = b->next;
-       }
-       list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
-               req = list_entry(le, struct drbd_request, tl_requests);
-               if (!(req->rq_state & RQ_LOCAL_PENDING))
-                       continue;
-               _req_mod(req, abort_disk_io);
-       }
-       spin_unlock_irq(&mdev->req_lock);
- }
- /**
-  * cl_wide_st_chg() - true if the state change is a cluster wide one
-  * @mdev:     DRBD device.
-  * @os:               old (current) state.
-  * @ns:               new (wanted) state.
-  */
- static int cl_wide_st_chg(struct drbd_conf *mdev,
-                         union drbd_state os, union drbd_state ns)
- {
-       return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
-                ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
-                 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
-                 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
-                 (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
-               (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
-               (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
- }
- enum drbd_state_rv
- drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
-                 union drbd_state mask, union drbd_state val)
- {
-       unsigned long flags;
-       union drbd_state os, ns;
-       enum drbd_state_rv rv;
-       spin_lock_irqsave(&mdev->req_lock, flags);
-       os = mdev->state;
-       ns.i = (os.i & ~mask.i) | val.i;
-       rv = _drbd_set_state(mdev, ns, f, NULL);
-       ns = mdev->state;
-       spin_unlock_irqrestore(&mdev->req_lock, flags);
-       return rv;
- }
- /**
-  * drbd_force_state() - Impose a change which happens outside our control on our state
-  * @mdev:     DRBD device.
-  * @mask:     mask of state bits to change.
-  * @val:      value of new state bits.
-  */
- void drbd_force_state(struct drbd_conf *mdev,
-       union drbd_state mask, union drbd_state val)
- {
-       drbd_change_state(mdev, CS_HARD, mask, val);
- }
- static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
- static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
-                                                   union drbd_state,
-                                                   union drbd_state);
- enum sanitize_state_warnings {
-       NO_WARNING,
-       ABORTED_ONLINE_VERIFY,
-       ABORTED_RESYNC,
-       CONNECTION_LOST_NEGOTIATING,
-       IMPLICITLY_UPGRADED_DISK,
-       IMPLICITLY_UPGRADED_PDSK,
- };
- static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
-                                      union drbd_state ns, enum sanitize_state_warnings *warn);
- int drbd_send_state_req(struct drbd_conf *,
-                       union drbd_state, union drbd_state);
- static enum drbd_state_rv
- _req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
-            union drbd_state val)
- {
-       union drbd_state os, ns;
-       unsigned long flags;
-       enum drbd_state_rv rv;
-       if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_SUCCESS))
-               return SS_CW_SUCCESS;
-       if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_FAIL))
-               return SS_CW_FAILED_BY_PEER;
-       rv = 0;
-       spin_lock_irqsave(&mdev->req_lock, flags);
-       os = mdev->state;
-       ns.i = (os.i & ~mask.i) | val.i;
-       ns = sanitize_state(mdev, os, ns, NULL);
-       if (!cl_wide_st_chg(mdev, os, ns))
-               rv = SS_CW_NO_NEED;
-       if (!rv) {
-               rv = is_valid_state(mdev, ns);
-               if (rv == SS_SUCCESS) {
-                       rv = is_valid_state_transition(mdev, ns, os);
-                       if (rv == SS_SUCCESS)
-                               rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
-               }
-       }
-       spin_unlock_irqrestore(&mdev->req_lock, flags);
-       return rv;
- }
- /**
-  * drbd_req_state() - Perform an eventually cluster wide state change
-  * @mdev:     DRBD device.
-  * @mask:     mask of state bits to change.
-  * @val:      value of new state bits.
-  * @f:                flags
-  *
-  * Should not be called directly, use drbd_request_state() or
-  * _drbd_request_state().
-  */
- static enum drbd_state_rv
- drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
-              union drbd_state val, enum chg_state_flags f)
- {
-       struct completion done;
-       unsigned long flags;
-       union drbd_state os, ns;
-       enum drbd_state_rv rv;
-       init_completion(&done);
-       if (f & CS_SERIALIZE)
-               mutex_lock(&mdev->state_mutex);
-       spin_lock_irqsave(&mdev->req_lock, flags);
-       os = mdev->state;
-       ns.i = (os.i & ~mask.i) | val.i;
-       ns = sanitize_state(mdev, os, ns, NULL);
-       if (cl_wide_st_chg(mdev, os, ns)) {
-               rv = is_valid_state(mdev, ns);
-               if (rv == SS_SUCCESS)
-                       rv = is_valid_state_transition(mdev, ns, os);
-               spin_unlock_irqrestore(&mdev->req_lock, flags);
-               if (rv < SS_SUCCESS) {
-                       if (f & CS_VERBOSE)
-                               print_st_err(mdev, os, ns, rv);
-                       goto abort;
-               }
-               drbd_state_lock(mdev);
-               if (!drbd_send_state_req(mdev, mask, val)) {
-                       drbd_state_unlock(mdev);
-                       rv = SS_CW_FAILED_BY_PEER;
-                       if (f & CS_VERBOSE)
-                               print_st_err(mdev, os, ns, rv);
-                       goto abort;
-               }
-               if (mask.conn == C_MASK && val.conn == C_DISCONNECTING)
-                       drbd_set_flag(mdev, DISCONNECT_SENT);
-               wait_event(mdev->state_wait,
-                       (rv = _req_st_cond(mdev, mask, val)));
-               if (rv < SS_SUCCESS) {
-                       drbd_state_unlock(mdev);
-                       if (f & CS_VERBOSE)
-                               print_st_err(mdev, os, ns, rv);
-                       goto abort;
-               }
-               spin_lock_irqsave(&mdev->req_lock, flags);
-               os = mdev->state;
-               ns.i = (os.i & ~mask.i) | val.i;
-               rv = _drbd_set_state(mdev, ns, f, &done);
-               drbd_state_unlock(mdev);
-       } else {
-               rv = _drbd_set_state(mdev, ns, f, &done);
-       }
-       spin_unlock_irqrestore(&mdev->req_lock, flags);
-       if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
-               D_ASSERT(current != mdev->worker.task);
-               wait_for_completion(&done);
-       }
- abort:
-       if (f & CS_SERIALIZE)
-               mutex_unlock(&mdev->state_mutex);
-       return rv;
- }
- /**
-  * _drbd_request_state() - Request a state change (with flags)
-  * @mdev:     DRBD device.
-  * @mask:     mask of state bits to change.
-  * @val:      value of new state bits.
-  * @f:                flags
-  *
-  * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
-  * flag, or when logging of failed state change requests is not desired.
-  */
- enum drbd_state_rv
- _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
-                   union drbd_state val, enum chg_state_flags f)
- {
-       enum drbd_state_rv rv;
-       wait_event(mdev->state_wait,
-                  (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
-       return rv;
- }
- static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
- {
-       dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
-           name,
-           drbd_conn_str(ns.conn),
-           drbd_role_str(ns.role),
-           drbd_role_str(ns.peer),
-           drbd_disk_str(ns.disk),
-           drbd_disk_str(ns.pdsk),
-           is_susp(ns) ? 's' : 'r',
-           ns.aftr_isp ? 'a' : '-',
-           ns.peer_isp ? 'p' : '-',
-           ns.user_isp ? 'u' : '-'
-           );
- }
- void print_st_err(struct drbd_conf *mdev, union drbd_state os,
-                 union drbd_state ns, enum drbd_state_rv err)
- {
-       if (err == SS_IN_TRANSIENT_STATE)
-               return;
-       dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
-       print_st(mdev, " state", os);
-       print_st(mdev, "wanted", ns);
- }
- /**
-  * is_valid_state() - Returns an SS_ error code if ns is not valid
-  * @mdev:     DRBD device.
-  * @ns:               State to consider.
-  */
- static enum drbd_state_rv
- is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
- {
-       /* See drbd_state_sw_errors in drbd_strings.c */
-       enum drbd_fencing_p fp;
-       enum drbd_state_rv rv = SS_SUCCESS;
-       fp = FP_DONT_CARE;
-       if (get_ldev(mdev)) {
-               fp = mdev->ldev->dc.fencing;
-               put_ldev(mdev);
-       }
-       if (get_net_conf(mdev)) {
-               if (!mdev->net_conf->two_primaries &&
-                   ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
-                       rv = SS_TWO_PRIMARIES;
-               put_net_conf(mdev);
-       }
-       if (rv <= 0)
-               /* already found a reason to abort */;
-       else if (ns.role == R_SECONDARY && mdev->open_cnt)
-               rv = SS_DEVICE_IN_USE;
-       else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
-               rv = SS_NO_UP_TO_DATE_DISK;
-       else if (fp >= FP_RESOURCE &&
-                ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
-               rv = SS_PRIMARY_NOP;
-       else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
-               rv = SS_NO_UP_TO_DATE_DISK;
-       else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
-               rv = SS_NO_LOCAL_DISK;
-       else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
-               rv = SS_NO_REMOTE_DISK;
-       else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
-               rv = SS_NO_UP_TO_DATE_DISK;
-       else if ((ns.conn == C_CONNECTED ||
-                 ns.conn == C_WF_BITMAP_S ||
-                 ns.conn == C_SYNC_SOURCE ||
-                 ns.conn == C_PAUSED_SYNC_S) &&
-                 ns.disk == D_OUTDATED)
-               rv = SS_CONNECTED_OUTDATES;
-       else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
-                (mdev->sync_conf.verify_alg[0] == 0))
-               rv = SS_NO_VERIFY_ALG;
-       else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
-                 mdev->agreed_pro_version < 88)
-               rv = SS_NOT_SUPPORTED;
-       else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
-               rv = SS_CONNECTED_OUTDATES;
-       return rv;
- }
- /**
-  * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
-  * @mdev:     DRBD device.
-  * @ns:               new state.
-  * @os:               old state.
-  */
- static enum drbd_state_rv
- is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
-                         union drbd_state os)
- {
-       enum drbd_state_rv rv = SS_SUCCESS;
-       if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
-           os.conn > C_CONNECTED)
-               rv = SS_RESYNC_RUNNING;
-       if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
-               rv = SS_ALREADY_STANDALONE;
-       if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
-               rv = SS_IS_DISKLESS;
-       if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
-               rv = SS_NO_NET_CONFIG;
-       if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
-               rv = SS_LOWER_THAN_OUTDATED;
-       if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
-               rv = SS_IN_TRANSIENT_STATE;
-       if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
-               rv = SS_IN_TRANSIENT_STATE;
-       /* While establishing a connection only allow cstate to change.
-          Delay/refuse role changes, detach attach etc... */
-       if (drbd_test_flag(mdev, STATE_SENT) &&
-           !(os.conn == C_WF_REPORT_PARAMS ||
-             (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
-               rv = SS_IN_TRANSIENT_STATE;
-       if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
-               rv = SS_NEED_CONNECTION;
-       if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
-           ns.conn != os.conn && os.conn > C_CONNECTED)
-               rv = SS_RESYNC_RUNNING;
-       if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
-           os.conn < C_CONNECTED)
-               rv = SS_NEED_CONNECTION;
-       if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
-           && os.conn < C_WF_REPORT_PARAMS)
-               rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
-       return rv;
- }
- static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
- {
-       static const char *msg_table[] = {
-               [NO_WARNING] = "",
-               [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
-               [ABORTED_RESYNC] = "Resync aborted.",
-               [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
-               [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
-               [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
-       };
-       if (warn != NO_WARNING)
-               dev_warn(DEV, "%s\n", msg_table[warn]);
- }
- /**
-  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
-  * @mdev:     DRBD device.
-  * @os:               old state.
-  * @ns:               new state.
-  * @warn_sync_abort:
-  *
-  * When we lose connection, we have to set the state of the peer's disk (pdsk)
-  * to D_UNKNOWN. This rule and many more along those lines are in this function.
-  */
- static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
-                                      union drbd_state ns, enum sanitize_state_warnings *warn)
- {
-       enum drbd_fencing_p fp;
-       enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
-       if (warn)
-               *warn = NO_WARNING;
-       fp = FP_DONT_CARE;
-       if (get_ldev(mdev)) {
-               fp = mdev->ldev->dc.fencing;
-               put_ldev(mdev);
-       }
-       /* Disallow Network errors to configure a device's network part */
-       if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
-           os.conn <= C_DISCONNECTING)
-               ns.conn = os.conn;
-       /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
-        * If you try to go into some Sync* state, that shall fail (elsewhere). */
-       if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
-           ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
-               ns.conn = os.conn;
-       /* we cannot fail (again) if we already detached */
-       if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
-               ns.disk = D_DISKLESS;
-       /* After C_DISCONNECTING only C_STANDALONE may follow */
-       if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
-               ns.conn = os.conn;
-       if (ns.conn < C_CONNECTED) {
-               ns.peer_isp = 0;
-               ns.peer = R_UNKNOWN;
-               if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
-                       ns.pdsk = D_UNKNOWN;
-       }
-       /* Clear the aftr_isp when becoming unconfigured */
-       if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
-               ns.aftr_isp = 0;
-       /* Abort resync if a disk fails/detaches */
-       if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
-           (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
-               if (warn)
-                       *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
-                               ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
-               ns.conn = C_CONNECTED;
-       }
-       /* Connection breaks down before we finished "Negotiating" */
-       if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
-           get_ldev_if_state(mdev, D_NEGOTIATING)) {
-               if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
-                       ns.disk = mdev->new_state_tmp.disk;
-                       ns.pdsk = mdev->new_state_tmp.pdsk;
-               } else {
-                       if (warn)
-                               *warn = CONNECTION_LOST_NEGOTIATING;
-                       ns.disk = D_DISKLESS;
-                       ns.pdsk = D_UNKNOWN;
-               }
-               put_ldev(mdev);
-       }
-       /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
-       if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
-               if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
-                       ns.disk = D_UP_TO_DATE;
-               if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
-                       ns.pdsk = D_UP_TO_DATE;
-       }
-       /* Implications of the connection state on the disk states */
-       disk_min = D_DISKLESS;
-       disk_max = D_UP_TO_DATE;
-       pdsk_min = D_INCONSISTENT;
-       pdsk_max = D_UNKNOWN;
-       switch ((enum drbd_conns)ns.conn) {
-       case C_WF_BITMAP_T:
-       case C_PAUSED_SYNC_T:
-       case C_STARTING_SYNC_T:
-       case C_WF_SYNC_UUID:
-       case C_BEHIND:
-               disk_min = D_INCONSISTENT;
-               disk_max = D_OUTDATED;
-               pdsk_min = D_UP_TO_DATE;
-               pdsk_max = D_UP_TO_DATE;
-               break;
-       case C_VERIFY_S:
-       case C_VERIFY_T:
-               disk_min = D_UP_TO_DATE;
-               disk_max = D_UP_TO_DATE;
-               pdsk_min = D_UP_TO_DATE;
-               pdsk_max = D_UP_TO_DATE;
-               break;
-       case C_CONNECTED:
-               disk_min = D_DISKLESS;
-               disk_max = D_UP_TO_DATE;
-               pdsk_min = D_DISKLESS;
-               pdsk_max = D_UP_TO_DATE;
-               break;
-       case C_WF_BITMAP_S:
-       case C_PAUSED_SYNC_S:
-       case C_STARTING_SYNC_S:
-       case C_AHEAD:
-               disk_min = D_UP_TO_DATE;
-               disk_max = D_UP_TO_DATE;
-               pdsk_min = D_INCONSISTENT;
-               pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
-               break;
-       case C_SYNC_TARGET:
-               disk_min = D_INCONSISTENT;
-               disk_max = D_INCONSISTENT;
-               pdsk_min = D_UP_TO_DATE;
-               pdsk_max = D_UP_TO_DATE;
-               break;
-       case C_SYNC_SOURCE:
-               disk_min = D_UP_TO_DATE;
-               disk_max = D_UP_TO_DATE;
-               pdsk_min = D_INCONSISTENT;
-               pdsk_max = D_INCONSISTENT;
-               break;
-       case C_STANDALONE:
-       case C_DISCONNECTING:
-       case C_UNCONNECTED:
-       case C_TIMEOUT:
-       case C_BROKEN_PIPE:
-       case C_NETWORK_FAILURE:
-       case C_PROTOCOL_ERROR:
-       case C_TEAR_DOWN:
-       case C_WF_CONNECTION:
-       case C_WF_REPORT_PARAMS:
-       case C_MASK:
-               break;
-       }
-       if (ns.disk > disk_max)
-               ns.disk = disk_max;
-       if (ns.disk < disk_min) {
-               if (warn)
-                       *warn = IMPLICITLY_UPGRADED_DISK;
-               ns.disk = disk_min;
-       }
-       if (ns.pdsk > pdsk_max)
-               ns.pdsk = pdsk_max;
-       if (ns.pdsk < pdsk_min) {
-               if (warn)
-                       *warn = IMPLICITLY_UPGRADED_PDSK;
-               ns.pdsk = pdsk_min;
-       }
-       if (fp == FP_STONITH &&
-           (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
-           !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
-               ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
-       if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
-           (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
-           !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
-               ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data) */
-       if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
-               if (ns.conn == C_SYNC_SOURCE)
-                       ns.conn = C_PAUSED_SYNC_S;
-               if (ns.conn == C_SYNC_TARGET)
-                       ns.conn = C_PAUSED_SYNC_T;
-       } else {
-               if (ns.conn == C_PAUSED_SYNC_S)
-                       ns.conn = C_SYNC_SOURCE;
-               if (ns.conn == C_PAUSED_SYNC_T)
-                       ns.conn = C_SYNC_TARGET;
-       }
-       return ns;
- }
- /* helper for __drbd_set_state */
- static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
- {
-       if (mdev->agreed_pro_version < 90)
-               mdev->ov_start_sector = 0;
-       mdev->rs_total = drbd_bm_bits(mdev);
-       mdev->ov_position = 0;
-       if (cs == C_VERIFY_T) {
-               /* starting online verify from an arbitrary position
-                * does not fit well into the existing protocol.
-                * on C_VERIFY_T, we initialize ov_left and friends
-                * implicitly in receive_DataRequest once the
-                * first P_OV_REQUEST is received */
-               mdev->ov_start_sector = ~(sector_t)0;
-       } else {
-               unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
-               if (bit >= mdev->rs_total) {
-                       mdev->ov_start_sector =
-                               BM_BIT_TO_SECT(mdev->rs_total - 1);
-                       mdev->rs_total = 1;
-               } else
-                       mdev->rs_total -= bit;
-               mdev->ov_position = mdev->ov_start_sector;
-       }
-       mdev->ov_left = mdev->rs_total;
- }
- static void drbd_resume_al(struct drbd_conf *mdev)
- {
-       if (drbd_test_and_clear_flag(mdev, AL_SUSPENDED))
-               dev_info(DEV, "Resumed AL updates\n");
- }
- /**
-  * __drbd_set_state() - Set a new DRBD state
-  * @mdev:     DRBD device.
-  * @ns:               new state.
-  * @flags:    Flags
-  * @done:     Optional completion, that will get completed after the after_state_ch() finished
-  *
-  * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
-  */
- enum drbd_state_rv
- __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
-                enum chg_state_flags flags, struct completion *done)
- {
-       union drbd_state os;
-       enum drbd_state_rv rv = SS_SUCCESS;
-       enum sanitize_state_warnings ssw;
-       struct after_state_chg_work *ascw;
-       os = mdev->state;
-       ns = sanitize_state(mdev, os, ns, &ssw);
-       if (ns.i == os.i)
-               return SS_NOTHING_TO_DO;
-       if (!(flags & CS_HARD)) {
-               /* pre-state-change checks; only look at ns */
-               /* See drbd_state_sw_errors in drbd_strings.c */
-               rv = is_valid_state(mdev, ns);
-               if (rv < SS_SUCCESS) {
-                       /* If the old state was illegal as well, then let
-                          this happen...*/
-                       if (is_valid_state(mdev, os) == rv)
-                               rv = is_valid_state_transition(mdev, ns, os);
-               } else
-                       rv = is_valid_state_transition(mdev, ns, os);
-       }
-       if (rv < SS_SUCCESS) {
-               if (flags & CS_VERBOSE)
-                       print_st_err(mdev, os, ns, rv);
-               return rv;
-       }
-       print_sanitize_warnings(mdev, ssw);
-       {
-       char *pbp, pb[300];
-       pbp = pb;
-       *pbp = 0;
-       if (ns.role != os.role)
-               pbp += sprintf(pbp, "role( %s -> %s ) ",
-                              drbd_role_str(os.role),
-                              drbd_role_str(ns.role));
-       if (ns.peer != os.peer)
-               pbp += sprintf(pbp, "peer( %s -> %s ) ",
-                              drbd_role_str(os.peer),
-                              drbd_role_str(ns.peer));
-       if (ns.conn != os.conn)
-               pbp += sprintf(pbp, "conn( %s -> %s ) ",
-                              drbd_conn_str(os.conn),
-                              drbd_conn_str(ns.conn));
-       if (ns.disk != os.disk)
-               pbp += sprintf(pbp, "disk( %s -> %s ) ",
-                              drbd_disk_str(os.disk),
-                              drbd_disk_str(ns.disk));
-       if (ns.pdsk != os.pdsk)
-               pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
-                              drbd_disk_str(os.pdsk),
-                              drbd_disk_str(ns.pdsk));
-       if (is_susp(ns) != is_susp(os))
-               pbp += sprintf(pbp, "susp( %d -> %d ) ",
-                              is_susp(os),
-                              is_susp(ns));
-       if (ns.aftr_isp != os.aftr_isp)
-               pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
-                              os.aftr_isp,
-                              ns.aftr_isp);
-       if (ns.peer_isp != os.peer_isp)
-               pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
-                              os.peer_isp,
-                              ns.peer_isp);
-       if (ns.user_isp != os.user_isp)
-               pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
-                              os.user_isp,
-                              ns.user_isp);
-       dev_info(DEV, "%s\n", pb);
-       }
-       /* solve the race between becoming unconfigured,
-        * worker doing the cleanup, and
-        * admin reconfiguring us:
-        * on (re)configure, first set CONFIG_PENDING,
-        * then wait for a potentially exiting worker,
-        * start the worker, and schedule one no_op.
-        * then proceed with configuration.
-        */
-       if (ns.disk == D_DISKLESS &&
-           ns.conn == C_STANDALONE &&
-           ns.role == R_SECONDARY &&
-           !drbd_test_and_set_flag(mdev, CONFIG_PENDING))
-               drbd_set_flag(mdev, DEVICE_DYING);
-       /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
-        * on the ldev here, to be sure the transition -> D_DISKLESS resp.
-        * drbd_ldev_destroy() won't happen before our corresponding
-        * after_state_ch works run, where we put_ldev again. */
-       if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
-           (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
-               atomic_inc(&mdev->local_cnt);
-       mdev->state = ns;
-       if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
-               drbd_print_uuids(mdev, "attached to UUIDs");
-       wake_up(&mdev->misc_wait);
-       wake_up(&mdev->state_wait);
-       /* Aborted verify run, or we reached the stop sector.
-        * Log the last position, unless end-of-device. */
-       if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
-           ns.conn <= C_CONNECTED) {
-               mdev->ov_start_sector =
-                       BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
-               if (mdev->ov_left)
-                       dev_info(DEV, "Online Verify reached sector %llu\n",
-                               (unsigned long long)mdev->ov_start_sector);
-       }
-       if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
-           (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
-               dev_info(DEV, "Syncer continues.\n");
-               mdev->rs_paused += (long)jiffies
-                                 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
-               if (ns.conn == C_SYNC_TARGET)
-                       mod_timer(&mdev->resync_timer, jiffies);
-       }
-       if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
-           (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
-               dev_info(DEV, "Resync suspended\n");
-               mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
-       }
-       if (os.conn == C_CONNECTED &&
-           (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
-               unsigned long now = jiffies;
-               int i;
-               set_ov_position(mdev, ns.conn);
-               mdev->rs_start = now;
-               mdev->rs_last_events = 0;
-               mdev->rs_last_sect_ev = 0;
-               mdev->ov_last_oos_size = 0;
-               mdev->ov_last_oos_start = 0;
-               for (i = 0; i < DRBD_SYNC_MARKS; i++) {
-                       mdev->rs_mark_left[i] = mdev->ov_left;
-                       mdev->rs_mark_time[i] = now;
-               }
-               drbd_rs_controller_reset(mdev);
-               if (ns.conn == C_VERIFY_S) {
-                       dev_info(DEV, "Starting Online Verify from sector %llu\n",
-                                       (unsigned long long)mdev->ov_position);
-                       mod_timer(&mdev->resync_timer, jiffies);
-               }
-       }
-       if (get_ldev(mdev)) {
-               u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
-                                                MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
-                                                MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
-               if (drbd_test_flag(mdev, CRASHED_PRIMARY))
-                       mdf |= MDF_CRASHED_PRIMARY;
-               if (mdev->state.role == R_PRIMARY ||
-                   (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
-                       mdf |= MDF_PRIMARY_IND;
-               if (mdev->state.conn > C_WF_REPORT_PARAMS)
-                       mdf |= MDF_CONNECTED_IND;
-               if (mdev->state.disk > D_INCONSISTENT)
-                       mdf |= MDF_CONSISTENT;
-               if (mdev->state.disk > D_OUTDATED)
-                       mdf |= MDF_WAS_UP_TO_DATE;
-               if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
-                       mdf |= MDF_PEER_OUT_DATED;
-               if (mdf != mdev->ldev->md.flags) {
-                       mdev->ldev->md.flags = mdf;
-                       drbd_md_mark_dirty(mdev);
-               }
-               if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
-                       drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
-               put_ldev(mdev);
-       }
-       /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
-       if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
-           os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
-               drbd_set_flag(mdev, CONSIDER_RESYNC);
-       /* Receiver should clean up itself */
-       if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
-               drbd_thread_stop_nowait(&mdev->receiver);
-       /* Now the receiver finished cleaning up itself, it should die */
-       if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
-               drbd_thread_stop_nowait(&mdev->receiver);
-       /* Upon network failure, we need to restart the receiver. */
-       if (os.conn > C_WF_CONNECTION &&
-           ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
-               drbd_thread_restart_nowait(&mdev->receiver);
-       /* Resume AL writing if we get a connection */
-       if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
-               drbd_resume_al(mdev);
-       /* remember last connect and attach times so request_timer_fn() won't
-        * kill newly established sessions while we are still trying to thaw
-        * previously frozen IO */
-       if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
-               mdev->last_reconnect_jif = jiffies;
-       if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
-           ns.disk > D_NEGOTIATING)
-               mdev->last_reattach_jif = jiffies;
-       ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
-       if (ascw) {
-               ascw->os = os;
-               ascw->ns = ns;
-               ascw->flags = flags;
-               ascw->w.cb = w_after_state_ch;
-               ascw->done = done;
-               drbd_queue_work(&mdev->data.work, &ascw->w);
-       } else {
-               dev_warn(DEV, "Could not kmalloc an ascw\n");
-       }
-       return rv;
- }
- static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
- {
-       struct after_state_chg_work *ascw =
-               container_of(w, struct after_state_chg_work, w);
-       after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
-       if (ascw->flags & CS_WAIT_COMPLETE) {
-               D_ASSERT(ascw->done != NULL);
-               complete(ascw->done);
-       }
-       kfree(ascw);
-       return 1;
- }
- static void abw_start_sync(struct drbd_conf *mdev, int rv)
- {
-       if (rv) {
-               dev_err(DEV, "Writing the bitmap failed, not starting resync.\n");
-               _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
-               return;
-       }
-       switch (mdev->state.conn) {
-       case C_STARTING_SYNC_T:
-               _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
-               break;
-       case C_STARTING_SYNC_S:
-               drbd_start_resync(mdev, C_SYNC_SOURCE);
-               break;
-       }
- }
- int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
-               int (*io_fn)(struct drbd_conf *),
-               char *why, enum bm_flag flags)
- {
-       int rv;
-       D_ASSERT(current == mdev->worker.task);
-       /* open coded non-blocking drbd_suspend_io(mdev); */
-       drbd_set_flag(mdev, SUSPEND_IO);
+       .open =    drbd_open,
+       .release = drbd_release,
+ };
  
-       drbd_bm_lock(mdev, why, flags);
-       rv = io_fn(mdev);
-       drbd_bm_unlock(mdev);
 -static void bio_destructor_drbd(struct bio *bio)
 -{
 -      bio_free(bio, drbd_md_io_bio_set);
 -}
 -
+ struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+ {
+       struct bio *bio;
  
-       drbd_resume_io(mdev);
+       if (!drbd_md_io_bio_set)
+               return bio_alloc(gfp_mask, 1);
  
-       return rv;
+       bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+       if (!bio)
+               return NULL;
 -      bio->bi_destructor = bio_destructor_drbd;
+       return bio;
  }
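
For context: bio_alloc_drbd() above prefers the dedicated drbd_md_io_bio_set, so meta-data IO can still make progress under memory pressure, and falls back to plain bio_alloc() while the bioset is not set up. A minimal caller sketch follows; the helper name submit_md_page() and its error handling are hypothetical, not part of this patch, and the field names assume a 3.x-era struct bio:

	/* Hypothetical sketch: submit one page of meta data through
	 * bio_alloc_drbd(). GFP_NOIO avoids recursing into the block
	 * layer while allocating for IO we issue ourselves. */
	static int submit_md_page(struct block_device *bdev, struct page *page,
				  sector_t sector, int rw)
	{
		struct bio *bio = bio_alloc_drbd(GFP_NOIO);

		if (!bio)
			return -ENOMEM;
		bio->bi_bdev = bdev;
		bio->bi_sector = sector;
		if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) {
			bio_put(bio);
			return -EIO;
		}
		submit_bio(rw, bio);
		return 0;
	}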
  
- /**
-  * after_state_ch() - Perform after state change actions that may sleep
-  * @mdev:     DRBD device.
-  * @os:               old state.
-  * @ns:               new state.
-  * @flags:    Flags
+ #ifdef __CHECKER__
+ /* When checking with sparse, if this is an inline function, sparse will
+    give tons of false positives. When this is a real function, sparse works.
   */
- static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
-                          union drbd_state ns, enum chg_state_flags flags)
+ int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
  {
-       enum drbd_fencing_p fp;
-       enum drbd_req_event what = nothing;
-       union drbd_state nsm = (union drbd_state){ .i = -1 };
-       if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
-               drbd_clear_flag(mdev, CRASHED_PRIMARY);
-               if (mdev->p_uuid)
-                       mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
-       }
+       int io_allowed;
  
-       fp = FP_DONT_CARE;
-       if (get_ldev(mdev)) {
-               fp = mdev->ldev->dc.fencing;
-               put_ldev(mdev);
+       atomic_inc(&mdev->local_cnt);
+       io_allowed = (mdev->state.disk >= mins);
+       if (!io_allowed) {
+               if (atomic_dec_and_test(&mdev->local_cnt))
+                       wake_up(&mdev->misc_wait);
        }
+       return io_allowed;
+ }
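
_get_ldev_if_state() implements a conditional reference grab: local_cnt is bumped first, the disk state is checked, and on failure the reference is dropped again, waking misc_wait so waiters (e.g. detach) can proceed. A minimal usage sketch, mirroring how drbd_send_sizes() below uses the get_ldev_if_state()/put_ldev() bracket (illustrative only):

	if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
		/* mdev->ldev cannot be destroyed inside this bracket */
		sector_t d_size = drbd_get_max_capacity(mdev->ldev);
		dev_info(DEV, "backing device holds %llu sectors\n",
			 (unsigned long long)d_size);
		put_ldev(mdev);
	}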
  
-       /* Inform userspace about the change... */
-       drbd_bcast_state(mdev, ns);
-       if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
-           (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
-               drbd_khelper(mdev, "pri-on-incon-degr");
-       /* Here we have the actions that are performed after a
-          state change. This function might sleep */
-       if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
-               mod_timer(&mdev->request_timer, jiffies + HZ);
-       nsm.i = -1;
-       if (ns.susp_nod) {
-               if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
-                       what = resend;
-               if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
-                   ns.disk > D_NEGOTIATING)
-                       what = restart_frozen_disk_io;
-               if (what != nothing)
-                       nsm.susp_nod = 0;
-       }
+ #endif
  
-       if (ns.susp_fen) {
-               /* case1: The outdate peer handler is successful: */
-               if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
-                       if (drbd_test_flag(mdev, NEW_CUR_UUID)) {
-                               drbd_uuid_new_current(mdev);
-                               drbd_clear_flag(mdev, NEW_CUR_UUID);
-                       }
-                       spin_lock_irq(&mdev->req_lock);
-                       _tl_clear(mdev);
-                       _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
-                       spin_unlock_irq(&mdev->req_lock);
-               }
-               /* case2: The connection was established again: */
-               if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
-                       drbd_clear_flag(mdev, NEW_CUR_UUID);
-                       what = resend;
-                       nsm.susp_fen = 0;
+ /**
+  * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
+  * @tconn:    DRBD connection.
+  * @barrier_nr:       Expected identifier of the DRBD write barrier packet.
+  * @set_size: Expected number of requests before that barrier.
+  *
+  * In case the passed barrier_nr or set_size does not match the oldest
+  * epoch of not yet barrier-acked requests, this function will cause a
+  * termination of the connection.
+  */
+ void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr,
+               unsigned int set_size)
+ {
+       struct drbd_request *r;
+       struct drbd_request *req = NULL;
+       int expect_epoch = 0;
+       int expect_size = 0;
+       spin_lock_irq(&tconn->req_lock);
 -      /* find latest not yet barrier-acked write request,
++      /* find oldest not yet barrier-acked write request,
+        * count writes in its epoch. */
+       list_for_each_entry(r, &tconn->transfer_log, tl_requests) {
+               const unsigned s = r->rq_state;
+               if (!req) {
+                       if (!(s & RQ_WRITE))
+                               continue;
+                       if (!(s & RQ_NET_MASK))
+                               continue;
+                       if (s & RQ_NET_DONE)
+                               continue;
+                       req = r;
+                       expect_epoch = req->epoch;
+                       expect_size++;
+               } else {
+                       if (r->epoch != expect_epoch)
+                               break;
+                       if (!(s & RQ_WRITE))
+                               continue;
+                       /* if (s & RQ_DONE): not expected */
+                       /* if (!(s & RQ_NET_MASK)): not expected */
+                       expect_size++;
                }
        }
  
-       if (what != nothing) {
-               spin_lock_irq(&mdev->req_lock);
-               _tl_restart(mdev, what);
-               nsm.i &= mdev->state.i;
-               _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
-               spin_unlock_irq(&mdev->req_lock);
+       /* first some paranoia code */
+       if (req == NULL) {
+               conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+                        barrier_nr);
+               goto bail;
        }
-       /* Became sync source.  With protocol >= 96, we still need to send out
-        * the sync uuid now. Need to do that before any drbd_send_state, or
-        * the other side may go "paused sync" before receiving the sync uuids,
-        * which is unexpected. */
-       if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
-           (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
-           mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
-               drbd_gen_and_send_sync_uuid(mdev);
-               put_ldev(mdev);
+       if (expect_epoch != barrier_nr) {
+               conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n",
+                        barrier_nr, expect_epoch);
+               goto bail;
        }
  
-       /* Do not change the order of the if above and the two below... */
-       if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
-               /* we probably will start a resync soon.
-                * make sure those things are properly reset. */
-               mdev->rs_total = 0;
-               mdev->rs_failed = 0;
-               atomic_set(&mdev->rs_pending_cnt, 0);
-               drbd_rs_cancel_all(mdev);
-               drbd_send_uuids(mdev);
-               drbd_send_state(mdev, ns);
-       }
-       /* No point in queuing send_bitmap if we don't have a connection
-        * anymore, so check also the _current_ state, not only the new state
-        * at the time this work was queued. */
-       if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
-           mdev->state.conn == C_WF_BITMAP_S)
-               drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
-                               "send_bitmap (WFBitMapS)",
-                               BM_LOCKED_TEST_ALLOWED);
-       /* Lost contact to peer's copy of the data */
-       if ((os.pdsk >= D_INCONSISTENT &&
-            os.pdsk != D_UNKNOWN &&
-            os.pdsk != D_OUTDATED)
-       &&  (ns.pdsk < D_INCONSISTENT ||
-            ns.pdsk == D_UNKNOWN ||
-            ns.pdsk == D_OUTDATED)) {
-               if (get_ldev(mdev)) {
-                       if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
-                           mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
-                               if (is_susp(mdev->state)) {
-                                       drbd_set_flag(mdev, NEW_CUR_UUID);
-                               } else {
-                                       drbd_uuid_new_current(mdev);
-                                       drbd_send_uuids(mdev);
-                               }
-                       }
-                       put_ldev(mdev);
-               }
+       if (expect_size != set_size) {
+               conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+                        barrier_nr, set_size, expect_size);
+               goto bail;
        }
  
-       if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
-               if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
-                   mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
-                       drbd_uuid_new_current(mdev);
-                       drbd_send_uuids(mdev);
-               }
-               /* D_DISKLESS Peer becomes secondary */
-               if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
-                       /* We may still be Primary ourselves.
-                        * No harm done if the bitmap still changes,
-                        * redirtied pages will follow later. */
-                       drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
-                               "demote diskless peer", BM_LOCKED_SET_ALLOWED);
-               put_ldev(mdev);
 -      /* Clean up list of requests processed during current epoch */
 -      list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) {
++      /* Clean up list of requests processed during current epoch. */
++      /* this extra list walk restart is paranoia,
++       * to catch requests being barrier-acked "unexpectedly".
++       * It usually should find the same req again, or some READ preceding it. */
++      list_for_each_entry(req, &tconn->transfer_log, tl_requests)
++              if (req->epoch == expect_epoch)
++                      break;
++      list_for_each_entry_safe_from(req, r, &tconn->transfer_log, tl_requests) {
+               if (req->epoch != expect_epoch)
+                       break;
+               _req_mod(req, BARRIER_ACKED);
        }
+       spin_unlock_irq(&tconn->req_lock);
  
-       /* Write out all changed bits on demote.
-        * Though, no need to do that just yet
-        * if there is a resync going on still */
-       if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
-               mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
-               /* No changes to the bitmap expected this time, so assert that,
-                * even though no harm was done if it did change. */
-               drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
-                               "demote", BM_LOCKED_TEST_ALLOWED);
-               put_ldev(mdev);
-       }
+       return;
  
-       /* Last part of the attaching process ... */
-       if (ns.conn >= C_CONNECTED &&
-           os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
-               drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
-               drbd_send_uuids(mdev);
-               drbd_send_state(mdev, ns);
-       }
+ bail:
+       spin_unlock_irq(&tconn->req_lock);
+       conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+ }
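
The first loop in tl_release() above classifies requests to find the oldest not yet barrier-acked epoch. Restated as a self-contained predicate (a hypothetical helper, for illustration only):

	/* Hypothetical restatement of the 'if (!req)' branch above:
	 * only writes that went over the network and are not yet
	 * RQ_NET_DONE count toward the expected barrier epoch. */
	static bool is_unacked_net_write(const struct drbd_request *r)
	{
		const unsigned s = r->rq_state;

		return (s & RQ_WRITE) &&
		       (s & RQ_NET_MASK) &&
		       !(s & RQ_NET_DONE);
	}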
  
-       /* We want to pause/continue resync, tell peer. */
-       if (ns.conn >= C_CONNECTED &&
-            ((os.aftr_isp != ns.aftr_isp) ||
-             (os.user_isp != ns.user_isp)))
-               drbd_send_state(mdev, ns);
-       /* In case one of the isp bits got set, suspend other devices. */
-       if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
-           (ns.aftr_isp || ns.peer_isp || ns.user_isp))
-               suspend_other_sg(mdev);
-       /* Make sure the peer gets informed about eventual state
-          changes (ISP bits) while we were in WFReportParams. */
-       if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
-               drbd_send_state(mdev, ns);
-       if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
-               drbd_send_state(mdev, ns);
-       /* We are in the progress to start a full sync... */
-       if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
-           (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
-               /* no other bitmap changes expected during this phase */
-               drbd_queue_bitmap_io(mdev,
-                       &drbd_bmio_set_n_write, &abw_start_sync,
-                       "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
-       /* We are invalidating our self... */
-       if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
-           os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
-               /* other bitmap operation expected during this phase */
-               drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
-                       "set_n_write from invalidate", BM_LOCKED_MASK);
-       /* first half of local IO error, failure to attach,
-        * or administrative detach */
-       if (os.disk != D_FAILED && ns.disk == D_FAILED) {
-               /* corresponding get_ldev was in __drbd_set_state, to serialize
-                * our cleanup here with the transition to D_DISKLESS.
-                * But it is still not safe to dereference ldev here, we may end
-                * up here from a failed attach, before ldev was even set.  */
-               if (mdev->ldev) {
-                       enum drbd_io_error_p eh = mdev->ldev->dc.on_io_error;
-                       /* In some setups, this handler triggers a suicide,
-                        * basically mapping IO error to node failure, to
-                        * reduce the number of different failure scenarios.
-                        *
-                        * This handler intentionally runs before we abort IO,
-                        * notify the peer, or try to update our meta data. */
-                       if (eh == EP_CALL_HELPER && drbd_test_flag(mdev, WAS_IO_ERROR))
-                               drbd_khelper(mdev, "local-io-error");
-                       /* Immediately allow completion of all application IO,
-                        * that waits for completion from the local disk,
-                        * if this was a force-detach due to disk_timeout
-                        * or administrator request (drbdsetup detach --force).
-                        * Do NOT abort otherwise.
-                        * Aborting local requests may cause serious problems,
-                        * if requests are completed to upper layers already,
-                        * and then later the already submitted local bio completes.
-                        * This can cause DMA into former bio pages that meanwhile
-                        * have been re-used for other things.
-                        * So aborting local requests may cause crashes,
-                        * or even worse, silent data corruption.
-                        */
-                       if (drbd_test_flag(mdev, FORCE_DETACH))
-                               tl_abort_disk_io(mdev);
-                       /* current state still has to be D_FAILED,
-                        * there is only one way out: to D_DISKLESS,
-                        * and that may only happen after our put_ldev below. */
-                       if (mdev->state.disk != D_FAILED)
-                               dev_err(DEV,
-                                       "ASSERT FAILED: disk is %s during detach\n",
-                                       drbd_disk_str(mdev->state.disk));
-                       if (ns.conn >= C_CONNECTED)
-                               drbd_send_state(mdev, ns);
-                       drbd_rs_cancel_all(mdev);
-                       /* In case we want to get something to stable storage still,
-                        * this may be the last chance.
-                        * Following put_ldev may transition to D_DISKLESS. */
-                       drbd_md_sync(mdev);
-               }
-               put_ldev(mdev);
-       }
  
-         /* second half of local IO error, failure to attach,
-          * or administrative detach,
-          * after local_cnt references have reached zero again */
-         if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
-                 /* We must still be diskless,
-                  * re-attach has to be serialized with this! */
-                 if (mdev->state.disk != D_DISKLESS)
-                         dev_err(DEV,
-                                 "ASSERT FAILED: disk is %s while going diskless\n",
-                                 drbd_disk_str(mdev->state.disk));
-               if (ns.conn >= C_CONNECTED)
-                       drbd_send_state(mdev, ns);
-               /* corresponding get_ldev in __drbd_set_state
-                * this may finally trigger drbd_ldev_destroy. */
-               put_ldev(mdev);
-       }
+ /**
+  * _tl_restart() - Walks the transfer log and applies an action to all requests
+  * @tconn:    DRBD connection.
+  * @what:       The action/event to perform with all request objects
+  *
+  * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
+  * RESTART_FROZEN_DISK_IO.
+  */
+ /* must hold resource->req_lock */
+ void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
+ {
+       struct drbd_request *req, *r;
  
-       /* Notify peer that I had a local IO error, and did not detach. */
-       if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
-               drbd_send_state(mdev, ns);
+       list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests)
+               _req_mod(req, what);
+ }
  
-       /* Disks got bigger while they were detached */
-       if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
-           drbd_test_and_clear_flag(mdev, RESYNC_AFTER_NEG)) {
-               if (ns.conn == C_CONNECTED)
-                       resync_after_online_grow(mdev);
      }
+ void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
+ {
+       spin_lock_irq(&tconn->req_lock);
+       _tl_restart(tconn, what);
+       spin_unlock_irq(&tconn->req_lock);
+ }
  
-       /* A resync finished or aborted, wake paused devices... */
-       if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
-           (os.peer_isp && !ns.peer_isp) ||
-           (os.user_isp && !ns.user_isp))
-               resume_next_sg(mdev);
-       /* sync target done with resync.  Explicitly notify peer, even though
-        * it should (at least for non-empty resyncs) already know itself. */
-       if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
-               drbd_send_state(mdev, ns);
-       /* Verify finished, or reached stop sector.  Peer did not know about
-        * the stop sector, and we may even have changed the stop sector during
-        * verify to interrupt/stop early.  Send the new state. */
-       if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
-       && mdev->agreed_pro_version >= 97)
-               drbd_send_state(mdev, ns);
-       /* Wake up role changes, that were delayed because of connection establishing */
-       if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
-               drbd_clear_flag(mdev, STATE_SENT);
-               wake_up(&mdev->state_wait);
-       }
+ /**
+  * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+  * @tconn:    DRBD connection.
+  *
+  * This is called after the connection to the peer was lost. The storage covered
+  * by the requests on the transfer log gets marked as out of sync. Called from
+  * the receiver thread and the worker thread.
+  */
+ void tl_clear(struct drbd_tconn *tconn)
+ {
+       tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);
+ }
  
-       /* This triggers bitmap writeout of potentially still unwritten pages
-        * if the resync finished cleanly, or aborted because of peer disk
-        * failure, or because of connection loss.
-        * For resync aborted because of local disk failure, we cannot do
-        * any bitmap writeout anymore.
-        * No harm done if some bits change during this phase.
-        */
-       if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
-               drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
-                       "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
-               put_ldev(mdev);
-       }
+ /**
+  * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
+  * @mdev:     DRBD device.
+  */
+ void tl_abort_disk_io(struct drbd_conf *mdev)
+ {
+       struct drbd_tconn *tconn = mdev->tconn;
+       struct drbd_request *req, *r;
  
-       /* free tl_hash if we got thawed and are C_STANDALONE */
-       if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
-               drbd_free_tl_hash(mdev);
-       /* Upon network connection, we need to start the receiver */
-       if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
-               drbd_thread_start(&mdev->receiver);
-       /* Terminate worker thread if we are unconfigured - it will be
-          restarted as needed... */
-       if (ns.disk == D_DISKLESS &&
-           ns.conn == C_STANDALONE &&
-           ns.role == R_SECONDARY) {
-               if (os.aftr_isp != ns.aftr_isp)
-                       resume_next_sg(mdev);
-               /* set in __drbd_set_state, unless CONFIG_PENDING was set */
-               if (drbd_test_flag(mdev, DEVICE_DYING))
-                       drbd_thread_stop_nowait(&mdev->worker);
+       spin_lock_irq(&tconn->req_lock);
+       list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) {
+               if (!(req->rq_state & RQ_LOCAL_PENDING))
+                       continue;
+               if (req->w.mdev != mdev)
+                       continue;
+               _req_mod(req, ABORT_DISK_IO);
        }
-       drbd_md_sync(mdev);
+       spin_unlock_irq(&tconn->req_lock);
  }
  
  static int drbd_thread_setup(void *arg)
  {
        struct drbd_thread *thi = (struct drbd_thread *) arg;
@@@ -2209,19 -911,20 +911,21 @@@ void drbd_gen_and_send_sync_uuid(struc
  
  int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
  {
-       struct p_sizes p;
+       struct drbd_socket *sock;
+       struct p_sizes *p;
        sector_t d_size, u_size;
 -      int q_order_type, max_bio_size;
 +      int q_order_type;
 +      unsigned int max_bio_size;
-       int ok;
  
        if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
                D_ASSERT(mdev->ldev->backing_bdev);
                d_size = drbd_get_max_capacity(mdev->ldev);
-               u_size = mdev->ldev->dc.disk_size;
+               rcu_read_lock();
+               u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+               rcu_read_unlock();
                q_order_type = drbd_queue_order_type(mdev);
                max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
 -              max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
 +              max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
                put_ldev(mdev);
        } else {
                d_size = 0;
                max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
        }
  
-       /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
-       if (mdev->agreed_pro_version <= 94)
-               max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+       sock = &mdev->tconn->data;
+       p = drbd_prepare_command(mdev, sock);
+       if (!p)
+               return -EIO;
  
-       p.d_size = cpu_to_be64(d_size);
-       p.u_size = cpu_to_be64(u_size);
-       p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
-       p.max_bio_size = cpu_to_be32(max_bio_size);
-       p.queue_order_type = cpu_to_be16(q_order_type);
-       p.dds_flags = cpu_to_be16(flags);
+       if (mdev->tconn->agreed_pro_version <= 94)
 -              max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
++              max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+       else if (mdev->tconn->agreed_pro_version < 100)
 -              max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE_P95);
++              max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
  
-       ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
-                          (struct p_header80 *)&p, sizeof(p));
-       return ok;
+       p->d_size = cpu_to_be64(d_size);
+       p->u_size = cpu_to_be64(u_size);
+       p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
+       p->max_bio_size = cpu_to_be32(max_bio_size);
+       p->queue_order_type = cpu_to_be16(q_order_type);
+       p->dds_flags = cpu_to_be16(flags);
+       return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0);
  }
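
The max_bio_size clamp above encodes the wire-compatibility rules: peers up to protocol 94 (drbd 8.3.7 and older) must never see more than the 32KiB H80 packet limit, and peers before protocol 100 are capped at DRBD_MAX_BIO_SIZE_P95. Factored out as a hypothetical helper (a sketch, not part of this patch):

	static unsigned int clamp_peer_bio_size(unsigned int max_bio_size,
						int agreed_pro_version)
	{
		if (agreed_pro_version <= 94)
			return min(max_bio_size,
				   (unsigned int)DRBD_MAX_SIZE_H80_PACKET);
		if (agreed_pro_version < 100)
			return min(max_bio_size,
				   (unsigned int)DRBD_MAX_BIO_SIZE_P95);
		return max_bio_size;
	}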
  
  /**
@@@ -3980,20 -2989,16 +2990,16 @@@ int drbd_md_read(struct drbd_conf *mdev
        for (i = UI_CURRENT; i < UI_SIZE; i++)
                bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
        bdev->md.flags = be32_to_cpu(buffer->flags);
-       mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
        bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
  
-       spin_lock_irq(&mdev->req_lock);
+       spin_lock_irq(&mdev->tconn->req_lock);
        if (mdev->state.conn < C_CONNECTED) {
 -              int peer;
 +              unsigned int peer;
                peer = be32_to_cpu(buffer->la_peer_max_bio_size);
 -              peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
 +              peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
                mdev->peer_max_bio_size = peer;
        }
-       spin_unlock_irq(&mdev->req_lock);
-       if (mdev->sync_conf.al_extents < 7)
-               mdev->sync_conf.al_extents = 127;
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
   err:
        drbd_md_put_buffer(mdev);
  #include "drbd_req.h"
  #include "drbd_wrappers.h"
  #include <asm/unaligned.h>
- #include <linux/drbd_tag_magic.h>
  #include <linux/drbd_limits.h>
- #include <linux/compiler.h>
  #include <linux/kthread.h>
  
- static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
- static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
- static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
- /* see get_sb_bdev and bd_claim */
+ #include <net/genetlink.h>
+ /* .doit */
+ // int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
+ // int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
+ /* .dumpit */
+ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+ #include <linux/drbd_genl_api.h>
+ #include "drbd_nla.h"
+ #include <linux/genl_magic_func.h>
+ /* uses blkdev_get_by_path() to claim our meta data device(s) */
  static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
  
- /* Generate the tag_list to struct functions */
- #define NL_PACKET(name, number, fields) \
- static int name ## _from_tags(struct drbd_conf *mdev, \
-       unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
- static int name ## _from_tags(struct drbd_conf *mdev, \
-       unsigned short *tags, struct name *arg) \
- { \
-       int tag; \
-       int dlen; \
-       \
-       while ((tag = get_unaligned(tags++)) != TT_END) {       \
-               dlen = get_unaligned(tags++);                   \
-               switch (tag_number(tag)) { \
-               fields \
-               default: \
-                       if (tag & T_MANDATORY) { \
-                               dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
-                               return 0; \
-                       } \
-               } \
-               tags = (unsigned short *)((char *)tags + dlen); \
-       } \
-       return 1; \
- }
- #define NL_INTEGER(pn, pr, member) \
-       case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
-               arg->member = get_unaligned((int *)(tags));     \
-               break;
- #define NL_INT64(pn, pr, member) \
-       case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
-               arg->member = get_unaligned((u64 *)(tags));     \
+ /* Configuration is strictly serialized, because generic netlink message
+  * processing is strictly serialized by the genl_lock().
+  * That means we can use a single static global drbd_config_context struct.
+  */
+ static struct drbd_config_context {
+       /* assigned from drbd_genlmsghdr */
+       unsigned int minor;
+       /* assigned from request attributes, if present */
+       unsigned int volume;
+ #define VOLUME_UNSPECIFIED            (-1U)
+       /* pointer into the request skb,
+        * limited lifetime! */
+       char *resource_name;
+       struct nlattr *my_addr;
+       struct nlattr *peer_addr;
+       /* reply buffer */
+       struct sk_buff *reply_skb;
+       /* pointer into reply buffer */
+       struct drbd_genlmsghdr *reply_dh;
+       /* resolved from attributes, if possible */
+       struct drbd_conf *mdev;
+       struct drbd_tconn *tconn;
+ } adm_ctx;
+ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
+ {
+       genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
+       if (genlmsg_reply(skb, info))
+               printk(KERN_ERR "drbd: error sending genl reply\n");
+ }
+ /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: the only
+  * reason it could fail would be no space in the skb, and there are 4k available. */
+ int drbd_msg_put_info(const char *info)
+ {
+       struct sk_buff *skb = adm_ctx.reply_skb;
+       struct nlattr *nla;
+       int err = -EMSGSIZE;
+       if (!info || !info[0])
+               return 0;
+       nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
+       if (!nla)
+               return err;
+       err = nla_put_string(skb, T_info_text, info);
+       if (err) {
+               nla_nest_cancel(skb, nla);
+               return err;
+       } else
+               nla_nest_end(skb, nla);
+       return 0;
+ }
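
drbd_msg_put_info() relies on the standard netlink nesting discipline: start a nest, try to add the attribute, and either cancel the whole nest (rolling back the partial group) or end it. The generic pattern, with hypothetical attribute ids (a sketch only, not part of this patch):

	static int put_info_nest(struct sk_buff *skb, u32 value)
	{
		struct nlattr *nla = nla_nest_start(skb, 1 /* hypothetical id */);

		if (!nla)
			return -EMSGSIZE;
		if (nla_put_u32(skb, 2 /* hypothetical id */, value)) {
			nla_nest_cancel(skb, nla);	/* roll back partial nest */
			return -EMSGSIZE;
		}
		nla_nest_end(skb, nla);			/* commit the group */
		return 0;
	}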
+ /* This would be a good candidate for a "pre_doit" hook,
+  * and per-family private info->pointers.
+  * But we need to stay compatible with older kernels.
+  * If it returns successfully, adm_ctx members are valid.
+  */
+ #define DRBD_ADM_NEED_MINOR   1
+ #define DRBD_ADM_NEED_RESOURCE        2
+ #define DRBD_ADM_NEED_CONNECTION 4
+ static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
+               unsigned flags)
+ {
+       struct drbd_genlmsghdr *d_in = info->userhdr;
+       const u8 cmd = info->genlhdr->cmd;
+       int err;
+       memset(&adm_ctx, 0, sizeof(adm_ctx));
+       /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
 -      if (cmd != DRBD_ADM_GET_STATUS
 -      && security_netlink_recv(skb, CAP_SYS_ADMIN))
++      if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
+              return -EPERM;
+       adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+       if (!adm_ctx.reply_skb) {
+               err = -ENOMEM;
+               goto fail;
+       }
+       adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb,
+                                       info, &drbd_genl_family, 0, cmd);
+       /* A put of a few bytes into a fresh skb of >= 4k will always succeed,
+        * but check anyway. */
+       if (!adm_ctx.reply_dh) {
+               err = -ENOMEM;
+               goto fail;
+       }
+       adm_ctx.reply_dh->minor = d_in->minor;
+       adm_ctx.reply_dh->ret_code = NO_ERROR;
+       adm_ctx.volume = VOLUME_UNSPECIFIED;
+       if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
+               struct nlattr *nla;
+               /* parse and validate only */
+               err = drbd_cfg_context_from_attrs(NULL, info);
+               if (err)
+                       goto fail;
+               /* It was present, and valid,
+                * copy it over to the reply skb. */
+               err = nla_put_nohdr(adm_ctx.reply_skb,
+                               info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
+                               info->attrs[DRBD_NLA_CFG_CONTEXT]);
+               if (err)
+                       goto fail;
+               /* and assign stuff to the global adm_ctx */
+               nla = nested_attr_tb[__nla_type(T_ctx_volume)];
+               if (nla)
+                       adm_ctx.volume = nla_get_u32(nla);
+               nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
+               if (nla)
+                       adm_ctx.resource_name = nla_data(nla);
+               adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
+               adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
+               if ((adm_ctx.my_addr &&
+                    nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) ||
+                   (adm_ctx.peer_addr &&
+                    nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) {
+                       err = -EINVAL;
+                       goto fail;
+               }
+       }
+       adm_ctx.minor = d_in->minor;
+       adm_ctx.mdev = minor_to_mdev(d_in->minor);
+       adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name);
+       if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) {
+               drbd_msg_put_info("unknown minor");
+               return ERR_MINOR_INVALID;
+       }
+       if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) {
+               drbd_msg_put_info("unknown resource");
+               return ERR_INVALID_REQUEST;
+       }
+       if (flags & DRBD_ADM_NEED_CONNECTION) {
+               if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) {
+                       drbd_msg_put_info("no resource name expected");
+                       return ERR_INVALID_REQUEST;
+               }
+               if (adm_ctx.mdev) {
+                       drbd_msg_put_info("no minor number expected");
+                       return ERR_INVALID_REQUEST;
+               }
+               if (adm_ctx.my_addr && adm_ctx.peer_addr)
+                       adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr),
+                                                         nla_len(adm_ctx.my_addr),
+                                                         nla_data(adm_ctx.peer_addr),
+                                                         nla_len(adm_ctx.peer_addr));
+               if (!adm_ctx.tconn) {
+                       drbd_msg_put_info("unknown connection");
+                       return ERR_INVALID_REQUEST;
+               }
+       }
+       /* some more paranoia, if the request was over-determined */
+       if (adm_ctx.mdev && adm_ctx.tconn &&
+           adm_ctx.mdev->tconn != adm_ctx.tconn) {
+               pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n",
+                               adm_ctx.minor, adm_ctx.resource_name,
+                               adm_ctx.mdev->tconn->name);
+               drbd_msg_put_info("minor exists in different resource");
+               return ERR_INVALID_REQUEST;
+       }
+       if (adm_ctx.mdev &&
+           adm_ctx.volume != VOLUME_UNSPECIFIED &&
+           adm_ctx.volume != adm_ctx.mdev->vnr) {
+               pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
+                               adm_ctx.minor, adm_ctx.volume,
+                               adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name);
+               drbd_msg_put_info("minor exists as different volume");
+               return ERR_INVALID_REQUEST;
+       }
+       return NO_ERROR;
+ fail:
+       nlmsg_free(adm_ctx.reply_skb);
+       adm_ctx.reply_skb = NULL;
+       return err;
+ }
+ static int drbd_adm_finish(struct genl_info *info, int retcode)
+ {
+       if (adm_ctx.tconn) {
+               kref_put(&adm_ctx.tconn->kref, &conn_destroy);
+               adm_ctx.tconn = NULL;
+       }
+       if (!adm_ctx.reply_skb)
+               return -ENOMEM;
+       adm_ctx.reply_dh->ret_code = retcode;
+       drbd_adm_send_reply(adm_ctx.reply_skb, info);
+       return 0;
+ }
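
drbd_adm_prepare() and drbd_adm_finish() bracket every .doit handler; drbd_adm_detach() further down shows the real thing. The bare skeleton looks roughly like this (drbd_adm_example and the flags choice are hypothetical):

	int drbd_adm_example(struct sk_buff *skb, struct genl_info *info)
	{
		enum drbd_ret_code retcode;

		retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
		if (!adm_ctx.reply_skb)
			return retcode;	/* allocation failed, nothing to reply on */
		if (retcode != NO_ERROR)
			goto out;
		/* operate on adm_ctx.mdev here */
	out:
		drbd_adm_finish(info, retcode);
		return 0;
	}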
+ static void setup_khelper_env(struct drbd_tconn *tconn, char **envp)
+ {
+       char *afs;
+       /* FIXME: A future version will not allow this case. */
+       if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0)
+               return;
+       switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) {
+       case AF_INET6:
+               afs = "ipv6";
+               snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
+                        &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr);
                break;
- #define NL_BIT(pn, pr, member) \
-       case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
-               arg->member = *(char *)(tags) ? 1 : 0; \
+       case AF_INET:
+               afs = "ipv4";
+               snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+                        &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
                break;
- #define NL_STRING(pn, pr, member, len) \
-       case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
-               if (dlen > len) { \
-                       dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
-                               #member, dlen, (unsigned int)len); \
-                       return 0; \
-               } \
-                arg->member ## _len = dlen; \
-                memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
-                break;
- #include <linux/drbd_nl.h>
- /* Generate the struct to tag_list functions */
- #define NL_PACKET(name, number, fields) \
- static unsigned short* \
- name ## _to_tags(struct drbd_conf *mdev, \
-       struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
- static unsigned short* \
- name ## _to_tags(struct drbd_conf *mdev, \
-       struct name *arg, unsigned short *tags) \
- { \
-       fields \
-       return tags; \
- }
- #define NL_INTEGER(pn, pr, member) \
-       put_unaligned(pn | pr | TT_INTEGER, tags++);    \
-       put_unaligned(sizeof(int), tags++);             \
-       put_unaligned(arg->member, (int *)tags);        \
-       tags = (unsigned short *)((char *)tags+sizeof(int));
- #define NL_INT64(pn, pr, member) \
-       put_unaligned(pn | pr | TT_INT64, tags++);      \
-       put_unaligned(sizeof(u64), tags++);             \
-       put_unaligned(arg->member, (u64 *)tags);        \
-       tags = (unsigned short *)((char *)tags+sizeof(u64));
- #define NL_BIT(pn, pr, member) \
-       put_unaligned(pn | pr | TT_BIT, tags++);        \
-       put_unaligned(sizeof(char), tags++);            \
-       *(char *)tags = arg->member; \
-       tags = (unsigned short *)((char *)tags+sizeof(char));
- #define NL_STRING(pn, pr, member, len) \
-       put_unaligned(pn | pr | TT_STRING, tags++);     \
-       put_unaligned(arg->member ## _len, tags++);     \
-       memcpy(tags, arg->member, arg->member ## _len); \
-       tags = (unsigned short *)((char *)tags + arg->member ## _len);
- #include <linux/drbd_nl.h>
- void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
- void drbd_nl_send_reply(struct cn_msg *, int);
+       default:
+               afs = "ssocks";
+               snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+                        &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
+       }
+       snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
+ }
  
  int drbd_khelper(struct drbd_conf *mdev, char *cmd)
  {
        drbd_md_sync(mdev);
  
        dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
-       drbd_bcast_ev_helper(mdev, cmd);
+       sib.sib_reason = SIB_HELPER_PRE;
+       sib.helper_name = cmd;
+       drbd_bcast_event(mdev, &sib);
 -      ret = call_usermodehelper(usermode_helper, argv, envp, 1);
 +      ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
        if (ret)
                dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
                                usermode_helper, cmd, mb,
                dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
                                usermode_helper, cmd, mb,
                                (ret >> 8) & 0xff, ret);
+       sib.sib_reason = SIB_HELPER_POST;
+       sib.helper_exit_code = ret;
+       drbd_bcast_event(mdev, &sib);
+       if (current == tconn->worker.task)
+               clear_bit(CALLBACK_PENDING, &tconn->flags);
  
-       if (current == mdev->worker.task)
-               drbd_clear_flag(mdev, CALLBACK_PENDING);
+       if (ret < 0) /* Ignore any ERRNOs we got. */
+               ret = 0;
+       return ret;
+ }
+ int conn_khelper(struct drbd_tconn *tconn, char *cmd)
+ {
+       char *envp[] = { "HOME=/",
+                       "TERM=linux",
+                       "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+                        (char[20]) { }, /* address family */
+                        (char[60]) { }, /* address */
+                       NULL };
+       char *argv[] = {usermode_helper, cmd, tconn->name, NULL };
+       int ret;
+       setup_khelper_env(tconn, envp);
+       conn_md_sync(tconn);
+       conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name);
+       /* TODO: conn_bcast_event() ?? */
 -      ret = call_usermodehelper(usermode_helper, argv, envp, 1);
++      ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
+       if (ret)
+               conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
+                         usermode_helper, cmd, tconn->name,
+                         (ret >> 8) & 0xff, ret);
+       else
+               conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
+                         usermode_helper, cmd, tconn->name,
+                         (ret >> 8) & 0xff, ret);
+       /* TODO: conn_bcast_event() ?? */
  
        if (ret < 0) /* Ignore any ERRNOs we got. */
                ret = 0;
@@@ -852,12 -1054,14 +1054,14 @@@ void drbd_reconsider_max_bio_size(struc
           Because new from 8.3.8 onwards the peer can use multiple
           BIOs for a single peer_request */
        if (mdev->state.conn >= C_CONNECTED) {
-               if (mdev->agreed_pro_version < 94) {
-                       peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+               if (mdev->tconn->agreed_pro_version < 94)
 -                      peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
++                      peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
                        /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
-               } else if (mdev->agreed_pro_version == 94)
+               else if (mdev->tconn->agreed_pro_version == 94)
                        peer = DRBD_MAX_SIZE_H80_PACKET;
-               else /* drbd 8.3.8 onwards */
+               else if (mdev->tconn->agreed_pro_version < 100)
+                       peer = DRBD_MAX_BIO_SIZE_P95;  /* drbd 8.3.8 onwards, before 8.4.0 */
+               else
                        peer = DRBD_MAX_BIO_SIZE;
        }
  
@@@ -1383,170 -1706,144 +1706,141 @@@ static int adm_detach(struct drbd_conf 
                retcode = SS_NOTHING_TO_DO;
        if (ret)
                retcode = ERR_INTR;
-       reply->ret_code = retcode;
  out:
-       return 0;
+       return retcode;
  }
  
- static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                           struct drbd_nl_cfg_reply *reply)
+ /* Detaching the disk is a process in multiple stages.  First we need to lock
+  * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
+  * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
+  * internal references as well.
+  * Only then have we finally detached. */
+ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
  {
-       int i, ns;
        enum drbd_ret_code retcode;
-       struct net_conf *new_conf = NULL;
-       struct crypto_hash *tfm = NULL;
-       struct crypto_hash *integrity_w_tfm = NULL;
-       struct crypto_hash *integrity_r_tfm = NULL;
-       struct hlist_head *new_tl_hash = NULL;
-       struct hlist_head *new_ee_hash = NULL;
-       struct drbd_conf *odev;
-       char hmac_name[CRYPTO_MAX_ALG_NAME];
-       void *int_dig_out = NULL;
-       void *int_dig_in = NULL;
-       void *int_dig_vv = NULL;
-       struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
+       struct detach_parms parms = { };
+       int err;
  
-       drbd_reconfig_start(mdev);
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       if (mdev->state.conn > C_STANDALONE) {
-               retcode = ERR_NET_CONFIGURED;
-               goto fail;
+       if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
+               err = detach_parms_from_attrs(&parms, info);
+               if (err) {
+                       retcode = ERR_MANDATORY_TAG;
+                       drbd_msg_put_info(from_attrs_err_to_txt(err));
+                       goto out;
+               }
        }
  
-       /* allocation not in the IO path, cqueue thread context */
-       new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
-       if (!new_conf) {
-               retcode = ERR_NOMEM;
-               goto fail;
-       }
+       retcode = adm_detach(adm_ctx.mdev, parms.force_detach);
+ out:
+       drbd_adm_finish(info, retcode);
+       return 0;
+ }
  
-       new_conf->timeout          = DRBD_TIMEOUT_DEF;
-       new_conf->try_connect_int  = DRBD_CONNECT_INT_DEF;
-       new_conf->ping_int         = DRBD_PING_INT_DEF;
-       new_conf->max_epoch_size   = DRBD_MAX_EPOCH_SIZE_DEF;
-       new_conf->max_buffers      = DRBD_MAX_BUFFERS_DEF;
-       new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
-       new_conf->sndbuf_size      = DRBD_SNDBUF_SIZE_DEF;
-       new_conf->rcvbuf_size      = DRBD_RCVBUF_SIZE_DEF;
-       new_conf->ko_count         = DRBD_KO_COUNT_DEF;
-       new_conf->after_sb_0p      = DRBD_AFTER_SB_0P_DEF;
-       new_conf->after_sb_1p      = DRBD_AFTER_SB_1P_DEF;
-       new_conf->after_sb_2p      = DRBD_AFTER_SB_2P_DEF;
-       new_conf->want_lose        = 0;
-       new_conf->two_primaries    = 0;
-       new_conf->wire_protocol    = DRBD_PROT_C;
-       new_conf->ping_timeo       = DRBD_PING_TIMEO_DEF;
-       new_conf->rr_conflict      = DRBD_RR_CONFLICT_DEF;
-       new_conf->on_congestion    = DRBD_ON_CONGESTION_DEF;
-       new_conf->cong_extents     = DRBD_CONG_EXTENTS_DEF;
-       if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
-               retcode = ERR_MANDATORY_TAG;
-               goto fail;
+ static bool conn_resync_running(struct drbd_tconn *tconn)
+ {
+       struct drbd_conf *mdev;
+       bool rv = false;
+       int vnr;
+       rcu_read_lock();
+       idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+               if (mdev->state.conn == C_SYNC_SOURCE ||
+                   mdev->state.conn == C_SYNC_TARGET ||
+                   mdev->state.conn == C_PAUSED_SYNC_S ||
+                   mdev->state.conn == C_PAUSED_SYNC_T) {
+                       rv = true;
+                       break;
+               }
        }
+       rcu_read_unlock();
  
-       if (new_conf->two_primaries
-           && (new_conf->wire_protocol != DRBD_PROT_C)) {
-               retcode = ERR_NOT_PROTO_C;
-               goto fail;
-       }
+       return rv;
+ }
  
-       if (get_ldev(mdev)) {
-               enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
-               put_ldev(mdev);
-               if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
-                       retcode = ERR_STONITH_AND_PROT_A;
-                       goto fail;
+ static bool conn_ov_running(struct drbd_tconn *tconn)
+ {
+       struct drbd_conf *mdev;
+       bool rv = false;
+       int vnr;
+       rcu_read_lock();
+       idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+               if (mdev->state.conn == C_VERIFY_S ||
+                   mdev->state.conn == C_VERIFY_T) {
+                       rv = true;
+                       break;
                }
        }
+       rcu_read_unlock();
  
-       if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) {
-               retcode = ERR_CONG_NOT_PROTO_A;
-               goto fail;
-       }
+       return rv;
+ }
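
conn_resync_running() and conn_ov_running() above share the same iteration idiom: volumes hang off the connection in an idr and are walked under rcu_read_lock(), so each mdev may only be touched inside the read-side section. The pattern as a hypothetical stand-alone helper (sketch only):

	static int count_volumes_at_least(struct drbd_tconn *tconn,
					  enum drbd_conns min_conn)
	{
		struct drbd_conf *mdev;
		int vnr, n = 0;

		rcu_read_lock();
		idr_for_each_entry(&tconn->volumes, mdev, vnr) {
			/* mdev is only guaranteed valid inside this section */
			if (mdev->state.conn >= min_conn)
				n++;
		}
		rcu_read_unlock();
		return n;
	}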
  
-       if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
-               retcode = ERR_DISCARD;
-               goto fail;
-       }
+ static enum drbd_ret_code
+ _check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf)
+ {
+       struct drbd_conf *mdev;
+       int i;
  
-       retcode = NO_ERROR;
+       if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) {
+               if (new_conf->wire_protocol != old_conf->wire_protocol)
+                       return ERR_NEED_APV_100;
  
-       new_my_addr = (struct sockaddr *)&new_conf->my_addr;
-       new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
-       for (i = 0; i < minor_count; i++) {
-               odev = minor_to_mdev(i);
-               if (!odev || odev == mdev)
-                       continue;
-               if (get_net_conf(odev)) {
-                       taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
-                       if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
-                           !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
-                               retcode = ERR_LOCAL_ADDR;
-                       taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
-                       if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
-                           !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
-                               retcode = ERR_PEER_ADDR;
-                       put_net_conf(odev);
-                       if (retcode != NO_ERROR)
-                               goto fail;
-               }
+               if (new_conf->two_primaries != old_conf->two_primaries)
+                       return ERR_NEED_APV_100;
 -              if (!new_conf->integrity_alg != !old_conf->integrity_alg)
 -                      return ERR_NEED_APV_100;
 -
+               if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg))
+                       return ERR_NEED_APV_100;
        }
  
-       if (new_conf->cram_hmac_alg[0] != 0) {
-               snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
-                       new_conf->cram_hmac_alg);
-               tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
-               if (IS_ERR(tfm)) {
-                       tfm = NULL;
-                       retcode = ERR_AUTH_ALG;
-                       goto fail;
-               }
+       if (!new_conf->two_primaries &&
+           conn_highest_role(tconn) == R_PRIMARY &&
+           conn_highest_peer(tconn) == R_PRIMARY)
+               return ERR_NEED_ALLOW_TWO_PRI;
  
-               if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
-                       retcode = ERR_AUTH_ALG_ND;
-                       goto fail;
+       if (new_conf->two_primaries &&
+           (new_conf->wire_protocol != DRBD_PROT_C))
+               return ERR_NOT_PROTO_C;
+       idr_for_each_entry(&tconn->volumes, mdev, i) {
+               if (get_ldev(mdev)) {
+                       enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing;
+                       put_ldev(mdev);
+                       if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
+                               return ERR_STONITH_AND_PROT_A;
                }
+               if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data)
+                       return ERR_DISCARD_IMPOSSIBLE;
        }
  
-       if (new_conf->integrity_alg[0]) {
-               integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
-               if (IS_ERR(integrity_w_tfm)) {
-                       integrity_w_tfm = NULL;
-                       retcode=ERR_INTEGRITY_ALG;
-                       goto fail;
-               }
+       if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A)
+               return ERR_CONG_NOT_PROTO_A;
  
-               if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
-                       retcode=ERR_INTEGRITY_ALG_ND;
-                       goto fail;
-               }
+       return NO_ERROR;
+ }
  
-               integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
-               if (IS_ERR(integrity_r_tfm)) {
-                       integrity_r_tfm = NULL;
-                       retcode=ERR_INTEGRITY_ALG;
-                       goto fail;
-               }
-       }
+ static enum drbd_ret_code
+ check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf)
+ {
+       static enum drbd_ret_code rv;
+       struct drbd_conf *mdev;
+       int i;
  
-       ns = new_conf->max_epoch_size/8;
-       if (mdev->tl_hash_s != ns) {
-               new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
-               if (!new_tl_hash) {
-                       retcode = ERR_NOMEM;
-                       goto fail;
-               }
-       }
+       rcu_read_lock();
+       rv = _check_net_options(tconn, rcu_dereference(tconn->net_conf), new_conf);
+       rcu_read_unlock();
  
-       ns = new_conf->max_buffers/8;
-       if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
-               new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
-               if (!new_ee_hash) {
-                       retcode = ERR_NOMEM;
-                       goto fail;
+       /* tconn->volumes protected by genl_lock() here */
+       idr_for_each_entry(&tconn->volumes, mdev, i) {
+               if (!mdev->bitmap) {
+                       if (drbd_bm_init(mdev))
+                               return ERR_NOMEM;
                }
        }
  
        return rv;
  }
  
- static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                          struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
  {
-       reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
-       return 0;
+       return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
  }
  
- static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                          struct drbd_nl_cfg_reply *reply)
+ int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr)
  {
-       unsigned short *tl;
+       struct nlattr *nla;
+       nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);
+       if (!nla)
+               goto nla_put_failure;
+       if (vnr != VOLUME_UNSPECIFIED &&
+           nla_put_u32(skb, T_ctx_volume, vnr))
+               goto nla_put_failure;
+       if (nla_put_string(skb, T_ctx_resource_name, tconn->name))
+               goto nla_put_failure;
+       if (tconn->my_addr_len &&
+           nla_put(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr))
+               goto nla_put_failure;
+       if (tconn->peer_addr_len &&
+           nla_put(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr))
+               goto nla_put_failure;
+       nla_nest_end(skb, nla);
+       return 0;
  
-       tl = reply->tag_list;
+ nla_put_failure:
+       if (nla)
+               nla_nest_cancel(skb, nla);
+       return -EMSGSIZE;
+ }
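
The nest/cancel sequence above is the standard netlink idiom for emitting an
optional attribute group: open the nest, put the members, and on any failure
roll the whole group back so the skb never carries a half-built attribute.
A minimal sketch, assuming hypothetical MY_NLA_GROUP and T_my_value (not DRBD
symbols):

	#include <net/netlink.h>

	/* Emit one nested attribute group, or roll it back entirely. */
	static int put_example_group(struct sk_buff *skb, u32 value)
	{
		struct nlattr *nla = nla_nest_start(skb, MY_NLA_GROUP);

		if (!nla)
			return -EMSGSIZE;
		if (nla_put_u32(skb, T_my_value, value)) {
			nla_nest_cancel(skb, nla); /* undo the partial group */
			return -EMSGSIZE;
		}
		nla_nest_end(skb, nla); /* patch in the final nest length */
		return 0;
	}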
  
-       if (get_ldev(mdev)) {
-               tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
-               put_ldev(mdev);
-       }
+ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
+               const struct sib_info *sib)
+ {
+       struct state_info *si = NULL; /* for sizeof(si->member); */
+       struct net_conf *nc;
+       struct nlattr *nla;
+       int got_ldev;
+       int err = 0;
+       int exclude_sensitive;
+       /* If sib != NULL, this is drbd_bcast_event, which anyone can listen
+        * to.  So we had better exclude sensitive information (exclude_sensitive).
+        *
+        * If sib == NULL, this is drbd_adm_get_status, executed synchronously
+        * in the context of the requesting user process. Exclude sensitive
+        * information, unless current has CAP_SYS_ADMIN.
+        *
+        * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
+        * relies on the current implementation of netlink_dump(), which
+        * executes the dump callback successively from netlink_recvmsg(),
+        * always in the context of the receiving process */
+       exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
+       got_ldev = get_ldev(mdev);
+       /* We still need to add connection name and volume number information.
+        * Minor number is in drbd_genlmsghdr. */
+       if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr))
+               goto nla_put_failure;
+       if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive))
+               goto nla_put_failure;
+       rcu_read_lock();
+       if (got_ldev)
+               if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive))
+                       goto nla_put_failure;
+       nc = rcu_dereference(mdev->tconn->net_conf);
+       if (nc)
+               err = net_conf_to_skb(skb, nc, exclude_sensitive);
+       rcu_read_unlock();
+       if (err)
+               goto nla_put_failure;
+       nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
+       if (!nla)
+               goto nla_put_failure;
+       if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
+           nla_put_u32(skb, T_current_state, mdev->state.i) ||
+           nla_put_u64(skb, T_ed_uuid, mdev->ed_uuid) ||
+           nla_put_u64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)) ||
+           nla_put_u64(skb, T_send_cnt, mdev->send_cnt) ||
+           nla_put_u64(skb, T_recv_cnt, mdev->recv_cnt) ||
+           nla_put_u64(skb, T_read_cnt, mdev->read_cnt) ||
+           nla_put_u64(skb, T_writ_cnt, mdev->writ_cnt) ||
+           nla_put_u64(skb, T_al_writ_cnt, mdev->al_writ_cnt) ||
+           nla_put_u64(skb, T_bm_writ_cnt, mdev->bm_writ_cnt) ||
+           nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&mdev->ap_bio_cnt)) ||
+           nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&mdev->ap_pending_cnt)) ||
+           nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&mdev->rs_pending_cnt)))
+               goto nla_put_failure;
+       if (got_ldev) {
+               int err;
  
-       if (get_net_conf(mdev)) {
-               tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
-               put_net_conf(mdev);
+               spin_lock_irq(&mdev->ldev->md.uuid_lock);
+               err = nla_put(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid);
+               spin_unlock_irq(&mdev->ldev->md.uuid_lock);
+               if (err)
+                       goto nla_put_failure;
+               if (nla_put_u32(skb, T_disk_flags, mdev->ldev->md.flags) ||
+                   nla_put_u64(skb, T_bits_total, drbd_bm_bits(mdev)) ||
+                   nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(mdev)))
+                       goto nla_put_failure;
+               if (C_SYNC_SOURCE <= mdev->state.conn &&
+                   C_PAUSED_SYNC_T >= mdev->state.conn) {
+                       if (nla_put_u64(skb, T_bits_rs_total, mdev->rs_total) ||
+                           nla_put_u64(skb, T_bits_rs_failed, mdev->rs_failed))
+                               goto nla_put_failure;
+               }
        }
-       tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
  
-       put_unaligned(TT_END, tl++); /* Close the tag list */
+       if (sib) {
+               switch(sib->sib_reason) {
+               case SIB_SYNC_PROGRESS:
+               case SIB_GET_STATUS_REPLY:
+                       break;
+               case SIB_STATE_CHANGE:
+                       if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
+                           nla_put_u32(skb, T_new_state, sib->ns.i))
+                               goto nla_put_failure;
+                       break;
+               case SIB_HELPER_POST:
+                       if (nla_put_u32(skb, T_helper_exit_code,
+                                       sib->helper_exit_code))
+                               goto nla_put_failure;
+                       /* fall through */
+               case SIB_HELPER_PRE:
+                       if (nla_put_string(skb, T_helper, sib->helper_name))
+                               goto nla_put_failure;
+                       break;
+               }
+       }
+       nla_nest_end(skb, nla);
  
-       return (int)((char *)tl - (char *)reply->tag_list);
+       if (0)
+ nla_put_failure:
+               err = -EMSGSIZE;
+       if (got_ldev)
+               put_ldev(mdev);
+       return err;
  }
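
The "if (0) nla_put_failure:" tail above is a C idiom: every failure goto
lands on the error assignment, while the straight-line success path skips it
and falls into the shared cleanup. A self-contained userspace sketch of the
same idiom (the malloc work is illustrative only):

	#include <stdlib.h>

	static int example(void)
	{
		int err = 0;
		char *buf = malloc(16);

		if (!buf)
			goto fail;
		/* ... use buf ... */
		if (0)
	fail:
			err = -1;	/* reached only via "goto fail" */
		free(buf);		/* shared cleanup; free(NULL) is a no-op */
		return err;
	}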
  
- static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                            struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
  {
-       unsigned short *tl = reply->tag_list;
-       union drbd_state s = mdev->state;
-       unsigned long rs_left;
-       unsigned int res;
+       enum drbd_ret_code retcode;
+       int err;
  
-       tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       /* no local ref, no bitmap, no syncer progress. */
-       if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
-               if (get_ldev(mdev)) {
-                       drbd_get_syncer_progress(mdev, &rs_left, &res);
-                       tl = tl_add_int(tl, T_sync_progress, &res);
-                       put_ldev(mdev);
-               }
+       err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL);
+       if (err) {
+               nlmsg_free(adm_ctx.reply_skb);
+               return err;
        }
-       put_unaligned(TT_END, tl++); /* Close the tag list */
-       return (int)((char *)tl - (char *)reply->tag_list);
+ out:
+       drbd_adm_finish(info, retcode);
+       return 0;
  }
  
- static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                            struct drbd_nl_cfg_reply *reply)
+ int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
  {
-       unsigned short *tl;
-       tl = reply->tag_list;
+       struct drbd_conf *mdev;
+       struct drbd_genlmsghdr *dh;
+       struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0];
+       struct drbd_tconn *tconn = NULL;
+       struct drbd_tconn *tmp;
+       unsigned volume = cb->args[1];
+       /* Open-coded, deferred iteration:
+        * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) {
+        *      idr_for_each_entry(&tconn->volumes, mdev, i) {
+        *        ...
+        *      }
+        * }
+        * where tconn is cb->args[0];
+        * and i is cb->args[1];
+        *
+        * cb->args[2] indicates if we shall loop over all resources,
+        * or just dump all volumes of a single resource.
+        *
+        * This may miss entries inserted after this dump started,
+        * or entries deleted before they are reached.
+        *
+        * We need to make sure the mdev won't disappear while
+        * we are looking at it, and revalidate our iterators
+        * on each iteration.
+        */
  
-       if (get_ldev(mdev)) {
-               unsigned long flags;
-               spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
-               tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
-               tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
-               spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
-               put_ldev(mdev);
+       /* synchronize with conn_create()/conn_destroy() */
+       rcu_read_lock();
+       /* revalidate iterator position */
+       list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) {
+               if (pos == NULL) {
+                       /* first iteration */
+                       pos = tmp;
+                       tconn = pos;
+                       break;
+               }
+               if (tmp == pos) {
+                       tconn = pos;
+                       break;
+               }
        }
-       put_unaligned(TT_END, tl++); /* Close the tag list */
+       if (tconn) {
+ next_tconn:
+               mdev = idr_get_next(&tconn->volumes, &volume);
+               if (!mdev) {
+                       /* No more volumes to dump on this tconn.
+                        * Advance tconn iterator. */
+                       pos = list_entry_rcu(tconn->all_tconn.next,
+                                            struct drbd_tconn, all_tconn);
+                       /* Did we dump any volume on this tconn yet? */
+                       if (volume != 0) {
+                               /* If we reached the end of the list,
+                                * or only a single resource dump was requested,
+                                * we are done. */
+                               if (&pos->all_tconn == &drbd_tconns || cb->args[2])
+                                       goto out;
+                               volume = 0;
+                               tconn = pos;
+                               goto next_tconn;
+                       }
+               }
 -              dh = genlmsg_put(skb, NETLINK_CB(cb->skb).pid,
++              dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+                               cb->nlh->nlmsg_seq, &drbd_genl_family,
+                               NLM_F_MULTI, DRBD_ADM_GET_STATUS);
+               if (!dh)
+                       goto out;
+               if (!mdev) {
+                       /* This is a tconn without a single volume.
+                        * Surprisingly enough, it may have a network
+                        * configuration. */
+                       struct net_conf *nc;
+                       dh->minor = -1U;
+                       dh->ret_code = NO_ERROR;
+                       if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED))
+                               goto cancel;
+                       nc = rcu_dereference(tconn->net_conf);
+                       if (nc && net_conf_to_skb(skb, nc, 1) != 0)
+                               goto cancel;
+                       goto done;
+               }
  
-       return (int)((char *)tl - (char *)reply->tag_list);
+               D_ASSERT(mdev->vnr == volume);
+               D_ASSERT(mdev->tconn == tconn);
+               dh->minor = mdev_to_minor(mdev);
+               dh->ret_code = NO_ERROR;
+               if (nla_put_status_info(skb, mdev, NULL)) {
+ cancel:
+                       genlmsg_cancel(skb, dh);
+                       goto out;
+               }
+ done:
+               genlmsg_end(skb, dh);
+       }
+ out:
+       rcu_read_unlock();
+       /* where to start the next iteration */
+       cb->args[0] = (long)pos;
+       cb->args[1] = (pos == tconn) ? volume + 1 : 0;
+       /* No more tconns/volumes/minors found results in an empty skb,
+        * which will terminate the dump. */
+       return skb->len;
  }
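
get_one_status() follows the usual netlink dump contract: keep a cursor in
cb->args[], emit NLM_F_MULTI messages until the skb fills up, and terminate
by returning with an empty skb. A stripped-down sketch of that contract,
assuming a hypothetical my_genl_family, MY_CMD_GET, MY_ATTR_ID and item
table (none of these are DRBD symbols):

	#include <net/genetlink.h>

	static struct genl_family my_genl_family;
	enum { MY_CMD_GET = 1, MY_ATTR_ID = 1 };
	static struct { u32 id; } table[32];

	static int my_dump(struct sk_buff *skb, struct netlink_callback *cb)
	{
		unsigned long idx = cb->args[0];	/* resume position */
		void *hdr;

		for (; idx < ARRAY_SIZE(table); idx++) {
			hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
					  cb->nlh->nlmsg_seq, &my_genl_family,
					  NLM_F_MULTI, MY_CMD_GET);
			if (!hdr)
				break;		/* skb full; resume next call */
			if (nla_put_u32(skb, MY_ATTR_ID, table[idx].id)) {
				genlmsg_cancel(skb, hdr);
				break;
			}
			genlmsg_end(skb, hdr);
		}
		cb->args[0] = idx;	/* store the cursor */
		return skb->len;	/* 0 == empty skb == dump done */
	}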
  
- /**
-  * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
-  * @mdev:     DRBD device.
-  * @nlp:      Netlink/connector packet from drbdsetup
-  * @reply:    Reply packet for drbdsetup
+ /*
+  * Request status of all resources, or of all volumes within a single resource.
+  *
+  * This is a dump, as the answer may not fit in a single reply skb otherwise.
+  * Which means we cannot use the family->attrbuf or other such members, because
+  * dump is NOT protected by the genl_lock().  During dump, we only have access
+  * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
+  *
+  * Once things are setup properly, we call into get_one_status().
   */
- static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                                   struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
  {
-       unsigned short *tl;
-       char rv;
+       const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+       struct nlattr *nla;
+       const char *resource_name;
+       struct drbd_tconn *tconn;
+       int maxtype;
+       /* Is this a follow-up call? */
+       if (cb->args[0]) {
+               /* ... of a single resource dump,
+                * and the resource iterator has been advanced already? */
+               if (cb->args[2] && cb->args[2] != cb->args[0])
+                       return 0; /* DONE. */
+               goto dump;
+       }
+       /* First call (from netlink_dump_start).  We need to figure out
+        * which resource(s) the user wants us to dump. */
+       nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
+                       nlmsg_attrlen(cb->nlh, hdrlen),
+                       DRBD_NLA_CFG_CONTEXT);
+       /* No explicit context given.  Dump all. */
+       if (!nla)
+               goto dump;
+       maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+       nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
+       if (IS_ERR(nla))
+               return PTR_ERR(nla);
+       /* context given, but no name present? */
+       if (!nla)
+               return -EINVAL;
+       resource_name = nla_data(nla);
+       tconn = conn_get_by_name(resource_name);
+       if (!tconn)
+               return -ENODEV;
+       kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */
+       /* prime iterators, and set "filter" mode mark:
+        * only dump this tconn. */
+       cb->args[0] = (long)tconn;
+       /* cb->args[1] = 0; passed in this way. */
+       cb->args[2] = (long)tconn;
+ dump:
+       return get_one_status(skb, cb);
+ }
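
The comment above states the key constraint: a dump runs without genl_lock(),
so pre-parsed family attributes are unavailable and the handler re-parses the
request from cb->nlh on every call. A sketch of that lookup, assuming a
hypothetical MY_HDRLEN (family header size) and MY_NLA_CTX attribute type:

	#include <net/netlink.h>

	static struct nlattr *find_ctx_attr(struct netlink_callback *cb)
	{
		return nla_find(nlmsg_attrdata(cb->nlh, MY_HDRLEN),
				nlmsg_attrlen(cb->nlh, MY_HDRLEN),
				MY_NLA_CTX); /* NULL if no context given */
	}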
  
-       tl = reply->tag_list;
+ int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
+ {
+       enum drbd_ret_code retcode;
+       struct timeout_parms tp;
+       int err;
  
-       rv = mdev->state.pdsk == D_OUTDATED        ? UT_PEER_OUTDATED :
-         drbd_test_flag(mdev, USE_DEGR_WFC_T) ? UT_DEGRADED : UT_DEFAULT;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
  
-       tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
-       put_unaligned(TT_END, tl++); /* Close the tag list */
+       tp.timeout_type =
+               adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
+               test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? UT_DEGRADED :
+               UT_DEFAULT;
  
-       return (int)((char *)tl - (char *)reply->tag_list);
+       err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
+       if (err) {
+               nlmsg_free(adm_ctx.reply_skb);
+               return err;
+       }
+ out:
+       drbd_adm_finish(info, retcode);
+       return 0;
  }
  
- static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
-                                   struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
  {
-       /* default to resume from last known position, if possible */
-       struct start_ov args = {
-               .start_sector = mdev->ov_start_sector,
-               .stop_sector = ULLONG_MAX,
-       };
+       struct drbd_conf *mdev;
+       enum drbd_ret_code retcode;
+       struct start_ov_parms parms;
  
-       if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
-               reply->ret_code = ERR_MANDATORY_TAG;
-               return 0;
+       retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+       if (!adm_ctx.reply_skb)
+               return retcode;
+       if (retcode != NO_ERROR)
+               goto out;
+       mdev = adm_ctx.mdev;
+       /* resume from last known position, if possible */
+       parms.ov_start_sector = mdev->ov_start_sector;
+       parms.ov_stop_sector = ULLONG_MAX;
+       if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
+               int err = start_ov_parms_from_attrs(&parms, info);
+               if (err) {
+                       retcode = ERR_MANDATORY_TAG;
+                       drbd_msg_put_info(from_attrs_err_to_txt(err));
+                       goto out;
+               }
        }
+       /* w_make_ov_request expects position to be aligned */
+       mdev->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
+       mdev->ov_stop_sector = parms.ov_stop_sector;
  
        /* If there is still bitmap IO pending, e.g. previous resync or verify
         * just being finished, wait for it before requesting a new resync. */
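
The "& ~(BM_SECT_PER_BIT-1)" above is the usual power-of-two round-down.
With 4 KiB bitmap granularity, BM_SECT_PER_BIT is 8, so for example:

	sector_t aligned = 262269 & ~(sector_t)(8 - 1);	/* == 262264 */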
@@@ -656,19 -699,11 +699,11 @@@ static int prepare_listen_socket(struc
                goto out;
        }
  
-       timeo = mdev->net_conf->try_connect_int * HZ;
-       timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
-       s_listen->sk->sk_reuse    = SK_CAN_REUSE; /* SO_REUSEADDR */
-       s_listen->sk->sk_rcvtimeo = timeo;
-       s_listen->sk->sk_sndtimeo = timeo;
-       drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
-                       mdev->net_conf->rcvbuf_size);
 -      s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
++      s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+       drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
  
        what = "bind before listen";
-       err = s_listen->ops->bind(s_listen,
-                             (struct sockaddr *) mdev->net_conf->my_addr,
-                             mdev->net_conf->my_addr_len);
+       err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
        if (err < 0)
                goto out;
  
@@@ -829,33 -967,39 +967,39 @@@ randomize
                                goto out_release_sockets;
                }
  
-               if (sock && msock) {
-                       ok = drbd_socket_okay(mdev, &sock);
-                       ok = drbd_socket_okay(mdev, &msock) && ok;
-                       if (ok)
-                               break;
-               }
-       } while (1);
+               ok = drbd_socket_okay(&sock.socket);
+               ok = drbd_socket_okay(&msock.socket) && ok;
+       } while (!ok);
+       if (ad.s_listen)
+               sock_release(ad.s_listen);
  
-       msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
-       sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
 -      sock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */
 -      msock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */
++      sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
++      msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
  
-       sock->sk->sk_allocation = GFP_NOIO;
-       msock->sk->sk_allocation = GFP_NOIO;
+       sock.socket->sk->sk_allocation = GFP_NOIO;
+       msock.socket->sk->sk_allocation = GFP_NOIO;
  
-       sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
-       msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
+       sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+       msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
  
        /* NOT YET ...
-        * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
-        * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
-        * first set it to the P_HAND_SHAKE timeout,
+        * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10;
+        * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+        * first set it to the P_CONNECTION_FEATURES timeout,
         * which we set to 4x the configured ping_timeout. */
-       sock->sk->sk_sndtimeo =
-       sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
+       rcu_read_lock();
+       nc = rcu_dereference(tconn->net_conf);
+       sock.socket->sk->sk_sndtimeo =
+       sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
  
-       msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
-       msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+       msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
+       timeout = nc->timeout * HZ / 10;
+       discard_my_data = nc->discard_my_data;
+       rcu_read_unlock();
+       msock.socket->sk->sk_sndtimeo = timeout;
  
        /* we don't want delays.
         * we use TCP_CORK where appropriate, though */
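
A unit note on the timeout arithmetic above: DRBD's ping_timeo and timeout
are configured in deciseconds, while ping_int is in whole seconds, hence the
differing conversions to jiffies. A sketch using the documented defaults:

	unsigned int ping_timeo = 5;	/* 0.5 s, in deciseconds */
	unsigned int ping_int = 10;	/* whole seconds */
	long features_timeout = ping_timeo * 4 * HZ / 10;	/* 4x, in jiffies */
	long msock_rcvtimeo = ping_int * HZ;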
@@@ -1030,119 -1094,54 +1094,54 @@@ void __drbd_make_request(struct drbd_co
  
        /* no point in adding empty flushes to the transfer log,
         * they are mapped to drbd barriers already. */
-       if (likely(size!=0))
-               list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+       if (likely(req->i.size!=0)) {
+               if (rw == WRITE)
+                       mdev->tconn->current_tle_writes++;
  
-       /* NOTE remote first: to get the concurrent write detection right,
-        * we must register the request before start of local IO.  */
-       if (remote) {
-               /* either WRITE and C_CONNECTED,
-                * or READ, and no local disk,
-                * or READ, but not in sync.
-                */
-               _req_mod(req, (rw == WRITE)
-                               ? queue_for_net_write
-                               : queue_for_net_read);
+               list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log);
        }
-       if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
-               _req_mod(req, queue_for_send_oos);
-       if (remote &&
-           mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
-               maybe_pull_ahead(mdev);
  
-       /* If this was a flush, queue a drbd barrier/start a new epoch.
-        * Unless the current epoch was empty anyways, or we are not currently
-        * replicating, in which case there is no point. */
-       if (unlikely(bio->bi_rw & REQ_FLUSH)
-               && mdev->newest_tle->n_writes
-               && drbd_should_do_remote(mdev->state))
-               queue_barrier(mdev);
-       spin_unlock_irq(&mdev->req_lock);
-       kfree(b); /* if someone else has beaten us to it... */
-       if (local) {
-               req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
-               /* State may have changed since we grabbed our reference on the
-                * mdev->ldev member. Double check, and short-circuit to endio.
-                * In case the last activity log transaction failed to get on
-                * stable storage, and this is a WRITE, we may not even submit
-                * this bio. */
-               if (get_ldev(mdev)) {
-                       if (drbd_insert_fault(mdev,   rw == WRITE ? DRBD_FAULT_DT_WR
-                                                   : rw == READ  ? DRBD_FAULT_DT_RD
-                                                   :               DRBD_FAULT_DT_RA))
-                               bio_endio(req->private_bio, -EIO);
-                       else
-                               generic_make_request(req->private_bio);
-                       put_ldev(mdev);
+       if (rw == WRITE) {
+               if (!drbd_process_write_request(req))
+                       no_remote = true;
+       } else {
+               /* We either have a private_bio, or we can read from remote.
+                * Otherwise we had done the goto nodata above. */
+               if (req->private_bio == NULL) {
+                       _req_mod(req, TO_BE_SENT);
+                       _req_mod(req, QUEUE_FOR_NET_READ);
                } else
-                       bio_endio(req->private_bio, -EIO);
+                       no_remote = true;
        }
  
-       return 0;
- fail_conflicting:
-       /* this is a conflicting request.
-        * even though it may have been only _partially_
-        * overlapping with one of the currently pending requests,
-        * without even submitting or sending it, we will
-        * pretend that it was successfully served right now.
-        */
-       _drbd_end_io_acct(mdev, req);
-       spin_unlock_irq(&mdev->req_lock);
-       if (remote)
-               dec_ap_pending(mdev);
-       /* THINK: do we want to fail it (-EIO), or pretend success?
-        * this pretends success. */
-       err = 0;
- fail_free_complete:
-       if (req->rq_state & RQ_IN_ACT_LOG)
-               drbd_al_complete_io(mdev, sector);
- fail_and_free_req:
-       if (local) {
-               bio_put(req->private_bio);
-               req->private_bio = NULL;
-               put_ldev(mdev);
+       if (req->private_bio) {
+               /* needs to be marked within the same spinlock */
+               _req_mod(req, TO_BE_SUBMITTED);
+               /* but we need to give up the spinlock to submit */
+               spin_unlock_irq(&mdev->tconn->req_lock);
+               drbd_submit_req_private_bio(req);
+               spin_lock_irq(&mdev->tconn->req_lock);
+       } else if (no_remote) {
+ nodata:
+               if (__ratelimit(&drbd_ratelimit_state))
+                       dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
+                                       (unsigned long long)req->i.sector, req->i.size >> 9);
+               /* A write may have been queued for send_oos, however.
+                * So we cannot simply free it; we must go through
+                * drbd_req_put_completion_ref(). */
        }
-       if (!ret)
-               bio_endio(bio, err);
-       drbd_req_free(req);
-       dec_ap_bio(mdev);
-       kfree(b);
-       return ret;
- }
  
- /* helper function for drbd_make_request
-  * if we can determine just by the mdev (state) that this request will fail,
-  * return 1
-  * otherwise return 0
-  */
- static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
- {
-       if (mdev->state.role != R_PRIMARY &&
-               (!allow_oos || is_write)) {
-               if (__ratelimit(&drbd_ratelimit_state)) {
-                       dev_err(DEV, "Process %s[%u] tried to %s; "
-                           "since we are not in Primary state, "
-                           "we cannot allow this\n",
-                           current->comm, current->pid,
-                           is_write ? "WRITE" : "READ");
-               }
-               return 1;
-       }
+ out:
+       if (drbd_req_put_completion_ref(req, &m, 1))
+               kref_put(&req->kref, drbd_req_destroy);
+       spin_unlock_irq(&mdev->tconn->req_lock);
  
-       return 0;
+       if (m.bio)
+               complete_master_bio(mdev, &m);
+       return;
  }
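
Request lifetime above is reference-counted: when the last completion
reference is dropped, kref_put() with drbd_req_destroy frees the request.
The underlying pattern is the stock kref idiom; a generic sketch:

	#include <linux/kernel.h>
	#include <linux/kref.h>
	#include <linux/slab.h>

	struct item {
		struct kref kref;
		/* payload */
	};

	static void item_release(struct kref *kref)
	{
		struct item *it = container_of(kref, struct item, kref);
		kfree(it);
	}

	/* kref_init() on allocation, kref_get() per additional holder;
	 * the final kref_put(&it->kref, item_release) frees the object. */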
  
 -int drbd_make_request(struct request_queue *q, struct bio *bio)
 +void drbd_make_request(struct request_queue *q, struct bio *bio)
  {
-       unsigned int s_enr, e_enr;
        struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
        unsigned long start_time;
  
        start_time = jiffies;
  
        /*
         * what we "blindly" assume:
         */
-       D_ASSERT((bio->bi_size & 0x1ff) == 0);
-       /* to make some things easier, force alignment of requests within the
-        * granularity of our hash tables */
-       s_enr = bio->bi_sector >> HT_SHIFT;
-       e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr;
-       if (likely(s_enr == e_enr)) {
-               do {
-                       inc_ap_bio(mdev, 1);
-               } while (drbd_make_request_common(mdev, bio, start_time));
-               return;
-       }
-       /* can this bio be split generically?
-        * Maybe add our own split-arbitrary-bios function. */
-       if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) {
-               /* rather error out here than BUG in bio_split */
-               dev_err(DEV, "bio would need to, but cannot, be split: "
-                   "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
-                   bio->bi_vcnt, bio->bi_idx, bio->bi_size,
-                   (unsigned long long)bio->bi_sector);
-               bio_endio(bio, -EINVAL);
-       } else {
-               /* This bio crosses some boundary, so we have to split it. */
-               struct bio_pair *bp;
-               /* works for the "do not cross hash slot boundaries" case
-                * e.g. sector 262269, size 4096
-                * s_enr = 262269 >> 6 = 4097
-                * e_enr = (262269+8-1) >> 6 = 4098
-                * HT_SHIFT = 6
-                * sps = 64, mask = 63
-                * first_sectors = 64 - (262269 & 63) = 3
-                */
-               const sector_t sect = bio->bi_sector;
-               const int sps = 1 << HT_SHIFT; /* sectors per slot */
-               const int mask = sps - 1;
-               const sector_t first_sectors = sps - (sect & mask);
-               bp = bio_split(bio, first_sectors);
+       D_ASSERT(IS_ALIGNED(bio->bi_size, 512));
  
-               /* we need to get a "reference count" (ap_bio_cnt)
-                * to avoid races with the disconnect/reconnect/suspend code.
-                * In case we need to split the bio here, we need to get three references
-                * atomically, otherwise we might deadlock when trying to submit the
-                * second one! */
-               inc_ap_bio(mdev, 3);
-               D_ASSERT(e_enr == s_enr + 1);
-               while (drbd_make_request_common(mdev, &bp->bio1, start_time))
-                       inc_ap_bio(mdev, 1);
-               while (drbd_make_request_common(mdev, &bp->bio2, start_time))
-                       inc_ap_bio(mdev, 1);
-               dec_ap_bio(mdev);
-               bio_pair_release(bp);
-       }
+       inc_ap_bio(mdev);
+       __drbd_make_request(mdev, bio, start_time);
 -
 -      return 0;
  }
  
- /* This is called by bio_add_page().  With this function we reduce
-  * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs
-  * units (was AL_EXTENTs).
+ /* This is called by bio_add_page().
+  *
+  * q->max_hw_sectors and other global limits are already enforced there.
   *
-  * we do the calculation within the lower 32bit of the byte offsets,
-  * since we don't care for actual offset, but only check whether it
-  * would cross "activity log extent" boundaries.
+  * We need to call down to our lower level device,
+  * in case it has special restrictions.
+  *
+  * We also may need to enforce configured max-bio-bvecs limits.
   *
   * As long as the BIO is empty we have to allow at least one bvec,
-  * regardless of size and offset.  so the resulting bio may still
-  * cross extent boundaries.  those are dealt with (bio_split) in
-  * drbd_make_request.
+  * regardless of size and offset, so no need to ask lower levels.
   */
  int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
  {
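
A merge_bvec_fn answers "how many bytes of @bvec may be appended to a bio
that already holds @bvm->bi_size bytes at @bvm->bi_sector". A conservative
sketch of the contract described in the comment above (limit_for_lower_device
is a hypothetical helper, not the DRBD implementation):

	static int example_merge_bvec(struct request_queue *q,
				      struct bvec_merge_data *bvm,
				      struct bio_vec *bvec)
	{
		if (bvm->bi_size == 0)
			return bvec->bv_len;	/* empty bio: must accept one bvec */
		return limit_for_lower_device(q, bvm, bvec);
	}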