* GFP_NOIO, as this is called while drbd IO is "suspended",
* and during resize or attach on diskless Primary,
* we must not block on IO to ourselves.
- * Context is receiver thread or cqueue thread/dmsetup. */
+ * Context is receiver thread or dmsetup. */
bytes = sizeof(struct page *)*want;
- new_pages = kmalloc(bytes, GFP_NOIO);
+ new_pages = kzalloc(bytes, GFP_NOIO);
if (!new_pages) {
new_pages = __vmalloc(bytes,
- GFP_NOIO | __GFP_HIGHMEM,
+ GFP_NOIO | __GFP_HIGHMEM | __GFP_ZERO,
PAGE_KERNEL);
if (!new_pages)
return NULL;
{
int i;
int bits;
- unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1);
+ int changed = 0;
+ unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
for (i = first_word; i < last_word; i++) {
bits = hweight_long(paddr[i]);
paddr[i] = ~0UL;
- b->bm_set += BITS_PER_LONG - bits;
+ changed += BITS_PER_LONG - bits;
}
- kunmap_atomic(paddr, KM_IRQ1);
+ kunmap_atomic(paddr);
+ if (changed) {
+ /* We only need lazy writeout, the information is still in the
+ * remote bitmap as well, and is reconstructed during the next
+ * bitmap exchange, if lost locally due to a crash. */
+ bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
+ b->bm_set += changed;
+ }
}
/* Same thing as drbd_bm_set_bits,
/* module parameter, defined in drbd_main.c */
extern unsigned int minor_count;
-extern int disable_sendpage;
-extern int allow_oos;
+extern bool disable_sendpage;
+extern bool allow_oos;
- extern unsigned int cn_idx;
#ifdef CONFIG_DRBD_FAULT_INJECTION
extern int enable_faults;
int rs_last_events; /* counter of read or write "events" (unit sectors)
* on the lower level device when we last looked. */
int c_sync_rate; /* current resync rate after syncer throttle magic */
- struct fifo_buffer rs_plan_s; /* correction values of resync planer */
+ struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, tconn->conn_update) */
int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
- int rs_planed; /* resync sectors already planned */
atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
- int peer_max_bio_size;
- int local_max_bio_size;
+ unsigned int peer_max_bio_size;
+ unsigned int local_max_bio_size;
};
- static inline void drbd_set_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
- set_bit(f, &mdev->drbd_flags[0]);
- }
-
- static inline void drbd_clear_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
- clear_bit(f, &mdev->drbd_flags[0]);
- }
-
- static inline int drbd_test_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
- return test_bit(f, &mdev->drbd_flags[0]);
- }
-
- static inline int drbd_test_and_set_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
- return test_and_set_bit(f, &mdev->drbd_flags[0]);
- }
-
- static inline int drbd_test_and_clear_flag(struct drbd_conf *mdev, enum drbd_flag f)
- {
- return test_and_clear_bit(f, &mdev->drbd_flags[0]);
- }
-
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
{
- struct drbd_conf *mdev;
-
- mdev = minor < minor_count ? minor_table[minor] : NULL;
-
- return mdev;
+ return (struct drbd_conf *)idr_find(&minors, minor);
}
static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
#endif
#endif
- /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
- * With a value of 8 all IO in one 128K block make it to the same slot of the
- * hash table. */
- #define HT_SHIFT 8
- #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
+ /* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE,
+ * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte.
+ * Since we may live in a mixed-platform cluster,
+ * we limit us to a platform agnostic constant here for now.
+ * A followup commit may allow even bigger BIO sizes,
+ * once we thought that through. */
-#define DRBD_MAX_BIO_SIZE (1 << 20)
++#define DRBD_MAX_BIO_SIZE (1U << 20)
+ #if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+ #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+ #endif
-#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */
+#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
- #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */
-
- /* Number of elements in the app_reads_hash */
- #define APP_R_HSIZE 15
-#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* Header 80 only allows packets up to 32KiB data */
-#define DRBD_MAX_BIO_SIZE_P95 (1 << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
++#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
++#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
extern int drbd_bm_init(struct drbd_conf *mdev);
extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
extern int proc_details;
/* drbd_req */
-extern int drbd_make_request(struct request_queue *q, struct bio *bio);
+ extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
+extern void drbd_make_request(struct request_queue *q, struct bio *bio);
extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
extern int is_valid_ar_handle(struct drbd_request *, sector_t);
--- /dev/null
- * update_interval_end - recompute end of @node
++#include <asm/bug.h>
++#include <linux/rbtree_augmented.h>
+ #include "drbd_interval.h"
+
+ /**
+ * interval_end - return end of @node
+ */
+ static inline
+ sector_t interval_end(struct rb_node *node)
+ {
+ struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
+ return this->end;
+ }
+
+ /**
-static void
-update_interval_end(struct rb_node *node, void *__unused)
++ * compute_subtree_last - compute end of @node
+ *
+ * The end of an interval is the highest (start + (size >> 9)) value of this
+ * node and of its children. Called for @node and its parents whenever the end
+ * may have changed.
+ */
- struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
- sector_t end;
++static inline sector_t
++compute_subtree_last(struct drbd_interval *node)
+ {
- end = this->sector + (this->size >> 9);
- if (node->rb_left) {
- sector_t left = interval_end(node->rb_left);
- if (left > end)
- end = left;
++ sector_t max = node->sector + (node->size >> 9);
+
- if (node->rb_right) {
- sector_t right = interval_end(node->rb_right);
- if (right > end)
- end = right;
++ if (node->rb.rb_left) {
++ sector_t left = interval_end(node->rb.rb_left);
++ if (left > max)
++ max = left;
++ }
++ if (node->rb.rb_right) {
++ sector_t right = interval_end(node->rb.rb_right);
++ if (right > max)
++ max = right;
+ }
- this->end = end;
++ return max;
++}
++
++static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
++{
++ while (rb != stop) {
++ struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb);
++ sector_t subtree_last = compute_subtree_last(node);
++ if (node->end == subtree_last)
++ break;
++ node->end = subtree_last;
++ rb = rb_parent(&node->rb);
+ }
- rb_insert_color(&this->rb, root);
- rb_augment_insert(&this->rb, update_interval_end, NULL);
+ }
+
++static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
++{
++ struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
++ struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
++
++ new->end = old->end;
++}
++
++static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
++{
++ struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
++ struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
++
++ new->end = old->end;
++ old->end = compute_subtree_last(old);
++}
++
++static const struct rb_augment_callbacks augment_callbacks = {
++ augment_propagate,
++ augment_copy,
++ augment_rotate,
++};
++
+ /**
+ * drbd_insert_interval - insert a new interval into a tree
+ */
+ bool
+ drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
+ {
+ struct rb_node **new = &root->rb_node, *parent = NULL;
+
+ BUG_ON(!IS_ALIGNED(this->size, 512));
+
+ while (*new) {
+ struct drbd_interval *here =
+ rb_entry(*new, struct drbd_interval, rb);
+
+ parent = *new;
+ if (this->sector < here->sector)
+ new = &(*new)->rb_left;
+ else if (this->sector > here->sector)
+ new = &(*new)->rb_right;
+ else if (this < here)
+ new = &(*new)->rb_left;
+ else if (this > here)
+ new = &(*new)->rb_right;
+ else
+ return false;
+ }
+
+ rb_link_node(&this->rb, parent, new);
- struct rb_node *deepest;
-
- deepest = rb_augment_erase_begin(&this->rb);
- rb_erase(&this->rb, root);
- rb_augment_erase_end(deepest, update_interval_end, NULL);
++ rb_insert_augmented(&this->rb, root, &augment_callbacks);
+ return true;
+ }
+
+ /**
+ * drbd_contains_interval - check if a tree contains a given interval
+ * @sector: start sector of @interval
+ * @interval: may not be a valid pointer
+ *
+ * Returns if the tree contains the node @interval with start sector @start.
+ * Does not dereference @interval until @interval is known to be a valid object
+ * in @tree. Returns %false if @interval is in the tree but with a different
+ * sector number.
+ */
+ bool
+ drbd_contains_interval(struct rb_root *root, sector_t sector,
+ struct drbd_interval *interval)
+ {
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct drbd_interval *here =
+ rb_entry(node, struct drbd_interval, rb);
+
+ if (sector < here->sector)
+ node = node->rb_left;
+ else if (sector > here->sector)
+ node = node->rb_right;
+ else if (interval < here)
+ node = node->rb_left;
+ else if (interval > here)
+ node = node->rb_right;
+ else
+ return true;
+ }
+ return false;
+ }
+
+ /**
+ * drbd_remove_interval - remove an interval from a tree
+ */
+ void
+ drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
+ {
++ rb_erase_augmented(&this->rb, root, &augment_callbacks);
+ }
+
+ /**
+ * drbd_find_overlap - search for an interval overlapping with [sector, sector + size)
+ * @sector: start sector
+ * @size: size, aligned to 512 bytes
+ *
+ * Returns an interval overlapping with [sector, sector + size), or NULL if
+ * there is none. When there is more than one overlapping interval in the
+ * tree, the interval with the lowest start sector is returned, and all other
+ * overlapping intervals will be on the right side of the tree, reachable with
+ * rb_next().
+ */
+ struct drbd_interval *
+ drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
+ {
+ struct rb_node *node = root->rb_node;
+ struct drbd_interval *overlap = NULL;
+ sector_t end = sector + (size >> 9);
+
+ BUG_ON(!IS_ALIGNED(size, 512));
+
+ while (node) {
+ struct drbd_interval *here =
+ rb_entry(node, struct drbd_interval, rb);
+
+ if (node->rb_left &&
+ sector < interval_end(node->rb_left)) {
+ /* Overlap if any must be on left side */
+ node = node->rb_left;
+ } else if (here->sector < end &&
+ sector < here->sector + (here->size >> 9)) {
+ overlap = here;
+ break;
+ } else if (sector >= here->sector) {
+ /* Overlap if any must be on right side */
+ node = node->rb_right;
+ } else
+ break;
+ }
+ return overlap;
+ }
+
+ struct drbd_interval *
+ drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
+ {
+ sector_t end = sector + (size >> 9);
+ struct rb_node *node;
+
+ for (;;) {
+ node = rb_next(&i->rb);
+ if (!node)
+ return NULL;
+ i = rb_entry(node, struct drbd_interval, rb);
+ if (i->sector >= end)
+ return NULL;
+ if (sector < i->sector + (i->size >> 9))
+ return i;
+ }
+ }
/* module parameter, defined */
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
-int disable_sendpage;
-int allow_oos;
+bool disable_sendpage;
+bool allow_oos;
- unsigned int cn_idx = CN_IDX_DRBD;
int proc_details; /* Detail level in proc drbd*/
/* Module parameter for setting the user mode helper program
static const struct block_device_operations drbd_ops = {
.owner = THIS_MODULE,
- .open = drbd_open,
- .release = drbd_release,
- };
-
- struct bio *bio_alloc_drbd(gfp_t gfp_mask)
- {
- if (!drbd_md_io_bio_set)
- return bio_alloc(gfp_mask, 1);
-
- return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
- }
-
- #ifdef __CHECKER__
- /* When checking with sparse, and this is an inline function, sparse will
- give tons of false positives. When this is a real functions sparse works.
- */
- int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
- {
- int io_allowed;
-
- atomic_inc(&mdev->local_cnt);
- io_allowed = (mdev->state.disk >= mins);
- if (!io_allowed) {
- if (atomic_dec_and_test(&mdev->local_cnt))
- wake_up(&mdev->misc_wait);
- }
- return io_allowed;
- }
-
- #endif
-
- /**
- * DOC: The transfer log
- *
- * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
- * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
- * of the list. There is always at least one &struct drbd_tl_epoch object.
- *
- * Each &struct drbd_tl_epoch has a circular double linked list of requests
- * attached.
- */
- static int tl_init(struct drbd_conf *mdev)
- {
- struct drbd_tl_epoch *b;
-
- /* during device minor initialization, we may well use GFP_KERNEL */
- b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
- if (!b)
- return 0;
- INIT_LIST_HEAD(&b->requests);
- INIT_LIST_HEAD(&b->w.list);
- b->next = NULL;
- b->br_number = 4711;
- b->n_writes = 0;
- b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
-
- mdev->oldest_tle = b;
- mdev->newest_tle = b;
- INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
- INIT_LIST_HEAD(&mdev->barrier_acked_requests);
-
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
-
- return 1;
- }
-
- static void tl_cleanup(struct drbd_conf *mdev)
- {
- D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
- D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
- kfree(mdev->oldest_tle);
- mdev->oldest_tle = NULL;
- kfree(mdev->unused_spare_tle);
- mdev->unused_spare_tle = NULL;
- kfree(mdev->tl_hash);
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
- }
-
- /**
- * _tl_add_barrier() - Adds a barrier to the transfer log
- * @mdev: DRBD device.
- * @new: Barrier to be added before the current head of the TL.
- *
- * The caller must hold the req_lock.
- */
- void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
- {
- struct drbd_tl_epoch *newest_before;
-
- INIT_LIST_HEAD(&new->requests);
- INIT_LIST_HEAD(&new->w.list);
- new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
- new->next = NULL;
- new->n_writes = 0;
-
- newest_before = mdev->newest_tle;
- new->br_number = newest_before->br_number+1;
- if (mdev->newest_tle != new) {
- mdev->newest_tle->next = new;
- mdev->newest_tle = new;
- }
- }
-
- /**
- * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
- * @mdev: DRBD device.
- * @barrier_nr: Expected identifier of the DRBD write barrier packet.
- * @set_size: Expected number of requests before that barrier.
- *
- * In case the passed barrier_nr or set_size does not match the oldest
- * &struct drbd_tl_epoch objects this function will cause a termination
- * of the connection.
- */
- void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
- unsigned int set_size)
- {
- struct drbd_tl_epoch *b, *nob; /* next old barrier */
- struct list_head *le, *tle;
- struct drbd_request *r;
-
- spin_lock_irq(&mdev->req_lock);
-
- b = mdev->oldest_tle;
-
- /* first some paranoia code */
- if (b == NULL) {
- dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
- barrier_nr);
- goto bail;
- }
- if (b->br_number != barrier_nr) {
- dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
- barrier_nr, b->br_number);
- goto bail;
- }
- if (b->n_writes != set_size) {
- dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
- barrier_nr, set_size, b->n_writes);
- goto bail;
- }
-
- /* Clean up list of requests processed during current epoch */
- list_for_each_safe(le, tle, &b->requests) {
- r = list_entry(le, struct drbd_request, tl_requests);
- _req_mod(r, barrier_acked);
- }
- /* There could be requests on the list waiting for completion
- of the write to the local disk. To avoid corruptions of
- slab's data structures we have to remove the lists head.
-
- Also there could have been a barrier ack out of sequence, overtaking
- the write acks - which would be a bug and violating write ordering.
- To not deadlock in case we lose connection while such requests are
- still pending, we need some way to find them for the
- _req_mode(connection_lost_while_pending).
-
- These have been list_move'd to the out_of_sequence_requests list in
- _req_mod(, barrier_acked) above.
- */
- list_splice_init(&b->requests, &mdev->barrier_acked_requests);
-
- nob = b->next;
- if (drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) {
- _tl_add_barrier(mdev, b);
- if (nob)
- mdev->oldest_tle = nob;
- /* if nob == NULL b was the only barrier, and becomes the new
- barrier. Therefore mdev->oldest_tle points already to b */
- } else {
- D_ASSERT(nob != NULL);
- mdev->oldest_tle = nob;
- kfree(b);
- }
-
- spin_unlock_irq(&mdev->req_lock);
- dec_ap_pending(mdev);
-
- return;
-
- bail:
- spin_unlock_irq(&mdev->req_lock);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- }
-
-
- /**
- * _tl_restart() - Walks the transfer log, and applies an action to all requests
- * @mdev: DRBD device.
- * @what: The action/event to perform with all request objects
- *
- * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
- * restart_frozen_disk_io.
- */
- static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
- {
- struct drbd_tl_epoch *b, *tmp, **pn;
- struct list_head *le, *tle, carry_reads;
- struct drbd_request *req;
- int rv, n_writes, n_reads;
-
- b = mdev->oldest_tle;
- pn = &mdev->oldest_tle;
- while (b) {
- n_writes = 0;
- n_reads = 0;
- INIT_LIST_HEAD(&carry_reads);
- list_for_each_safe(le, tle, &b->requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
- rv = _req_mod(req, what);
-
- n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
- n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
- }
- tmp = b->next;
-
- if (n_writes) {
- if (what == resend) {
- b->n_writes = n_writes;
- if (b->w.cb == NULL) {
- b->w.cb = w_send_barrier;
- inc_ap_pending(mdev);
- drbd_set_flag(mdev, CREATE_BARRIER);
- }
-
- drbd_queue_work(&mdev->data.work, &b->w);
- }
- pn = &b->next;
- } else {
- if (n_reads)
- list_add(&carry_reads, &b->requests);
- /* there could still be requests on that ring list,
- * in case local io is still pending */
- list_del(&b->requests);
-
- /* dec_ap_pending corresponding to queue_barrier.
- * the newest barrier may not have been queued yet,
- * in which case w.cb is still NULL. */
- if (b->w.cb != NULL)
- dec_ap_pending(mdev);
-
- if (b == mdev->newest_tle) {
- /* recycle, but reinit! */
- D_ASSERT(tmp == NULL);
- INIT_LIST_HEAD(&b->requests);
- list_splice(&carry_reads, &b->requests);
- INIT_LIST_HEAD(&b->w.list);
- b->w.cb = NULL;
- b->br_number = net_random();
- b->n_writes = 0;
-
- *pn = b;
- break;
- }
- *pn = tmp;
- kfree(b);
- }
- b = tmp;
- list_splice(&carry_reads, &b->requests);
- }
-
- /* Actions operating on the disk state, also want to work on
- requests that got barrier acked. */
-
- list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
- _req_mod(req, what);
- }
- }
-
-
- /**
- * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
- * @mdev: DRBD device.
- *
- * This is called after the connection to the peer was lost. The storage covered
- * by the requests on the transfer gets marked as our of sync. Called from the
- * receiver thread and the worker thread.
- */
- void tl_clear(struct drbd_conf *mdev)
- {
- spin_lock_irq(&mdev->req_lock);
- _tl_clear(mdev);
- spin_unlock_irq(&mdev->req_lock);
- }
-
- static void _tl_clear(struct drbd_conf *mdev)
- {
- struct list_head *le, *tle;
- struct drbd_request *r;
-
- _tl_restart(mdev, connection_lost_while_pending);
-
- /* we expect this list to be empty. */
- D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
-
- /* but just in case, clean it up anyways! */
- list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
- r = list_entry(le, struct drbd_request, tl_requests);
- /* It would be nice to complete outside of spinlock.
- * But this is easier for now. */
- _req_mod(r, connection_lost_while_pending);
- }
-
- /* ensure bit indicating barrier is required is clear */
- drbd_clear_flag(mdev, CREATE_BARRIER);
-
- memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
-
- }
-
- void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
- {
- spin_lock_irq(&mdev->req_lock);
- _tl_restart(mdev, what);
- spin_unlock_irq(&mdev->req_lock);
- }
-
- /**
- * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
- * @mdev: DRBD device.
- */
- void tl_abort_disk_io(struct drbd_conf *mdev)
- {
- struct drbd_tl_epoch *b;
- struct list_head *le, *tle;
- struct drbd_request *req;
-
- spin_lock_irq(&mdev->req_lock);
- b = mdev->oldest_tle;
- while (b) {
- list_for_each_safe(le, tle, &b->requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
- if (!(req->rq_state & RQ_LOCAL_PENDING))
- continue;
- _req_mod(req, abort_disk_io);
- }
- b = b->next;
- }
-
- list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
- if (!(req->rq_state & RQ_LOCAL_PENDING))
- continue;
- _req_mod(req, abort_disk_io);
- }
-
- spin_unlock_irq(&mdev->req_lock);
- }
-
- /**
- * cl_wide_st_chg() - true if the state change is a cluster wide one
- * @mdev: DRBD device.
- * @os: old (current) state.
- * @ns: new (wanted) state.
- */
- static int cl_wide_st_chg(struct drbd_conf *mdev,
- union drbd_state os, union drbd_state ns)
- {
- return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
- ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
- (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
- (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
- (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
- (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
- (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
- }
-
- enum drbd_state_rv
- drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
- union drbd_state mask, union drbd_state val)
- {
- unsigned long flags;
- union drbd_state os, ns;
- enum drbd_state_rv rv;
-
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- rv = _drbd_set_state(mdev, ns, f, NULL);
- ns = mdev->state;
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- return rv;
- }
-
- /**
- * drbd_force_state() - Impose a change which happens outside our control on our state
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- */
- void drbd_force_state(struct drbd_conf *mdev,
- union drbd_state mask, union drbd_state val)
- {
- drbd_change_state(mdev, CS_HARD, mask, val);
- }
-
- static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
- static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
- union drbd_state,
- union drbd_state);
- enum sanitize_state_warnings {
- NO_WARNING,
- ABORTED_ONLINE_VERIFY,
- ABORTED_RESYNC,
- CONNECTION_LOST_NEGOTIATING,
- IMPLICITLY_UPGRADED_DISK,
- IMPLICITLY_UPGRADED_PDSK,
- };
- static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum sanitize_state_warnings *warn);
- int drbd_send_state_req(struct drbd_conf *,
- union drbd_state, union drbd_state);
-
- static enum drbd_state_rv
- _req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
- union drbd_state val)
- {
- union drbd_state os, ns;
- unsigned long flags;
- enum drbd_state_rv rv;
-
- if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_SUCCESS))
- return SS_CW_SUCCESS;
-
- if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_FAIL))
- return SS_CW_FAILED_BY_PEER;
-
- rv = 0;
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- ns = sanitize_state(mdev, os, ns, NULL);
-
- if (!cl_wide_st_chg(mdev, os, ns))
- rv = SS_CW_NO_NEED;
- if (!rv) {
- rv = is_valid_state(mdev, ns);
- if (rv == SS_SUCCESS) {
- rv = is_valid_state_transition(mdev, ns, os);
- if (rv == SS_SUCCESS)
- rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
- }
- }
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- return rv;
- }
-
- /**
- * drbd_req_state() - Perform an eventually cluster wide state change
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- * @f: flags
- *
- * Should not be called directly, use drbd_request_state() or
- * _drbd_request_state().
- */
- static enum drbd_state_rv
- drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
- union drbd_state val, enum chg_state_flags f)
- {
- struct completion done;
- unsigned long flags;
- union drbd_state os, ns;
- enum drbd_state_rv rv;
-
- init_completion(&done);
-
- if (f & CS_SERIALIZE)
- mutex_lock(&mdev->state_mutex);
-
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- ns = sanitize_state(mdev, os, ns, NULL);
-
- if (cl_wide_st_chg(mdev, os, ns)) {
- rv = is_valid_state(mdev, ns);
- if (rv == SS_SUCCESS)
- rv = is_valid_state_transition(mdev, ns, os);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- if (rv < SS_SUCCESS) {
- if (f & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- goto abort;
- }
-
- drbd_state_lock(mdev);
- if (!drbd_send_state_req(mdev, mask, val)) {
- drbd_state_unlock(mdev);
- rv = SS_CW_FAILED_BY_PEER;
- if (f & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- goto abort;
- }
-
- if (mask.conn == C_MASK && val.conn == C_DISCONNECTING)
- drbd_set_flag(mdev, DISCONNECT_SENT);
-
- wait_event(mdev->state_wait,
- (rv = _req_st_cond(mdev, mask, val)));
-
- if (rv < SS_SUCCESS) {
- drbd_state_unlock(mdev);
- if (f & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- goto abort;
- }
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- rv = _drbd_set_state(mdev, ns, f, &done);
- drbd_state_unlock(mdev);
- } else {
- rv = _drbd_set_state(mdev, ns, f, &done);
- }
-
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
- D_ASSERT(current != mdev->worker.task);
- wait_for_completion(&done);
- }
-
- abort:
- if (f & CS_SERIALIZE)
- mutex_unlock(&mdev->state_mutex);
-
- return rv;
- }
-
- /**
- * _drbd_request_state() - Request a state change (with flags)
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- * @f: flags
- *
- * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
- * flag, or when logging of failed state change requests is not desired.
- */
- enum drbd_state_rv
- _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
- union drbd_state val, enum chg_state_flags f)
- {
- enum drbd_state_rv rv;
-
- wait_event(mdev->state_wait,
- (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
-
- return rv;
- }
-
- static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
- {
- dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
- name,
- drbd_conn_str(ns.conn),
- drbd_role_str(ns.role),
- drbd_role_str(ns.peer),
- drbd_disk_str(ns.disk),
- drbd_disk_str(ns.pdsk),
- is_susp(ns) ? 's' : 'r',
- ns.aftr_isp ? 'a' : '-',
- ns.peer_isp ? 'p' : '-',
- ns.user_isp ? 'u' : '-'
- );
- }
-
- void print_st_err(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum drbd_state_rv err)
- {
- if (err == SS_IN_TRANSIENT_STATE)
- return;
- dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
- print_st(mdev, " state", os);
- print_st(mdev, "wanted", ns);
- }
-
-
- /**
- * is_valid_state() - Returns an SS_ error code if ns is not valid
- * @mdev: DRBD device.
- * @ns: State to consider.
- */
- static enum drbd_state_rv
- is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
- {
- /* See drbd_state_sw_errors in drbd_strings.c */
-
- enum drbd_fencing_p fp;
- enum drbd_state_rv rv = SS_SUCCESS;
-
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- if (get_net_conf(mdev)) {
- if (!mdev->net_conf->two_primaries &&
- ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
- rv = SS_TWO_PRIMARIES;
- put_net_conf(mdev);
- }
-
- if (rv <= 0)
- /* already found a reason to abort */;
- else if (ns.role == R_SECONDARY && mdev->open_cnt)
- rv = SS_DEVICE_IN_USE;
-
- else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
- rv = SS_NO_UP_TO_DATE_DISK;
-
- else if (fp >= FP_RESOURCE &&
- ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
- rv = SS_PRIMARY_NOP;
-
- else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
- rv = SS_NO_UP_TO_DATE_DISK;
-
- else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
- rv = SS_NO_LOCAL_DISK;
-
- else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
- rv = SS_NO_REMOTE_DISK;
-
- else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
- rv = SS_NO_UP_TO_DATE_DISK;
-
- else if ((ns.conn == C_CONNECTED ||
- ns.conn == C_WF_BITMAP_S ||
- ns.conn == C_SYNC_SOURCE ||
- ns.conn == C_PAUSED_SYNC_S) &&
- ns.disk == D_OUTDATED)
- rv = SS_CONNECTED_OUTDATES;
-
- else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
- (mdev->sync_conf.verify_alg[0] == 0))
- rv = SS_NO_VERIFY_ALG;
-
- else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
- mdev->agreed_pro_version < 88)
- rv = SS_NOT_SUPPORTED;
-
- else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
- rv = SS_CONNECTED_OUTDATES;
-
- return rv;
- }
-
- /**
- * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
- * @mdev: DRBD device.
- * @ns: new state.
- * @os: old state.
- */
- static enum drbd_state_rv
- is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
- union drbd_state os)
- {
- enum drbd_state_rv rv = SS_SUCCESS;
-
- if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
- os.conn > C_CONNECTED)
- rv = SS_RESYNC_RUNNING;
-
- if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
- rv = SS_ALREADY_STANDALONE;
-
- if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
- rv = SS_IS_DISKLESS;
-
- if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
- rv = SS_NO_NET_CONFIG;
-
- if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
- rv = SS_LOWER_THAN_OUTDATED;
-
- if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
- rv = SS_IN_TRANSIENT_STATE;
-
- if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
- rv = SS_IN_TRANSIENT_STATE;
-
- /* While establishing a connection only allow cstate to change.
- Delay/refuse role changes, detach attach etc... */
- if (drbd_test_flag(mdev, STATE_SENT) &&
- !(os.conn == C_WF_REPORT_PARAMS ||
- (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
- rv = SS_IN_TRANSIENT_STATE;
-
- if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
- rv = SS_NEED_CONNECTION;
-
- if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
- ns.conn != os.conn && os.conn > C_CONNECTED)
- rv = SS_RESYNC_RUNNING;
-
- if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
- os.conn < C_CONNECTED)
- rv = SS_NEED_CONNECTION;
-
- if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
- && os.conn < C_WF_REPORT_PARAMS)
- rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
-
- return rv;
- }
-
- static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
- {
- static const char *msg_table[] = {
- [NO_WARNING] = "",
- [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
- [ABORTED_RESYNC] = "Resync aborted.",
- [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
- [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
- [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
- };
-
- if (warn != NO_WARNING)
- dev_warn(DEV, "%s\n", msg_table[warn]);
- }
-
- /**
- * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
- * @mdev: DRBD device.
- * @os: old state.
- * @ns: new state.
- * @warn_sync_abort:
- *
- * When we loose connection, we have to set the state of the peers disk (pdsk)
- * to D_UNKNOWN. This rule and many more along those lines are in this function.
- */
- static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum sanitize_state_warnings *warn)
- {
- enum drbd_fencing_p fp;
- enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
-
- if (warn)
- *warn = NO_WARNING;
-
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- /* Disallow Network errors to configure a device's network part */
- if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
- os.conn <= C_DISCONNECTING)
- ns.conn = os.conn;
-
- /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
- * If you try to go into some Sync* state, that shall fail (elsewhere). */
- if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
- ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
- ns.conn = os.conn;
-
- /* we cannot fail (again) if we already detached */
- if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
- ns.disk = D_DISKLESS;
-
- /* After C_DISCONNECTING only C_STANDALONE may follow */
- if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
- ns.conn = os.conn;
-
- if (ns.conn < C_CONNECTED) {
- ns.peer_isp = 0;
- ns.peer = R_UNKNOWN;
- if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
- ns.pdsk = D_UNKNOWN;
- }
-
- /* Clear the aftr_isp when becoming unconfigured */
- if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
- ns.aftr_isp = 0;
-
- /* Abort resync if a disk fails/detaches */
- if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
- (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
- if (warn)
- *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
- ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
- ns.conn = C_CONNECTED;
- }
-
- /* Connection breaks down before we finished "Negotiating" */
- if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
- get_ldev_if_state(mdev, D_NEGOTIATING)) {
- if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
- ns.disk = mdev->new_state_tmp.disk;
- ns.pdsk = mdev->new_state_tmp.pdsk;
- } else {
- if (warn)
- *warn = CONNECTION_LOST_NEGOTIATING;
- ns.disk = D_DISKLESS;
- ns.pdsk = D_UNKNOWN;
- }
- put_ldev(mdev);
- }
-
- /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
- if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
- if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
- ns.disk = D_UP_TO_DATE;
- if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
- ns.pdsk = D_UP_TO_DATE;
- }
-
- /* Implications of the connection stat on the disk states */
- disk_min = D_DISKLESS;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_INCONSISTENT;
- pdsk_max = D_UNKNOWN;
- switch ((enum drbd_conns)ns.conn) {
- case C_WF_BITMAP_T:
- case C_PAUSED_SYNC_T:
- case C_STARTING_SYNC_T:
- case C_WF_SYNC_UUID:
- case C_BEHIND:
- disk_min = D_INCONSISTENT;
- disk_max = D_OUTDATED;
- pdsk_min = D_UP_TO_DATE;
- pdsk_max = D_UP_TO_DATE;
- break;
- case C_VERIFY_S:
- case C_VERIFY_T:
- disk_min = D_UP_TO_DATE;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_UP_TO_DATE;
- pdsk_max = D_UP_TO_DATE;
- break;
- case C_CONNECTED:
- disk_min = D_DISKLESS;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_DISKLESS;
- pdsk_max = D_UP_TO_DATE;
- break;
- case C_WF_BITMAP_S:
- case C_PAUSED_SYNC_S:
- case C_STARTING_SYNC_S:
- case C_AHEAD:
- disk_min = D_UP_TO_DATE;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_INCONSISTENT;
- pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
- break;
- case C_SYNC_TARGET:
- disk_min = D_INCONSISTENT;
- disk_max = D_INCONSISTENT;
- pdsk_min = D_UP_TO_DATE;
- pdsk_max = D_UP_TO_DATE;
- break;
- case C_SYNC_SOURCE:
- disk_min = D_UP_TO_DATE;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_INCONSISTENT;
- pdsk_max = D_INCONSISTENT;
- break;
- case C_STANDALONE:
- case C_DISCONNECTING:
- case C_UNCONNECTED:
- case C_TIMEOUT:
- case C_BROKEN_PIPE:
- case C_NETWORK_FAILURE:
- case C_PROTOCOL_ERROR:
- case C_TEAR_DOWN:
- case C_WF_CONNECTION:
- case C_WF_REPORT_PARAMS:
- case C_MASK:
- break;
- }
- if (ns.disk > disk_max)
- ns.disk = disk_max;
-
- if (ns.disk < disk_min) {
- if (warn)
- *warn = IMPLICITLY_UPGRADED_DISK;
- ns.disk = disk_min;
- }
- if (ns.pdsk > pdsk_max)
- ns.pdsk = pdsk_max;
-
- if (ns.pdsk < pdsk_min) {
- if (warn)
- *warn = IMPLICITLY_UPGRADED_PDSK;
- ns.pdsk = pdsk_min;
- }
-
- if (fp == FP_STONITH &&
- (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
- !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
- ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
-
- if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
- (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
- !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
- ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
-
- if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
- if (ns.conn == C_SYNC_SOURCE)
- ns.conn = C_PAUSED_SYNC_S;
- if (ns.conn == C_SYNC_TARGET)
- ns.conn = C_PAUSED_SYNC_T;
- } else {
- if (ns.conn == C_PAUSED_SYNC_S)
- ns.conn = C_SYNC_SOURCE;
- if (ns.conn == C_PAUSED_SYNC_T)
- ns.conn = C_SYNC_TARGET;
- }
-
- return ns;
- }
-
- /* helper for __drbd_set_state */
- static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
- {
- if (mdev->agreed_pro_version < 90)
- mdev->ov_start_sector = 0;
- mdev->rs_total = drbd_bm_bits(mdev);
- mdev->ov_position = 0;
- if (cs == C_VERIFY_T) {
- /* starting online verify from an arbitrary position
- * does not fit well into the existing protocol.
- * on C_VERIFY_T, we initialize ov_left and friends
- * implicitly in receive_DataRequest once the
- * first P_OV_REQUEST is received */
- mdev->ov_start_sector = ~(sector_t)0;
- } else {
- unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
- if (bit >= mdev->rs_total) {
- mdev->ov_start_sector =
- BM_BIT_TO_SECT(mdev->rs_total - 1);
- mdev->rs_total = 1;
- } else
- mdev->rs_total -= bit;
- mdev->ov_position = mdev->ov_start_sector;
- }
- mdev->ov_left = mdev->rs_total;
- }
-
- static void drbd_resume_al(struct drbd_conf *mdev)
- {
- if (drbd_test_and_clear_flag(mdev, AL_SUSPENDED))
- dev_info(DEV, "Resumed AL updates\n");
- }
-
- /**
- * __drbd_set_state() - Set a new DRBD state
- * @mdev: DRBD device.
- * @ns: new state.
- * @flags: Flags
- * @done: Optional completion, that will get completed after the after_state_ch() finished
- *
- * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
- */
- enum drbd_state_rv
- __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
- enum chg_state_flags flags, struct completion *done)
- {
- union drbd_state os;
- enum drbd_state_rv rv = SS_SUCCESS;
- enum sanitize_state_warnings ssw;
- struct after_state_chg_work *ascw;
-
- os = mdev->state;
-
- ns = sanitize_state(mdev, os, ns, &ssw);
-
- if (ns.i == os.i)
- return SS_NOTHING_TO_DO;
-
- if (!(flags & CS_HARD)) {
- /* pre-state-change checks ; only look at ns */
- /* See drbd_state_sw_errors in drbd_strings.c */
-
- rv = is_valid_state(mdev, ns);
- if (rv < SS_SUCCESS) {
- /* If the old state was illegal as well, then let
- this happen...*/
-
- if (is_valid_state(mdev, os) == rv)
- rv = is_valid_state_transition(mdev, ns, os);
- } else
- rv = is_valid_state_transition(mdev, ns, os);
- }
-
- if (rv < SS_SUCCESS) {
- if (flags & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- return rv;
- }
-
- print_sanitize_warnings(mdev, ssw);
-
- {
- char *pbp, pb[300];
- pbp = pb;
- *pbp = 0;
- if (ns.role != os.role)
- pbp += sprintf(pbp, "role( %s -> %s ) ",
- drbd_role_str(os.role),
- drbd_role_str(ns.role));
- if (ns.peer != os.peer)
- pbp += sprintf(pbp, "peer( %s -> %s ) ",
- drbd_role_str(os.peer),
- drbd_role_str(ns.peer));
- if (ns.conn != os.conn)
- pbp += sprintf(pbp, "conn( %s -> %s ) ",
- drbd_conn_str(os.conn),
- drbd_conn_str(ns.conn));
- if (ns.disk != os.disk)
- pbp += sprintf(pbp, "disk( %s -> %s ) ",
- drbd_disk_str(os.disk),
- drbd_disk_str(ns.disk));
- if (ns.pdsk != os.pdsk)
- pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
- drbd_disk_str(os.pdsk),
- drbd_disk_str(ns.pdsk));
- if (is_susp(ns) != is_susp(os))
- pbp += sprintf(pbp, "susp( %d -> %d ) ",
- is_susp(os),
- is_susp(ns));
- if (ns.aftr_isp != os.aftr_isp)
- pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
- os.aftr_isp,
- ns.aftr_isp);
- if (ns.peer_isp != os.peer_isp)
- pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
- os.peer_isp,
- ns.peer_isp);
- if (ns.user_isp != os.user_isp)
- pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
- os.user_isp,
- ns.user_isp);
- dev_info(DEV, "%s\n", pb);
- }
-
- /* solve the race between becoming unconfigured,
- * worker doing the cleanup, and
- * admin reconfiguring us:
- * on (re)configure, first set CONFIG_PENDING,
- * then wait for a potentially exiting worker,
- * start the worker, and schedule one no_op.
- * then proceed with configuration.
- */
- if (ns.disk == D_DISKLESS &&
- ns.conn == C_STANDALONE &&
- ns.role == R_SECONDARY &&
- !drbd_test_and_set_flag(mdev, CONFIG_PENDING))
- drbd_set_flag(mdev, DEVICE_DYING);
-
- /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
- * on the ldev here, to be sure the transition -> D_DISKLESS resp.
- * drbd_ldev_destroy() won't happen before our corresponding
- * after_state_ch works run, where we put_ldev again. */
- if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
- (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
- atomic_inc(&mdev->local_cnt);
-
- mdev->state = ns;
-
- if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
- drbd_print_uuids(mdev, "attached to UUIDs");
-
- wake_up(&mdev->misc_wait);
- wake_up(&mdev->state_wait);
-
- /* Aborted verify run, or we reached the stop sector.
- * Log the last position, unless end-of-device. */
- if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
- ns.conn <= C_CONNECTED) {
- mdev->ov_start_sector =
- BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
- if (mdev->ov_left)
- dev_info(DEV, "Online Verify reached sector %llu\n",
- (unsigned long long)mdev->ov_start_sector);
- }
-
- if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
- (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
- dev_info(DEV, "Syncer continues.\n");
- mdev->rs_paused += (long)jiffies
- -(long)mdev->rs_mark_time[mdev->rs_last_mark];
- if (ns.conn == C_SYNC_TARGET)
- mod_timer(&mdev->resync_timer, jiffies);
- }
-
- if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
- (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
- dev_info(DEV, "Resync suspended\n");
- mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
- }
-
- if (os.conn == C_CONNECTED &&
- (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
- unsigned long now = jiffies;
- int i;
-
- set_ov_position(mdev, ns.conn);
- mdev->rs_start = now;
- mdev->rs_last_events = 0;
- mdev->rs_last_sect_ev = 0;
- mdev->ov_last_oos_size = 0;
- mdev->ov_last_oos_start = 0;
-
- for (i = 0; i < DRBD_SYNC_MARKS; i++) {
- mdev->rs_mark_left[i] = mdev->ov_left;
- mdev->rs_mark_time[i] = now;
- }
-
- drbd_rs_controller_reset(mdev);
-
- if (ns.conn == C_VERIFY_S) {
- dev_info(DEV, "Starting Online Verify from sector %llu\n",
- (unsigned long long)mdev->ov_position);
- mod_timer(&mdev->resync_timer, jiffies);
- }
- }
-
- if (get_ldev(mdev)) {
- u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
- MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
- MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
-
- if (drbd_test_flag(mdev, CRASHED_PRIMARY))
- mdf |= MDF_CRASHED_PRIMARY;
- if (mdev->state.role == R_PRIMARY ||
- (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
- mdf |= MDF_PRIMARY_IND;
- if (mdev->state.conn > C_WF_REPORT_PARAMS)
- mdf |= MDF_CONNECTED_IND;
- if (mdev->state.disk > D_INCONSISTENT)
- mdf |= MDF_CONSISTENT;
- if (mdev->state.disk > D_OUTDATED)
- mdf |= MDF_WAS_UP_TO_DATE;
- if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
- mdf |= MDF_PEER_OUT_DATED;
- if (mdf != mdev->ldev->md.flags) {
- mdev->ldev->md.flags = mdf;
- drbd_md_mark_dirty(mdev);
- }
- if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
- drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
- put_ldev(mdev);
- }
-
- /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
- if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
- os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
- drbd_set_flag(mdev, CONSIDER_RESYNC);
-
- /* Receiver should clean up itself */
- if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
- drbd_thread_stop_nowait(&mdev->receiver);
-
- /* Now the receiver finished cleaning up itself, it should die */
- if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
- drbd_thread_stop_nowait(&mdev->receiver);
-
- /* Upon network failure, we need to restart the receiver. */
- if (os.conn > C_WF_CONNECTION &&
- ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
- drbd_thread_restart_nowait(&mdev->receiver);
-
- /* Resume AL writing if we get a connection */
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
- drbd_resume_al(mdev);
-
- /* remember last connect and attach times so request_timer_fn() won't
- * kill newly established sessions while we are still trying to thaw
- * previously frozen IO */
- if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
- mdev->last_reconnect_jif = jiffies;
- if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
- ns.disk > D_NEGOTIATING)
- mdev->last_reattach_jif = jiffies;
-
- ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
- if (ascw) {
- ascw->os = os;
- ascw->ns = ns;
- ascw->flags = flags;
- ascw->w.cb = w_after_state_ch;
- ascw->done = done;
- drbd_queue_work(&mdev->data.work, &ascw->w);
- } else {
- dev_warn(DEV, "Could not kmalloc an ascw\n");
- }
-
- return rv;
- }
-
- static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
- {
- struct after_state_chg_work *ascw =
- container_of(w, struct after_state_chg_work, w);
- after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
- if (ascw->flags & CS_WAIT_COMPLETE) {
- D_ASSERT(ascw->done != NULL);
- complete(ascw->done);
- }
- kfree(ascw);
-
- return 1;
- }
-
- static void abw_start_sync(struct drbd_conf *mdev, int rv)
- {
- if (rv) {
- dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
- _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
- return;
- }
-
- switch (mdev->state.conn) {
- case C_STARTING_SYNC_T:
- _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
- break;
- case C_STARTING_SYNC_S:
- drbd_start_resync(mdev, C_SYNC_SOURCE);
- break;
- }
- }
-
- int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
- int (*io_fn)(struct drbd_conf *),
- char *why, enum bm_flag flags)
- {
- int rv;
-
- D_ASSERT(current == mdev->worker.task);
-
- /* open coded non-blocking drbd_suspend_io(mdev); */
- drbd_set_flag(mdev, SUSPEND_IO);
+ .open = drbd_open,
+ .release = drbd_release,
+ };
- drbd_bm_lock(mdev, why, flags);
- rv = io_fn(mdev);
- drbd_bm_unlock(mdev);
-static void bio_destructor_drbd(struct bio *bio)
-{
- bio_free(bio, drbd_md_io_bio_set);
-}
-
+ struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+ {
+ struct bio *bio;
- drbd_resume_io(mdev);
+ if (!drbd_md_io_bio_set)
+ return bio_alloc(gfp_mask, 1);
- return rv;
+ bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+ if (!bio)
+ return NULL;
- bio->bi_destructor = bio_destructor_drbd;
+ return bio;
}
- /**
- * after_state_ch() - Perform after state change actions that may sleep
- * @mdev: DRBD device.
- * @os: old state.
- * @ns: new state.
- * @flags: Flags
+ #ifdef __CHECKER__
+ /* When checking with sparse, and this is an inline function, sparse will
+ give tons of false positives. When this is a real functions sparse works.
*/
- static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum chg_state_flags flags)
+ int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
{
- enum drbd_fencing_p fp;
- enum drbd_req_event what = nothing;
- union drbd_state nsm = (union drbd_state){ .i = -1 };
-
- if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
- drbd_clear_flag(mdev, CRASHED_PRIMARY);
- if (mdev->p_uuid)
- mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
- }
+ int io_allowed;
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
+ atomic_inc(&mdev->local_cnt);
+ io_allowed = (mdev->state.disk >= mins);
+ if (!io_allowed) {
+ if (atomic_dec_and_test(&mdev->local_cnt))
+ wake_up(&mdev->misc_wait);
}
+ return io_allowed;
+ }
- /* Inform userspace about the change... */
- drbd_bcast_state(mdev, ns);
-
- if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
- (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
- drbd_khelper(mdev, "pri-on-incon-degr");
-
- /* Here we have the actions that are performed after a
- state change. This function might sleep */
-
- if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
- mod_timer(&mdev->request_timer, jiffies + HZ);
-
- nsm.i = -1;
- if (ns.susp_nod) {
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
- what = resend;
-
- if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
- ns.disk > D_NEGOTIATING)
- what = restart_frozen_disk_io;
-
- if (what != nothing)
- nsm.susp_nod = 0;
- }
+ #endif
- if (ns.susp_fen) {
- /* case1: The outdate peer handler is successful: */
- if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
- if (drbd_test_flag(mdev, NEW_CUR_UUID)) {
- drbd_uuid_new_current(mdev);
- drbd_clear_flag(mdev, NEW_CUR_UUID);
- }
- spin_lock_irq(&mdev->req_lock);
- _tl_clear(mdev);
- _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
- }
- /* case2: The connection was established again: */
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
- drbd_clear_flag(mdev, NEW_CUR_UUID);
- what = resend;
- nsm.susp_fen = 0;
+ /**
+ * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
+ * @tconn: DRBD connection.
+ * @barrier_nr: Expected identifier of the DRBD write barrier packet.
+ * @set_size: Expected number of requests before that barrier.
+ *
+ * In case the passed barrier_nr or set_size does not match the oldest
+ * epoch of not yet barrier-acked requests, this function will cause a
+ * termination of the connection.
+ */
+ void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr,
+ unsigned int set_size)
+ {
+ struct drbd_request *r;
+ struct drbd_request *req = NULL;
+ int expect_epoch = 0;
+ int expect_size = 0;
+
+ spin_lock_irq(&tconn->req_lock);
+
- /* find latest not yet barrier-acked write request,
++ /* find oldest not yet barrier-acked write request,
+ * count writes in its epoch. */
+ list_for_each_entry(r, &tconn->transfer_log, tl_requests) {
+ const unsigned s = r->rq_state;
+ if (!req) {
+ if (!(s & RQ_WRITE))
+ continue;
+ if (!(s & RQ_NET_MASK))
+ continue;
+ if (s & RQ_NET_DONE)
+ continue;
+ req = r;
+ expect_epoch = req->epoch;
+ expect_size ++;
+ } else {
+ if (r->epoch != expect_epoch)
+ break;
+ if (!(s & RQ_WRITE))
+ continue;
+ /* if (s & RQ_DONE): not expected */
+ /* if (!(s & RQ_NET_MASK)): not expected */
+ expect_size++;
}
}
- if (what != nothing) {
- spin_lock_irq(&mdev->req_lock);
- _tl_restart(mdev, what);
- nsm.i &= mdev->state.i;
- _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
+ /* first some paranoia code */
+ if (req == NULL) {
+ conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+ barrier_nr);
+ goto bail;
}
-
- /* Became sync source. With protocol >= 96, we still need to send out
- * the sync uuid now. Need to do that before any drbd_send_state, or
- * the other side may go "paused sync" before receiving the sync uuids,
- * which is unexpected. */
- if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
- (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
- mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
- drbd_gen_and_send_sync_uuid(mdev);
- put_ldev(mdev);
+ if (expect_epoch != barrier_nr) {
+ conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n",
+ barrier_nr, expect_epoch);
+ goto bail;
}
- /* Do not change the order of the if above and the two below... */
- if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
- /* we probably will start a resync soon.
- * make sure those things are properly reset. */
- mdev->rs_total = 0;
- mdev->rs_failed = 0;
- atomic_set(&mdev->rs_pending_cnt, 0);
- drbd_rs_cancel_all(mdev);
-
- drbd_send_uuids(mdev);
- drbd_send_state(mdev, ns);
- }
- /* No point in queuing send_bitmap if we don't have a connection
- * anymore, so check also the _current_ state, not only the new state
- * at the time this work was queued. */
- if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
- mdev->state.conn == C_WF_BITMAP_S)
- drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
- "send_bitmap (WFBitMapS)",
- BM_LOCKED_TEST_ALLOWED);
-
- /* Lost contact to peer's copy of the data */
- if ((os.pdsk >= D_INCONSISTENT &&
- os.pdsk != D_UNKNOWN &&
- os.pdsk != D_OUTDATED)
- && (ns.pdsk < D_INCONSISTENT ||
- ns.pdsk == D_UNKNOWN ||
- ns.pdsk == D_OUTDATED)) {
- if (get_ldev(mdev)) {
- if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
- mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
- if (is_susp(mdev->state)) {
- drbd_set_flag(mdev, NEW_CUR_UUID);
- } else {
- drbd_uuid_new_current(mdev);
- drbd_send_uuids(mdev);
- }
- }
- put_ldev(mdev);
- }
+ if (expect_size != set_size) {
+ conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+ barrier_nr, set_size, expect_size);
+ goto bail;
}
- if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
- if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
- mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
- drbd_uuid_new_current(mdev);
- drbd_send_uuids(mdev);
- }
- /* D_DISKLESS Peer becomes secondary */
- if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
- /* We may still be Primary ourselves.
- * No harm done if the bitmap still changes,
- * redirtied pages will follow later. */
- drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
- "demote diskless peer", BM_LOCKED_SET_ALLOWED);
- put_ldev(mdev);
- /* Clean up list of requests processed during current epoch */
- list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) {
++ /* Clean up list of requests processed during current epoch. */
++ /* this extra list walk restart is paranoia,
++ * to catch requests being barrier-acked "unexpectedly".
++ * It usually should find the same req again, or some READ preceding it. */
++ list_for_each_entry(req, &tconn->transfer_log, tl_requests)
++ if (req->epoch == expect_epoch)
++ break;
++ list_for_each_entry_safe_from(req, r, &tconn->transfer_log, tl_requests) {
+ if (req->epoch != expect_epoch)
+ break;
+ _req_mod(req, BARRIER_ACKED);
}
+ spin_unlock_irq(&tconn->req_lock);
- /* Write out all changed bits on demote.
- * Though, no need to da that just yet
- * if there is a resync going on still */
- if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
- mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
- /* No changes to the bitmap expected this time, so assert that,
- * even though no harm was done if it did change. */
- drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
- "demote", BM_LOCKED_TEST_ALLOWED);
- put_ldev(mdev);
- }
+ return;
- /* Last part of the attaching process ... */
- if (ns.conn >= C_CONNECTED &&
- os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
- drbd_send_sizes(mdev, 0, 0); /* to start sync... */
- drbd_send_uuids(mdev);
- drbd_send_state(mdev, ns);
- }
+ bail:
+ spin_unlock_irq(&tconn->req_lock);
+ conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+ }
- /* We want to pause/continue resync, tell peer. */
- if (ns.conn >= C_CONNECTED &&
- ((os.aftr_isp != ns.aftr_isp) ||
- (os.user_isp != ns.user_isp)))
- drbd_send_state(mdev, ns);
-
- /* In case one of the isp bits got set, suspend other devices. */
- if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
- (ns.aftr_isp || ns.peer_isp || ns.user_isp))
- suspend_other_sg(mdev);
-
- /* Make sure the peer gets informed about eventual state
- changes (ISP bits) while we were in WFReportParams. */
- if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
- drbd_send_state(mdev, ns);
-
- /* We are in the progress to start a full sync... */
- if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
- (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
- /* no other bitmap changes expected during this phase */
- drbd_queue_bitmap_io(mdev,
- &drbd_bmio_set_n_write, &abw_start_sync,
- "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
-
- /* We are invalidating our self... */
- if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
- os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
- /* other bitmap operation expected during this phase */
- drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
- "set_n_write from invalidate", BM_LOCKED_MASK);
-
- /* first half of local IO error, failure to attach,
- * or administrative detach */
- if (os.disk != D_FAILED && ns.disk == D_FAILED) {
- /* corresponding get_ldev was in __drbd_set_state, to serialize
- * our cleanup here with the transition to D_DISKLESS.
- * But it is still not safe to dreference ldev here, we may end
- * up here from a failed attach, before ldev was even set. */
- if (mdev->ldev) {
- enum drbd_io_error_p eh = mdev->ldev->dc.on_io_error;
-
- /* In some setups, this handler triggers a suicide,
- * basically mapping IO error to node failure, to
- * reduce the number of different failure scenarios.
- *
- * This handler intentionally runs before we abort IO,
- * notify the peer, or try to update our meta data. */
- if (eh == EP_CALL_HELPER && drbd_test_flag(mdev, WAS_IO_ERROR))
- drbd_khelper(mdev, "local-io-error");
-
- /* Immediately allow completion of all application IO,
- * that waits for completion from the local disk,
- * if this was a force-detach due to disk_timeout
- * or administrator request (drbdsetup detach --force).
- * Do NOT abort otherwise.
- * Aborting local requests may cause serious problems,
- * if requests are completed to upper layers already,
- * and then later the already submitted local bio completes.
- * This can cause DMA into former bio pages that meanwhile
- * have been re-used for other things.
- * So aborting local requests may cause crashes,
- * or even worse, silent data corruption.
- */
- if (drbd_test_flag(mdev, FORCE_DETACH))
- tl_abort_disk_io(mdev);
-
- /* current state still has to be D_FAILED,
- * there is only one way out: to D_DISKLESS,
- * and that may only happen after our put_ldev below. */
- if (mdev->state.disk != D_FAILED)
- dev_err(DEV,
- "ASSERT FAILED: disk is %s during detach\n",
- drbd_disk_str(mdev->state.disk));
-
- if (ns.conn >= C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- drbd_rs_cancel_all(mdev);
-
- /* In case we want to get something to stable storage still,
- * this may be the last chance.
- * Following put_ldev may transition to D_DISKLESS. */
- drbd_md_sync(mdev);
- }
- put_ldev(mdev);
- }
- /* second half of local IO error, failure to attach,
- * or administrative detach,
- * after local_cnt references have reached zero again */
- if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
- /* We must still be diskless,
- * re-attach has to be serialized with this! */
- if (mdev->state.disk != D_DISKLESS)
- dev_err(DEV,
- "ASSERT FAILED: disk is %s while going diskless\n",
- drbd_disk_str(mdev->state.disk));
-
- if (ns.conn >= C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- /* corresponding get_ldev in __drbd_set_state
- * this may finally trigger drbd_ldev_destroy. */
- put_ldev(mdev);
- }
+ /**
+ * _tl_restart() - Walks the transfer log, and applies an action to all requests
+ * @mdev: DRBD device.
+ * @what: The action/event to perform with all request objects
+ *
+ * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
+ * RESTART_FROZEN_DISK_IO.
+ */
+ /* must hold resource->req_lock */
+ void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
+ {
+ struct drbd_request *req, *r;
- /* Notify peer that I had a local IO error, and did not detached.. */
- if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
- drbd_send_state(mdev, ns);
+ list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests)
+ _req_mod(req, what);
+ }
- /* Disks got bigger while they were detached */
- if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
- drbd_test_and_clear_flag(mdev, RESYNC_AFTER_NEG)) {
- if (ns.conn == C_CONNECTED)
- resync_after_online_grow(mdev);
- }
+ void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
+ {
+ spin_lock_irq(&tconn->req_lock);
+ _tl_restart(tconn, what);
+ spin_unlock_irq(&tconn->req_lock);
+ }
- /* A resync finished or aborted, wake paused devices... */
- if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
- (os.peer_isp && !ns.peer_isp) ||
- (os.user_isp && !ns.user_isp))
- resume_next_sg(mdev);
-
- /* sync target done with resync. Explicitly notify peer, even though
- * it should (at least for non-empty resyncs) already know itself. */
- if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- /* Verify finished, or reached stop sector. Peer did not know about
- * the stop sector, and we may even have changed the stop sector during
- * verify to interrupt/stop early. Send the new state. */
- if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
- && mdev->agreed_pro_version >= 97)
- drbd_send_state(mdev, ns);
-
- /* Wake up role changes, that were delayed because of connection establishing */
- if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
- drbd_clear_flag(mdev, STATE_SENT);
- wake_up(&mdev->state_wait);
- }
+ /**
+ * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+ * @mdev: DRBD device.
+ *
+ * This is called after the connection to the peer was lost. The storage covered
+ * by the requests on the transfer gets marked as our of sync. Called from the
+ * receiver thread and the worker thread.
+ */
+ void tl_clear(struct drbd_tconn *tconn)
+ {
+ tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);
+ }
- /* This triggers bitmap writeout of potentially still unwritten pages
- * if the resync finished cleanly, or aborted because of peer disk
- * failure, or because of connection loss.
- * For resync aborted because of local disk failure, we cannot do
- * any bitmap writeout anymore.
- * No harm done if some bits change during this phase.
- */
- if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
- drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
- "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
- put_ldev(mdev);
- }
+ /**
+ * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
+ * @mdev: DRBD device.
+ */
+ void tl_abort_disk_io(struct drbd_conf *mdev)
+ {
+ struct drbd_tconn *tconn = mdev->tconn;
+ struct drbd_request *req, *r;
- /* free tl_hash if we Got thawed and are C_STANDALONE */
- if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
- drbd_free_tl_hash(mdev);
-
- /* Upon network connection, we need to start the receiver */
- if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
- drbd_thread_start(&mdev->receiver);
-
- /* Terminate worker thread if we are unconfigured - it will be
- restarted as needed... */
- if (ns.disk == D_DISKLESS &&
- ns.conn == C_STANDALONE &&
- ns.role == R_SECONDARY) {
- if (os.aftr_isp != ns.aftr_isp)
- resume_next_sg(mdev);
- /* set in __drbd_set_state, unless CONFIG_PENDING was set */
- if (drbd_test_flag(mdev, DEVICE_DYING))
- drbd_thread_stop_nowait(&mdev->worker);
+ spin_lock_irq(&tconn->req_lock);
+ list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) {
+ if (!(req->rq_state & RQ_LOCAL_PENDING))
+ continue;
+ if (req->w.mdev != mdev)
+ continue;
+ _req_mod(req, ABORT_DISK_IO);
}
-
- drbd_md_sync(mdev);
+ spin_unlock_irq(&tconn->req_lock);
}
-
static int drbd_thread_setup(void *arg)
{
struct drbd_thread *thi = (struct drbd_thread *) arg;
int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
{
- struct p_sizes p;
+ struct drbd_socket *sock;
+ struct p_sizes *p;
sector_t d_size, u_size;
- int q_order_type, max_bio_size;
+ int q_order_type;
+ unsigned int max_bio_size;
- int ok;
if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
D_ASSERT(mdev->ldev->backing_bdev);
d_size = drbd_get_max_capacity(mdev->ldev);
- u_size = mdev->ldev->dc.disk_size;
+ rcu_read_lock();
+ u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
q_order_type = drbd_queue_order_type(mdev);
max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
- max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE);
+ max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
put_ldev(mdev);
} else {
d_size = 0;
max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
}
- /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
- if (mdev->agreed_pro_version <= 94)
- max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
- p.d_size = cpu_to_be64(d_size);
- p.u_size = cpu_to_be64(u_size);
- p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
- p.max_bio_size = cpu_to_be32(max_bio_size);
- p.queue_order_type = cpu_to_be16(q_order_type);
- p.dds_flags = cpu_to_be16(flags);
+ if (mdev->tconn->agreed_pro_version <= 94)
- max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
++ max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ else if (mdev->tconn->agreed_pro_version < 100)
- max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE_P95);
++ max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
- ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ p->d_size = cpu_to_be64(d_size);
+ p->u_size = cpu_to_be64(u_size);
+ p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
+ p->max_bio_size = cpu_to_be32(max_bio_size);
+ p->queue_order_type = cpu_to_be16(q_order_type);
+ p->dds_flags = cpu_to_be16(flags);
+ return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0);
}
/**
for (i = UI_CURRENT; i < UI_SIZE; i++)
bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
bdev->md.flags = be32_to_cpu(buffer->flags);
- mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
if (mdev->state.conn < C_CONNECTED) {
- int peer;
+ unsigned int peer;
peer = be32_to_cpu(buffer->la_peer_max_bio_size);
- peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE);
+ peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
mdev->peer_max_bio_size = peer;
}
- spin_unlock_irq(&mdev->req_lock);
-
- if (mdev->sync_conf.al_extents < 7)
- mdev->sync_conf.al_extents = 127;
+ spin_unlock_irq(&mdev->tconn->req_lock);
err:
drbd_md_put_buffer(mdev);
#include "drbd_req.h"
#include "drbd_wrappers.h"
#include <asm/unaligned.h>
- #include <linux/drbd_tag_magic.h>
#include <linux/drbd_limits.h>
- #include <linux/compiler.h>
#include <linux/kthread.h>
- static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
- static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
- static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
-
- /* see get_sb_bdev and bd_claim */
+ #include <net/genetlink.h>
+
+ /* .doit */
+ // int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
+ // int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
+
+ int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info);
+
+ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
+
+ int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
+ int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
+ /* .dumpit */
+ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+
+ #include <linux/drbd_genl_api.h>
+ #include "drbd_nla.h"
+ #include <linux/genl_magic_func.h>
+
+ /* used blkdev_get_by_path, to claim our meta data device(s) */
static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
- /* Generate the tag_list to struct functions */
- #define NL_PACKET(name, number, fields) \
- static int name ## _from_tags(struct drbd_conf *mdev, \
- unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
- static int name ## _from_tags(struct drbd_conf *mdev, \
- unsigned short *tags, struct name *arg) \
- { \
- int tag; \
- int dlen; \
- \
- while ((tag = get_unaligned(tags++)) != TT_END) { \
- dlen = get_unaligned(tags++); \
- switch (tag_number(tag)) { \
- fields \
- default: \
- if (tag & T_MANDATORY) { \
- dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
- return 0; \
- } \
- } \
- tags = (unsigned short *)((char *)tags + dlen); \
- } \
- return 1; \
- }
- #define NL_INTEGER(pn, pr, member) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
- arg->member = get_unaligned((int *)(tags)); \
- break;
- #define NL_INT64(pn, pr, member) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
- arg->member = get_unaligned((u64 *)(tags)); \
+ /* Configuration is strictly serialized, because generic netlink message
+ * processing is strictly serialized by the genl_lock().
+ * Which means we can use one static global drbd_config_context struct.
+ */
+ static struct drbd_config_context {
+ /* assigned from drbd_genlmsghdr */
+ unsigned int minor;
+ /* assigned from request attributes, if present */
+ unsigned int volume;
+ #define VOLUME_UNSPECIFIED (-1U)
+ /* pointer into the request skb,
+ * limited lifetime! */
+ char *resource_name;
+ struct nlattr *my_addr;
+ struct nlattr *peer_addr;
+
+ /* reply buffer */
+ struct sk_buff *reply_skb;
+ /* pointer into reply buffer */
+ struct drbd_genlmsghdr *reply_dh;
+ /* resolved from attributes, if possible */
+ struct drbd_conf *mdev;
+ struct drbd_tconn *tconn;
+ } adm_ctx;
+
+ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
+ {
+ genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
+ if (genlmsg_reply(skb, info))
+ printk(KERN_ERR "drbd: error sending genl reply\n");
+ }
+
+ /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
+ * reason it could fail was no space in skb, and there are 4k available. */
+ int drbd_msg_put_info(const char *info)
+ {
+ struct sk_buff *skb = adm_ctx.reply_skb;
+ struct nlattr *nla;
+ int err = -EMSGSIZE;
+
+ if (!info || !info[0])
+ return 0;
+
+ nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
+ if (!nla)
+ return err;
+
+ err = nla_put_string(skb, T_info_text, info);
+ if (err) {
+ nla_nest_cancel(skb, nla);
+ return err;
+ } else
+ nla_nest_end(skb, nla);
+ return 0;
+ }
+
+ /* This would be a good candidate for a "pre_doit" hook,
+ * and per-family private info->pointers.
+ * But we need to stay compatible with older kernels.
+ * If it returns successfully, adm_ctx members are valid.
+ */
+ #define DRBD_ADM_NEED_MINOR 1
+ #define DRBD_ADM_NEED_RESOURCE 2
+ #define DRBD_ADM_NEED_CONNECTION 4
+ static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
+ unsigned flags)
+ {
+ struct drbd_genlmsghdr *d_in = info->userhdr;
+ const u8 cmd = info->genlhdr->cmd;
+ int err;
+
+ memset(&adm_ctx, 0, sizeof(adm_ctx));
+
+ /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
- if (cmd != DRBD_ADM_GET_STATUS
- && security_netlink_recv(skb, CAP_SYS_ADMIN))
++ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!adm_ctx.reply_skb) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb,
+ info, &drbd_genl_family, 0, cmd);
+ /* put of a few bytes into a fresh skb of >= 4k will always succeed.
+ * but anyways */
+ if (!adm_ctx.reply_dh) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ adm_ctx.reply_dh->minor = d_in->minor;
+ adm_ctx.reply_dh->ret_code = NO_ERROR;
+
+ adm_ctx.volume = VOLUME_UNSPECIFIED;
+ if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
+ struct nlattr *nla;
+ /* parse and validate only */
+ err = drbd_cfg_context_from_attrs(NULL, info);
+ if (err)
+ goto fail;
+
+ /* It was present, and valid,
+ * copy it over to the reply skb. */
+ err = nla_put_nohdr(adm_ctx.reply_skb,
+ info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
+ info->attrs[DRBD_NLA_CFG_CONTEXT]);
+ if (err)
+ goto fail;
+
+ /* and assign stuff to the global adm_ctx */
+ nla = nested_attr_tb[__nla_type(T_ctx_volume)];
+ if (nla)
+ adm_ctx.volume = nla_get_u32(nla);
+ nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
+ if (nla)
+ adm_ctx.resource_name = nla_data(nla);
+ adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
+ adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
+ if ((adm_ctx.my_addr &&
+ nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) ||
+ (adm_ctx.peer_addr &&
+ nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) {
+ err = -EINVAL;
+ goto fail;
+ }
+ }
+
+ adm_ctx.minor = d_in->minor;
+ adm_ctx.mdev = minor_to_mdev(d_in->minor);
+ adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name);
+
+ if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) {
+ drbd_msg_put_info("unknown minor");
+ return ERR_MINOR_INVALID;
+ }
+ if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) {
+ drbd_msg_put_info("unknown resource");
+ return ERR_INVALID_REQUEST;
+ }
+
+ if (flags & DRBD_ADM_NEED_CONNECTION) {
+ if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) {
+ drbd_msg_put_info("no resource name expected");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx.mdev) {
+ drbd_msg_put_info("no minor number expected");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx.my_addr && adm_ctx.peer_addr)
+ adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr),
+ nla_len(adm_ctx.my_addr),
+ nla_data(adm_ctx.peer_addr),
+ nla_len(adm_ctx.peer_addr));
+ if (!adm_ctx.tconn) {
+ drbd_msg_put_info("unknown connection");
+ return ERR_INVALID_REQUEST;
+ }
+ }
+
+ /* some more paranoia, if the request was over-determined */
+ if (adm_ctx.mdev && adm_ctx.tconn &&
+ adm_ctx.mdev->tconn != adm_ctx.tconn) {
+ pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n",
+ adm_ctx.minor, adm_ctx.resource_name,
+ adm_ctx.mdev->tconn->name);
+ drbd_msg_put_info("minor exists in different resource");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx.mdev &&
+ adm_ctx.volume != VOLUME_UNSPECIFIED &&
+ adm_ctx.volume != adm_ctx.mdev->vnr) {
+ pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
+ adm_ctx.minor, adm_ctx.volume,
+ adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name);
+ drbd_msg_put_info("minor exists as different volume");
+ return ERR_INVALID_REQUEST;
+ }
+
+ return NO_ERROR;
+
+ fail:
+ nlmsg_free(adm_ctx.reply_skb);
+ adm_ctx.reply_skb = NULL;
+ return err;
+ }
+
+ static int drbd_adm_finish(struct genl_info *info, int retcode)
+ {
+ if (adm_ctx.tconn) {
+ kref_put(&adm_ctx.tconn->kref, &conn_destroy);
+ adm_ctx.tconn = NULL;
+ }
+
+ if (!adm_ctx.reply_skb)
+ return -ENOMEM;
+
+ adm_ctx.reply_dh->ret_code = retcode;
+ drbd_adm_send_reply(adm_ctx.reply_skb, info);
+ return 0;
+ }
+
+ static void setup_khelper_env(struct drbd_tconn *tconn, char **envp)
+ {
+ char *afs;
+
+ /* FIXME: A future version will not allow this case. */
+ if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0)
+ return;
+
+ switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) {
+ case AF_INET6:
+ afs = "ipv6";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
+ &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr);
break;
- #define NL_BIT(pn, pr, member) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
- arg->member = *(char *)(tags) ? 1 : 0; \
+ case AF_INET:
+ afs = "ipv4";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
break;
- #define NL_STRING(pn, pr, member, len) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
- if (dlen > len) { \
- dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
- #member, dlen, (unsigned int)len); \
- return 0; \
- } \
- arg->member ## _len = dlen; \
- memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
- break;
- #include <linux/drbd_nl.h>
-
- /* Generate the struct to tag_list functions */
- #define NL_PACKET(name, number, fields) \
- static unsigned short* \
- name ## _to_tags(struct drbd_conf *mdev, \
- struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
- static unsigned short* \
- name ## _to_tags(struct drbd_conf *mdev, \
- struct name *arg, unsigned short *tags) \
- { \
- fields \
- return tags; \
- }
-
- #define NL_INTEGER(pn, pr, member) \
- put_unaligned(pn | pr | TT_INTEGER, tags++); \
- put_unaligned(sizeof(int), tags++); \
- put_unaligned(arg->member, (int *)tags); \
- tags = (unsigned short *)((char *)tags+sizeof(int));
- #define NL_INT64(pn, pr, member) \
- put_unaligned(pn | pr | TT_INT64, tags++); \
- put_unaligned(sizeof(u64), tags++); \
- put_unaligned(arg->member, (u64 *)tags); \
- tags = (unsigned short *)((char *)tags+sizeof(u64));
- #define NL_BIT(pn, pr, member) \
- put_unaligned(pn | pr | TT_BIT, tags++); \
- put_unaligned(sizeof(char), tags++); \
- *(char *)tags = arg->member; \
- tags = (unsigned short *)((char *)tags+sizeof(char));
- #define NL_STRING(pn, pr, member, len) \
- put_unaligned(pn | pr | TT_STRING, tags++); \
- put_unaligned(arg->member ## _len, tags++); \
- memcpy(tags, arg->member, arg->member ## _len); \
- tags = (unsigned short *)((char *)tags + arg->member ## _len);
- #include <linux/drbd_nl.h>
-
- void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
- void drbd_nl_send_reply(struct cn_msg *, int);
+ default:
+ afs = "ssocks";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
+ }
+ snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
+ }
int drbd_khelper(struct drbd_conf *mdev, char *cmd)
{
drbd_md_sync(mdev);
dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
-
- drbd_bcast_ev_helper(mdev, cmd);
+ sib.sib_reason = SIB_HELPER_PRE;
+ sib.helper_name = cmd;
+ drbd_bcast_event(mdev, &sib);
- ret = call_usermodehelper(usermode_helper, argv, envp, 1);
+ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
if (ret)
dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
usermode_helper, cmd, mb,
dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
usermode_helper, cmd, mb,
(ret >> 8) & 0xff, ret);
+ sib.sib_reason = SIB_HELPER_POST;
+ sib.helper_exit_code = ret;
+ drbd_bcast_event(mdev, &sib);
+
+ if (current == tconn->worker.task)
+ clear_bit(CALLBACK_PENDING, &tconn->flags);
- if (current == mdev->worker.task)
- drbd_clear_flag(mdev, CALLBACK_PENDING);
+ if (ret < 0) /* Ignore any ERRNOs we got. */
+ ret = 0;
+
+ return ret;
+ }
+
+ int conn_khelper(struct drbd_tconn *tconn, char *cmd)
+ {
+ char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ (char[20]) { }, /* address family */
+ (char[60]) { }, /* address */
+ NULL };
+ char *argv[] = {usermode_helper, cmd, tconn->name, NULL };
+ int ret;
+
+ setup_khelper_env(tconn, envp);
+ conn_md_sync(tconn);
+
+ conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name);
+ /* TODO: conn_bcast_event() ?? */
+
- ret = call_usermodehelper(usermode_helper, argv, envp, 1);
++ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
+ if (ret)
+ conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, tconn->name,
+ (ret >> 8) & 0xff, ret);
+ else
+ conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, tconn->name,
+ (ret >> 8) & 0xff, ret);
+ /* TODO: conn_bcast_event() ?? */
if (ret < 0) /* Ignore any ERRNOs we got. */
ret = 0;
Because new from 8.3.8 onwards the peer can use multiple
BIOs for a single peer_request */
if (mdev->state.conn >= C_CONNECTED) {
- if (mdev->agreed_pro_version < 94) {
- peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ if (mdev->tconn->agreed_pro_version < 94)
- peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
++ peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
- } else if (mdev->agreed_pro_version == 94)
+ else if (mdev->tconn->agreed_pro_version == 94)
peer = DRBD_MAX_SIZE_H80_PACKET;
- else /* drbd 8.3.8 onwards */
+ else if (mdev->tconn->agreed_pro_version < 100)
+ peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */
+ else
peer = DRBD_MAX_BIO_SIZE;
}
retcode = SS_NOTHING_TO_DO;
if (ret)
retcode = ERR_INTR;
- reply->ret_code = retcode;
out:
- return 0;
+ return retcode;
}
- static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+ /* Detaching the disk is a process in multiple stages. First we need to lock
+ * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
+ * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
+ * internal references as well.
+ * Only then we have finally detached. */
+ int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
{
- int i, ns;
enum drbd_ret_code retcode;
- struct net_conf *new_conf = NULL;
- struct crypto_hash *tfm = NULL;
- struct crypto_hash *integrity_w_tfm = NULL;
- struct crypto_hash *integrity_r_tfm = NULL;
- struct hlist_head *new_tl_hash = NULL;
- struct hlist_head *new_ee_hash = NULL;
- struct drbd_conf *odev;
- char hmac_name[CRYPTO_MAX_ALG_NAME];
- void *int_dig_out = NULL;
- void *int_dig_in = NULL;
- void *int_dig_vv = NULL;
- struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
+ struct detach_parms parms = { };
+ int err;
- drbd_reconfig_start(mdev);
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- if (mdev->state.conn > C_STANDALONE) {
- retcode = ERR_NET_CONFIGURED;
- goto fail;
+ if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
+ err = detach_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto out;
+ }
}
- /* allocation not in the IO path, cqueue thread context */
- new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
- if (!new_conf) {
- retcode = ERR_NOMEM;
- goto fail;
- }
+ retcode = adm_detach(adm_ctx.mdev, parms.force_detach);
+ out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+ }
- new_conf->timeout = DRBD_TIMEOUT_DEF;
- new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
- new_conf->ping_int = DRBD_PING_INT_DEF;
- new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF;
- new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF;
- new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
- new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF;
- new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF;
- new_conf->ko_count = DRBD_KO_COUNT_DEF;
- new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF;
- new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF;
- new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF;
- new_conf->want_lose = 0;
- new_conf->two_primaries = 0;
- new_conf->wire_protocol = DRBD_PROT_C;
- new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
- new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
- new_conf->on_congestion = DRBD_ON_CONGESTION_DEF;
- new_conf->cong_extents = DRBD_CONG_EXTENTS_DEF;
-
- if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
- retcode = ERR_MANDATORY_TAG;
- goto fail;
+ static bool conn_resync_running(struct drbd_tconn *tconn)
+ {
+ struct drbd_conf *mdev;
+ bool rv = false;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (mdev->state.conn == C_SYNC_SOURCE ||
+ mdev->state.conn == C_SYNC_TARGET ||
+ mdev->state.conn == C_PAUSED_SYNC_S ||
+ mdev->state.conn == C_PAUSED_SYNC_T) {
+ rv = true;
+ break;
+ }
}
+ rcu_read_unlock();
- if (new_conf->two_primaries
- && (new_conf->wire_protocol != DRBD_PROT_C)) {
- retcode = ERR_NOT_PROTO_C;
- goto fail;
- }
+ return rv;
+ }
- if (get_ldev(mdev)) {
- enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
- retcode = ERR_STONITH_AND_PROT_A;
- goto fail;
+ static bool conn_ov_running(struct drbd_tconn *tconn)
+ {
+ struct drbd_conf *mdev;
+ bool rv = false;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (mdev->state.conn == C_VERIFY_S ||
+ mdev->state.conn == C_VERIFY_T) {
+ rv = true;
+ break;
}
}
+ rcu_read_unlock();
- if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) {
- retcode = ERR_CONG_NOT_PROTO_A;
- goto fail;
- }
+ return rv;
+ }
- if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
- retcode = ERR_DISCARD;
- goto fail;
- }
+ static enum drbd_ret_code
+ _check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf)
+ {
+ struct drbd_conf *mdev;
+ int i;
- retcode = NO_ERROR;
+ if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) {
+ if (new_conf->wire_protocol != old_conf->wire_protocol)
+ return ERR_NEED_APV_100;
- new_my_addr = (struct sockaddr *)&new_conf->my_addr;
- new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
- for (i = 0; i < minor_count; i++) {
- odev = minor_to_mdev(i);
- if (!odev || odev == mdev)
- continue;
- if (get_net_conf(odev)) {
- taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
- if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
- !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
- retcode = ERR_LOCAL_ADDR;
-
- taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
- if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
- !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
- retcode = ERR_PEER_ADDR;
-
- put_net_conf(odev);
- if (retcode != NO_ERROR)
- goto fail;
- }
+ if (new_conf->two_primaries != old_conf->two_primaries)
+ return ERR_NEED_APV_100;
+
- if (!new_conf->integrity_alg != !old_conf->integrity_alg)
- return ERR_NEED_APV_100;
-
+ if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg))
+ return ERR_NEED_APV_100;
}
- if (new_conf->cram_hmac_alg[0] != 0) {
- snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
- new_conf->cram_hmac_alg);
- tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm)) {
- tfm = NULL;
- retcode = ERR_AUTH_ALG;
- goto fail;
- }
+ if (!new_conf->two_primaries &&
+ conn_highest_role(tconn) == R_PRIMARY &&
+ conn_highest_peer(tconn) == R_PRIMARY)
+ return ERR_NEED_ALLOW_TWO_PRI;
- if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
- retcode = ERR_AUTH_ALG_ND;
- goto fail;
+ if (new_conf->two_primaries &&
+ (new_conf->wire_protocol != DRBD_PROT_C))
+ return ERR_NOT_PROTO_C;
+
+ idr_for_each_entry(&tconn->volumes, mdev, i) {
+ if (get_ldev(mdev)) {
+ enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing;
+ put_ldev(mdev);
+ if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
+ return ERR_STONITH_AND_PROT_A;
}
+ if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data)
+ return ERR_DISCARD_IMPOSSIBLE;
}
- if (new_conf->integrity_alg[0]) {
- integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(integrity_w_tfm)) {
- integrity_w_tfm = NULL;
- retcode=ERR_INTEGRITY_ALG;
- goto fail;
- }
+ if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A)
+ return ERR_CONG_NOT_PROTO_A;
- if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
- retcode=ERR_INTEGRITY_ALG_ND;
- goto fail;
- }
+ return NO_ERROR;
+ }
- integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(integrity_r_tfm)) {
- integrity_r_tfm = NULL;
- retcode=ERR_INTEGRITY_ALG;
- goto fail;
- }
- }
+ static enum drbd_ret_code
+ check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf)
+ {
+ static enum drbd_ret_code rv;
+ struct drbd_conf *mdev;
+ int i;
- ns = new_conf->max_epoch_size/8;
- if (mdev->tl_hash_s != ns) {
- new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
- if (!new_tl_hash) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- }
+ rcu_read_lock();
+ rv = _check_net_options(tconn, rcu_dereference(tconn->net_conf), new_conf);
+ rcu_read_unlock();
- ns = new_conf->max_buffers/8;
- if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
- new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
- if (!new_ee_hash) {
- retcode = ERR_NOMEM;
- goto fail;
+ /* tconn->volumes protected by genl_lock() here */
+ idr_for_each_entry(&tconn->volumes, mdev, i) {
+ if (!mdev->bitmap) {
+ if(drbd_bm_init(mdev))
+ return ERR_NOMEM;
}
}
return 0;
}
- static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
{
- reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
- return 0;
+ return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
}
- static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+ int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr)
{
- unsigned short *tl;
+ struct nlattr *nla;
+ nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);
+ if (!nla)
+ goto nla_put_failure;
+ if (vnr != VOLUME_UNSPECIFIED &&
+ nla_put_u32(skb, T_ctx_volume, vnr))
+ goto nla_put_failure;
+ if (nla_put_string(skb, T_ctx_resource_name, tconn->name))
+ goto nla_put_failure;
+ if (tconn->my_addr_len &&
+ nla_put(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr))
+ goto nla_put_failure;
+ if (tconn->peer_addr_len &&
+ nla_put(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr))
+ goto nla_put_failure;
+ nla_nest_end(skb, nla);
+ return 0;
- tl = reply->tag_list;
+ nla_put_failure:
+ if (nla)
+ nla_nest_cancel(skb, nla);
+ return -EMSGSIZE;
+ }
- if (get_ldev(mdev)) {
- tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
- put_ldev(mdev);
- }
+ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
+ const struct sib_info *sib)
+ {
+ struct state_info *si = NULL; /* for sizeof(si->member); */
+ struct net_conf *nc;
+ struct nlattr *nla;
+ int got_ldev;
+ int err = 0;
+ int exclude_sensitive;
+
+ /* If sib != NULL, this is drbd_bcast_event, which anyone can listen
+ * to. So we better exclude_sensitive information.
+ *
+ * If sib == NULL, this is drbd_adm_get_status, executed synchronously
+ * in the context of the requesting user process. Exclude sensitive
+ * information, unless current has superuser.
+ *
+ * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
+ * relies on the current implementation of netlink_dump(), which
+ * executes the dump callback successively from netlink_recvmsg(),
+ * always in the context of the receiving process */
+ exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
+
+ got_ldev = get_ldev(mdev);
+
+ /* We need to add connection name and volume number information still.
+ * Minor number is in drbd_genlmsghdr. */
+ if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr))
+ goto nla_put_failure;
+
+ if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive))
+ goto nla_put_failure;
+
+ rcu_read_lock();
+ if (got_ldev)
+ if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive))
+ goto nla_put_failure;
+
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ if (nc)
+ err = net_conf_to_skb(skb, nc, exclude_sensitive);
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
+
+ nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
+ if (!nla)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
+ nla_put_u32(skb, T_current_state, mdev->state.i) ||
+ nla_put_u64(skb, T_ed_uuid, mdev->ed_uuid) ||
+ nla_put_u64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)) ||
+ nla_put_u64(skb, T_send_cnt, mdev->send_cnt) ||
+ nla_put_u64(skb, T_recv_cnt, mdev->recv_cnt) ||
+ nla_put_u64(skb, T_read_cnt, mdev->read_cnt) ||
+ nla_put_u64(skb, T_writ_cnt, mdev->writ_cnt) ||
+ nla_put_u64(skb, T_al_writ_cnt, mdev->al_writ_cnt) ||
+ nla_put_u64(skb, T_bm_writ_cnt, mdev->bm_writ_cnt) ||
+ nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&mdev->ap_bio_cnt)) ||
+ nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&mdev->ap_pending_cnt)) ||
+ nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&mdev->rs_pending_cnt)))
+ goto nla_put_failure;
+
+ if (got_ldev) {
+ int err;
- if (get_net_conf(mdev)) {
- tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
- put_net_conf(mdev);
+ spin_lock_irq(&mdev->ldev->md.uuid_lock);
+ err = nla_put(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid);
+ spin_unlock_irq(&mdev->ldev->md.uuid_lock);
+
+ if (err)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, T_disk_flags, mdev->ldev->md.flags) ||
+ nla_put_u64(skb, T_bits_total, drbd_bm_bits(mdev)) ||
+ nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(mdev)))
+ goto nla_put_failure;
+ if (C_SYNC_SOURCE <= mdev->state.conn &&
+ C_PAUSED_SYNC_T >= mdev->state.conn) {
+ if (nla_put_u64(skb, T_bits_rs_total, mdev->rs_total) ||
+ nla_put_u64(skb, T_bits_rs_failed, mdev->rs_failed))
+ goto nla_put_failure;
+ }
}
- tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
- put_unaligned(TT_END, tl++); /* Close the tag list */
+ if (sib) {
+ switch(sib->sib_reason) {
+ case SIB_SYNC_PROGRESS:
+ case SIB_GET_STATUS_REPLY:
+ break;
+ case SIB_STATE_CHANGE:
+ if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
+ nla_put_u32(skb, T_new_state, sib->ns.i))
+ goto nla_put_failure;
+ break;
+ case SIB_HELPER_POST:
+ if (nla_put_u32(skb, T_helper_exit_code,
+ sib->helper_exit_code))
+ goto nla_put_failure;
+ /* fall through */
+ case SIB_HELPER_PRE:
+ if (nla_put_string(skb, T_helper, sib->helper_name))
+ goto nla_put_failure;
+ break;
+ }
+ }
+ nla_nest_end(skb, nla);
- return (int)((char *)tl - (char *)reply->tag_list);
+ if (0)
+ nla_put_failure:
+ err = -EMSGSIZE;
+ if (got_ldev)
+ put_ldev(mdev);
+ return err;
}
- static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
{
- unsigned short *tl = reply->tag_list;
- union drbd_state s = mdev->state;
- unsigned long rs_left;
- unsigned int res;
+ enum drbd_ret_code retcode;
+ int err;
- tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- /* no local ref, no bitmap, no syncer progress. */
- if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
- if (get_ldev(mdev)) {
- drbd_get_syncer_progress(mdev, &rs_left, &res);
- tl = tl_add_int(tl, T_sync_progress, &res);
- put_ldev(mdev);
- }
+ err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL);
+ if (err) {
+ nlmsg_free(adm_ctx.reply_skb);
+ return err;
}
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- return (int)((char *)tl - (char *)reply->tag_list);
+ out:
+ drbd_adm_finish(info, retcode);
+ return 0;
}
- static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+ int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
{
- unsigned short *tl;
-
- tl = reply->tag_list;
+ struct drbd_conf *mdev;
+ struct drbd_genlmsghdr *dh;
+ struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0];
+ struct drbd_tconn *tconn = NULL;
+ struct drbd_tconn *tmp;
+ unsigned volume = cb->args[1];
+
+ /* Open coded, deferred, iteration:
+ * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) {
+ * idr_for_each_entry(&tconn->volumes, mdev, i) {
+ * ...
+ * }
+ * }
+ * where tconn is cb->args[0];
+ * and i is cb->args[1];
+ *
+ * cb->args[2] indicates if we shall loop over all resources,
+ * or just dump all volumes of a single resource.
+ *
+ * This may miss entries inserted after this dump started,
+ * or entries deleted before they are reached.
+ *
+ * We need to make sure the mdev won't disappear while
+ * we are looking at it, and revalidate our iterators
+ * on each iteration.
+ */
- if (get_ldev(mdev)) {
- unsigned long flags;
- spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
- tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
- tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
- spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
- put_ldev(mdev);
+ /* synchronize with conn_create()/conn_destroy() */
+ rcu_read_lock();
+ /* revalidate iterator position */
+ list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) {
+ if (pos == NULL) {
+ /* first iteration */
+ pos = tmp;
+ tconn = pos;
+ break;
+ }
+ if (tmp == pos) {
+ tconn = pos;
+ break;
+ }
}
- put_unaligned(TT_END, tl++); /* Close the tag list */
+ if (tconn) {
+ next_tconn:
+ mdev = idr_get_next(&tconn->volumes, &volume);
+ if (!mdev) {
+ /* No more volumes to dump on this tconn.
+ * Advance tconn iterator. */
+ pos = list_entry_rcu(tconn->all_tconn.next,
+ struct drbd_tconn, all_tconn);
+ /* Did we dump any volume on this tconn yet? */
+ if (volume != 0) {
+ /* If we reached the end of the list,
+ * or only a single resource dump was requested,
+ * we are done. */
+ if (&pos->all_tconn == &drbd_tconns || cb->args[2])
+ goto out;
+ volume = 0;
+ tconn = pos;
+ goto next_tconn;
+ }
+ }
+
- dh = genlmsg_put(skb, NETLINK_CB(cb->skb).pid,
++ dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &drbd_genl_family,
+ NLM_F_MULTI, DRBD_ADM_GET_STATUS);
+ if (!dh)
+ goto out;
+
+ if (!mdev) {
+ /* This is a tconn without a single volume.
+ * Suprisingly enough, it may have a network
+ * configuration. */
+ struct net_conf *nc;
+ dh->minor = -1U;
+ dh->ret_code = NO_ERROR;
+ if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED))
+ goto cancel;
+ nc = rcu_dereference(tconn->net_conf);
+ if (nc && net_conf_to_skb(skb, nc, 1) != 0)
+ goto cancel;
+ goto done;
+ }
- return (int)((char *)tl - (char *)reply->tag_list);
+ D_ASSERT(mdev->vnr == volume);
+ D_ASSERT(mdev->tconn == tconn);
+
+ dh->minor = mdev_to_minor(mdev);
+ dh->ret_code = NO_ERROR;
+
+ if (nla_put_status_info(skb, mdev, NULL)) {
+ cancel:
+ genlmsg_cancel(skb, dh);
+ goto out;
+ }
+ done:
+ genlmsg_end(skb, dh);
+ }
+
+ out:
+ rcu_read_unlock();
+ /* where to start the next iteration */
+ cb->args[0] = (long)pos;
+ cb->args[1] = (pos == tconn) ? volume + 1 : 0;
+
+ /* No more tconns/volumes/minors found results in an empty skb.
+ * Which will terminate the dump. */
+ return skb->len;
}
- /**
- * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
- * @mdev: DRBD device.
- * @nlp: Netlink/connector packet from drbdsetup
- * @reply: Reply packet for drbdsetup
+ /*
+ * Request status of all resources, or of all volumes within a single resource.
+ *
+ * This is a dump, as the answer may not fit in a single reply skb otherwise.
+ * Which means we cannot use the family->attrbuf or other such members, because
+ * dump is NOT protected by the genl_lock(). During dump, we only have access
+ * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
+ *
+ * Once things are setup properly, we call into get_one_status().
*/
- static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
{
- unsigned short *tl;
- char rv;
+ const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+ struct nlattr *nla;
+ const char *resource_name;
+ struct drbd_tconn *tconn;
+ int maxtype;
+
+ /* Is this a followup call? */
+ if (cb->args[0]) {
+ /* ... of a single resource dump,
+ * and the resource iterator has been advanced already? */
+ if (cb->args[2] && cb->args[2] != cb->args[0])
+ return 0; /* DONE. */
+ goto dump;
+ }
+
+ /* First call (from netlink_dump_start). We need to figure out
+ * which resource(s) the user wants us to dump. */
+ nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
+ nlmsg_attrlen(cb->nlh, hdrlen),
+ DRBD_NLA_CFG_CONTEXT);
+
+ /* No explicit context given. Dump all. */
+ if (!nla)
+ goto dump;
+ maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+ nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
+ if (IS_ERR(nla))
+ return PTR_ERR(nla);
+ /* context given, but no name present? */
+ if (!nla)
+ return -EINVAL;
+ resource_name = nla_data(nla);
+ tconn = conn_get_by_name(resource_name);
+
+ if (!tconn)
+ return -ENODEV;
+
+ kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */
+
+ /* prime iterators, and set "filter" mode mark:
+ * only dump this tconn. */
+ cb->args[0] = (long)tconn;
+ /* cb->args[1] = 0; passed in this way. */
+ cb->args[2] = (long)tconn;
+
+ dump:
+ return get_one_status(skb, cb);
+ }
- tl = reply->tag_list;
+ int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
+ {
+ enum drbd_ret_code retcode;
+ struct timeout_parms tp;
+ int err;
- rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
- drbd_test_flag(mdev, USE_DEGR_WFC_T) ? UT_DEGRADED : UT_DEFAULT;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
- put_unaligned(TT_END, tl++); /* Close the tag list */
+ tp.timeout_type =
+ adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
+ test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? UT_DEGRADED :
+ UT_DEFAULT;
- return (int)((char *)tl - (char *)reply->tag_list);
+ err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
+ if (err) {
+ nlmsg_free(adm_ctx.reply_skb);
+ return err;
+ }
+ out:
+ drbd_adm_finish(info, retcode);
+ return 0;
}
- static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+ int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
{
- /* default to resume from last known position, if possible */
- struct start_ov args = {
- .start_sector = mdev->ov_start_sector,
- .stop_sector = ULLONG_MAX,
- };
+ struct drbd_conf *mdev;
+ enum drbd_ret_code retcode;
+ struct start_ov_parms parms;
- if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
- reply->ret_code = ERR_MANDATORY_TAG;
- return 0;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mdev = adm_ctx.mdev;
+
+ /* resume from last known position, if possible */
+ parms.ov_start_sector = mdev->ov_start_sector;
+ parms.ov_stop_sector = ULLONG_MAX;
+ if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
+ int err = start_ov_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto out;
+ }
}
+ /* w_make_ov_request expects position to be aligned */
+ mdev->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
+ mdev->ov_stop_sector = parms.ov_stop_sector;
/* If there is still bitmap IO pending, e.g. previous resync or verify
* just being finished, wait for it before requesting a new resync. */
goto out;
}
- timeo = mdev->net_conf->try_connect_int * HZ;
- timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
-
- s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
- s_listen->sk->sk_rcvtimeo = timeo;
- s_listen->sk->sk_sndtimeo = timeo;
- drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
- mdev->net_conf->rcvbuf_size);
- s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */
++ s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
what = "bind before listen";
- err = s_listen->ops->bind(s_listen,
- (struct sockaddr *) mdev->net_conf->my_addr,
- mdev->net_conf->my_addr_len);
+ err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
if (err < 0)
goto out;
goto out_release_sockets;
}
- if (sock && msock) {
- ok = drbd_socket_okay(mdev, &sock);
- ok = drbd_socket_okay(mdev, &msock) && ok;
- if (ok)
- break;
- }
- } while (1);
+ ok = drbd_socket_okay(&sock.socket);
+ ok = drbd_socket_okay(&msock.socket) && ok;
+ } while (!ok);
+
+ if (ad.s_listen)
+ sock_release(ad.s_listen);
- msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
- sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
- sock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */
- msock.socket->sk->sk_reuse = 1; /* SO_REUSEADDR */
++ sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
++ msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
- sock->sk->sk_allocation = GFP_NOIO;
- msock->sk->sk_allocation = GFP_NOIO;
+ sock.socket->sk->sk_allocation = GFP_NOIO;
+ msock.socket->sk->sk_allocation = GFP_NOIO;
- sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
- msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
+ sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+ msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
/* NOT YET ...
- * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
- * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
- * first set it to the P_HAND_SHAKE timeout,
+ * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10;
+ * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ * first set it to the P_CONNECTION_FEATURES timeout,
* which we set to 4x the configured ping_timeout. */
- sock->sk->sk_sndtimeo =
- sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+
+ sock.socket->sk->sk_sndtimeo =
+ sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
- msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
- msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+ msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
+ timeout = nc->timeout * HZ / 10;
+ discard_my_data = nc->discard_my_data;
+ rcu_read_unlock();
+
+ msock.socket->sk->sk_sndtimeo = timeout;
/* we don't want delays.
* we use TCP_CORK where appropriate, though */
/* no point in adding empty flushes to the transfer log,
* they are mapped to drbd barriers already. */
- if (likely(size!=0))
- list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+ if (likely(req->i.size!=0)) {
+ if (rw == WRITE)
+ mdev->tconn->current_tle_writes++;
- /* NOTE remote first: to get the concurrent write detection right,
- * we must register the request before start of local IO. */
- if (remote) {
- /* either WRITE and C_CONNECTED,
- * or READ, and no local disk,
- * or READ, but not in sync.
- */
- _req_mod(req, (rw == WRITE)
- ? queue_for_net_write
- : queue_for_net_read);
+ list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log);
}
- if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
- _req_mod(req, queue_for_send_oos);
-
- if (remote &&
- mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
- maybe_pull_ahead(mdev);
- /* If this was a flush, queue a drbd barrier/start a new epoch.
- * Unless the current epoch was empty anyways, or we are not currently
- * replicating, in which case there is no point. */
- if (unlikely(bio->bi_rw & REQ_FLUSH)
- && mdev->newest_tle->n_writes
- && drbd_should_do_remote(mdev->state))
- queue_barrier(mdev);
-
- spin_unlock_irq(&mdev->req_lock);
- kfree(b); /* if someone else has beaten us to it... */
-
- if (local) {
- req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
-
- /* State may have changed since we grabbed our reference on the
- * mdev->ldev member. Double check, and short-circuit to endio.
- * In case the last activity log transaction failed to get on
- * stable storage, and this is a WRITE, we may not even submit
- * this bio. */
- if (get_ldev(mdev)) {
- if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
- : rw == READ ? DRBD_FAULT_DT_RD
- : DRBD_FAULT_DT_RA))
- bio_endio(req->private_bio, -EIO);
- else
- generic_make_request(req->private_bio);
- put_ldev(mdev);
+ if (rw == WRITE) {
+ if (!drbd_process_write_request(req))
+ no_remote = true;
+ } else {
+ /* We either have a private_bio, or we can read from remote.
+ * Otherwise we had done the goto nodata above. */
+ if (req->private_bio == NULL) {
+ _req_mod(req, TO_BE_SENT);
+ _req_mod(req, QUEUE_FOR_NET_READ);
} else
- bio_endio(req->private_bio, -EIO);
+ no_remote = true;
}
- return 0;
-
- fail_conflicting:
- /* this is a conflicting request.
- * even though it may have been only _partially_
- * overlapping with one of the currently pending requests,
- * without even submitting or sending it, we will
- * pretend that it was successfully served right now.
- */
- _drbd_end_io_acct(mdev, req);
- spin_unlock_irq(&mdev->req_lock);
- if (remote)
- dec_ap_pending(mdev);
- /* THINK: do we want to fail it (-EIO), or pretend success?
- * this pretends success. */
- err = 0;
-
- fail_free_complete:
- if (req->rq_state & RQ_IN_ACT_LOG)
- drbd_al_complete_io(mdev, sector);
- fail_and_free_req:
- if (local) {
- bio_put(req->private_bio);
- req->private_bio = NULL;
- put_ldev(mdev);
+ if (req->private_bio) {
+ /* needs to be marked within the same spinlock */
+ _req_mod(req, TO_BE_SUBMITTED);
+ /* but we need to give up the spinlock to submit */
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ drbd_submit_req_private_bio(req);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ } else if (no_remote) {
+ nodata:
+ if (__ratelimit(&drbd_ratelimit_state))
+ dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
+ (unsigned long long)req->i.sector, req->i.size >> 9);
+ /* A write may have been queued for send_oos, however.
+ * So we can not simply free it, we must go through drbd_req_put_completion_ref() */
}
- if (!ret)
- bio_endio(bio, err);
-
- drbd_req_free(req);
- dec_ap_bio(mdev);
- kfree(b);
-
- return ret;
- }
- /* helper function for drbd_make_request
- * if we can determine just by the mdev (state) that this request will fail,
- * return 1
- * otherwise return 0
- */
- static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
- {
- if (mdev->state.role != R_PRIMARY &&
- (!allow_oos || is_write)) {
- if (__ratelimit(&drbd_ratelimit_state)) {
- dev_err(DEV, "Process %s[%u] tried to %s; "
- "since we are not in Primary state, "
- "we cannot allow this\n",
- current->comm, current->pid,
- is_write ? "WRITE" : "READ");
- }
- return 1;
- }
+ out:
+ if (drbd_req_put_completion_ref(req, &m, 1))
+ kref_put(&req->kref, drbd_req_destroy);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- return 0;
+ if (m.bio)
+ complete_master_bio(mdev, &m);
+ return;
}
-int drbd_make_request(struct request_queue *q, struct bio *bio)
+void drbd_make_request(struct request_queue *q, struct bio *bio)
{
- unsigned int s_enr, e_enr;
struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
unsigned long start_time;
/*
* what we "blindly" assume:
*/
- D_ASSERT((bio->bi_size & 0x1ff) == 0);
-
- /* to make some things easier, force alignment of requests within the
- * granularity of our hash tables */
- s_enr = bio->bi_sector >> HT_SHIFT;
- e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr;
-
- if (likely(s_enr == e_enr)) {
- do {
- inc_ap_bio(mdev, 1);
- } while (drbd_make_request_common(mdev, bio, start_time));
- return;
- }
-
- /* can this bio be split generically?
- * Maybe add our own split-arbitrary-bios function. */
- if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) {
- /* rather error out here than BUG in bio_split */
- dev_err(DEV, "bio would need to, but cannot, be split: "
- "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
- bio->bi_vcnt, bio->bi_idx, bio->bi_size,
- (unsigned long long)bio->bi_sector);
- bio_endio(bio, -EINVAL);
- } else {
- /* This bio crosses some boundary, so we have to split it. */
- struct bio_pair *bp;
- /* works for the "do not cross hash slot boundaries" case
- * e.g. sector 262269, size 4096
- * s_enr = 262269 >> 6 = 4097
- * e_enr = (262269+8-1) >> 6 = 4098
- * HT_SHIFT = 6
- * sps = 64, mask = 63
- * first_sectors = 64 - (262269 & 63) = 3
- */
- const sector_t sect = bio->bi_sector;
- const int sps = 1 << HT_SHIFT; /* sectors per slot */
- const int mask = sps - 1;
- const sector_t first_sectors = sps - (sect & mask);
- bp = bio_split(bio, first_sectors);
+ D_ASSERT(IS_ALIGNED(bio->bi_size, 512));
- /* we need to get a "reference count" (ap_bio_cnt)
- * to avoid races with the disconnect/reconnect/suspend code.
- * In case we need to split the bio here, we need to get three references
- * atomically, otherwise we might deadlock when trying to submit the
- * second one! */
- inc_ap_bio(mdev, 3);
-
- D_ASSERT(e_enr == s_enr + 1);
-
- while (drbd_make_request_common(mdev, &bp->bio1, start_time))
- inc_ap_bio(mdev, 1);
-
- while (drbd_make_request_common(mdev, &bp->bio2, start_time))
- inc_ap_bio(mdev, 1);
-
- dec_ap_bio(mdev);
-
- bio_pair_release(bp);
- }
+ inc_ap_bio(mdev);
+ __drbd_make_request(mdev, bio, start_time);
-
- return 0;
}
- /* This is called by bio_add_page(). With this function we reduce
- * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs
- * units (was AL_EXTENTs).
+ /* This is called by bio_add_page().
+ *
+ * q->max_hw_sectors and other global limits are already enforced there.
*
- * we do the calculation within the lower 32bit of the byte offsets,
- * since we don't care for actual offset, but only check whether it
- * would cross "activity log extent" boundaries.
+ * We need to call down to our lower level device,
+ * in case it has special restrictions.
+ *
+ * We also may need to enforce configured max-bio-bvecs limits.
*
* As long as the BIO is empty we have to allow at least one bvec,
- * regardless of size and offset. so the resulting bio may still
- * cross extent boundaries. those are dealt with (bio_split) in
- * drbd_make_request.
+ * regardless of size and offset, so no need to ask lower levels.
*/
int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
{