4 This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
6 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
7 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
8 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
10 Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
11 from Logicworks, Inc. for making SDP replication support possible.
13 drbd is free software; you can redistribute it and/or modify
14 it under the terms of the GNU General Public License as published by
15 the Free Software Foundation; either version 2, or (at your option)
18 drbd is distributed in the hope that it will be useful,
19 but WITHOUT ANY WARRANTY; without even the implied warranty of
20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 GNU General Public License for more details.
23 You should have received a copy of the GNU General Public License
24 along with drbd; see the file COPYING. If not, write to
25 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
29 #include <linux/module.h>
30 #include <linux/drbd.h>
31 #include <asm/uaccess.h>
32 #include <asm/types.h>
34 #include <linux/ctype.h>
35 #include <linux/mutex.h>
37 #include <linux/file.h>
38 #include <linux/proc_fs.h>
39 #include <linux/init.h>
41 #include <linux/memcontrol.h>
42 #include <linux/mm_inline.h>
43 #include <linux/slab.h>
44 #include <linux/random.h>
45 #include <linux/reboot.h>
46 #include <linux/notifier.h>
47 #include <linux/kthread.h>
49 #define __KERNEL_SYSCALLS__
50 #include <linux/unistd.h>
51 #include <linux/vmalloc.h>
53 #include <linux/drbd_limits.h>
55 #include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
59 struct after_state_chg_work {
63 enum chg_state_flags flags;
64 struct completion *done;
67 static DEFINE_MUTEX(drbd_main_mutex);
68 int drbdd_init(struct drbd_thread *);
69 int drbd_worker(struct drbd_thread *);
70 int drbd_asender(struct drbd_thread *);
73 static int drbd_open(struct block_device *bdev, fmode_t mode);
74 static int drbd_release(struct gendisk *gd, fmode_t mode);
75 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
76 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
77 union drbd_state ns, enum chg_state_flags flags);
78 static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
79 static void md_sync_timer_fn(unsigned long data);
80 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
81 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
82 static void _tl_clear(struct drbd_conf *mdev);
84 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
85 "Lars Ellenberg <lars@linbit.com>");
86 MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
87 MODULE_VERSION(REL_VERSION);
88 MODULE_LICENSE("GPL");
89 MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
90 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
91 MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
93 #include <linux/moduleparam.h>
94 /* allow_open_on_secondary */
95 MODULE_PARM_DESC(allow_oos, "DONT USE!");
96 /* thanks to these macros, if compiled into the kernel (not-module),
97 * this becomes the boot parameter drbd.minor_count */
98 module_param(minor_count, uint, 0444);
99 module_param(disable_sendpage, bool, 0644);
100 module_param(allow_oos, bool, 0);
101 module_param(cn_idx, uint, 0444);
102 module_param(proc_details, int, 0644);
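/*
 * Illustrative usage (not part of the driver source): when built as a module,
 * these are ordinary module parameters, e.g.
 *
 *     modprobe drbd minor_count=8 proc_details=1
 *
 * When compiled into the kernel, the macros above turn them into boot
 * parameters prefixed with the module name, e.g. "drbd.minor_count=8" on the
 * kernel command line.
 */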
104 #ifdef CONFIG_DRBD_FAULT_INJECTION
107 static int fault_count;
109 /* bitmap of enabled faults */
110 module_param(enable_faults, int, 0664);
111 /* fault rate % value - applies to all enabled faults */
112 module_param(fault_rate, int, 0664);
113 /* count of faults inserted */
114 module_param(fault_count, int, 0664);
115 /* bitmap of devices to insert faults on */
116 module_param(fault_devs, int, 0644);
119 /* module parameter, defined */
120 unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
121 bool disable_sendpage;
123 unsigned int cn_idx = CN_IDX_DRBD;
124 int proc_details; /* Detail level in proc drbd */
126 /* Module parameter for setting the user mode helper program
127 * to run. Default is /sbin/drbdadm */
128 char usermode_helper[80] = "/sbin/drbdadm";
130 module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644);
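/*
 * With permission mask 0644 the helper path is also writable at runtime via
 * sysfs, e.g. (illustrative):
 *
 *     echo /usr/local/sbin/drbdadm > /sys/module/drbd/parameters/usermode_helper
 */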
132 /* in 2.6.x, our device mapping and config info contains our virtual gendisks
133 * as member "struct gendisk *vdisk;"
135 struct drbd_conf **minor_table;
137 struct kmem_cache *drbd_request_cache;
138 struct kmem_cache *drbd_ee_cache; /* epoch entries */
139 struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
140 struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
141 mempool_t *drbd_request_mempool;
142 mempool_t *drbd_ee_mempool;
143 mempool_t *drbd_md_io_page_pool;
144 struct bio_set *drbd_md_io_bio_set;
146 /* I do not use a standard mempool, because:
147 1) I want to hand out the pre-allocated objects first.
148 2) I want to be able to interrupt sleeping allocation with a signal.
149 Note: This is a singly linked list, the next pointer is the private
150 member of struct page.
152 struct page *drbd_pp_pool;
153 spinlock_t drbd_pp_lock;
155 wait_queue_head_t drbd_pp_wait;
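/*
 * Illustrative sketch of the chaining described above (hypothetical helper
 * names, not the driver's actual allocation path): pages are linked through
 * their ->private field, so, with drbd_pp_lock held, pool push/pop is simply
 *
 *     static struct page *example_pp_pop(void)
 *     {
 *             struct page *page = drbd_pp_pool;
 *             if (page)
 *                     drbd_pp_pool = (struct page *)page_private(page);
 *             return page;
 *     }
 *
 *     static void example_pp_push(struct page *page)
 *     {
 *             set_page_private(page, (unsigned long)drbd_pp_pool);
 *             drbd_pp_pool = page;
 *     }
 */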
157 DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
159 static const struct block_device_operations drbd_ops = {
160 .owner = THIS_MODULE,
162 .release = drbd_release,
165 struct bio *bio_alloc_drbd(gfp_t gfp_mask)
167 if (!drbd_md_io_bio_set)
168 return bio_alloc(gfp_mask, 1);
170 return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
174 /* When checking with sparse, and this is an inline function, sparse will
175 give tons of false positives. When this is a real function, sparse works.
177 int _get_ldev_if_state(struct drbd_conf *mdev, enum drbd_disk_state mins)
181 atomic_inc(&mdev->local_cnt);
182 io_allowed = (mdev->state.disk >= mins);
184 if (atomic_dec_and_test(&mdev->local_cnt))
185 wake_up(&mdev->misc_wait);
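/*
 * Typical usage of the reference taken above, as seen throughout this file:
 * every successful get_ldev()/get_ldev_if_state() is paired with a put_ldev(),
 * e.g.
 *
 *     if (get_ldev(mdev)) {
 *             fp = mdev->ldev->dc.fencing;
 *             put_ldev(mdev);
 *     }
 */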
193 * DOC: The transfer log
195 * The transfer log is a singly linked list of &struct drbd_tl_epoch objects.
196 * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
197 * of the list. There is always at least one &struct drbd_tl_epoch object.
199 * Each &struct drbd_tl_epoch has a circular doubly linked list of requests
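 *
 * Schematically (illustrative sketch, not part of the original documentation):
 *
 *   mdev->oldest_tle --> epoch --> epoch --> ... --> mdev->newest_tle --> NULL
 *
 * where each epoch additionally carries its own circular list of
 * &struct drbd_request objects on ->requests.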
202 static int tl_init(struct drbd_conf *mdev)
204 struct drbd_tl_epoch *b;
206 /* during device minor initialization, we may well use GFP_KERNEL */
207 b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
210 INIT_LIST_HEAD(&b->requests);
211 INIT_LIST_HEAD(&b->w.list);
215 b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
217 mdev->oldest_tle = b;
218 mdev->newest_tle = b;
219 INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
220 INIT_LIST_HEAD(&mdev->barrier_acked_requests);
222 mdev->tl_hash = NULL;
228 static void tl_cleanup(struct drbd_conf *mdev)
230 D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
231 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
232 kfree(mdev->oldest_tle);
233 mdev->oldest_tle = NULL;
234 kfree(mdev->unused_spare_tle);
235 mdev->unused_spare_tle = NULL;
236 kfree(mdev->tl_hash);
237 mdev->tl_hash = NULL;
242 * _tl_add_barrier() - Adds a barrier to the transfer log
243 * @mdev: DRBD device.
244 * @new: Barrier to be added before the current head of the TL.
246 * The caller must hold the req_lock.
248 void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
250 struct drbd_tl_epoch *newest_before;
252 INIT_LIST_HEAD(&new->requests);
253 INIT_LIST_HEAD(&new->w.list);
254 new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
258 newest_before = mdev->newest_tle;
259 new->br_number = newest_before->br_number+1;
260 if (mdev->newest_tle != new) {
261 mdev->newest_tle->next = new;
262 mdev->newest_tle = new;
267 * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
268 * @mdev: DRBD device.
269 * @barrier_nr: Expected identifier of the DRBD write barrier packet.
270 * @set_size: Expected number of requests before that barrier.
272 * In case the passed barrier_nr or set_size does not match the oldest
273 * &struct drbd_tl_epoch object, this function will cause a termination
276 void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
277 unsigned int set_size)
279 struct drbd_tl_epoch *b, *nob; /* next old barrier */
280 struct list_head *le, *tle;
281 struct drbd_request *r;
283 spin_lock_irq(&mdev->req_lock);
285 b = mdev->oldest_tle;
287 /* first some paranoia code */
289 dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
293 if (b->br_number != barrier_nr) {
294 dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
295 barrier_nr, b->br_number);
298 if (b->n_writes != set_size) {
299 dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
300 barrier_nr, set_size, b->n_writes);
304 /* Clean up list of requests processed during current epoch */
305 list_for_each_safe(le, tle, &b->requests) {
306 r = list_entry(le, struct drbd_request, tl_requests);
307 _req_mod(r, barrier_acked);
309 /* There could be requests on the list waiting for completion
310 of the write to the local disk. To avoid corruption of the
311 slab's data structures we have to remove the list's head.
313 Also there could have been a barrier ack out of sequence, overtaking
314 the write acks - which would be a bug and a violation of write ordering.
315 To not deadlock in case we lose connection while such requests are
316 still pending, we need some way to find them for the
317 _req_mod(connection_lost_while_pending).
319 These have been list_move'd to the out_of_sequence_requests list in
320 _req_mod(, barrier_acked) above.
322 list_splice_init(&b->requests, &mdev->barrier_acked_requests);
325 if (drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) {
326 _tl_add_barrier(mdev, b);
328 mdev->oldest_tle = nob;
329 /* if nob == NULL b was the only barrier, and becomes the new
330 barrier. Therefore mdev->oldest_tle already points to b */
332 D_ASSERT(nob != NULL);
333 mdev->oldest_tle = nob;
337 spin_unlock_irq(&mdev->req_lock);
338 dec_ap_pending(mdev);
343 spin_unlock_irq(&mdev->req_lock);
344 drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
349 * _tl_restart() - Walks the transfer log, and applies an action to all requests
350 * @mdev: DRBD device.
351 * @what: The action/event to perform with all request objects
353 * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
354 * restart_frozen_disk_io.
356 static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
358 struct drbd_tl_epoch *b, *tmp, **pn;
359 struct list_head *le, *tle, carry_reads;
360 struct drbd_request *req;
361 int rv, n_writes, n_reads;
363 b = mdev->oldest_tle;
364 pn = &mdev->oldest_tle;
368 INIT_LIST_HEAD(&carry_reads);
369 list_for_each_safe(le, tle, &b->requests) {
370 req = list_entry(le, struct drbd_request, tl_requests);
371 rv = _req_mod(req, what);
373 n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
374 n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
379 if (what == resend) {
380 b->n_writes = n_writes;
381 if (b->w.cb == NULL) {
382 b->w.cb = w_send_barrier;
383 inc_ap_pending(mdev);
384 drbd_set_flag(mdev, CREATE_BARRIER);
387 drbd_queue_work(&mdev->data.work, &b->w);
392 list_add(&carry_reads, &b->requests);
393 /* there could still be requests on that ring list,
394 * in case local io is still pending */
395 list_del(&b->requests);
397 /* dec_ap_pending corresponding to queue_barrier.
398 * the newest barrier may not have been queued yet,
399 * in which case w.cb is still NULL. */
401 dec_ap_pending(mdev);
403 if (b == mdev->newest_tle) {
404 /* recycle, but reinit! */
405 D_ASSERT(tmp == NULL);
406 INIT_LIST_HEAD(&b->requests);
407 list_splice(&carry_reads, &b->requests);
408 INIT_LIST_HEAD(&b->w.list);
410 b->br_number = net_random();
420 list_splice(&carry_reads, &b->requests);
423 /* Actions operating on the disk state also want to work on
424 requests that got barrier acked. */
426 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
427 req = list_entry(le, struct drbd_request, tl_requests);
434 * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
435 * @mdev: DRBD device.
437 * This is called after the connection to the peer was lost. The storage covered
438 by the requests on the transfer log gets marked as out of sync. Called from the
439 * receiver thread and the worker thread.
441 void tl_clear(struct drbd_conf *mdev)
443 spin_lock_irq(&mdev->req_lock);
445 spin_unlock_irq(&mdev->req_lock);
448 static void _tl_clear(struct drbd_conf *mdev)
450 struct list_head *le, *tle;
451 struct drbd_request *r;
453 _tl_restart(mdev, connection_lost_while_pending);
455 /* we expect this list to be empty. */
456 D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
458 /* but just in case, clean it up anyways! */
459 list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
460 r = list_entry(le, struct drbd_request, tl_requests);
461 /* It would be nice to complete outside of spinlock.
462 * But this is easier for now. */
463 _req_mod(r, connection_lost_while_pending);
466 /* ensure bit indicating barrier is required is clear */
467 drbd_clear_flag(mdev, CREATE_BARRIER);
469 memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
473 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
475 spin_lock_irq(&mdev->req_lock);
476 _tl_restart(mdev, what);
477 spin_unlock_irq(&mdev->req_lock);
481 * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
482 * @mdev: DRBD device.
484 void tl_abort_disk_io(struct drbd_conf *mdev)
486 struct drbd_tl_epoch *b;
487 struct list_head *le, *tle;
488 struct drbd_request *req;
490 spin_lock_irq(&mdev->req_lock);
491 b = mdev->oldest_tle;
493 list_for_each_safe(le, tle, &b->requests) {
494 req = list_entry(le, struct drbd_request, tl_requests);
495 if (!(req->rq_state & RQ_LOCAL_PENDING))
497 _req_mod(req, abort_disk_io);
502 list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
503 req = list_entry(le, struct drbd_request, tl_requests);
504 if (!(req->rq_state & RQ_LOCAL_PENDING))
506 _req_mod(req, abort_disk_io);
509 spin_unlock_irq(&mdev->req_lock);
513 * cl_wide_st_chg() - true if the state change is a cluster wide one
514 * @mdev: DRBD device.
515 * @os: old (current) state.
516 * @ns: new (wanted) state.
518 static int cl_wide_st_chg(struct drbd_conf *mdev,
519 union drbd_state os, union drbd_state ns)
521 return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
522 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
523 (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
524 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
525 (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
526 (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
527 (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
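/*
 * Examples read off the condition above: becoming R_PRIMARY, starting a
 * resync or online verify, or disconnecting while connected are all
 * changes the peer has to be asked about first, hence "cluster wide".
 */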
531 drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
532 union drbd_state mask, union drbd_state val)
535 union drbd_state os, ns;
536 enum drbd_state_rv rv;
538 spin_lock_irqsave(&mdev->req_lock, flags);
540 ns.i = (os.i & ~mask.i) | val.i;
541 rv = _drbd_set_state(mdev, ns, f, NULL);
543 spin_unlock_irqrestore(&mdev->req_lock, flags);
549 * drbd_force_state() - Impose a change which happens outside our control on our state
550 * @mdev: DRBD device.
551 * @mask: mask of state bits to change.
552 * @val: value of new state bits.
554 void drbd_force_state(struct drbd_conf *mdev,
555 union drbd_state mask, union drbd_state val)
557 drbd_change_state(mdev, CS_HARD, mask, val);
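/*
 * Typical caller (see tl_release() above):
 * drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR)) - the NS() macro
 * builds a mask/val pair that selects only the named field of the
 * state union, here .conn.
 */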
560 static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
561 static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
564 enum sanitize_state_warnings {
566 ABORTED_ONLINE_VERIFY,
568 CONNECTION_LOST_NEGOTIATING,
569 IMPLICITLY_UPGRADED_DISK,
570 IMPLICITLY_UPGRADED_PDSK,
572 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
573 union drbd_state ns, enum sanitize_state_warnings *warn);
574 int drbd_send_state_req(struct drbd_conf *,
575 union drbd_state, union drbd_state);
577 static enum drbd_state_rv
578 _req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
579 union drbd_state val)
581 union drbd_state os, ns;
583 enum drbd_state_rv rv;
585 if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_SUCCESS))
586 return SS_CW_SUCCESS;
588 if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_FAIL))
589 return SS_CW_FAILED_BY_PEER;
592 spin_lock_irqsave(&mdev->req_lock, flags);
594 ns.i = (os.i & ~mask.i) | val.i;
595 ns = sanitize_state(mdev, os, ns, NULL);
597 if (!cl_wide_st_chg(mdev, os, ns))
600 rv = is_valid_state(mdev, ns);
601 if (rv == SS_SUCCESS) {
602 rv = is_valid_state_transition(mdev, ns, os);
603 if (rv == SS_SUCCESS)
604 rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
607 spin_unlock_irqrestore(&mdev->req_lock, flags);
613 * drbd_req_state() - Perform a possibly cluster-wide state change
614 * @mdev: DRBD device.
615 * @mask: mask of state bits to change.
616 * @val: value of new state bits.
619 * Should not be called directly, use drbd_request_state() or
620 * _drbd_request_state().
622 static enum drbd_state_rv
623 drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
624 union drbd_state val, enum chg_state_flags f)
626 struct completion done;
628 union drbd_state os, ns;
629 enum drbd_state_rv rv;
631 init_completion(&done);
633 if (f & CS_SERIALIZE)
634 mutex_lock(&mdev->state_mutex);
636 spin_lock_irqsave(&mdev->req_lock, flags);
638 ns.i = (os.i & ~mask.i) | val.i;
639 ns = sanitize_state(mdev, os, ns, NULL);
641 if (cl_wide_st_chg(mdev, os, ns)) {
642 rv = is_valid_state(mdev, ns);
643 if (rv == SS_SUCCESS)
644 rv = is_valid_state_transition(mdev, ns, os);
645 spin_unlock_irqrestore(&mdev->req_lock, flags);
647 if (rv < SS_SUCCESS) {
649 print_st_err(mdev, os, ns, rv);
653 drbd_state_lock(mdev);
654 if (!drbd_send_state_req(mdev, mask, val)) {
655 drbd_state_unlock(mdev);
656 rv = SS_CW_FAILED_BY_PEER;
658 print_st_err(mdev, os, ns, rv);
662 if (mask.conn == C_MASK && val.conn == C_DISCONNECTING)
663 drbd_set_flag(mdev, DISCONNECT_SENT);
665 wait_event(mdev->state_wait,
666 (rv = _req_st_cond(mdev, mask, val)));
668 if (rv < SS_SUCCESS) {
669 drbd_state_unlock(mdev);
671 print_st_err(mdev, os, ns, rv);
674 spin_lock_irqsave(&mdev->req_lock, flags);
676 ns.i = (os.i & ~mask.i) | val.i;
677 rv = _drbd_set_state(mdev, ns, f, &done);
678 drbd_state_unlock(mdev);
680 rv = _drbd_set_state(mdev, ns, f, &done);
683 spin_unlock_irqrestore(&mdev->req_lock, flags);
685 if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
686 D_ASSERT(current != mdev->worker.task);
687 wait_for_completion(&done);
691 if (f & CS_SERIALIZE)
692 mutex_unlock(&mdev->state_mutex);
698 * _drbd_request_state() - Request a state change (with flags)
699 * @mdev: DRBD device.
700 * @mask: mask of state bits to change.
701 * @val: value of new state bits.
704 * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
705 * flag, or when logging of failed state change requests is not desired.
708 _drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
709 union drbd_state val, enum chg_state_flags f)
711 enum drbd_state_rv rv;
713 wait_event(mdev->state_wait,
714 (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
719 static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
721 dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
723 drbd_conn_str(ns.conn),
724 drbd_role_str(ns.role),
725 drbd_role_str(ns.peer),
726 drbd_disk_str(ns.disk),
727 drbd_disk_str(ns.pdsk),
728 is_susp(ns) ? 's' : 'r',
729 ns.aftr_isp ? 'a' : '-',
730 ns.peer_isp ? 'p' : '-',
731 ns.user_isp ? 'u' : '-'
735 void print_st_err(struct drbd_conf *mdev, union drbd_state os,
736 union drbd_state ns, enum drbd_state_rv err)
738 if (err == SS_IN_TRANSIENT_STATE)
740 dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
741 print_st(mdev, " state", os);
742 print_st(mdev, "wanted", ns);
747 * is_valid_state() - Returns an SS_ error code if ns is not valid
748 * @mdev: DRBD device.
749 * @ns: State to consider.
751 static enum drbd_state_rv
752 is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
754 /* See drbd_state_sw_errors in drbd_strings.c */
756 enum drbd_fencing_p fp;
757 enum drbd_state_rv rv = SS_SUCCESS;
760 if (get_ldev(mdev)) {
761 fp = mdev->ldev->dc.fencing;
765 if (get_net_conf(mdev)) {
766 if (!mdev->net_conf->two_primaries &&
767 ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
768 rv = SS_TWO_PRIMARIES;
773 /* already found a reason to abort */;
774 else if (ns.role == R_SECONDARY && mdev->open_cnt)
775 rv = SS_DEVICE_IN_USE;
777 else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
778 rv = SS_NO_UP_TO_DATE_DISK;
780 else if (fp >= FP_RESOURCE &&
781 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
784 else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
785 rv = SS_NO_UP_TO_DATE_DISK;
787 else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
788 rv = SS_NO_LOCAL_DISK;
790 else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
791 rv = SS_NO_REMOTE_DISK;
793 else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
794 rv = SS_NO_UP_TO_DATE_DISK;
796 else if ((ns.conn == C_CONNECTED ||
797 ns.conn == C_WF_BITMAP_S ||
798 ns.conn == C_SYNC_SOURCE ||
799 ns.conn == C_PAUSED_SYNC_S) &&
800 ns.disk == D_OUTDATED)
801 rv = SS_CONNECTED_OUTDATES;
803 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
804 (mdev->sync_conf.verify_alg[0] == 0))
805 rv = SS_NO_VERIFY_ALG;
807 else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
808 mdev->agreed_pro_version < 88)
809 rv = SS_NOT_SUPPORTED;
811 else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
812 rv = SS_CONNECTED_OUTDATES;
818 * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
819 * @mdev: DRBD device.
823 static enum drbd_state_rv
824 is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
827 enum drbd_state_rv rv = SS_SUCCESS;
829 if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
830 os.conn > C_CONNECTED)
831 rv = SS_RESYNC_RUNNING;
833 if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
834 rv = SS_ALREADY_STANDALONE;
836 if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
839 if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
840 rv = SS_NO_NET_CONFIG;
842 if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
843 rv = SS_LOWER_THAN_OUTDATED;
845 if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
846 rv = SS_IN_TRANSIENT_STATE;
848 if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
849 rv = SS_IN_TRANSIENT_STATE;
851 /* While establishing a connection only allow cstate to change.
852 Delay/refuse role changes, detach/attach etc... */
853 if (drbd_test_flag(mdev, STATE_SENT) &&
854 !(os.conn == C_WF_REPORT_PARAMS ||
855 (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
856 rv = SS_IN_TRANSIENT_STATE;
858 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
859 rv = SS_NEED_CONNECTION;
861 if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
862 ns.conn != os.conn && os.conn > C_CONNECTED)
863 rv = SS_RESYNC_RUNNING;
865 if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
866 os.conn < C_CONNECTED)
867 rv = SS_NEED_CONNECTION;
869 if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
870 && os.conn < C_WF_REPORT_PARAMS)
871 rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
876 static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
878 static const char *msg_table[] = {
880 [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
881 [ABORTED_RESYNC] = "Resync aborted.",
882 [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
883 [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
884 [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
887 if (warn != NO_WARNING)
888 dev_warn(DEV, "%s\n", msg_table[warn]);
892 * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
893 * @mdev: DRBD device.
898 * When we lose the connection, we have to set the state of the peer's disk (pdsk)
899 * to D_UNKNOWN. This rule and many more along those lines are in this function.
901 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
902 union drbd_state ns, enum sanitize_state_warnings *warn)
904 enum drbd_fencing_p fp;
905 enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
911 if (get_ldev(mdev)) {
912 fp = mdev->ldev->dc.fencing;
916 /* Do not allow a network error state on a device whose network part is not configured */
917 if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
918 os.conn <= C_DISCONNECTING)
921 /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
922 * If you try to go into some Sync* state, that shall fail (elsewhere). */
923 if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
924 ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
927 /* we cannot fail (again) if we already detached */
928 if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
929 ns.disk = D_DISKLESS;
931 /* After C_DISCONNECTING only C_STANDALONE may follow */
932 if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
935 if (ns.conn < C_CONNECTED) {
938 if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
942 /* Clear the aftr_isp when becoming unconfigured */
943 if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
946 /* Abort resync if a disk fails/detaches */
947 if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
948 (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
950 *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
951 ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
952 ns.conn = C_CONNECTED;
955 /* Connection breaks down before we finished "Negotiating" */
956 if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
957 get_ldev_if_state(mdev, D_NEGOTIATING)) {
958 if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
959 ns.disk = mdev->new_state_tmp.disk;
960 ns.pdsk = mdev->new_state_tmp.pdsk;
963 *warn = CONNECTION_LOST_NEGOTIATING;
964 ns.disk = D_DISKLESS;
970 /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
971 if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
972 if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
973 ns.disk = D_UP_TO_DATE;
974 if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
975 ns.pdsk = D_UP_TO_DATE;
978 /* Implications of the connection state on the disk states */
979 disk_min = D_DISKLESS;
980 disk_max = D_UP_TO_DATE;
981 pdsk_min = D_INCONSISTENT;
982 pdsk_max = D_UNKNOWN;
983 switch ((enum drbd_conns)ns.conn) {
985 case C_PAUSED_SYNC_T:
986 case C_STARTING_SYNC_T:
989 disk_min = D_INCONSISTENT;
990 disk_max = D_OUTDATED;
991 pdsk_min = D_UP_TO_DATE;
992 pdsk_max = D_UP_TO_DATE;
996 disk_min = D_UP_TO_DATE;
997 disk_max = D_UP_TO_DATE;
998 pdsk_min = D_UP_TO_DATE;
999 pdsk_max = D_UP_TO_DATE;
1002 disk_min = D_DISKLESS;
1003 disk_max = D_UP_TO_DATE;
1004 pdsk_min = D_DISKLESS;
1005 pdsk_max = D_UP_TO_DATE;
1008 case C_PAUSED_SYNC_S:
1009 case C_STARTING_SYNC_S:
1011 disk_min = D_UP_TO_DATE;
1012 disk_max = D_UP_TO_DATE;
1013 pdsk_min = D_INCONSISTENT;
1014 pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
1017 disk_min = D_INCONSISTENT;
1018 disk_max = D_INCONSISTENT;
1019 pdsk_min = D_UP_TO_DATE;
1020 pdsk_max = D_UP_TO_DATE;
1023 disk_min = D_UP_TO_DATE;
1024 disk_max = D_UP_TO_DATE;
1025 pdsk_min = D_INCONSISTENT;
1026 pdsk_max = D_INCONSISTENT;
1029 case C_DISCONNECTING:
1033 case C_NETWORK_FAILURE:
1034 case C_PROTOCOL_ERROR:
1036 case C_WF_CONNECTION:
1037 case C_WF_REPORT_PARAMS:
1041 if (ns.disk > disk_max)
1044 if (ns.disk < disk_min) {
1046 *warn = IMPLICITLY_UPGRADED_DISK;
1049 if (ns.pdsk > pdsk_max)
1052 if (ns.pdsk < pdsk_min) {
1054 *warn = IMPLICITLY_UPGRADED_PDSK;
1058 if (fp == FP_STONITH &&
1059 (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
1060 !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
1061 ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
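/* Example: a Primary that loses the connection while the peer's disk is
 * not known to be outdated, with fencing set to resource-and-stonith,
 * freezes its IO here until the fence-peer handler has run. */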
1063 if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
1064 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
1065 !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
1066 ns.susp_nod = 1; /* Suspend IO while no data is available (no accessible data) */
1068 if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
1069 if (ns.conn == C_SYNC_SOURCE)
1070 ns.conn = C_PAUSED_SYNC_S;
1071 if (ns.conn == C_SYNC_TARGET)
1072 ns.conn = C_PAUSED_SYNC_T;
1074 if (ns.conn == C_PAUSED_SYNC_S)
1075 ns.conn = C_SYNC_SOURCE;
1076 if (ns.conn == C_PAUSED_SYNC_T)
1077 ns.conn = C_SYNC_TARGET;
1083 /* helper for __drbd_set_state */
1084 static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
1086 if (mdev->agreed_pro_version < 90)
1087 mdev->ov_start_sector = 0;
1088 mdev->rs_total = drbd_bm_bits(mdev);
1089 mdev->ov_position = 0;
1090 if (cs == C_VERIFY_T) {
1091 /* starting online verify from an arbitrary position
1092 * does not fit well into the existing protocol.
1093 * on C_VERIFY_T, we initialize ov_left and friends
1094 * implicitly in receive_DataRequest once the
1095 * first P_OV_REQUEST is received */
1096 mdev->ov_start_sector = ~(sector_t)0;
1098 unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
1099 if (bit >= mdev->rs_total) {
1100 mdev->ov_start_sector =
1101 BM_BIT_TO_SECT(mdev->rs_total - 1);
1104 mdev->rs_total -= bit;
1105 mdev->ov_position = mdev->ov_start_sector;
1107 mdev->ov_left = mdev->rs_total;
1110 static void drbd_resume_al(struct drbd_conf *mdev)
1112 if (drbd_test_and_clear_flag(mdev, AL_SUSPENDED))
1113 dev_info(DEV, "Resumed AL updates\n");
1117 * __drbd_set_state() - Set a new DRBD state
1118 * @mdev: DRBD device.
1121 * @done: Optional completion, that will get completed after the after_state_ch() finished
1123 * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
1126 __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
1127 enum chg_state_flags flags, struct completion *done)
1129 union drbd_state os;
1130 enum drbd_state_rv rv = SS_SUCCESS;
1131 enum sanitize_state_warnings ssw;
1132 struct after_state_chg_work *ascw;
1136 ns = sanitize_state(mdev, os, ns, &ssw);
1139 return SS_NOTHING_TO_DO;
1141 if (!(flags & CS_HARD)) {
1142 /* pre-state-change checks ; only look at ns */
1143 /* See drbd_state_sw_errors in drbd_strings.c */
1145 rv = is_valid_state(mdev, ns);
1146 if (rv < SS_SUCCESS) {
1147 /* If the old state was illegal as well, then let
1150 if (is_valid_state(mdev, os) == rv)
1151 rv = is_valid_state_transition(mdev, ns, os);
1153 rv = is_valid_state_transition(mdev, ns, os);
1156 if (rv < SS_SUCCESS) {
1157 if (flags & CS_VERBOSE)
1158 print_st_err(mdev, os, ns, rv);
1162 print_sanitize_warnings(mdev, ssw);
1168 if (ns.role != os.role)
1169 pbp += sprintf(pbp, "role( %s -> %s ) ",
1170 drbd_role_str(os.role),
1171 drbd_role_str(ns.role));
1172 if (ns.peer != os.peer)
1173 pbp += sprintf(pbp, "peer( %s -> %s ) ",
1174 drbd_role_str(os.peer),
1175 drbd_role_str(ns.peer));
1176 if (ns.conn != os.conn)
1177 pbp += sprintf(pbp, "conn( %s -> %s ) ",
1178 drbd_conn_str(os.conn),
1179 drbd_conn_str(ns.conn));
1180 if (ns.disk != os.disk)
1181 pbp += sprintf(pbp, "disk( %s -> %s ) ",
1182 drbd_disk_str(os.disk),
1183 drbd_disk_str(ns.disk));
1184 if (ns.pdsk != os.pdsk)
1185 pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
1186 drbd_disk_str(os.pdsk),
1187 drbd_disk_str(ns.pdsk));
1188 if (is_susp(ns) != is_susp(os))
1189 pbp += sprintf(pbp, "susp( %d -> %d ) ",
1192 if (ns.aftr_isp != os.aftr_isp)
1193 pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
1196 if (ns.peer_isp != os.peer_isp)
1197 pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
1200 if (ns.user_isp != os.user_isp)
1201 pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
1204 dev_info(DEV, "%s\n", pb);
1207 /* solve the race between becoming unconfigured,
1208 * worker doing the cleanup, and
1209 * admin reconfiguring us:
1210 * on (re)configure, first set CONFIG_PENDING,
1211 * then wait for a potentially exiting worker,
1212 * start the worker, and schedule one no_op.
1213 * then proceed with configuration.
1215 if (ns.disk == D_DISKLESS &&
1216 ns.conn == C_STANDALONE &&
1217 ns.role == R_SECONDARY &&
1218 !drbd_test_and_set_flag(mdev, CONFIG_PENDING))
1219 drbd_set_flag(mdev, DEVICE_DYING);
1221 /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
1222 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
1223 * drbd_ldev_destroy() won't happen before our corresponding
1224 * after_state_ch works run, where we put_ldev again. */
1225 if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
1226 (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
1227 atomic_inc(&mdev->local_cnt);
1231 if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
1232 drbd_print_uuids(mdev, "attached to UUIDs");
1234 wake_up(&mdev->misc_wait);
1235 wake_up(&mdev->state_wait);
1237 /* Aborted verify run, or we reached the stop sector.
1238 * Log the last position, unless end-of-device. */
1239 if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
1240 ns.conn <= C_CONNECTED) {
1241 mdev->ov_start_sector =
1242 BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
1244 dev_info(DEV, "Online Verify reached sector %llu\n",
1245 (unsigned long long)mdev->ov_start_sector);
1248 if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
1249 (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
1250 dev_info(DEV, "Syncer continues.\n");
1251 mdev->rs_paused += (long)jiffies
1252 -(long)mdev->rs_mark_time[mdev->rs_last_mark];
1253 if (ns.conn == C_SYNC_TARGET)
1254 mod_timer(&mdev->resync_timer, jiffies);
1257 if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
1258 (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
1259 dev_info(DEV, "Resync suspended\n");
1260 mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
1263 if (os.conn == C_CONNECTED &&
1264 (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
1265 unsigned long now = jiffies;
1268 set_ov_position(mdev, ns.conn);
1269 mdev->rs_start = now;
1270 mdev->rs_last_events = 0;
1271 mdev->rs_last_sect_ev = 0;
1272 mdev->ov_last_oos_size = 0;
1273 mdev->ov_last_oos_start = 0;
1275 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
1276 mdev->rs_mark_left[i] = mdev->ov_left;
1277 mdev->rs_mark_time[i] = now;
1280 drbd_rs_controller_reset(mdev);
1282 if (ns.conn == C_VERIFY_S) {
1283 dev_info(DEV, "Starting Online Verify from sector %llu\n",
1284 (unsigned long long)mdev->ov_position);
1285 mod_timer(&mdev->resync_timer, jiffies);
1289 if (get_ldev(mdev)) {
1290 u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
1291 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
1292 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
1294 if (drbd_test_flag(mdev, CRASHED_PRIMARY))
1295 mdf |= MDF_CRASHED_PRIMARY;
1296 if (mdev->state.role == R_PRIMARY ||
1297 (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
1298 mdf |= MDF_PRIMARY_IND;
1299 if (mdev->state.conn > C_WF_REPORT_PARAMS)
1300 mdf |= MDF_CONNECTED_IND;
1301 if (mdev->state.disk > D_INCONSISTENT)
1302 mdf |= MDF_CONSISTENT;
1303 if (mdev->state.disk > D_OUTDATED)
1304 mdf |= MDF_WAS_UP_TO_DATE;
1305 if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
1306 mdf |= MDF_PEER_OUT_DATED;
1307 if (mdf != mdev->ldev->md.flags) {
1308 mdev->ldev->md.flags = mdf;
1309 drbd_md_mark_dirty(mdev);
1311 if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
1312 drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
1316 /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider resyncing */
1317 if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
1318 os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
1319 drbd_set_flag(mdev, CONSIDER_RESYNC);
1321 /* Receiver should clean up itself */
1322 if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
1323 drbd_thread_stop_nowait(&mdev->receiver);
1325 /* Now the receiver finished cleaning up itself, it should die */
1326 if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
1327 drbd_thread_stop_nowait(&mdev->receiver);
1329 /* Upon network failure, we need to restart the receiver. */
1330 if (os.conn > C_WF_CONNECTION &&
1331 ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
1332 drbd_thread_restart_nowait(&mdev->receiver);
1334 /* Resume AL writing if we get a connection */
1335 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1336 drbd_resume_al(mdev);
1338 /* remember last connect and attach times so request_timer_fn() won't
1339 * kill newly established sessions while we are still trying to thaw
1340 * previously frozen IO */
1341 if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
1342 mdev->last_reconnect_jif = jiffies;
1343 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1344 ns.disk > D_NEGOTIATING)
1345 mdev->last_reattach_jif = jiffies;
1347 ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
1351 ascw->flags = flags;
1352 ascw->w.cb = w_after_state_ch;
1354 drbd_queue_work(&mdev->data.work, &ascw->w);
1356 dev_warn(DEV, "Could not kmalloc an ascw\n");
1362 static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
1364 struct after_state_chg_work *ascw =
1365 container_of(w, struct after_state_chg_work, w);
1366 after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
1367 if (ascw->flags & CS_WAIT_COMPLETE) {
1368 D_ASSERT(ascw->done != NULL);
1369 complete(ascw->done);
1376 static void abw_start_sync(struct drbd_conf *mdev, int rv)
1379 dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
1380 _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
1384 switch (mdev->state.conn) {
1385 case C_STARTING_SYNC_T:
1386 _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
1388 case C_STARTING_SYNC_S:
1389 drbd_start_resync(mdev, C_SYNC_SOURCE);
1394 int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
1395 int (*io_fn)(struct drbd_conf *),
1396 char *why, enum bm_flag flags)
1400 D_ASSERT(current == mdev->worker.task);
1402 /* open coded non-blocking drbd_suspend_io(mdev); */
1403 drbd_set_flag(mdev, SUSPEND_IO);
1405 drbd_bm_lock(mdev, why, flags);
1407 drbd_bm_unlock(mdev);
1409 drbd_resume_io(mdev);
1415 * after_state_ch() - Perform after state change actions that may sleep
1416 * @mdev: DRBD device.
1421 static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
1422 union drbd_state ns, enum chg_state_flags flags)
1424 enum drbd_fencing_p fp;
1425 enum drbd_req_event what = nothing;
1426 union drbd_state nsm = (union drbd_state){ .i = -1 };
1428 if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
1429 drbd_clear_flag(mdev, CRASHED_PRIMARY);
1431 mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
1435 if (get_ldev(mdev)) {
1436 fp = mdev->ldev->dc.fencing;
1440 /* Inform userspace about the change... */
1441 drbd_bcast_state(mdev, ns);
1443 if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
1444 (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
1445 drbd_khelper(mdev, "pri-on-incon-degr");
1447 /* Here we have the actions that are performed after a
1448 state change. This function might sleep */
1450 if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
1451 mod_timer(&mdev->request_timer, jiffies + HZ);
1455 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
1458 if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
1459 ns.disk > D_NEGOTIATING)
1460 what = restart_frozen_disk_io;
1462 if (what != nothing)
1467 /* case1: The outdate peer handler is successful: */
1468 if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
1469 if (drbd_test_flag(mdev, NEW_CUR_UUID)) {
1470 drbd_uuid_new_current(mdev);
1471 drbd_clear_flag(mdev, NEW_CUR_UUID);
1473 spin_lock_irq(&mdev->req_lock);
1475 _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
1476 spin_unlock_irq(&mdev->req_lock);
1478 /* case2: The connection was established again: */
1479 if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
1480 drbd_clear_flag(mdev, NEW_CUR_UUID);
1486 if (what != nothing) {
1487 spin_lock_irq(&mdev->req_lock);
1488 _tl_restart(mdev, what);
1489 nsm.i &= mdev->state.i;
1490 _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
1491 spin_unlock_irq(&mdev->req_lock);
1494 /* Became sync source. With protocol >= 96, we still need to send out
1495 * the sync uuid now. Need to do that before any drbd_send_state, or
1496 * the other side may go "paused sync" before receiving the sync uuids,
1497 * which is unexpected. */
1498 if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
1499 (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
1500 mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
1501 drbd_gen_and_send_sync_uuid(mdev);
1505 /* Do not change the order of the if above and the two below... */
1506 if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
1507 /* we probably will start a resync soon.
1508 * make sure those things are properly reset. */
1510 mdev->rs_failed = 0;
1511 atomic_set(&mdev->rs_pending_cnt, 0);
1512 drbd_rs_cancel_all(mdev);
1514 drbd_send_uuids(mdev);
1515 drbd_send_state(mdev, ns);
1517 /* No point in queuing send_bitmap if we don't have a connection
1518 * anymore, so check also the _current_ state, not only the new state
1519 * at the time this work was queued. */
1520 if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
1521 mdev->state.conn == C_WF_BITMAP_S)
1522 drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
1523 "send_bitmap (WFBitMapS)",
1524 BM_LOCKED_TEST_ALLOWED);
1526 /* Lost contact to peer's copy of the data */
1527 if ((os.pdsk >= D_INCONSISTENT &&
1528 os.pdsk != D_UNKNOWN &&
1529 os.pdsk != D_OUTDATED)
1530 && (ns.pdsk < D_INCONSISTENT ||
1531 ns.pdsk == D_UNKNOWN ||
1532 ns.pdsk == D_OUTDATED)) {
1533 if (get_ldev(mdev)) {
1534 if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
1535 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1536 if (is_susp(mdev->state)) {
1537 drbd_set_flag(mdev, NEW_CUR_UUID);
1539 drbd_uuid_new_current(mdev);
1540 drbd_send_uuids(mdev);
1547 if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
1548 if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
1549 mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
1550 drbd_uuid_new_current(mdev);
1551 drbd_send_uuids(mdev);
1553 /* D_DISKLESS Peer becomes secondary */
1554 if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
1555 /* We may still be Primary ourselves.
1556 * No harm done if the bitmap still changes,
1557 * redirtied pages will follow later. */
1558 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1559 "demote diskless peer", BM_LOCKED_SET_ALLOWED);
1563 /* Write out all changed bits on demote.
1564 * Though, no need to do that just yet
1565 * if there is still a resync going on */
1566 if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
1567 mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
1568 /* No changes to the bitmap expected this time, so assert that,
1569 * even though no harm was done if it did change. */
1570 drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
1571 "demote", BM_LOCKED_TEST_ALLOWED);
1575 /* Last part of the attaching process ... */
1576 if (ns.conn >= C_CONNECTED &&
1577 os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
1578 drbd_send_sizes(mdev, 0, 0); /* to start sync... */
1579 drbd_send_uuids(mdev);
1580 drbd_send_state(mdev, ns);
1583 /* We want to pause/continue resync, tell peer. */
1584 if (ns.conn >= C_CONNECTED &&
1585 ((os.aftr_isp != ns.aftr_isp) ||
1586 (os.user_isp != ns.user_isp)))
1587 drbd_send_state(mdev, ns);
1589 /* In case one of the isp bits got set, suspend other devices. */
1590 if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
1591 (ns.aftr_isp || ns.peer_isp || ns.user_isp))
1592 suspend_other_sg(mdev);
1594 /* Make sure the peer gets informed about any state
1595 changes (ISP bits) that happened while we were in WFReportParams. */
1596 if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
1597 drbd_send_state(mdev, ns);
1599 if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
1600 drbd_send_state(mdev, ns);
1602 /* We are in the process of starting a full sync... */
1603 if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
1604 (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
1605 /* no other bitmap changes expected during this phase */
1606 drbd_queue_bitmap_io(mdev,
1607 &drbd_bmio_set_n_write, &abw_start_sync,
1608 "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
1610 /* We are invalidating ourselves... */
1611 if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
1612 os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
1613 /* other bitmap operation expected during this phase */
1614 drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
1615 "set_n_write from invalidate", BM_LOCKED_MASK);
1617 /* first half of local IO error, failure to attach,
1618 * or administrative detach */
1619 if (os.disk != D_FAILED && ns.disk == D_FAILED) {
1620 enum drbd_io_error_p eh = EP_PASS_ON;
1621 int was_io_error = 0;
1622 /* corresponding get_ldev was in __drbd_set_state, to serialize
1623 * our cleanup here with the transition to D_DISKLESS.
1624 * But it is still not safe to dereference ldev here, since
1625 * we might come from a failed Attach before ldev was set. */
1627 eh = mdev->ldev->dc.on_io_error;
1628 was_io_error = drbd_test_and_clear_flag(mdev, WAS_IO_ERROR);
1630 if (was_io_error && eh == EP_CALL_HELPER)
1631 drbd_khelper(mdev, "local-io-error");
1633 /* Immediately allow completion of all application IO,
1634 * that waits for completion from the local disk,
1635 * if this was a force-detach due to disk_timeout
1636 * or administrator request (drbdsetup detach --force).
1637 * Do NOT abort otherwise.
1638 * Aborting local requests may cause serious problems,
1639 * if requests are completed to upper layers already,
1640 * and then later the already submitted local bio completes.
1641 * This can cause DMA into former bio pages that meanwhile
1642 * have been re-used for other things.
1643 * So aborting local requests may cause crashes,
1644 * or even worse, silent data corruption.
1646 if (drbd_test_and_clear_flag(mdev, FORCE_DETACH))
1647 tl_abort_disk_io(mdev);
1649 /* current state still has to be D_FAILED,
1650 * there is only one way out: to D_DISKLESS,
1651 * and that may only happen after our put_ldev below. */
1652 if (mdev->state.disk != D_FAILED)
1654 "ASSERT FAILED: disk is %s during detach\n",
1655 drbd_disk_str(mdev->state.disk));
1657 if (ns.conn >= C_CONNECTED)
1658 drbd_send_state(mdev, ns);
1660 drbd_rs_cancel_all(mdev);
1662 /* In case we want to get something to stable storage still,
1663 * this may be the last chance.
1664 * Following put_ldev may transition to D_DISKLESS. */
1670 /* second half of local IO error, failure to attach,
1671 * or administrative detach,
1672 * after local_cnt references have reached zero again */
1673 if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
1674 /* We must still be diskless,
1675 * re-attach has to be serialized with this! */
1676 if (mdev->state.disk != D_DISKLESS)
1678 "ASSERT FAILED: disk is %s while going diskless\n",
1679 drbd_disk_str(mdev->state.disk));
1681 if (ns.conn >= C_CONNECTED)
1682 drbd_send_state(mdev, ns);
1684 /* corresponding get_ldev in __drbd_set_state
1685 * this may finally trigger drbd_ldev_destroy. */
1689 /* Notify peer that I had a local IO error, and did not detach. */
1690 if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
1691 drbd_send_state(mdev, ns);
1693 /* Disks got bigger while they were detached */
1694 if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
1695 drbd_test_and_clear_flag(mdev, RESYNC_AFTER_NEG)) {
1696 if (ns.conn == C_CONNECTED)
1697 resync_after_online_grow(mdev);
1700 /* A resync finished or aborted, wake paused devices... */
1701 if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
1702 (os.peer_isp && !ns.peer_isp) ||
1703 (os.user_isp && !ns.user_isp))
1704 resume_next_sg(mdev);
1706 /* sync target done with resync. Explicitly notify peer, even though
1707 * it should (at least for non-empty resyncs) already know itself. */
1708 if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
1709 drbd_send_state(mdev, ns);
1711 /* Verify finished, or reached stop sector. Peer did not know about
1712 * the stop sector, and we may even have changed the stop sector during
1713 * verify to interrupt/stop early. Send the new state. */
1714 if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
1715 && mdev->agreed_pro_version >= 97)
1716 drbd_send_state(mdev, ns);
1718 /* Wake up role changes, that were delayed because of connection establishing */
1719 if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
1720 drbd_clear_flag(mdev, STATE_SENT);
1721 wake_up(&mdev->state_wait);
1724 /* This triggers bitmap writeout of potentially still unwritten pages
1725 * if the resync finished cleanly, or aborted because of peer disk
1726 * failure, or because of connection loss.
1727 * For resync aborted because of local disk failure, we cannot do
1728 * any bitmap writeout anymore.
1729 * No harm done if some bits change during this phase.
1731 if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
1732 drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
1733 "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
1737 /* free tl_hash if we got thawed and are C_STANDALONE */
1738 if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
1739 drbd_free_tl_hash(mdev);
1741 /* Upon network connection, we need to start the receiver */
1742 if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
1743 drbd_thread_start(&mdev->receiver);
1745 /* Terminate worker thread if we are unconfigured - it will be
1746 restarted as needed... */
1747 if (ns.disk == D_DISKLESS &&
1748 ns.conn == C_STANDALONE &&
1749 ns.role == R_SECONDARY) {
1750 if (os.aftr_isp != ns.aftr_isp)
1751 resume_next_sg(mdev);
1752 /* set in __drbd_set_state, unless CONFIG_PENDING was set */
1753 if (drbd_test_flag(mdev, DEVICE_DYING))
1754 drbd_thread_stop_nowait(&mdev->worker);
1761 static int drbd_thread_setup(void *arg)
1763 struct drbd_thread *thi = (struct drbd_thread *) arg;
1764 struct drbd_conf *mdev = thi->mdev;
1765 unsigned long flags;
1769 retval = thi->function(thi);
1771 spin_lock_irqsave(&thi->t_lock, flags);
1773 /* if the receiver has been "Exiting", the last thing it did
1774 * was set the conn state to "StandAlone",
1775 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
1776 * and receiver thread will be "started".
1777 * drbd_thread_start needs to set "Restarting" in that case.
1778 * t_state check and assignment need to be within the same spinlock,
1779 * so either thread_start sees Exiting, and can remap to Restarting,
1780 * or thread_start sees None, and can proceed as normal.
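 *
 * Rough summary of the thread states involved (see also drbd_thread_start()
 * and _drbd_thread_stop() below):
 *
 *   None --start--> Running --stop--> Exiting --thread exits--> None
 *   Running --stop(restart)--> Restarting --thread loops again--> Running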
1783 if (thi->t_state == Restarting) {
1784 dev_info(DEV, "Restarting %s\n", current->comm);
1785 thi->t_state = Running;
1786 spin_unlock_irqrestore(&thi->t_lock, flags);
1791 thi->t_state = None;
1793 complete(&thi->stop);
1794 spin_unlock_irqrestore(&thi->t_lock, flags);
1796 dev_info(DEV, "Terminating %s\n", current->comm);
1798 /* Release mod reference taken when thread was started */
1799 module_put(THIS_MODULE);
1803 static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
1804 int (*func) (struct drbd_thread *))
1806 spin_lock_init(&thi->t_lock);
1808 thi->t_state = None;
1809 thi->function = func;
1813 int drbd_thread_start(struct drbd_thread *thi)
1815 struct drbd_conf *mdev = thi->mdev;
1816 struct task_struct *nt;
1817 unsigned long flags;
1820 thi == &mdev->receiver ? "receiver" :
1821 thi == &mdev->asender ? "asender" :
1822 thi == &mdev->worker ? "worker" : "NONSENSE";
1824 /* is used from state engine doing drbd_thread_stop_nowait,
1825 * while holding the req lock irqsave */
1826 spin_lock_irqsave(&thi->t_lock, flags);
1828 switch (thi->t_state) {
1830 dev_info(DEV, "Starting %s thread (from %s [%d])\n",
1831 me, current->comm, current->pid);
1833 /* Get ref on module for thread - this is released when thread exits */
1834 if (!try_module_get(THIS_MODULE)) {
1835 dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
1836 spin_unlock_irqrestore(&thi->t_lock, flags);
1840 init_completion(&thi->stop);
1841 D_ASSERT(thi->task == NULL);
1842 thi->reset_cpu_mask = 1;
1843 thi->t_state = Running;
1844 spin_unlock_irqrestore(&thi->t_lock, flags);
1845 flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
1847 nt = kthread_create(drbd_thread_setup, (void *) thi,
1848 "drbd%d_%s", mdev_to_minor(mdev), me);
1851 dev_err(DEV, "Couldn't start thread\n");
1853 module_put(THIS_MODULE);
1856 spin_lock_irqsave(&thi->t_lock, flags);
1858 thi->t_state = Running;
1859 spin_unlock_irqrestore(&thi->t_lock, flags);
1860 wake_up_process(nt);
1863 thi->t_state = Restarting;
1864 dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
1865 me, current->comm, current->pid);
1870 spin_unlock_irqrestore(&thi->t_lock, flags);
1878 void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
1880 unsigned long flags;
1882 enum drbd_thread_state ns = restart ? Restarting : Exiting;
1884 /* may be called from state engine, holding the req lock irqsave */
1885 spin_lock_irqsave(&thi->t_lock, flags);
1887 if (thi->t_state == None) {
1888 spin_unlock_irqrestore(&thi->t_lock, flags);
1890 drbd_thread_start(thi);
1894 if (thi->t_state != ns) {
1895 if (thi->task == NULL) {
1896 spin_unlock_irqrestore(&thi->t_lock, flags);
1902 init_completion(&thi->stop);
1903 if (thi->task != current)
1904 force_sig(DRBD_SIGKILL, thi->task);
1908 spin_unlock_irqrestore(&thi->t_lock, flags);
1911 wait_for_completion(&thi->stop);
1916 * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
1917 * @mdev: DRBD device.
1919 * Forces all threads of a device onto the same CPU. This is beneficial for
1920 * DRBD's performance. May be overridden by the user's configuration.
1922 void drbd_calc_cpu_mask(struct drbd_conf *mdev)
1926 /* user override. */
1927 if (cpumask_weight(mdev->cpu_mask))
1930 ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
1931 for_each_online_cpu(cpu) {
1933 cpumask_set_cpu(cpu, mdev->cpu_mask);
1937 /* should not be reached */
1938 cpumask_setall(mdev->cpu_mask);
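/*
 * Example of the mapping above: with four online CPUs, minors 0..3 end
 * up on CPUs 0..3 respectively, minor 4 on CPU 0 again, and so on
 * (ord = minor modulo the number of online CPUs).
 */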
1942 * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
1943 * @mdev: DRBD device.
1945 * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
1948 void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
1950 struct task_struct *p = current;
1951 struct drbd_thread *thi =
1952 p == mdev->asender.task ? &mdev->asender :
1953 p == mdev->receiver.task ? &mdev->receiver :
1954 p == mdev->worker.task ? &mdev->worker :
1958 if (!thi->reset_cpu_mask)
1960 thi->reset_cpu_mask = 0;
1961 set_cpus_allowed_ptr(p, mdev->cpu_mask);
1965 /* the appropriate socket mutex must be held already */
1966 int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
1967 enum drbd_packets cmd, struct p_header80 *h,
1968 size_t size, unsigned msg_flags)
1972 ERR_IF(!h) return false;
1973 ERR_IF(!size) return false;
1975 h->magic = BE_DRBD_MAGIC;
1976 h->command = cpu_to_be16(cmd);
1977 h->length = cpu_to_be16(size-sizeof(struct p_header80));
1979 sent = drbd_send(mdev, sock, h, size, msg_flags);
1981 ok = (sent == size);
1982 if (!ok && !signal_pending(current))
1983 dev_warn(DEV, "short sent %s size=%d sent=%d\n",
1984 cmdname(cmd), (int)size, sent);
1988 /* don't pass the socket. we may only look at it
1989 * when we hold the appropriate socket mutex.
1991 int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
1992 enum drbd_packets cmd, struct p_header80 *h, size_t size)
1995 struct socket *sock;
1997 if (use_data_socket) {
1998 mutex_lock(&mdev->data.mutex);
1999 sock = mdev->data.socket;
2001 mutex_lock(&mdev->meta.mutex);
2002 sock = mdev->meta.socket;
2005 /* drbd_disconnect() could have called drbd_free_sock()
2006 * while we were waiting in down()... */
2007 if (likely(sock != NULL))
2008 ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
2010 if (use_data_socket)
2011 mutex_unlock(&mdev->data.mutex);
2013 mutex_unlock(&mdev->meta.mutex);
2017 int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
2020 struct p_header80 h;
2023 h.magic = BE_DRBD_MAGIC;
2024 h.command = cpu_to_be16(cmd);
2025 h.length = cpu_to_be16(size);
2027 if (!drbd_get_data_sock(mdev))
2031 drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
2033 drbd_send(mdev, mdev->data.socket, data, size, 0));
2035 drbd_put_data_sock(mdev);
2040 int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
2042 struct p_rs_param_95 *p;
2043 struct socket *sock;
2045 const int apv = mdev->agreed_pro_version;
2047 size = apv <= 87 ? sizeof(struct p_rs_param)
2048 : apv == 88 ? sizeof(struct p_rs_param)
2049 + strlen(mdev->sync_conf.verify_alg) + 1
2050 : apv <= 94 ? sizeof(struct p_rs_param_89)
2051 : /* apv >= 95 */ sizeof(struct p_rs_param_95);
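/* Example (illustrative): peers at protocol version 89..94 receive a
 * p_rs_param_89 (rate plus the verify_alg/csums_alg strings), while 95+
 * peers additionally get the dynamic resync controller settings
 * (c_plan_ahead, c_delay_target, ...) via p_rs_param_95. */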
2053 /* used from admin command context and receiver/worker context.
2054 * to avoid kmalloc, grab the socket right here,
2055 * then use the pre-allocated sbuf there */
2056 mutex_lock(&mdev->data.mutex);
2057 sock = mdev->data.socket;
2059 if (likely(sock != NULL)) {
2060 enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
2062 p = &mdev->data.sbuf.rs_param_95;
2064 /* initialize verify_alg and csums_alg */
2065 memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
2067 p->rate = cpu_to_be32(sc->rate);
2068 p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
2069 p->c_delay_target = cpu_to_be32(sc->c_delay_target);
2070 p->c_fill_target = cpu_to_be32(sc->c_fill_target);
2071 p->c_max_rate = cpu_to_be32(sc->c_max_rate);
2074 strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
2076 strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
2078 rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
2080 rv = 0; /* not ok */
2082 mutex_unlock(&mdev->data.mutex);
2087 int drbd_send_protocol(struct drbd_conf *mdev)
2089 struct p_protocol *p;
2092 size = sizeof(struct p_protocol);
2094 if (mdev->agreed_pro_version >= 87)
2095 size += strlen(mdev->net_conf->integrity_alg) + 1;
2097 /* we must not recurse into our own queue,
2098 * as that is blocked during handshake */
2099 p = kmalloc(size, GFP_NOIO);
2103 p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
2104 p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
2105 p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
2106 p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
2107 p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
2110 if (mdev->net_conf->want_lose)
2112 if (mdev->net_conf->dry_run) {
2113 if (mdev->agreed_pro_version >= 92)
2116 dev_err(DEV, "--dry-run is not supported by peer");
2121 p->conn_flags = cpu_to_be32(cf);
2123 if (mdev->agreed_pro_version >= 87)
2124 strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
2126 rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
2127 (struct p_header80 *)p, size);
2132 int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
2137 if (!get_ldev_if_state(mdev, D_NEGOTIATING))
2140 spin_lock_irq(&mdev->ldev->md.uuid_lock);
2141 for (i = UI_CURRENT; i < UI_SIZE; i++)
2142 p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
2143 spin_unlock_irq(&mdev->ldev->md.uuid_lock);
2145 mdev->comm_bm_set = drbd_bm_total_weight(mdev);
2146 p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
2147 uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
2148 uuid_flags |= drbd_test_flag(mdev, CRASHED_PRIMARY) ? 2 : 0;
2149 uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
2150 p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
2154 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
2155 (struct p_header80 *)&p, sizeof(p));
2158 int drbd_send_uuids(struct drbd_conf *mdev)
2160 return _drbd_send_uuids(mdev, 0);
2163 int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev)
2165 return _drbd_send_uuids(mdev, 8);
2168 void drbd_print_uuids(struct drbd_conf *mdev, const char *text)
2170 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2171 u64 *uuid = mdev->ldev->md.uuid;
2172 dev_info(DEV, "%s %016llX:%016llX:%016llX:%016llX\n",
2174 (unsigned long long)uuid[UI_CURRENT],
2175 (unsigned long long)uuid[UI_BITMAP],
2176 (unsigned long long)uuid[UI_HISTORY_START],
2177 (unsigned long long)uuid[UI_HISTORY_END]);
2180 dev_info(DEV, "%s effective data uuid: %016llX\n",
2182 (unsigned long long)mdev->ed_uuid);
2186 int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
2191 D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
2193 uuid = mdev->ldev->md.uuid[UI_BITMAP];
2194 if (uuid && uuid != UUID_JUST_CREATED)
2195 uuid = uuid + UUID_NEW_BM_OFFSET;
2197 get_random_bytes(&uuid, sizeof(u64));
2198 drbd_uuid_set(mdev, UI_BITMAP, uuid);
2199 drbd_print_uuids(mdev, "updated sync UUID");
2201 p.uuid = cpu_to_be64(uuid);
2203 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
2204 (struct p_header80 *)&p, sizeof(p));
2207 int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
2210 sector_t d_size, u_size;
2212 unsigned int max_bio_size;
2215 if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
2216 D_ASSERT(mdev->ldev->backing_bdev);
2217 d_size = drbd_get_max_capacity(mdev->ldev);
2218 u_size = mdev->ldev->dc.disk_size;
2219 q_order_type = drbd_queue_order_type(mdev);
2220 max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
2221 max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
2226 q_order_type = QUEUE_ORDERED_NONE;
2227 max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
2230 /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
2231 if (mdev->agreed_pro_version <= 94)
2232 max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
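/* Example (illustrative): even if the local queue would allow much larger
 * bios, a peer still speaking protocol <= 94 (drbd up to 8.3.7) is never
 * told about more than DRBD_MAX_SIZE_H80_PACKET (32KiB) here. */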
2234 p.d_size = cpu_to_be64(d_size);
2235 p.u_size = cpu_to_be64(u_size);
2236 p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
2237 p.max_bio_size = cpu_to_be32(max_bio_size);
2238 p.queue_order_type = cpu_to_be16(q_order_type);
2239 p.dds_flags = cpu_to_be16(flags);
2241 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
2242 (struct p_header80 *)&p, sizeof(p));
2247 * drbd_send_current_state() - Sends the drbd state to the peer
2248 * @mdev: DRBD device.
2250 int drbd_send_current_state(struct drbd_conf *mdev)
2252 struct socket *sock;
2256 /* Grab state lock so we wont send state if we're in the middle
2257 * of a cluster wide state change on another thread */
2258 drbd_state_lock(mdev);
2260 mutex_lock(&mdev->data.mutex);
2262 p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
2263 sock = mdev->data.socket;
2265 if (likely(sock != NULL)) {
2266 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2267 (struct p_header80 *)&p, sizeof(p), 0);
2270 mutex_unlock(&mdev->data.mutex);
2272 drbd_state_unlock(mdev);
2277 * drbd_send_state() - After a state change, sends the new state to the peer
2278 * @mdev: DRBD device.
2279 * @state: the state to send, not necessarily the current state.
2281 * Each state change queues an "after_state_ch" work, which will eventually
2282 * send the resulting new state to the peer. If more state changes happen
2283 * between queuing and processing of the after_state_ch work, we still
2284 * want to send each intermediary state in the order it occurred.
2286 int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
2288 struct socket *sock;
2292 mutex_lock(&mdev->data.mutex);
2294 p.state = cpu_to_be32(state.i);
2295 sock = mdev->data.socket;
2297 if (likely(sock != NULL)) {
2298 ok = _drbd_send_cmd(mdev, sock, P_STATE,
2299 (struct p_header80 *)&p, sizeof(p), 0);
2302 mutex_unlock(&mdev->data.mutex);
2307 int drbd_send_state_req(struct drbd_conf *mdev,
2308 union drbd_state mask, union drbd_state val)
2310 struct p_req_state p;
2312 p.mask = cpu_to_be32(mask.i);
2313 p.val = cpu_to_be32(val.i);
2315 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
2316 (struct p_header80 *)&p, sizeof(p));
2319 int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
2321 struct p_req_state_reply p;
2323 p.retcode = cpu_to_be32(retcode);
2325 return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
2326 (struct p_header80 *)&p, sizeof(p));
2329 int fill_bitmap_rle_bits(struct drbd_conf *mdev,
2330 struct p_compressed_bm *p,
2331 struct bm_xfer_ctx *c)
2333 struct bitstream bs;
2334 unsigned long plain_bits;
2341 /* may we use this feature? */
2342 if ((mdev->sync_conf.use_rle == 0) ||
2343 (mdev->agreed_pro_version < 90))
2346 if (c->bit_offset >= c->bm_bits)
2347 return 0; /* nothing to do. */
2349 /* use at most this many bytes */
2350 bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
2351 memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
2352 /* plain bits covered in this code string */
2355 /* p->encoding & 0x80 stores whether the first run length is set.
2356 * bit offset is implicit.
2357 * start with toggle == 2 to be able to tell the first iteration */
2360 /* see how many plain bits we can stuff into one packet
2361 * using RLE and VLI. */
2363 tmp = (toggle == 0) ? _drbd_bm_find_next_zero(mdev, c->bit_offset)
2364 : _drbd_bm_find_next(mdev, c->bit_offset);
2367 rl = tmp - c->bit_offset;
2369 if (toggle == 2) { /* first iteration */
2371 /* the first checked bit was set,
2372 * store start value, */
2373 DCBP_set_start(p, 1);
2374 /* but skip encoding of zero run length */
2378 DCBP_set_start(p, 0);
2381 /* paranoia: catch zero runlength.
2382 * can only happen if bitmap is modified while we scan it. */
2384 dev_err(DEV, "unexpected zero runlength while encoding bitmap "
2385 "t:%u bo:%lu\n", toggle, c->bit_offset);
2389 bits = vli_encode_bits(&bs, rl);
2390 if (bits == -ENOBUFS) /* buffer full */
2393 dev_err(DEV, "error while encoding bitmap: %d\n", bits);
2399 c->bit_offset = tmp;
2400 } while (c->bit_offset < c->bm_bits);
2402 len = bs.cur.b - p->code + !!bs.cur.bit;
2404 if (plain_bits < (len << 3)) {
2405 /* incompressible with this method.
2406 * we need to rewind both word and bit position. */
2407 c->bit_offset -= plain_bits;
2408 bm_xfer_ctx_bit_to_word_offset(c);
2409 c->bit_offset = c->word_offset * BITS_PER_LONG;
2413 /* RLE + VLI was able to compress it just fine.
2414 * update c->word_offset. */
2415 bm_xfer_ctx_bit_to_word_offset(c);
2417 /* store pad_bits */
2418 DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
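/* Worked example (illustrative, not from the original source): a stretch of
 * 1000 clear bits, 8 set bits and 992 clear bits is encoded as the run
 * lengths 1000, 8, 992 (a few VLI bytes) instead of 2000 plain bits
 * (250 bytes).  The "plain_bits < (len << 3)" check above rejects the
 * encoding whenever the VLI stream would be larger than the plain bits it
 * covers, e.g. a region that toggles every other bit. */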
2424 * send_bitmap_rle_or_plain
2426 * Return 0 when done, 1 when another iteration is needed, and a negative error
2427 * code upon failure.
2430 send_bitmap_rle_or_plain(struct drbd_conf *mdev,
2431 struct p_header80 *h, struct bm_xfer_ctx *c)
2433 struct p_compressed_bm *p = (void*)h;
2434 unsigned long num_words;
2438 len = fill_bitmap_rle_bits(mdev, p, c);
2444 DCBP_set_code(p, RLE_VLI_Bits);
2445 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
2446 sizeof(*p) + len, 0);
2449 c->bytes[0] += sizeof(*p) + len;
2451 if (c->bit_offset >= c->bm_bits)
2454 /* was not compressible.
2455 * send a buffer full of plain text bits instead. */
2456 num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
2457 len = num_words * sizeof(long);
2459 drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
2460 ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
2461 h, sizeof(struct p_header80) + len, 0);
2462 c->word_offset += num_words;
2463 c->bit_offset = c->word_offset * BITS_PER_LONG;
2466 c->bytes[1] += sizeof(struct p_header80) + len;
2468 if (c->bit_offset > c->bm_bits)
2469 c->bit_offset = c->bm_bits;
2473 INFO_bm_xfer_stats(mdev, "send", c);
2481 /* See the comment at receive_bitmap() */
2482 int _drbd_send_bitmap(struct drbd_conf *mdev)
2484 struct bm_xfer_ctx c;
2485 struct p_header80 *p;
2488 ERR_IF(!mdev->bitmap) return false;
2490 /* maybe we should use some per thread scratch page,
2491 * and allocate that during initial device creation? */
2492 p = (struct p_header80 *) __get_free_page(GFP_NOIO);
2494 dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
2498 if (get_ldev(mdev)) {
2499 if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
2500 dev_info(DEV, "Writing the whole bitmap, MDF_FullSync was set.\n");
2501 drbd_bm_set_all(mdev);
2502 if (drbd_bm_write(mdev)) {
2503 /* write_bm did fail! Leave full sync flag set in Meta P_DATA
2504 * but otherwise process as per normal - need to tell other
2505 * side that a full resync is required! */
2506 dev_err(DEV, "Failed to write bitmap to disk!\n");
2508 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
2515 c = (struct bm_xfer_ctx) {
2516 .bm_bits = drbd_bm_bits(mdev),
2517 .bm_words = drbd_bm_words(mdev),
2521 err = send_bitmap_rle_or_plain(mdev, p, &c);
2524 free_page((unsigned long) p);
2528 int drbd_send_bitmap(struct drbd_conf *mdev)
2532 if (!drbd_get_data_sock(mdev))
2534 err = !_drbd_send_bitmap(mdev);
2535 drbd_put_data_sock(mdev);
2539 int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
2542 struct p_barrier_ack p;
2544 p.barrier = barrier_nr;
2545 p.set_size = cpu_to_be32(set_size);
2547 if (mdev->state.conn < C_CONNECTED)
2549 ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
2550 (struct p_header80 *)&p, sizeof(p));
2555 * _drbd_send_ack() - Sends an ack packet
2556 * @mdev: DRBD device.
2557 * @cmd: Packet command code.
2558 * @sector: sector, needs to be in big endian byte order
2559 * @blksize: size in byte, needs to be in big endian byte order
2560 * @block_id: Id, big endian byte order
2562 static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
2568 struct p_block_ack p;
2571 p.block_id = block_id;
2572 p.blksize = blksize;
2573 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2575 if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
2577 ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
2578 (struct p_header80 *)&p, sizeof(p));
2582 /* dp->sector and dp->block_id already/still in network byte order,
2583 * data_size is payload size according to dp->head,
2584 * and may need to be corrected for digest size. */
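/* Example (illustrative): with agreed_pro_version >= 87 and a 20 byte
 * integrity digest configured, a P_DATA packet announcing 4096 + 20 payload
 * bytes is acknowledged for 4096 bytes only -- the digest is subtracted
 * from data_size below. */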
2585 int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
2586 struct p_data *dp, int data_size)
2588 data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
2589 crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
2590 return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
2594 int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
2595 struct p_block_req *rp)
2597 return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
2601 * drbd_send_ack() - Sends an ack packet
2602 * @mdev: DRBD device.
2603 * @cmd: Packet command code.
2606 int drbd_send_ack(struct drbd_conf *mdev,
2607 enum drbd_packets cmd, struct drbd_epoch_entry *e)
2609 return _drbd_send_ack(mdev, cmd,
2610 cpu_to_be64(e->sector),
2611 cpu_to_be32(e->size),
2615 /* This function misuses the block_id field to signal if the blocks
2616 * are in sync or not. */
2617 int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
2618 sector_t sector, int blksize, u64 block_id)
2620 return _drbd_send_ack(mdev, cmd,
2621 cpu_to_be64(sector),
2622 cpu_to_be32(blksize),
2623 cpu_to_be64(block_id));
2626 int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
2627 sector_t sector, int size, u64 block_id)
2630 struct p_block_req p;
2632 p.sector = cpu_to_be64(sector);
2633 p.block_id = block_id;
2634 p.blksize = cpu_to_be32(size);
2636 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
2637 (struct p_header80 *)&p, sizeof(p));
2641 int drbd_send_drequest_csum(struct drbd_conf *mdev,
2642 sector_t sector, int size,
2643 void *digest, int digest_size,
2644 enum drbd_packets cmd)
2647 struct p_block_req p;
2649 p.sector = cpu_to_be64(sector);
2650 p.block_id = BE_DRBD_MAGIC + 0xbeef;
2651 p.blksize = cpu_to_be32(size);
2653 p.head.magic = BE_DRBD_MAGIC;
2654 p.head.command = cpu_to_be16(cmd);
2655 p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
2657 mutex_lock(&mdev->data.mutex);
2659 ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
2660 ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
2662 mutex_unlock(&mdev->data.mutex);
2667 int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
2670 struct p_block_req p;
2672 p.sector = cpu_to_be64(sector);
2673 p.block_id = BE_DRBD_MAGIC + 0xbabe;
2674 p.blksize = cpu_to_be32(size);
2676 ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
2677 (struct p_header80 *)&p, sizeof(p));
2681 /* called on sndtimeo
2682 * returns false if we should retry,
2683 * true if we think connection is dead
2685 static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
2688 /* long elapsed = (long)(jiffies - mdev->last_received); */
2690 drop_it = mdev->meta.socket == sock
2691 || !mdev->asender.task
2692 || get_t_state(&mdev->asender) != Running
2693 || mdev->state.conn < C_CONNECTED;
2698 drop_it = !--mdev->ko_count;
2700 dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
2701 current->comm, current->pid, mdev->ko_count);
2705 return drop_it; /* && (mdev->state == R_PRIMARY) */;
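/* Example (illustrative): with net_conf->ko_count == 3, the data socket is
 * granted three consecutive send timeouts before the connection is declared
 * dead; ko_count is re-armed from net_conf->ko_count at the start of every
 * drbd_send() on the data socket (see drbd_send() further below). */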
2708 /* The idea of sendpage seems to be to put some kind of reference
2709 * to the page into the skb, and to hand it over to the NIC. In
2710 * this process get_page() gets called.
2712 * As soon as the page was really sent over the network put_page()
2713 * gets called by some part of the network layer. [ NIC driver? ]
2715 * [ get_page() / put_page() increment/decrement the count. If count
2716 * reaches 0 the page will be freed. ]
2718 * This works nicely with pages from FSs.
2719 * But this means that in protocol A we might signal IO completion too early!
2721 * In order not to corrupt data during a resync we must make sure
2722 * that we do not reuse our own buffer pages (EEs) too early, therefore
2723 * we have the net_ee list.
2725 * XFS seems to have problems, still, it submits pages with page_count == 0!
2726 * As a workaround, we disable sendpage on pages
2727 * with page_count == 0 or PageSlab.
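/* In short (illustrative summary of the code below): _drbd_send_page()
 * prefers the zero-copy ->sendpage() path and falls back to
 * _drbd_no_send_page(), which kmap()s the page and copies it into the
 * socket, whenever disable_sendpage is set, page_count(page) < 1, or
 * PageSlab(page). */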
2729 static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
2730 int offset, size_t size, unsigned msg_flags)
2732 int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
2735 mdev->send_cnt += size>>9;
2736 return sent == size;
2739 static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
2740 int offset, size_t size, unsigned msg_flags)
2742 mm_segment_t oldfs = get_fs();
2746 /* e.g. XFS meta- & log-data is in slab pages, which have a
2747 * page_count of 0 and/or have PageSlab() set.
2748 * we cannot use send_page for those, as that does get_page();
2749 * put_page(); and would cause either a VM_BUG directly, or
2750 * __page_cache_release a page that would actually still be referenced
2751 * by someone, leading to some obscure delayed Oops somewhere else. */
2752 if (disable_sendpage || (page_count(page) < 1) || PageSlab(page))
2753 return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
2755 msg_flags |= MSG_NOSIGNAL;
2756 drbd_update_congested(mdev);
2759 sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
2762 if (sent == -EAGAIN) {
2763 if (we_should_drop_the_connection(mdev,
2770 dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
2771 __func__, (int)size, len, sent);
2776 } while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
2778 drbd_clear_flag(mdev, NET_CONGESTED);
2782 mdev->send_cnt += size>>9;
2786 static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
2788 struct bio_vec *bvec;
2790 /* hint all but last page with MSG_MORE */
2791 bio_for_each_segment(bvec, bio, i) {
2792 if (!_drbd_no_send_page(mdev, bvec->bv_page,
2793 bvec->bv_offset, bvec->bv_len,
2794 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2800 static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
2802 struct bio_vec *bvec;
2804 /* hint all but last page with MSG_MORE */
2805 bio_for_each_segment(bvec, bio, i) {
2806 if (!_drbd_send_page(mdev, bvec->bv_page,
2807 bvec->bv_offset, bvec->bv_len,
2808 i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
2814 static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
2816 struct page *page = e->pages;
2817 unsigned len = e->size;
2818 /* hint all but last page with MSG_MORE */
2819 page_chain_for_each(page) {
2820 unsigned l = min_t(unsigned, len, PAGE_SIZE);
2821 if (!_drbd_send_page(mdev, page, 0, l,
2822 page_chain_next(page) ? MSG_MORE : 0))
2829 static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
2831 if (mdev->agreed_pro_version >= 95)
2832 return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
2833 (bi_rw & REQ_FUA ? DP_FUA : 0) |
2834 (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
2835 (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
2837 return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0;
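/* Example (illustrative): on protocol 95 or newer a bio with
 * REQ_SYNC | REQ_FUA is sent as DP_RW_SYNC | DP_FUA; peers at protocol
 * <= 94 only ever see DP_RW_SYNC, so FUA/FLUSH/DISCARD hints are lost
 * towards them. */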
2840 /* Used to send write requests
2841 * R_PRIMARY -> Peer (P_DATA)
2843 int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
2847 unsigned int dp_flags = 0;
2851 if (!drbd_get_data_sock(mdev))
2854 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2855 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2857 if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
2858 p.head.h80.magic = BE_DRBD_MAGIC;
2859 p.head.h80.command = cpu_to_be16(P_DATA);
2861 cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
2863 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2864 p.head.h95.command = cpu_to_be16(P_DATA);
2866 cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
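/* Illustrative note: requests that fit the 16 bit length field of the old
 * p_header80 (<= DRBD_MAX_SIZE_H80_PACKET) keep using it; anything larger
 * switches to the h95 header with its 32 bit length field. */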
2869 p.sector = cpu_to_be64(req->sector);
2870 p.block_id = (unsigned long)req;
2871 p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
2873 dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
2875 if (mdev->state.conn >= C_SYNC_SOURCE &&
2876 mdev->state.conn <= C_PAUSED_SYNC_T)
2877 dp_flags |= DP_MAY_SET_IN_SYNC;
2879 p.dp_flags = cpu_to_be32(dp_flags);
2880 drbd_set_flag(mdev, UNPLUG_REMOTE);
2882 drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
2884 dgb = mdev->int_dig_out;
2885 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
2886 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2889 /* For protocol A, we have to memcpy the payload into
2890 * socket buffers, as we may complete right away
2891 * as soon as we handed it over to tcp, at which point the data
2892 * pages may become invalid.
2894 * For data-integrity enabled, we copy it as well, so we can be
2895 * sure that even if the bio pages may still be modified, it
2896 * won't change the data on the wire, thus if the digest checks
2897 * out ok after sending on this side, but does not fit on the
2898 * receiving side, we sure have detected corruption elsewhere.
2900 if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
2901 ok = _drbd_send_bio(mdev, req->master_bio);
2903 ok = _drbd_send_zc_bio(mdev, req->master_bio);
2905 /* double check digest, sometimes buffers have been modified in flight. */
2906 if (dgs > 0 && dgs <= 64) {
2907 /* 64 byte, 512 bit, is the largest digest size
2908 * currently supported in kernel crypto. */
2909 unsigned char digest[64];
2910 drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
2911 if (memcmp(mdev->int_dig_out, digest, dgs)) {
2913 "Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
2914 (unsigned long long)req->sector, req->size);
2916 } /* else if (dgs > 64) {
2917 ... Be noisy about digest too large ...
2921 drbd_put_data_sock(mdev);
2926 /* answer packet, used to send data back for read requests:
2927 * Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
2928 * C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
2930 int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
2931 struct drbd_epoch_entry *e)
2938 dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
2939 crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
2941 if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
2942 p.head.h80.magic = BE_DRBD_MAGIC;
2943 p.head.h80.command = cpu_to_be16(cmd);
2945 cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2947 p.head.h95.magic = BE_DRBD_MAGIC_BIG;
2948 p.head.h95.command = cpu_to_be16(cmd);
2950 cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
2953 p.sector = cpu_to_be64(e->sector);
2954 p.block_id = e->block_id;
2955 /* p.seq_num = 0; No sequence numbers here.. */
2957 /* Only called by our kernel thread.
2958 * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
2959 * in response to admin command or module unload.
2961 if (!drbd_get_data_sock(mdev))
2964 ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
2966 dgb = mdev->int_dig_out;
2967 drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
2968 ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
2971 ok = _drbd_send_zc_ee(mdev, e);
2973 drbd_put_data_sock(mdev);
2978 int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
2980 struct p_block_desc p;
2982 p.sector = cpu_to_be64(req->sector);
2983 p.blksize = cpu_to_be32(req->size);
2985 return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
2989 drbd_send distinguishes two cases:
2991 Packets sent via the data socket "sock"
2992 and packets sent via the meta data socket "msock"
                       sock                      msock
2995 -----------------+-------------------------+------------------------------
2996 timeout           conf.timeout / 2          conf.timeout / 2
2997 timeout action    send a ping via msock     Abort communication
2998                                             and close all sockets
3002 * you must have down()ed the appropriate [m]sock_mutex elsewhere!
3004 int drbd_send(struct drbd_conf *mdev, struct socket *sock,
3005 void *buf, size_t size, unsigned msg_flags)
3014 /* THINK if (signal_pending) return ... ? */
3019 msg.msg_name = NULL;
3020 msg.msg_namelen = 0;
3021 msg.msg_control = NULL;
3022 msg.msg_controllen = 0;
3023 msg.msg_flags = msg_flags | MSG_NOSIGNAL;
3025 if (sock == mdev->data.socket) {
3026 mdev->ko_count = mdev->net_conf->ko_count;
3027 drbd_update_congested(mdev);
3031 * tcp_sendmsg does _not_ use its size parameter at all ?
3033 * -EAGAIN on timeout, -EINTR on signal.
3036 * do we need to block DRBD_SIG if sock == &meta.socket ??
3037 * otherwise wake_asender() might interrupt some send_*Ack !
3039 rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
3040 if (rv == -EAGAIN) {
3041 if (we_should_drop_the_connection(mdev, sock))
3048 flush_signals(current);
3056 } while (sent < size);
3058 if (sock == mdev->data.socket)
3059 drbd_clear_flag(mdev, NET_CONGESTED);
3062 if (rv != -EAGAIN) {
3063 dev_err(DEV, "%s_sendmsg returned %d\n",
3064 sock == mdev->meta.socket ? "msock" : "sock",
3066 drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
3068 drbd_force_state(mdev, NS(conn, C_TIMEOUT));
3074 static int drbd_open(struct block_device *bdev, fmode_t mode)
3076 struct drbd_conf *mdev = bdev->bd_disk->private_data;
3077 unsigned long flags;
3080 mutex_lock(&drbd_main_mutex);
3081 spin_lock_irqsave(&mdev->req_lock, flags);
3082 /* to have a stable mdev->state.role
3083 * and no race with updating open_cnt */
3085 if (mdev->state.role != R_PRIMARY) {
3086 if (mode & FMODE_WRITE)
3088 else if (!allow_oos)
3094 spin_unlock_irqrestore(&mdev->req_lock, flags);
3095 mutex_unlock(&drbd_main_mutex);
3100 static int drbd_release(struct gendisk *gd, fmode_t mode)
3102 struct drbd_conf *mdev = gd->private_data;
3103 mutex_lock(&drbd_main_mutex);
3105 mutex_unlock(&drbd_main_mutex);
3109 static void drbd_set_defaults(struct drbd_conf *mdev)
3111 /* This way we get a compile error when sync_conf grows,
3112 and we forget to initialize it here */
3113 mdev->sync_conf = (struct syncer_conf) {
3114 /* .rate = */ DRBD_RATE_DEF,
3115 /* .after = */ DRBD_AFTER_DEF,
3116 /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
3117 /* .verify_alg = */ {}, 0,
3118 /* .cpu_mask = */ {}, 0,
3119 /* .csums_alg = */ {}, 0,
3121 /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
3122 /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
3123 /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
3124 /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
3125 /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
3126 /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
3129 /* Have to do it this way, because the layout differs between
3130 big-endian and little-endian machines */
3131 mdev->state = (union drbd_state) {
3132 { .role = R_SECONDARY,
3134 .conn = C_STANDALONE,
3143 void drbd_init_set_defaults(struct drbd_conf *mdev)
3145 /* the memset(,0,) did most of this.
3146 * note: only assignments, no allocation in here */
3148 drbd_set_defaults(mdev);
3150 atomic_set(&mdev->ap_bio_cnt, 0);
3151 atomic_set(&mdev->ap_pending_cnt, 0);
3152 atomic_set(&mdev->rs_pending_cnt, 0);
3153 atomic_set(&mdev->unacked_cnt, 0);
3154 atomic_set(&mdev->local_cnt, 0);
3155 atomic_set(&mdev->net_cnt, 0);
3156 atomic_set(&mdev->packet_seq, 0);
3157 atomic_set(&mdev->pp_in_use, 0);
3158 atomic_set(&mdev->pp_in_use_by_net, 0);
3159 atomic_set(&mdev->rs_sect_in, 0);
3160 atomic_set(&mdev->rs_sect_ev, 0);
3161 atomic_set(&mdev->ap_in_flight, 0);
3162 atomic_set(&mdev->md_io_in_use, 0);
3164 mutex_init(&mdev->data.mutex);
3165 mutex_init(&mdev->meta.mutex);
3166 sema_init(&mdev->data.work.s, 0);
3167 sema_init(&mdev->meta.work.s, 0);
3168 mutex_init(&mdev->state_mutex);
3170 spin_lock_init(&mdev->data.work.q_lock);
3171 spin_lock_init(&mdev->meta.work.q_lock);
3173 spin_lock_init(&mdev->al_lock);
3174 spin_lock_init(&mdev->req_lock);
3175 spin_lock_init(&mdev->peer_seq_lock);
3176 spin_lock_init(&mdev->epoch_lock);
3178 INIT_LIST_HEAD(&mdev->active_ee);
3179 INIT_LIST_HEAD(&mdev->sync_ee);
3180 INIT_LIST_HEAD(&mdev->done_ee);
3181 INIT_LIST_HEAD(&mdev->read_ee);
3182 INIT_LIST_HEAD(&mdev->net_ee);
3183 INIT_LIST_HEAD(&mdev->resync_reads);
3184 INIT_LIST_HEAD(&mdev->data.work.q);
3185 INIT_LIST_HEAD(&mdev->meta.work.q);
3186 INIT_LIST_HEAD(&mdev->resync_work.list);
3187 INIT_LIST_HEAD(&mdev->unplug_work.list);
3188 INIT_LIST_HEAD(&mdev->go_diskless.list);
3189 INIT_LIST_HEAD(&mdev->md_sync_work.list);
3190 INIT_LIST_HEAD(&mdev->start_resync_work.list);
3191 INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
3193 mdev->resync_work.cb = w_resync_timer;
3194 mdev->unplug_work.cb = w_send_write_hint;
3195 mdev->go_diskless.cb = w_go_diskless;
3196 mdev->md_sync_work.cb = w_md_sync;
3197 mdev->bm_io_work.w.cb = w_bitmap_io;
3198 mdev->start_resync_work.cb = w_start_resync;
3199 init_timer(&mdev->resync_timer);
3200 init_timer(&mdev->md_sync_timer);
3201 init_timer(&mdev->start_resync_timer);
3202 init_timer(&mdev->request_timer);
3203 mdev->resync_timer.function = resync_timer_fn;
3204 mdev->resync_timer.data = (unsigned long) mdev;
3205 mdev->md_sync_timer.function = md_sync_timer_fn;
3206 mdev->md_sync_timer.data = (unsigned long) mdev;
3207 mdev->start_resync_timer.function = start_resync_timer_fn;
3208 mdev->start_resync_timer.data = (unsigned long) mdev;
3209 mdev->request_timer.function = request_timer_fn;
3210 mdev->request_timer.data = (unsigned long) mdev;
3212 init_waitqueue_head(&mdev->misc_wait);
3213 init_waitqueue_head(&mdev->state_wait);
3214 init_waitqueue_head(&mdev->net_cnt_wait);
3215 init_waitqueue_head(&mdev->ee_wait);
3216 init_waitqueue_head(&mdev->al_wait);
3217 init_waitqueue_head(&mdev->seq_wait);
3219 drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
3220 drbd_thread_init(mdev, &mdev->worker, drbd_worker);
3221 drbd_thread_init(mdev, &mdev->asender, drbd_asender);
3223 mdev->agreed_pro_version = PRO_VERSION_MAX;
3224 mdev->write_ordering = WO_bdev_flush;
3225 mdev->resync_wenr = LC_FREE;
3226 mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3227 mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
3230 void drbd_mdev_cleanup(struct drbd_conf *mdev)
3233 if (mdev->receiver.t_state != None)
3234 dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
3235 mdev->receiver.t_state);
3237 /* no need to lock it, I'm the only thread alive */
3238 if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
3239 dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
3249 mdev->rs_failed = 0;
3250 mdev->rs_last_events = 0;
3251 mdev->rs_last_sect_ev = 0;
3252 for (i = 0; i < DRBD_SYNC_MARKS; i++) {
3253 mdev->rs_mark_left[i] = 0;
3254 mdev->rs_mark_time[i] = 0;
3256 D_ASSERT(mdev->net_conf == NULL);
3258 drbd_set_my_capacity(mdev, 0);
3260 /* maybe never allocated. */
3261 drbd_bm_resize(mdev, 0, 1);
3262 drbd_bm_cleanup(mdev);
3265 drbd_free_resources(mdev);
3266 drbd_clear_flag(mdev, AL_SUSPENDED);
3269 * currently we drbd_init_ee only on module load, so
3270 * we may do drbd_release_ee only on module unload!
3272 D_ASSERT(list_empty(&mdev->active_ee));
3273 D_ASSERT(list_empty(&mdev->sync_ee));
3274 D_ASSERT(list_empty(&mdev->done_ee));
3275 D_ASSERT(list_empty(&mdev->read_ee));
3276 D_ASSERT(list_empty(&mdev->net_ee));
3277 D_ASSERT(list_empty(&mdev->resync_reads));
3278 D_ASSERT(list_empty(&mdev->data.work.q));
3279 D_ASSERT(list_empty(&mdev->meta.work.q));
3280 D_ASSERT(list_empty(&mdev->resync_work.list));
3281 D_ASSERT(list_empty(&mdev->unplug_work.list));
3282 D_ASSERT(list_empty(&mdev->go_diskless.list));
3284 drbd_set_defaults(mdev);
3288 static void drbd_destroy_mempools(void)
3292 while (drbd_pp_pool) {
3293 page = drbd_pp_pool;
3294 drbd_pp_pool = (struct page *)page_private(page);
3299 /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
3301 if (drbd_md_io_bio_set)
3302 bioset_free(drbd_md_io_bio_set);
3303 if (drbd_md_io_page_pool)
3304 mempool_destroy(drbd_md_io_page_pool);
3305 if (drbd_ee_mempool)
3306 mempool_destroy(drbd_ee_mempool);
3307 if (drbd_request_mempool)
3308 mempool_destroy(drbd_request_mempool);
3310 kmem_cache_destroy(drbd_ee_cache);
3311 if (drbd_request_cache)
3312 kmem_cache_destroy(drbd_request_cache);
3313 if (drbd_bm_ext_cache)
3314 kmem_cache_destroy(drbd_bm_ext_cache);
3315 if (drbd_al_ext_cache)
3316 kmem_cache_destroy(drbd_al_ext_cache);
3318 drbd_md_io_bio_set = NULL;
3319 drbd_md_io_page_pool = NULL;
3320 drbd_ee_mempool = NULL;
3321 drbd_request_mempool = NULL;
3322 drbd_ee_cache = NULL;
3323 drbd_request_cache = NULL;
3324 drbd_bm_ext_cache = NULL;
3325 drbd_al_ext_cache = NULL;
3330 static int drbd_create_mempools(void)
3333 const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count;
3336 /* prepare our caches and mempools */
3337 drbd_request_mempool = NULL;
3338 drbd_ee_cache = NULL;
3339 drbd_request_cache = NULL;
3340 drbd_bm_ext_cache = NULL;
3341 drbd_al_ext_cache = NULL;
3342 drbd_pp_pool = NULL;
3343 drbd_md_io_page_pool = NULL;
3344 drbd_md_io_bio_set = NULL;
3347 drbd_request_cache = kmem_cache_create(
3348 "drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
3349 if (drbd_request_cache == NULL)
3352 drbd_ee_cache = kmem_cache_create(
3353 "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
3354 if (drbd_ee_cache == NULL)
3357 drbd_bm_ext_cache = kmem_cache_create(
3358 "drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
3359 if (drbd_bm_ext_cache == NULL)
3362 drbd_al_ext_cache = kmem_cache_create(
3363 "drbd_al", sizeof(struct lc_element), 0, 0, NULL);
3364 if (drbd_al_ext_cache == NULL)
3368 #ifdef COMPAT_HAVE_BIOSET_CREATE
3369 drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
3370 if (drbd_md_io_bio_set == NULL)
3374 drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
3375 if (drbd_md_io_page_pool == NULL)
3378 drbd_request_mempool = mempool_create(number,
3379 mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
3380 if (drbd_request_mempool == NULL)
3383 drbd_ee_mempool = mempool_create(number,
3384 mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
3385 if (drbd_ee_mempool == NULL)
3388 /* drbd's page pool */
3389 spin_lock_init(&drbd_pp_lock);
3391 for (i = 0; i < number; i++) {
3392 page = alloc_page(GFP_HIGHUSER);
3395 set_page_private(page, (unsigned long)drbd_pp_pool);
3396 drbd_pp_pool = page;
3398 drbd_pp_vacant = number;
3403 drbd_destroy_mempools(); /* in case we allocated some */
3407 static int drbd_notify_sys(struct notifier_block *this, unsigned long code,
3410 /* just so we have it. you never know what interesting things we
3411 * might want to do here some day...
3417 static struct notifier_block drbd_notifier = {
3418 .notifier_call = drbd_notify_sys,
3421 static void drbd_release_ee_lists(struct drbd_conf *mdev)
3425 rr = drbd_release_ee(mdev, &mdev->active_ee);
3427 dev_err(DEV, "%d EEs in active list found!\n", rr);
3429 rr = drbd_release_ee(mdev, &mdev->sync_ee);
3431 dev_err(DEV, "%d EEs in sync list found!\n", rr);
3433 rr = drbd_release_ee(mdev, &mdev->read_ee);
3435 dev_err(DEV, "%d EEs in read list found!\n", rr);
3437 rr = drbd_release_ee(mdev, &mdev->done_ee);
3439 dev_err(DEV, "%d EEs in done list found!\n", rr);
3441 rr = drbd_release_ee(mdev, &mdev->net_ee);
3443 dev_err(DEV, "%d EEs in net list found!\n", rr);
3446 /* caution. no locking.
3447 * currently only used from module cleanup code. */
3448 static void drbd_delete_device(unsigned int minor)
3450 struct drbd_conf *mdev = minor_to_mdev(minor);
3455 del_timer_sync(&mdev->request_timer);
3457 /* paranoia asserts */
3458 if (mdev->open_cnt != 0)
3459 dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
3460 __FILE__ , __LINE__);
3462 ERR_IF (!list_empty(&mdev->data.work.q)) {
3463 struct list_head *lp;
3464 list_for_each(lp, &mdev->data.work.q) {
3465 dev_err(DEV, "lp = %p\n", lp);
3468 /* end paranoia asserts */
3470 del_gendisk(mdev->vdisk);
3472 /* cleanup stuff that may have been allocated during
3473 * device (re-)configuration or state changes */
3475 if (mdev->this_bdev)
3476 bdput(mdev->this_bdev);
3478 drbd_free_resources(mdev);
3480 drbd_release_ee_lists(mdev);
3482 /* should be freed on disconnect? */
3483 kfree(mdev->ee_hash);
3485 mdev->ee_hash_s = 0;
3486 mdev->ee_hash = NULL;
3489 lc_destroy(mdev->act_log);
3490 lc_destroy(mdev->resync);
3492 kfree(mdev->p_uuid);
3493 /* mdev->p_uuid = NULL; */
3495 kfree(mdev->int_dig_out);
3496 kfree(mdev->int_dig_in);
3497 kfree(mdev->int_dig_vv);
3499 /* cleanup the rest that has been
3500 * allocated from drbd_new_device
3501 * and actually free the mdev itself */
3502 drbd_free_mdev(mdev);
3505 static void drbd_cleanup(void)
3509 unregister_reboot_notifier(&drbd_notifier);
3511 /* first remove proc,
3512 * drbdsetup uses its presence to detect
3513 * whether DRBD is loaded.
3514 * If we would get stuck in proc removal,
3515 * but have netlink already deregistered,
3516 * some drbdsetup commands may wait forever
3520 remove_proc_entry("drbd", NULL);
3527 drbd_delete_device(i);
3528 drbd_destroy_mempools();
3533 unregister_blkdev(DRBD_MAJOR, "drbd");
3535 printk(KERN_INFO "drbd: module cleanup done.\n");
3539 * drbd_congested() - Callback for the flusher thread
3540 * @congested_data: User data
3541 * @bdi_bits: Bits the BDI flusher thread is currently interested in
3543 * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
3545 static int drbd_congested(void *congested_data, int bdi_bits)
3547 struct drbd_conf *mdev = congested_data;
3548 struct request_queue *q;
3552 if (!may_inc_ap_bio(mdev)) {
3553 /* DRBD has frozen IO */
3559 if (drbd_test_flag(mdev, CALLBACK_PENDING)) {
3560 r |= (1 << BDI_async_congested);
3561 /* Without good local data, we would need to read from remote,
3562 * and that would need the worker thread as well, which is
3563 * currently blocked waiting for that usermode helper to
3566 if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
3567 r |= (1 << BDI_sync_congested);
3575 if (get_ldev(mdev)) {
3576 q = bdev_get_queue(mdev->ldev->backing_bdev);
3577 r = bdi_congested(&q->backing_dev_info, bdi_bits);
3583 if (bdi_bits & (1 << BDI_async_congested) && drbd_test_flag(mdev, NET_CONGESTED)) {
3584 r |= (1 << BDI_async_congested);
3585 reason = reason == 'b' ? 'a' : 'n';
3589 mdev->congestion_reason = reason;
3593 struct drbd_conf *drbd_new_device(unsigned int minor)
3595 struct drbd_conf *mdev;
3596 struct gendisk *disk;
3597 struct request_queue *q;
3599 /* GFP_KERNEL, we are outside of all write-out paths */
3600 mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
3603 if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
3604 goto out_no_cpumask;
3606 mdev->minor = minor;
3608 drbd_init_set_defaults(mdev);
3610 q = blk_alloc_queue(GFP_KERNEL);
3614 q->queuedata = mdev;
3616 disk = alloc_disk(1);
3621 set_disk_ro(disk, true);
3624 disk->major = DRBD_MAJOR;
3625 disk->first_minor = minor;
3626 disk->fops = &drbd_ops;
3627 sprintf(disk->disk_name, "drbd%d", minor);
3628 disk->private_data = mdev;
3630 mdev->this_bdev = bdget(MKDEV(DRBD_MAJOR, minor));
3631 /* we have no partitions. we contain only ourselves. */
3632 mdev->this_bdev->bd_contains = mdev->this_bdev;
3634 q->backing_dev_info.congested_fn = drbd_congested;
3635 q->backing_dev_info.congested_data = mdev;
3637 blk_queue_make_request(q, drbd_make_request);
3638 blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
3639 /* Setting max_hw_sectors to an odd value of 8 KiB here;
3640 this triggers a max_bio_size message upon first attach or connect */
3641 blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
3642 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
3643 blk_queue_merge_bvec(q, drbd_merge_bvec);
3644 q->queue_lock = &mdev->req_lock;
3646 mdev->md_io_page = alloc_page(GFP_KERNEL);
3647 if (!mdev->md_io_page)
3648 goto out_no_io_page;
3650 if (drbd_bm_init(mdev))
3652 /* no need to lock access, we are still initializing this minor device. */
3656 mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
3657 if (!mdev->app_reads_hash)
3658 goto out_no_app_reads;
3660 mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
3661 if (!mdev->current_epoch)
3664 INIT_LIST_HEAD(&mdev->current_epoch->list);
3669 /* out_whatever_else:
3670 kfree(mdev->current_epoch); */
3672 kfree(mdev->app_reads_hash);
3676 drbd_bm_cleanup(mdev);
3678 __free_page(mdev->md_io_page);
3682 blk_cleanup_queue(q);
3684 free_cpumask_var(mdev->cpu_mask);
3690 /* counterpart of drbd_new_device.
3691 * last part of drbd_delete_device. */
3692 void drbd_free_mdev(struct drbd_conf *mdev)
3694 kfree(mdev->current_epoch);
3695 kfree(mdev->app_reads_hash);
3697 if (mdev->bitmap) /* should no longer be there. */
3698 drbd_bm_cleanup(mdev);
3699 __free_page(mdev->md_io_page);
3700 put_disk(mdev->vdisk);
3701 blk_cleanup_queue(mdev->rq_queue);
3702 free_cpumask_var(mdev->cpu_mask);
3703 drbd_free_tl_hash(mdev);
3708 int __init drbd_init(void)
3712 if (sizeof(struct p_handshake) != 80) {
3714 "drbd: never change the size or layout "
3715 "of the HandShake packet.\n");
3719 if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
3721 "drbd: invalid minor_count (%d)\n", minor_count);
3729 err = drbd_nl_init();
3733 err = register_blkdev(DRBD_MAJOR, "drbd");
3736 "drbd: unable to register block device major %d\n",
3741 register_reboot_notifier(&drbd_notifier);
3744 * allocate all necessary structs
3748 init_waitqueue_head(&drbd_pp_wait);
3750 drbd_proc = NULL; /* play safe for drbd_cleanup */
3751 minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
3756 err = drbd_create_mempools();
3760 drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
3762 printk(KERN_ERR "drbd: unable to register proc file\n");
3766 rwlock_init(&global_state_lock);
3768 printk(KERN_INFO "drbd: initialized. "
3769 "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
3770 API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
3771 printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
3772 printk(KERN_INFO "drbd: registered as block device major %d\n",
3774 printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
3776 return 0; /* Success! */
3781 /* currently always the case */
3782 printk(KERN_ERR "drbd: ran out of memory\n");
3784 printk(KERN_ERR "drbd: initialization failure\n");
3788 void drbd_free_bc(struct drbd_backing_dev *ldev)
3793 blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3794 blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
3799 void drbd_free_sock(struct drbd_conf *mdev)
3801 if (mdev->data.socket) {
3802 mutex_lock(&mdev->data.mutex);
3803 kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
3804 sock_release(mdev->data.socket);
3805 mdev->data.socket = NULL;
3806 mutex_unlock(&mdev->data.mutex);
3808 if (mdev->meta.socket) {
3809 mutex_lock(&mdev->meta.mutex);
3810 kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
3811 sock_release(mdev->meta.socket);
3812 mdev->meta.socket = NULL;
3813 mutex_unlock(&mdev->meta.mutex);
3818 void drbd_free_resources(struct drbd_conf *mdev)
3820 crypto_free_hash(mdev->csums_tfm);
3821 mdev->csums_tfm = NULL;
3822 crypto_free_hash(mdev->verify_tfm);
3823 mdev->verify_tfm = NULL;
3824 crypto_free_hash(mdev->cram_hmac_tfm);
3825 mdev->cram_hmac_tfm = NULL;
3826 crypto_free_hash(mdev->integrity_w_tfm);
3827 mdev->integrity_w_tfm = NULL;
3828 crypto_free_hash(mdev->integrity_r_tfm);
3829 mdev->integrity_r_tfm = NULL;
3831 drbd_free_sock(mdev);
3834 drbd_free_bc(mdev->ldev);
3835 mdev->ldev = NULL;);
3838 /* meta data management */
3840 struct meta_data_on_disk {
3841 u64 la_size; /* last agreed size. */
3842 u64 uuid[UI_SIZE]; /* UUIDs. */
3845 u32 flags; /* MDF */
3848 u32 al_offset; /* offset to this block */
3849 u32 al_nr_extents; /* important for restoring the AL */
3850 /* `-- act_log->nr_elements <-- sync_conf.al_extents */
3851 u32 bm_offset; /* offset to the bitmap, from here */
3852 u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
3853 u32 la_peer_max_bio_size; /* last peer max_bio_size */
3854 u32 reserved_u32[3];
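/* Note (illustrative): every field of this structure is stored big endian
 * on disk; drbd_md_sync() below converts with cpu_to_be*() on write, e.g.
 *	buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
 * and drbd_md_read() converts back with the matching be*_to_cpu(). */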
3859 * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
3860 * @mdev: DRBD device.
3862 void drbd_md_sync(struct drbd_conf *mdev)
3864 struct meta_data_on_disk *buffer;
3868 del_timer(&mdev->md_sync_timer);
3869 /* timer may be rearmed by drbd_md_mark_dirty() now. */
3870 if (!drbd_test_and_clear_flag(mdev, MD_DIRTY))
3873 /* We use here D_FAILED and not D_ATTACHING because we try to write
3874 * metadata even if we detach due to a disk failure! */
3875 if (!get_ldev_if_state(mdev, D_FAILED))
3878 buffer = drbd_md_get_buffer(mdev);
3882 memset(buffer, 0, 512);
3884 buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
3885 for (i = UI_CURRENT; i < UI_SIZE; i++)
3886 buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
3887 buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
3888 buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
3890 buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
3891 buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
3892 buffer->al_nr_extents = cpu_to_be32(mdev->act_log->nr_elements);
3893 buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
3894 buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid);
3896 buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset);
3897 buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size);
3899 D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
3900 sector = mdev->ldev->md.md_offset;
3902 if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
3903 /* this was a try anyways ... */
3904 dev_err(DEV, "meta data update failed!\n");
3905 drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
3908 /* Update mdev->ldev->md.la_size_sect,
3909 * since we updated it on metadata. */
3910 mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
3912 drbd_md_put_buffer(mdev);
3918 * drbd_md_read() - Reads in the meta data super block
3919 * @mdev: DRBD device.
3920 * @bdev: Device from which the meta data should be read in.
3922 * Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
3923 * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
3925 int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
3927 struct meta_data_on_disk *buffer;
3928 int i, rv = NO_ERROR;
3930 if (!get_ldev_if_state(mdev, D_ATTACHING))
3931 return ERR_IO_MD_DISK;
3933 buffer = drbd_md_get_buffer(mdev);
3937 if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
3938 /* NOTE: can't do normal error processing here as this is
3939 called BEFORE disk is attached */
3940 dev_err(DEV, "Error while reading metadata.\n");
3941 rv = ERR_IO_MD_DISK;
3945 if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
3946 dev_err(DEV, "Error while reading metadata, magic not found.\n");
3947 rv = ERR_MD_INVALID;
3950 if (be32_to_cpu(buffer->al_offset) != bdev->md.al_offset) {
3951 dev_err(DEV, "unexpected al_offset: %d (expected %d)\n",
3952 be32_to_cpu(buffer->al_offset), bdev->md.al_offset);
3953 rv = ERR_MD_INVALID;
3956 if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
3957 dev_err(DEV, "unexpected bm_offset: %d (expected %d)\n",
3958 be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
3959 rv = ERR_MD_INVALID;
3962 if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
3963 dev_err(DEV, "unexpected md_size: %u (expected %u)\n",
3964 be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
3965 rv = ERR_MD_INVALID;
3969 if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
3970 dev_err(DEV, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
3971 be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
3972 rv = ERR_MD_INVALID;
3976 bdev->md.la_size_sect = be64_to_cpu(buffer->la_size);
3977 for (i = UI_CURRENT; i < UI_SIZE; i++)
3978 bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
3979 bdev->md.flags = be32_to_cpu(buffer->flags);
3980 mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
3981 bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
3983 spin_lock_irq(&mdev->req_lock);
3984 if (mdev->state.conn < C_CONNECTED) {
3986 peer = be32_to_cpu(buffer->la_peer_max_bio_size);
3987 peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
3988 mdev->peer_max_bio_size = peer;
3990 spin_unlock_irq(&mdev->req_lock);
3992 if (mdev->sync_conf.al_extents < 7)
3993 mdev->sync_conf.al_extents = 127;
3996 drbd_md_put_buffer(mdev);
4004 * drbd_md_mark_dirty() - Mark meta data super block as dirty
4005 * @mdev: DRBD device.
4007 * Call this function if you change anything that should be written to
4008 * the meta-data super block. This function sets MD_DIRTY, and starts a
4009 * timer that ensures drbd_md_sync() gets called within five seconds.
4012 void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
4014 if (!drbd_test_and_set_flag(mdev, MD_DIRTY)) {
4015 mod_timer(&mdev->md_sync_timer, jiffies + HZ);
4016 mdev->last_md_mark_dirty.line = line;
4017 mdev->last_md_mark_dirty.func = func;
4021 void drbd_md_mark_dirty(struct drbd_conf *mdev)
4023 if (!drbd_test_and_set_flag(mdev, MD_DIRTY))
4024 mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
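/* Typical flow (illustrative): a change to meta data calls
 * drbd_md_mark_dirty(), which sets MD_DIRTY and arms md_sync_timer; if
 * nobody calls drbd_md_sync() within those ~5 seconds, md_sync_timer_fn()
 * queues w_md_sync, which warns and syncs from the worker (see
 * md_sync_timer_fn() / w_md_sync() further below). */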
4028 void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
4032 for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
4033 mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
4036 void __drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4038 if (idx == UI_CURRENT) {
4039 if (mdev->state.role == R_PRIMARY)
4044 drbd_set_ed_uuid(mdev, val);
4047 mdev->ldev->md.uuid[idx] = val;
4048 drbd_md_mark_dirty(mdev);
4051 void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4053 unsigned long flags;
4054 spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
4055 __drbd_uuid_set(mdev, idx, val);
4056 spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
4059 void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
4061 unsigned long flags;
4062 spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
4063 if (mdev->ldev->md.uuid[idx]) {
4064 drbd_uuid_move_history(mdev);
4065 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
4067 __drbd_uuid_set(mdev, idx, val);
4068 spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
4072 * drbd_uuid_new_current() - Creates a new current UUID
4073 * @mdev: DRBD device.
4075 * Creates a new current UUID, and rotates the old current UUID into
4076 * the bitmap slot. Causes an incremental resync upon next connect.
4078 void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
4081 unsigned long long bm_uuid;
4083 get_random_bytes(&val, sizeof(u64));
4085 spin_lock_irq(&mdev->ldev->md.uuid_lock);
4086 bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4089 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
4091 mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
4092 __drbd_uuid_set(mdev, UI_CURRENT, val);
4093 spin_unlock_irq(&mdev->ldev->md.uuid_lock);
4095 drbd_print_uuids(mdev, "new current UUID");
4096 /* get it to stable storage _now_ */
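/* Illustrative example: if the UUID set looked like
 *	current = A, bitmap = 0
 * before the call, it looks like
 *	current = <new random value>, bitmap = A
 * afterwards, so the next connect can detect the divergence and limit the
 * resync to the blocks marked in the bitmap. */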
4100 void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
4102 unsigned long flags;
4103 if (mdev->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
4106 spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
4108 drbd_uuid_move_history(mdev);
4109 mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
4110 mdev->ldev->md.uuid[UI_BITMAP] = 0;
4112 unsigned long long bm_uuid = mdev->ldev->md.uuid[UI_BITMAP];
4114 dev_warn(DEV, "bm UUID was already set: %llX\n", bm_uuid);
4116 mdev->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
4118 spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
4120 drbd_md_mark_dirty(mdev);
4124 * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4125 * @mdev: DRBD device.
4127 * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
4129 int drbd_bmio_set_n_write(struct drbd_conf *mdev)
4133 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4134 drbd_md_set_flag(mdev, MDF_FULL_SYNC);
4136 drbd_bm_set_all(mdev);
4138 rv = drbd_bm_write(mdev);
4141 drbd_md_clear_flag(mdev, MDF_FULL_SYNC);
4152 * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
4153 * @mdev: DRBD device.
4155 * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
4157 int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
4161 drbd_resume_al(mdev);
4162 if (get_ldev_if_state(mdev, D_ATTACHING)) {
4163 drbd_bm_clear_all(mdev);
4164 rv = drbd_bm_write(mdev);
4171 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4173 struct bm_io_work *work = container_of(w, struct bm_io_work, w);
4176 D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
4178 if (get_ldev(mdev)) {
4179 drbd_bm_lock(mdev, work->why, work->flags);
4180 rv = work->io_fn(mdev);
4181 drbd_bm_unlock(mdev);
4185 drbd_clear_flag(mdev, BITMAP_IO);
4186 smp_mb__after_clear_bit();
4187 wake_up(&mdev->misc_wait);
4190 work->done(mdev, rv);
4192 drbd_clear_flag(mdev, BITMAP_IO_QUEUED);
4199 void drbd_ldev_destroy(struct drbd_conf *mdev)
4201 lc_destroy(mdev->resync);
4202 mdev->resync = NULL;
4203 lc_destroy(mdev->act_log);
4204 mdev->act_log = NULL;
4206 drbd_free_bc(mdev->ldev);
4207 mdev->ldev = NULL;);
4209 if (mdev->md_io_tmpp) {
4210 __free_page(mdev->md_io_tmpp);
4211 mdev->md_io_tmpp = NULL;
4213 drbd_clear_flag(mdev, GO_DISKLESS);
4216 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
4218 D_ASSERT(mdev->state.disk == D_FAILED);
4219 /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
4220 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
4221 * the protected members anymore, though, so once put_ldev reaches zero
4222 * again, it will be safe to free them. */
4223 drbd_force_state(mdev, NS(disk, D_DISKLESS));
4227 void drbd_go_diskless(struct drbd_conf *mdev)
4229 D_ASSERT(mdev->state.disk == D_FAILED);
4230 if (!drbd_test_and_set_flag(mdev, GO_DISKLESS))
4231 drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
4235 * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
4236 * @mdev: DRBD device.
4237 * @io_fn: IO callback to be called when bitmap IO is possible
4238 * @done: callback to be called after the bitmap IO was performed
4239 * @why: Descriptive text of the reason for doing the IO
4241 * While IO on the bitmap happens we freeze application IO, thus ensuring
4242 * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
4243 * called from worker context. It MUST NOT be used while a previous such
4244 * work is still pending!
4246 void drbd_queue_bitmap_io(struct drbd_conf *mdev,
4247 int (*io_fn)(struct drbd_conf *),
4248 void (*done)(struct drbd_conf *, int),
4249 char *why, enum bm_flag flags)
4251 D_ASSERT(current == mdev->worker.task);
4253 D_ASSERT(!drbd_test_flag(mdev, BITMAP_IO_QUEUED));
4254 D_ASSERT(!drbd_test_flag(mdev, BITMAP_IO));
4255 D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
4256 if (mdev->bm_io_work.why)
4257 dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
4258 why, mdev->bm_io_work.why);
4260 mdev->bm_io_work.io_fn = io_fn;
4261 mdev->bm_io_work.done = done;
4262 mdev->bm_io_work.why = why;
4263 mdev->bm_io_work.flags = flags;
4265 spin_lock_irq(&mdev->req_lock);
4266 drbd_set_flag(mdev, BITMAP_IO);
4267 if (atomic_read(&mdev->ap_bio_cnt) == 0) {
4268 if (!drbd_test_and_set_flag(mdev, BITMAP_IO_QUEUED))
4269 drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
4271 spin_unlock_irq(&mdev->req_lock);
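/*
 * Illustrative only: a caller in worker context that wants to set all bits
 * and flush the bitmap to disk would queue something along the lines of
 *
 *	drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, my_done_fn,
 *			     "set_n_write from attaching", BM_LOCKED_MASK);
 *
 * where my_done_fn is a placeholder for whatever completion callback the
 * caller wants to run once the bitmap write has finished.
 */
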
/**
 * drbd_bitmap_io() - Does an IO operation on the whole bitmap
 * @mdev:	DRBD device.
 * @io_fn:	IO callback to be called when bitmap IO is possible
 * @why:	Descriptive text of the reason for doing the IO
 *
 * Freezes application IO while the actual IO operation runs. This
 * function MAY NOT be called from worker context.
 */
int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *),
		   char *why, enum bm_flag flags)
{
	int rv;

	D_ASSERT(current != mdev->worker.task);

	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
		drbd_suspend_io(mdev);

	drbd_bm_lock(mdev, why, flags);
	rv = io_fn(mdev);
	drbd_bm_unlock(mdev);

	if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
		drbd_resume_io(mdev);

	return rv;
}

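/* Helpers for the persistent flags in the on-disk meta-data.  Any change
 * marks the meta-data dirty, so it gets written out by the next
 * drbd_md_sync(). */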
void drbd_md_set_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
{
	if ((mdev->ldev->md.flags & flag) != flag) {
		drbd_md_mark_dirty(mdev);
		mdev->ldev->md.flags |= flag;
	}
}

void drbd_md_clear_flag(struct drbd_conf *mdev, int flag) __must_hold(local)
{
	if ((mdev->ldev->md.flags & flag) != 0) {
		drbd_md_mark_dirty(mdev);
		mdev->ldev->md.flags &= ~flag;
	}
}

int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
{
	return (bdev->md.flags & flag) != 0;
}

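/* If meta-data stays dirty for too long after drbd_md_mark_dirty(), the
 * md_sync timer fires and has the worker write it out via drbd_md_sync(). */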
static void md_sync_timer_fn(unsigned long data)
{
	struct drbd_conf *mdev = (struct drbd_conf *) data;

	drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
}

static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
	dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
#ifdef DEBUG
	dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
		mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
#endif
	drbd_md_sync(mdev);
	return 1;
}

#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
 * stolen from kernel/rcutorture.c */
struct fault_random_state {
	unsigned long state;
	unsigned long count;
};

#define FAULT_RANDOM_MULT 39916801  /* prime */
#define FAULT_RANDOM_ADD 479001701 /* prime */
#define FAULT_RANDOM_REFRESH 10000

/*
 * Crude but fast random-number generator.  Uses a linear congruential
 * generator, with occasional help from get_random_bytes().
 */
static unsigned long
_drbd_fault_random(struct fault_random_state *rsp)
{
	unsigned long refresh;

	if (!rsp->count--) {
		get_random_bytes(&refresh, sizeof(refresh));
		rsp->state += refresh;
		rsp->count = FAULT_RANDOM_REFRESH;
	}
	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
	return swahw32(rsp->state);
}

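/* Map a DRBD_FAULT_* type to a human readable name for the warning that is
 * printed when a fault is injected. */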
static char *
_drbd_fault_str(unsigned int type) {
	static char *_faults[] = {
		[DRBD_FAULT_MD_WR] = "Meta-data write",
		[DRBD_FAULT_MD_RD] = "Meta-data read",
		[DRBD_FAULT_RS_WR] = "Resync write",
		[DRBD_FAULT_RS_RD] = "Resync read",
		[DRBD_FAULT_DT_WR] = "Data write",
		[DRBD_FAULT_DT_RD] = "Data read",
		[DRBD_FAULT_DT_RA] = "Data read ahead",
		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
		[DRBD_FAULT_AL_EE] = "EE allocation",
		[DRBD_FAULT_RECEIVE] = "receive data corruption",
	};

	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
}

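/* Decide whether to inject a fault of the given type on this device: the
 * device must be selected by the fault_devs bitmask (0 selects all devices)
 * and a pseudo-random draw must fall within fault_rate percent. */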
unsigned int
_drbd_insert_fault(struct drbd_conf *mdev, unsigned int type)
{
	static struct fault_random_state rrs = {0, 0};

	unsigned int ret = (
		(fault_devs == 0 ||
			((1 << mdev_to_minor(mdev)) & fault_devs) != 0) &&
		(((_drbd_fault_random(&rrs) % 100) + 1) <= fault_rate));

	if (ret) {
		fault_count++;

		if (__ratelimit(&drbd_ratelimit_state))
			dev_warn(DEV, "***Simulating %s failure\n",
				_drbd_fault_str(type));
	}

	return ret;
}
#endif

const char *drbd_buildtag(void)
{
	/* DRBD built from external sources has here a reference to the
	   git hash of the source code. */

	static char buildtag[38] = "\0uilt-in";

	if (buildtag[0] == 0) {
#ifdef MODULE
		sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
#else
		buildtag[0] = 'b';
#endif
	}

	return buildtag;
}

module_init(drbd_init)
module_exit(drbd_cleanup)

EXPORT_SYMBOL(drbd_conn_str);
EXPORT_SYMBOL(drbd_role_str);
EXPORT_SYMBOL(drbd_disk_str);
EXPORT_SYMBOL(drbd_set_st_err_str);