int err;
};
-struct drbd_atodb_wait {
- atomic_t count;
- struct completion io_done;
- struct drbd_conf *mdev;
- int error;
-};
+static int al_write_transaction(struct drbd_conf *mdev);
+
+/* Acquire exclusive use of the per-device meta-data I/O buffer page.
+ * Sleeps on misc_wait until either md_io_in_use transitions 0 -> 1
+ * (buffer free) or the disk state drops to D_FAILED or below, so a
+ * failing disk cannot leave the caller blocked forever.
+ * Returns the mapped md_io_page address on success, or NULL if the
+ * disk failed while waiting.  Pair with drbd_md_put_buffer(). */
+void *drbd_md_get_buffer(struct drbd_conf *mdev)
+{
+ int r;
+
+ wait_event(mdev->misc_wait,
+ (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 ||
+ mdev->state.disk <= D_FAILED);
+
+ return r ? NULL : page_address(mdev->md_io_page);
+}
+
+/* Release the meta-data I/O buffer acquired via drbd_md_get_buffer()
+ * (or via the explicit atomic_inc in _drbd_md_sync_page_io), and wake
+ * any waiters once the use count drops to zero. */
+void drbd_md_put_buffer(struct drbd_conf *mdev)
+{
+ if (atomic_dec_and_test(&mdev->md_io_in_use))
+ wake_up(&mdev->misc_wait);
+}
+/* True while meta-data I/O may still complete: the disk is either
+ * usable (>= D_NEGOTIATING) or still attaching.  Used to break out of
+ * waits when the disk has failed or detached. */
+static bool md_io_allowed(struct drbd_conf *mdev)
+{
+ enum drbd_disk_state ds = mdev->state.disk;
+ return ds >= D_NEGOTIATING || ds == D_ATTACHING;
+}
-static int w_al_write_transaction(struct drbd_work *, int);
+/* Wait until *done becomes nonzero (set by the md I/O completion
+ * handler) or the disk fails, whichever comes first.  Replaces the
+ * unconditional wait_for_completion() so a dead disk cannot hang the
+ * submitter. */
+void wait_until_done_or_disk_failure(struct drbd_conf *mdev, unsigned int *done)
+{
+ wait_event(mdev->misc_wait, *done || !md_io_allowed(mdev));
+}
static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev,
int rw, int size)
{
struct bio *bio;
- struct drbd_md_io md_io;
int err;
- md_io.mdev = mdev;
- init_completion(&md_io.event);
- md_io.error = 0;
+ mdev->md_io.done = 0;
+ mdev->md_io.error = -ENODEV;
if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
rw |= REQ_FUA | REQ_FLUSH;
err = -EIO;
if (bio_add_page(bio, page, size, 0) != size)
goto out;
- bio->bi_private = &md_io;
+ bio->bi_private = &mdev->md_io;
bio->bi_end_io = drbd_md_io_complete;
bio->bi_rw = rw;
+ if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */
+ dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
+ err = -ENODEV;
+ goto out;
+ }
+
+ bio_get(bio); /* one bio_put() is in the completion handler */
+ atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
bio_endio(bio, -EIO);
else
submit_bio(rw, bio);
- wait_for_completion(&md_io.event);
+ wait_until_done_or_disk_failure(mdev, &mdev->md_io.done);
if (bio_flagged(bio, BIO_UPTODATE))
- err = md_io.error;
+ err = mdev->md_io.error;
out:
bio_put(bio);
int err;
struct page *iop = mdev->md_io_page;
- D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
+ D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);
BUG_ON(!bdev->md_bdev);
unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
unsigned last = (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
unsigned enr;
+ bool locked = false;
+
D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
for (enr = first; enr <= last; enr++)
wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL);
- if (mdev->act_log->pending_changes) {
+ /* Serialize multiple transactions.
+ * This uses test_and_set_bit, memory barrier is implicit.
+ */
+ wait_event(mdev->al_wait,
+ mdev->act_log->pending_changes == 0 ||
+ (locked = lc_try_lock_for_transaction(mdev->act_log)));
+
+ if (locked) {
/* drbd_al_write_transaction(mdev,al_ext,enr);
* recurses into generic_make_request(), which
* disallows recursion, bios being serialized on the
* we have to delegate updates to the activity log
* to the worker thread. */
- /* Serialize multiple transactions.
- * This uses test_and_set_bit, memory barrier is implicit.
- * Optimization potential:
- * first check for transaction number > old transaction number,
- * so not all waiters have to lock/unlock. */
- wait_event(mdev->al_wait, lc_try_lock_for_transaction(mdev->act_log));
-
/* Double check: it may have been committed by someone else,
* while we have been waiting for the lock. */
if (mdev->act_log->pending_changes) {
- struct update_al_work al_work;
- init_completion(&al_work.event);
- al_work.w.cb = w_al_write_transaction;
- al_work.w.mdev = mdev;
- drbd_queue_work_front(&mdev->tconn->data.work, &al_work.w);
- wait_for_completion(&al_work.event);
-
+ int err;
+ err = al_write_transaction(mdev);
mdev->al_writ_cnt++;
spin_lock_irq(&mdev->al_lock);
/* FIXME
- if (al_work.err)
+ if (err)
we need an "lc_cancel" here;
*/
lc_committed(mdev->act_log);
return rs_enr >>
/* bit to page */
((PAGE_SHIFT + 3) -
- /* al extent number to bit */
+ /* resync extent number to bit */
(BM_EXT_SHIFT - BM_BLOCK_SHIFT));
}
static int
-w_al_write_transaction(struct drbd_work *w, int unused)
+_al_write_transaction(struct drbd_conf *mdev)
{
- struct update_al_work *aw = container_of(w, struct update_al_work, w);
- struct drbd_conf *mdev = w->mdev;
struct al_transaction_on_disk *buffer;
struct lc_element *e;
sector_t sector;
int i, mx;
unsigned extent_nr;
unsigned crc = 0;
+ int err = 0;
if (!get_ldev(mdev)) {
dev_err(DEV, "disk is %s, cannot start al transaction\n",
drbd_disk_str(mdev->state.disk));
- aw->err = -EIO;
- complete(&((struct update_al_work *)w)->event);
- return 0;
+ return -EIO;
}
/* The bitmap write may have failed, causing a state change. */
dev_err(DEV,
"disk is %s, cannot write al transaction\n",
drbd_disk_str(mdev->state.disk));
- aw->err = -EIO;
- complete(&((struct update_al_work *)w)->event);
put_ldev(mdev);
- return 0;
+ return -EIO;
}
- mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
- buffer = page_address(mdev->md_io_page);
+ buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
+ if (!buffer) {
+ dev_err(DEV, "disk failed while waiting for md_io buffer\n");
+ put_ldev(mdev);
+ return -ENODEV;
+ }
memset(buffer, 0, sizeof(*buffer));
buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
buffer->crc32c = cpu_to_be32(crc);
if (drbd_bm_write_hinted(mdev))
- aw->err = -EIO;
+ err = -EIO;
/* drbd_chk_io_error done already */
else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
- aw->err = -EIO;
+ err = -EIO;
drbd_chk_io_error(mdev, 1, true);
} else {
/* advance ringbuffer position and transaction counter */
mdev->al_tr_number++;
}
- mutex_unlock(&mdev->md_io_mutex);
- complete(&((struct update_al_work *)w)->event);
+ drbd_md_put_buffer(mdev);
put_ldev(mdev);
- return 0;
+ return err;
}
-/* FIXME
- * reading of the activity log,
- * and potentially dirtying of the affected bitmap regions,
- * should be done from userland only.
- * DRBD would simply always attach with an empty activity log,
- * and refuse to attach to something that looks like a crashed primary.
- */
-/**
- * drbd_al_read_tr() - Read a single transaction from the on disk activity log
- * @mdev: DRBD device.
- * @bdev: Block device to read form.
- * @b: pointer to an al_transaction.
- * @index: On disk slot of the transaction to read.
- *
- * Returns -1 on IO error, 0 on checksum error and 1 upon success.
- */
-static int drbd_al_read_tr(struct drbd_conf *mdev,
- struct drbd_backing_dev *bdev,
- int index)
+static int w_al_write_transaction(struct drbd_work *w, int unused)
{
- struct al_transaction_on_disk *b = page_address(mdev->md_io_page);
- sector_t sector;
- u32 crc;
-
- sector = bdev->md.md_offset
- + bdev->md.al_offset
- + index * (MD_BLOCK_SIZE>>9);
-
- /* Dont process error normally,
- * as this is done before disk is attached! */
- if (drbd_md_sync_page_io(mdev, bdev, sector, READ))
- return -1;
-
- if (!expect(b->magic == cpu_to_be32(DRBD_AL_MAGIC)))
- return 0;
-
- if (!expect(be16_to_cpu(b->n_updates) <= AL_UPDATES_PER_TRANSACTION))
- return 0;
-
- if (!expect(be16_to_cpu(b->context_size) <= DRBD_AL_EXTENTS_MAX))
- return 0;
-
- if (!expect(be16_to_cpu(b->context_start_slot_nr) < DRBD_AL_EXTENTS_MAX))
- return 0;
-
- crc = be32_to_cpu(b->crc32c);
- b->crc32c = 0;
- if (!expect(crc == crc32c(0, b, 4096)))
- return 0;
-
- return 1;
-}
-
-/**
- * drbd_al_read_log() - Restores the activity log from its on disk representation.
- * @mdev: DRBD device.
- * @bdev: Block device to read form.
- *
- * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
- */
-int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
-{
- struct al_transaction_on_disk *b;
- int i;
- int rv;
- int mx;
- int active_extents = 0;
- int transactions = 0;
- int found_valid = 0;
- int found_initialized = 0;
- int from = 0;
- int to = 0;
- u32 from_tnr = 0;
- u32 to_tnr = 0;
- u32 cnr;
-
- /* Note that this is expected to be called with a newly created,
- * clean and all unused activity log of the "expected size".
- */
-
- /* lock out all other meta data io for now,
- * and make sure the page is mapped.
- */
- mutex_lock(&mdev->md_io_mutex);
- b = page_address(mdev->md_io_page);
-
- /* Always use the full ringbuffer space for now.
- * possible optimization: read in all of it,
- * then scan the in-memory pages. */
-
- mx = (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
-
- /* Find the valid transaction in the log */
- for (i = 0; i < mx; i++) {
- rv = drbd_al_read_tr(mdev, bdev, i);
- /* invalid data in that block */
- if (rv == 0)
- continue;
- if (be16_to_cpu(b->transaction_type) == AL_TR_INITIALIZED) {
- ++found_initialized;
- continue;
- }
-
- /* IO error */
- if (rv == -1) {
- mutex_unlock(&mdev->md_io_mutex);
- return 0;
- }
-
- cnr = be32_to_cpu(b->tr_number);
- if (++found_valid == 1) {
- from = i;
- to = i;
- from_tnr = cnr;
- to_tnr = cnr;
- continue;
- }
-
- D_ASSERT(cnr != to_tnr);
- D_ASSERT(cnr != from_tnr);
- if ((int)cnr - (int)from_tnr < 0) {
- D_ASSERT(from_tnr - cnr + i - from == mx);
- from = i;
- from_tnr = cnr;
- }
- if ((int)cnr - (int)to_tnr > 0) {
- D_ASSERT(cnr - to_tnr == i - to);
- to = i;
- to_tnr = cnr;
- }
- }
-
- if (!found_valid) {
- if (found_initialized != mx)
- dev_warn(DEV, "No usable activity log found.\n");
- mutex_unlock(&mdev->md_io_mutex);
- return 1;
- }
-
- /* Read the valid transactions.
- * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
- i = from;
- while (1) {
- struct lc_element *e;
- unsigned j, n, slot, extent_nr;
-
- rv = drbd_al_read_tr(mdev, bdev, i);
- if (!expect(rv != 0))
- goto cancel;
- if (rv == -1) {
- mutex_unlock(&mdev->md_io_mutex);
- return 0;
- }
-
- /* deal with different transaction types.
- * not yet implemented */
- if (!expect(b->transaction_type == 0))
- goto cancel;
-
- /* on the fly re-create/resize activity log?
- * will be a special transaction type flag. */
- if (!expect(be16_to_cpu(b->context_size) == mdev->act_log->nr_elements))
- goto cancel;
- if (!expect(be16_to_cpu(b->context_start_slot_nr) < mdev->act_log->nr_elements))
- goto cancel;
-
- /* We are the only user of the activity log right now,
- * don't actually need to take that lock. */
- spin_lock_irq(&mdev->al_lock);
-
- /* first, apply the context, ... */
- for (j = 0, slot = be16_to_cpu(b->context_start_slot_nr);
- j < AL_CONTEXT_PER_TRANSACTION &&
- slot < mdev->act_log->nr_elements; j++, slot++) {
- extent_nr = be32_to_cpu(b->context[j]);
- e = lc_element_by_index(mdev->act_log, slot);
- if (e->lc_number != extent_nr) {
- if (extent_nr != LC_FREE)
- active_extents++;
- else
- active_extents--;
- }
- lc_set(mdev->act_log, extent_nr, slot);
- }
-
- /* ... then apply the updates,
- * which override the context information.
- * drbd_al_read_tr already did the rangecheck
- * on n <= AL_UPDATES_PER_TRANSACTION */
- n = be16_to_cpu(b->n_updates);
- for (j = 0; j < n; j++) {
- slot = be16_to_cpu(b->update_slot_nr[j]);
- extent_nr = be32_to_cpu(b->update_extent_nr[j]);
- if (!expect(slot < mdev->act_log->nr_elements))
- break;
- e = lc_element_by_index(mdev->act_log, slot);
- if (e->lc_number != extent_nr) {
- if (extent_nr != LC_FREE)
- active_extents++;
- else
- active_extents--;
- }
- lc_set(mdev->act_log, extent_nr, slot);
- }
- spin_unlock_irq(&mdev->al_lock);
-
- transactions++;
-
-cancel:
- if (i == to)
- break;
- i++;
- if (i >= mx)
- i = 0;
- }
-
- mdev->al_tr_number = to_tnr+1;
- mdev->al_tr_pos = (to + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
-
- /* ok, we are done with it */
- mutex_unlock(&mdev->md_io_mutex);
+ struct update_al_work *aw = container_of(w, struct update_al_work, w);
+ struct drbd_conf *mdev = w->mdev;
+ int err;
- dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
- transactions, active_extents);
+ err = _al_write_transaction(mdev);
+ aw->err = err;
+ complete(&aw->event);
- return 1;
+ return err != -EIO ? err : 0;
}
-/**
- * drbd_al_apply_to_bm() - Sets the bitmap to dirty(1) where covered by active AL extents
- * @mdev: DRBD device.
- */
-void drbd_al_apply_to_bm(struct drbd_conf *mdev)
+/* Calls from worker context (see w_restart_disk_io()) need to write the
+ transaction directly. Others came through generic_make_request(),
+ those need to delegate it to the worker. */
+static int al_write_transaction(struct drbd_conf *mdev)
{
- unsigned int enr;
- unsigned long add = 0;
- char ppb[10];
- int i, tmp;
-
- wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+ struct update_al_work al_work;
- for (i = 0; i < mdev->act_log->nr_elements; i++) {
- enr = lc_element_by_index(mdev->act_log, i)->lc_number;
- if (enr == LC_FREE)
- continue;
- tmp = drbd_bm_ALe_set_all(mdev, enr);
- dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
- add += tmp;
- }
+ if (current == mdev->tconn->worker.task)
+ return _al_write_transaction(mdev);
- lc_unlock(mdev->act_log);
- wake_up(&mdev->al_wait);
+ init_completion(&al_work.event);
+ al_work.w.cb = w_al_write_transaction;
+ al_work.w.mdev = mdev;
+ drbd_queue_work_front(&mdev->tconn->data.work, &al_work.w);
+ wait_for_completion(&al_work.event);
- dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
- ppsize(ppb, Bit2KB(add)));
+ return al_work.err;
}
static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)