2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
5 completely rewritten, based on the MD driver code from Marc Zyngier
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13 - kmod support by: Cyrus Durgin
14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
17 - lots of fixes and improvements to the RAID1/RAID5 and generic
18 RAID code (such as request based resynchronization):
20 Neil Brown <neilb@cse.unsw.edu.au>.
22 - persistent bitmap code
23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
25 This program is free software; you can redistribute it and/or modify
26 it under the terms of the GNU General Public License as published by
27 the Free Software Foundation; either version 2, or (at your option)
30 You should have received a copy of the GNU General Public License
31 (for example /usr/src/linux/COPYING); if not, write to the Free
32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
35 #include <linux/kthread.h>
36 #include <linux/blkdev.h>
37 #include <linux/sysctl.h>
38 #include <linux/seq_file.h>
40 #include <linux/poll.h>
41 #include <linux/ctype.h>
42 #include <linux/string.h>
43 #include <linux/hdreg.h>
44 #include <linux/proc_fs.h>
45 #include <linux/random.h>
46 #include <linux/module.h>
47 #include <linux/reboot.h>
48 #include <linux/file.h>
49 #include <linux/compat.h>
50 #include <linux/delay.h>
51 #include <linux/raid/md_p.h>
52 #include <linux/raid/md_u.h>
53 #include <linux/slab.h>
58 static void autostart_arrays(int part);
61 /* pers_list is a list of registered personalities protected by pers_lock.
63 * pers_lock does extra service to protect accesses to
64 * mddev->thread when the mutex cannot be held.
66 static LIST_HEAD(pers_list);
67 static DEFINE_SPINLOCK(pers_lock);
69 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
70 static struct workqueue_struct *md_wq;
71 static struct workqueue_struct *md_misc_wq;
73 static int remove_and_add_spares(struct mddev *mddev,
74 struct md_rdev *this);
77 * Default number of read corrections we'll attempt on an rdev
78 * before ejecting it from the array. We divide the read error
79 * count by 2 for every hour elapsed between read errors.
81 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
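/*
 * Worked example of the rule above (illustration only): an rdev that has
 * accumulated 20 corrected read errors and then sees its next error two
 * hours later is treated as if it had 20 / 2 / 2 = 5, keeping it well
 * below the ejection threshold.
 */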
83 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
84 * is 1000 KB/sec, so the extra system load does not show up that much.
85 * Increase it if you want to have more _guaranteed_ speed. Note that
86 * the RAID driver will use the maximum available bandwidth if the IO
87 * subsystem is idle. There is also an 'absolute maximum' reconstruction
88 * speed limit - in case reconstruction slows down your system despite
91 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
92 * or /sys/block/mdX/md/sync_speed_{min,max}
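/*
 * For example (illustration only, values are arbitrary), an administrator
 * could raise the guaranteed floor for one array and the global ceiling:
 *
 *	echo 50000  > /sys/block/md0/md/sync_speed_min
 *	echo 500000 > /proc/sys/dev/raid/speed_limit_max
 */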
95 static int sysctl_speed_limit_min = 1000;
96 static int sysctl_speed_limit_max = 200000;
97 static inline int speed_min(struct mddev *mddev)
99 return mddev->sync_speed_min ?
100 mddev->sync_speed_min : sysctl_speed_limit_min;
103 static inline int speed_max(struct mddev *mddev)
105 return mddev->sync_speed_max ?
106 mddev->sync_speed_max : sysctl_speed_limit_max;
109 static struct ctl_table_header *raid_table_header;
111 static struct ctl_table raid_table[] = {
113 .procname = "speed_limit_min",
114 .data = &sysctl_speed_limit_min,
115 .maxlen = sizeof(int),
116 .mode = S_IRUGO|S_IWUSR,
117 .proc_handler = proc_dointvec,
120 .procname = "speed_limit_max",
121 .data = &sysctl_speed_limit_max,
122 .maxlen = sizeof(int),
123 .mode = S_IRUGO|S_IWUSR,
124 .proc_handler = proc_dointvec,
129 static struct ctl_table raid_dir_table[] = {
133 .mode = S_IRUGO|S_IXUGO,
139 static struct ctl_table raid_root_table[] = {
144 .child = raid_dir_table,
149 static const struct block_device_operations md_fops;
151 static int start_readonly;
154 * like bio_clone, but with a local bio set
157 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
162 if (!mddev || !mddev->bio_set)
163 return bio_alloc(gfp_mask, nr_iovecs);
165 b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
170 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
172 struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
175 if (!mddev || !mddev->bio_set)
176 return bio_clone(bio, gfp_mask);
178 return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
180 EXPORT_SYMBOL_GPL(bio_clone_mddev);
183 * We have a system wide 'event count' that is incremented
184 * on any 'interesting' event, and readers of /proc/mdstat
185 * can use 'poll' or 'select' to find out when the event
189 * start array, stop array, error, add device, remove device,
190 * start build, activate spare
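/*
 * Sketch (user-space, not part of this driver) of how a monitor such as
 * mdadm can wait for these events, assuming the poll() behaviour of
 * /proc/mdstat described above:
 *
 *	int fd = open("/proc/mdstat", O_RDONLY);
 *	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *	char buf[4096];
 *
 *	read(fd, buf, sizeof(buf));	-- consume the current contents
 *	poll(&pfd, 1, -1);		-- blocks until md_new_event() fires
 *	lseek(fd, 0, SEEK_SET);		-- re-read to see what changed
 */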
192 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
193 static atomic_t md_event_count;
194 void md_new_event(struct mddev *mddev)
196 atomic_inc(&md_event_count);
197 wake_up(&md_event_waiters);
199 EXPORT_SYMBOL_GPL(md_new_event);
201 /* Alternate version that can be called from interrupts
202 * when calling sysfs_notify isn't needed.
204 static void md_new_event_inintr(struct mddev *mddev)
206 atomic_inc(&md_event_count);
207 wake_up(&md_event_waiters);
211 * Enables iteration over all existing md arrays.
212 * all_mddevs_lock protects this list.
214 static LIST_HEAD(all_mddevs);
215 static DEFINE_SPINLOCK(all_mddevs_lock);
218 * iterates through all used mddevs in the system.
219 * We take care to grab the all_mddevs_lock whenever navigating
220 * the list, and to always hold a refcount when unlocked.
221 * Any code which breaks out of this loop while owning
222 * a reference to the current mddev must mddev_put it.
224 #define for_each_mddev(_mddev,_tmp) \
226 for (({ spin_lock(&all_mddevs_lock); \
227 _tmp = all_mddevs.next; \
229 ({ if (_tmp != &all_mddevs) \
230 mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
231 spin_unlock(&all_mddevs_lock); \
232 if (_mddev) mddev_put(_mddev); \
233 _mddev = list_entry(_tmp, struct mddev, all_mddevs); \
234 _tmp != &all_mddevs;}); \
235 ({ spin_lock(&all_mddevs_lock); \
236 _tmp = _tmp->next;}) \
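/*
 * Illustrative use of the iterator above (a sketch; do_something() and
 * need_to_stop are placeholders, not real symbols):
 *
 *	struct mddev *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp) {
 *		do_something(mddev);
 *		if (need_to_stop) {
 *			mddev_put(mddev);	-- we still hold a reference
 *			break;
 *		}
 *	}
 */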
239 /* Rather than calling directly into the personality make_request function,
240 * IO requests come here first so that we can check if the device is
241 * being suspended pending a reconfiguration.
242 * We hold a refcount over the call to ->make_request. By the time that
243 * call has finished, the bio has been linked into some internal structure
244 * and so is visible to ->quiesce(), so we don't need the refcount any more.
246 static void md_make_request(struct request_queue *q, struct bio *bio)
248 const int rw = bio_data_dir(bio);
249 struct mddev *mddev = q->queuedata;
250 unsigned int sectors;
252 if (mddev == NULL || mddev->pers == NULL
257 if (mddev->ro == 1 && unlikely(rw == WRITE)) {
258 bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
261 smp_rmb(); /* Ensure implications of 'active' are visible */
263 if (mddev->suspended) {
266 prepare_to_wait(&mddev->sb_wait, &__wait,
267 TASK_UNINTERRUPTIBLE);
268 if (!mddev->suspended)
274 finish_wait(&mddev->sb_wait, &__wait);
276 atomic_inc(&mddev->active_io);
280 * save the sectors now since our bio can
281 * go away inside make_request
283 sectors = bio_sectors(bio);
284 mddev->pers->make_request(mddev, bio);
286 generic_start_io_acct(rw, sectors, &mddev->gendisk->part0);
288 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
289 wake_up(&mddev->sb_wait);
292 /* mddev_suspend makes sure no new requests are submitted
293 * to the device, and that any requests that have been submitted
294 * are completely handled.
295 * Once ->stop is called and completes, the module will be completely
298 void mddev_suspend(struct mddev *mddev)
300 BUG_ON(mddev->suspended);
301 mddev->suspended = 1;
303 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
304 mddev->pers->quiesce(mddev, 1);
306 del_timer_sync(&mddev->safemode_timer);
308 EXPORT_SYMBOL_GPL(mddev_suspend);
310 void mddev_resume(struct mddev *mddev)
312 mddev->suspended = 0;
313 wake_up(&mddev->sb_wait);
314 mddev->pers->quiesce(mddev, 0);
316 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
317 md_wakeup_thread(mddev->thread);
318 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
320 EXPORT_SYMBOL_GPL(mddev_resume);
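/*
 * Typical pattern for code that needs the array quiescent while it makes a
 * change (sketch only; reconfigure() stands in for whatever the caller does):
 *
 *	mddev_suspend(mddev);	-- waits for active_io to drain
 *	reconfigure(mddev);	-- no new requests can be submitted here
 *	mddev_resume(mddev);	-- restarts I/O and wakes the md thread
 */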
322 int mddev_congested(struct mddev *mddev, int bits)
324 return mddev->suspended;
326 EXPORT_SYMBOL(mddev_congested);
329 * Generic flush handling for md
332 static void md_end_flush(struct bio *bio, int err)
334 struct md_rdev *rdev = bio->bi_private;
335 struct mddev *mddev = rdev->mddev;
337 rdev_dec_pending(rdev, mddev);
339 if (atomic_dec_and_test(&mddev->flush_pending)) {
340 /* The pre-request flush has finished */
341 queue_work(md_wq, &mddev->flush_work);
346 static void md_submit_flush_data(struct work_struct *ws);
348 static void submit_flushes(struct work_struct *ws)
350 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
351 struct md_rdev *rdev;
353 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
354 atomic_set(&mddev->flush_pending, 1);
356 rdev_for_each_rcu(rdev, mddev)
357 if (rdev->raid_disk >= 0 &&
358 !test_bit(Faulty, &rdev->flags)) {
359 /* Take two references, one is dropped
360 * when request finishes, one after
361 * we reclaim rcu_read_lock
364 atomic_inc(&rdev->nr_pending);
365 atomic_inc(&rdev->nr_pending);
367 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
368 bi->bi_end_io = md_end_flush;
369 bi->bi_private = rdev;
370 bi->bi_bdev = rdev->bdev;
371 atomic_inc(&mddev->flush_pending);
372 submit_bio(WRITE_FLUSH, bi);
374 rdev_dec_pending(rdev, mddev);
377 if (atomic_dec_and_test(&mddev->flush_pending))
378 queue_work(md_wq, &mddev->flush_work);
381 static void md_submit_flush_data(struct work_struct *ws)
383 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
384 struct bio *bio = mddev->flush_bio;
386 if (bio->bi_iter.bi_size == 0)
387 /* an empty barrier - all done */
390 bio->bi_rw &= ~REQ_FLUSH;
391 mddev->pers->make_request(mddev, bio);
394 mddev->flush_bio = NULL;
395 wake_up(&mddev->sb_wait);
398 void md_flush_request(struct mddev *mddev, struct bio *bio)
400 spin_lock_irq(&mddev->write_lock);
401 wait_event_lock_irq(mddev->sb_wait,
404 mddev->flush_bio = bio;
405 spin_unlock_irq(&mddev->write_lock);
407 INIT_WORK(&mddev->flush_work, submit_flushes);
408 queue_work(md_wq, &mddev->flush_work);
410 EXPORT_SYMBOL(md_flush_request);
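/*
 * A personality's make_request() typically hands flush requests straight
 * back to this helper, along these lines (sketch, modelled on how the RAID
 * personalities use it):
 *
 *	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 *		md_flush_request(mddev, bio);
 *		return;
 *	}
 */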
412 void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
414 struct mddev *mddev = cb->data;
415 md_wakeup_thread(mddev->thread);
418 EXPORT_SYMBOL(md_unplug);
420 static inline struct mddev *mddev_get(struct mddev *mddev)
422 atomic_inc(&mddev->active);
426 static void mddev_delayed_delete(struct work_struct *ws);
428 static void mddev_put(struct mddev *mddev)
430 struct bio_set *bs = NULL;
432 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
434 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
435 mddev->ctime == 0 && !mddev->hold_active) {
436 /* Array is not configured at all, and not held active,
438 list_del_init(&mddev->all_mddevs);
440 mddev->bio_set = NULL;
441 if (mddev->gendisk) {
442 /* We did a probe so need to clean up. Call
443 * queue_work inside the spinlock so that
444 * flush_workqueue() after mddev_find will
445 * succeed in waiting for the work to be done.
447 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
448 queue_work(md_misc_wq, &mddev->del_work);
452 spin_unlock(&all_mddevs_lock);
457 void mddev_init(struct mddev *mddev)
459 mutex_init(&mddev->open_mutex);
460 mutex_init(&mddev->reconfig_mutex);
461 mutex_init(&mddev->bitmap_info.mutex);
462 INIT_LIST_HEAD(&mddev->disks);
463 INIT_LIST_HEAD(&mddev->all_mddevs);
464 init_timer(&mddev->safemode_timer);
465 atomic_set(&mddev->active, 1);
466 atomic_set(&mddev->openers, 0);
467 atomic_set(&mddev->active_io, 0);
468 spin_lock_init(&mddev->write_lock);
469 atomic_set(&mddev->flush_pending, 0);
470 init_waitqueue_head(&mddev->sb_wait);
471 init_waitqueue_head(&mddev->recovery_wait);
472 mddev->reshape_position = MaxSector;
473 mddev->reshape_backwards = 0;
474 mddev->last_sync_action = "none";
475 mddev->resync_min = 0;
476 mddev->resync_max = MaxSector;
477 mddev->level = LEVEL_NONE;
479 EXPORT_SYMBOL_GPL(mddev_init);
481 static struct mddev *mddev_find(dev_t unit)
483 struct mddev *mddev, *new = NULL;
485 if (unit && MAJOR(unit) != MD_MAJOR)
486 unit &= ~((1<<MdpMinorShift)-1);
489 spin_lock(&all_mddevs_lock);
492 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
493 if (mddev->unit == unit) {
495 spin_unlock(&all_mddevs_lock);
501 list_add(&new->all_mddevs, &all_mddevs);
502 spin_unlock(&all_mddevs_lock);
503 new->hold_active = UNTIL_IOCTL;
507 /* find an unused unit number */
508 static int next_minor = 512;
509 int start = next_minor;
513 dev = MKDEV(MD_MAJOR, next_minor);
515 if (next_minor > MINORMASK)
517 if (next_minor == start) {
518 /* Oh dear, all in use. */
519 spin_unlock(&all_mddevs_lock);
525 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
526 if (mddev->unit == dev) {
532 new->md_minor = MINOR(dev);
533 new->hold_active = UNTIL_STOP;
534 list_add(&new->all_mddevs, &all_mddevs);
535 spin_unlock(&all_mddevs_lock);
538 spin_unlock(&all_mddevs_lock);
540 new = kzalloc(sizeof(*new), GFP_KERNEL);
545 if (MAJOR(unit) == MD_MAJOR)
546 new->md_minor = MINOR(unit);
548 new->md_minor = MINOR(unit) >> MdpMinorShift;
555 static inline int __must_check mddev_lock(struct mddev *mddev)
557 return mutex_lock_interruptible(&mddev->reconfig_mutex);
560 /* Sometimes we need to take the lock in a situation where
561 * failure due to interrupts is not acceptable.
563 static inline void mddev_lock_nointr(struct mddev *mddev)
565 mutex_lock(&mddev->reconfig_mutex);
568 static inline int mddev_is_locked(struct mddev *mddev)
570 return mutex_is_locked(&mddev->reconfig_mutex);
573 static inline int mddev_trylock(struct mddev *mddev)
575 return mutex_trylock(&mddev->reconfig_mutex);
578 static struct attribute_group md_redundancy_group;
580 static void mddev_unlock(struct mddev *mddev)
582 if (mddev->to_remove) {
583 /* These cannot be removed under reconfig_mutex as
584 * an access to the files will try to take reconfig_mutex
585 * while holding the file unremovable, which leads to
587 * So we set sysfs_active while the remove is happening,
588 * and anything else which might set ->to_remove or may
589 * otherwise change the sysfs namespace will fail with
590 * -EBUSY if sysfs_active is still set.
591 * We set sysfs_active under reconfig_mutex and elsewhere
592 * test it under the same mutex to ensure its correct value
595 struct attribute_group *to_remove = mddev->to_remove;
596 mddev->to_remove = NULL;
597 mddev->sysfs_active = 1;
598 mutex_unlock(&mddev->reconfig_mutex);
600 if (mddev->kobj.sd) {
601 if (to_remove != &md_redundancy_group)
602 sysfs_remove_group(&mddev->kobj, to_remove);
603 if (mddev->pers == NULL ||
604 mddev->pers->sync_request == NULL) {
605 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
606 if (mddev->sysfs_action)
607 sysfs_put(mddev->sysfs_action);
608 mddev->sysfs_action = NULL;
611 mddev->sysfs_active = 0;
613 mutex_unlock(&mddev->reconfig_mutex);
615 /* As we've dropped the mutex we need a spinlock to
616 * make sure the thread doesn't disappear
618 spin_lock(&pers_lock);
619 md_wakeup_thread(mddev->thread);
620 spin_unlock(&pers_lock);
623 static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
625 struct md_rdev *rdev;
627 rdev_for_each_rcu(rdev, mddev)
628 if (rdev->desc_nr == nr)
634 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
636 struct md_rdev *rdev;
638 rdev_for_each(rdev, mddev)
639 if (rdev->bdev->bd_dev == dev)
645 static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
647 struct md_rdev *rdev;
649 rdev_for_each_rcu(rdev, mddev)
650 if (rdev->bdev->bd_dev == dev)
656 static struct md_personality *find_pers(int level, char *clevel)
658 struct md_personality *pers;
659 list_for_each_entry(pers, &pers_list, list) {
660 if (level != LEVEL_NONE && pers->level == level)
662 if (strcmp(pers->name, clevel)==0)
668 /* return the offset of the super block in 512byte sectors */
669 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
671 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
672 return MD_NEW_SIZE_SECTORS(num_sectors);
675 static int alloc_disk_sb(struct md_rdev *rdev)
677 rdev->sb_page = alloc_page(GFP_KERNEL);
678 if (!rdev->sb_page) {
679 printk(KERN_ALERT "md: out of memory.\n");
686 void md_rdev_clear(struct md_rdev *rdev)
689 put_page(rdev->sb_page);
691 rdev->sb_page = NULL;
696 put_page(rdev->bb_page);
697 rdev->bb_page = NULL;
699 kfree(rdev->badblocks.page);
700 rdev->badblocks.page = NULL;
702 EXPORT_SYMBOL_GPL(md_rdev_clear);
704 static void super_written(struct bio *bio, int error)
706 struct md_rdev *rdev = bio->bi_private;
707 struct mddev *mddev = rdev->mddev;
709 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
710 printk("md: super_written gets error=%d, uptodate=%d\n",
711 error, test_bit(BIO_UPTODATE, &bio->bi_flags));
712 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
713 md_error(mddev, rdev);
716 if (atomic_dec_and_test(&mddev->pending_writes))
717 wake_up(&mddev->sb_wait);
721 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
722 sector_t sector, int size, struct page *page)
724 /* write first size bytes of page to sector of rdev
725 * Increment mddev->pending_writes before returning
726 * and decrement it on completion, waking up sb_wait
727 * if zero is reached.
728 * If an error occurred, call md_error
730 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
732 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
733 bio->bi_iter.bi_sector = sector;
734 bio_add_page(bio, page, size, 0);
735 bio->bi_private = rdev;
736 bio->bi_end_io = super_written;
738 atomic_inc(&mddev->pending_writes);
739 submit_bio(WRITE_FLUSH_FUA, bio);
742 void md_super_wait(struct mddev *mddev)
744 /* wait for all superblock writes that were scheduled to complete */
745 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
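/*
 * The two helpers above are normally used as a pair (sketch; this mirrors
 * what md_update_sb() does later in this file):
 *
 *	md_super_write(mddev, rdev, rdev->sb_start, rdev->sb_size,
 *		       rdev->sb_page);
 *	md_super_wait(mddev);	-- blocks until pending_writes reaches 0
 */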
748 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
749 struct page *page, int rw, bool metadata_op)
751 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
754 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
755 rdev->meta_bdev : rdev->bdev;
757 bio->bi_iter.bi_sector = sector + rdev->sb_start;
758 else if (rdev->mddev->reshape_position != MaxSector &&
759 (rdev->mddev->reshape_backwards ==
760 (sector >= rdev->mddev->reshape_position)))
761 bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
763 bio->bi_iter.bi_sector = sector + rdev->data_offset;
764 bio_add_page(bio, page, size, 0);
765 submit_bio_wait(rw, bio);
767 ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
771 EXPORT_SYMBOL_GPL(sync_page_io);
773 static int read_disk_sb(struct md_rdev *rdev, int size)
775 char b[BDEVNAME_SIZE];
780 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
786 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
787 bdevname(rdev->bdev,b));
791 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
793 return sb1->set_uuid0 == sb2->set_uuid0 &&
794 sb1->set_uuid1 == sb2->set_uuid1 &&
795 sb1->set_uuid2 == sb2->set_uuid2 &&
796 sb1->set_uuid3 == sb2->set_uuid3;
799 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
802 mdp_super_t *tmp1, *tmp2;
804 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
805 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
807 if (!tmp1 || !tmp2) {
809 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
817 * nr_disks is not constant
822 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
829 static u32 md_csum_fold(u32 csum)
831 csum = (csum & 0xffff) + (csum >> 16);
832 return (csum & 0xffff) + (csum >> 16);
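/*
 * Example of the fold above: 0x12345678 becomes 0x1234 + 0x5678 = 0x68ac,
 * which already fits in 16 bits, so the second addition leaves it unchanged.
 */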
835 static unsigned int calc_sb_csum(mdp_super_t *sb)
838 u32 *sb32 = (u32*)sb;
840 unsigned int disk_csum, csum;
842 disk_csum = sb->sb_csum;
845 for (i = 0; i < MD_SB_BYTES/4 ; i++)
847 csum = (newcsum & 0xffffffff) + (newcsum>>32);
850 /* This used to use csum_partial, which was wrong for several
851 * reasons including that different results are returned on
852 * different architectures. It isn't critical that we get exactly
853 * the same return value as before (we always csum_fold before
854 * testing, and that removes any differences). However as we
855 * know that csum_partial always returned a 16bit value on
856 * alphas, do a fold to maximise conformity to previous behaviour.
858 sb->sb_csum = md_csum_fold(disk_csum);
860 sb->sb_csum = disk_csum;
866 * Handle superblock details.
867 * We want to be able to handle multiple superblock formats
868 * so we have a common interface to them all, and an array of
869 * different handlers.
870 * We rely on user-space to write the initial superblock, and support
871 * reading and updating of superblocks.
872 * Interface methods are:
873 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
874 * loads and validates a superblock on dev.
875 * if refdev != NULL, compare superblocks on both devices
877 * 0 - dev has a superblock that is compatible with refdev
878 * 1 - dev has a superblock that is compatible and newer than refdev
879 * so dev should be used as the refdev in future
880 * -EINVAL superblock incompatible or invalid
881 * -othererror e.g. -EIO
883 * int validate_super(struct mddev *mddev, struct md_rdev *dev)
884 * Verify that dev is acceptable into mddev.
885 * The first time, mddev->raid_disks will be 0, and data from
886 * dev should be merged in. Subsequent calls check that dev
887 * is new enough. Return 0 or -EINVAL
889 * void sync_super(struct mddev *mddev, struct md_rdev *dev)
890 * Update the superblock for rdev with data in mddev
891 * This does not write to disc.
897 struct module *owner;
898 int (*load_super)(struct md_rdev *rdev,
899 struct md_rdev *refdev,
901 int (*validate_super)(struct mddev *mddev,
902 struct md_rdev *rdev);
903 void (*sync_super)(struct mddev *mddev,
904 struct md_rdev *rdev);
905 unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
906 sector_t num_sectors);
907 int (*allow_new_offset)(struct md_rdev *rdev,
908 unsigned long long new_offset);
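/*
 * Sketch of how these handlers are dispatched (simplified from the array
 * assembly and hot-add paths later in this file):
 *
 *	err = super_types[super_format].load_super(rdev, refdev, minor);
 *	if (err >= 0)
 *		err = super_types[mddev->major_version].
 *			validate_super(mddev, rdev);
 */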
912 * Check that the given mddev has no bitmap.
914 * This function is called from the run method of all personalities that do not
915 * support bitmaps. It prints an error message and returns non-zero if mddev
916 * has a bitmap. Otherwise, it returns 0.
919 int md_check_no_bitmap(struct mddev *mddev)
921 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
923 printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
924 mdname(mddev), mddev->pers->name);
927 EXPORT_SYMBOL(md_check_no_bitmap);
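/*
 * Sketch of how a personality without bitmap support uses this check at the
 * start of its ->run() method (raid0 and linear do essentially this;
 * foo_run() is a placeholder name):
 *
 *	static int foo_run(struct mddev *mddev)
 *	{
 *		if (md_check_no_bitmap(mddev))
 *			return -EINVAL;
 *		...
 *	}
 */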
930 * load_super for 0.90.0
932 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
934 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
939 * Calculate the position of the superblock (512byte sectors),
940 * it's at the end of the disk.
942 * It also happens to be a multiple of 4Kb.
944 rdev->sb_start = calc_dev_sboffset(rdev);
946 ret = read_disk_sb(rdev, MD_SB_BYTES);
951 bdevname(rdev->bdev, b);
952 sb = page_address(rdev->sb_page);
954 if (sb->md_magic != MD_SB_MAGIC) {
955 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
960 if (sb->major_version != 0 ||
961 sb->minor_version < 90 ||
962 sb->minor_version > 91) {
963 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
964 sb->major_version, sb->minor_version,
969 if (sb->raid_disks <= 0)
972 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
973 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
978 rdev->preferred_minor = sb->md_minor;
979 rdev->data_offset = 0;
980 rdev->new_data_offset = 0;
981 rdev->sb_size = MD_SB_BYTES;
982 rdev->badblocks.shift = -1;
984 if (sb->level == LEVEL_MULTIPATH)
987 rdev->desc_nr = sb->this_disk.number;
993 mdp_super_t *refsb = page_address(refdev->sb_page);
994 if (!uuid_equal(refsb, sb)) {
995 printk(KERN_WARNING "md: %s has different UUID to %s\n",
996 b, bdevname(refdev->bdev,b2));
999 if (!sb_equal(refsb, sb)) {
1000 printk(KERN_WARNING "md: %s has same UUID"
1001 " but different superblock to %s\n",
1002 b, bdevname(refdev->bdev, b2));
1006 ev2 = md_event(refsb);
1012 rdev->sectors = rdev->sb_start;
1013 /* Limit to 4TB as metadata cannot record more than that.
1014 * (not needed for Linear and RAID0 as metadata doesn't
1017 if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1018 rdev->sectors = (2ULL << 32) - 2;
1020 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1021 /* "this cannot possibly happen" ... */
1029 * validate_super for 0.90.0
1031 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1034 mdp_super_t *sb = page_address(rdev->sb_page);
1035 __u64 ev1 = md_event(sb);
1037 rdev->raid_disk = -1;
1038 clear_bit(Faulty, &rdev->flags);
1039 clear_bit(In_sync, &rdev->flags);
1040 clear_bit(Bitmap_sync, &rdev->flags);
1041 clear_bit(WriteMostly, &rdev->flags);
1043 if (mddev->raid_disks == 0) {
1044 mddev->major_version = 0;
1045 mddev->minor_version = sb->minor_version;
1046 mddev->patch_version = sb->patch_version;
1047 mddev->external = 0;
1048 mddev->chunk_sectors = sb->chunk_size >> 9;
1049 mddev->ctime = sb->ctime;
1050 mddev->utime = sb->utime;
1051 mddev->level = sb->level;
1052 mddev->clevel[0] = 0;
1053 mddev->layout = sb->layout;
1054 mddev->raid_disks = sb->raid_disks;
1055 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1056 mddev->events = ev1;
1057 mddev->bitmap_info.offset = 0;
1058 mddev->bitmap_info.space = 0;
1059 /* bitmap can use 60 K after the 4K superblocks */
1060 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1061 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1062 mddev->reshape_backwards = 0;
1064 if (mddev->minor_version >= 91) {
1065 mddev->reshape_position = sb->reshape_position;
1066 mddev->delta_disks = sb->delta_disks;
1067 mddev->new_level = sb->new_level;
1068 mddev->new_layout = sb->new_layout;
1069 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1070 if (mddev->delta_disks < 0)
1071 mddev->reshape_backwards = 1;
1073 mddev->reshape_position = MaxSector;
1074 mddev->delta_disks = 0;
1075 mddev->new_level = mddev->level;
1076 mddev->new_layout = mddev->layout;
1077 mddev->new_chunk_sectors = mddev->chunk_sectors;
1080 if (sb->state & (1<<MD_SB_CLEAN))
1081 mddev->recovery_cp = MaxSector;
1083 if (sb->events_hi == sb->cp_events_hi &&
1084 sb->events_lo == sb->cp_events_lo) {
1085 mddev->recovery_cp = sb->recovery_cp;
1087 mddev->recovery_cp = 0;
1090 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1091 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1092 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1093 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1095 mddev->max_disks = MD_SB_DISKS;
1097 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1098 mddev->bitmap_info.file == NULL) {
1099 mddev->bitmap_info.offset =
1100 mddev->bitmap_info.default_offset;
1101 mddev->bitmap_info.space =
1102 mddev->bitmap_info.default_space;
1105 } else if (mddev->pers == NULL) {
1106 /* Insist on good event counter while assembling, except
1107 * for spares (which don't need an event count) */
1109 if (sb->disks[rdev->desc_nr].state & (
1110 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1111 if (ev1 < mddev->events)
1113 } else if (mddev->bitmap) {
1114 /* if adding to array with a bitmap, then we can accept an
1115 * older device ... but not too old.
1117 if (ev1 < mddev->bitmap->events_cleared)
1119 if (ev1 < mddev->events)
1120 set_bit(Bitmap_sync, &rdev->flags);
1122 if (ev1 < mddev->events)
1123 /* just a hot-add of a new device, leave raid_disk at -1 */
1127 if (mddev->level != LEVEL_MULTIPATH) {
1128 desc = sb->disks + rdev->desc_nr;
1130 if (desc->state & (1<<MD_DISK_FAULTY))
1131 set_bit(Faulty, &rdev->flags);
1132 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1133 desc->raid_disk < mddev->raid_disks */) {
1134 set_bit(In_sync, &rdev->flags);
1135 rdev->raid_disk = desc->raid_disk;
1136 rdev->saved_raid_disk = desc->raid_disk;
1137 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1138 /* active but not in sync implies recovery up to
1139 * reshape position. We don't know exactly where
1140 * that is, so set to zero for now */
1141 if (mddev->minor_version >= 91) {
1142 rdev->recovery_offset = 0;
1143 rdev->raid_disk = desc->raid_disk;
1146 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1147 set_bit(WriteMostly, &rdev->flags);
1148 } else /* MULTIPATH are always insync */
1149 set_bit(In_sync, &rdev->flags);
1154 * sync_super for 0.90.0
1156 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1159 struct md_rdev *rdev2;
1160 int next_spare = mddev->raid_disks;
1162 /* make rdev->sb match mddev data..
1165 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1166 * 3/ any empty disks < next_spare become removed
1168 * disks[0] gets initialised to REMOVED because
1169 * we cannot be sure from other fields if it has
1170 * been initialised or not.
1173 int active=0, working=0,failed=0,spare=0,nr_disks=0;
1175 rdev->sb_size = MD_SB_BYTES;
1177 sb = page_address(rdev->sb_page);
1179 memset(sb, 0, sizeof(*sb));
1181 sb->md_magic = MD_SB_MAGIC;
1182 sb->major_version = mddev->major_version;
1183 sb->patch_version = mddev->patch_version;
1184 sb->gvalid_words = 0; /* ignored */
1185 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1186 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1187 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1188 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1190 sb->ctime = mddev->ctime;
1191 sb->level = mddev->level;
1192 sb->size = mddev->dev_sectors / 2;
1193 sb->raid_disks = mddev->raid_disks;
1194 sb->md_minor = mddev->md_minor;
1195 sb->not_persistent = 0;
1196 sb->utime = mddev->utime;
1198 sb->events_hi = (mddev->events>>32);
1199 sb->events_lo = (u32)mddev->events;
1201 if (mddev->reshape_position == MaxSector)
1202 sb->minor_version = 90;
1204 sb->minor_version = 91;
1205 sb->reshape_position = mddev->reshape_position;
1206 sb->new_level = mddev->new_level;
1207 sb->delta_disks = mddev->delta_disks;
1208 sb->new_layout = mddev->new_layout;
1209 sb->new_chunk = mddev->new_chunk_sectors << 9;
1211 mddev->minor_version = sb->minor_version;
1214 sb->recovery_cp = mddev->recovery_cp;
1215 sb->cp_events_hi = (mddev->events>>32);
1216 sb->cp_events_lo = (u32)mddev->events;
1217 if (mddev->recovery_cp == MaxSector)
1218 sb->state = (1<< MD_SB_CLEAN);
1220 sb->recovery_cp = 0;
1222 sb->layout = mddev->layout;
1223 sb->chunk_size = mddev->chunk_sectors << 9;
1225 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1226 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1228 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1229 rdev_for_each(rdev2, mddev) {
1232 int is_active = test_bit(In_sync, &rdev2->flags);
1234 if (rdev2->raid_disk >= 0 &&
1235 sb->minor_version >= 91)
1236 /* we have nowhere to store the recovery_offset,
1237 * but if it is not below the reshape_position,
1238 * we can piggy-back on that.
1241 if (rdev2->raid_disk < 0 ||
1242 test_bit(Faulty, &rdev2->flags))
1245 desc_nr = rdev2->raid_disk;
1247 desc_nr = next_spare++;
1248 rdev2->desc_nr = desc_nr;
1249 d = &sb->disks[rdev2->desc_nr];
1251 d->number = rdev2->desc_nr;
1252 d->major = MAJOR(rdev2->bdev->bd_dev);
1253 d->minor = MINOR(rdev2->bdev->bd_dev);
1255 d->raid_disk = rdev2->raid_disk;
1257 d->raid_disk = rdev2->desc_nr; /* compatibility */
1258 if (test_bit(Faulty, &rdev2->flags))
1259 d->state = (1<<MD_DISK_FAULTY);
1260 else if (is_active) {
1261 d->state = (1<<MD_DISK_ACTIVE);
1262 if (test_bit(In_sync, &rdev2->flags))
1263 d->state |= (1<<MD_DISK_SYNC);
1271 if (test_bit(WriteMostly, &rdev2->flags))
1272 d->state |= (1<<MD_DISK_WRITEMOSTLY);
1274 /* now set the "removed" and "faulty" bits on any missing devices */
1275 for (i=0 ; i < mddev->raid_disks ; i++) {
1276 mdp_disk_t *d = &sb->disks[i];
1277 if (d->state == 0 && d->number == 0) {
1280 d->state = (1<<MD_DISK_REMOVED);
1281 d->state |= (1<<MD_DISK_FAULTY);
1285 sb->nr_disks = nr_disks;
1286 sb->active_disks = active;
1287 sb->working_disks = working;
1288 sb->failed_disks = failed;
1289 sb->spare_disks = spare;
1291 sb->this_disk = sb->disks[rdev->desc_nr];
1292 sb->sb_csum = calc_sb_csum(sb);
1296 * rdev_size_change for 0.90.0
1298 static unsigned long long
1299 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1301 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1302 return 0; /* component must fit device */
1303 if (rdev->mddev->bitmap_info.offset)
1304 return 0; /* can't move bitmap */
1305 rdev->sb_start = calc_dev_sboffset(rdev);
1306 if (!num_sectors || num_sectors > rdev->sb_start)
1307 num_sectors = rdev->sb_start;
1308 /* Limit to 4TB as metadata cannot record more than that.
1309 * 4TB == 2^32 KB, or 2*2^32 sectors.
1311 if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1312 num_sectors = (2ULL << 32) - 2;
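/*
 * Illustration of the arithmetic above: 2^32 KB = 2^42 bytes = 4 TiB,
 * which is 2 * 2^32 sectors of 512 bytes, so (2ULL << 32) sectors is the
 * ceiling and the subtraction keeps the size just under it.
 */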
1313 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1315 md_super_wait(rdev->mddev);
1320 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1322 /* non-zero offset changes not possible with v0.90 */
1323 return new_offset == 0;
1327 * version 1 superblock
1330 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1334 unsigned long long newcsum;
1335 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1336 __le32 *isuper = (__le32*)sb;
1338 disk_csum = sb->sb_csum;
1341 for (; size >= 4; size -= 4)
1342 newcsum += le32_to_cpu(*isuper++);
1345 newcsum += le16_to_cpu(*(__le16*) isuper);
1347 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1348 sb->sb_csum = disk_csum;
1349 return cpu_to_le32(csum);
1352 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1354 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1356 struct mdp_superblock_1 *sb;
1360 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1364 * Calculate the position of the superblock in 512byte sectors.
1365 * It is always aligned to a 4K boundary and
1366 * depending on minor_version, it can be:
1367 * 0: At least 8K, but less than 12K, from end of device
1368 * 1: At start of device
1369 * 2: 4K from start of device.
1371 switch(minor_version) {
1373 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1375 sb_start &= ~(sector_t)(4*2-1);
1386 rdev->sb_start = sb_start;
1388 /* superblock is rarely larger than 1K, but it can be larger,
1389 * and it is safe to read 4k, so we do that
1391 ret = read_disk_sb(rdev, 4096);
1392 if (ret) return ret;
1394 sb = page_address(rdev->sb_page);
1396 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1397 sb->major_version != cpu_to_le32(1) ||
1398 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1399 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1400 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1403 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1404 printk("md: invalid superblock checksum on %s\n",
1405 bdevname(rdev->bdev,b));
1408 if (le64_to_cpu(sb->data_size) < 10) {
1409 printk("md: data_size too small on %s\n",
1410 bdevname(rdev->bdev,b));
1415 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1416 /* Some padding is non-zero, might be a new feature */
1419 rdev->preferred_minor = 0xffff;
1420 rdev->data_offset = le64_to_cpu(sb->data_offset);
1421 rdev->new_data_offset = rdev->data_offset;
1422 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1423 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1424 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1425 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1427 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1428 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1429 if (rdev->sb_size & bmask)
1430 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1433 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1436 && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1439 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1442 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1444 if (!rdev->bb_page) {
1445 rdev->bb_page = alloc_page(GFP_KERNEL);
1449 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1450 rdev->badblocks.count == 0) {
1451 /* need to load the bad block list.
1452 * Currently we limit it to one page.
1458 int sectors = le16_to_cpu(sb->bblog_size);
1459 if (sectors > (PAGE_SIZE / 512))
1461 offset = le32_to_cpu(sb->bblog_offset);
1464 bb_sector = (long long)offset;
1465 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1466 rdev->bb_page, READ, true))
1468 bbp = (u64 *)page_address(rdev->bb_page);
1469 rdev->badblocks.shift = sb->bblog_shift;
1470 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1471 u64 bb = le64_to_cpu(*bbp);
1472 int count = bb & (0x3ff);
1473 u64 sector = bb >> 10;
1474 sector <<= sb->bblog_shift;
1475 count <<= sb->bblog_shift;
1478 if (md_set_badblocks(&rdev->badblocks,
1479 sector, count, 1) == 0)
1482 } else if (sb->bblog_offset != 0)
1483 rdev->badblocks.shift = 0;
1489 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1491 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1492 sb->level != refsb->level ||
1493 sb->layout != refsb->layout ||
1494 sb->chunksize != refsb->chunksize) {
1495 printk(KERN_WARNING "md: %s has strangely different"
1496 " superblock to %s\n",
1497 bdevname(rdev->bdev,b),
1498 bdevname(refdev->bdev,b2));
1501 ev1 = le64_to_cpu(sb->events);
1502 ev2 = le64_to_cpu(refsb->events);
1509 if (minor_version) {
1510 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1511 sectors -= rdev->data_offset;
1513 sectors = rdev->sb_start;
1514 if (sectors < le64_to_cpu(sb->data_size))
1516 rdev->sectors = le64_to_cpu(sb->data_size);
1520 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1522 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1523 __u64 ev1 = le64_to_cpu(sb->events);
1525 rdev->raid_disk = -1;
1526 clear_bit(Faulty, &rdev->flags);
1527 clear_bit(In_sync, &rdev->flags);
1528 clear_bit(Bitmap_sync, &rdev->flags);
1529 clear_bit(WriteMostly, &rdev->flags);
1531 if (mddev->raid_disks == 0) {
1532 mddev->major_version = 1;
1533 mddev->patch_version = 0;
1534 mddev->external = 0;
1535 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1536 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1537 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1538 mddev->level = le32_to_cpu(sb->level);
1539 mddev->clevel[0] = 0;
1540 mddev->layout = le32_to_cpu(sb->layout);
1541 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1542 mddev->dev_sectors = le64_to_cpu(sb->size);
1543 mddev->events = ev1;
1544 mddev->bitmap_info.offset = 0;
1545 mddev->bitmap_info.space = 0;
1546 /* Default location for bitmap is 1K after superblock
1547 * using 3K - total of 4K
1549 mddev->bitmap_info.default_offset = 1024 >> 9;
1550 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1551 mddev->reshape_backwards = 0;
1553 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1554 memcpy(mddev->uuid, sb->set_uuid, 16);
1556 mddev->max_disks = (4096-256)/2;
1558 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1559 mddev->bitmap_info.file == NULL) {
1560 mddev->bitmap_info.offset =
1561 (__s32)le32_to_cpu(sb->bitmap_offset);
1562 /* Metadata doesn't record how much space is available.
1563 * For 1.0, we assume we can use up to the superblock
1564 * if before, else to 4K beyond superblock.
1565 * For others, assume no change is possible.
1567 if (mddev->minor_version > 0)
1568 mddev->bitmap_info.space = 0;
1569 else if (mddev->bitmap_info.offset > 0)
1570 mddev->bitmap_info.space =
1571 8 - mddev->bitmap_info.offset;
1573 mddev->bitmap_info.space =
1574 -mddev->bitmap_info.offset;
1577 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1578 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1579 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1580 mddev->new_level = le32_to_cpu(sb->new_level);
1581 mddev->new_layout = le32_to_cpu(sb->new_layout);
1582 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1583 if (mddev->delta_disks < 0 ||
1584 (mddev->delta_disks == 0 &&
1585 (le32_to_cpu(sb->feature_map)
1586 & MD_FEATURE_RESHAPE_BACKWARDS)))
1587 mddev->reshape_backwards = 1;
1589 mddev->reshape_position = MaxSector;
1590 mddev->delta_disks = 0;
1591 mddev->new_level = mddev->level;
1592 mddev->new_layout = mddev->layout;
1593 mddev->new_chunk_sectors = mddev->chunk_sectors;
1596 } else if (mddev->pers == NULL) {
1597 /* Insist on good event counter while assembling, except for
1598 * spares (which don't need an event count) */
1600 if (rdev->desc_nr >= 0 &&
1601 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1602 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
1603 if (ev1 < mddev->events)
1605 } else if (mddev->bitmap) {
1606 /* If adding to array with a bitmap, then we can accept an
1607 * older device, but not too old.
1609 if (ev1 < mddev->bitmap->events_cleared)
1611 if (ev1 < mddev->events)
1612 set_bit(Bitmap_sync, &rdev->flags);
1614 if (ev1 < mddev->events)
1615 /* just a hot-add of a new device, leave raid_disk at -1 */
1618 if (mddev->level != LEVEL_MULTIPATH) {
1620 if (rdev->desc_nr < 0 ||
1621 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1625 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1627 case 0xffff: /* spare */
1629 case 0xfffe: /* faulty */
1630 set_bit(Faulty, &rdev->flags);
1633 rdev->saved_raid_disk = role;
1634 if ((le32_to_cpu(sb->feature_map) &
1635 MD_FEATURE_RECOVERY_OFFSET)) {
1636 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1637 if (!(le32_to_cpu(sb->feature_map) &
1638 MD_FEATURE_RECOVERY_BITMAP))
1639 rdev->saved_raid_disk = -1;
1641 set_bit(In_sync, &rdev->flags);
1642 rdev->raid_disk = role;
1645 if (sb->devflags & WriteMostly1)
1646 set_bit(WriteMostly, &rdev->flags);
1647 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1648 set_bit(Replacement, &rdev->flags);
1649 } else /* MULTIPATH are always insync */
1650 set_bit(In_sync, &rdev->flags);
1655 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1657 struct mdp_superblock_1 *sb;
1658 struct md_rdev *rdev2;
1660 /* make rdev->sb match mddev and rdev data. */
1662 sb = page_address(rdev->sb_page);
1664 sb->feature_map = 0;
1666 sb->recovery_offset = cpu_to_le64(0);
1667 memset(sb->pad3, 0, sizeof(sb->pad3));
1669 sb->utime = cpu_to_le64((__u64)mddev->utime);
1670 sb->events = cpu_to_le64(mddev->events);
1672 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1674 sb->resync_offset = cpu_to_le64(0);
1676 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1678 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1679 sb->size = cpu_to_le64(mddev->dev_sectors);
1680 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1681 sb->level = cpu_to_le32(mddev->level);
1682 sb->layout = cpu_to_le32(mddev->layout);
1684 if (test_bit(WriteMostly, &rdev->flags))
1685 sb->devflags |= WriteMostly1;
1687 sb->devflags &= ~WriteMostly1;
1688 sb->data_offset = cpu_to_le64(rdev->data_offset);
1689 sb->data_size = cpu_to_le64(rdev->sectors);
1691 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1692 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1693 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1696 if (rdev->raid_disk >= 0 &&
1697 !test_bit(In_sync, &rdev->flags)) {
1699 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1700 sb->recovery_offset =
1701 cpu_to_le64(rdev->recovery_offset);
1702 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1704 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1706 if (test_bit(Replacement, &rdev->flags))
1708 cpu_to_le32(MD_FEATURE_REPLACEMENT);
1710 if (mddev->reshape_position != MaxSector) {
1711 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1712 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1713 sb->new_layout = cpu_to_le32(mddev->new_layout);
1714 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1715 sb->new_level = cpu_to_le32(mddev->new_level);
1716 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1717 if (mddev->delta_disks == 0 &&
1718 mddev->reshape_backwards)
1720 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1721 if (rdev->new_data_offset != rdev->data_offset) {
1723 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1724 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1725 - rdev->data_offset));
1729 if (rdev->badblocks.count == 0)
1730 /* Nothing to do for bad blocks*/ ;
1731 else if (sb->bblog_offset == 0)
1732 /* Cannot record bad blocks on this device */
1733 md_error(mddev, rdev);
1735 struct badblocks *bb = &rdev->badblocks;
1736 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1738 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1743 seq = read_seqbegin(&bb->lock);
1745 memset(bbp, 0xff, PAGE_SIZE);
1747 for (i = 0 ; i < bb->count ; i++) {
1748 u64 internal_bb = p[i];
1749 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1750 | BB_LEN(internal_bb));
1751 bbp[i] = cpu_to_le64(store_bb);
1754 if (read_seqretry(&bb->lock, seq))
1757 bb->sector = (rdev->sb_start +
1758 (int)le32_to_cpu(sb->bblog_offset));
1759 bb->size = le16_to_cpu(sb->bblog_size);
1764 rdev_for_each(rdev2, mddev)
1765 if (rdev2->desc_nr+1 > max_dev)
1766 max_dev = rdev2->desc_nr+1;
1768 if (max_dev > le32_to_cpu(sb->max_dev)) {
1770 sb->max_dev = cpu_to_le32(max_dev);
1771 rdev->sb_size = max_dev * 2 + 256;
1772 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1773 if (rdev->sb_size & bmask)
1774 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1776 max_dev = le32_to_cpu(sb->max_dev);
1778 for (i=0; i<max_dev;i++)
1779 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1781 rdev_for_each(rdev2, mddev) {
1783 if (test_bit(Faulty, &rdev2->flags))
1784 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1785 else if (test_bit(In_sync, &rdev2->flags))
1786 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1787 else if (rdev2->raid_disk >= 0)
1788 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1790 sb->dev_roles[i] = cpu_to_le16(0xffff);
1793 sb->sb_csum = calc_sb_1_csum(sb);
1796 static unsigned long long
1797 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1799 struct mdp_superblock_1 *sb;
1800 sector_t max_sectors;
1801 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1802 return 0; /* component must fit device */
1803 if (rdev->data_offset != rdev->new_data_offset)
1804 return 0; /* too confusing */
1805 if (rdev->sb_start < rdev->data_offset) {
1806 /* minor versions 1 and 2; superblock before data */
1807 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1808 max_sectors -= rdev->data_offset;
1809 if (!num_sectors || num_sectors > max_sectors)
1810 num_sectors = max_sectors;
1811 } else if (rdev->mddev->bitmap_info.offset) {
1812 /* minor version 0 with bitmap we can't move */
1815 /* minor version 0; superblock after data */
1817 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1818 sb_start &= ~(sector_t)(4*2 - 1);
1819 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1820 if (!num_sectors || num_sectors > max_sectors)
1821 num_sectors = max_sectors;
1822 rdev->sb_start = sb_start;
1824 sb = page_address(rdev->sb_page);
1825 sb->data_size = cpu_to_le64(num_sectors);
1826 sb->super_offset = rdev->sb_start;
1827 sb->sb_csum = calc_sb_1_csum(sb);
1828 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1830 md_super_wait(rdev->mddev);
1836 super_1_allow_new_offset(struct md_rdev *rdev,
1837 unsigned long long new_offset)
1839 /* All necessary checks on new >= old have been done */
1840 struct bitmap *bitmap;
1841 if (new_offset >= rdev->data_offset)
1844 /* with 1.0 metadata, there is no metadata to tread on
1845 * so we can always move back */
1846 if (rdev->mddev->minor_version == 0)
1849 /* otherwise we must be sure not to step on
1850 * any metadata, so stay:
1851 * 36K beyond start of superblock
1852 * beyond end of badblocks
1853 * beyond write-intent bitmap
1855 if (rdev->sb_start + (32+4)*2 > new_offset)
1857 bitmap = rdev->mddev->bitmap;
1858 if (bitmap && !rdev->mddev->bitmap_info.file &&
1859 rdev->sb_start + rdev->mddev->bitmap_info.offset +
1860 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
1862 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
1868 static struct super_type super_types[] = {
1871 .owner = THIS_MODULE,
1872 .load_super = super_90_load,
1873 .validate_super = super_90_validate,
1874 .sync_super = super_90_sync,
1875 .rdev_size_change = super_90_rdev_size_change,
1876 .allow_new_offset = super_90_allow_new_offset,
1880 .owner = THIS_MODULE,
1881 .load_super = super_1_load,
1882 .validate_super = super_1_validate,
1883 .sync_super = super_1_sync,
1884 .rdev_size_change = super_1_rdev_size_change,
1885 .allow_new_offset = super_1_allow_new_offset,
1889 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
1891 if (mddev->sync_super) {
1892 mddev->sync_super(mddev, rdev);
1896 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
1898 super_types[mddev->major_version].sync_super(mddev, rdev);
1901 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
1903 struct md_rdev *rdev, *rdev2;
1906 rdev_for_each_rcu(rdev, mddev1)
1907 rdev_for_each_rcu(rdev2, mddev2)
1908 if (rdev->bdev->bd_contains ==
1909 rdev2->bdev->bd_contains) {
1917 static LIST_HEAD(pending_raid_disks);
1920 * Try to register data integrity profile for an mddev
1922 * This is called when an array is started and after a disk has been kicked
1923 * from the array. It only succeeds if all working and active component devices
1924 * are integrity capable with matching profiles.
1926 int md_integrity_register(struct mddev *mddev)
1928 struct md_rdev *rdev, *reference = NULL;
1930 if (list_empty(&mddev->disks))
1931 return 0; /* nothing to do */
1932 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
1933 return 0; /* shouldn't register, or already is */
1934 rdev_for_each(rdev, mddev) {
1935 /* skip spares and non-functional disks */
1936 if (test_bit(Faulty, &rdev->flags))
1938 if (rdev->raid_disk < 0)
1941 /* Use the first rdev as the reference */
1945 /* does this rdev's profile match the reference profile? */
1946 if (blk_integrity_compare(reference->bdev->bd_disk,
1947 rdev->bdev->bd_disk) < 0)
1950 if (!reference || !bdev_get_integrity(reference->bdev))
1953 * All component devices are integrity capable and have matching
1954 * profiles, register the common profile for the md device.
1956 if (blk_integrity_register(mddev->gendisk,
1957 bdev_get_integrity(reference->bdev)) != 0) {
1958 printk(KERN_ERR "md: failed to register integrity for %s\n",
1962 printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
1963 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
1964 printk(KERN_ERR "md: failed to create integrity pool for %s\n",
1970 EXPORT_SYMBOL(md_integrity_register);
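/*
 * Personalities typically finish a successful ->run() by registering the
 * common profile, along these lines (sketch):
 *
 *	return md_integrity_register(mddev);
 */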
1972 /* Disable data integrity if non-capable/non-matching disk is being added */
1973 void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
1975 struct blk_integrity *bi_rdev;
1976 struct blk_integrity *bi_mddev;
1978 if (!mddev->gendisk)
1981 bi_rdev = bdev_get_integrity(rdev->bdev);
1982 bi_mddev = blk_get_integrity(mddev->gendisk);
1984 if (!bi_mddev) /* nothing to do */
1986 if (rdev->raid_disk < 0) /* skip spares */
1988 if (bi_rdev && blk_integrity_compare(mddev->gendisk,
1989 rdev->bdev->bd_disk) >= 0)
1991 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
1992 blk_integrity_unregister(mddev->gendisk);
1994 EXPORT_SYMBOL(md_integrity_add_rdev);
1996 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
1998 char b[BDEVNAME_SIZE];
2003 /* prevent duplicates */
2004 if (find_rdev(mddev, rdev->bdev->bd_dev))
2007 /* make sure rdev->sectors exceeds mddev->dev_sectors */
2008 if (rdev->sectors && (mddev->dev_sectors == 0 ||
2009 rdev->sectors < mddev->dev_sectors)) {
2011 /* Cannot change size, so fail
2012 * If mddev->level <= 0, then we don't care
2013 * about aligning sizes (e.g. linear)
2015 if (mddev->level > 0)
2018 mddev->dev_sectors = rdev->sectors;
2021 /* Verify rdev->desc_nr is unique.
2022 * If it is -1, assign a free number, else
2023 * check number is not in use
2026 if (rdev->desc_nr < 0) {
2029 choice = mddev->raid_disks;
2030 while (find_rdev_nr_rcu(mddev, choice))
2032 rdev->desc_nr = choice;
2034 if (find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2040 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2041 printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
2042 mdname(mddev), mddev->max_disks);
2045 bdevname(rdev->bdev,b);
2046 while ( (s=strchr(b, '/')) != NULL)
2049 rdev->mddev = mddev;
2050 printk(KERN_INFO "md: bind<%s>\n", b);
2052 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2055 ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2056 if (sysfs_create_link(&rdev->kobj, ko, "block"))
2057 /* failure here is OK */;
2058 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2060 list_add_rcu(&rdev->same_set, &mddev->disks);
2061 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2063 /* May as well allow recovery to be retried once */
2064 mddev->recovery_disabled++;
2069 printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
2074 static void md_delayed_delete(struct work_struct *ws)
2076 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2077 kobject_del(&rdev->kobj);
2078 kobject_put(&rdev->kobj);
2081 static void unbind_rdev_from_array(struct md_rdev *rdev)
2083 char b[BDEVNAME_SIZE];
2085 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2086 list_del_rcu(&rdev->same_set);
2087 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
2089 sysfs_remove_link(&rdev->kobj, "block");
2090 sysfs_put(rdev->sysfs_state);
2091 rdev->sysfs_state = NULL;
2092 rdev->badblocks.count = 0;
2093 /* We need to delay this, otherwise we can deadlock when
2094 * writing 'remove' to "dev/state". We also need
2095 * to delay it due to rcu usage.
2098 INIT_WORK(&rdev->del_work, md_delayed_delete);
2099 kobject_get(&rdev->kobj);
2100 queue_work(md_misc_wq, &rdev->del_work);
2104 * prevent the device from being mounted, repartitioned or
2105 * otherwise reused by a RAID array (or any other kernel
2106 * subsystem), by bd_claiming the device.
2108 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2111 struct block_device *bdev;
2112 char b[BDEVNAME_SIZE];
2114 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2115 shared ? (struct md_rdev *)lock_rdev : rdev);
2117 printk(KERN_ERR "md: could not open %s.\n",
2118 __bdevname(dev, b));
2119 return PTR_ERR(bdev);
2125 static void unlock_rdev(struct md_rdev *rdev)
2127 struct block_device *bdev = rdev->bdev;
2129 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2132 void md_autodetect_dev(dev_t dev);
2134 static void export_rdev(struct md_rdev *rdev)
2136 char b[BDEVNAME_SIZE];
2138 printk(KERN_INFO "md: export_rdev(%s)\n",
2139 bdevname(rdev->bdev,b));
2140 md_rdev_clear(rdev);
2142 if (test_bit(AutoDetected, &rdev->flags))
2143 md_autodetect_dev(rdev->bdev->bd_dev);
2146 kobject_put(&rdev->kobj);
2149 static void kick_rdev_from_array(struct md_rdev *rdev)
2151 unbind_rdev_from_array(rdev);
2155 static void export_array(struct mddev *mddev)
2157 struct md_rdev *rdev;
2159 while (!list_empty(&mddev->disks)) {
2160 rdev = list_first_entry(&mddev->disks, struct md_rdev,
2162 kick_rdev_from_array(rdev);
2164 mddev->raid_disks = 0;
2165 mddev->major_version = 0;
2168 static void sync_sbs(struct mddev *mddev, int nospares)
2170 /* Update each superblock (in-memory image), but
2171 * if we are allowed to, skip spares which already
2172 * have the right event counter, or have one earlier
2173 * (which would mean they aren't being marked as dirty
2174 * with the rest of the array)
2176 struct md_rdev *rdev;
2177 rdev_for_each(rdev, mddev) {
2178 if (rdev->sb_events == mddev->events ||
2180 rdev->raid_disk < 0 &&
2181 rdev->sb_events+1 == mddev->events)) {
2182 /* Don't update this superblock */
2183 rdev->sb_loaded = 2;
2185 sync_super(mddev, rdev);
2186 rdev->sb_loaded = 1;
2191 static void md_update_sb(struct mddev *mddev, int force_change)
2193 struct md_rdev *rdev;
2196 int any_badblocks_changed = 0;
2200 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2204 /* First make sure individual recovery_offsets are correct */
2205 rdev_for_each(rdev, mddev) {
2206 if (rdev->raid_disk >= 0 &&
2207 mddev->delta_disks >= 0 &&
2208 !test_bit(In_sync, &rdev->flags) &&
2209 mddev->curr_resync_completed > rdev->recovery_offset)
2210 rdev->recovery_offset = mddev->curr_resync_completed;
2213 if (!mddev->persistent) {
2214 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2215 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2216 if (!mddev->external) {
2217 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2218 rdev_for_each(rdev, mddev) {
2219 if (rdev->badblocks.changed) {
2220 rdev->badblocks.changed = 0;
2221 md_ack_all_badblocks(&rdev->badblocks);
2222 md_error(mddev, rdev);
2224 clear_bit(Blocked, &rdev->flags);
2225 clear_bit(BlockedBadBlocks, &rdev->flags);
2226 wake_up(&rdev->blocked_wait);
2229 wake_up(&mddev->sb_wait);
2233 spin_lock_irq(&mddev->write_lock);
2235 mddev->utime = get_seconds();
2237 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2239 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
2240 /* just a clean<->dirty transition, possibly leave spares alone,
2241 * though if events isn't the right even/odd, we will have to do
2247 if (mddev->degraded)
2248 /* If the array is degraded, then skipping spares is both
2249 * dangerous and fairly pointless.
2250 * Dangerous because a device that was removed from the array
2251 * might have an event_count that still looks up-to-date,
2252 * so it can be re-added without a resync.
2253 * Pointless because if there are any spares to skip,
2254 * then a recovery will happen and soon that array won't
2255 * be degraded any more and the spare can go back to sleep then.
2259 sync_req = mddev->in_sync;
2261 /* If this is just a dirty<->clean transition, and the array is clean
2262 * and 'events' is odd, we can roll back to the previous clean state */
2264 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2265 && mddev->can_decrease_events
2266 && mddev->events != 1) {
2268 mddev->can_decrease_events = 0;
2270 /* otherwise we have to go forward and ... */
2272 mddev->can_decrease_events = nospares;
2276 * This 64-bit counter should never wrap.
2277 * Either we are in around ~1 trillion A.C., assuming
2278 * 1 reboot per second, or we have a bug...
2280 WARN_ON(mddev->events == 0);
2282 rdev_for_each(rdev, mddev) {
2283 if (rdev->badblocks.changed)
2284 any_badblocks_changed++;
2285 if (test_bit(Faulty, &rdev->flags))
2286 set_bit(FaultRecorded, &rdev->flags);
2289 sync_sbs(mddev, nospares);
2290 spin_unlock_irq(&mddev->write_lock);
2292 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2293 mdname(mddev), mddev->in_sync);
2295 bitmap_update_sb(mddev->bitmap);
2296 rdev_for_each(rdev, mddev) {
2297 char b[BDEVNAME_SIZE];
2299 if (rdev->sb_loaded != 1)
2300 continue; /* no noise on spare devices */
2302 if (!test_bit(Faulty, &rdev->flags)) {
2303 md_super_write(mddev,rdev,
2304 rdev->sb_start, rdev->sb_size,
2306 pr_debug("md: (write) %s's sb offset: %llu\n",
2307 bdevname(rdev->bdev, b),
2308 (unsigned long long)rdev->sb_start);
2309 rdev->sb_events = mddev->events;
2310 if (rdev->badblocks.size) {
2311 md_super_write(mddev, rdev,
2312 rdev->badblocks.sector,
2313 rdev->badblocks.size << 9,
2315 rdev->badblocks.size = 0;
2319 pr_debug("md: %s (skipping faulty)\n",
2320 bdevname(rdev->bdev, b));
2322 if (mddev->level == LEVEL_MULTIPATH)
2323 /* only need to write one superblock... */
2326 md_super_wait(mddev);
2327 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2329 spin_lock_irq(&mddev->write_lock);
2330 if (mddev->in_sync != sync_req ||
2331 test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2332 /* have to write it out again */
2333 spin_unlock_irq(&mddev->write_lock);
2336 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2337 spin_unlock_irq(&mddev->write_lock);
2338 wake_up(&mddev->sb_wait);
2339 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2340 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2342 rdev_for_each(rdev, mddev) {
2343 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2344 clear_bit(Blocked, &rdev->flags);
2346 if (any_badblocks_changed)
2347 md_ack_all_badblocks(&rdev->badblocks);
2348 clear_bit(BlockedBadBlocks, &rdev->flags);
2349 wake_up(&rdev->blocked_wait);
2353 /* words written to sysfs files may, or may not, be \n terminated.
2354 * We want to accept either case. For this we use cmd_match.
2356 static int cmd_match(const char *cmd, const char *str)
2358 /* See if cmd, written into a sysfs file, matches
2359 * str. They must either be the same, or cmd can
2360 * have a trailing newline
2362 while (*cmd && *str && *cmd == *str) {
2373 struct rdev_sysfs_entry {
2374 struct attribute attr;
2375 ssize_t (*show)(struct md_rdev *, char *);
2376 ssize_t (*store)(struct md_rdev *, const char *, size_t);
2380 state_show(struct md_rdev *rdev, char *page)
2385 if (test_bit(Faulty, &rdev->flags) ||
2386 rdev->badblocks.unacked_exist) {
2387 len+= sprintf(page+len, "%sfaulty",sep);
2390 if (test_bit(In_sync, &rdev->flags)) {
2391 len += sprintf(page+len, "%sin_sync",sep);
2394 if (test_bit(WriteMostly, &rdev->flags)) {
2395 len += sprintf(page+len, "%swrite_mostly",sep);
2398 if (test_bit(Blocked, &rdev->flags) ||
2399 (rdev->badblocks.unacked_exist
2400 && !test_bit(Faulty, &rdev->flags))) {
2401 len += sprintf(page+len, "%sblocked", sep);
2404 if (!test_bit(Faulty, &rdev->flags) &&
2405 !test_bit(In_sync, &rdev->flags)) {
2406 len += sprintf(page+len, "%sspare", sep);
2409 if (test_bit(WriteErrorSeen, &rdev->flags)) {
2410 len += sprintf(page+len, "%swrite_error", sep);
2413 if (test_bit(WantReplacement, &rdev->flags)) {
2414 len += sprintf(page+len, "%swant_replacement", sep);
2417 if (test_bit(Replacement, &rdev->flags)) {
2418 len += sprintf(page+len, "%sreplacement", sep);
2422 return len+sprintf(page+len, "\n");
2426 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2429 * faulty - simulates an error
2430 * remove - disconnects the device
2431 * writemostly - sets write_mostly
2432 * -writemostly - clears write_mostly
2433 * blocked - sets the Blocked flags
2434 * -blocked - clears the Blocked and possibly simulates an error
2435 * insync - sets Insync providing device isn't active
2436 * -insync - clear Insync for a device with a slot assigned,
2437 * so that it gets rebuilt based on bitmap
2438 * write_error - sets WriteErrorSeen
2439 * -write_error - clears WriteErrorSeen
2442 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2443 md_error(rdev->mddev, rdev);
2444 if (test_bit(Faulty, &rdev->flags))
2448 } else if (cmd_match(buf, "remove")) {
2449 if (rdev->raid_disk >= 0)
2452 struct mddev *mddev = rdev->mddev;
2453 kick_rdev_from_array(rdev);
2455 md_update_sb(mddev, 1);
2456 md_new_event(mddev);
2459 } else if (cmd_match(buf, "writemostly")) {
2460 set_bit(WriteMostly, &rdev->flags);
2462 } else if (cmd_match(buf, "-writemostly")) {
2463 clear_bit(WriteMostly, &rdev->flags);
2465 } else if (cmd_match(buf, "blocked")) {
2466 set_bit(Blocked, &rdev->flags);
2468 } else if (cmd_match(buf, "-blocked")) {
2469 if (!test_bit(Faulty, &rdev->flags) &&
2470 rdev->badblocks.unacked_exist) {
2471 /* metadata handler doesn't understand badblocks,
2472 * so we need to fail the device
2474 md_error(rdev->mddev, rdev);
2476 clear_bit(Blocked, &rdev->flags);
2477 clear_bit(BlockedBadBlocks, &rdev->flags);
2478 wake_up(&rdev->blocked_wait);
2479 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2480 md_wakeup_thread(rdev->mddev->thread);
2483 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2484 set_bit(In_sync, &rdev->flags);
2486 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
2487 if (rdev->mddev->pers == NULL) {
2488 clear_bit(In_sync, &rdev->flags);
2489 rdev->saved_raid_disk = rdev->raid_disk;
2490 rdev->raid_disk = -1;
2493 } else if (cmd_match(buf, "write_error")) {
2494 set_bit(WriteErrorSeen, &rdev->flags);
2496 } else if (cmd_match(buf, "-write_error")) {
2497 clear_bit(WriteErrorSeen, &rdev->flags);
2499 } else if (cmd_match(buf, "want_replacement")) {
2500 /* Any non-spare device that is not a replacement can
2501 * become want_replacement at any time, but we then need to
2502 * check if recovery is needed.
2504 if (rdev->raid_disk >= 0 &&
2505 !test_bit(Replacement, &rdev->flags))
2506 set_bit(WantReplacement, &rdev->flags);
2507 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2508 md_wakeup_thread(rdev->mddev->thread);
2510 } else if (cmd_match(buf, "-want_replacement")) {
2511 /* Clearing 'want_replacement' is always allowed.
2512 * Once replacement starts it is too late though.
2515 clear_bit(WantReplacement, &rdev->flags);
2516 } else if (cmd_match(buf, "replacement")) {
2517 /* Can only set a device as a replacement when array has not
2518 * yet been started. Once running, replacement is automatic
2519 * from spares, or by assigning 'slot'.
2521 if (rdev->mddev->pers)
2524 set_bit(Replacement, &rdev->flags);
2527 } else if (cmd_match(buf, "-replacement")) {
2528 /* Similarly, can only clear Replacement before start */
2529 if (rdev->mddev->pers)
2532 clear_bit(Replacement, &rdev->flags);
2537 sysfs_notify_dirent_safe(rdev->sysfs_state);
2538 return err ? err : len;
2540 static struct rdev_sysfs_entry rdev_state =
2541 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
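/*
 * Example usage from user space, assuming an array md0 with a member sda1
 * (per-device attributes live under /sys/block/md0/md/dev-sda1/):
 *
 *   cat /sys/block/md0/md/dev-sda1/state              # e.g. "in_sync" or "faulty,blocked"
 *   echo faulty > /sys/block/md0/md/dev-sda1/state    # simulate an error
 *   echo remove > /sys/block/md0/md/dev-sda1/state    # detach a failed/spare device
 */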
2544 errors_show(struct md_rdev *rdev, char *page)
2546 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2550 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2553 unsigned long n = simple_strtoul(buf, &e, 10);
2554 if (*buf && (*e == 0 || *e == '\n')) {
2555 atomic_set(&rdev->corrected_errors, n);
2560 static struct rdev_sysfs_entry rdev_errors =
2561 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2564 slot_show(struct md_rdev *rdev, char *page)
2566 if (rdev->raid_disk < 0)
2567 return sprintf(page, "none\n");
2569 return sprintf(page, "%d\n", rdev->raid_disk);
2573 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2577 int slot = simple_strtoul(buf, &e, 10);
2578 if (strncmp(buf, "none", 4)==0)
2580 else if (e==buf || (*e && *e!= '\n'))
2582 if (rdev->mddev->pers && slot == -1) {
2583 /* Setting 'slot' on an active array requires also
2584 * updating the 'rd%d' link, and communicating
2585 * with the personality with ->hot_*_disk.
2586 * For now we only support removing
2587 * failed/spare devices. This normally happens automatically,
2588 * but not when the metadata is externally managed.
2590 if (rdev->raid_disk == -1)
2592 /* personality does all needed checks */
2593 if (rdev->mddev->pers->hot_remove_disk == NULL)
2595 clear_bit(Blocked, &rdev->flags);
2596 remove_and_add_spares(rdev->mddev, rdev);
2597 if (rdev->raid_disk >= 0)
2599 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2600 md_wakeup_thread(rdev->mddev->thread);
2601 } else if (rdev->mddev->pers) {
2602 /* Activating a spare .. or possibly reactivating
2603 * if we ever get bitmaps working here.
2606 if (rdev->raid_disk != -1)
2609 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2612 if (rdev->mddev->pers->hot_add_disk == NULL)
2615 if (slot >= rdev->mddev->raid_disks &&
2616 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2619 rdev->raid_disk = slot;
2620 if (test_bit(In_sync, &rdev->flags))
2621 rdev->saved_raid_disk = slot;
2623 rdev->saved_raid_disk = -1;
2624 clear_bit(In_sync, &rdev->flags);
2625 clear_bit(Bitmap_sync, &rdev->flags);
2626 err = rdev->mddev->pers->
2627 hot_add_disk(rdev->mddev, rdev);
2629 rdev->raid_disk = -1;
2632 sysfs_notify_dirent_safe(rdev->sysfs_state);
2633 if (sysfs_link_rdev(rdev->mddev, rdev))
2634 /* failure here is OK */;
2635 /* don't wakeup anyone, leave that to userspace. */
2637 if (slot >= rdev->mddev->raid_disks &&
2638 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2640 rdev->raid_disk = slot;
2641 /* assume it is working */
2642 clear_bit(Faulty, &rdev->flags);
2643 clear_bit(WriteMostly, &rdev->flags);
2644 set_bit(In_sync, &rdev->flags);
2645 sysfs_notify_dirent_safe(rdev->sysfs_state);
2650 static struct rdev_sysfs_entry rdev_slot =
2651 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
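/*
 * Example usage, assuming the same md0/dev-sda1 layout as for "state":
 *
 *   cat  /sys/block/md0/md/dev-sda1/slot           # "none" for a spare, else the slot number
 *   echo none > /sys/block/md0/md/dev-sda1/slot    # hot-remove a failed/spare member
 *   echo 2 > /sys/block/md0/md/dev-sda1/slot       # activate the device in slot 2
 */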
2654 offset_show(struct md_rdev *rdev, char *page)
2656 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2660 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2662 unsigned long long offset;
2663 if (kstrtoull(buf, 10, &offset) < 0)
2665 if (rdev->mddev->pers && rdev->raid_disk >= 0)
2667 if (rdev->sectors && rdev->mddev->external)
2668 /* Must set offset before size, so overlap checks
2671 rdev->data_offset = offset;
2672 rdev->new_data_offset = offset;
2676 static struct rdev_sysfs_entry rdev_offset =
2677 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2679 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
2681 return sprintf(page, "%llu\n",
2682 (unsigned long long)rdev->new_data_offset);
2685 static ssize_t new_offset_store(struct md_rdev *rdev,
2686 const char *buf, size_t len)
2688 unsigned long long new_offset;
2689 struct mddev *mddev = rdev->mddev;
2691 if (kstrtoull(buf, 10, &new_offset) < 0)
2694 if (mddev->sync_thread)
2696 if (new_offset == rdev->data_offset)
2697 /* reset is always permitted */
2699 else if (new_offset > rdev->data_offset) {
2700 /* must not push array size beyond rdev_sectors */
2701 if (new_offset - rdev->data_offset
2702 + mddev->dev_sectors > rdev->sectors)
2705 /* Metadata worries about other space details. */
2707 /* decreasing the offset is inconsistent with a backwards
2710 if (new_offset < rdev->data_offset &&
2711 mddev->reshape_backwards)
2713 /* Increasing offset is inconsistent with forwards
2714 * reshape. reshape_direction should be set to
2715 * 'backwards' first.
2717 if (new_offset > rdev->data_offset &&
2718 !mddev->reshape_backwards)
2721 if (mddev->pers && mddev->persistent &&
2722 !super_types[mddev->major_version]
2723 .allow_new_offset(rdev, new_offset))
2725 rdev->new_data_offset = new_offset;
2726 if (new_offset > rdev->data_offset)
2727 mddev->reshape_backwards = 1;
2728 else if (new_offset < rdev->data_offset)
2729 mddev->reshape_backwards = 0;
2733 static struct rdev_sysfs_entry rdev_new_offset =
2734 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
2737 rdev_size_show(struct md_rdev *rdev, char *page)
2739 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2742 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2744 /* check if two start/length pairs overlap */
2752 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2754 unsigned long long blocks;
2757 if (kstrtoull(buf, 10, &blocks) < 0)
2760 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2761 return -EINVAL; /* sector conversion overflow */
2764 if (new != blocks * 2)
2765 return -EINVAL; /* unsigned long long to sector_t overflow */
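/*
 * The string parsed here is a size in 1K blocks; on success the result is
 * blocks * 2 sectors of 512 bytes.  For example, a sysfs write of "1048576"
 * (1 GiB expressed in 1K blocks) converts to 2097152 sectors.
 */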
2772 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2774 struct mddev *my_mddev = rdev->mddev;
2775 sector_t oldsectors = rdev->sectors;
2778 if (strict_blocks_to_sectors(buf, &sectors) < 0)
2780 if (rdev->data_offset != rdev->new_data_offset)
2781 return -EINVAL; /* too confusing */
2782 if (my_mddev->pers && rdev->raid_disk >= 0) {
2783 if (my_mddev->persistent) {
2784 sectors = super_types[my_mddev->major_version].
2785 rdev_size_change(rdev, sectors);
2788 } else if (!sectors)
2789 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
2791 if (!my_mddev->pers->resize)
2792 /* Cannot change size for RAID0 or Linear etc */
2795 if (sectors < my_mddev->dev_sectors)
2796 return -EINVAL; /* component must fit device */
2798 rdev->sectors = sectors;
2799 if (sectors > oldsectors && my_mddev->external) {
2800 /* Need to check that all other rdevs with the same
2801 * ->bdev do not overlap. 'rcu' is sufficient to walk
2802 * the rdev lists safely.
2803 * This check does not provide a hard guarantee, it
2804 * just helps avoid dangerous mistakes.
2806 struct mddev *mddev;
2808 struct list_head *tmp;
2811 for_each_mddev(mddev, tmp) {
2812 struct md_rdev *rdev2;
2814 rdev_for_each(rdev2, mddev)
2815 if (rdev->bdev == rdev2->bdev &&
2817 overlaps(rdev->data_offset, rdev->sectors,
2830 /* Someone else could have slipped in a size
2831 * change here, but doing so is just silly.
2832 * We put oldsectors back because we *know* it is
2833 * safe, and trust userspace not to race with
2836 rdev->sectors = oldsectors;
2843 static struct rdev_sysfs_entry rdev_size =
2844 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
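/*
 * Example usage, assuming an array md0 with member sda1; the value is in
 * 1K blocks:
 *
 *   cat  /sys/block/md0/md/dev-sda1/size
 *   echo 1048576 > /sys/block/md0/md/dev-sda1/size   # limit the component to 1 GiB
 */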
2846 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
2848 unsigned long long recovery_start = rdev->recovery_offset;
2850 if (test_bit(In_sync, &rdev->flags) ||
2851 recovery_start == MaxSector)
2852 return sprintf(page, "none\n");
2854 return sprintf(page, "%llu\n", recovery_start);
2857 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
2859 unsigned long long recovery_start;
2861 if (cmd_match(buf, "none"))
2862 recovery_start = MaxSector;
2863 else if (kstrtoull(buf, 10, &recovery_start))
2866 if (rdev->mddev->pers &&
2867 rdev->raid_disk >= 0)
2870 rdev->recovery_offset = recovery_start;
2871 if (recovery_start == MaxSector)
2872 set_bit(In_sync, &rdev->flags);
2874 clear_bit(In_sync, &rdev->flags);
2878 static struct rdev_sysfs_entry rdev_recovery_start =
2879 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2882 badblocks_show(struct badblocks *bb, char *page, int unack);
2884 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
2886 static ssize_t bb_show(struct md_rdev *rdev, char *page)
2888 return badblocks_show(&rdev->badblocks, page, 0);
2890 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
2892 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
2893 /* Maybe that ack was all we needed */
2894 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
2895 wake_up(&rdev->blocked_wait);
2898 static struct rdev_sysfs_entry rdev_bad_blocks =
2899 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
2901 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
2903 return badblocks_show(&rdev->badblocks, page, 1);
2905 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
2907 return badblocks_store(&rdev->badblocks, page, len, 1);
2909 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
2910 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
2912 static struct attribute *rdev_default_attrs[] = {
2917 &rdev_new_offset.attr,
2919 &rdev_recovery_start.attr,
2920 &rdev_bad_blocks.attr,
2921 &rdev_unack_bad_blocks.attr,
2925 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2927 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2928 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
2929 struct mddev *mddev = rdev->mddev;
2935 rv = mddev ? mddev_lock(mddev) : -EBUSY;
2937 if (rdev->mddev == NULL)
2940 rv = entry->show(rdev, page);
2941 mddev_unlock(mddev);
2947 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2948 const char *page, size_t length)
2950 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2951 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
2953 struct mddev *mddev = rdev->mddev;
2957 if (!capable(CAP_SYS_ADMIN))
2959 rv = mddev ? mddev_lock(mddev): -EBUSY;
2961 if (rdev->mddev == NULL)
2964 rv = entry->store(rdev, page, length);
2965 mddev_unlock(mddev);
2970 static void rdev_free(struct kobject *ko)
2972 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
2975 static const struct sysfs_ops rdev_sysfs_ops = {
2976 .show = rdev_attr_show,
2977 .store = rdev_attr_store,
2979 static struct kobj_type rdev_ktype = {
2980 .release = rdev_free,
2981 .sysfs_ops = &rdev_sysfs_ops,
2982 .default_attrs = rdev_default_attrs,
2985 int md_rdev_init(struct md_rdev *rdev)
2988 rdev->saved_raid_disk = -1;
2989 rdev->raid_disk = -1;
2991 rdev->data_offset = 0;
2992 rdev->new_data_offset = 0;
2993 rdev->sb_events = 0;
2994 rdev->last_read_error.tv_sec = 0;
2995 rdev->last_read_error.tv_nsec = 0;
2996 rdev->sb_loaded = 0;
2997 rdev->bb_page = NULL;
2998 atomic_set(&rdev->nr_pending, 0);
2999 atomic_set(&rdev->read_errors, 0);
3000 atomic_set(&rdev->corrected_errors, 0);
3002 INIT_LIST_HEAD(&rdev->same_set);
3003 init_waitqueue_head(&rdev->blocked_wait);
3005 /* Add space to store bad block list.
3006 * This reserves the space even on arrays where it cannot
3007 * be used - I wonder if that matters
3009 rdev->badblocks.count = 0;
3010 rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
3011 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
3012 seqlock_init(&rdev->badblocks.lock);
3013 if (rdev->badblocks.page == NULL)
3018 EXPORT_SYMBOL_GPL(md_rdev_init);
3020 * Import a device. If 'super_format' >= 0, then sanity check the superblock
3022 * mark the device faulty if:
3024 * - the device is nonexistent (zero size)
3025 * - the device has no valid superblock
3027 * a faulty rdev _never_ has rdev->sb set.
3029 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3031 char b[BDEVNAME_SIZE];
3033 struct md_rdev *rdev;
3036 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3038 printk(KERN_ERR "md: could not alloc mem for new device!\n");
3039 return ERR_PTR(-ENOMEM);
3042 err = md_rdev_init(rdev);
3045 err = alloc_disk_sb(rdev);
3049 err = lock_rdev(rdev, newdev, super_format == -2);
3053 kobject_init(&rdev->kobj, &rdev_ktype);
3055 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3058 "md: %s has zero or unknown size, marking faulty!\n",
3059 bdevname(rdev->bdev,b));
3064 if (super_format >= 0) {
3065 err = super_types[super_format].
3066 load_super(rdev, NULL, super_minor);
3067 if (err == -EINVAL) {
3069 "md: %s does not have a valid v%d.%d "
3070 "superblock, not importing!\n",
3071 bdevname(rdev->bdev,b),
3072 super_format, super_minor);
3077 "md: could not read %s's sb, not importing!\n",
3078 bdevname(rdev->bdev,b));
3088 md_rdev_clear(rdev);
3090 return ERR_PTR(err);
3094 * Check a full RAID array for plausibility
3097 static void analyze_sbs(struct mddev *mddev)
3100 struct md_rdev *rdev, *freshest, *tmp;
3101 char b[BDEVNAME_SIZE];
3104 rdev_for_each_safe(rdev, tmp, mddev)
3105 switch (super_types[mddev->major_version].
3106 load_super(rdev, freshest, mddev->minor_version)) {
3114 "md: fatal superblock inconsistency in %s"
3115 " -- removing from array\n",
3116 bdevname(rdev->bdev,b));
3117 kick_rdev_from_array(rdev);
3120 super_types[mddev->major_version].
3121 validate_super(mddev, freshest);
3124 rdev_for_each_safe(rdev, tmp, mddev) {
3125 if (mddev->max_disks &&
3126 (rdev->desc_nr >= mddev->max_disks ||
3127 i > mddev->max_disks)) {
3129 "md: %s: %s: only %d devices permitted\n",
3130 mdname(mddev), bdevname(rdev->bdev, b),
3132 kick_rdev_from_array(rdev);
3135 if (rdev != freshest)
3136 if (super_types[mddev->major_version].
3137 validate_super(mddev, rdev)) {
3138 printk(KERN_WARNING "md: kicking non-fresh %s"
3140 bdevname(rdev->bdev,b));
3141 kick_rdev_from_array(rdev);
3144 if (mddev->level == LEVEL_MULTIPATH) {
3145 rdev->desc_nr = i++;
3146 rdev->raid_disk = rdev->desc_nr;
3147 set_bit(In_sync, &rdev->flags);
3148 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
3149 rdev->raid_disk = -1;
3150 clear_bit(In_sync, &rdev->flags);
3155 /* Read a fixed-point number.
3156 * Numbers in sysfs attributes should be in "standard" units where
3157 * possible, so time should be in seconds.
3158 * However we internally use a much smaller unit such as
3159 * milliseconds or jiffies.
3160 * This function takes a decimal number with a possible fractional
3161 * component, and produces an integer which is the result of
3162 * multiplying that number by 10^'scale',
3163 * all without any floating-point arithmetic.
3165 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3167 unsigned long result = 0;
3169 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3172 else if (decimals < scale) {
3175 result = result * 10 + value;
3187 while (decimals < scale) {
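/*
 * Worked example: strict_strtoul_scaled("3.25", &res, 3) stores 3250 in res
 * (3.25 * 10^3), and strict_strtoul_scaled("20", &res, 3) stores 20000.
 * safe_delay_store below relies on this with scale 3 to turn a value given
 * in seconds into milliseconds before converting to jiffies.
 */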
3195 static void md_safemode_timeout(unsigned long data);
3198 safe_delay_show(struct mddev *mddev, char *page)
3200 int msec = (mddev->safemode_delay*1000)/HZ;
3201 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3204 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3208 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3211 mddev->safemode_delay = 0;
3213 unsigned long old_delay = mddev->safemode_delay;
3214 mddev->safemode_delay = (msec*HZ)/1000;
3215 if (mddev->safemode_delay == 0)
3216 mddev->safemode_delay = 1;
3217 if (mddev->safemode_delay < old_delay || old_delay == 0)
3218 md_safemode_timeout((unsigned long)mddev);
3222 static struct md_sysfs_entry md_safe_delay =
3223 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
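/*
 * Example usage, assuming an array md0:
 *
 *   echo 0.200 > /sys/block/md0/md/safe_mode_delay   # mark the array clean ~200ms after the last write
 *   echo 0     > /sys/block/md0/md/safe_mode_delay   # disable the safe-mode timer
 */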
3226 level_show(struct mddev *mddev, char *page)
3228 struct md_personality *p = mddev->pers;
3230 return sprintf(page, "%s\n", p->name);
3231 else if (mddev->clevel[0])
3232 return sprintf(page, "%s\n", mddev->clevel);
3233 else if (mddev->level != LEVEL_NONE)
3234 return sprintf(page, "%d\n", mddev->level);
3240 level_store(struct mddev *mddev, const char *buf, size_t len)
3244 struct md_personality *pers;
3247 struct md_rdev *rdev;
3249 if (mddev->pers == NULL) {
3252 if (len >= sizeof(mddev->clevel))
3254 strncpy(mddev->clevel, buf, len);
3255 if (mddev->clevel[len-1] == '\n')
3257 mddev->clevel[len] = 0;
3258 mddev->level = LEVEL_NONE;
3264 /* request to change the personality. Need to ensure:
3265 * - array is not engaged in resync/recovery/reshape
3266 * - old personality can be suspended
3267 * - new personality will access other array.
3270 if (mddev->sync_thread ||
3271 mddev->reshape_position != MaxSector ||
3272 mddev->sysfs_active)
3275 if (!mddev->pers->quiesce) {
3276 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
3277 mdname(mddev), mddev->pers->name);
3281 /* Now find the new personality */
3282 if (len == 0 || len >= sizeof(clevel))
3284 strncpy(clevel, buf, len);
3285 if (clevel[len-1] == '\n')
3288 if (kstrtol(clevel, 10, &level))
3291 if (request_module("md-%s", clevel) != 0)
3292 request_module("md-level-%s", clevel);
3293 spin_lock(&pers_lock);
3294 pers = find_pers(level, clevel);
3295 if (!pers || !try_module_get(pers->owner)) {
3296 spin_unlock(&pers_lock);
3297 printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
3300 spin_unlock(&pers_lock);
3302 if (pers == mddev->pers) {
3303 /* Nothing to do! */
3304 module_put(pers->owner);
3307 if (!pers->takeover) {
3308 module_put(pers->owner);
3309 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
3310 mdname(mddev), clevel);
3314 rdev_for_each(rdev, mddev)
3315 rdev->new_raid_disk = rdev->raid_disk;
3317 /* ->takeover must set new_* and/or delta_disks
3318 * if it succeeds, and may set them when it fails.
3320 priv = pers->takeover(mddev);
3322 mddev->new_level = mddev->level;
3323 mddev->new_layout = mddev->layout;
3324 mddev->new_chunk_sectors = mddev->chunk_sectors;
3325 mddev->raid_disks -= mddev->delta_disks;
3326 mddev->delta_disks = 0;
3327 mddev->reshape_backwards = 0;
3328 module_put(pers->owner);
3329 printk(KERN_WARNING "md: %s: %s would not accept array\n",
3330 mdname(mddev), clevel);
3331 return PTR_ERR(priv);
3334 /* Looks like we have a winner */
3335 mddev_suspend(mddev);
3336 mddev->pers->stop(mddev);
3338 if (mddev->pers->sync_request == NULL &&
3339 pers->sync_request != NULL) {
3340 /* need to add the md_redundancy_group */
3341 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3343 "md: cannot register extra attributes for %s\n",
3345 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3347 if (mddev->pers->sync_request != NULL &&
3348 pers->sync_request == NULL) {
3349 /* need to remove the md_redundancy_group */
3350 if (mddev->to_remove == NULL)
3351 mddev->to_remove = &md_redundancy_group;
3354 if (mddev->pers->sync_request == NULL &&
3356 /* We are converting from a no-redundancy array
3357 * to a redundancy array and metadata is managed
3358 * externally so we need to be sure that writes
3359 * won't block due to a need to transition
3361 * until external management is started.
3364 mddev->safemode_delay = 0;
3365 mddev->safemode = 0;
3368 rdev_for_each(rdev, mddev) {
3369 if (rdev->raid_disk < 0)
3371 if (rdev->new_raid_disk >= mddev->raid_disks)
3372 rdev->new_raid_disk = -1;
3373 if (rdev->new_raid_disk == rdev->raid_disk)
3375 sysfs_unlink_rdev(mddev, rdev);
3377 rdev_for_each(rdev, mddev) {
3378 if (rdev->raid_disk < 0)
3380 if (rdev->new_raid_disk == rdev->raid_disk)
3382 rdev->raid_disk = rdev->new_raid_disk;
3383 if (rdev->raid_disk < 0)
3384 clear_bit(In_sync, &rdev->flags);
3386 if (sysfs_link_rdev(mddev, rdev))
3387 printk(KERN_WARNING "md: cannot register rd%d"
3388 " for %s after level change\n",
3389 rdev->raid_disk, mdname(mddev));
3393 module_put(mddev->pers->owner);
3395 mddev->private = priv;
3396 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3397 mddev->level = mddev->new_level;
3398 mddev->layout = mddev->new_layout;
3399 mddev->chunk_sectors = mddev->new_chunk_sectors;
3400 mddev->delta_disks = 0;
3401 mddev->reshape_backwards = 0;
3402 mddev->degraded = 0;
3403 if (mddev->pers->sync_request == NULL) {
3404 /* this is now an array without redundancy, so
3405 * it must always be in_sync
3408 del_timer_sync(&mddev->safemode_timer);
3410 blk_set_stacking_limits(&mddev->queue->limits);
3412 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3413 mddev_resume(mddev);
3415 md_update_sb(mddev, 1);
3416 sysfs_notify(&mddev->kobj, NULL, "level");
3417 md_new_event(mddev);
3421 static struct md_sysfs_entry md_level =
3422 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
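/*
 * Example usage, assuming an array md0.  Writing a personality name to an
 * active array attempts an online takeover via pers->takeover(); which
 * conversions are accepted depends on the target personality:
 *
 *   cat  /sys/block/md0/md/level        # e.g. "raid0"
 *   echo raid5 > /sys/block/md0/md/level
 */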
3425 layout_show(struct mddev *mddev, char *page)
3427 /* just a number, not meaningful for all levels */
3428 if (mddev->reshape_position != MaxSector &&
3429 mddev->layout != mddev->new_layout)
3430 return sprintf(page, "%d (%d)\n",
3431 mddev->new_layout, mddev->layout);
3432 return sprintf(page, "%d\n", mddev->layout);
3436 layout_store(struct mddev *mddev, const char *buf, size_t len)
3439 unsigned long n = simple_strtoul(buf, &e, 10);
3441 if (!*buf || (*e && *e != '\n'))
3446 if (mddev->pers->check_reshape == NULL)
3450 mddev->new_layout = n;
3451 err = mddev->pers->check_reshape(mddev);
3453 mddev->new_layout = mddev->layout;
3457 mddev->new_layout = n;
3458 if (mddev->reshape_position == MaxSector)
3463 static struct md_sysfs_entry md_layout =
3464 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3467 raid_disks_show(struct mddev *mddev, char *page)
3469 if (mddev->raid_disks == 0)
3471 if (mddev->reshape_position != MaxSector &&
3472 mddev->delta_disks != 0)
3473 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3474 mddev->raid_disks - mddev->delta_disks);
3475 return sprintf(page, "%d\n", mddev->raid_disks);
3478 static int update_raid_disks(struct mddev *mddev, int raid_disks);
3481 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3485 unsigned long n = simple_strtoul(buf, &e, 10);
3487 if (!*buf || (*e && *e != '\n'))
3491 rv = update_raid_disks(mddev, n);
3492 else if (mddev->reshape_position != MaxSector) {
3493 struct md_rdev *rdev;
3494 int olddisks = mddev->raid_disks - mddev->delta_disks;
3496 rdev_for_each(rdev, mddev) {
3498 rdev->data_offset < rdev->new_data_offset)
3501 rdev->data_offset > rdev->new_data_offset)
3504 mddev->delta_disks = n - olddisks;
3505 mddev->raid_disks = n;
3506 mddev->reshape_backwards = (mddev->delta_disks < 0);
3508 mddev->raid_disks = n;
3509 return rv ? rv : len;
3511 static struct md_sysfs_entry md_raid_disks =
3512 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
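/*
 * Example usage, assuming an array md0.  On a running array the change goes
 * through update_raid_disks() and the personality's reshape support; on an
 * inactive array the new count is simply recorded:
 *
 *   echo 5 > /sys/block/md0/md/raid_disks
 */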
3515 chunk_size_show(struct mddev *mddev, char *page)
3517 if (mddev->reshape_position != MaxSector &&
3518 mddev->chunk_sectors != mddev->new_chunk_sectors)
3519 return sprintf(page, "%d (%d)\n",
3520 mddev->new_chunk_sectors << 9,
3521 mddev->chunk_sectors << 9);
3522 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3526 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
3529 unsigned long n = simple_strtoul(buf, &e, 10);
3531 if (!*buf || (*e && *e != '\n'))
3536 if (mddev->pers->check_reshape == NULL)
3540 mddev->new_chunk_sectors = n >> 9;
3541 err = mddev->pers->check_reshape(mddev);
3543 mddev->new_chunk_sectors = mddev->chunk_sectors;
3547 mddev->new_chunk_sectors = n >> 9;
3548 if (mddev->reshape_position == MaxSector)
3549 mddev->chunk_sectors = n >> 9;
3553 static struct md_sysfs_entry md_chunk_size =
3554 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
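/*
 * Example usage, assuming an array md0; the value is in bytes and is stored
 * internally as 512-byte sectors (n >> 9):
 *
 *   cat  /sys/block/md0/md/chunk_size      # e.g. "524288"
 *   echo 524288 > /sys/block/md0/md/chunk_size
 */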
3557 resync_start_show(struct mddev *mddev, char *page)
3559 if (mddev->recovery_cp == MaxSector)
3560 return sprintf(page, "none\n");
3561 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
3565 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
3568 unsigned long long n = simple_strtoull(buf, &e, 10);
3570 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3572 if (cmd_match(buf, "none"))
3574 else if (!*buf || (*e && *e != '\n'))
3577 mddev->recovery_cp = n;
3579 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3582 static struct md_sysfs_entry md_resync_start =
3583 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
3586 * The array state can be:
3589 * No devices, no size, no level
3590 * Equivalent to STOP_ARRAY ioctl
3592 * May have some settings, but array is not active
3593 * all IO results in error
3594 * When written, doesn't tear down array, but just stops it
3595 * suspended (not supported yet)
3596 * All IO requests will block. The array can be reconfigured.
3597 * Writing this, if accepted, will block until array is quiescent
3599 * no resync can happen. no superblocks get written.
3600 * write requests fail
3602 * like readonly, but behaves like 'clean' on a write request.
3604 * clean - no pending writes, but otherwise active.
3605 * When written to inactive array, starts without resync
3606 * If a write request arrives then
3607 * if metadata is known, mark 'dirty' and switch to 'active'.
3608 * if not known, block and switch to write-pending
3609 * If written to an active array that has pending writes, then fails.
3611 * fully active: IO and resync can be happening.
3612 * When written to inactive array, starts with resync
3615 * clean, but writes are blocked waiting for 'active' to be written.
3618 * like active, but no writes have been seen for a while (100msec).
3621 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
3622 write_pending, active_idle, bad_word};
3623 static char *array_states[] = {
3624 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
3625 "write-pending", "active-idle", NULL };
3627 static int match_word(const char *word, char **list)
3630 for (n=0; list[n]; n++)
3631 if (cmd_match(word, list[n]))
3637 array_state_show(struct mddev *mddev, char *page)
3639 enum array_state st = inactive;
3652 else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
3654 else if (mddev->safemode)
3660 if (list_empty(&mddev->disks) &&
3661 mddev->raid_disks == 0 &&
3662 mddev->dev_sectors == 0)
3667 return sprintf(page, "%s\n", array_states[st]);
3670 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
3671 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
3672 static int do_md_run(struct mddev *mddev);
3673 static int restart_array(struct mddev *mddev);
3676 array_state_store(struct mddev *mddev, const char *buf, size_t len)
3679 enum array_state st = match_word(buf, array_states);
3684 /* stopping an active array */
3685 err = do_md_stop(mddev, 0, NULL);
3688 /* stopping an active array */
3690 err = do_md_stop(mddev, 2, NULL);
3692 err = 0; /* already inactive */
3695 break; /* not supported yet */
3698 err = md_set_readonly(mddev, NULL);
3701 set_disk_ro(mddev->gendisk, 1);
3702 err = do_md_run(mddev);
3708 err = md_set_readonly(mddev, NULL);
3709 else if (mddev->ro == 1)
3710 err = restart_array(mddev);
3713 set_disk_ro(mddev->gendisk, 0);
3717 err = do_md_run(mddev);
3722 restart_array(mddev);
3723 spin_lock_irq(&mddev->write_lock);
3724 if (atomic_read(&mddev->writes_pending) == 0) {
3725 if (mddev->in_sync == 0) {
3727 if (mddev->safemode == 1)
3728 mddev->safemode = 0;
3729 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3734 spin_unlock_irq(&mddev->write_lock);
3740 restart_array(mddev);
3741 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
3742 wake_up(&mddev->sb_wait);
3746 set_disk_ro(mddev->gendisk, 0);
3747 err = do_md_run(mddev);
3752 /* these cannot be set */
3758 if (mddev->hold_active == UNTIL_IOCTL)
3759 mddev->hold_active = 0;
3760 sysfs_notify_dirent_safe(mddev->sysfs_state);
3764 static struct md_sysfs_entry md_array_state =
3765 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
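/*
 * Example usage, assuming an array md0:
 *
 *   cat  /sys/block/md0/md/array_state              # e.g. "clean" or "active"
 *   echo readonly > /sys/block/md0/md/array_state
 *   echo inactive > /sys/block/md0/md/array_state   # stop the array without tearing it down
 */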
3768 max_corrected_read_errors_show(struct mddev *mddev, char *page) {
3769 return sprintf(page, "%d\n",
3770 atomic_read(&mddev->max_corr_read_errors));
3774 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
3777 unsigned long n = simple_strtoul(buf, &e, 10);
3779 if (*buf && (*e == 0 || *e == '\n')) {
3780 atomic_set(&mddev->max_corr_read_errors, n);
3786 static struct md_sysfs_entry max_corr_read_errors =
3787 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3788 max_corrected_read_errors_store);
3791 null_show(struct mddev *mddev, char *page)
3797 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
3799 /* buf must be %d:%d\n? giving major and minor numbers */
3800 /* The new device is added to the array.
3801 * If the array has a persistent superblock, we read the
3802 * superblock to initialise info and check validity.
3803 * Otherwise, the only checking done is that in bind_rdev_to_array,
3804 * which mainly checks size.
3807 int major = simple_strtoul(buf, &e, 10);
3810 struct md_rdev *rdev;
3813 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3815 minor = simple_strtoul(e+1, &e, 10);
3816 if (*e && *e != '\n')
3818 dev = MKDEV(major, minor);
3819 if (major != MAJOR(dev) ||
3820 minor != MINOR(dev))
3823 if (mddev->persistent) {
3824 rdev = md_import_device(dev, mddev->major_version,
3825 mddev->minor_version);
3826 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3827 struct md_rdev *rdev0
3828 = list_entry(mddev->disks.next,
3829 struct md_rdev, same_set);
3830 err = super_types[mddev->major_version]
3831 .load_super(rdev, rdev0, mddev->minor_version);
3835 } else if (mddev->external)
3836 rdev = md_import_device(dev, -2, -1);
3838 rdev = md_import_device(dev, -1, -1);
3841 return PTR_ERR(rdev);
3842 err = bind_rdev_to_array(rdev, mddev);
3846 return err ? err : len;
3849 static struct md_sysfs_entry md_new_device =
3850 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
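/*
 * Example usage, assuming an array md0; the value is the "major:minor" pair
 * of the block device to add (8:16 is typically /dev/sdb):
 *
 *   echo 8:16 > /sys/block/md0/md/new_dev
 */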
3853 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
3856 unsigned long chunk, end_chunk;
3860 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
3862 chunk = end_chunk = simple_strtoul(buf, &end, 0);
3863 if (buf == end) break;
3864 if (*end == '-') { /* range */
3866 end_chunk = simple_strtoul(buf, &end, 0);
3867 if (buf == end) break;
3869 if (*end && !isspace(*end)) break;
3870 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3871 buf = skip_spaces(end);
3873 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3878 static struct md_sysfs_entry md_bitmap =
3879 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
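/*
 * Example usage, assuming an array md0 that has a write-intent bitmap;
 * chunks may be listed individually or as ranges:
 *
 *   echo "0-127" > /sys/block/md0/md/bitmap_set_bits
 */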
3882 size_show(struct mddev *mddev, char *page)
3884 return sprintf(page, "%llu\n",
3885 (unsigned long long)mddev->dev_sectors / 2);
3888 static int update_size(struct mddev *mddev, sector_t num_sectors);
3891 size_store(struct mddev *mddev, const char *buf, size_t len)
3893 /* If array is inactive, we can reduce the component size, but
3894 * not increase it (except from 0).
3895 * If array is active, we can try an on-line resize
3898 int err = strict_blocks_to_sectors(buf, &sectors);
3903 err = update_size(mddev, sectors);
3904 md_update_sb(mddev, 1);
3906 if (mddev->dev_sectors == 0 ||
3907 mddev->dev_sectors > sectors)
3908 mddev->dev_sectors = sectors;
3912 return err ? err : len;
3915 static struct md_sysfs_entry md_size =
3916 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3918 /* Metadata version.
3920 * 'none' for arrays with no metadata (good luck...)
3921 * 'external' for arrays with externally managed metadata,
3922 * or N.M for internally known formats
3925 metadata_show(struct mddev *mddev, char *page)
3927 if (mddev->persistent)
3928 return sprintf(page, "%d.%d\n",
3929 mddev->major_version, mddev->minor_version);
3930 else if (mddev->external)
3931 return sprintf(page, "external:%s\n", mddev->metadata_type);
3933 return sprintf(page, "none\n");
3937 metadata_store(struct mddev *mddev, const char *buf, size_t len)
3941 /* Changing the details of 'external' metadata is
3942 * always permitted. Otherwise there must be
3943 * no devices attached to the array.
3945 if (mddev->external && strncmp(buf, "external:", 9) == 0)
3947 else if (!list_empty(&mddev->disks))
3950 if (cmd_match(buf, "none")) {
3951 mddev->persistent = 0;
3952 mddev->external = 0;
3953 mddev->major_version = 0;
3954 mddev->minor_version = 90;
3957 if (strncmp(buf, "external:", 9) == 0) {
3958 size_t namelen = len-9;
3959 if (namelen >= sizeof(mddev->metadata_type))
3960 namelen = sizeof(mddev->metadata_type)-1;
3961 strncpy(mddev->metadata_type, buf+9, namelen);
3962 mddev->metadata_type[namelen] = 0;
3963 if (namelen && mddev->metadata_type[namelen-1] == '\n')
3964 mddev->metadata_type[--namelen] = 0;
3965 mddev->persistent = 0;
3966 mddev->external = 1;
3967 mddev->major_version = 0;
3968 mddev->minor_version = 90;
3971 major = simple_strtoul(buf, &e, 10);
3972 if (e==buf || *e != '.')
3975 minor = simple_strtoul(buf, &e, 10);
3976 if (e==buf || (*e && *e != '\n') )
3978 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
3980 mddev->major_version = major;
3981 mddev->minor_version = minor;
3982 mddev->persistent = 1;
3983 mddev->external = 0;
3987 static struct md_sysfs_entry md_metadata =
3988 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
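/*
 * Example usage, assuming an array md0 with no devices attached yet:
 *
 *   echo 1.2 > /sys/block/md0/md/metadata_version            # internal v1.2 superblocks
 *   echo external:imsm > /sys/block/md0/md/metadata_version  # externally managed metadata
 *   echo none > /sys/block/md0/md/metadata_version           # no persistent superblock
 */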
3991 action_show(struct mddev *mddev, char *page)
3993 char *type = "idle";
3994 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3996 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3997 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
3998 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4000 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4001 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
4003 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
4007 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
4010 return sprintf(page, "%s\n", type);
4014 action_store(struct mddev *mddev, const char *page, size_t len)
4016 if (!mddev->pers || !mddev->pers->sync_request)
4019 if (cmd_match(page, "frozen"))
4020 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4022 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4024 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4025 if (mddev->sync_thread) {
4026 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4027 md_reap_sync_thread(mddev);
4029 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4030 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
4032 else if (cmd_match(page, "resync"))
4033 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4034 else if (cmd_match(page, "recover")) {
4035 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4036 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4037 } else if (cmd_match(page, "reshape")) {
4039 if (mddev->pers->start_reshape == NULL)
4041 err = mddev->pers->start_reshape(mddev);
4044 sysfs_notify(&mddev->kobj, NULL, "degraded");
4046 if (cmd_match(page, "check"))
4047 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4048 else if (!cmd_match(page, "repair"))
4050 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4051 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4053 if (mddev->ro == 2) {
4054 /* A write to sync_action is enough to justify
4055 * canceling read-auto mode
4058 md_wakeup_thread(mddev->sync_thread);
4060 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4061 md_wakeup_thread(mddev->thread);
4062 sysfs_notify_dirent_safe(mddev->sysfs_action);
4066 static struct md_sysfs_entry md_scan_mode =
4067 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
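/*
 * Example usage, assuming a redundant array md0:
 *
 *   echo check  > /sys/block/md0/md/sync_action   # read-only scrub; differences counted in mismatch_cnt
 *   echo repair > /sys/block/md0/md/sync_action   # scrub and correct inconsistencies
 *   echo idle   > /sys/block/md0/md/sync_action   # interrupt the current sync/recovery
 */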
4070 last_sync_action_show(struct mddev *mddev, char *page)
4072 return sprintf(page, "%s\n", mddev->last_sync_action);
4075 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4078 mismatch_cnt_show(struct mddev *mddev, char *page)
4080 return sprintf(page, "%llu\n",
4081 (unsigned long long)
4082 atomic64_read(&mddev->resync_mismatches));
4085 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4088 sync_min_show(struct mddev *mddev, char *page)
4090 return sprintf(page, "%d (%s)\n", speed_min(mddev),
4091 mddev->sync_speed_min ? "local": "system");
4095 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4099 if (strncmp(buf, "system", 6)==0) {
4100 mddev->sync_speed_min = 0;
4103 min = simple_strtoul(buf, &e, 10);
4104 if (buf == e || (*e && *e != '\n') || min <= 0)
4106 mddev->sync_speed_min = min;
4110 static struct md_sysfs_entry md_sync_min =
4111 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4114 sync_max_show(struct mddev *mddev, char *page)
4116 return sprintf(page, "%d (%s)\n", speed_max(mddev),
4117 mddev->sync_speed_max ? "local": "system");
4121 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4125 if (strncmp(buf, "system", 6)==0) {
4126 mddev->sync_speed_max = 0;
4129 max = simple_strtoul(buf, &e, 10);
4130 if (buf == e || (*e && *e != '\n') || max <= 0)
4132 mddev->sync_speed_max = max;
4136 static struct md_sysfs_entry md_sync_max =
4137 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4140 degraded_show(struct mddev *mddev, char *page)
4142 return sprintf(page, "%d\n", mddev->degraded);
4144 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4147 sync_force_parallel_show(struct mddev *mddev, char *page)
4149 return sprintf(page, "%d\n", mddev->parallel_resync);
4153 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4157 if (kstrtol(buf, 10, &n))
4160 if (n != 0 && n != 1)
4163 mddev->parallel_resync = n;
4165 if (mddev->sync_thread)
4166 wake_up(&resync_wait);
4171 /* force parallel resync, even with shared block devices */
4172 static struct md_sysfs_entry md_sync_force_parallel =
4173 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4174 sync_force_parallel_show, sync_force_parallel_store);
4177 sync_speed_show(struct mddev *mddev, char *page)
4179 unsigned long resync, dt, db;
4180 if (mddev->curr_resync == 0)
4181 return sprintf(page, "none\n");
4182 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4183 dt = (jiffies - mddev->resync_mark) / HZ;
4185 db = resync - mddev->resync_mark_cnt;
4186 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4189 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4192 sync_completed_show(struct mddev *mddev, char *page)
4194 unsigned long long max_sectors, resync;
4196 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4197 return sprintf(page, "none\n");
4199 if (mddev->curr_resync == 1 ||
4200 mddev->curr_resync == 2)
4201 return sprintf(page, "delayed\n");
4203 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4204 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4205 max_sectors = mddev->resync_max_sectors;
4207 max_sectors = mddev->dev_sectors;
4209 resync = mddev->curr_resync_completed;
4210 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4213 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
4216 min_sync_show(struct mddev *mddev, char *page)
4218 return sprintf(page, "%llu\n",
4219 (unsigned long long)mddev->resync_min);
4222 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4224 unsigned long long min;
4225 if (kstrtoull(buf, 10, &min))
4227 if (min > mddev->resync_max)
4229 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4232 /* Must be a multiple of chunk_size */
4233 if (mddev->chunk_sectors) {
4234 sector_t temp = min;
4235 if (sector_div(temp, mddev->chunk_sectors))
4238 mddev->resync_min = min;
4243 static struct md_sysfs_entry md_min_sync =
4244 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4247 max_sync_show(struct mddev *mddev, char *page)
4249 if (mddev->resync_max == MaxSector)
4250 return sprintf(page, "max\n");
4252 return sprintf(page, "%llu\n",
4253 (unsigned long long)mddev->resync_max);
4256 max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4258 if (strncmp(buf, "max", 3) == 0)
4259 mddev->resync_max = MaxSector;
4261 unsigned long long max;
4262 if (kstrtoull(buf, 10, &max))
4264 if (max < mddev->resync_min)
4266 if (max < mddev->resync_max &&
4268 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4271 /* Must be a multiple of chunk_size */
4272 if (mddev->chunk_sectors) {
4273 sector_t temp = max;
4274 if (sector_div(temp, mddev->chunk_sectors))
4277 mddev->resync_max = max;
4279 wake_up(&mddev->recovery_wait);
4283 static struct md_sysfs_entry md_max_sync =
4284 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4287 suspend_lo_show(struct mddev *mddev, char *page)
4289 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4293 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4296 unsigned long long new = simple_strtoull(buf, &e, 10);
4297 unsigned long long old = mddev->suspend_lo;
4299 if (mddev->pers == NULL ||
4300 mddev->pers->quiesce == NULL)
4302 if (buf == e || (*e && *e != '\n'))
4305 mddev->suspend_lo = new;
4307 /* Shrinking suspended region */
4308 mddev->pers->quiesce(mddev, 2);
4310 /* Expanding suspended region - need to wait */
4311 mddev->pers->quiesce(mddev, 1);
4312 mddev->pers->quiesce(mddev, 0);
4316 static struct md_sysfs_entry md_suspend_lo =
4317 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4320 suspend_hi_show(struct mddev *mddev, char *page)
4322 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4326 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4329 unsigned long long new = simple_strtoull(buf, &e, 10);
4330 unsigned long long old = mddev->suspend_hi;
4332 if (mddev->pers == NULL ||
4333 mddev->pers->quiesce == NULL)
4335 if (buf == e || (*e && *e != '\n'))
4338 mddev->suspend_hi = new;
4340 /* Shrinking suspended region */
4341 mddev->pers->quiesce(mddev, 2);
4343 /* Expanding suspended region - need to wait */
4344 mddev->pers->quiesce(mddev, 1);
4345 mddev->pers->quiesce(mddev, 0);
4349 static struct md_sysfs_entry md_suspend_hi =
4350 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4353 reshape_position_show(struct mddev *mddev, char *page)
4355 if (mddev->reshape_position != MaxSector)
4356 return sprintf(page, "%llu\n",
4357 (unsigned long long)mddev->reshape_position);
4358 strcpy(page, "none\n");
4363 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4365 struct md_rdev *rdev;
4367 unsigned long long new = simple_strtoull(buf, &e, 10);
4370 if (buf == e || (*e && *e != '\n'))
4372 mddev->reshape_position = new;
4373 mddev->delta_disks = 0;
4374 mddev->reshape_backwards = 0;
4375 mddev->new_level = mddev->level;
4376 mddev->new_layout = mddev->layout;
4377 mddev->new_chunk_sectors = mddev->chunk_sectors;
4378 rdev_for_each(rdev, mddev)
4379 rdev->new_data_offset = rdev->data_offset;
4383 static struct md_sysfs_entry md_reshape_position =
4384 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4385 reshape_position_store);
4388 reshape_direction_show(struct mddev *mddev, char *page)
4390 return sprintf(page, "%s\n",
4391 mddev->reshape_backwards ? "backwards" : "forwards");
4395 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4398 if (cmd_match(buf, "forwards"))
4400 else if (cmd_match(buf, "backwards"))
4404 if (mddev->reshape_backwards == backwards)
4407 /* check if we are allowed to change */
4408 if (mddev->delta_disks)
4411 if (mddev->persistent &&
4412 mddev->major_version == 0)
4415 mddev->reshape_backwards = backwards;
4419 static struct md_sysfs_entry md_reshape_direction =
4420 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
4421 reshape_direction_store);
4424 array_size_show(struct mddev *mddev, char *page)
4426 if (mddev->external_size)
4427 return sprintf(page, "%llu\n",
4428 (unsigned long long)mddev->array_sectors/2);
4430 return sprintf(page, "default\n");
4434 array_size_store(struct mddev *mddev, const char *buf, size_t len)
4438 if (strncmp(buf, "default", 7) == 0) {
4440 sectors = mddev->pers->size(mddev, 0, 0);
4442 sectors = mddev->array_sectors;
4444 mddev->external_size = 0;
4446 if (strict_blocks_to_sectors(buf, &sectors) < 0)
4448 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
4451 mddev->external_size = 1;
4454 mddev->array_sectors = sectors;
4456 set_capacity(mddev->gendisk, mddev->array_sectors);
4457 revalidate_disk(mddev->gendisk);
4462 static struct md_sysfs_entry md_array_size =
4463 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
4466 static struct attribute *md_default_attrs[] = {
4469 &md_raid_disks.attr,
4470 &md_chunk_size.attr,
4472 &md_resync_start.attr,
4474 &md_new_device.attr,
4475 &md_safe_delay.attr,
4476 &md_array_state.attr,
4477 &md_reshape_position.attr,
4478 &md_reshape_direction.attr,
4479 &md_array_size.attr,
4480 &max_corr_read_errors.attr,
4484 static struct attribute *md_redundancy_attrs[] = {
4486 &md_last_scan_mode.attr,
4487 &md_mismatches.attr,
4490 &md_sync_speed.attr,
4491 &md_sync_force_parallel.attr,
4492 &md_sync_completed.attr,
4495 &md_suspend_lo.attr,
4496 &md_suspend_hi.attr,
4501 static struct attribute_group md_redundancy_group = {
4503 .attrs = md_redundancy_attrs,
4507 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4509 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4510 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
4515 spin_lock(&all_mddevs_lock);
4516 if (list_empty(&mddev->all_mddevs)) {
4517 spin_unlock(&all_mddevs_lock);
4521 spin_unlock(&all_mddevs_lock);
4523 rv = mddev_lock(mddev);
4525 rv = entry->show(mddev, page);
4526 mddev_unlock(mddev);
4533 md_attr_store(struct kobject *kobj, struct attribute *attr,
4534 const char *page, size_t length)
4536 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4537 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
4542 if (!capable(CAP_SYS_ADMIN))
4544 spin_lock(&all_mddevs_lock);
4545 if (list_empty(&mddev->all_mddevs)) {
4546 spin_unlock(&all_mddevs_lock);
4550 spin_unlock(&all_mddevs_lock);
4551 if (entry->store == new_dev_store)
4552 flush_workqueue(md_misc_wq);
4553 rv = mddev_lock(mddev);
4555 rv = entry->store(mddev, page, length);
4556 mddev_unlock(mddev);
4562 static void md_free(struct kobject *ko)
4564 struct mddev *mddev = container_of(ko, struct mddev, kobj);
4566 if (mddev->sysfs_state)
4567 sysfs_put(mddev->sysfs_state);
4569 if (mddev->gendisk) {
4570 del_gendisk(mddev->gendisk);
4571 put_disk(mddev->gendisk);
4574 blk_cleanup_queue(mddev->queue);
4579 static const struct sysfs_ops md_sysfs_ops = {
4580 .show = md_attr_show,
4581 .store = md_attr_store,
4583 static struct kobj_type md_ktype = {
4585 .sysfs_ops = &md_sysfs_ops,
4586 .default_attrs = md_default_attrs,
4591 static void mddev_delayed_delete(struct work_struct *ws)
4593 struct mddev *mddev = container_of(ws, struct mddev, del_work);
4595 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4596 kobject_del(&mddev->kobj);
4597 kobject_put(&mddev->kobj);
4600 static int md_alloc(dev_t dev, char *name)
4602 static DEFINE_MUTEX(disks_mutex);
4603 struct mddev *mddev = mddev_find(dev);
4604 struct gendisk *disk;
4613 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
4614 shift = partitioned ? MdpMinorShift : 0;
4615 unit = MINOR(mddev->unit) >> shift;
4617 /* wait for any previous instance of this device to be
4618 * completely removed (mddev_delayed_delete).
4620 flush_workqueue(md_misc_wq);
4622 mutex_lock(&disks_mutex);
4628 /* Need to ensure that 'name' is not a duplicate.
4630 struct mddev *mddev2;
4631 spin_lock(&all_mddevs_lock);
4633 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
4634 if (mddev2->gendisk &&
4635 strcmp(mddev2->gendisk->disk_name, name) == 0) {
4636 spin_unlock(&all_mddevs_lock);
4639 spin_unlock(&all_mddevs_lock);
4643 mddev->queue = blk_alloc_queue(GFP_KERNEL);
4646 mddev->queue->queuedata = mddev;
4648 blk_queue_make_request(mddev->queue, md_make_request);
4649 blk_set_stacking_limits(&mddev->queue->limits);
4651 disk = alloc_disk(1 << shift);
4653 blk_cleanup_queue(mddev->queue);
4654 mddev->queue = NULL;
4657 disk->major = MAJOR(mddev->unit);
4658 disk->first_minor = unit << shift;
4660 strcpy(disk->disk_name, name);
4661 else if (partitioned)
4662 sprintf(disk->disk_name, "md_d%d", unit);
4664 sprintf(disk->disk_name, "md%d", unit);
4665 disk->fops = &md_fops;
4666 disk->private_data = mddev;
4667 disk->queue = mddev->queue;
4668 blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
4669 /* Allow extended partitions. This makes the
4670 * 'mdp' device redundant, but we can't really get rid of it now.
4673 disk->flags |= GENHD_FL_EXT_DEVT;
4674 mddev->gendisk = disk;
4675 /* As soon as we call add_disk(), another thread could get
4676 * through to md_open, so make sure it doesn't get too far
4678 mutex_lock(&mddev->open_mutex);
4681 error = kobject_init_and_add(&mddev->kobj, &md_ktype,
4682 &disk_to_dev(disk)->kobj, "%s", "md");
4684 /* This isn't possible, but as kobject_init_and_add is marked
4685 * __must_check, we must do something with the result
4687 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
4691 if (mddev->kobj.sd &&
4692 sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4693 printk(KERN_DEBUG "pointless warning\n");
4694 mutex_unlock(&mddev->open_mutex);
4696 mutex_unlock(&disks_mutex);
4697 if (!error && mddev->kobj.sd) {
4698 kobject_uevent(&mddev->kobj, KOBJ_ADD);
4699 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
4705 static struct kobject *md_probe(dev_t dev, int *part, void *data)
4707 md_alloc(dev, NULL);
4711 static int add_named_array(const char *val, struct kernel_param *kp)
4713 /* val must be "md_*" where * is not all digits.
4714 * We allocate an array with a large free minor number, and
4715 * set the name to val. val must not already be an active name.
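 * Example (assumes the usual mainline wiring of this handler to the
 * "new_array" module parameter of md_mod):
 *     echo md_home > /sys/module/md_mod/parameters/new_array
 * picks a large free minor and, via md_alloc(), creates a gendisk
 * literally named "md_home" instead of the default mdNN name.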
4717 int len = strlen(val);
4718 char buf[DISK_NAME_LEN];
4720 while (len && val[len-1] == '\n')
4722 if (len >= DISK_NAME_LEN)
4724 strlcpy(buf, val, len+1);
4725 if (strncmp(buf, "md_", 3) != 0)
4727 return md_alloc(0, buf);
4730 static void md_safemode_timeout(unsigned long data)
4732 struct mddev *mddev = (struct mddev *) data;
4734 if (!atomic_read(&mddev->writes_pending)) {
4735 mddev->safemode = 1;
4736 if (mddev->external)
4737 sysfs_notify_dirent_safe(mddev->sysfs_state);
4739 md_wakeup_thread(mddev->thread);
4742 static int start_dirty_degraded;
4744 int md_run(struct mddev *mddev)
4747 struct md_rdev *rdev;
4748 struct md_personality *pers;
4750 if (list_empty(&mddev->disks))
4751 /* cannot run an array with no devices.. */
4756 /* Cannot run until previous stop completes properly */
4757 if (mddev->sysfs_active)
4761 * Analyze all RAID superblock(s)
4763 if (!mddev->raid_disks) {
4764 if (!mddev->persistent)
4769 if (mddev->level != LEVEL_NONE)
4770 request_module("md-level-%d", mddev->level);
4771 else if (mddev->clevel[0])
4772 request_module("md-%s", mddev->clevel);
4775 * Drop all container device buffers, from now on
4776 * the only valid external interface is through the md
4779 rdev_for_each(rdev, mddev) {
4780 if (test_bit(Faulty, &rdev->flags))
4782 sync_blockdev(rdev->bdev);
4783 invalidate_bdev(rdev->bdev);
4785 /* perform some consistency tests on the device.
4786 * We don't want the data to overlap the metadata;
4787 * internal bitmap issues have been handled elsewhere.
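 * (For example: 0.90/1.0 superblocks live at the end of the device,
 * so data_offset < sb_start and the data area must end before
 * sb_start; 1.1/1.2 superblocks live at or near the start, so the
 * data area must begin after sb_start + sb_size.)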
4789 if (rdev->meta_bdev) {
4790 /* Nothing to check */;
4791 } else if (rdev->data_offset < rdev->sb_start) {
4792 if (mddev->dev_sectors &&
4793 rdev->data_offset + mddev->dev_sectors
4795 printk("md: %s: data overlaps metadata\n",
4800 if (rdev->sb_start + rdev->sb_size/512
4801 > rdev->data_offset) {
4802 printk("md: %s: metadata overlaps data\n",
4807 sysfs_notify_dirent_safe(rdev->sysfs_state);
4810 if (mddev->bio_set == NULL)
4811 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
4813 spin_lock(&pers_lock);
4814 pers = find_pers(mddev->level, mddev->clevel);
4815 if (!pers || !try_module_get(pers->owner)) {
4816 spin_unlock(&pers_lock);
4817 if (mddev->level != LEVEL_NONE)
4818 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
4821 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
4826 spin_unlock(&pers_lock);
4827 if (mddev->level != pers->level) {
4828 mddev->level = pers->level;
4829 mddev->new_level = pers->level;
4831 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4833 if (mddev->reshape_position != MaxSector &&
4834 pers->start_reshape == NULL) {
4835 /* This personality cannot handle reshaping... */
4837 module_put(pers->owner);
4841 if (pers->sync_request) {
4842 /* Warn if this is a potentially silly configuration.
4845 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4846 struct md_rdev *rdev2;
4849 rdev_for_each(rdev, mddev)
4850 rdev_for_each(rdev2, mddev) {
4852 rdev->bdev->bd_contains ==
4853 rdev2->bdev->bd_contains) {
4855 "%s: WARNING: %s appears to be"
4856 " on the same physical disk as"
4859 bdevname(rdev->bdev,b),
4860 bdevname(rdev2->bdev,b2));
4867 "True protection against single-disk"
4868 " failure might be compromised.\n");
4871 mddev->recovery = 0;
4872 /* may be over-ridden by personality */
4873 mddev->resync_max_sectors = mddev->dev_sectors;
4875 mddev->ok_start_degraded = start_dirty_degraded;
4877 if (start_readonly && mddev->ro == 0)
4878 mddev->ro = 2; /* read-only, but switch on first write */
4880 err = mddev->pers->run(mddev);
4882 printk(KERN_ERR "md: pers->run() failed ...\n");
4883 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
4884 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
4885 " but 'external_size' not in effect?\n", __func__);
4887 "md: invalid array_size %llu > default size %llu\n",
4888 (unsigned long long)mddev->array_sectors / 2,
4889 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
4891 mddev->pers->stop(mddev);
4893 if (err == 0 && mddev->pers->sync_request &&
4894 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
4895 err = bitmap_create(mddev);
4897 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
4898 mdname(mddev), err);
4899 mddev->pers->stop(mddev);
4903 module_put(mddev->pers->owner);
4905 bitmap_destroy(mddev);
4908 if (mddev->pers->sync_request) {
4909 if (mddev->kobj.sd &&
4910 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4912 "md: cannot register extra attributes for %s\n",
4914 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
4915 } else if (mddev->ro == 2) /* auto-readonly not meaningful */
4918 atomic_set(&mddev->writes_pending,0);
4919 atomic_set(&mddev->max_corr_read_errors,
4920 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
4921 mddev->safemode = 0;
4922 mddev->safemode_timer.function = md_safemode_timeout;
4923 mddev->safemode_timer.data = (unsigned long) mddev;
4924 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
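/* e.g. with HZ=250 this is (200 * 250)/1000 + 1 = 51 jiffies,
 * i.e. the 200 ms default rounds up to about 204 ms. */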
4928 rdev_for_each(rdev, mddev)
4929 if (rdev->raid_disk >= 0)
4930 if (sysfs_link_rdev(mddev, rdev))
4931 /* failure here is OK */;
4933 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4935 if (mddev->flags & MD_UPDATE_SB_FLAGS)
4936 md_update_sb(mddev, 0);
4938 md_new_event(mddev);
4939 sysfs_notify_dirent_safe(mddev->sysfs_state);
4940 sysfs_notify_dirent_safe(mddev->sysfs_action);
4941 sysfs_notify(&mddev->kobj, NULL, "degraded");
4944 EXPORT_SYMBOL_GPL(md_run);
4946 static int do_md_run(struct mddev *mddev)
4950 err = md_run(mddev);
4953 err = bitmap_load(mddev);
4955 bitmap_destroy(mddev);
4959 md_wakeup_thread(mddev->thread);
4960 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4962 set_capacity(mddev->gendisk, mddev->array_sectors);
4963 revalidate_disk(mddev->gendisk);
4965 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4970 static int restart_array(struct mddev *mddev)
4972 struct gendisk *disk = mddev->gendisk;
4974 /* Complain if it has no devices */
4975 if (list_empty(&mddev->disks))
4981 mddev->safemode = 0;
4983 set_disk_ro(disk, 0);
4984 printk(KERN_INFO "md: %s switched to read-write mode.\n",
4986 /* Kick recovery or resync if necessary */
4987 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4988 md_wakeup_thread(mddev->thread);
4989 md_wakeup_thread(mddev->sync_thread);
4990 sysfs_notify_dirent_safe(mddev->sysfs_state);
4994 static void md_clean(struct mddev *mddev)
4996 mddev->array_sectors = 0;
4997 mddev->external_size = 0;
4998 mddev->dev_sectors = 0;
4999 mddev->raid_disks = 0;
5000 mddev->recovery_cp = 0;
5001 mddev->resync_min = 0;
5002 mddev->resync_max = MaxSector;
5003 mddev->reshape_position = MaxSector;
5004 mddev->external = 0;
5005 mddev->persistent = 0;
5006 mddev->level = LEVEL_NONE;
5007 mddev->clevel[0] = 0;
5010 mddev->metadata_type[0] = 0;
5011 mddev->chunk_sectors = 0;
5012 mddev->ctime = mddev->utime = 0;
5014 mddev->max_disks = 0;
5016 mddev->can_decrease_events = 0;
5017 mddev->delta_disks = 0;
5018 mddev->reshape_backwards = 0;
5019 mddev->new_level = LEVEL_NONE;
5020 mddev->new_layout = 0;
5021 mddev->new_chunk_sectors = 0;
5022 mddev->curr_resync = 0;
5023 atomic64_set(&mddev->resync_mismatches, 0);
5024 mddev->suspend_lo = mddev->suspend_hi = 0;
5025 mddev->sync_speed_min = mddev->sync_speed_max = 0;
5026 mddev->recovery = 0;
5029 mddev->degraded = 0;
5030 mddev->safemode = 0;
5031 mddev->merge_check_needed = 0;
5032 mddev->bitmap_info.offset = 0;
5033 mddev->bitmap_info.default_offset = 0;
5034 mddev->bitmap_info.default_space = 0;
5035 mddev->bitmap_info.chunksize = 0;
5036 mddev->bitmap_info.daemon_sleep = 0;
5037 mddev->bitmap_info.max_write_behind = 0;
5040 static void __md_stop_writes(struct mddev *mddev)
5042 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5043 if (mddev->sync_thread) {
5044 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5045 md_reap_sync_thread(mddev);
5048 del_timer_sync(&mddev->safemode_timer);
5050 bitmap_flush(mddev);
5051 md_super_wait(mddev);
5053 if (mddev->ro == 0 &&
5054 (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
5055 /* mark the array as cleanly shut down */
5057 md_update_sb(mddev, 1);
5061 void md_stop_writes(struct mddev *mddev)
5063 mddev_lock_nointr(mddev);
5064 __md_stop_writes(mddev);
5065 mddev_unlock(mddev);
5067 EXPORT_SYMBOL_GPL(md_stop_writes);
5069 static void __md_stop(struct mddev *mddev)
5072 mddev->pers->stop(mddev);
5073 if (mddev->pers->sync_request && mddev->to_remove == NULL)
5074 mddev->to_remove = &md_redundancy_group;
5075 module_put(mddev->pers->owner);
5077 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5080 void md_stop(struct mddev *mddev)
5082 /* stop the array and free any attached data structures.
5083 * This is called from dm-raid
5086 bitmap_destroy(mddev);
5088 bioset_free(mddev->bio_set);
5091 EXPORT_SYMBOL_GPL(md_stop);
5093 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5098 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5100 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5101 md_wakeup_thread(mddev->thread);
5103 if (mddev->sync_thread) {
5104 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5105 /* Thread might be blocked waiting for metadata update
5106 * which will now never happen */
5107 wake_up_process(mddev->sync_thread->tsk);
5109 mddev_unlock(mddev);
5110 wait_event(resync_wait, mddev->sync_thread == NULL);
5111 mddev_lock_nointr(mddev);
5113 mutex_lock(&mddev->open_mutex);
5114 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5115 mddev->sync_thread ||
5116 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
5117 printk("md: %s still in use.\n",mdname(mddev));
5119 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5120 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5121 md_wakeup_thread(mddev->thread);
5127 __md_stop_writes(mddev);
5133 set_disk_ro(mddev->gendisk, 1);
5134 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5135 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5136 md_wakeup_thread(mddev->thread);
5137 sysfs_notify_dirent_safe(mddev->sysfs_state);
5141 mutex_unlock(&mddev->open_mutex);
5146 * 0 - completely stop and disassemble the array
5147 * 2 - stop but do not disassemble the array
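 * (There is no mode for "switch to read-only" here; that case is
 * handled separately by md_set_readonly() above.)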
5149 static int do_md_stop(struct mddev *mddev, int mode,
5150 struct block_device *bdev)
5152 struct gendisk *disk = mddev->gendisk;
5153 struct md_rdev *rdev;
5156 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5158 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5159 md_wakeup_thread(mddev->thread);
5161 if (mddev->sync_thread) {
5162 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5163 /* Thread might be blocked waiting for metadata update
5164 * which will now never happen */
5165 wake_up_process(mddev->sync_thread->tsk);
5167 mddev_unlock(mddev);
5168 wait_event(resync_wait, mddev->sync_thread == NULL);
5169 mddev_lock_nointr(mddev);
5171 mutex_lock(&mddev->open_mutex);
5172 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5173 mddev->sysfs_active ||
5174 mddev->sync_thread ||
5175 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
5176 printk("md: %s still in use.\n",mdname(mddev));
5177 mutex_unlock(&mddev->open_mutex);
5179 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5180 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5181 md_wakeup_thread(mddev->thread);
5187 set_disk_ro(disk, 0);
5189 __md_stop_writes(mddev);
5191 mddev->queue->merge_bvec_fn = NULL;
5192 mddev->queue->backing_dev_info.congested_fn = NULL;
5194 /* tell userspace to handle 'inactive' */
5195 sysfs_notify_dirent_safe(mddev->sysfs_state);
5197 rdev_for_each(rdev, mddev)
5198 if (rdev->raid_disk >= 0)
5199 sysfs_unlink_rdev(mddev, rdev);
5201 set_capacity(disk, 0);
5202 mutex_unlock(&mddev->open_mutex);
5204 revalidate_disk(disk);
5209 mutex_unlock(&mddev->open_mutex);
5211 * Free resources if final stop
5214 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
5216 bitmap_destroy(mddev);
5217 if (mddev->bitmap_info.file) {
5218 fput(mddev->bitmap_info.file);
5219 mddev->bitmap_info.file = NULL;
5221 mddev->bitmap_info.offset = 0;
5223 export_array(mddev);
5226 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5227 if (mddev->hold_active == UNTIL_STOP)
5228 mddev->hold_active = 0;
5230 blk_integrity_unregister(disk);
5231 md_new_event(mddev);
5232 sysfs_notify_dirent_safe(mddev->sysfs_state);
5237 static void autorun_array(struct mddev *mddev)
5239 struct md_rdev *rdev;
5242 if (list_empty(&mddev->disks))
5245 printk(KERN_INFO "md: running: ");
5247 rdev_for_each(rdev, mddev) {
5248 char b[BDEVNAME_SIZE];
5249 printk("<%s>", bdevname(rdev->bdev,b));
5253 err = do_md_run(mddev);
5255 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
5256 do_md_stop(mddev, 0, NULL);
5261 * let's try to run arrays based on all disks that have arrived
5262 * until now. (those are in pending_raid_disks)
5264 * the method: pick the first pending disk, collect all disks with
5265 * the same UUID, remove all from the pending list and put them into
5266 * the 'same_array' list. Then order this list based on superblock
5267 * update time (freshest comes first), kick out 'old' disks and
5268 * compare superblocks. If everything's fine then run it.
5270 * If "unit" is allocated, then bump its reference count
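 * Example of the unit mapping done below: a 0.90 superblock with
 * preferred_minor 3 yields MKDEV(MD_MAJOR, 3), i.e. md3; with
 * partitionable arrays it is MKDEV(mdp_major, 3 << MdpMinorShift),
 * i.e. md_d3.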
5272 static void autorun_devices(int part)
5274 struct md_rdev *rdev0, *rdev, *tmp;
5275 struct mddev *mddev;
5276 char b[BDEVNAME_SIZE];
5278 printk(KERN_INFO "md: autorun ...\n");
5279 while (!list_empty(&pending_raid_disks)) {
5282 LIST_HEAD(candidates);
5283 rdev0 = list_entry(pending_raid_disks.next,
5284 struct md_rdev, same_set);
5286 printk(KERN_INFO "md: considering %s ...\n",
5287 bdevname(rdev0->bdev,b));
5288 INIT_LIST_HEAD(&candidates);
5289 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
5290 if (super_90_load(rdev, rdev0, 0) >= 0) {
5291 printk(KERN_INFO "md: adding %s ...\n",
5292 bdevname(rdev->bdev,b));
5293 list_move(&rdev->same_set, &candidates);
5296 * now we have a set of devices, with all of them having
5297 * mostly sane superblocks. It's time to allocate the mddev.
5301 dev = MKDEV(mdp_major,
5302 rdev0->preferred_minor << MdpMinorShift);
5303 unit = MINOR(dev) >> MdpMinorShift;
5305 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
5308 if (rdev0->preferred_minor != unit) {
5309 printk(KERN_INFO "md: unit number in %s is bad: %d\n",
5310 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
5314 md_probe(dev, NULL, NULL);
5315 mddev = mddev_find(dev);
5316 if (!mddev || !mddev->gendisk) {
5320 "md: cannot allocate memory for md drive.\n");
5323 if (mddev_lock(mddev))
5324 printk(KERN_WARNING "md: %s locked, cannot run\n",
5326 else if (mddev->raid_disks || mddev->major_version
5327 || !list_empty(&mddev->disks)) {
5329 "md: %s already running, cannot run %s\n",
5330 mdname(mddev), bdevname(rdev0->bdev,b));
5331 mddev_unlock(mddev);
5333 printk(KERN_INFO "md: created %s\n", mdname(mddev));
5334 mddev->persistent = 1;
5335 rdev_for_each_list(rdev, tmp, &candidates) {
5336 list_del_init(&rdev->same_set);
5337 if (bind_rdev_to_array(rdev, mddev))
5340 autorun_array(mddev);
5341 mddev_unlock(mddev);
5343 /* on success, candidates will be empty, on error it won't... */
5346 rdev_for_each_list(rdev, tmp, &candidates) {
5347 list_del_init(&rdev->same_set);
5352 printk(KERN_INFO "md: ... autorun DONE.\n");
5354 #endif /* !MODULE */
5356 static int get_version(void __user *arg)
5360 ver.major = MD_MAJOR_VERSION;
5361 ver.minor = MD_MINOR_VERSION;
5362 ver.patchlevel = MD_PATCHLEVEL_VERSION;
5364 if (copy_to_user(arg, &ver, sizeof(ver)))
5370 static int get_array_info(struct mddev *mddev, void __user *arg)
5372 mdu_array_info_t info;
5373 int nr,working,insync,failed,spare;
5374 struct md_rdev *rdev;
5376 nr = working = insync = failed = spare = 0;
5378 rdev_for_each_rcu(rdev, mddev) {
5380 if (test_bit(Faulty, &rdev->flags))
5384 if (test_bit(In_sync, &rdev->flags))
5392 info.major_version = mddev->major_version;
5393 info.minor_version = mddev->minor_version;
5394 info.patch_version = MD_PATCHLEVEL_VERSION;
5395 info.ctime = mddev->ctime;
5396 info.level = mddev->level;
5397 info.size = mddev->dev_sectors / 2;
5398 if (info.size != mddev->dev_sectors / 2) /* overflow */
5401 info.raid_disks = mddev->raid_disks;
5402 info.md_minor = mddev->md_minor;
5403 info.not_persistent= !mddev->persistent;
5405 info.utime = mddev->utime;
5408 info.state = (1<<MD_SB_CLEAN);
5409 if (mddev->bitmap && mddev->bitmap_info.offset)
5410 info.state |= (1<<MD_SB_BITMAP_PRESENT);
5411 info.active_disks = insync;
5412 info.working_disks = working;
5413 info.failed_disks = failed;
5414 info.spare_disks = spare;
5416 info.layout = mddev->layout;
5417 info.chunk_size = mddev->chunk_sectors << 9;
5419 if (copy_to_user(arg, &info, sizeof(info)))
5425 static int get_bitmap_file(struct mddev *mddev, void __user * arg)
5427 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
5428 char *ptr, *buf = NULL;
5431 file = kmalloc(sizeof(*file), GFP_NOIO);
5436 /* bitmap disabled, zero the first byte and copy out */
5437 if (!mddev->bitmap || !mddev->bitmap->storage.file) {
5438 file->pathname[0] = '\0';
5442 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
5446 ptr = d_path(&mddev->bitmap->storage.file->f_path,
5447 buf, sizeof(file->pathname));
5451 strcpy(file->pathname, ptr);
5455 if (copy_to_user(arg, file, sizeof(*file)))
5463 static int get_disk_info(struct mddev *mddev, void __user * arg)
5465 mdu_disk_info_t info;
5466 struct md_rdev *rdev;
5468 if (copy_from_user(&info, arg, sizeof(info)))
5472 rdev = find_rdev_nr_rcu(mddev, info.number);
5474 info.major = MAJOR(rdev->bdev->bd_dev);
5475 info.minor = MINOR(rdev->bdev->bd_dev);
5476 info.raid_disk = rdev->raid_disk;
5478 if (test_bit(Faulty, &rdev->flags))
5479 info.state |= (1<<MD_DISK_FAULTY);
5480 else if (test_bit(In_sync, &rdev->flags)) {
5481 info.state |= (1<<MD_DISK_ACTIVE);
5482 info.state |= (1<<MD_DISK_SYNC);
5484 if (test_bit(WriteMostly, &rdev->flags))
5485 info.state |= (1<<MD_DISK_WRITEMOSTLY);
5487 info.major = info.minor = 0;
5488 info.raid_disk = -1;
5489 info.state = (1<<MD_DISK_REMOVED);
5493 if (copy_to_user(arg, &info, sizeof(info)))
5499 static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
5501 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5502 struct md_rdev *rdev;
5503 dev_t dev = MKDEV(info->major,info->minor);
5505 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
5508 if (!mddev->raid_disks) {
5510 /* expecting a device which has a superblock */
5511 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
5514 "md: md_import_device returned %ld\n",
5516 return PTR_ERR(rdev);
5518 if (!list_empty(&mddev->disks)) {
5519 struct md_rdev *rdev0
5520 = list_entry(mddev->disks.next,
5521 struct md_rdev, same_set);
5522 err = super_types[mddev->major_version]
5523 .load_super(rdev, rdev0, mddev->minor_version);
5526 "md: %s has different UUID to %s\n",
5527 bdevname(rdev->bdev,b),
5528 bdevname(rdev0->bdev,b2));
5533 err = bind_rdev_to_array(rdev, mddev);
5540 * add_new_disk can be used once the array is assembled
5541 * to add "hot spares". They must already have a superblock written.
5546 if (!mddev->pers->hot_add_disk) {
5548 "%s: personality does not support diskops!\n",
5552 if (mddev->persistent)
5553 rdev = md_import_device(dev, mddev->major_version,
5554 mddev->minor_version);
5556 rdev = md_import_device(dev, -1, -1);
5559 "md: md_import_device returned %ld\n",
5561 return PTR_ERR(rdev);
5563 /* set saved_raid_disk if appropriate */
5564 if (!mddev->persistent) {
5565 if (info->state & (1<<MD_DISK_SYNC) &&
5566 info->raid_disk < mddev->raid_disks) {
5567 rdev->raid_disk = info->raid_disk;
5568 set_bit(In_sync, &rdev->flags);
5569 clear_bit(Bitmap_sync, &rdev->flags);
5571 rdev->raid_disk = -1;
5572 rdev->saved_raid_disk = rdev->raid_disk;
5574 super_types[mddev->major_version].
5575 validate_super(mddev, rdev);
5576 if ((info->state & (1<<MD_DISK_SYNC)) &&
5577 rdev->raid_disk != info->raid_disk) {
5578 /* This was a hot-add request, but the events don't
5579 * match, so reject it.
5585 clear_bit(In_sync, &rdev->flags); /* just to be sure */
5586 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5587 set_bit(WriteMostly, &rdev->flags);
5589 clear_bit(WriteMostly, &rdev->flags);
5591 rdev->raid_disk = -1;
5592 err = bind_rdev_to_array(rdev, mddev);
5593 if (!err && !mddev->pers->hot_remove_disk) {
5594 /* If there is hot_add_disk but no hot_remove_disk
5595 * then new disks are only ever added for geometry changes,
5596 * and should be added immediately.
5598 super_types[mddev->major_version].
5599 validate_super(mddev, rdev);
5600 err = mddev->pers->hot_add_disk(mddev, rdev);
5602 unbind_rdev_from_array(rdev);
5607 sysfs_notify_dirent_safe(rdev->sysfs_state);
5609 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5610 if (mddev->degraded)
5611 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5612 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5614 md_new_event(mddev);
5615 md_wakeup_thread(mddev->thread);
5619 /* otherwise, add_new_disk is only allowed
5620 * for major_version==0 superblocks
5622 if (mddev->major_version != 0) {
5623 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
5628 if (!(info->state & (1<<MD_DISK_FAULTY))) {
5630 rdev = md_import_device(dev, -1, 0);
5633 "md: error, md_import_device() returned %ld\n",
5635 return PTR_ERR(rdev);
5637 rdev->desc_nr = info->number;
5638 if (info->raid_disk < mddev->raid_disks)
5639 rdev->raid_disk = info->raid_disk;
5641 rdev->raid_disk = -1;
5643 if (rdev->raid_disk < mddev->raid_disks)
5644 if (info->state & (1<<MD_DISK_SYNC))
5645 set_bit(In_sync, &rdev->flags);
5647 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5648 set_bit(WriteMostly, &rdev->flags);
5650 if (!mddev->persistent) {
5651 printk(KERN_INFO "md: nonpersistent superblock ...\n");
5652 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5654 rdev->sb_start = calc_dev_sboffset(rdev);
5655 rdev->sectors = rdev->sb_start;
5657 err = bind_rdev_to_array(rdev, mddev);
5667 static int hot_remove_disk(struct mddev *mddev, dev_t dev)
5669 char b[BDEVNAME_SIZE];
5670 struct md_rdev *rdev;
5672 rdev = find_rdev(mddev, dev);
5676 clear_bit(Blocked, &rdev->flags);
5677 remove_and_add_spares(mddev, rdev);
5679 if (rdev->raid_disk >= 0)
5682 kick_rdev_from_array(rdev);
5683 md_update_sb(mddev, 1);
5684 md_new_event(mddev);
5688 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
5689 bdevname(rdev->bdev,b), mdname(mddev));
5693 static int hot_add_disk(struct mddev *mddev, dev_t dev)
5695 char b[BDEVNAME_SIZE];
5697 struct md_rdev *rdev;
5702 if (mddev->major_version != 0) {
5703 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
5704 " version-0 superblocks.\n",
5708 if (!mddev->pers->hot_add_disk) {
5710 "%s: personality does not support diskops!\n",
5715 rdev = md_import_device(dev, -1, 0);
5718 "md: error, md_import_device() returned %ld\n",
5723 if (mddev->persistent)
5724 rdev->sb_start = calc_dev_sboffset(rdev);
5726 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5728 rdev->sectors = rdev->sb_start;
5730 if (test_bit(Faulty, &rdev->flags)) {
5732 "md: can not hot-add faulty %s disk to %s!\n",
5733 bdevname(rdev->bdev,b), mdname(mddev));
5737 clear_bit(In_sync, &rdev->flags);
5739 rdev->saved_raid_disk = -1;
5740 err = bind_rdev_to_array(rdev, mddev);
5745 * The rest had better be atomic; we can have disk failures
5746 * noticed in interrupt contexts ...
5749 rdev->raid_disk = -1;
5751 md_update_sb(mddev, 1);
5754 * Kick recovery, maybe this spare has to be added to the
5755 * array immediately.
5757 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5758 md_wakeup_thread(mddev->thread);
5759 md_new_event(mddev);
5767 static int set_bitmap_file(struct mddev *mddev, int fd)
5772 if (!mddev->pers->quiesce || !mddev->thread)
5774 if (mddev->recovery || mddev->sync_thread)
5776 /* we should be able to change the bitmap.. */
5780 struct inode *inode;
5782 return -EEXIST; /* cannot add when bitmap is present */
5783 mddev->bitmap_info.file = fget(fd);
5785 if (mddev->bitmap_info.file == NULL) {
5786 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
5791 inode = mddev->bitmap_info.file->f_mapping->host;
5792 if (!S_ISREG(inode->i_mode)) {
5793 printk(KERN_ERR "%s: error: bitmap file must be a regular file\n",
5796 } else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) {
5797 printk(KERN_ERR "%s: error: bitmap file must be opened for write\n",
5800 } else if (atomic_read(&inode->i_writecount) != 1) {
5801 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
5806 fput(mddev->bitmap_info.file);
5807 mddev->bitmap_info.file = NULL;
5810 mddev->bitmap_info.offset = 0; /* file overrides offset */
5811 } else if (mddev->bitmap == NULL)
5812 return -ENOENT; /* cannot remove what isn't there */
5815 mddev->pers->quiesce(mddev, 1);
5817 err = bitmap_create(mddev);
5819 err = bitmap_load(mddev);
5821 if (fd < 0 || err) {
5822 bitmap_destroy(mddev);
5823 fd = -1; /* make sure to put the file */
5825 mddev->pers->quiesce(mddev, 0);
5828 if (mddev->bitmap_info.file)
5829 fput(mddev->bitmap_info.file);
5830 mddev->bitmap_info.file = NULL;
5837 * set_array_info is used two different ways
5838 * The original usage is when creating a new array.
5839 * In this usage, raid_disks is > 0 and it together with
5840 * level, size, not_persistent, layout, chunksize determine the
5841 * shape of the array.
5842 * This will always create an array with a type-0.90.0 superblock.
5843 * The newer usage is when assembling an array.
5844 * In this case raid_disks will be 0, and the major_version field is
5845 * used to determine which style super-blocks are to be found on the devices.
5846 * The minor and patch _version numbers are also kept in case the
5847 * super_block handler wishes to interpret them.
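 * Typical use (assumption, based on common mdadm behaviour): creation
 * passes raid_disks > 0 plus level/size/layout/chunk_size, while
 * assembly passes raid_disks == 0 with only the major/minor version
 * fields filled in, and is then followed by ADD_NEW_DISK and RUN_ARRAY.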
5849 static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
5852 if (info->raid_disks == 0) {
5853 /* just setting version number for superblock loading */
5854 if (info->major_version < 0 ||
5855 info->major_version >= ARRAY_SIZE(super_types) ||
5856 super_types[info->major_version].name == NULL) {
5857 /* maybe try to auto-load a module? */
5859 "md: superblock version %d not known\n",
5860 info->major_version);
5863 mddev->major_version = info->major_version;
5864 mddev->minor_version = info->minor_version;
5865 mddev->patch_version = info->patch_version;
5866 mddev->persistent = !info->not_persistent;
5867 /* ensure mddev_put doesn't delete this now that there
5868 * is some minimal configuration.
5870 mddev->ctime = get_seconds();
5873 mddev->major_version = MD_MAJOR_VERSION;
5874 mddev->minor_version = MD_MINOR_VERSION;
5875 mddev->patch_version = MD_PATCHLEVEL_VERSION;
5876 mddev->ctime = get_seconds();
5878 mddev->level = info->level;
5879 mddev->clevel[0] = 0;
5880 mddev->dev_sectors = 2 * (sector_t)info->size;
5881 mddev->raid_disks = info->raid_disks;
5882 /* don't set md_minor, it is determined by which /dev/md* was opened.
5885 if (info->state & (1<<MD_SB_CLEAN))
5886 mddev->recovery_cp = MaxSector;
5888 mddev->recovery_cp = 0;
5889 mddev->persistent = ! info->not_persistent;
5890 mddev->external = 0;
5892 mddev->layout = info->layout;
5893 mddev->chunk_sectors = info->chunk_size >> 9;
5895 mddev->max_disks = MD_SB_DISKS;
5897 if (mddev->persistent)
5899 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5901 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
5902 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
5903 mddev->bitmap_info.offset = 0;
5905 mddev->reshape_position = MaxSector;
5908 * Generate a 128 bit UUID
5910 get_random_bytes(mddev->uuid, 16);
5912 mddev->new_level = mddev->level;
5913 mddev->new_chunk_sectors = mddev->chunk_sectors;
5914 mddev->new_layout = mddev->layout;
5915 mddev->delta_disks = 0;
5916 mddev->reshape_backwards = 0;
5921 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
5923 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
5925 if (mddev->external_size)
5928 mddev->array_sectors = array_sectors;
5930 EXPORT_SYMBOL(md_set_array_sectors);
5932 static int update_size(struct mddev *mddev, sector_t num_sectors)
5934 struct md_rdev *rdev;
5936 int fit = (num_sectors == 0);
5938 if (mddev->pers->resize == NULL)
5940 /* The "num_sectors" is the number of sectors of each device that
5941 * is used. This can only make sense for arrays with redundancy.
5942 * linear and raid0 always use whatever space is available. We can only
5943 * consider changing this number if no resync or reconstruction is
5944 * happening, and if the new size is acceptable. It must fit before the
5945 * sb_start or, if that is <data_offset, it must fit before the size
5946 * of each device. If num_sectors is zero, we find the largest size that fits.
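 * (Example: with num_sectors == 0 and two members whose usable sizes
 * are 10,000,000 and 12,000,000 sectors, the loop below settles on
 * 10,000,000 sectors per device - the largest value every member can
 * hold.)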
5949 if (mddev->sync_thread)
5954 rdev_for_each(rdev, mddev) {
5955 sector_t avail = rdev->sectors;
5957 if (fit && (num_sectors == 0 || num_sectors > avail))
5958 num_sectors = avail;
5959 if (avail < num_sectors)
5962 rv = mddev->pers->resize(mddev, num_sectors);
5964 revalidate_disk(mddev->gendisk);
5968 static int update_raid_disks(struct mddev *mddev, int raid_disks)
5971 struct md_rdev *rdev;
5972 /* change the number of raid disks */
5973 if (mddev->pers->check_reshape == NULL)
5977 if (raid_disks <= 0 ||
5978 (mddev->max_disks && raid_disks >= mddev->max_disks))
5980 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5983 rdev_for_each(rdev, mddev) {
5984 if (mddev->raid_disks < raid_disks &&
5985 rdev->data_offset < rdev->new_data_offset)
5987 if (mddev->raid_disks > raid_disks &&
5988 rdev->data_offset > rdev->new_data_offset)
5992 mddev->delta_disks = raid_disks - mddev->raid_disks;
5993 if (mddev->delta_disks < 0)
5994 mddev->reshape_backwards = 1;
5995 else if (mddev->delta_disks > 0)
5996 mddev->reshape_backwards = 0;
5998 rv = mddev->pers->check_reshape(mddev);
6000 mddev->delta_disks = 0;
6001 mddev->reshape_backwards = 0;
6007 * update_array_info is used to change the configuration of an active array.
6009 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
6010 * fields in the info are checked against the array.
6011 * Any differences that cannot be handled will cause an error.
6012 * Normally, only one change can be managed at a time.
6014 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6020 /* calculate expected state, ignoring low bits */
6021 if (mddev->bitmap && mddev->bitmap_info.offset)
6022 state |= (1 << MD_SB_BITMAP_PRESENT);
6024 if (mddev->major_version != info->major_version ||
6025 mddev->minor_version != info->minor_version ||
6026 /* mddev->patch_version != info->patch_version || */
6027 mddev->ctime != info->ctime ||
6028 mddev->level != info->level ||
6029 /* mddev->layout != info->layout || */
6030 !mddev->persistent != info->not_persistent||
6031 mddev->chunk_sectors != info->chunk_size >> 9 ||
6032 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
6033 ((state^info->state) & 0xfffffe00)
6036 /* Check there is only one change */
6037 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6039 if (mddev->raid_disks != info->raid_disks)
6041 if (mddev->layout != info->layout)
6043 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
6050 if (mddev->layout != info->layout) {
6052 * we don't need to do anything at the md level, the
6053 * personality will take care of it all.
6055 if (mddev->pers->check_reshape == NULL)
6058 mddev->new_layout = info->layout;
6059 rv = mddev->pers->check_reshape(mddev);
6061 mddev->new_layout = mddev->layout;
6065 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6066 rv = update_size(mddev, (sector_t)info->size * 2);
6068 if (mddev->raid_disks != info->raid_disks)
6069 rv = update_raid_disks(mddev, info->raid_disks);
6071 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
6072 if (mddev->pers->quiesce == NULL || mddev->thread == NULL)
6074 if (mddev->recovery || mddev->sync_thread)
6076 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
6077 /* add the bitmap */
6080 if (mddev->bitmap_info.default_offset == 0)
6082 mddev->bitmap_info.offset =
6083 mddev->bitmap_info.default_offset;
6084 mddev->bitmap_info.space =
6085 mddev->bitmap_info.default_space;
6086 mddev->pers->quiesce(mddev, 1);
6087 rv = bitmap_create(mddev);
6089 rv = bitmap_load(mddev);
6091 bitmap_destroy(mddev);
6092 mddev->pers->quiesce(mddev, 0);
6094 /* remove the bitmap */
6097 if (mddev->bitmap->storage.file)
6099 mddev->pers->quiesce(mddev, 1);
6100 bitmap_destroy(mddev);
6101 mddev->pers->quiesce(mddev, 0);
6102 mddev->bitmap_info.offset = 0;
6105 md_update_sb(mddev, 1);
6109 static int set_disk_faulty(struct mddev *mddev, dev_t dev)
6111 struct md_rdev *rdev;
6114 if (mddev->pers == NULL)
6118 rdev = find_rdev_rcu(mddev, dev);
6122 md_error(mddev, rdev);
6123 if (!test_bit(Faulty, &rdev->flags))
6131 * We have a problem here: there is no easy way to give a CHS
6132 * virtual geometry. We currently pretend that we have 2 heads and
6133 * 4 sectors per track (with a BIG number of cylinders...). This drives
6134 * dosfs just mad... ;-)
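 * Worked example: a 1 TiB array is 2^31 512-byte sectors, so
 * md_getgeo() below reports 2 heads, 4 sectors/track and
 * 2^31 / 8 = 268435456 cylinders.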
6136 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
6138 struct mddev *mddev = bdev->bd_disk->private_data;
6142 geo->cylinders = mddev->array_sectors / 8;
6146 static inline bool md_ioctl_valid(unsigned int cmd)
6151 case GET_ARRAY_INFO:
6152 case GET_BITMAP_FILE:
6155 case HOT_REMOVE_DISK:
6158 case RESTART_ARRAY_RW:
6160 case SET_ARRAY_INFO:
6161 case SET_BITMAP_FILE:
6162 case SET_DISK_FAULTY:
6171 static int md_ioctl(struct block_device *bdev, fmode_t mode,
6172 unsigned int cmd, unsigned long arg)
6175 void __user *argp = (void __user *)arg;
6176 struct mddev *mddev = NULL;
6179 if (!md_ioctl_valid(cmd))
6184 case GET_ARRAY_INFO:
6188 if (!capable(CAP_SYS_ADMIN))
6193 * Commands dealing with the RAID driver but not any
6198 err = get_version(argp);
6204 autostart_arrays(arg);
6211 * Commands creating/starting a new array:
6214 mddev = bdev->bd_disk->private_data;
6221 /* Some actions do not requires the mutex */
6223 case GET_ARRAY_INFO:
6224 if (!mddev->raid_disks && !mddev->external)
6227 err = get_array_info(mddev, argp);
6231 if (!mddev->raid_disks && !mddev->external)
6234 err = get_disk_info(mddev, argp);
6237 case SET_DISK_FAULTY:
6238 err = set_disk_faulty(mddev, new_decode_dev(arg));
6242 if (cmd == ADD_NEW_DISK)
6243 /* need to ensure mddev_delayed_delete() has completed */
6244 flush_workqueue(md_misc_wq);
6246 if (cmd == HOT_REMOVE_DISK)
6247 /* need to ensure recovery thread has run */
6248 wait_event_interruptible_timeout(mddev->sb_wait,
6249 !test_bit(MD_RECOVERY_NEEDED,
6251 msecs_to_jiffies(5000));
6252 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
6253 /* Need to flush page cache, and ensure no-one else opens
6256 mutex_lock(&mddev->open_mutex);
6257 if (mddev->pers && atomic_read(&mddev->openers) > 1) {
6258 mutex_unlock(&mddev->open_mutex);
6262 set_bit(MD_STILL_CLOSED, &mddev->flags);
6263 mutex_unlock(&mddev->open_mutex);
6264 sync_blockdev(bdev);
6266 err = mddev_lock(mddev);
6269 "md: ioctl lock interrupted, reason %d, cmd %d\n",
6274 if (cmd == SET_ARRAY_INFO) {
6275 mdu_array_info_t info;
6277 memset(&info, 0, sizeof(info));
6278 else if (copy_from_user(&info, argp, sizeof(info))) {
6283 err = update_array_info(mddev, &info);
6285 printk(KERN_WARNING "md: couldn't update"
6286 " array info. %d\n", err);
6291 if (!list_empty(&mddev->disks)) {
6293 "md: array %s already has disks!\n",
6298 if (mddev->raid_disks) {
6300 "md: array %s already initialised!\n",
6305 err = set_array_info(mddev, &info);
6307 printk(KERN_WARNING "md: couldn't set"
6308 " array info. %d\n", err);
6315 * Commands querying/configuring an existing array:
6317 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
6318 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
6319 if ((!mddev->raid_disks && !mddev->external)
6320 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
6321 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
6322 && cmd != GET_BITMAP_FILE) {
6328 * Commands even a read-only array can execute:
6331 case GET_BITMAP_FILE:
6332 err = get_bitmap_file(mddev, argp);
6335 case RESTART_ARRAY_RW:
6336 err = restart_array(mddev);
6340 err = do_md_stop(mddev, 0, bdev);
6344 err = md_set_readonly(mddev, bdev);
6347 case HOT_REMOVE_DISK:
6348 err = hot_remove_disk(mddev, new_decode_dev(arg));
6352 /* We can support ADD_NEW_DISK on read-only arrays
6353 * only if we are re-adding a preexisting device.
6354 * So require mddev->pers and MD_DISK_SYNC.
6357 mdu_disk_info_t info;
6358 if (copy_from_user(&info, argp, sizeof(info)))
6360 else if (!(info.state & (1<<MD_DISK_SYNC)))
6361 /* Need to clear read-only for this */
6364 err = add_new_disk(mddev, &info);
6370 if (get_user(ro, (int __user *)(arg))) {
6376 /* if the bdev is going readonly the value of mddev->ro
6377 * does not matter, no writes are coming
6382 /* are we already prepared for writes? */
6386 /* transitioning to readauto need only happen for
6387 * arrays that call md_write_start
6390 err = restart_array(mddev);
6393 set_disk_ro(mddev->gendisk, 0);
6400 * The remaining ioctls are changing the state of the
6401 * superblock, so we do not allow them on read-only arrays.
6403 if (mddev->ro && mddev->pers) {
6404 if (mddev->ro == 2) {
6406 sysfs_notify_dirent_safe(mddev->sysfs_state);
6407 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6408 /* mddev_unlock will wake thread */
6409 /* If a device failed while we were read-only, we
6410 * need to make sure the metadata is updated now.
6412 if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
6413 mddev_unlock(mddev);
6414 wait_event(mddev->sb_wait,
6415 !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
6416 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6417 mddev_lock_nointr(mddev);
6428 mdu_disk_info_t info;
6429 if (copy_from_user(&info, argp, sizeof(info)))
6432 err = add_new_disk(mddev, &info);
6437 err = hot_add_disk(mddev, new_decode_dev(arg));
6441 err = do_md_run(mddev);
6444 case SET_BITMAP_FILE:
6445 err = set_bitmap_file(mddev, (int)arg);
6454 if (mddev->hold_active == UNTIL_IOCTL &&
6456 mddev->hold_active = 0;
6457 mddev_unlock(mddev);
6461 #ifdef CONFIG_COMPAT
6462 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
6463 unsigned int cmd, unsigned long arg)
6466 case HOT_REMOVE_DISK:
6468 case SET_DISK_FAULTY:
6469 case SET_BITMAP_FILE:
6470 /* These take an integer arg; do not convert */
6473 arg = (unsigned long)compat_ptr(arg);
6477 return md_ioctl(bdev, mode, cmd, arg);
6479 #endif /* CONFIG_COMPAT */
6481 static int md_open(struct block_device *bdev, fmode_t mode)
6484 * Succeed if we can lock the mddev, which confirms that
6485 * it isn't being stopped right now.
6487 struct mddev *mddev = mddev_find(bdev->bd_dev);
6493 if (mddev->gendisk != bdev->bd_disk) {
6494 /* we are racing with mddev_put which is discarding this mddev.
6498 /* Wait until bdev->bd_disk is definitely gone */
6499 flush_workqueue(md_misc_wq);
6500 /* Then retry the open from the top */
6501 return -ERESTARTSYS;
6503 BUG_ON(mddev != bdev->bd_disk->private_data);
6505 if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
6509 atomic_inc(&mddev->openers);
6510 clear_bit(MD_STILL_CLOSED, &mddev->flags);
6511 mutex_unlock(&mddev->open_mutex);
6513 check_disk_change(bdev);
6518 static void md_release(struct gendisk *disk, fmode_t mode)
6520 struct mddev *mddev = disk->private_data;
6523 atomic_dec(&mddev->openers);
6527 static int md_media_changed(struct gendisk *disk)
6529 struct mddev *mddev = disk->private_data;
6531 return mddev->changed;
6534 static int md_revalidate(struct gendisk *disk)
6536 struct mddev *mddev = disk->private_data;
6541 static const struct block_device_operations md_fops =
6543 .owner = THIS_MODULE,
6545 .release = md_release,
6547 #ifdef CONFIG_COMPAT
6548 .compat_ioctl = md_compat_ioctl,
6550 .getgeo = md_getgeo,
6551 .media_changed = md_media_changed,
6552 .revalidate_disk= md_revalidate,
6555 static int md_thread(void *arg)
6557 struct md_thread *thread = arg;
6560 * md_thread is a 'system-thread'; its priority should be very
6561 * high. We avoid resource deadlocks individually in each
6562 * raid personality. (RAID5 does preallocation) We also use RR and
6563 * the very same RT priority as kswapd, thus we will never get
6564 * into a priority inversion deadlock.
6566 * we definitely have to have equal or higher priority than
6567 * bdflush, otherwise bdflush will deadlock if there are too
6568 * many dirty RAID5 blocks.
6571 allow_signal(SIGKILL);
6572 while (!kthread_should_stop()) {
6574 /* We need to wait INTERRUPTIBLE so that
6575 * we don't add to the load-average.
6576 * That means we need to be sure no signals are pending.
6579 if (signal_pending(current))
6580 flush_signals(current);
6582 wait_event_interruptible_timeout
6584 test_bit(THREAD_WAKEUP, &thread->flags)
6585 || kthread_should_stop(),
6588 clear_bit(THREAD_WAKEUP, &thread->flags);
6589 if (!kthread_should_stop())
6590 thread->run(thread);
6596 void md_wakeup_thread(struct md_thread *thread)
6599 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
6600 set_bit(THREAD_WAKEUP, &thread->flags);
6601 wake_up(&thread->wqueue);
6604 EXPORT_SYMBOL(md_wakeup_thread);
6606 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
6607 struct mddev *mddev, const char *name)
6609 struct md_thread *thread;
6611 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
6615 init_waitqueue_head(&thread->wqueue);
6618 thread->mddev = mddev;
6619 thread->timeout = MAX_SCHEDULE_TIMEOUT;
6620 thread->tsk = kthread_run(md_thread, thread,
6622 mdname(thread->mddev),
6624 if (IS_ERR(thread->tsk)) {
6630 EXPORT_SYMBOL(md_register_thread);
6632 void md_unregister_thread(struct md_thread **threadp)
6634 struct md_thread *thread = *threadp;
6637 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
6638 /* Locking ensures that mddev_unlock does not wake_up a
6639 * non-existent thread
6641 spin_lock(&pers_lock);
6643 spin_unlock(&pers_lock);
6645 kthread_stop(thread->tsk);
6648 EXPORT_SYMBOL(md_unregister_thread);
6650 void md_error(struct mddev *mddev, struct md_rdev *rdev)
6652 if (!rdev || test_bit(Faulty, &rdev->flags))
6655 if (!mddev->pers || !mddev->pers->error_handler)
6657 mddev->pers->error_handler(mddev,rdev);
6658 if (mddev->degraded)
6659 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6660 sysfs_notify_dirent_safe(rdev->sysfs_state);
6661 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6662 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6663 md_wakeup_thread(mddev->thread);
6664 if (mddev->event_work.func)
6665 queue_work(md_misc_wq, &mddev->event_work);
6666 md_new_event_inintr(mddev);
6668 EXPORT_SYMBOL(md_error);
6670 /* seq_file implementation of /proc/mdstat */
6672 static void status_unused(struct seq_file *seq)
6675 struct md_rdev *rdev;
6677 seq_printf(seq, "unused devices: ");
6679 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
6680 char b[BDEVNAME_SIZE];
6682 seq_printf(seq, "%s ",
6683 bdevname(rdev->bdev,b));
6686 seq_printf(seq, "<none>");
6688 seq_printf(seq, "\n");
6691 static void status_resync(struct seq_file *seq, struct mddev *mddev)
6693 sector_t max_sectors, resync, res;
6694 unsigned long dt, db;
6697 unsigned int per_milli;
6699 if (mddev->curr_resync <= 3)
6702 resync = mddev->curr_resync
6703 - atomic_read(&mddev->recovery_active);
6705 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
6706 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6707 max_sectors = mddev->resync_max_sectors;
6709 max_sectors = mddev->dev_sectors;
6711 WARN_ON(max_sectors == 0);
6712 /* Pick 'scale' such that (resync>>scale)*1000 will fit
6713 * in a sector_t, and (max_sectors>>scale) will fit in a
6714 * u32, as those are the requirements for sector_div.
6715 * Thus 'scale' must be at least 10
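 * Example: on a 32-bit kernel with a 64-bit sector_t and a 1 PiB
 * array (2^41 sectors), max_sectors/2 = 2^40 <= 2^42, so scale stays
 * at 10; max_sectors>>10 = 2^31 fits in a u32 and the result lands in
 * the usual 0..1000 per-mille range.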
6718 if (sizeof(sector_t) > sizeof(unsigned long)) {
6719 while ( max_sectors/2 > (1ULL<<(scale+32)))
6722 res = (resync>>scale)*1000;
6723 sector_div(res, (u32)((max_sectors>>scale)+1));
6727 int i, x = per_milli/50, y = 20-x;
6728 seq_printf(seq, "[");
6729 for (i = 0; i < x; i++)
6730 seq_printf(seq, "=");
6731 seq_printf(seq, ">");
6732 for (i = 0; i < y; i++)
6733 seq_printf(seq, ".");
6734 seq_printf(seq, "] ");
6736 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
6737 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
6739 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
6741 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
6742 "resync" : "recovery"))),
6743 per_milli/10, per_milli % 10,
6744 (unsigned long long) resync/2,
6745 (unsigned long long) max_sectors/2);
6748 * dt: time from mark until now
6749 * db: blocks written from mark until now
6750 * rt: remaining time
6752 * rt is a sector_t, so could be 32bit or 64bit.
6753 * So we divide before multiply in case it is 32bit and close to the limit of the 32bit number.
6755 * We scale the divisor (db) by 32 to avoid losing precision
6756 * near the end of resync when the number of remaining sectors is close to the 'db' value.
6758 * We then divide rt by 32 after multiplying by db to compensate.
6759 * The '+1' avoids division by zero if db is very small.
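 * Rough example: db = 102400 sectors (50 MiB) written over dt = 10
 * seconds is about 5 MiB/s; with 2 GiB (4194304 sectors) left the
 * estimate works out to roughly 410 seconds, shown as finish=6.8min.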
6761 dt = ((jiffies - mddev->resync_mark) / HZ);
6763 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
6764 - mddev->resync_mark_cnt;
6766 rt = max_sectors - resync; /* number of remaining sectors */
6767 sector_div(rt, db/32+1);
6771 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
6772 ((unsigned long)rt % 60)/6);
6774 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
6777 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6779 struct list_head *tmp;
6781 struct mddev *mddev;
6789 spin_lock(&all_mddevs_lock);
6790 list_for_each(tmp,&all_mddevs)
6792 mddev = list_entry(tmp, struct mddev, all_mddevs);
6794 spin_unlock(&all_mddevs_lock);
6797 spin_unlock(&all_mddevs_lock);
6799 return (void*)2;/* tail */
6803 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6805 struct list_head *tmp;
6806 struct mddev *next_mddev, *mddev = v;
6812 spin_lock(&all_mddevs_lock);
6814 tmp = all_mddevs.next;
6816 tmp = mddev->all_mddevs.next;
6817 if (tmp != &all_mddevs)
6818 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
6820 next_mddev = (void*)2;
6823 spin_unlock(&all_mddevs_lock);
6831 static void md_seq_stop(struct seq_file *seq, void *v)
6833 struct mddev *mddev = v;
6835 if (mddev && v != (void*)1 && v != (void*)2)
6839 static int md_seq_show(struct seq_file *seq, void *v)
6841 struct mddev *mddev = v;
6843 struct md_rdev *rdev;
6845 if (v == (void*)1) {
6846 struct md_personality *pers;
6847 seq_printf(seq, "Personalities : ");
6848 spin_lock(&pers_lock);
6849 list_for_each_entry(pers, &pers_list, list)
6850 seq_printf(seq, "[%s] ", pers->name);
6852 spin_unlock(&pers_lock);
6853 seq_printf(seq, "\n");
6854 seq->poll_event = atomic_read(&md_event_count);
6857 if (v == (void*)2) {
6862 if (mddev_lock(mddev) < 0)
6865 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
6866 seq_printf(seq, "%s : %sactive", mdname(mddev),
6867 mddev->pers ? "" : "in");
6870 seq_printf(seq, " (read-only)");
6872 seq_printf(seq, " (auto-read-only)");
6873 seq_printf(seq, " %s", mddev->pers->name);
6877 rdev_for_each(rdev, mddev) {
6878 char b[BDEVNAME_SIZE];
6879 seq_printf(seq, " %s[%d]",
6880 bdevname(rdev->bdev,b), rdev->desc_nr);
6881 if (test_bit(WriteMostly, &rdev->flags))
6882 seq_printf(seq, "(W)");
6883 if (test_bit(Faulty, &rdev->flags)) {
6884 seq_printf(seq, "(F)");
6887 if (rdev->raid_disk < 0)
6888 seq_printf(seq, "(S)"); /* spare */
6889 if (test_bit(Replacement, &rdev->flags))
6890 seq_printf(seq, "(R)");
6891 sectors += rdev->sectors;
6894 if (!list_empty(&mddev->disks)) {
6896 seq_printf(seq, "\n %llu blocks",
6897 (unsigned long long)
6898 mddev->array_sectors / 2);
6900 seq_printf(seq, "\n %llu blocks",
6901 (unsigned long long)sectors / 2);
6903 if (mddev->persistent) {
6904 if (mddev->major_version != 0 ||
6905 mddev->minor_version != 90) {
6906 seq_printf(seq," super %d.%d",
6907 mddev->major_version,
6908 mddev->minor_version);
6910 } else if (mddev->external)
6911 seq_printf(seq, " super external:%s",
6912 mddev->metadata_type);
6914 seq_printf(seq, " super non-persistent");
6917 mddev->pers->status(seq, mddev);
6918 seq_printf(seq, "\n ");
6919 if (mddev->pers->sync_request) {
6920 if (mddev->curr_resync > 2) {
6921 status_resync(seq, mddev);
6922 seq_printf(seq, "\n ");
6923 } else if (mddev->curr_resync >= 1)
6924 seq_printf(seq, "\tresync=DELAYED\n ");
6925 else if (mddev->recovery_cp < MaxSector)
6926 seq_printf(seq, "\tresync=PENDING\n ");
6929 seq_printf(seq, "\n ");
6931 bitmap_status(seq, mddev->bitmap);
6933 seq_printf(seq, "\n");
6935 mddev_unlock(mddev);
6940 static const struct seq_operations md_seq_ops = {
6941 .start = md_seq_start,
6942 .next = md_seq_next,
6943 .stop = md_seq_stop,
6944 .show = md_seq_show,
6947 static int md_seq_open(struct inode *inode, struct file *file)
6949 struct seq_file *seq;
6952 error = seq_open(file, &md_seq_ops);
6956 seq = file->private_data;
6957 seq->poll_event = atomic_read(&md_event_count);
6961 static int md_unloading;
6962 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
6964 struct seq_file *seq = filp->private_data;
6968 return POLLIN|POLLRDNORM|POLLERR|POLLPRI;
6969 poll_wait(filp, &md_event_waiters, wait);
6971 /* always allow read */
6972 mask = POLLIN | POLLRDNORM;
6974 if (seq->poll_event != atomic_read(&md_event_count))
6975 mask |= POLLERR | POLLPRI;
6979 static const struct file_operations md_seq_fops = {
6980 .owner = THIS_MODULE,
6981 .open = md_seq_open,
6983 .llseek = seq_lseek,
6984 .release = seq_release_private,
6985 .poll = mdstat_poll,
6988 int register_md_personality(struct md_personality *p)
6990 printk(KERN_INFO "md: %s personality registered for level %d\n",
6992 spin_lock(&pers_lock);
6993 list_add_tail(&p->list, &pers_list);
6994 spin_unlock(&pers_lock);
6997 EXPORT_SYMBOL(register_md_personality);
6999 int unregister_md_personality(struct md_personality *p)
7001 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
7002 spin_lock(&pers_lock);
7003 list_del_init(&p->list);
7004 spin_unlock(&pers_lock);
7007 EXPORT_SYMBOL(unregister_md_personality);
7009 static int is_mddev_idle(struct mddev *mddev, int init)
7011 struct md_rdev *rdev;
7017 rdev_for_each_rcu(rdev, mddev) {
7018 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
7019 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
7020 (int)part_stat_read(&disk->part0, sectors[1]) -
7021 atomic_read(&disk->sync_io);
7022 /* sync IO will cause sync_io to increase before the disk_stats
7023 * as sync_io is counted when a request starts, and
7024 * disk_stats is counted when it completes.
7025 * So resync activity will cause curr_events to be smaller than
7026 * when there was no such activity.
7027 * non-sync IO will cause disk_stat to increase without
7028 * increasing sync_io so curr_events will (eventually)
7029 * be larger than it was before. Once it becomes
7030 * substantially larger, the test below will cause
7031 * the array to appear non-idle, and resync will slow down.
7033 * If there is a lot of outstanding resync activity when
7034 * we set last_event to curr_events, then all that activity
7035 * completing might cause the array to appear non-idle
7036 * and resync will be slowed down even though there might
7037 * not have been non-resync activity. This will only
7038 * happen once though. 'last_events' will soon reflect
7039 * the state where there is little or no outstanding
7040 * resync requests, and further resync activity will
7041 * always make curr_events less than last_events.
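 * In short, the test below treats more than 64 sectors (32 KiB) of
 * IO since the last sample that is not accounted for by sync_io as
 * "someone else is using the array"; md_do_sync() uses this verdict
 * to throttle resync back towards the minimum speed limit.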
7044 if (init || curr_events - rdev->last_events > 64) {
7045 rdev->last_events = curr_events;
7053 void md_done_sync(struct mddev *mddev, int blocks, int ok)
7055 /* another "blocks" (512byte) blocks have been synced */
7056 atomic_sub(blocks, &mddev->recovery_active);
7057 wake_up(&mddev->recovery_wait);
7059 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7060 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
7061 md_wakeup_thread(mddev->thread);
7062 // stop recovery, signal do_sync ....
7065 EXPORT_SYMBOL(md_done_sync);
7067 /* md_write_start(mddev, bi)
7068 * If we need to update some array metadata (e.g. 'active' flag
7069 * in superblock) before writing, schedule a superblock update
7070 * and wait for it to complete.
7072 void md_write_start(struct mddev *mddev, struct bio *bi)
7075 if (bio_data_dir(bi) != WRITE)
7078 BUG_ON(mddev->ro == 1);
7079 if (mddev->ro == 2) {
7080 /* need to switch to read/write */
7082 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7083 md_wakeup_thread(mddev->thread);
7084 md_wakeup_thread(mddev->sync_thread);
7087 atomic_inc(&mddev->writes_pending);
7088 if (mddev->safemode == 1)
7089 mddev->safemode = 0;
7090 if (mddev->in_sync) {
7091 spin_lock_irq(&mddev->write_lock);
7092 if (mddev->in_sync) {
7094 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7095 set_bit(MD_CHANGE_PENDING, &mddev->flags);
7096 md_wakeup_thread(mddev->thread);
7099 spin_unlock_irq(&mddev->write_lock);
7102 sysfs_notify_dirent_safe(mddev->sysfs_state);
7103 wait_event(mddev->sb_wait,
7104 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
7106 EXPORT_SYMBOL(md_write_start);
7108 void md_write_end(struct mddev *mddev)
7110 if (atomic_dec_and_test(&mddev->writes_pending)) {
7111 if (mddev->safemode == 2)
7112 md_wakeup_thread(mddev->thread);
7113 else if (mddev->safemode_delay)
7114 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
7117 EXPORT_SYMBOL(md_write_end);
7119 /* md_allow_write(mddev)
7120 * Calling this ensures that the array is marked 'active' so that writes
7121 * may proceed without blocking. It is important to call this before
7122 * attempting a GFP_KERNEL allocation while holding the mddev lock.
7123 * Must be called with mddev_lock held.
7125 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
7126 * is dropped, so return -EAGAIN after notifying userspace.
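 * Illustrative caller sketch (not copied from any particular
 * personality):
 *
 *	err = md_allow_write(mddev);
 *	if (err)
 *		return err;
 *	buf = kmalloc(len, GFP_KERNEL);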
7128 int md_allow_write(struct mddev *mddev)
7134 if (!mddev->pers->sync_request)
7137 spin_lock_irq(&mddev->write_lock);
7138 if (mddev->in_sync) {
7140 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7141 set_bit(MD_CHANGE_PENDING, &mddev->flags);
7142 if (mddev->safemode_delay &&
7143 mddev->safemode == 0)
7144 mddev->safemode = 1;
7145 spin_unlock_irq(&mddev->write_lock);
7146 md_update_sb(mddev, 0);
7147 sysfs_notify_dirent_safe(mddev->sysfs_state);
7149 spin_unlock_irq(&mddev->write_lock);
7151 if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
7156 EXPORT_SYMBOL_GPL(md_allow_write);
7158 #define SYNC_MARKS 10
7159 #define SYNC_MARK_STEP (3*HZ)
7160 #define UPDATE_FREQUENCY (5*60*HZ)
7161 void md_do_sync(struct md_thread *thread)
7163 struct mddev *mddev = thread->mddev;
7164 struct mddev *mddev2;
7165 unsigned int currspeed = 0,
7167 sector_t max_sectors,j, io_sectors, recovery_done;
7168 unsigned long mark[SYNC_MARKS];
7169 unsigned long update_time;
7170 sector_t mark_cnt[SYNC_MARKS];
7172 struct list_head *tmp;
7173 sector_t last_check;
7175 struct md_rdev *rdev;
7176 char *desc, *action = NULL;
7177 struct blk_plug plug;
7179 /* just in case the thread restarts... */
7180 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7182 if (mddev->ro) {/* never try to sync a read-only array */
7183 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7187 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7188 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
7189 desc = "data-check";
7191 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7192 desc = "requested-resync";
7196 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7201 mddev->last_sync_action = action ?: desc;
7203 /* we overload curr_resync somewhat here.
7204 * 0 == not engaged in resync at all
7205 * 2 == checking that there is no conflict with another sync
7206 * 1 == like 2, but have yielded to allow the conflicting resync to finish
7208 * other == active in resync - this many blocks
7210 * Before starting a resync we must have set curr_resync to
7211 * 2, and then checked that every "conflicting" array has curr_resync
7212 * less than ours. When we find one that is the same or higher
7213 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
7214 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
7215 * This will mean we have to start checking from the beginning again.
7220 mddev->curr_resync = 2;
7223 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7225 for_each_mddev(mddev2, tmp) {
7226 if (mddev2 == mddev)
7228 if (!mddev->parallel_resync
7229 && mddev2->curr_resync
7230 && match_mddev_units(mddev, mddev2)) {
7232 if (mddev < mddev2 && mddev->curr_resync == 2) {
7233 /* arbitrarily yield */
7234 mddev->curr_resync = 1;
7235 wake_up(&resync_wait);
7237 if (mddev > mddev2 && mddev->curr_resync == 1)
7238 /* no need to wait here, we can wait the next
7239 * time 'round when curr_resync == 2
7242 /* We need to wait 'interruptible' so as not to
7243 * contribute to the load average, and not to
7244 * be caught by 'softlockup'
7246 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
7247 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7248 mddev2->curr_resync >= mddev->curr_resync) {
7249 printk(KERN_INFO "md: delaying %s of %s"
7250 " until %s has finished (they"
7251 " share one or more physical units)\n",
7252 desc, mdname(mddev), mdname(mddev2));
7254 if (signal_pending(current))
7255 flush_signals(current);
7257 finish_wait(&resync_wait, &wq);
7260 finish_wait(&resync_wait, &wq);
7263 } while (mddev->curr_resync < 2);
7266 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7267 /* resync follows the size requested by the personality,
7268 * which defaults to physical size, but can be virtual size
7270 max_sectors = mddev->resync_max_sectors;
7271 atomic64_set(&mddev->resync_mismatches, 0);
7272 /* we don't use the checkpoint if there's a bitmap */
7273 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7274 j = mddev->resync_min;
7275 else if (!mddev->bitmap)
7276 j = mddev->recovery_cp;
7278 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7279 max_sectors = mddev->resync_max_sectors;
7281 /* recovery follows the physical size of devices */
7282 max_sectors = mddev->dev_sectors;
7285 rdev_for_each_rcu(rdev, mddev)
7286 if (rdev->raid_disk >= 0 &&
7287 !test_bit(Faulty, &rdev->flags) &&
7288 !test_bit(In_sync, &rdev->flags) &&
7289 rdev->recovery_offset < j)
7290 j = rdev->recovery_offset;
7293 /* If there is a bitmap, we need to make sure all
7294 * writes that started before we added a spare
7295 * complete before we start doing a recovery.
7296 * Otherwise the write might complete and (via
7297 * bitmap_endwrite) set a bit in the bitmap after the
7298 * recovery has checked that bit and skipped that region.
7301 if (mddev->bitmap) {
7302 mddev->pers->quiesce(mddev, 1);
7303 mddev->pers->quiesce(mddev, 0);
7307 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
7308 printk(KERN_INFO "md: minimum _guaranteed_ speed:"
7309 " %d KB/sec/disk.\n", speed_min(mddev));
7310 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
7311 "(but not more than %d KB/sec) for %s.\n",
7312 speed_max(mddev), desc);
7314 is_mddev_idle(mddev, 1); /* this initializes IO event counters */
7317 for (m = 0; m < SYNC_MARKS; m++) {
7319 mark_cnt[m] = io_sectors;
7322 mddev->resync_mark = mark[last_mark];
7323 mddev->resync_mark_cnt = mark_cnt[last_mark];
7326 * Tune reconstruction:
7328 window = 32*(PAGE_SIZE/512);
7329 printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n",
7330 window/2, (unsigned long long)max_sectors/2);
7332 atomic_set(&mddev->recovery_active, 0);
7337 "md: resuming %s of %s from checkpoint.\n",
7338 desc, mdname(mddev));
7339 mddev->curr_resync = j;
7341 mddev->curr_resync = 3; /* no longer delayed */
7342 mddev->curr_resync_completed = j;
7343 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7344 md_new_event(mddev);
7345 update_time = jiffies;
7347 blk_start_plug(&plug);
7348 while (j < max_sectors) {
7353 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7354 ((mddev->curr_resync > mddev->curr_resync_completed &&
7355 (mddev->curr_resync - mddev->curr_resync_completed)
7356 > (max_sectors >> 4)) ||
7357 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
7358 (j - mddev->curr_resync_completed)*2
7359 >= mddev->resync_max - mddev->curr_resync_completed
7361 /* time to update curr_resync_completed */
7362 wait_event(mddev->recovery_wait,
7363 atomic_read(&mddev->recovery_active) == 0);
7364 mddev->curr_resync_completed = j;
7365 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
7366 j > mddev->recovery_cp)
7367 mddev->recovery_cp = j;
7368 update_time = jiffies;
7369 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7370 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7373 while (j >= mddev->resync_max &&
7374 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7375 /* As this condition is controlled by user-space,
7376 * we can block indefinitely, so use '_interruptible'
7377 * to avoid triggering warnings.
7379 flush_signals(current); /* just in case */
7380 wait_event_interruptible(mddev->recovery_wait,
7381 mddev->resync_max > j
7382 || test_bit(MD_RECOVERY_INTR,
7386 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7389 sectors = mddev->pers->sync_request(mddev, j, &skipped,
7390 currspeed < speed_min(mddev));
7392 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7396 if (!skipped) { /* actual IO requested */
7397 io_sectors += sectors;
7398 atomic_add(sectors, &mddev->recovery_active);
7401 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7406 mddev->curr_resync = j;
7407 mddev->curr_mark_cnt = io_sectors;
7408 if (last_check == 0)
7409 /* this is the earliest that rebuild will be
7410 * visible in /proc/mdstat
7412 md_new_event(mddev);
7414 if (last_check + window > io_sectors || j == max_sectors)
7417 last_check = io_sectors;
7419 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
7421 int next = (last_mark+1) % SYNC_MARKS;
7423 mddev->resync_mark = mark[next];
7424 mddev->resync_mark_cnt = mark_cnt[next];
7425 mark[next] = jiffies;
7426 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
7430 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7434 * this loop exits only when we are slower than
7435 * the 'hard' speed limit, or the system was IO-idle for
7436 * a while.
7437 * The system might be non-idle CPU-wise, but we only care
7438 * about not overloading the IO subsystem. (things like an
7439 * e2fsck being done on the RAID array should execute fast)
7443 recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
7444 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
7445 /((jiffies-mddev->resync_mark)/HZ +1) +1;
7447 if (currspeed > speed_min(mddev)) {
7448 if ((currspeed > speed_max(mddev)) ||
7449 !is_mddev_idle(mddev, 0)) {
7455 printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
7456 test_bit(MD_RECOVERY_INTR, &mddev->recovery)
7457 ? "interrupted" : "done");
7459 * this also signals 'finished resyncing' to md_stop
7461 blk_finish_plug(&plug);
7462 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
7464 /* tell personality that we are finished */
7465 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
7467 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
7468 mddev->curr_resync > 2) {
7469 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7470 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7471 if (mddev->curr_resync >= mddev->recovery_cp) {
7473 "md: checkpointing %s of %s.\n",
7474 desc, mdname(mddev));
7475 if (test_bit(MD_RECOVERY_ERROR,
7477 mddev->recovery_cp =
7478 mddev->curr_resync_completed;
7480 mddev->recovery_cp =
7484 mddev->recovery_cp = MaxSector;
7486 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7487 mddev->curr_resync = MaxSector;
7489 rdev_for_each_rcu(rdev, mddev)
7490 if (rdev->raid_disk >= 0 &&
7491 mddev->delta_disks >= 0 &&
7492 !test_bit(Faulty, &rdev->flags) &&
7493 !test_bit(In_sync, &rdev->flags) &&
7494 rdev->recovery_offset < mddev->curr_resync)
7495 rdev->recovery_offset = mddev->curr_resync;
7500 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7502 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7503 /* We completed so min/max setting can be forgotten if used. */
7504 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7505 mddev->resync_min = 0;
7506 mddev->resync_max = MaxSector;
7507 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7508 mddev->resync_min = mddev->curr_resync_completed;
7509 mddev->curr_resync = 0;
7510 wake_up(&resync_wait);
7511 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
7512 md_wakeup_thread(mddev->thread);
7515 EXPORT_SYMBOL_GPL(md_do_sync);
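/*
 * Illustrative sketch, not part of md.c: the rolling throughput estimate
 * computed near the end of the resync loop above.  'done_sectors' stands
 * for io_sectors minus recovery_active, and 'mark_sectors'/'mark_jiffies'
 * stand for mddev->resync_mark_cnt/resync_mark, i.e. the SYNC_MARKS ring
 * entry from roughly (SYNC_MARKS - 1) * SYNC_MARK_STEP ago.  The helper
 * name is hypothetical.
 */
static inline unsigned long example_resync_kb_per_sec(sector_t done_sectors,
                                                      sector_t mark_sectors,
                                                      unsigned long mark_jiffies)
{
        /* Sectors are 512 bytes, so dividing by 2 gives KiB; the "+1"
         * terms avoid a division by zero and a reported speed of 0.
         */
        return ((unsigned long)(done_sectors - mark_sectors)) / 2 /
               ((jiffies - mark_jiffies) / HZ + 1) + 1;
}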
7517 static int remove_and_add_spares(struct mddev *mddev,
7518 struct md_rdev *this)
7520 struct md_rdev *rdev;
7524 rdev_for_each(rdev, mddev)
7525 if ((this == NULL || rdev == this) &&
7526 rdev->raid_disk >= 0 &&
7527 !test_bit(Blocked, &rdev->flags) &&
7528 (test_bit(Faulty, &rdev->flags) ||
7529 ! test_bit(In_sync, &rdev->flags)) &&
7530 atomic_read(&rdev->nr_pending)==0) {
7531 if (mddev->pers->hot_remove_disk(
7532 mddev, rdev) == 0) {
7533 sysfs_unlink_rdev(mddev, rdev);
7534 rdev->raid_disk = -1;
7538 if (removed && mddev->kobj.sd)
7539 sysfs_notify(&mddev->kobj, NULL, "degraded");
7544 rdev_for_each(rdev, mddev) {
7545 if (rdev->raid_disk >= 0 &&
7546 !test_bit(In_sync, &rdev->flags) &&
7547 !test_bit(Faulty, &rdev->flags))
7549 if (rdev->raid_disk >= 0)
7551 if (test_bit(Faulty, &rdev->flags))
7554 ! (rdev->saved_raid_disk >= 0 &&
7555 !test_bit(Bitmap_sync, &rdev->flags)))
7558 if (rdev->saved_raid_disk < 0)
7559 rdev->recovery_offset = 0;
7561 hot_add_disk(mddev, rdev) == 0) {
7562 if (sysfs_link_rdev(mddev, rdev))
7563 /* failure here is OK */;
7565 md_new_event(mddev);
7566 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7571 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7575 static void md_start_sync(struct work_struct *ws)
7577 struct mddev *mddev = container_of(ws, struct mddev, del_work);
7579 mddev->sync_thread = md_register_thread(md_do_sync,
7582 if (!mddev->sync_thread) {
7583 printk(KERN_ERR "%s: could not start resync"
7586 /* leave the spares where they are, it shouldn't hurt */
7587 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7588 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7589 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7590 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7591 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7592 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
7594 if (mddev->sysfs_action)
7595 sysfs_notify_dirent_safe(mddev->sysfs_action);
7597 md_wakeup_thread(mddev->sync_thread);
7598 sysfs_notify_dirent_safe(mddev->sysfs_action);
7599 md_new_event(mddev);
7603 * This routine is regularly called by all per-raid-array threads to
7604 * deal with generic issues like resync and super-block update.
7605 * Raid personalities that don't have a thread (linear/raid0) do not
7606 * need this as they never do any recovery or update the superblock.
7608 * It does not do any resync itself, but rather "forks" off other threads
7609 * to do that as needed.
7610 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
7611 * "->recovery" and create a thread at ->sync_thread.
7612 * When the thread finishes it sets MD_RECOVERY_DONE
7613 * and wakes up this thread, which will reap the sync thread and finish up.
7614 * This thread also removes any faulty devices (with nr_pending == 0).
7616 * The overall approach is:
7617 * 1/ if the superblock needs updating, update it.
7618 * 2/ If a recovery thread is running, don't do anything else.
7619 * 3/ If recovery has finished, clean up, possibly marking spares active.
7620 * 4/ If there are any faulty devices, remove them.
7621 * 5/ If array is degraded, try to add spare devices
7622 * 6/ If array has spares or is not in-sync, start a resync thread.
7624 void md_check_recovery(struct mddev *mddev)
7626 if (mddev->suspended)
7630 bitmap_daemon_work(mddev);
7632 if (signal_pending(current)) {
7633 if (mddev->pers->sync_request && !mddev->external) {
7634 printk(KERN_INFO "md: %s in immediate safe mode\n",
7636 mddev->safemode = 2;
7638 flush_signals(current);
7641 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
7644 (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
7645 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7646 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
7647 (mddev->external == 0 && mddev->safemode == 1) ||
7648 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
7649 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
7653 if (mddev_trylock(mddev)) {
7657 /* On a read-only array we can:
7658 * - remove failed devices
7659 * - add already-in_sync devices if the array itself is in-sync.
7661 * As we only add devices that are already in-sync,
7662 * we can activate the spares immediately.
7664 remove_and_add_spares(mddev, NULL);
7665 /* There is no thread, but we need to call
7666 * ->spare_active and clear saved_raid_disk
7668 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7669 md_reap_sync_thread(mddev);
7670 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7674 if (!mddev->external) {
7676 spin_lock_irq(&mddev->write_lock);
7677 if (mddev->safemode &&
7678 !atomic_read(&mddev->writes_pending) &&
7680 mddev->recovery_cp == MaxSector) {
7683 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7685 if (mddev->safemode == 1)
7686 mddev->safemode = 0;
7687 spin_unlock_irq(&mddev->write_lock);
7689 sysfs_notify_dirent_safe(mddev->sysfs_state);
7692 if (mddev->flags & MD_UPDATE_SB_FLAGS)
7693 md_update_sb(mddev, 0);
7695 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
7696 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
7697 /* resync/recovery still happening */
7698 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7701 if (mddev->sync_thread) {
7702 md_reap_sync_thread(mddev);
7705 /* Set RUNNING before clearing NEEDED to avoid
7706 * any transients in the value of "sync_action".
7708 mddev->curr_resync_completed = 0;
7709 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7710 /* Clear some bits that don't mean anything, but might be left set. */
7713 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
7714 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7716 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7717 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
7719 /* no recovery is running.
7720 * remove any failed drives, then
7721 * add spares if possible.
7722 * Spares are also removed and re-added, to allow
7723 * the personality to fail the re-add.
7726 if (mddev->reshape_position != MaxSector) {
7727 if (mddev->pers->check_reshape == NULL ||
7728 mddev->pers->check_reshape(mddev) != 0)
7729 /* Cannot proceed */
7731 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7732 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7733 } else if ((spares = remove_and_add_spares(mddev, NULL))) {
7734 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7735 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7736 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7737 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7738 } else if (mddev->recovery_cp < MaxSector) {
7739 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7740 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7741 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
7742 /* nothing to be done ... */
7745 if (mddev->pers->sync_request) {
7747 /* We are adding a device or devices to an array
7748 * which has the bitmap stored on all devices.
7749 * So make sure all bitmap pages get written
7751 bitmap_write_all(mddev->bitmap);
7753 INIT_WORK(&mddev->del_work, md_start_sync);
7754 queue_work(md_misc_wq, &mddev->del_work);
7758 if (!mddev->sync_thread) {
7759 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7760 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
7762 if (mddev->sysfs_action)
7763 sysfs_notify_dirent_safe(mddev->sysfs_action);
7766 wake_up(&mddev->sb_wait);
7767 mddev_unlock(mddev);
7770 EXPORT_SYMBOL(md_check_recovery);
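/*
 * Illustrative sketch, not part of md.c: the general shape of a
 * personality's per-array thread, which is expected to call
 * md_check_recovery() on every wake-up as the comment above describes.
 * The thread function and the queue-draining step are hypothetical
 * placeholders for what raid1d()/raid5d() and friends actually do.
 */
static void example_personality_thread(struct md_thread *thread)
{
        struct mddev *mddev = thread->mddev;

        /* Handles superblock updates, spare add/remove and the
         * starting/reaping of mddev->sync_thread.
         */
        md_check_recovery(mddev);

        /* ... drain the personality's internal request/retry queues ... */
}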
7772 void md_reap_sync_thread(struct mddev *mddev)
7774 struct md_rdev *rdev;
7776 /* resync has finished, collect result */
7777 md_unregister_thread(&mddev->sync_thread);
7778 wake_up(&resync_wait);
7779 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7780 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7782 /* activate any spares */
7783 if (mddev->pers->spare_active(mddev)) {
7784 sysfs_notify(&mddev->kobj, NULL,
7786 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7789 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7790 mddev->pers->finish_reshape)
7791 mddev->pers->finish_reshape(mddev);
7793 /* If array is no longer degraded, then any saved_raid_disk
7794 * information must be scrapped.
7796 if (!mddev->degraded)
7797 rdev_for_each(rdev, mddev)
7798 rdev->saved_raid_disk = -1;
7800 md_update_sb(mddev, 1);
7801 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7802 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7803 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7804 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7805 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7806 /* flag recovery needed just to double check */
7807 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7808 sysfs_notify_dirent_safe(mddev->sysfs_action);
7809 md_new_event(mddev);
7810 if (mddev->event_work.func)
7811 queue_work(md_misc_wq, &mddev->event_work);
7813 EXPORT_SYMBOL(md_reap_sync_thread);
7815 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
7817 sysfs_notify_dirent_safe(rdev->sysfs_state);
7818 wait_event_timeout(rdev->blocked_wait,
7819 !test_bit(Blocked, &rdev->flags) &&
7820 !test_bit(BlockedBadBlocks, &rdev->flags),
7821 msecs_to_jiffies(5000));
7822 rdev_dec_pending(rdev, mddev);
7824 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
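/*
 * Illustrative sketch, not part of md.c: how a personality's write path is
 * expected to use md_wait_for_blocked_rdev().  The caller must already
 * hold an nr_pending reference on the rdev; md_wait_for_blocked_rdev()
 * drops it (see the rdev_dec_pending() call above).  The helper name is
 * hypothetical.
 */
static void example_wait_if_blocked(struct mddev *mddev, struct md_rdev *rdev)
{
        if (test_bit(Blocked, &rdev->flags)) {
                atomic_inc(&rdev->nr_pending);  /* dropped by the callee */
                md_wait_for_blocked_rdev(rdev, mddev);
        }
}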
7826 void md_finish_reshape(struct mddev *mddev)
7828 /* called by the personality module when a reshape completes. */
7829 struct md_rdev *rdev;
7831 rdev_for_each(rdev, mddev) {
7832 if (rdev->data_offset > rdev->new_data_offset)
7833 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
7835 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
7836 rdev->data_offset = rdev->new_data_offset;
7839 EXPORT_SYMBOL(md_finish_reshape);
7841 /* Bad block management.
7842 * We can record which blocks on each device are 'bad' and so just
7843 * fail those blocks, or that stripe, rather than the whole device.
7844 * Entries in the bad-block table are 64bits wide. This comprises:
7845 * Length of bad-range, in sectors: 0-511 for lengths 1-512
7846 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
7847 * A 'shift' can be set so that larger blocks are tracked and
7848 * consequently larger devices can be covered.
7849 * 'Acknowledged' flag - 1 bit. - the most significant bit.
7851 * Locking of the bad-block table uses a seqlock so md_is_badblock
7852 * might need to retry if it is very unlucky.
7853 * We will sometimes want to check for bad blocks in a bi_end_io function,
7854 * so we use the write_seqlock_irq variant.
7856 * When looking for a bad block we specify a range and want to
7857 * know if any block in the range is bad. So we binary-search
7858 * to the last range that starts at-or-before the given endpoint,
7859 * (or "before the sector after the target range")
7860 * then see if it ends after the given start. The return value is:
7862 * 0 if there are no known bad blocks in the range
7863 * 1 if there are known bad blocks which are all acknowledged
7864 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
7865 * plus the start/length of the first bad section we overlap.
7867 int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
7868 sector_t *first_bad, int *bad_sectors)
7874 sector_t target = s + sectors;
7877 if (bb->shift > 0) {
7878 /* round the start down, and the end up */
7880 target += (1<<bb->shift) - 1;
7881 target >>= bb->shift;
7882 sectors = target - s;
7884 /* 'target' is now the first block after the bad range */
7887 seq = read_seqbegin(&bb->lock);
7892 /* Binary search between lo and hi for 'target'
7893 * i.e. for the last range that starts before 'target'
7895 /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
7896 * are known not to be the last range before target.
7897 * VARIANT: hi-lo is the number of possible
7898 * ranges, and decreases until it reaches 1
7900 while (hi - lo > 1) {
7901 int mid = (lo + hi) / 2;
7902 sector_t a = BB_OFFSET(p[mid]);
7904 /* This could still be the one, earlier ranges
7908 /* This and later ranges are definitely out. */
7911 /* 'lo' might be the last that started before target, but 'hi' isn't */
7913 /* need to check all ranges that end after 's' to see if
7914 * any are unacknowledged.
7917 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7918 if (BB_OFFSET(p[lo]) < target) {
7919 /* starts before the end, and finishes after
7920 * the start, so they must overlap
7922 if (rv != -1 && BB_ACK(p[lo]))
7926 *first_bad = BB_OFFSET(p[lo]);
7927 *bad_sectors = BB_LEN(p[lo]);
7933 if (read_seqretry(&bb->lock, seq))
7938 EXPORT_SYMBOL_GPL(md_is_badblock);
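/*
 * Illustrative sketch, not part of md.c: how a single 64-bit bad-block
 * table entry encodes a range, using the BB_MAKE/BB_OFFSET/BB_LEN/BB_ACK
 * helpers already used above (assumed to follow the layout described in
 * the comment: 9-bit length-1, 54-bit start, 1 'acknowledged' bit).  The
 * values are arbitrary examples.
 */
static void __maybe_unused example_badblock_entry(void)
{
        /* a 16-sector acknowledged bad range starting at sector 4096 */
        u64 entry = BB_MAKE(4096, 16, 1);

        /* BB_OFFSET(entry) == 4096  (start, in table granularity)
         * BB_LEN(entry)    == 16    (1..512, stored as length-1)
         * BB_ACK(entry)    == 1     (most significant bit)
         */
        pr_debug("bad range %llu+%d ack=%d\n",
                 (unsigned long long)BB_OFFSET(entry),
                 (int)BB_LEN(entry), BB_ACK(entry));
}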
7941 * Add a range of bad blocks to the table.
7942 * This might extend the table, or might contract it
7943 * if two adjacent ranges can be merged.
7944 * We binary-search to find the 'insertion' point, then
7945 * decide how best to handle it.
7947 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
7953 unsigned long flags;
7956 /* badblocks are disabled */
7960 /* round the start down, and the end up */
7961 sector_t next = s + sectors;
7963 next += (1<<bb->shift) - 1;
7968 write_seqlock_irqsave(&bb->lock, flags);
7973 /* Find the last range that starts at-or-before 's' */
7974 while (hi - lo > 1) {
7975 int mid = (lo + hi) / 2;
7976 sector_t a = BB_OFFSET(p[mid]);
7982 if (hi > lo && BB_OFFSET(p[lo]) > s)
7986 /* we found a range that might merge with the start
7989 sector_t a = BB_OFFSET(p[lo]);
7990 sector_t e = a + BB_LEN(p[lo]);
7991 int ack = BB_ACK(p[lo]);
7993 /* Yes, we can merge with a previous range */
7994 if (s == a && s + sectors >= e)
7995 /* new range covers old */
7998 ack = ack && acknowledged;
8000 if (e < s + sectors)
8002 if (e - a <= BB_MAX_LEN) {
8003 p[lo] = BB_MAKE(a, e-a, ack);
8006 /* does not all fit in one range,
8007 * make p[lo] maximal
8009 if (BB_LEN(p[lo]) != BB_MAX_LEN)
8010 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
8016 if (sectors && hi < bb->count) {
8017 /* 'hi' points to the first range that starts after 's'.
8018 * Maybe we can merge with the start of that range */
8019 sector_t a = BB_OFFSET(p[hi]);
8020 sector_t e = a + BB_LEN(p[hi]);
8021 int ack = BB_ACK(p[hi]);
8022 if (a <= s + sectors) {
8023 /* merging is possible */
8024 if (e <= s + sectors) {
8029 ack = ack && acknowledged;
8032 if (e - a <= BB_MAX_LEN) {
8033 p[hi] = BB_MAKE(a, e-a, ack);
8036 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
8044 if (sectors == 0 && hi < bb->count) {
8045 /* we might be able to combine lo and hi */
8046 /* Note: 's' is at the end of 'lo' */
8047 sector_t a = BB_OFFSET(p[hi]);
8048 int lolen = BB_LEN(p[lo]);
8049 int hilen = BB_LEN(p[hi]);
8050 int newlen = lolen + hilen - (s - a);
8051 if (s >= a && newlen < BB_MAX_LEN) {
8052 /* yes, we can combine them */
8053 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
8054 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
8055 memmove(p + hi, p + hi + 1,
8056 (bb->count - hi - 1) * 8);
8061 /* didn't merge (it all).
8062 * Need to add a range just before 'hi' */
8063 if (bb->count >= MD_MAX_BADBLOCKS) {
8064 /* No room for more */
8068 int this_sectors = sectors;
8069 memmove(p + hi + 1, p + hi,
8070 (bb->count - hi) * 8);
8073 if (this_sectors > BB_MAX_LEN)
8074 this_sectors = BB_MAX_LEN;
8075 p[hi] = BB_MAKE(s, this_sectors, acknowledged);
8076 sectors -= this_sectors;
8083 bb->unacked_exist = 1;
8084 write_sequnlock_irqrestore(&bb->lock, flags);
8089 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8094 s += rdev->new_data_offset;
8096 s += rdev->data_offset;
8097 rv = md_set_badblocks(&rdev->badblocks,
8100 /* Make sure they get written out promptly */
8101 sysfs_notify_dirent_safe(rdev->sysfs_state);
8102 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
8103 md_wakeup_thread(rdev->mddev->thread);
8107 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
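/*
 * Illustrative sketch, not part of md.c: the intended use of
 * rdev_set_badblocks() when a write to one member device fails.  If the
 * range cannot be recorded (table full, so the call returns 0), the only
 * remaining option is to fail the device.  The helper name and parameters
 * are hypothetical.
 */
static void example_record_write_error(struct mddev *mddev,
                                       struct md_rdev *rdev,
                                       sector_t sector, int sectors)
{
        /* is_new == 0: 'sector' is relative to the current data_offset */
        if (!rdev_set_badblocks(rdev, sector, sectors, 0))
                md_error(mddev, rdev);  /* could not record it: eject */
}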
8110 * Remove a range of bad blocks from the table.
8111 * This may involve extending the table if we split a region,
8112 * but it must not fail. So if the table becomes full, we just
8113 * drop the remove request.
8115 static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
8119 sector_t target = s + sectors;
8122 if (bb->shift > 0) {
8123 /* When clearing we round the start up and the end down.
8124 * This should not matter as the shift should align with
8125 * the block size and no rounding should ever be needed.
8126 * However it is better to think a block is bad when it
8127 * isn't than to think a block is not bad when it is.
8129 s += (1<<bb->shift) - 1;
8131 target >>= bb->shift;
8132 sectors = target - s;
8135 write_seqlock_irq(&bb->lock);
8140 /* Find the last range that starts before 'target' */
8141 while (hi - lo > 1) {
8142 int mid = (lo + hi) / 2;
8143 sector_t a = BB_OFFSET(p[mid]);
8150 /* p[lo] is the last range that could overlap the
8151 * current range. Earlier ranges could also overlap,
8152 * but only this one can overlap the end of the range.
8154 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
8155 /* Partial overlap, leave the tail of this range */
8156 int ack = BB_ACK(p[lo]);
8157 sector_t a = BB_OFFSET(p[lo]);
8158 sector_t end = a + BB_LEN(p[lo]);
8161 /* we need to split this range */
8162 if (bb->count >= MD_MAX_BADBLOCKS) {
8166 memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
8168 p[lo] = BB_MAKE(a, s-a, ack);
8171 p[lo] = BB_MAKE(target, end - target, ack);
8172 /* there is no longer an overlap */
8177 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
8178 /* This range does overlap */
8179 if (BB_OFFSET(p[lo]) < s) {
8180 /* Keep the early parts of this range. */
8181 int ack = BB_ACK(p[lo]);
8182 sector_t start = BB_OFFSET(p[lo]);
8183 p[lo] = BB_MAKE(start, s - start, ack);
8184 /* now low doesn't overlap, so.. */
8189 /* 'lo' is strictly before, 'hi' is strictly after,
8190 * anything between needs to be discarded
8193 memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
8194 bb->count -= (hi - lo - 1);
8200 write_sequnlock_irq(&bb->lock);
8204 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8208 s += rdev->new_data_offset;
8210 s += rdev->data_offset;
8211 return md_clear_badblocks(&rdev->badblocks,
8214 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
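/*
 * Illustrative sketch, not part of md.c: the counterpart of the
 * rdev_set_badblocks() example above.  Once a personality has successfully
 * re-written a range that was previously recorded as bad, it can clear the
 * entry again.  The helper name and parameters are hypothetical.
 */
static void example_clear_after_rewrite(struct md_rdev *rdev,
                                        sector_t sector, int sectors)
{
        /* is_new == 0: 'sector' is relative to the current data_offset */
        rdev_clear_badblocks(rdev, sector, sectors, 0);
}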
8217 * Acknowledge all bad blocks in a list.
8218 * This only succeeds if ->changed is clear. It is used by
8219 * in-kernel metadata updates
8221 void md_ack_all_badblocks(struct badblocks *bb)
8223 if (bb->page == NULL || bb->changed)
8224 /* no point even trying */
8226 write_seqlock_irq(&bb->lock);
8228 if (bb->changed == 0 && bb->unacked_exist) {
8231 for (i = 0; i < bb->count ; i++) {
8232 if (!BB_ACK(p[i])) {
8233 sector_t start = BB_OFFSET(p[i]);
8234 int len = BB_LEN(p[i]);
8235 p[i] = BB_MAKE(start, len, 1);
8238 bb->unacked_exist = 0;
8240 write_sequnlock_irq(&bb->lock);
8242 EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
8244 /* sysfs access to bad-blocks list.
8245 * We present two files.
8246 * 'bad-blocks' lists sector numbers and lengths of ranges that
8247 * are recorded as bad. The list is truncated to fit within
8248 * the one-page limit of sysfs.
8249 * Writing "sector length" to this file adds an acknowledged bad-block range.
8251 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
8252 * been acknowledged. Writing to this file adds bad blocks
8253 * without acknowledging them. This is largely for testing.
8257 badblocks_show(struct badblocks *bb, char *page, int unack)
8268 seq = read_seqbegin(&bb->lock);
8273 while (len < PAGE_SIZE && i < bb->count) {
8274 sector_t s = BB_OFFSET(p[i]);
8275 unsigned int length = BB_LEN(p[i]);
8276 int ack = BB_ACK(p[i]);
8282 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
8283 (unsigned long long)s << bb->shift,
8284 length << bb->shift);
8286 if (unack && len == 0)
8287 bb->unacked_exist = 0;
8289 if (read_seqretry(&bb->lock, seq))
8298 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
8300 unsigned long long sector;
8304 /* Allow clearing via sysfs *only* for testing/debugging.
8305 * Normally only a successful write may clear a badblock
8308 if (page[0] == '-') {
8312 #endif /* DO_DEBUG */
8314 switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
8316 if (newline != '\n')
8328 md_clear_badblocks(bb, sector, length);
8331 #endif /* DO_DEBUG */
8332 if (md_set_badblocks(bb, sector, length, !unack))
8338 static int md_notify_reboot(struct notifier_block *this,
8339 unsigned long code, void *x)
8341 struct list_head *tmp;
8342 struct mddev *mddev;
8345 for_each_mddev(mddev, tmp) {
8346 if (mddev_trylock(mddev)) {
8348 __md_stop_writes(mddev);
8349 if (mddev->persistent)
8350 mddev->safemode = 2;
8351 mddev_unlock(mddev);
8356 * certain more exotic SCSI devices are known to be
8357 * volatile wrt too early system reboots. While the
8358 * right place to handle this issue is the given
8359 * driver, we do want to have a safe RAID driver ...
8367 static struct notifier_block md_notifier = {
8368 .notifier_call = md_notify_reboot,
8370 .priority = INT_MAX, /* before any real devices */
8373 static void md_geninit(void)
8375 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8377 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
8380 static int __init md_init(void)
8384 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
8388 md_misc_wq = alloc_workqueue("md_misc", 0, 0);
8392 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
8395 if ((ret = register_blkdev(0, "mdp")) < 0)
8399 blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
8400 md_probe, NULL, NULL);
8401 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
8402 md_probe, NULL, NULL);
8404 register_reboot_notifier(&md_notifier);
8405 raid_table_header = register_sysctl_table(raid_root_table);
8411 unregister_blkdev(MD_MAJOR, "md");
8413 destroy_workqueue(md_misc_wq);
8415 destroy_workqueue(md_wq);
8423 * Searches all registered partitions for autorun RAID arrays
8427 static LIST_HEAD(all_detected_devices);
8428 struct detected_devices_node {
8429 struct list_head list;
8433 void md_autodetect_dev(dev_t dev)
8435 struct detected_devices_node *node_detected_dev;
8437 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
8438 if (node_detected_dev) {
8439 node_detected_dev->dev = dev;
8440 list_add_tail(&node_detected_dev->list, &all_detected_devices);
8442 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
8443 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
8447 static void autostart_arrays(int part)
8449 struct md_rdev *rdev;
8450 struct detected_devices_node *node_detected_dev;
8452 int i_scanned, i_passed;
8457 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
8459 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
8461 node_detected_dev = list_entry(all_detected_devices.next,
8462 struct detected_devices_node, list);
8463 list_del(&node_detected_dev->list);
8464 dev = node_detected_dev->dev;
8465 kfree(node_detected_dev);
8466 rdev = md_import_device(dev,0, 90);
8470 if (test_bit(Faulty, &rdev->flags))
8473 set_bit(AutoDetected, &rdev->flags);
8474 list_add(&rdev->same_set, &pending_raid_disks);
8478 printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
8479 i_scanned, i_passed);
8481 autorun_devices(part);
8484 #endif /* !MODULE */
8486 static __exit void md_exit(void)
8488 struct mddev *mddev;
8489 struct list_head *tmp;
8492 blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
8493 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
8495 unregister_blkdev(MD_MAJOR,"md");
8496 unregister_blkdev(mdp_major, "mdp");
8497 unregister_reboot_notifier(&md_notifier);
8498 unregister_sysctl_table(raid_table_header);
8500 /* We cannot unload the modules while some process is
8501 * waiting for us in select() or poll() - wake them up
8504 while (waitqueue_active(&md_event_waiters)) {
8505 /* not safe to leave yet */
8506 wake_up(&md_event_waiters);
8510 remove_proc_entry("mdstat", NULL);
8512 for_each_mddev(mddev, tmp) {
8513 export_array(mddev);
8514 mddev->hold_active = 0;
8516 destroy_workqueue(md_misc_wq);
8517 destroy_workqueue(md_wq);
8520 subsys_initcall(md_init);
8521 module_exit(md_exit)
8523 static int get_ro(char *buffer, struct kernel_param *kp)
8525 return sprintf(buffer, "%d", start_readonly);
8527 static int set_ro(const char *val, struct kernel_param *kp)
8530 int num = simple_strtoul(val, &e, 10);
8531 if (*val && (*e == '\0' || *e == '\n')) {
8532 start_readonly = num;
8538 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
8539 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
8540 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
8542 MODULE_LICENSE("GPL");
8543 MODULE_DESCRIPTION("MD RAID framework");
8545 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);