raid5-cache: add journal hot add/remove support

Add support for journal disk hot add/remove. Mostly trival checks in md
part. The raid5 part is a little tricky. For hot-remove, we can't wait
pending write as it's called from raid5d. The wait will cause deadlock.
We simplily fail the hot-remove. A hot-remove retry can success
eventually since if journal disk is faulty all pending write will be
failed and finish. For hot-add, since an array supporting journal but
without journal disk will be marked read-only, we are safe to hot add
journal without stopping IO (should be read IO, while journal only
handles write IO).

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: NeilBrown <neilb@suse.com>
This commit is contained in:
Shaohua Li 2015-12-21 10:51:02 +11:00 committed by NeilBrown
parent 9ebc6ef188
commit f6b6ec5cfa
3 changed files with 68 additions and 24 deletions

View File

@ -2055,8 +2055,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
return -EEXIST;
/* make sure rdev->sectors exceeds mddev->dev_sectors */
if (rdev->sectors && (mddev->dev_sectors == 0 ||
rdev->sectors < mddev->dev_sectors)) {
if (!test_bit(Journal, &rdev->flags) &&
rdev->sectors &&
(mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
if (mddev->pers) {
/* Cannot change size, so fail
* If mddev->level <= 0, then we don't care
@ -2087,7 +2088,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
}
}
rcu_read_unlock();
if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
if (!test_bit(Journal, &rdev->flags) &&
mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
mdname(mddev), mddev->max_disks);
return -EBUSY;
@ -6044,8 +6046,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
else
clear_bit(WriteMostly, &rdev->flags);
if (info->state & (1<<MD_DISK_JOURNAL))
if (info->state & (1<<MD_DISK_JOURNAL)) {
struct md_rdev *rdev2;
bool has_journal = false;
/* make sure no existing journal disk */
rdev_for_each(rdev2, mddev) {
if (test_bit(Journal, &rdev2->flags)) {
has_journal = true;
break;
}
}
if (has_journal) {
export_rdev(rdev);
return -EBUSY;
}
set_bit(Journal, &rdev->flags);
}
/*
* check whether the device shows up in other nodes
*/
@ -8181,18 +8198,19 @@ static int remove_and_add_spares(struct mddev *mddev,
continue;
if (test_bit(Faulty, &rdev->flags))
continue;
if (test_bit(Journal, &rdev->flags))
continue;
if (!test_bit(Journal, &rdev->flags)) {
if (mddev->ro &&
! (rdev->saved_raid_disk >= 0 &&
!test_bit(Bitmap_sync, &rdev->flags)))
continue;
rdev->recovery_offset = 0;
}
if (mddev->pers->
hot_add_disk(mddev, rdev) == 0) {
if (sysfs_link_rdev(mddev, rdev))
/* failure here is OK */;
if (!test_bit(Journal, &rdev->flags))
spares++;
md_new_event(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);

View File

@ -799,10 +799,18 @@ void r5l_quiesce(struct r5l_log *log, int state)
bool r5l_log_disk_error(struct r5conf *conf)
{
struct r5l_log *log;
bool ret;
/* don't allow write if journal disk is missing */
if (!conf->log)
return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return test_bit(Faulty, &conf->log->rdev->flags);
rcu_read_lock();
log = rcu_dereference(conf->log);
if (!log)
ret = test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
else
ret = test_bit(Faulty, &log->rdev->flags);
rcu_read_unlock();
return ret;
}
struct r5l_recovery_ctx {
@ -1165,7 +1173,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
if (r5l_load_log(log))
goto error;
conf->log = log;
rcu_assign_pointer(conf->log, log);
return 0;
error:
md_unregister_thread(&log->reclaim_thread);

View File

@ -7139,14 +7139,19 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct disk_info *p = conf->disks + number;
print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags)) {
if (test_bit(Journal, &rdev->flags) && conf->log) {
struct r5l_log *log;
/*
* journal disk is not removable, but we need give a chance to
* update superblock of other disks. Otherwise journal disk
* will be considered as 'fresh'
* we can't wait pending write here, as this is called in
* raid5d, wait will deadlock.
*/
set_bit(MD_CHANGE_DEVS, &mddev->flags);
return -EINVAL;
if (atomic_read(&mddev->writes_pending))
return -EBUSY;
log = conf->log;
conf->log = NULL;
synchronize_rcu();
r5l_exit_log(log);
return 0;
}
if (rdev == p->rdev)
rdevp = &p->rdev;
@ -7210,8 +7215,21 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int first = 0;
int last = conf->raid_disks - 1;
if (test_bit(Journal, &rdev->flags))
return -EINVAL;
if (test_bit(Journal, &rdev->flags)) {
char b[BDEVNAME_SIZE];
if (conf->log)
return -EBUSY;
rdev->raid_disk = 0;
/*
* The array is in readonly mode if journal is missing, so no
* write requests running. We should be safe
*/
r5l_init_log(conf, rdev);
printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
mdname(mddev), bdevname(rdev->bdev, b));
return 0;
}
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;