mirror of
https://github.com/FEX-Emu/linux.git
synced 2024-12-27 03:47:43 +00:00
ocfs2/dlm: wait for dlm recovery done when migrating all lock resources
Wait for dlm recovery done when migrating all lock resources in case that
new lock resource left after leaving dlm domain. And the left lock
resource will cause other nodes BUG.
NodeA NodeB NodeC
umount:
dlm_unregister_domain()
dlm_migrate_all_locks()
NodeB down
do recovery for NodeB
and collect a new lockres
form other live nodes:
dlm_do_recovery
dlm_remaster_locks
dlm_request_all_locks:
dlm_mig_lockres_handler
dlm_new_lockres
__dlm_insert_lockres
at last NodeA become the
master of the new lockres
and leave domain:
dlm_leave_domain()
mount:
dlm_join_domain()
touch file and request
for the owner of the new
lockres, but all the
other nodes said 'NO',
so NodeC decide to be
the owner, and send do
assert msg to other
nodes:
dlmlock()
dlm_get_lock_resource()
dlm_do_assert_master()
other nodes receive the msg
and found two masters exist.
at last cause BUG in
dlm_assert_master_handler()
-->BUG();
Link: http://lkml.kernel.org/r/5AAA6E25.7090303@huawei.com
Fixes: bc9838c4d4
("dlm: allow dlm do recovery during shutdown")
Signed-off-by: Jun Piao <piaojun@huawei.com>
Reviewed-by: Alex Chen <alex.chen@huawei.com>
Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
Acked-by: Joseph Qi <jiangqi903@gmail.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Changwei Ge <ge.changwei@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
a43d24cb3b
commit
60c7ec9ee4
@ -140,6 +140,7 @@ struct dlm_ctxt
|
||||
u8 node_num;
|
||||
u32 key;
|
||||
u8 joining_node;
|
||||
u8 migrate_done; /* set to 1 means node has migrated all lock resources */
|
||||
wait_queue_head_t dlm_join_events;
|
||||
unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
|
||||
|
@ -461,6 +461,19 @@ redo_bucket:
|
||||
cond_resched_lock(&dlm->spinlock);
|
||||
num += n;
|
||||
}
|
||||
|
||||
if (!num) {
|
||||
if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
|
||||
mlog(0, "%s: perhaps there are more lock resources "
|
||||
"need to be migrated after dlm recovery\n", dlm->name);
|
||||
ret = -EAGAIN;
|
||||
} else {
|
||||
mlog(0, "%s: we won't do dlm recovery after migrating "
|
||||
"all lock resources\n", dlm->name);
|
||||
dlm->migrate_done = 1;
|
||||
}
|
||||
}
|
||||
|
||||
spin_unlock(&dlm->spinlock);
|
||||
wake_up(&dlm->dlm_thread_wq);
|
||||
|
||||
@ -2038,6 +2051,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
|
||||
dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
|
||||
init_waitqueue_head(&dlm->dlm_join_events);
|
||||
|
||||
dlm->migrate_done = 0;
|
||||
|
||||
dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
|
||||
dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
|
||||
|
||||
|
@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
|
||||
|
||||
static void dlm_begin_recovery(struct dlm_ctxt *dlm)
|
||||
{
|
||||
spin_lock(&dlm->spinlock);
|
||||
assert_spin_locked(&dlm->spinlock);
|
||||
BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
|
||||
printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
|
||||
dlm->name, dlm->reco.dead_node);
|
||||
dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
|
||||
spin_unlock(&dlm->spinlock);
|
||||
}
|
||||
|
||||
static void dlm_end_recovery(struct dlm_ctxt *dlm)
|
||||
@ -456,6 +455,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
|
||||
|
||||
spin_lock(&dlm->spinlock);
|
||||
|
||||
if (dlm->migrate_done) {
|
||||
mlog(0, "%s: no need do recovery after migrating all "
|
||||
"lock resources\n", dlm->name);
|
||||
spin_unlock(&dlm->spinlock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* check to see if the new master has died */
|
||||
if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
|
||||
test_bit(dlm->reco.new_master, dlm->recovery_map)) {
|
||||
@ -490,12 +496,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
|
||||
mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
|
||||
dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
|
||||
dlm->reco.dead_node);
|
||||
spin_unlock(&dlm->spinlock);
|
||||
|
||||
/* take write barrier */
|
||||
/* (stops the list reshuffling thread, proxy ast handling) */
|
||||
dlm_begin_recovery(dlm);
|
||||
|
||||
spin_unlock(&dlm->spinlock);
|
||||
|
||||
if (dlm->reco.new_master == dlm->node_num)
|
||||
goto master_here;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user