Thread (12 messages) 12 messages, 3 authors, 2020-11-11

Re: [PATCH 2/2] md/cluster: fix deadlock when doing reshape job

From: Guoqing Jiang <hidden>
Date: 2020-11-10 06:36:39


On 11/8/20 15:53, Zhao Heming wrote:
There is a similar deadlock in commit 0ba959774e93
("md-cluster: use sync way to handle METADATA_UPDATED msg")

This issue is very like 0ba959774e93, except <c>.
0ba959774e93 step c is "update sb", this issue is "mdadm --remove"
nodeA                       nodeB
--------------------     --------------------
a.
send METADATA_UPDATED
held token_lockres:EX
                          b.
                          md_do_sync
                           resync_info_update
                             send RESYNCING
                              + set MD_CLUSTER_SEND_LOCK
                              + wait for holding token_lockres:EX

                          c.
                          mdadm /dev/md0 --remove /dev/sdg
                           + held reconfig_mutex
                           + send REMOVE
                              + wait_event(MD_CLUSTER_SEND_LOCK)

                          d.
                          recv_daemon //METADATA_UPDATED from A
                           process_metadata_update
                            + (mddev_trylock(mddev) ||
                               MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD)
                              //this time, both return false forever.
Explaination:

a>
A send METADATA_UPDATED
this will block all other nodes to send msg in cluster.

b>
B does sync jobs, so it will send RESYNCING at intervals.
this will be block for holding token_lockres:EX lock.
md_do_sync
  raid1_sync_request
   resync_info_update
    sendmsg //with mddev_locked: false
     lock_comm
      + wait_event(cinfo->wait,
      |    !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
      + lock_token //for step<a>, block holding EX lock
         + dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); // blocking
c>
B do "--remove" action and will send REMOVE.
this will be blocked by step <b>: MD_CLUSTER_SEND_LOCK is 1.
md_ioctl
+ mddev_lock(mddev) //holding reconfig_mutex, it influnces <d>
+ hot_remove_disk
    remove_disk
     sendmsg
      lock_comm
        wait_event(cinfo->wait,
          !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));//blocking
d>
B recv METADATA_UPDATED msg which send from A in step <a>.
this will be blocked by step <c>: holding mddev lock, it makes
wait_event can't hold mddev lock. (btw,
MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD keep ZERO in this scenario.)
process_metadata_update
   wait_event(mddev->thread->wqueue,
         (got_lock = mddev_trylock(mddev)) ||
         test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
How to fix:

There are two sides to fix (or break the dead loop):
1. on sending msg side, modify lock_comm, change it to return
    success/failed.
    This will make mdadm cmd return error when lock_comm is timeout.
2. on receiving msg side, process_metadata_update need to add error
    handling.
    currently, other msg types won't trigger error or error doesn't need
    to return sender. So only process_metadata_update need to modify.

Ether of 1 & 2 can fix the hunging issue, but I prefer fix on both side.
It's better if you put the deadlock information (such as the 'D' task 
stack) to the header.
quoted hunk ↗ jump to hunk
Signed-off-by: Zhao Heming <redacted>
---
  drivers/md/md-cluster.c | 42 ++++++++++++++++++++++++++---------------
  1 file changed, 27 insertions(+), 15 deletions(-)
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 4aaf4820b6f6..d59a033e7589 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -523,19 +523,24 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
  }
  
  
-static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
+static int process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
  {
-	int got_lock = 0;
+	int got_lock = 0, rv;
  	struct md_cluster_info *cinfo = mddev->cluster_info;
  	mddev->good_device_nr = le32_to_cpu(msg->raid_slot);
  
  	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
-	wait_event(mddev->thread->wqueue,
-		   (got_lock = mddev_trylock(mddev)) ||
-		    test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state));
-	md_reload_sb(mddev, mddev->good_device_nr);
-	if (got_lock)
-		mddev_unlock(mddev);
+	rv = wait_event_timeout(mddev->thread->wqueue,
+			   (got_lock = mddev_trylock(mddev)) ||
+			   test_bit(MD_CLUSTER_HOLDING_MUTEX_FOR_RECVD, &cinfo->state),
+			   msecs_to_jiffies(5000));
+	if (rv) {
+		md_reload_sb(mddev, mddev->good_device_nr);
+		if (got_lock)
+			mddev_unlock(mddev);
+		return 0;
+	}
+	return -1;
  }
  
  static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
@@ -578,7 +583,7 @@ static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
  		return -1;
  	switch (le32_to_cpu(msg->type)) {
  	case METADATA_UPDATED:
-		process_metadata_update(mddev, msg);
+		ret = process_metadata_update(mddev, msg);
  		break;
  	case CHANGE_CAPACITY:
  		set_capacity(mddev->gendisk, mddev->array_sectors);
@@ -701,10 +706,15 @@ static int lock_token(struct md_cluster_info *cinfo, bool mddev_locked)
   */
  static int lock_comm(struct md_cluster_info *cinfo, bool mddev_locked)
  {
-	wait_event(cinfo->wait,
-		   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state));
+	int rv;
  
-	return lock_token(cinfo, mddev_locked);
+	rv = wait_event_timeout(cinfo->wait,
+			   !test_and_set_bit(MD_CLUSTER_SEND_LOCK, &cinfo->state),
+			   msecs_to_jiffies(5000));
+	if (rv)
+		return lock_token(cinfo, mddev_locked);
+	else
+		return -1;
  }
  
  static void unlock_comm(struct md_cluster_info *cinfo)
@@ -784,9 +794,11 @@ static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg,
  {
  	int ret;
  
-	lock_comm(cinfo, mddev_locked);
-	ret = __sendmsg(cinfo, cmsg);
-	unlock_comm(cinfo);
+	ret = lock_comm(cinfo, mddev_locked);
+	if (!ret) {
+		ret = __sendmsg(cinfo, cmsg);
+		unlock_comm(cinfo);
+	}
  	return ret;
  }
  
What happens if the cluster has latency issue? Let's say, node normally 
can't get lock during the 5s window. IOW, is the timeout value justified
well?

Thanks,
Guoqing
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help