[PATCH v3 1/4] Tools: hv: Reopen the devices if read() or write() returns errors
From: Dexuan Cui <decui@microsoft.com>
Date: 2020-01-25 19:55:09
Also in:
lkml
Subsystem:
hyper-v/azure core and drivers, the rest · Maintainers:
"K. Y. Srinivasan", Haiyang Zhang, Wei Liu, Dexuan Cui, Long Li, Linus Torvalds
The state machine in the hv_utils driver can run out of order in some
corner cases, e.g. if the kvp daemon doesn't call write() fast enough
due to some reason, kvp_timeout_func() can run first and move the state
to HVUTIL_READY; next, when kvp_on_msg() is called it returns -EINVAL
since kvp_transaction.state is smaller than HVUTIL_USERSPACE_REQ; later,
the daemon's write() gets an error -EINVAL, and the daemon will exit().
We can reproduce the issue by sending a SIGSTOP signal to the daemon, wait
for 1 minute, and send a SIGCONT signal to the daemon: the daemon will
exit() quickly.
We can fix the issue by forcing a reset of the device (which means the
daemon can close() and open() the device again) and doing extra necessary
clean-up.
Signed-off-by: Dexuan Cui <decui@microsoft.com>
---
Changes in v2:
This is actually a new patch that makes the daemons more robust.
Changes in v3 (I addressed Michael's comments):
Don't reset target_fd, since that's unnecessary.
Reset target_fname by: target_fname[0] = '\0';
Added the missing "fs_frozen = true;" in vss_operate().
After reopen_vss_fd: if vss_operate(VSS_OP_THAW) can not clear
fs_frozen due to an error, we just exit().
Added comments.
tools/hv/hv_fcopy_daemon.c | 33 +++++++++++++++++++++----
tools/hv/hv_kvp_daemon.c | 36 ++++++++++++++++------------
tools/hv/hv_vss_daemon.c | 49 +++++++++++++++++++++++++++++---------
3 files changed, 88 insertions(+), 30 deletions(-)
diff --git a/tools/hv/hv_fcopy_daemon.c b/tools/hv/hv_fcopy_daemon.c
index aea2d91ab364..48cfa032432c 100644
--- a/tools/hv/hv_fcopy_daemon.c
+++ b/tools/hv/hv_fcopy_daemon.c@@ -80,6 +80,8 @@ static int hv_start_fcopy(struct hv_start_fcopy *smsg) error = 0; done: + if (error) + target_fname[0] = '\0'; return error; }
@@ -108,15 +110,29 @@ static int hv_copy_data(struct hv_do_fcopy *cpmsg) return ret; } +/* + * Reset target_fname to "" in the two below functions for hibernation: if + * the fcopy operation is aborted by hibernation, the daemon should remove the + * partially-copied file; to achieve this, the hv_utils driver always fakes a + * CANCEL_FCOPY message upon suspend, and later when the VM resumes back, + * the daemon calls hv_copy_cancel() to remove the file; if a file is copied + * successfully before suspend, hv_copy_finished() must reset target_fname to + * avoid that the file can be incorrectly removed upon resume, since the faked + * CANCEL_FCOPY message is spurious in this case. + */ static int hv_copy_finished(void) { close(target_fd); + target_fname[0] = '\0'; return 0; } static int hv_copy_cancel(void) { close(target_fd); - unlink(target_fname); + if (strlen(target_fname) > 0) { + unlink(target_fname); + target_fname[0] = '\0'; + } return 0; }
@@ -141,7 +157,7 @@ int main(int argc, char *argv[]) struct hv_do_fcopy copy; __u32 kernel_modver; } buffer = { }; - int in_handshake = 1; + int in_handshake; static struct option long_options[] = { {"help", no_argument, 0, 'h' },
@@ -170,6 +186,10 @@ int main(int argc, char *argv[]) openlog("HV_FCOPY", 0, LOG_USER); syslog(LOG_INFO, "starting; pid is:%d", getpid()); +reopen_fcopy_fd: + /* Remove any possible partially-copied file on error */ + hv_copy_cancel(); + in_handshake = 1; fcopy_fd = open("/dev/vmbus/hv_fcopy", O_RDWR); if (fcopy_fd < 0) {
@@ -196,7 +216,7 @@ int main(int argc, char *argv[]) len = pread(fcopy_fd, &buffer, sizeof(buffer), 0); if (len < 0) { syslog(LOG_ERR, "pread failed: %s", strerror(errno)); - exit(EXIT_FAILURE); + goto reopen_fcopy_fd; } if (in_handshake) {
@@ -231,9 +251,14 @@ int main(int argc, char *argv[]) } + /* + * pwrite() may return an error due to the faked CANCEL_FCOPY + * message upon hibernation. Ignore the error by resetting the + * dev file, i.e. closing and re-opening it. + */ if (pwrite(fcopy_fd, &error, sizeof(int), 0) != sizeof(int)) { syslog(LOG_ERR, "pwrite failed: %s", strerror(errno)); - exit(EXIT_FAILURE); + goto reopen_fcopy_fd; } } }
diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c
index e9ef4ca6a655..ee9c1bb2293e 100644
--- a/tools/hv/hv_kvp_daemon.c
+++ b/tools/hv/hv_kvp_daemon.c@@ -76,7 +76,7 @@ enum { DNS }; -static int in_hand_shake = 1; +static int in_hand_shake; static char *os_name = ""; static char *os_major = "";
@@ -1360,7 +1360,7 @@ void print_usage(char *argv[]) int main(int argc, char *argv[]) { - int kvp_fd, len; + int kvp_fd = -1, len; int error; struct pollfd pfd; char *p;
@@ -1400,14 +1400,6 @@ int main(int argc, char *argv[]) openlog("KVP", 0, LOG_USER); syslog(LOG_INFO, "KVP starting; pid is:%d", getpid()); - kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC); - - if (kvp_fd < 0) { - syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s", - errno, strerror(errno)); - exit(EXIT_FAILURE); - } - /* * Retrieve OS release information. */
@@ -1423,6 +1415,18 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } +reopen_kvp_fd: + if (kvp_fd != -1) + close(kvp_fd); + in_hand_shake = 1; + kvp_fd = open("/dev/vmbus/hv_kvp", O_RDWR | O_CLOEXEC); + + if (kvp_fd < 0) { + syslog(LOG_ERR, "open /dev/vmbus/hv_kvp failed; error: %d %s", + errno, strerror(errno)); + exit(EXIT_FAILURE); + } + /* * Register ourselves with the kernel. */
@@ -1456,9 +1460,7 @@ int main(int argc, char *argv[]) if (len != sizeof(struct hv_kvp_msg)) { syslog(LOG_ERR, "read failed; error:%d %s", errno, strerror(errno)); - - close(kvp_fd); - return EXIT_FAILURE; + goto reopen_kvp_fd; } /*
@@ -1617,13 +1619,17 @@ int main(int argc, char *argv[]) break; } - /* Send the value back to the kernel. */ + /* + * Send the value back to the kernel. Note: the write() may + * return an error due to hibernation; we can ignore the error + * by resetting the dev file, i.e. closing and re-opening it. + */ kvp_done: len = write(kvp_fd, hv_msg, sizeof(struct hv_kvp_msg)); if (len != sizeof(struct hv_kvp_msg)) { syslog(LOG_ERR, "write failed; error: %d %s", errno, strerror(errno)); - exit(EXIT_FAILURE); + goto reopen_kvp_fd; } }
diff --git a/tools/hv/hv_vss_daemon.c b/tools/hv/hv_vss_daemon.c
index 92902a88f671..dd111870beee 100644
--- a/tools/hv/hv_vss_daemon.c
+++ b/tools/hv/hv_vss_daemon.c@@ -28,6 +28,8 @@ #include <stdbool.h> #include <dirent.h> +static bool fs_frozen; + /* Don't use syslog() in the function since that can cause write to disk */ static int vss_do_freeze(char *dir, unsigned int cmd) {
@@ -155,18 +157,27 @@ static int vss_operate(int operation) continue; } error |= vss_do_freeze(ent->mnt_dir, cmd); - if (error && operation == VSS_OP_FREEZE) - goto err; + if (operation == VSS_OP_FREEZE) { + if (error) + goto err; + fs_frozen = true; + } } endmntent(mounts); if (root_seen) { error |= vss_do_freeze("/", cmd); - if (error && operation == VSS_OP_FREEZE) - goto err; + if (operation == VSS_OP_FREEZE) { + if (error) + goto err; + fs_frozen = true; + } } + if (operation == VSS_OP_THAW && !error) + fs_frozen = false; + goto out; err: save_errno = errno;
@@ -175,6 +186,7 @@ static int vss_operate(int operation) endmntent(mounts); } vss_operate(VSS_OP_THAW); + fs_frozen = false; /* Call syslog after we thaw all filesystems */ if (ent) syslog(LOG_ERR, "FREEZE of %s failed; error:%d %s",
@@ -196,13 +208,13 @@ void print_usage(char *argv[]) int main(int argc, char *argv[]) { - int vss_fd, len; + int vss_fd = -1, len; int error; struct pollfd pfd; int op; struct hv_vss_msg vss_msg[1]; int daemonize = 1, long_index = 0, opt; - int in_handshake = 1; + int in_handshake; __u32 kernel_modver; static struct option long_options[] = {
@@ -232,6 +244,18 @@ int main(int argc, char *argv[]) openlog("Hyper-V VSS", 0, LOG_USER); syslog(LOG_INFO, "VSS starting; pid is:%d", getpid()); +reopen_vss_fd: + if (vss_fd != -1) + close(vss_fd); + if (fs_frozen) { + if (vss_operate(VSS_OP_THAW) || fs_frozen) { + syslog(LOG_ERR, "failed to thaw file system: err=%d", + errno); + exit(EXIT_FAILURE); + } + } + + in_handshake = 1; vss_fd = open("/dev/vmbus/hv_vss", O_RDWR); if (vss_fd < 0) { syslog(LOG_ERR, "open /dev/vmbus/hv_vss failed; error: %d %s",
@@ -284,8 +308,7 @@ int main(int argc, char *argv[]) if (len != sizeof(struct hv_vss_msg)) { syslog(LOG_ERR, "read failed; error:%d %s", errno, strerror(errno)); - close(vss_fd); - return EXIT_FAILURE; + goto reopen_vss_fd; } op = vss_msg->vss_hdr.operation;
@@ -312,14 +335,18 @@ int main(int argc, char *argv[]) default: syslog(LOG_ERR, "Illegal op:%d\n", op); } + + /* + * The write() may return an error due to the faked VSS_OP_THAW + * message upon hibernation. Ignore the error by resetting the + * dev file, i.e. closing and re-opening it. + */ vss_msg->error = error; len = write(vss_fd, vss_msg, sizeof(struct hv_vss_msg)); if (len != sizeof(struct hv_vss_msg)) { syslog(LOG_ERR, "write failed; error: %d %s", errno, strerror(errno)); - - if (op == VSS_OP_FREEZE) - vss_operate(VSS_OP_THAW); + goto reopen_vss_fd; } }
--
2.19.1