Re: [PATCH 2/3] libxfs: simulate system failure after a certain number of writes
From: Brian Foster <hidden>
Date: 2021-02-16 11:59:07
On Fri, Feb 12, 2021 at 09:46:56PM -0800, Darrick J. Wong wrote:
quoted hunk ↗ jump to hunk
From: Darrick J. Wong <djwong@kernel.org>

Add an error injection knob so that we can simulate system failure after
a certain number of disk writes. This knob is being added so that we can
check repair's behavior after an arbitrary number of tests.

Set LIBXFS_DEBUG_WRITE_CRASH={ddev,logdev,rtdev}=nn in the environment to
make libxfs SIGKILL itself after nn writes to the data, log, or rt
devices. Note that this only applies to xfs_buf writes and zero_range.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
---
 libxfs/init.c      | 68 +++++++++++++++++++++++++++++++++++++++++++++++++---
 libxfs/libxfs_io.h | 19 +++++++++++++++
 libxfs/rdwr.c      |  6 ++++-
 3 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/libxfs/init.c b/libxfs/init.c
index 8a8ce3c4..1ec83791 100644
--- a/libxfs/init.c
+++ b/libxfs/init.c
...
quoted hunk ↗ jump to hunk
@@ -614,6 +634,46 @@ libxfs_buftarg_init( dev_t logdev, dev_t rtdev) { + char *p = getenv("LIBXFS_DEBUG_WRITE_CRASH"); + unsigned long dfail = 0, lfail = 0, rfail = 0;
Was there a reason for using an environment variable now rather than the original command line option?
+
+ /* Simulate utility crash after a certain number of writes. */
+ while (p && *p) {
+ char *val;
+
+ switch (getsubopt(&p, wf_opts, &val)) {
+ case WF_DATA:
+ if (!val) {
+ fprintf(stderr,
+ _("ddev write fail requires a parameter\n"));
+ exit(1);
+ }
+ dfail = strtoul(val, NULL, 0);
+ break;
+ case WF_LOG:
+ if (!val) {
+ fprintf(stderr,
+ _("logdev write fail requires a parameter\n"));
+ exit(1);
+ }
+ lfail = strtoul(val, NULL, 0);
+ break;
+ case WF_RT:
+ if (!val) {
+ fprintf(stderr,
+ _("rtdev write fail requires a parameter\n"));
+ exit(1);
+ }
+ rfail = strtoul(val, NULL, 0);
+ break;
+ default:
+ fprintf(stderr, _("unknown write fail type %s\n"),
+ val);
+ exit(1);
+ break;
+ }
+ }
+
if (mp->m_ddev_targp) {
/* should already have all buftargs initialised */
if (mp->m_ddev_targp->bt_bdev != dev ||...
quoted hunk ↗ jump to hunk
diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h index c80e2d59..85485257 100644 --- a/libxfs/libxfs_io.h +++ b/libxfs/libxfs_io.h
...
quoted hunk ↗ jump to hunk
@@ -30,6 +32,23 @@ struct xfs_buftarg { #define XFS_BUFTARG_LOST_WRITE (1 << 0) /* A dirty buffer failed the write verifier. */ #define XFS_BUFTARG_CORRUPT_WRITE (1 << 1) +/* Simulate failure after a certain number of writes. */ +#define XFS_BUFTARG_INJECT_WRITE_FAIL (1 << 2) + +/* Simulate the system crashing after a write. */ +static inline void +xfs_buftarg_trip_write( + struct xfs_buftarg *btp) +{ + if (!(btp->flags & XFS_BUFTARG_INJECT_WRITE_FAIL)) + return; + + pthread_mutex_lock(&btp->lock); + btp->writes_left--; + if (!btp->writes_left) + kill(getpid(), SIGKILL);
Can we just exit()? (Same question for the next patch.)

Brian
quoted hunk ↗ jump to hunk
+ pthread_mutex_unlock(&btp->lock); +} extern void libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev, dev_t logdev, dev_t rtdev);diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c index ca272387..fd456d6b 100644 --- a/libxfs/rdwr.c +++ b/libxfs/rdwr.c@@ -74,8 +74,10 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) /* try to use special zeroing methods, fall back to writes if needed */ len_bytes = LIBXFS_BBTOOFF64(len); error = platform_zero_range(fd, start_offset, len_bytes); - if (!error) + if (!error) { + xfs_buftarg_trip_write(btp); return 0; + } zsize = min(BDSTRAT_SIZE, BBTOB(len)); if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) {@@ -105,6 +107,7 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) progname, __FUNCTION__); exit(1); } + xfs_buftarg_trip_write(btp); offset += bytes; } free(z);@@ -860,6 +863,7 @@ libxfs_bwrite( } else { bp->b_flags |= LIBXFS_B_UPTODATE; bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_UNCHECKED); + xfs_buftarg_trip_write(bp->b_target); } return bp->b_error; }