Thread (32 messages) 32 messages, 8 authors, 2021-12-19
STALE1652d

[PATCH 4/4] btrfs: add allocator_hint mode

From: Goffredo Baroncelli <hidden>
Date: 2021-10-24 15:39:41
Subsystem: btrfs file system, filesystems (vfs and infrastructure), the rest · Maintainers: Chris Mason, David Sterba, Alexander Viro, Christian Brauner, Linus Torvalds

From: Goffredo Baroncelli <redacted>

When this mode is enabled, the chunk allocation policy is modified as
follow.

Each disk may have a different tag:
- BTRFS_DEV_ALLOCATION_PREFERRED_METADATA
- BTRFS_DEV_ALLOCATION_METADATA_ONLY
- BTRFS_DEV_ALLOCATION_DATA_ONLY
- BTRFS_DEV_ALLOCATION_PREFERRED_DATA (default)

Where:
- ALLOCATION_PREFERRED_X means that it is preferred to use this disk for
the X chunk type (the other type may be allowed when the space is low)
- ALLOCATION_X_ONLY means that it is used *only* for the X chunk type.
This means also that it is a preferred choice.

Each time the allocator allocates a chunk of type X , first it takes the
disks tagged as ALLOCATION_X_ONLY or ALLOCATION_PREFERRED_X; if the space
is not enough, it uses also the disks tagged as ALLOCATION_METADATA_ONLY;
if the space is not enough, it uses also the other disks, with the
exception of the one marked as ALLOCATION_PREFERRED_Y, where Y the other
type of chunk (i.e. not X).

Signed-off-by: Goffredo Baroncelli <redacted>
---
 fs/btrfs/volumes.c | 98 +++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/volumes.h |  1 +
 2 files changed, 98 insertions(+), 1 deletion(-)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8ac99771f43c..7ee9c6e7bd44 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -153,6 +153,20 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 	},
 };
 
+#define BTRFS_DEV_ALLOCATION_MASK ((1ULL << \
+		BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT) - 1)
+#define BTRFS_DEV_ALLOCATION_MASK_COUNT (1ULL << \
+		BTRFS_DEV_ALLOCATION_MASK_BIT_COUNT)
+
+static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_MASK_COUNT] = {
+	[BTRFS_DEV_ALLOCATION_DATA_ONLY] = -1,
+	[BTRFS_DEV_ALLOCATION_PREFERRED_DATA] = 0,
+	[BTRFS_DEV_ALLOCATION_PREFERRED_METADATA] = 1,
+	[BTRFS_DEV_ALLOCATION_METADATA_ONLY] = 2,
+	/* the other values are set to 0 */
+};
+
+
 const char *btrfs_bg_type_to_raid_name(u64 flags)
 {
 	const int index = btrfs_bg_flags_to_raid_index(flags);
@@ -4997,13 +5011,18 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
 }
 
 /*
- * sort the devices in descending order by max_avail, total_avail
+ * sort the devices in descending order by alloc_hint,
+ * max_avail, total_avail
  */
 static int btrfs_cmp_device_info(const void *a, const void *b)
 {
 	const struct btrfs_device_info *di_a = a;
 	const struct btrfs_device_info *di_b = b;
 
+	if (di_a->alloc_hint > di_b->alloc_hint)
+		return -1;
+	if (di_a->alloc_hint < di_b->alloc_hint)
+		return 1;
 	if (di_a->max_avail > di_b->max_avail)
 		return -1;
 	if (di_a->max_avail < di_b->max_avail)
@@ -5166,6 +5185,8 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
 	int ndevs = 0;
 	u64 max_avail;
 	u64 dev_offset;
+	int hint;
+	int i;
 
 	/*
 	 * in the first pass through the devices list, we gather information
@@ -5218,16 +5239,91 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices,
 		devices_info[ndevs].max_avail = max_avail;
 		devices_info[ndevs].total_avail = total_avail;
 		devices_info[ndevs].dev = device;
+
+		if ((ctl->type & BTRFS_BLOCK_GROUP_DATA) &&
+		     (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) {
+			/*
+			 * if mixed bg set all the alloc_hint
+			 * fields to the same value, so the sorting
+			 * is not affected
+			 */
+			devices_info[ndevs].alloc_hint = 0;
+		} else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) {
+			hint = device->type & BTRFS_DEV_ALLOCATION_MASK;
+
+			/*
+			 * skip BTRFS_DEV_METADATA_ONLY disks
+			 */
+			if (hint == BTRFS_DEV_ALLOCATION_METADATA_ONLY)
+				continue;
+			/*
+			 * if a data chunk must be allocated,
+			 * sort also by hint (data disk
+			 * higher priority)
+			 */
+			devices_info[ndevs].alloc_hint = -alloc_hint_map[hint];
+		} else { /* BTRFS_BLOCK_GROUP_METADATA */
+			hint = device->type & BTRFS_DEV_ALLOCATION_MASK;
+
+			/*
+			 * skip BTRFS_DEV_DATA_ONLY disks
+			 */
+			if (hint == BTRFS_DEV_ALLOCATION_DATA_ONLY)
+				continue;
+			/*
+			 * if a data chunk must be allocated,
+			 * sort also by hint (metadata hint
+			 * higher priority)
+			 */
+			devices_info[ndevs].alloc_hint = alloc_hint_map[hint];
+		}
+
 		++ndevs;
 	}
 	ctl->ndevs = ndevs;
 
+	/*
+	 * no devices available
+	 */
+	if (!ndevs)
+		return 0;
+
 	/*
 	 * now sort the devices by hole size / available space
 	 */
 	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
 	     btrfs_cmp_device_info, NULL);
 
+	/*
+	 * select the minimum set of disks grouped by hint that
+	 * can host the chunk
+	 */
+	ndevs = 0;
+	while (ndevs < ctl->ndevs) {
+		hint = devices_info[ndevs++].alloc_hint;
+		while (ndevs < ctl->ndevs &&
+		       devices_info[ndevs].alloc_hint == hint)
+				ndevs++;
+		if (ndevs >= ctl->devs_min)
+			break;
+	}
+
+	BUG_ON(ndevs > ctl->ndevs);
+	ctl->ndevs = ndevs;
+
+	/*
+	 * the next layers require the devices_info ordered by
+	 * max_avail. If we are returing two (or more) different
+	 * group of alloc_hint, this is not always true. So sort
+	 * these gain.
+	 */
+
+	for (i = 0 ; i < ndevs ; i++)
+		devices_info[i].alloc_hint = 0;
+
+	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+	     btrfs_cmp_device_info, NULL);
+
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index b8250f29df6e..37eb37b533c5 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -369,6 +369,7 @@ struct btrfs_device_info {
 	u64 dev_offset;
 	u64 max_avail;
 	u64 total_avail;
+	int alloc_hint;
 };
 
 struct btrfs_raid_attr {
-- 
2.33.0
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help