Re: [PATCH v2 3/9] drivers/base/memory: introduce "memory groups" to logically group memory blocks
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: 2021-07-28 13:39:57
Also in:
linux-acpi, linux-mm, lkml
On Fri, Jul 23, 2021 at 02:52:04PM +0200, David Hildenbrand wrote:
quoted hunk ↗ jump to hunk
In our "auto-movable" memory onlining policy, we want to make decisions across memory blocks of a single memory device. Examples of memory devices include ACPI memory devices (in the simplest case a single DIMM) and virtio-mem. For now, we don't have a connection between a single memory block device and the real memory device. Each memory device consists of 1..X memory block devices. Let's logically group memory blocks belonging to the same memory device in "memory groups". Memory groups can span multiple physical ranges and a memory group itself does not contain any information regarding physical ranges, only properties (e.g., "max_pages") necessary for improved memory onlining. Introduce two memory group types: 1) Static memory group: E.g., a single ACPI memory device, consisting of 1..X memory resources. A memory group consists of 1..Y memory blocks. The whole group is added/removed in one go. If any part cannot get offlined, the whole group cannot be removed. 2) Dynamic memory group: E.g., a single virtio-mem device. Memory is dynamically added/removed in a fixed granularity, called a "unit", consisting of 1..X memory blocks. A unit is added/removed in one go. If any part of a unit cannot get offlined, the whole unit cannot be removed. In case of 1) we usually want either all memory managed by ZONE_MOVABLE or none. In case of 2) we usually want to have as many units as possible managed by ZONE_MOVABLE. We want a single unit to be of the same type. For now, memory groups are an internal concept that is not exposed to user space; we might want to change that in the future, though. add_memory() users can specify a mgid instead of a nid when passing the MHP_NID_IS_MGID flag. Signed-off-by: David Hildenbrand <redacted> --- drivers/base/memory.c | 102 +++++++++++++++++++++++++++++++-- include/linux/memory.h | 46 ++++++++++++++- include/linux/memory_hotplug.h | 6 +- mm/memory_hotplug.c | 11 +++- 4 files changed, 158 insertions(+), 7 deletions(-)diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 86ec2dc82fc2..42109e7fb0b5 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c@@ -82,6 +82,11 @@ static struct bus_type memory_subsys = { */ static DEFINE_XARRAY(memory_blocks); +/* + * Memory groups, indexed by memory group identification (mgid). + */ +static DEFINE_XARRAY_FLAGS(memory_groups, XA_FLAGS_ALLOC); + static BLOCKING_NOTIFIER_HEAD(memory_chain); int register_memory_notifier(struct notifier_block *nb)@@ -634,7 +639,8 @@ int register_memory(struct memory_block *memory) } static int init_memory_block(unsigned long block_id, unsigned long state, - unsigned long nr_vmemmap_pages) + unsigned long nr_vmemmap_pages, + struct memory_group *group) { struct memory_block *mem; int ret = 0;@@ -653,6 +659,11 @@ static int init_memory_block(unsigned long block_id, unsigned long state, mem->nid = NUMA_NO_NODE; mem->nr_vmemmap_pages = nr_vmemmap_pages; + if (group) { + mem->group = group; + refcount_inc(&group->refcount); + } + ret = register_memory(mem); return ret;@@ -671,7 +682,7 @@ static int add_memory_block(unsigned long base_section_nr) if (section_count == 0) return 0; return init_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE, 0); + MEM_ONLINE, 0, NULL); } static void unregister_memory(struct memory_block *memory)@@ -681,6 +692,11 @@ static void unregister_memory(struct memory_block *memory) WARN_ON(xa_erase(&memory_blocks, memory->dev.id) == NULL); + if (memory->group) { + refcount_dec(&memory->group->refcount); + memory->group = NULL;
Who freed the memory for the group?
quoted hunk ↗ jump to hunk
+ } + /* drop the ref. we got via find_memory_block() */ put_device(&memory->dev); device_unregister(&memory->dev);@@ -694,7 +710,8 @@ static void unregister_memory(struct memory_block *memory) * Called under device_hotplug_lock. */ int create_memory_block_devices(unsigned long start, unsigned long size, - unsigned long vmemmap_pages) + unsigned long vmemmap_pages, + struct memory_group *group) { const unsigned long start_block_id = pfn_to_block_id(PFN_DOWN(start)); unsigned long end_block_id = pfn_to_block_id(PFN_DOWN(start + size));@@ -707,7 +724,8 @@ int create_memory_block_devices(unsigned long start, unsigned long size, return -EINVAL; for (block_id = start_block_id; block_id != end_block_id; block_id++) { - ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages); + ret = init_memory_block(block_id, MEM_OFFLINE, vmemmap_pages, + group); if (ret) break; }@@ -891,3 +909,79 @@ int for_each_memory_block(void *arg, walk_memory_blocks_func_t func) return bus_for_each_dev(&memory_subsys, NULL, &cb_data, for_each_memory_block_cb); } + +static int register_memory_group(struct memory_group group) +{ + struct memory_group *new_group; + uint32_t mgid; + int ret; + + if (!node_possible(group.nid)) + return -EINVAL; + + new_group = kzalloc(sizeof(group), GFP_KERNEL); + if (!new_group) + return -ENOMEM; + *new_group = group;
You burried a memcpy here, why? Please be explicit as this is now a dynamic structure.
+ refcount_set(&new_group->refcount, 1);
Why not just use a kref? You seem to be treating it as a kref would work, right?
+ + ret = xa_alloc(&memory_groups, &mgid, new_group, xa_limit_31b, + GFP_KERNEL); + if (ret) + kfree(new_group); + return ret ? ret : mgid;
I hate ?: please spell this out: if (ret) return ret; return mgid; There, more obvious and you can read it in 10 years when you have to go fix it up...
+}
+
+int register_static_memory_group(int nid, unsigned long max_pages)
+{
+ struct memory_group group = {
+ .nid = nid,
+ .s = {
+ .max_pages = max_pages,
+ },
+ };
+
+ if (!max_pages)
+ return -EINVAL;
+ return register_memory_group(group);
+}
+EXPORT_SYMBOL_GPL(register_static_memory_group);Let's make our global namespace a bit nicer: memory_group_register_static() memory_group_register_dynamic() and so on. Use prefixes please, not suffixes.
+
+int register_dynamic_memory_group(int nid, unsigned long unit_pages)
+{
+ struct memory_group group = {
+ .nid = nid,
+ .is_dynamic = true,
+ .d = {
+ .unit_pages = unit_pages,
+ },
+ };
+
+ if (!unit_pages || !is_power_of_2(unit_pages) ||
+ unit_pages < PHYS_PFN(memory_block_size_bytes()))
+ return -EINVAL;
+ return register_memory_group(group);
+}
+EXPORT_SYMBOL_GPL(register_dynamic_memory_group);
+
+int unregister_memory_group(int mgid)
+{
+ struct memory_group *group;
+
+ if (mgid < 0)
+ return -EINVAL;
+
+ group = xa_load(&memory_groups, mgid);
+ if (!group || refcount_read(&group->refcount) > 1)
+ return -EINVAL;
+
+ xa_erase(&memory_groups, mgid);
+ kfree(group);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_memory_group);memory_group_unregister()
+
+struct memory_group *get_memory_group(int mgid)
+{
+ return xa_load(&memory_groups, mgid);
+}Global function?
quoted hunk ↗ jump to hunk
diff --git a/include/linux/memory.h b/include/linux/memory.h index 97e92e8b556a..6e20a6174fe5 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h@@ -23,6 +23,42 @@ #define MIN_MEMORY_BLOCK_SIZE (1UL << SECTION_SIZE_BITS) +struct memory_group { + /* Nid the whole group belongs to. */ + int nid;
What is a "nid"?
+ /* References from memory blocks + 1. */
Blank line above this? And put the structure comments in proper kernel doc so that others can read them and we can verify it is correct over time.
+ refcount_t refcount;
+ /*
+ * Memory group type: static vs. dynamic.
+ *
+ * Static: All memory in the group belongs to a single unit, such as,
+ * a DIMM. All memory belonging to the group will be added in
+ * one go and removed in one go -- it's static.
+ *
+ * Dynamic: Memory within the group is added/removed dynamically in
+ * units of the specified granularity of at least one memory block.
+ */
+ bool is_dynamic;
+
+ union {
+ struct {
+ /*
+ * Maximum number of pages we'll have in this static
+ * memory group.
+ */
+ unsigned long max_pages;
+ } s;
+ struct {
+ /*
+ * Unit in pages in which memory is added/removed in
+ * this dynamic memory group. This granularity defines
+ * the alignment of a unit in physical address space.
+ */
+ unsigned long unit_pages;
+ } d;so is_dynamic determines which to use here? Please be explicit. thanks, greg k-h _______________________________________________ Virtualization mailing list Virtualization@lists.linux-foundation.org https://lists.linuxfoundation.org/mailman/listinfo/virtualization