Thread (41 messages): 2 authors, 3h ago

[PATCH v2 11/39] KVM: arm64: gic-v5: Implement VMT/vIST IRS MMIO Ops

From: Sascha Bischoff <hidden>
Date: 2026-05-21 14:54:15
Also in: kvm, kvmarm
Subsystem: arm64 port (aarch64 architecture), kernel virtual machine for arm64 (kvm/arm64), the rest · Maintainers: Catalin Marinas, Will Deacon, Marc Zyngier, Oliver Upton, Linus Torvalds

GICv5 has rules about which fields of a VMTE (or L1 VMT) may be
directly written by the host once the table is valid. This ensures
that no stale state is cached by the hardware, and provides a clear
interface for making VMs, ISTs, etc, valid.

The hypervisor is responsible for populating the VMTE for a
VM. However, it is not permitted to write the Valid bit (as the VM
table is already valid). Instead, the VM is made valid via an IRS MMIO
Op. The same applies to the ISTs - they must be made valid via the
host IRS.

This commit adds support for:

* Making level 1 VMTs valid (only; there is no corresponding invalidate
  operation), allowing for dynamic level 2 array allocation.
* Making VMTEs (VMs) valid or invalid
* Making SPI/LPI ISTs valid or invalid for a specific VM

As part of this commit, the following vcpu_affinity-based commands are
plumbed in:

        VMT_L2_MAP - Make a second level VM table valid
        VMTE_MAKE_VALID - Make a single VMTE (and hence VM) valid
        VMTE_MAKE_INVALID - Make a single VMTE (and hence VM) invalid
        SPI_VIST_MAKE_VALID - Make the SPI IST valid
        LPI_VIST_MAKE_VALID - Make the LPI IST valid
        LPI_VIST_MAKE_INVALID - Make the LPI IST invalid

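Note: the lack of SPI_VIST_MAKE_INVALID is intentional. SPI ISTs are not
invalidated explicitly; they are invalidated by making the VMTE invalid
during teardown.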

When successfully probing for a GICv5, the VMT is allocated, and is
made valid via the IRS's MMIO interface. Treat failures while
allocating or assigning the VMT as hard GICv5 probe failures. At that
point the IRS VM table state is a prerequisite for vGICv5 operation,
and falling back to the legacy path would leave the host without a
valid GICv5 VM table setup. Later failures can only fall back once the
IRS VMT state has been successfully cleared.

Signed-off-by: Sascha Bischoff <redacted>
---
 arch/arm64/kvm/vgic/vgic-v5-tables.c |  58 ++++++---
 arch/arm64/kvm/vgic/vgic-v5-tables.h |   2 +
 arch/arm64/kvm/vgic/vgic-v5.c        | 188 ++++++++++++++++++++++++++-
 3 files changed, 225 insertions(+), 23 deletions(-)
diff --git a/arch/arm64/kvm/vgic/vgic-v5-tables.c b/arch/arm64/kvm/vgic/vgic-v5-tables.c
index a1d0f620b7913..5c87c6c27087a 100644
--- a/arch/arm64/kvm/vgic/vgic-v5-tables.c
+++ b/arch/arm64/kvm/vgic/vgic-v5-tables.c
@@ -67,6 +67,21 @@ static DEFINE_XARRAY(vm_info);
 #define GICV5_VMTEL2_LPI_SECTION	2
 #define GICV5_VMTEL2_SPI_SECTION	3
 
+static int vgic_v5_alloc_linear_ist(struct kvm *kvm, bool spi_ist,
+				    unsigned int id_bits,
+				    unsigned int istsz);
+static int vgic_v5_alloc_l1_ist(struct kvm *kvm, unsigned int id_bits,
+				unsigned int istsz, unsigned int l2_split);
+static int vgic_v5_alloc_l2_ists(struct kvm *kvm, unsigned int id_bits,
+				 unsigned int istsz, unsigned int l2_split);
+static int vgic_v5_alloc_two_level_lpi_ist(struct kvm *kvm,
+					   unsigned int id_bits,
+					   unsigned int istsz,
+					   unsigned int l2_split);
+static int vgic_v5_linear_ist_free(struct kvm *kvm, bool spi);
+static int vgic_v5_two_level_ist_free(struct kvm *kvm, bool spi);
+static int vgic_v5_spi_ist_free(struct kvm *kvm);
+
 /*
  * Our IRS might be coherent or non-coherent. If coherent, we can just emit a
  * DSB to ensure that we're in sync. However, when non-coherent, we need to
@@ -497,25 +512,6 @@ int vgic_v5_vmte_init(struct kvm *kvm)
 	return ret;
 }
 
-/*
- * The following set of forward declarations makes the code layout a *little*
- * clearer as it lets us keep the IST-related code together.
- */
-static int vgic_v5_alloc_linear_ist(struct kvm *kvm, bool spi_ist,
-				    unsigned int id_bits,
-				    unsigned int istsz);
-static int vgic_v5_alloc_l1_ist(struct kvm *kvm, unsigned int id_bits,
-				unsigned int istsz, unsigned int l2_split);
-static int vgic_v5_alloc_l2_ists(struct kvm *kvm, unsigned int id_bits,
-				 unsigned int istsz, unsigned int l2_split);
-static int vgic_v5_alloc_two_level_lpi_ist(struct kvm *kvm,
-					   unsigned int id_bits,
-					   unsigned int istsz,
-					   unsigned int l2_split);
-static int vgic_v5_linear_ist_free(struct kvm *kvm, bool spi);
-static int vgic_v5_two_level_ist_free(struct kvm *kvm, bool spi);
-static int vgic_v5_spi_ist_free(struct kvm *kvm);
-
 /*
  * Release the VMT Entry, freeing up any allocated data structures before
  * zeroing the VMTE.
@@ -665,6 +661,23 @@ int vgic_v5_vmte_free_vpe(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+phys_addr_t vgic_v5_get_vmt_base(void)
+{
+	phys_addr_t vmt_base;
+
+	if (!vmt_info->two_level)
+		vmt_base = virt_to_phys(vmt_info->linear.vmt_base);
+	else
+		vmt_base = virt_to_phys(vmt_info->l2.vmt_base);
+
+	return vmt_base;
+}
+
+u8 vgic_v5_vmt_vpe_id_bits(void)
+{
+	return fls(vmt_info->max_vpes) - 1;
+}
+
 /*
  * Assign an already allocated IST to the VM by populating the fields in the
  * corresponding VMTE. We re-use this code for both an SPI IST and LPI IST, even
@@ -723,8 +736,13 @@ static int vgic_v5_vmte_assign_ist(struct kvm *kvm, phys_addr_t ist_base,
 	/* Finally, mark the entry as valid */
 	cmd = spi_ist ? SPI_VIST_MAKE_VALID : LPI_VIST_MAKE_VALID;
 	ret = irq_set_vcpu_affinity(vgic_v5_vpe_db(vcpu0), &cmd);
+	if (ret) {
+		WRITE_ONCE(vmte->val[section], 0ULL);
+		vgic_v5_clean_inval(vmte, sizeof(*vmte));
+		return ret;
+	}
 
-	return ret;
+	return 0;
 }
 
 /*
diff --git a/arch/arm64/kvm/vgic/vgic-v5-tables.h b/arch/arm64/kvm/vgic/vgic-v5-tables.h
index 81fed6c5b1559..acd862b8806d1 100644
--- a/arch/arm64/kvm/vgic/vgic-v5-tables.h
+++ b/arch/arm64/kvm/vgic/vgic-v5-tables.h
@@ -82,6 +82,8 @@ static inline int vgic_v5_vpe_db(struct kvm_vcpu *vcpu)
 
 int vgic_v5_vmt_allocate(unsigned int max_vpes);
 int vgic_v5_vmt_free(void);
+phys_addr_t vgic_v5_get_vmt_base(void);
+u8 vgic_v5_vmt_vpe_id_bits(void);
 
 int vgic_v5_allocate_vm_id(struct kvm *kvm);
 void vgic_v5_release_vm_id(struct kvm *kvm);
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index 120eadff9a128..f9578c2a634a4 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -10,10 +10,14 @@
 #include <linux/irqdomain.h>
 
 #include "vgic.h"
+#include "vgic-v5-tables.h"
 
 #define ppi_caps	kvm_vgic_global_state.vgic_v5_ppi_caps
 #define irs_caps	kvm_vgic_global_state.vgic_v5_irs_caps
 
+static int vgic_v5_irs_assign_vmt(bool two_level, u8 vm_id_bits, phys_addr_t vmt_base);
+static int vgic_v5_irs_clear_vmt(void);
+
 /*
  * Not all PPIs are guaranteed to be implemented for GICv5. Deterermine which
  * ones are, and generate a mask.
@@ -36,11 +40,32 @@ static void vgic_v5_get_implemented_ppis(void)
 	__assign_bit(GICV5_ARCH_PPI_PMUIRQ, ppi_caps.impl_ppi_mask, system_supports_pmuv3());
 }
 
+/*
+ * The IRS MMIO interface is shared between all VMs, so serialise the VM
+ * operations that are issued through it.
+ */
+static DEFINE_RAW_SPINLOCK(global_irs_lock);
+
 static u32 irs_readl_relaxed(const u32 reg_offset)
 {
 	return readl_relaxed(irs_caps.irs_base + reg_offset);
 }
 
+static void irs_writel_relaxed(const u32 val, const u32 reg_offset)
+{
+	writel_relaxed(val, irs_caps.irs_base + reg_offset);
+}
+
+static u64 irs_readq_relaxed(const u32 reg_offset)
+{
+	return readq_relaxed(irs_caps.irs_base + reg_offset);
+}
+
+static void irs_writeq_relaxed(const u64 val, const u32 reg_offset)
+{
+	writeq_relaxed(val, irs_caps.irs_base + reg_offset);
+}
+
 static void vgic_v5_irs_extract_vm_caps(const struct gic_kvm_info *info)
 {
 	u64 idr;
@@ -85,6 +110,7 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	int ret;
 
 	kvm_vgic_global_state.type = VGIC_V5;
+	kvm_vgic_global_state.max_gic_vcpus = VGIC_V5_MAX_CPUS;
 
 	kvm_vgic_global_state.vcpu_base = 0;
 	kvm_vgic_global_state.vctrl_base = NULL;
@@ -105,12 +131,49 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	vgic_v5_irs_extract_vm_caps(info);
 	vgic_v5_get_implemented_ppis();
 
+	/*
+	 * Even if the HW supports more per-VM vCPUs, artificially cap as we
+	 * can't use them all.
+	 */
+	kvm_vgic_global_state.max_gic_vcpus = min(irs_caps.max_vpes,
+						  VGIC_V5_MAX_CPUS);
+
+	/*
+	 * GICv5 requires a set of tables to be allocated in order to manage
+	 * VMs. We allocate them in advance here, which alas means that we
+	 * already have to make a decision regarding the maximum number of VMs
+	 * we want to run. For now, we match the maximum number offered by the
+	 * hardware, but this might not be a wise choice in the long term.
+	 */
+	ret = vgic_v5_vmt_allocate(kvm_vgic_global_state.max_gic_vcpus);
+	if (ret) {
+		kvm_err("Failed to allocate the GICv5 VM tables; no GICv5 support\n");
+		return -ENODEV;
+	}
+
+	/*
+	 * We've now allocated the VM table, but the host's IRS doesn't know
+	 * about it yet. Provide the base address of the VMT to the IRS, as well
+	 * as the number of ID bits that it covers and the structure used
+	 * (linear/two-level).
+	 */
+	ret = vgic_v5_irs_assign_vmt(irs_caps.two_level_vmt_support,
+				     ilog2(irs_caps.max_vms),
+				     vgic_v5_get_vmt_base());
+	if (ret) {
+		kvm_err("Failed to assign the GICv5 VM tables to the IRS; no GICv5 support\n");
+		vgic_v5_vmt_free();
+		return -ENODEV;
+	}
+
 	kvm_vgic_global_state.max_gic_vcpus = min(irs_caps.max_vpes,
 						  VGIC_V5_MAX_CPUS);
 
 	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V5);
 	if (ret) {
 		kvm_err("Cannot register GICv5 KVM device.\n");
+		WARN_ON(vgic_v5_irs_clear_vmt());
+		vgic_v5_vmt_free();
 		goto skip_v5;
 	}
 
@@ -138,12 +201,13 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
 	if (ret) {
 		kvm_err("Cannot register GICv3-legacy KVM device.\n");
-		return ret;
+		/* vGICv5 should still work */
+		return v5_registered ? 0 : ret;
 	}
 
 	/* We potentially limit the max VCPUs further than we need to here */
 	kvm_vgic_global_state.max_gic_vcpus = min(VGIC_V3_MAX_CPUS,
-						  VGIC_V5_MAX_CPUS);
+						  kvm_vgic_global_state.max_gic_vcpus);
 
 	static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
 	kvm_info("GCIE legacy system register CPU interface\n");
@@ -153,18 +217,136 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	return 0;
 }
 
+/*
+ * Wait for completion of a change in any of IRS_VMT_BASER, IRS_VMAP_L2_VMTR,
+ * IRS_VMAP_VMR, IRS_VMAP_VPER, IRS_VMAP_VISTR, IRS_VMAP_L2_VISTR.
+ */
+static int vgic_v5_irs_wait_for_vm_op(void)
+{
+	return gicv5_wait_for_op_atomic(irs_caps.irs_base,
+					GICV5_IRS_VMT_STATUSR,
+					GICV5_IRS_VMT_STATUSR_IDLE,
+					NULL);
+}
+
+static int vgic_v5_irs_write_vm_mmio_reg(u64 val, u32 offset)
+{
+	int ret;
+
+	guard(raw_spinlock_irqsave)(&global_irs_lock);
+
+	/* Make sure that we are idle to begin with */
+	ret = vgic_v5_irs_wait_for_vm_op();
+	if (ret)
+		return ret;
+
+	irs_writeq_relaxed(val, offset);
+
+	return vgic_v5_irs_wait_for_vm_op();
+}
+
+static int vgic_v5_irs_assign_vmt(bool two_level, u8 vm_id_bits,
+				  phys_addr_t vmt_base)
+{
+	u64 vmt_baser;
+	u32 vmt_cfgr;
+
+	guard(raw_spinlock_irqsave)(&global_irs_lock);
+
+	vmt_baser = irs_readq_relaxed(GICV5_IRS_VMT_BASER);
+	if (!!FIELD_GET(GICV5_IRS_VMT_BASER_VALID, vmt_baser))
+		return -EBUSY;
+
+	vmt_cfgr = FIELD_PREP(GICV5_IRS_VMT_CFGR_VM_ID_BITS, vm_id_bits);
+	if (two_level)
+		vmt_cfgr |= FIELD_PREP(GICV5_IRS_VMT_CFGR_STRUCTURE,
+				       GICV5_IRS_VMT_CFGR_STRUCTURE_TWO_LEVEL);
+
+	irs_writel_relaxed(vmt_cfgr, GICV5_IRS_VMT_CFGR);
+
+	/* The base address is intentionally only masked and not shifted */
+	vmt_baser = FIELD_PREP(GICV5_IRS_VMT_BASER_VALID, true) |
+		    (vmt_base & GICV5_IRS_VMT_BASER_ADDR);
+	irs_writeq_relaxed(vmt_baser, GICV5_IRS_VMT_BASER);
+
+	return vgic_v5_irs_wait_for_vm_op();
+}
+
+static int vgic_v5_irs_clear_vmt(void)
+{
+	return vgic_v5_irs_write_vm_mmio_reg(0, GICV5_IRS_VMT_BASER);
+}
+
+static int vgic_v5_irs_vmap_l2_vmt(u16 vm_id)
+{
+	u64 val = FIELD_PREP(GICV5_IRS_VMAP_L2_VMTR_VM_ID, vm_id) |
+		GICV5_IRS_VMAP_L2_VMTR_M;
+
+	return vgic_v5_irs_write_vm_mmio_reg(val, GICV5_IRS_VMAP_L2_VMTR);
+}
+
+static int __vgic_v5_irs_vmap_vm(u16 vm_id, bool unmap)
+{
+	u64 val = FIELD_PREP(GICV5_IRS_VMAP_VMR_VM_ID, vm_id) |
+		FIELD_PREP(GICV5_IRS_VMAP_VMR_U, unmap) |
+		GICV5_IRS_VMAP_VMR_M;
+
+	return vgic_v5_irs_write_vm_mmio_reg(val, GICV5_IRS_VMAP_VMR);
+}
+
+static int vgic_v5_irs_set_vm_valid(u16 vm_id)
+{
+	return __vgic_v5_irs_vmap_vm(vm_id, false);
+}
+
+static int vgic_v5_irs_set_vm_invalid(u16 vm_id)
+{
+	return __vgic_v5_irs_vmap_vm(vm_id, true);
+}
+
+static int __vgic_v5_irs_update_vist_validity(u16 vm_id, bool spi_ist, bool unmap)
+{
+	u8 type = spi_ist ? 0b011 : 0b010;
+	u64 val = FIELD_PREP(GICV5_IRS_VMAP_VISTR_TYPE, type) |
+		FIELD_PREP(GICV5_IRS_VMAP_VISTR_VM_ID, vm_id) |
+		FIELD_PREP(GICV5_IRS_VMAP_VISTR_U, unmap) |
+		GICV5_IRS_VMAP_VISTR_M;
+
+	return vgic_v5_irs_write_vm_mmio_reg(val, GICV5_IRS_VMAP_VISTR);
+}
+
+static int vgic_v5_irs_set_vist_valid(u16 vm_id, bool spi_ist)
+{
+	return __vgic_v5_irs_update_vist_validity(vm_id, spi_ist, false);
+}
+
+/*
+ * LPI ISTs can be invalidated explicitly. SPI ISTs are invalidated by making
+ * the VMTE invalid during teardown.
+ */
+static int vgic_v5_irs_set_vist_invalid(u16 vm_id, bool spi_ist)
+{
+	return __vgic_v5_irs_update_vist_validity(vm_id, spi_ist, true);
+}
+
 static int vgic_v5_db_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
 {
+	struct vgic_v5_vm *vm = data->domain->host_data;
 	enum gicv5_vcpu_cmd *cmd = vcpu_info;
 
 	switch (*cmd) {
 	case VMT_L2_MAP:
+		return vgic_v5_irs_vmap_l2_vmt(vm->vm_id);
 	case VMTE_MAKE_VALID:
+		return vgic_v5_irs_set_vm_valid(vm->vm_id);
 	case VMTE_MAKE_INVALID:
+		return vgic_v5_irs_set_vm_invalid(vm->vm_id);
 	case SPI_VIST_MAKE_VALID:
+		return vgic_v5_irs_set_vist_valid(vm->vm_id, true);
 	case LPI_VIST_MAKE_VALID:
+		return vgic_v5_irs_set_vist_valid(vm->vm_id, false);
 	case LPI_VIST_MAKE_INVALID:
-		/* Not yet implemented */
+		return vgic_v5_irs_set_vist_invalid(vm->vm_id, false);
 	default:
 		return -EINVAL;
 	}
-- 
2.34.1
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help