[PATCH 37/37] vfio: Add support for Shared Virtual Addressing
From: Sinan Kaya <hidden>
Date: 2018-02-28 01:26:39
Also in:
kvm, linux-acpi, linux-devicetree, linux-iommu, linux-pci
Possibly related (same subject, not in this thread)
- 2018-03-21 · Re: [PATCH 37/37] vfio: Add support for Shared Virtual Addressing · Jean-Philippe Brucker <hidden>
- 2018-03-19 · Re: [PATCH 37/37] vfio: Add support for Shared Virtual Addressing · Yisheng Xie <hidden>
- 2018-02-28 · Re: [PATCH 37/37] vfio: Add support for Shared Virtual Addressing · Jean-Philippe Brucker <hidden>
- 2018-02-20 · Re: [PATCH 37/37] vfio: Add support for Shared Virtual Addressing · Jean-Philippe Brucker <hidden>
- 2018-02-16 · Re: [PATCH 37/37] vfio: Add support for Shared Virtual Addressing · Alex Williamson <hidden>
On 2/12/2018 1:33 PM, Jean-Philippe Brucker wrote:
quoted hunk
Add two new ioctl for VFIO containers. VFIO_IOMMU_BIND_PROCESS creates a bond between a container and a process address space, identified by a device-specific ID named PASID. This allows the device to target DMA transactions at the process virtual addresses without a need for mapping and unmapping buffers explicitly in the IOMMU. The process page tables are shared with the IOMMU, and mechanisms such as PCI ATS/PRI are used to handle faults. VFIO_IOMMU_UNBIND_PROCESS removes a bond created with VFIO_IOMMU_BIND_PROCESS. Signed-off-by: Jean-Philippe Brucker <redacted> --- drivers/vfio/vfio_iommu_type1.c | 399 ++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/vfio.h | 76 ++++++++ 2 files changed, 475 insertions(+)diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index e30e29ae4819..cac066f0026b 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c@@ -30,6 +30,7 @@ #include <linux/iommu.h> #include <linux/module.h> #include <linux/mm.h> +#include <linux/ptrace.h> #include <linux/rbtree.h> #include <linux/sched/signal.h> #include <linux/sched/mm.h>@@ -60,6 +61,7 @@ MODULE_PARM_DESC(disable_hugepages, struct vfio_iommu { struct list_head domain_list; + struct list_head mm_list; struct vfio_domain *external_domain; /* domain for external user */ struct mutex lock; struct rb_root dma_list;@@ -90,6 +92,15 @@ struct vfio_dma { struct vfio_group { struct iommu_group *iommu_group; struct list_head next; + bool sva_enabled; +}; + +struct vfio_mm { +#define VFIO_PASID_INVALID (-1) + spinlock_t lock; + int pasid; + struct mm_struct *mm; + struct list_head next; }; /*@@ -1117,6 +1128,157 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, return 0; } +static int vfio_iommu_mm_exit(struct device *dev, int pasid, void *data) +{ + struct vfio_mm *vfio_mm = data; + + /* + * The mm_exit callback cannot block, so we can't take the iommu mutex + * and remove this vfio_mm from the list. 
Hopefully the SVA code will + * relax its locking requirement in the future. + * + * We mostly care about attach_group, which will attempt to replay all + * binds in this container. Ensure that it doesn't touch this defunct mm + * struct, by clearing the pointer. The structure will be freed when the + * group is removed from the container. + */ + spin_lock(&vfio_mm->lock); + vfio_mm->mm = NULL; + spin_unlock(&vfio_mm->lock); + + return 0; +} + +static int vfio_iommu_sva_init(struct device *dev, void *data) +{
The data parameter is not used in this function.
+
+ int ret;
+
+ ret = iommu_sva_device_init(dev, IOMMU_SVA_FEAT_PASID |
+ IOMMU_SVA_FEAT_IOPF, 0);
+ if (ret)
+ return ret;
+
+ return iommu_register_mm_exit_handler(dev, vfio_iommu_mm_exit);
+}
+
+static int vfio_iommu_sva_shutdown(struct device *dev, void *data)
+{
+ iommu_sva_device_shutdown(dev);
+ iommu_unregister_mm_exit_handler(dev);
+
+ return 0;
+}
+
+static int vfio_iommu_bind_group(struct vfio_iommu *iommu,
+ struct vfio_group *group,
+ struct vfio_mm *vfio_mm)
+{
+ int ret;
+ int pasid;
+
+ if (!group->sva_enabled) {
+ ret = iommu_group_for_each_dev(group->iommu_group, NULL,
+ vfio_iommu_sva_init);
+ if (ret)
+ return ret;
+
+ group->sva_enabled = true;
+ }
+
+ ret = iommu_sva_bind_group(group->iommu_group, vfio_mm->mm, &pasid,
+ IOMMU_SVA_FEAT_PASID | IOMMU_SVA_FEAT_IOPF,
+ vfio_mm);
+ if (ret)
+ return ret;
Don't you need to clean up the work done by vfio_iommu_sva_init() here?
quoted hunk
+ + if (WARN_ON(vfio_mm->pasid != VFIO_PASID_INVALID && pasid != + vfio_mm->pasid)) + return -EFAULT; + + vfio_mm->pasid = pasid; + + return 0; +} + +static void vfio_iommu_unbind_group(struct vfio_group *group, + struct vfio_mm *vfio_mm) +{ + iommu_sva_unbind_group(group->iommu_group, vfio_mm->pasid); +} + +static void vfio_iommu_unbind(struct vfio_iommu *iommu, + struct vfio_mm *vfio_mm) +{ + struct vfio_group *group; + struct vfio_domain *domain; + + list_for_each_entry(domain, &iommu->domain_list, next) + list_for_each_entry(group, &domain->group_list, next) + vfio_iommu_unbind_group(group, vfio_mm); +} + +static bool vfio_mm_get(struct vfio_mm *vfio_mm) +{ + bool ret; + + spin_lock(&vfio_mm->lock); + ret = vfio_mm->mm && mmget_not_zero(vfio_mm->mm); + spin_unlock(&vfio_mm->lock); + + return ret; +} + +static void vfio_mm_put(struct vfio_mm *vfio_mm) +{ + mmput(vfio_mm->mm); +} + +static int vfio_iommu_replay_bind(struct vfio_iommu *iommu, struct vfio_group *group) +{ + int ret = 0; + struct vfio_mm *vfio_mm; + + list_for_each_entry(vfio_mm, &iommu->mm_list, next) { + /* + * Ensure mm doesn't exit while we're binding it to the new + * group. + */ + if (!vfio_mm_get(vfio_mm)) + continue; + ret = vfio_iommu_bind_group(iommu, group, vfio_mm); + vfio_mm_put(vfio_mm); + + if (ret) + goto out_unbind; + } + + return 0; + +out_unbind: + list_for_each_entry_continue_reverse(vfio_mm, &iommu->mm_list, next) { + if (!vfio_mm_get(vfio_mm)) + continue; + iommu_sva_unbind_group(group->iommu_group, vfio_mm->pasid); + vfio_mm_put(vfio_mm); + } + + return ret; +} + +static void vfio_iommu_free_all_mm(struct vfio_iommu *iommu) +{ + struct vfio_mm *vfio_mm, *tmp; + + /* + * No need for unbind() here. Since all groups are detached from this + * iommu, bonds have been removed. 
+ */ + list_for_each_entry_safe(vfio_mm, tmp, &iommu->mm_list, next) + kfree(vfio_mm); + INIT_LIST_HEAD(&iommu->mm_list); +} + /* * We change our unmap behavior slightly depending on whether the IOMMU * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage@@ -1301,6 +1463,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, d->prot == domain->prot) { iommu_detach_group(domain->domain, iommu_group); if (!iommu_attach_group(d->domain, iommu_group)) { + if (vfio_iommu_replay_bind(iommu, group)) { + iommu_detach_group(d->domain, iommu_group); + ret = iommu_attach_group(domain->domain, + iommu_group); + if (ret) + goto out_domain; + continue; + } + list_add(&group->next, &d->group_list); iommu_domain_free(domain->domain); kfree(domain);@@ -1321,6 +1492,10 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_detach; + ret = vfio_iommu_replay_bind(iommu, group); + if (ret) + goto out_detach; + if (resv_msi) { ret = iommu_get_msi_cookie(domain->domain, resv_msi_base); if (ret)@@ -1426,6 +1601,11 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, continue; iommu_detach_group(domain->domain, iommu_group); + if (group->sva_enabled) { + iommu_group_for_each_dev(iommu_group, NULL, + vfio_iommu_sva_shutdown); + group->sva_enabled = false; + } list_del(&group->next); kfree(group); /*@@ -1441,6 +1621,7 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, vfio_iommu_unmap_unpin_all(iommu); else vfio_iommu_unmap_unpin_reaccount(iommu); + vfio_iommu_free_all_mm(iommu); } iommu_domain_free(domain->domain); list_del(&domain->next);@@ -1475,6 +1656,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) } INIT_LIST_HEAD(&iommu->domain_list); + INIT_LIST_HEAD(&iommu->mm_list); iommu->dma_list = RB_ROOT; mutex_init(&iommu->lock); BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);@@ -1509,6 +1691,7 @@ static void vfio_iommu_type1_release(void *iommu_data) kfree(iommu->external_domain); } + 
vfio_iommu_free_all_mm(iommu); vfio_iommu_unmap_unpin_all(iommu); list_for_each_entry_safe(domain, domain_tmp,@@ -1537,6 +1720,184 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu) return ret; } +static struct mm_struct *vfio_iommu_get_mm_by_vpid(pid_t vpid) +{ + struct mm_struct *mm; + struct task_struct *task; + + rcu_read_lock(); + task = find_task_by_vpid(vpid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) + return ERR_PTR(-ESRCH); + + /* Ensure that current has RW access on the mm */ + mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); + put_task_struct(task); + + if (!mm) + return ERR_PTR(-ESRCH); + + return mm; +} + +static long vfio_iommu_type1_bind_process(struct vfio_iommu *iommu, + void __user *arg, + struct vfio_iommu_type1_bind *bind) +{ + struct vfio_iommu_type1_bind_process params; + struct vfio_domain *domain; + struct vfio_group *group; + struct vfio_mm *vfio_mm; + struct mm_struct *mm; + unsigned long minsz; + int ret = 0; + + minsz = sizeof(*bind) + sizeof(params); + if (bind->argsz < minsz) + return -EINVAL; + + arg += sizeof(*bind); + if (copy_from_user(¶ms, arg, sizeof(params))) + return -EFAULT; + + if (params.flags & ~VFIO_IOMMU_BIND_PID) + return -EINVAL; + + if (params.flags & VFIO_IOMMU_BIND_PID) { + mm = vfio_iommu_get_mm_by_vpid(params.pid); + if (IS_ERR(mm)) + return PTR_ERR(mm); + } else { + mm = get_task_mm(current); + if (!mm) + return -EINVAL; + }
I think you can merge the mm failure handling of both branches.
+
+ mutex_lock(&iommu->lock);
+ if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
+ ret = -EINVAL;
+ goto out_put_mm;
+ }
+
+ list_for_each_entry(vfio_mm, &iommu->mm_list, next) {
+ if (vfio_mm->mm != mm)
+ continue;
+
+ params.pasid = vfio_mm->pasid;
+
+ ret = copy_to_user(arg, ¶ms, sizeof(params)) ? -EFAULT : 0;
+ goto out_put_mm;
+ }
+
+ vfio_mm = kzalloc(sizeof(*vfio_mm), GFP_KERNEL);
+ if (!vfio_mm) {
+ ret = -ENOMEM;
+ goto out_put_mm;
+ }
+
+ vfio_mm->mm = mm;
+ vfio_mm->pasid = VFIO_PASID_INVALID;
+ spin_lock_init(&vfio_mm->lock);
+
+ list_for_each_entry(domain, &iommu->domain_list, next) {
+ list_for_each_entry(group, &domain->group_list, next) {
+ ret = vfio_iommu_bind_group(iommu, group, vfio_mm);
+ if (ret)
+ break;
+ }
+ if (ret)
+ break;
+ }
+
+ if (ret) {
+ /* Undo all binds that already succeeded */
+ list_for_each_entry_continue_reverse(group, &domain->group_list,
+ next)
+ vfio_iommu_unbind_group(group, vfio_mm);
+ list_for_each_entry_continue_reverse(domain, &iommu->domain_list,
+ next)
+ list_for_each_entry(group, &domain->group_list, next)
+ vfio_iommu_unbind_group(group, vfio_mm);
+ kfree(vfio_mm);
+ } else {
+ list_add(&vfio_mm->next, &iommu->mm_list);
+
+ params.pasid = vfio_mm->pasid;
+ ret = copy_to_user(arg, ¶ms, sizeof(params)) ? -EFAULT : 0;
+ if (ret) {
+ vfio_iommu_unbind(iommu, vfio_mm);
+ kfree(vfio_mm);
+ }
+ }
+
+out_put_mm:
+ mutex_unlock(&iommu->lock);
+ mmput(mm);
+
+ return ret;
+}
+
+static long vfio_iommu_type1_unbind_process(struct vfio_iommu *iommu,
+ void __user *arg,
+ struct vfio_iommu_type1_bind *bind)
+{
+ int ret = -EINVAL;
+ unsigned long minsz;
+ struct mm_struct *mm;
+ struct vfio_mm *vfio_mm;
+ struct vfio_iommu_type1_bind_process params;
+
+ minsz = sizeof(*bind) + sizeof(params);
+ if (bind->argsz < minsz)
+ return -EINVAL;
+
+ arg += sizeof(*bind);
+ if (copy_from_user(¶ms, arg, sizeof(params)))
+ return -EFAULT;
+
+ if (params.flags & ~VFIO_IOMMU_BIND_PID)
+ return -EINVAL;
+
+ /*
+ * We can't simply unbind a foreign process by PASID, because the
+ * process might have died and the PASID might have been reallocated to
+ * another process. Instead we need to fetch that process mm by PID
+ * again to make sure we remove the right vfio_mm. In addition, holding
+ * the mm guarantees that mm_users isn't dropped while we unbind and the
+ * exit_mm handler doesn't fire. While not strictly necessary, not
+ * having to care about that race simplifies everyone's life.
+ */
+ if (params.flags & VFIO_IOMMU_BIND_PID) {
+ mm = vfio_iommu_get_mm_by_vpid(params.pid);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
+ } else {
+ mm = get_task_mm(current);
+ if (!mm)
+ return -EINVAL;
+ }
+
I think you can merge the mm failure handling of both branches.
+ ret = -ESRCH;
+ mutex_lock(&iommu->lock);
+ list_for_each_entry(vfio_mm, &iommu->mm_list, next) {
+ if (vfio_mm->mm != mm)
+ continue;
+
These loops look weird: 1. for loops + break, 2. for loops + goto. How about closing the for loop here, and then returning here if vfio_mm is not found?
+ vfio_iommu_unbind(iommu, vfio_mm); + list_del(&vfio_mm->next); + kfree(vfio_mm); + ret = 0; + break; + } + mutex_unlock(&iommu->lock); + mmput(mm); + + return ret; +} +
-- Sinan Kaya Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc. Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.