Re: [PATCH kernel v4 3/4] vfio/spapr: Reference mm in tce_container
From: David Gibson <hidden>
Date: 2016-11-09 00:46:12
On Tue, Nov 08, 2016 at 03:25:19PM -0700, Alex Williamson wrote:
On Mon, 24 Oct 2016 17:53:09 +1100 Alexey Kardashevskiy [off-list ref] wrote:quoted
In some situations the userspace memory context may live longer than the userspace process itself so if we need to do proper memory context cleanup, we better have tce_container take a reference to mm_struct and use it later when the process is gone (@current or @current->mm is NULL). This references mm and stores the pointer in the container; this is done when a container is just created so checking for !current->mm in other places becomes pointless. This replaces current->mm with container->mm everywhere except debug prints. This adds a check that current->mm is the same as the one stored in the container to prevent userspace from making changes to a memory context of other processes; in order to add this check, VFIO_CHECK_EXTENSION is moved out of the switch(cmd) as it is quite special anyway - it is the only ioctl() called when neither container nor container->mm is initialized. Signed-off-by: Alexey Kardashevskiy <redacted> --- Changes: v4: * added check for container->mm!=current->mm in tce_iommu_ioctl() for all ioctls and removed other redundand checks --- drivers/vfio/vfio_iommu_spapr_tce.c | 131 ++++++++++++++++++------------------ 1 file changed, 66 insertions(+), 65 deletions(-)diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index d0c38b2..81ab93f 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c@@ -31,49 +31,46 @@ static void tce_iommu_detach_group(void *iommu_data, struct iommu_group *iommu_group); -static long try_increment_locked_vm(long npages) +static long try_increment_locked_vm(struct mm_struct *mm, long npages) { long ret = 0, locked, lock_limit; - if (!current || !current->mm) - return -ESRCH; /* process exited */ - if (!npages) return 0; - down_write(¤t->mm->mmap_sem); - locked = current->mm->locked_vm + npages; + down_write(&mm->mmap_sem); + locked = mm->locked_vm + npages; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) ret = -ENOMEM; else - current->mm->locked_vm += npages; + mm->locked_vm += npages; pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid, npages << PAGE_SHIFT, - current->mm->locked_vm << PAGE_SHIFT, + mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK), ret ? " - exceeded" : ""); - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); return ret; } -static void decrement_locked_vm(long npages) +static void decrement_locked_vm(struct mm_struct *mm, long npages) { - if (!current || !current->mm || !npages) - return; /* process exited */ + if (!npages) + return; - down_write(¤t->mm->mmap_sem); - if (WARN_ON_ONCE(npages > current->mm->locked_vm)) - npages = current->mm->locked_vm; - current->mm->locked_vm -= npages; + down_write(&mm->mmap_sem); + if (WARN_ON_ONCE(npages > mm->locked_vm)) + npages = mm->locked_vm; + mm->locked_vm -= npages; pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid, npages << PAGE_SHIFT, - current->mm->locked_vm << PAGE_SHIFT, + mm->locked_vm << PAGE_SHIFT, rlimit(RLIMIT_MEMLOCK)); - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); } /*@@ -98,6 +95,7 @@ struct tce_container { bool enabled; bool v2; unsigned long locked_pages; + struct mm_struct *mm; struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES]; struct list_head group_list; };@@ -107,17 +105,14 @@ static long tce_iommu_unregister_pages(struct tce_container *container, { struct mm_iommu_table_group_mem_t *mem; - if (!current || !current->mm) - return -ESRCH; /* process exited */ - if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK)) return -EINVAL; - mem = mm_iommu_find(current->mm, vaddr, size >> PAGE_SHIFT); + mem = mm_iommu_find(container->mm, vaddr, size >> PAGE_SHIFT); if (!mem) return -ENOENT; - return mm_iommu_put(current->mm, mem); + return mm_iommu_put(container->mm, mem); } static long tce_iommu_register_pages(struct tce_container *container,@@ -127,14 +122,11 @@ static long tce_iommu_register_pages(struct tce_container *container, struct mm_iommu_table_group_mem_t *mem = NULL; unsigned long entries = size >> PAGE_SHIFT; - if (!current || !current->mm) - return -ESRCH; /* process exited */ - if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) || ((vaddr + size) < vaddr)) return -EINVAL; - ret = mm_iommu_get(current->mm, vaddr, entries, &mem); + ret = mm_iommu_get(container->mm, vaddr, entries, &mem); if (ret) return ret;@@ -143,7 +135,8 @@ static long tce_iommu_register_pages(struct tce_container *container, return 0; } -static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl) +static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl, + struct mm_struct *mm) { unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * tbl->it_size, PAGE_SIZE);@@ -152,13 +145,13 @@ static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl) BUG_ON(tbl->it_userspace); - ret = try_increment_locked_vm(cb >> PAGE_SHIFT); + ret = try_increment_locked_vm(mm, cb >> PAGE_SHIFT); if (ret) return ret; uas = vzalloc(cb); if (!uas) { - decrement_locked_vm(cb >> PAGE_SHIFT); + decrement_locked_vm(mm, cb >> PAGE_SHIFT); return -ENOMEM; } tbl->it_userspace = uas;@@ -166,7 +159,8 @@ static long tce_iommu_userspace_view_alloc(struct iommu_table *tbl) return 0; } -static void tce_iommu_userspace_view_free(struct iommu_table *tbl) +static void tce_iommu_userspace_view_free(struct iommu_table *tbl, + struct mm_struct *mm) { unsigned long cb = _ALIGN_UP(sizeof(tbl->it_userspace[0]) * tbl->it_size, PAGE_SIZE);@@ -176,7 +170,7 @@ static void tce_iommu_userspace_view_free(struct iommu_table *tbl) vfree(tbl->it_userspace); tbl->it_userspace = NULL; - decrement_locked_vm(cb >> PAGE_SHIFT); + decrement_locked_vm(mm, cb >> PAGE_SHIFT); } static bool tce_page_is_contained(struct page *page, unsigned page_shift)@@ -236,9 +230,6 @@ static int tce_iommu_enable(struct tce_container *container) struct iommu_table_group *table_group; struct tce_iommu_group *tcegrp; - if (!current->mm) - return -ESRCH; /* process exited */ - if (container->enabled) return -EBUSY;@@ -284,7 +275,7 @@ static int tce_iommu_enable(struct tce_container *container) return -EPERM; locked = table_group->tce32_size >> PAGE_SHIFT; - ret = try_increment_locked_vm(locked); + ret = try_increment_locked_vm(container->mm, locked); if (ret) return ret;@@ -302,10 +293,7 @@ static void tce_iommu_disable(struct tce_container *container) container->enabled = false; - if (!current->mm) - return; - - decrement_locked_vm(container->locked_pages); + decrement_locked_vm(container->mm, container->locked_pages); } static void *tce_iommu_open(unsigned long arg)@@ -326,13 +314,18 @@ static void *tce_iommu_open(unsigned long arg) container->v2 = arg == VFIO_SPAPR_TCE_v2_IOMMU; + /* current->mm cannot be NULL in this context */ + container->mm = current->mm; + atomic_inc(&container->mm->mm_count);Are you sure you wouldn't rather do this on the actual preregistration? The advise I gave to Kirti for mdev was that it's currently possible to have a configuration where a privileged user opens a container, adds a group, sets the iommu, and then passes the file descriptors to another user. We can then set an mm once mappings, or preregistration occurs, and from there require that the unmapping mm matches the mapping mm, or that both of those match the preregistration mm. I'm not sure I see any reason to impose that current->mm that performs this operation is the only one that's allowed to perform those later tasks.quoted
+ return container; } static int tce_iommu_clear(struct tce_container *container, struct iommu_table *tbl, unsigned long entry, unsigned long pages); -static void tce_iommu_free_table(struct iommu_table *tbl); +static void tce_iommu_free_table(struct tce_container *container, + struct iommu_table *tbl); static void tce_iommu_release(void *iommu_data) {@@ -357,10 +350,11 @@ static void tce_iommu_release(void *iommu_data) continue; tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size); - tce_iommu_free_table(tbl); + tce_iommu_free_table(container, tbl); } tce_iommu_disable(container); + mmdrop(container->mm);I imagine you'd still do this here, just: if (container->mm) mmdrop(container->mm); Of course you'd need to check how many places you'd need similar tests, maybe some of the current->mm tests would be converted to container->mm rather than dropped. Thanks,
Right, as you've implied here, the advantage of locking the mm on open is simplicity - open always happens, and it always happens before anything else, so there's no conditionals about whether we have or haven't set the mm yet. That said, you have just described a plausible use case where it's useful to have one mm open the container, then another one use it, so it might be worth adding that ability. -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
Attachments
- signature.asc [application/pgp-signature] 819 bytes