--- v9
+++ v10
@@ -1,421 +1,360 @@
-This adds create/remove window ioctls to create and remove DMA windows.
-sPAPR defines a Dynamic DMA windows capability which allows
-para-virtualized guests to create additional DMA windows on a PCI bus.
-The existing linux kernels use this new window to map the entire guest
-memory and switch to the direct DMA operations saving time on map/unmap
-requests which would normally happen in a big amounts.
-
-This adds 2 ioctl handlers - VFIO_IOMMU_SPAPR_TCE_CREATE and
-VFIO_IOMMU_SPAPR_TCE_REMOVE - to create and remove windows.
-Up to 2 windows are supported now by the hardware and by this driver.
-
-This changes VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return additional
-information such as a number of supported windows and maximum number
-levels of TCE tables.
-
-DDW is added as a capability, not as a SPAPR TCE IOMMU v2 unique feature
-as we still want to support v2 on platforms which cannot do DDW for
-the sake of TCE acceleration in KVM (coming soon).
+We are adding support for DMA memory pre-registration to be used in
+conjunction with VFIO. The idea is that the userspace process which is
+going to run a guest may want to pre-register a userspace memory region
+so that it is all pinned once and never goes away. Once this is done,
+a hypervisor does not have to pin/unpin pages on every DMA map/unmap
+request. This also avoids pinning the same memory multiple times.
+
+Another use is in-kernel real mode (mmu off) acceleration of DMA
+requests, where real time translation of guest physical to host
+physical addresses is non-trivial and may fail as Linux PTEs may be
+temporarily invalid. Also, with host physical addresses cached
+(as opposed to just pinning at the start and then walking the page table
+again on every H_PUT_TCE), we can be sure that the addresses which we put
+into the TCE table are the ones we already pinned.
+
+This adds a list of memory regions to mm_context_t. Each region consists
+of a header and a list of physical addresses. This adds an API to:
+1. register/unregister memory regions;
+2. do final cleanup (which puts all pre-registered pages);
+3. do userspace to physical address translation;
+4. manage a mapped pages counter; when it is zero, it is safe to
+unregister the region.
+
+Multiple registrations of the same region are allowed; a kref is used to
+track the number of registrations. atomic_inc_not_zero() and
+atomic_dec_if_positive() are used to decide whether to allow or deny
+the mapped counter increments.
+
+Each registered region keeps a counter of mapped TCEs, plus one extra
+count held by the registration itself.
+
+Host physical addresses are stored in a vmalloc'ed array. In order to
+access these in real mode (mmu off), there is a real_vmalloc_addr()
+helper. The in-kernel acceleration patchset will move it from KVM to MMU code.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
-[aw: for the vfio related changes]
-Acked-by: Alex Williamson <alex.williamson@redhat.com>
---
Changes:
-v7:
-* s/VFIO_IOMMU_INFO_DDW/VFIO_IOMMU_SPAPR_INFO_DDW/
-* fixed typos in and updated vfio.txt
-* fixed VFIO_IOMMU_SPAPR_TCE_GET_INFO handler
-* moved ddw properties to vfio_iommu_spapr_tce_ddw_info
-
-v6:
-* added explicit VFIO_IOMMU_INFO_DDW flag to vfio_iommu_spapr_tce_info,
-it used to be page mask flags from platform code
-* added explicit pgsizes field
-* added cleanup if tce_iommu_create_window() failed in a middle
-* added checks for callbacks in tce_iommu_create_window and remove those
-from tce_iommu_remove_window when it is too late to test anyway
-* spapr_tce_find_free_table returns sensible error code now
-* updated description of VFIO_IOMMU_SPAPR_TCE_CREATE/
-VFIO_IOMMU_SPAPR_TCE_REMOVE
-
-v4:
-* moved code to tce_iommu_create_window()/tce_iommu_remove_window()
-helpers
-* added docs
+v10:
+* split mm_iommu_mapped_update into mm_iommu_mapped_dec + mm_iommu_mapped_inc
+* mapped counter now keeps one reference for itself and mm_iommu_mapped_inc()
+can tell if the region is being released
+* updated commit log
+
+v8:
+* s/mm_iommu_table_group_mem_t/struct mm_iommu_table_group_mem_t/
+* fixed error fallback loop (s/[i]/[j]/)
---
- Documentation/vfio.txt | 19 ++++
- arch/powerpc/include/asm/iommu.h | 2 +-
- drivers/vfio/vfio_iommu_spapr_tce.c | 197 +++++++++++++++++++++++++++++++++++-
- include/uapi/linux/vfio.h | 61 ++++++++++-
- 4 files changed, 274 insertions(+), 5 deletions(-)
-
-diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
-index 7dcf2b5..8b1ec51 100644
---- a/Documentation/vfio.txt
-+++ b/Documentation/vfio.txt
-@@ -452,6 +452,25 @@ address is from pre-registered range.
-
- This separation helps in optimizing DMA for guests.
-
-+6) sPAPR specification allows guests to have an additional DMA window(s) on
-+a PCI bus with a variable page size. Two ioctls have been added to support
-+this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE.
-+The platform has to support the functionality or error will be returned to
-+the userspace. The existing hardware supports up to 2 DMA windows, one is
-+2GB long, uses 4K pages and called "default 32bit window"; the other can
-+be as big as entire RAM, use different page size, it is optional - guests
-+create those in run-time if the guest driver supports 64bit DMA.
-+
-+VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and
-+a number of TCE table levels (if a TCE table is going to be big enough and
-+the kernel may not be able to allocate enough of physically contiguous memory).
-+It creates a new window in the available slot and returns the bus address where
-+the new window starts. Due to hardware limitation, the user space cannot choose
-+the location of DMA windows.
-+
-+VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window
-+and removes it.
-+
- -------------------------------------------------------------------------------
-
- [1] VFIO was originally an acronym for "Virtual Function I/O" in its
-diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
-index 9844c106..282767f 100644
---- a/arch/powerpc/include/asm/iommu.h
-+++ b/arch/powerpc/include/asm/iommu.h
-@@ -151,7 +151,7 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
- extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
- int nid);
-
--#define IOMMU_TABLE_GROUP_MAX_TABLES 1
-+#define IOMMU_TABLE_GROUP_MAX_TABLES 2
-
- struct iommu_table_group;
-
-diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
-index 970e3a2..f04c6f5 100644
---- a/drivers/vfio/vfio_iommu_spapr_tce.c
-+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
-@@ -266,6 +266,20 @@ static void tce_iommu_disable(struct tce_container *container)
- decrement_locked_vm(container->locked_pages);
+ arch/powerpc/include/asm/mmu-hash64.h | 3 +
+ arch/powerpc/include/asm/mmu_context.h | 17 +++
+ arch/powerpc/mm/Makefile | 1 +
+ arch/powerpc/mm/mmu_context_hash64.c | 6 +
+ arch/powerpc/mm/mmu_context_hash64_iommu.c | 221 +++++++++++++++++++++++++++++
+ 5 files changed, 248 insertions(+)
+ create mode 100644 arch/powerpc/mm/mmu_context_hash64_iommu.c
+
+diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
+index 1da6a81..a82f534 100644
+--- a/arch/powerpc/include/asm/mmu-hash64.h
++++ b/arch/powerpc/include/asm/mmu-hash64.h
+@@ -536,6 +536,9 @@ typedef struct {
+ /* for 4K PTE fragment support */
+ void *pte_frag;
+ #endif
++#ifdef CONFIG_SPAPR_TCE_IOMMU
++ struct list_head iommu_group_mem_list;
++#endif
+ } mm_context_t;
+
+
+diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
+index 73382eb..138bb53 100644
+--- a/arch/powerpc/include/asm/mmu_context.h
++++ b/arch/powerpc/include/asm/mmu_context.h
+@@ -16,6 +16,23 @@
+ */
+ extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+ extern void destroy_context(struct mm_struct *mm);
++#ifdef CONFIG_SPAPR_TCE_IOMMU
++struct mm_iommu_table_group_mem_t;
++
++extern bool mm_iommu_preregistered(void);
++extern long mm_iommu_alloc(unsigned long ua, unsigned long entries,
++ struct mm_iommu_table_group_mem_t **pmem);
++extern struct mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
++ unsigned long entries);
++extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
++extern void mm_iommu_cleanup(mm_context_t *ctx);
++extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
++ unsigned long size);
++extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
++ unsigned long ua, unsigned long *hpa);
++extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
++extern long mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
++#endif
+
+ extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
+ extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
+diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
+index 9c8770b..e216704 100644
+--- a/arch/powerpc/mm/Makefile
++++ b/arch/powerpc/mm/Makefile
+@@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT) += subpage-prot.o
+ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
+ obj-$(CONFIG_HIGHMEM) += highmem.o
+ obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
++obj-$(CONFIG_SPAPR_TCE_IOMMU) += mmu_context_hash64_iommu.o
+diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
+index 178876ae..eb3080c 100644
+--- a/arch/powerpc/mm/mmu_context_hash64.c
++++ b/arch/powerpc/mm/mmu_context_hash64.c
+@@ -89,6 +89,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+ #ifdef CONFIG_PPC_64K_PAGES
+ mm->context.pte_frag = NULL;
+ #endif
++#ifdef CONFIG_SPAPR_TCE_IOMMU
++ INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
++#endif
+ return 0;
}
-+static int spapr_tce_find_free_table(struct tce_container *container)
-+{
-+ int i;
-+
-+ for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-+ struct iommu_table *tbl = &container->tables[i];
-+
-+ if (!tbl->it_size)
-+ return i;
-+ }
-+
-+ return -ENOSPC;
-+}
-+
- static long tce_iommu_create_table(struct iommu_table_group *table_group,
- int num,
- __u32 page_shift,
-@@ -559,11 +573,114 @@ static long tce_iommu_build_v2(struct tce_container *container,
- return ret;
- }
-
-+static long tce_iommu_create_window(struct tce_container *container,
-+ __u32 page_shift, __u64 window_size, __u32 levels,
-+ __u64 *start_addr)
-+{
-+ struct tce_iommu_group *tcegrp;
-+ struct iommu_table_group *table_group;
-+ struct iommu_table *tbl;
-+ long ret, num;
-+
-+ num = spapr_tce_find_free_table(container);
-+ if (num < 0)
-+ return num;
-+
-+ tbl = &container->tables[num];
-+
-+ /* Get the first group for ops::create_table */
-+ tcegrp = list_first_entry(&container->group_list,
-+ struct tce_iommu_group, next);
-+ table_group = iommu_group_get_iommudata(tcegrp->grp);
-+ if (!table_group)
+@@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)
+
+ void destroy_context(struct mm_struct *mm)
+ {
++#ifdef CONFIG_SPAPR_TCE_IOMMU
++ mm_iommu_cleanup(&mm->context);
++#endif
+
+ #ifdef CONFIG_PPC_ICSWX
+ drop_cop(mm->context.acop, mm);
+diff --git a/arch/powerpc/mm/mmu_context_hash64_iommu.c b/arch/powerpc/mm/mmu_context_hash64_iommu.c
+new file mode 100644
+index 0000000..002c6c9
+--- /dev/null
++++ b/arch/powerpc/mm/mmu_context_hash64_iommu.c
+@@ -0,0 +1,221 @@
++/*
++ * IOMMU helpers in MMU context.
++ *
++ * Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License
++ * as published by the Free Software Foundation; either version
++ * 2 of the License, or (at your option) any later version.
++ *
++ */
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/rculist.h>
++#include <linux/vmalloc.h>
++#include <linux/kref.h>
++#include <asm/mmu_context.h>
++
++struct mm_iommu_table_group_mem_t {
++ struct list_head next; /* link in mm->context.iommu_group_mem_list */
++ struct rcu_head rcu; /* used by call_rcu() to run mm_iommu_free() */
++ struct kref kref; /* one reference per VFIO container */
++ atomic_t mapped; /* mapped pages + 1 (set to 1 at registration) */
++ u64 ua; /* userspace address of the first page */
++ u64 entries; /* number of entries in hpas[] */
++ u64 *hpas; /* vmalloc'ed; one host physical address per page */
++};
++
++bool mm_iommu_preregistered(void)
++{
++ if (!current || !current->mm) /* no mm: report "nothing registered" */
++ return false;
++
++ return !list_empty(&current->mm->context.iommu_group_mem_list);
++}
++EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
++
++long mm_iommu_alloc(unsigned long ua, unsigned long entries,
++ struct mm_iommu_table_group_mem_t **pmem)
++{
++ struct mm_iommu_table_group_mem_t *mem;
++ long i, j;
++ struct page *page = NULL;
++
++ list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
++ next) {
++ if ((mem->ua == ua) && (mem->entries == entries))
++ return -EBUSY; /* exactly this region is already on the list */
++
++ /* Partial overlap with an existing region? */
++ if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
++ (ua < (mem->ua + (mem->entries << PAGE_SHIFT))))
++ return -EINVAL;
++ }
++
++ mem = kzalloc(sizeof(*mem), GFP_KERNEL);
++ if (!mem)
++ return -ENOMEM;
++
++ mem->hpas = vzalloc(entries * sizeof(mem->hpas[0]));
++ if (!mem->hpas) {
++ kfree(mem);
++ return -ENOMEM;
++ }
++
++ for (i = 0; i < entries; ++i) { /* pin one page per entry */
++ if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
++ 1/* pages */, 1/* iswrite */, &page)) {
++ for (j = 0; j < i; ++j) /* put the pages pinned so far */
++ put_page(pfn_to_page(
++ mem->hpas[j] >> PAGE_SHIFT));
++ vfree(mem->hpas);
++ kfree(mem);
++ return -EFAULT;
++ }
++
++ mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
++ }
++
++ kref_init(&mem->kref);
++ atomic_set(&mem->mapped, 1); /* the registration's own count */
++ mem->ua = ua;
++ mem->entries = entries;
++ *pmem = mem;
++
++ list_add_rcu(&mem->next, &current->mm->context.iommu_group_mem_list);
++
++ return 0;
++}
++EXPORT_SYMBOL_GPL(mm_iommu_alloc);
++
++static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
++{
++ long i;
++ struct page *page = NULL;
++
++ for (i = 0; i < mem->entries; ++i) {
++ if (!mem->hpas[i]) /* empty slot, nothing to put */
++ continue;
++
++ page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
++ if (!page)
++ continue;
++
++ put_page(page);
++ mem->hpas[i] = 0; /* clear so the slot is skipped next time */
++ }
++}
++
++static void mm_iommu_free(struct rcu_head *head)
++{
++ struct mm_iommu_table_group_mem_t *mem = container_of(head,
++ struct mm_iommu_table_group_mem_t, rcu); /* head is mem->rcu */
++
++ mm_iommu_unpin(mem); /* put every page still recorded in hpas[] */
++ vfree(mem->hpas);
++ kfree(mem);
++}
++
++static void mm_iommu_release(struct kref *kref)
++{
++ struct mm_iommu_table_group_mem_t *mem = container_of(kref,
++ struct mm_iommu_table_group_mem_t, kref); /* kref is mem->kref */
++
++ list_del_rcu(&mem->next); /* unlink from the per-mm region list */
++ call_rcu(&mem->rcu, mm_iommu_free); /* unpin and free via RCU callback */
++}
++
++struct mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
++ unsigned long entries)
++{
++ struct mm_iommu_table_group_mem_t *mem;
++
++ list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
++ next) {
++ if ((mem->ua == ua) && (mem->entries == entries)) { /* exact match only */
++ kref_get(&mem->kref);
++ return mem;
++ }
++ }
++
++ return NULL; /* no region with this ua/entries pair */
++}
++EXPORT_SYMBOL_GPL(mm_iommu_get);
++
++long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem)
++{
++ if (1 != atomic_dec_if_positive(&mem->mapped)) {
++ /* Taken as "there are mappings", exit. NOTE(review): dec_if_positive returns old - 1; confirm the "1 !=" test */
++ atomic_inc(&mem->mapped);
++ return -EBUSY;
++ }
++
++ kref_put(&mem->kref, mm_iommu_release); /* mm_iommu_release() runs on the last put */
++
++ return 0;
++}
++EXPORT_SYMBOL_GPL(mm_iommu_put);
++
++struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
++ unsigned long size)
++{
++ struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
++
++ list_for_each_entry_rcu(mem,
++ &current->mm->context.iommu_group_mem_list,
++ next) {
++ if ((mem->ua <= ua) &&
++ (ua + size <= mem->ua +
++ (mem->entries << PAGE_SHIFT))) { /* [ua, ua + size) fits inside */
++ ret = mem; /* first containing region wins; no reference is taken */
++ break;
++ }
++ }
++
++ return ret;
++}
++EXPORT_SYMBOL_GPL(mm_iommu_lookup);
++
++long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
++ unsigned long ua, unsigned long *hpa)
++{
++ const long entry = (ua - mem->ua) >> PAGE_SHIFT;
++ u64 *va = &mem->hpas[entry];
++
++ if (entry >= mem->entries)
+ return -EFAULT;
+
-+ if (!(table_group->pgsizes & (1ULL << page_shift)))
-+ return -EINVAL;
-+
-+ if (!table_group->ops->set_window || !table_group->ops->unset_window ||
-+ !table_group->ops->get_table_size ||
-+ !table_group->ops->create_table)
-+ return -EPERM;
-+
-+ /* Create TCE table */
-+ ret = tce_iommu_create_table(table_group, num,
-+ page_shift, window_size, levels, tbl);
-+ if (ret)
-+ return ret;
-+
-+ BUG_ON(!tbl->it_ops->free);
-+
-+ /*
-+ * Program the table to every group.
-+ * Groups have been tested for compatibility at the attach time.
-+ */
-+ list_for_each_entry(tcegrp, &container->group_list, next) {
-+ table_group = iommu_group_get_iommudata(tcegrp->grp);
-+
-+ ret = table_group->ops->set_window(table_group, num, tbl);
-+ if (ret)
-+ goto unset_exit;
-+ }
-+
-+ /* Return start address assigned by platform in create_table() */
-+ *start_addr = tbl->it_offset << tbl->it_page_shift;
++ *hpa = *va | (ua & ~PAGE_MASK);
+
+ return 0;
-+
-+unset_exit:
-+ list_for_each_entry(tcegrp, &container->group_list, next) {
-+ table_group = iommu_group_get_iommudata(tcegrp->grp);
-+ table_group->ops->unset_window(table_group, num);
-+ }
-+ tce_iommu_free_table(tbl);
-+
-+ return ret;
-+}
-+
-+static long tce_iommu_remove_window(struct tce_container *container,
-+ __u64 start_addr)
-+{
-+ struct iommu_table_group *table_group = NULL;
-+ struct iommu_table *tbl;
-+ struct tce_iommu_group *tcegrp;
-+ int num;
-+
-+ tbl = spapr_tce_find_table(container, start_addr);
-+ if (!tbl)
-+ return -EINVAL;
-+
-+ /* Detach groups from IOMMUs */
-+ num = tbl - container->tables;
-+ list_for_each_entry(tcegrp, &container->group_list, next) {
-+ table_group = iommu_group_get_iommudata(tcegrp->grp);
-+
-+ /*
-+ * SPAPR TCE IOMMU exposes the default DMA window to
-+ * the guest via dma32_window_start/size of
-+ * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
-+ * the userspace to remove this window, some do not so
-+ * here we check for the platform capability.
-+ */
-+ if (!table_group->ops || !table_group->ops->unset_window)
-+ return -EPERM;
-+
-+ if (container->tables[num].it_size)
-+ table_group->ops->unset_window(table_group, num);
-+ }
-+
-+ /* Free table */
-+ tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-+ tce_iommu_free_table(tbl);
-+
-+ return 0;
-+}
-+
- static long tce_iommu_ioctl(void *iommu_data,
- unsigned int cmd, unsigned long arg)
- {
- struct tce_container *container = iommu_data;
-- unsigned long minsz;
-+ unsigned long minsz, ddwsz;
- long ret;
-
- switch (cmd) {
-@@ -607,6 +724,21 @@ static long tce_iommu_ioctl(void *iommu_data,
- info.dma32_window_start = table_group->tce32_start;
- info.dma32_window_size = table_group->tce32_size;
- info.flags = 0;
-+ memset(&info.ddw, 0, sizeof(info.ddw));
-+
-+ if (table_group->max_dynamic_windows_supported &&
-+ container->v2) {
-+ info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
-+ info.ddw.pgsizes = table_group->pgsizes;
-+ info.ddw.max_dynamic_windows_supported =
-+ table_group->max_dynamic_windows_supported;
-+ info.ddw.levels = table_group->max_levels;
-+ }
-+
-+ ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
-+
-+ if (info.argsz >= ddwsz)
-+ minsz = ddwsz;
-
- if (copy_to_user((void __user *)arg, &info, minsz))
- return -EFAULT;
-@@ -797,6 +929,69 @@ static long tce_iommu_ioctl(void *iommu_data,
- return ret;
- }
-
-+ case VFIO_IOMMU_SPAPR_TCE_CREATE: {
-+ struct vfio_iommu_spapr_tce_create create;
-+
-+ if (!container->v2)
-+ break;
-+
-+ if (!tce_groups_attached(container))
-+ return -ENXIO;
-+
-+ minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
-+ start_addr);
-+
-+ if (copy_from_user(&create, (void __user *)arg, minsz))
-+ return -EFAULT;
-+
-+ if (create.argsz < minsz)
-+ return -EINVAL;
-+
-+ if (create.flags)
-+ return -EINVAL;
-+
-+ mutex_lock(&container->lock);
-+
-+ ret = tce_iommu_create_window(container, create.page_shift,
-+ create.window_size, create.levels,
-+ &create.start_addr);
-+
-+ mutex_unlock(&container->lock);
-+
-+ if (!ret && copy_to_user((void __user *)arg, &create, minsz))
-+ ret = -EFAULT;
-+
-+ return ret;
-+ }
-+ case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
-+ struct vfio_iommu_spapr_tce_remove remove;
-+
-+ if (!container->v2)
-+ break;
-+
-+ if (!tce_groups_attached(container))
-+ return -ENXIO;
-+
-+ minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
-+ start_addr);
-+
-+ if (copy_from_user(&remove, (void __user *)arg, minsz))
-+ return -EFAULT;
-+
-+ if (remove.argsz < minsz)
-+ return -EINVAL;
-+
-+ if (remove.flags)
-+ return -EINVAL;
-+
-+ mutex_lock(&container->lock);
-+
-+ ret = tce_iommu_remove_window(container, remove.start_addr);
-+
-+ mutex_unlock(&container->lock);
-+
-+ return ret;
-+ }
- }
-
- return -ENOTTY;
-diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
-index 8fdcfb9..dde0fe5 100644
---- a/include/uapi/linux/vfio.h
-+++ b/include/uapi/linux/vfio.h
-@@ -445,6 +445,23 @@ struct vfio_iommu_type1_dma_unmap {
- /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
-
- /*
-+ * The SPAPR TCE DDW info struct provides the information about
-+ * the details of Dynamic DMA window capability.
-+ *
-+ * @pgsizes contains a page size bitmask, 4K/64K/16M are supported.
-+ * @max_dynamic_windows_supported tells the maximum number of windows
-+ * which the platform can create.
-+ * @levels tells the maximum number of levels in multi-level IOMMU tables;
-+ * this allows splitting a table into smaller chunks which reduces
-+ * the amount of physically contiguous memory required for the table.
-+ */
-+struct vfio_iommu_spapr_tce_ddw_info {
-+ __u64 pgsizes; /* Bitmap of supported page sizes */
-+ __u32 max_dynamic_windows_supported;
-+ __u32 levels;
-+};
-+
-+/*
- * The SPAPR TCE info struct provides the information about the PCI bus
- * address ranges available for DMA, these values are programmed into
- * the hardware so the guest has to know that information.
-@@ -454,14 +471,17 @@ struct vfio_iommu_type1_dma_unmap {
- * addresses too so the window works as a filter rather than an offset
- * for IOVA addresses.
- *
-- * A flag will need to be added if other page sizes are supported,
-- * so as defined here, it is always 4k.
-+ * Flags supported:
-+ * - VFIO_IOMMU_SPAPR_INFO_DDW: informs the userspace that dynamic DMA windows
-+ * (DDW) support is present. @ddw is only supported when DDW is present.
- */
- struct vfio_iommu_spapr_tce_info {
- __u32 argsz;
-- __u32 flags; /* reserved for future use */
-+ __u32 flags;
-+#define VFIO_IOMMU_SPAPR_INFO_DDW (1 << 0) /* DDW supported */
- __u32 dma32_window_start; /* 32 bit window start (bytes) */
- __u32 dma32_window_size; /* 32 bit window size (bytes) */
-+ struct vfio_iommu_spapr_tce_ddw_info ddw;
- };
-
- #define VFIO_IOMMU_SPAPR_TCE_GET_INFO _IO(VFIO_TYPE, VFIO_BASE + 12)
-@@ -522,6 +542,41 @@ struct vfio_iommu_spapr_register_memory {
- */
- #define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY _IO(VFIO_TYPE, VFIO_BASE + 18)
-
-+/**
-+ * VFIO_IOMMU_SPAPR_TCE_CREATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, struct vfio_iommu_spapr_tce_create)
-+ *
-+ * Creates an additional TCE table and programs it (sets a new DMA window)
-+ * to every IOMMU group in the container. It receives page shift, window
-+ * size and number of levels in the TCE table being created.
-+ *
-+ * It allocates and returns an offset on a PCI bus of the new DMA window.
-+ */
-+struct vfio_iommu_spapr_tce_create {
-+ __u32 argsz;
-+ __u32 flags;
-+ /* in */
-+ __u32 page_shift;
-+ __u64 window_size;
-+ __u32 levels;
-+ /* out */
-+ __u64 start_addr;
-+};
-+#define VFIO_IOMMU_SPAPR_TCE_CREATE _IO(VFIO_TYPE, VFIO_BASE + 19)
-+
-+/**
-+ * VFIO_IOMMU_SPAPR_TCE_REMOVE - _IOW(VFIO_TYPE, VFIO_BASE + 20, struct vfio_iommu_spapr_tce_remove)
-+ *
-+ * Unprograms a TCE table from all groups in the container and destroys it.
-+ * It receives a PCI bus offset as a window id.
-+ */
-+struct vfio_iommu_spapr_tce_remove {
-+ __u32 argsz;
-+ __u32 flags;
-+ /* in */
-+ __u64 start_addr;
-+};
-+#define VFIO_IOMMU_SPAPR_TCE_REMOVE _IO(VFIO_TYPE, VFIO_BASE + 20)
-+
- /* ***************************************************************** */
-
- #endif /* _UAPIVFIO_H */
++}
++EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
++
++long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
++{
++ if (atomic_inc_not_zero(&mem->mapped)) /* counter was non-zero and got incremented */
++ return 0;
++
++ /* Last mm_iommu_put() has been called, no more mappings allowed */
++ return -ENXIO;
++}
++EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
++
++long mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
++{
++ return atomic_dec_if_positive(&mem->mapped); /* result is passed through to the caller */
++}
++EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
++
++void mm_iommu_cleanup(mm_context_t *ctx)
++{
++ while (!list_empty(&ctx->iommu_group_mem_list)) { /* drain the whole list */
++ struct mm_iommu_table_group_mem_t *mem;
++
++ mem = list_first_entry(&ctx->iommu_group_mem_list,
++ struct mm_iommu_table_group_mem_t, next);
++ mm_iommu_release(&mem->kref); /* direct call: bypasses kref_put() and the mapped counter */
++ }
++}
--
-2.0.0
+2.4.0.rc3.8.gfb3e7d5