Inter-revision diff: patch 32

Comparing v9 (message) to v10 (message)

--- v9
+++ v10
@@ -1,421 +1,360 @@
-This adds create/remove window ioctls to create and remove DMA windows.
-sPAPR defines a Dynamic DMA windows capability which allows
-para-virtualized guests to create additional DMA windows on a PCI bus.
-The existing linux kernels use this new window to map the entire guest
-memory and switch to direct DMA operations, saving time on map/unmap
-requests which would otherwise happen in large numbers.
-
-This adds 2 ioctl handlers - VFIO_IOMMU_SPAPR_TCE_CREATE and
-VFIO_IOMMU_SPAPR_TCE_REMOVE - to create and remove windows.
-Up to 2 windows are supported now by the hardware and by this driver.
-
-This changes the VFIO_IOMMU_SPAPR_TCE_GET_INFO handler to return additional
-information such as the number of supported windows and the maximum number
-of levels of TCE tables.
-
-DDW is added as a capability, not as a SPAPR TCE IOMMU v2 unique feature
-as we still want to support v2 on platforms which cannot do DDW for
-the sake of TCE acceleration in KVM (coming soon).
+We are adding support for DMA memory pre-registration to be used in
+conjunction with VFIO. The idea is that the userspace which is going to
+run a guest may want to pre-register a userspace memory region so that
+it all gets pinned once and never goes away. With this done,
+a hypervisor will not have to pin/unpin pages on every DMA map/unmap
+request. This is going to help with multiple pinning of the same memory.
+
+Another use of it is in-kernel real mode (mmu off) acceleration of
+DMA requests where real time translation of guest physical to host
+physical addresses is non-trivial and may fail as linux ptes may be
+temporarily invalid. Also, having cached host physical addresses
+(compared to just pinning at the start and then walking the page table
+again on every H_PUT_TCE), we can be sure that the addresses which we put
+into the TCE table are the ones we already pinned.
+
+This adds a list of memory regions to mm_context_t. Each region consists
+of a header and a list of physical addresses. This adds an API to:
+1. register/unregister memory regions;
+2. do final cleanup (which puts all pre-registered pages);
+3. do userspace to physical address translation;
+4. manage a mapped pages counter; when it is zero, it is safe to
+unregister the region.
+
+Multiple registration of the same region is allowed, kref is used to
+track the number of registrations. atomic_inc_not_zero() and
+atomic_dec_if_positive() are used to decide whether to allow or deny
+the mapped counter increments.
+
+Each registered region keeps a counter of mapped TCEs, plus one extra
+reference held for the registered area itself.
+
+Host physical addresses are stored in a vmalloc'ed array. In order to
+access these in real mode (mmu off), there is a real_vmalloc_addr()
+helper. The in-kernel acceleration patchset will move it from KVM to MMU code.
 
 Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
-[aw: for the vfio related changes]
-Acked-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 Changes:
-v7:
-* s/VFIO_IOMMU_INFO_DDW/VFIO_IOMMU_SPAPR_INFO_DDW/
-* fixed typos in and updated vfio.txt
-* fixed VFIO_IOMMU_SPAPR_TCE_GET_INFO handler
-* moved ddw properties to vfio_iommu_spapr_tce_ddw_info
-
-v6:
-* added explicit VFIO_IOMMU_INFO_DDW flag to vfio_iommu_spapr_tce_info,
-it used to be page mask flags from platform code
-* added explicit pgsizes field
-* added cleanup if tce_iommu_create_window() failed in a middle
-* added checks for callbacks in tce_iommu_create_window and remove those
-from tce_iommu_remove_window when it is too late to test anyway
-* spapr_tce_find_free_table returns sensible error code now
-* updated description of VFIO_IOMMU_SPAPR_TCE_CREATE/
-VFIO_IOMMU_SPAPR_TCE_REMOVE
-
-v4:
-* moved code to tce_iommu_create_window()/tce_iommu_remove_window()
-helpers
-* added docs
+v10:
+* split mm_iommu_mapped_update into mm_iommu_mapped_dec + mm_iommu_mapped_inc
+* the mapped counter now keeps one reference for itself and mm_iommu_mapped_inc()
+can tell if the region is being released
+* updated commit log
+
+v8:
+* s/mm_iommu_table_group_mem_t/struct mm_iommu_table_group_mem_t/
+* fixed error fallback loop (s/[i]/[j]/)
 ---
- Documentation/vfio.txt              |  19 ++++
- arch/powerpc/include/asm/iommu.h    |   2 +-
- drivers/vfio/vfio_iommu_spapr_tce.c | 197 +++++++++++++++++++++++++++++++++++-
- include/uapi/linux/vfio.h           |  61 ++++++++++-
- 4 files changed, 274 insertions(+), 5 deletions(-)
-
-diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
-index 7dcf2b5..8b1ec51 100644
---- a/Documentation/vfio.txt
-+++ b/Documentation/vfio.txt
-@@ -452,6 +452,25 @@ address is from pre-registered range.
- 
- This separation helps in optimizing DMA for guests.
- 
-+6) sPAPR specification allows guests to have an additional DMA window(s) on
-+a PCI bus with a variable page size. Two ioctls have been added to support
-+this: VFIO_IOMMU_SPAPR_TCE_CREATE and VFIO_IOMMU_SPAPR_TCE_REMOVE.
-+The platform has to support the functionality or error will be returned to
-+the userspace. The existing hardware supports up to 2 DMA windows, one is
-+2GB long, uses 4K pages and called "default 32bit window"; the other can
-+be as big as entire RAM, use different page size, it is optional - guests
-+create those in run-time if the guest driver supports 64bit DMA.
-+
-+VFIO_IOMMU_SPAPR_TCE_CREATE receives a page shift, a DMA window size and
-+a number of TCE table levels (if a TCE table is going to be big enough and
-+the kernel may not be able to allocate enough of physically contiguous memory).
-+It creates a new window in the available slot and returns the bus address where
-+the new window starts. Due to hardware limitation, the user space cannot choose
-+the location of DMA windows.
-+
-+VFIO_IOMMU_SPAPR_TCE_REMOVE receives the bus start address of the window
-+and removes it.
-+
- -------------------------------------------------------------------------------
- 
- [1] VFIO was originally an acronym for "Virtual Function I/O" in its
-diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
-index 9844c106..282767f 100644
---- a/arch/powerpc/include/asm/iommu.h
-+++ b/arch/powerpc/include/asm/iommu.h
-@@ -151,7 +151,7 @@ extern void iommu_free_table(struct iommu_table *tbl, const char *node_name);
- extern struct iommu_table *iommu_init_table(struct iommu_table * tbl,
- 					    int nid);
- 
--#define IOMMU_TABLE_GROUP_MAX_TABLES	1
-+#define IOMMU_TABLE_GROUP_MAX_TABLES	2
- 
- struct iommu_table_group;
- 
-diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
-index 970e3a2..f04c6f5 100644
---- a/drivers/vfio/vfio_iommu_spapr_tce.c
-+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
-@@ -266,6 +266,20 @@ static void tce_iommu_disable(struct tce_container *container)
- 	decrement_locked_vm(container->locked_pages);
+ arch/powerpc/include/asm/mmu-hash64.h      |   3 +
+ arch/powerpc/include/asm/mmu_context.h     |  17 +++
+ arch/powerpc/mm/Makefile                   |   1 +
+ arch/powerpc/mm/mmu_context_hash64.c       |   6 +
+ arch/powerpc/mm/mmu_context_hash64_iommu.c | 221 +++++++++++++++++++++++++++++
+ 5 files changed, 248 insertions(+)
+ create mode 100644 arch/powerpc/mm/mmu_context_hash64_iommu.c
+
+diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/mmu-hash64.h
+index 1da6a81..a82f534 100644
+--- a/arch/powerpc/include/asm/mmu-hash64.h
++++ b/arch/powerpc/include/asm/mmu-hash64.h
+@@ -536,6 +536,9 @@ typedef struct {
+ 	/* for 4K PTE fragment support */
+ 	void *pte_frag;
+ #endif
++#ifdef CONFIG_SPAPR_TCE_IOMMU
++	struct list_head iommu_group_mem_list;
++#endif
+ } mm_context_t;
+ 
+ 
+diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
+index 73382eb..138bb53 100644
+--- a/arch/powerpc/include/asm/mmu_context.h
++++ b/arch/powerpc/include/asm/mmu_context.h
+@@ -16,6 +16,23 @@
+  */
+ extern int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+ extern void destroy_context(struct mm_struct *mm);
++#ifdef CONFIG_SPAPR_TCE_IOMMU
++struct mm_iommu_table_group_mem_t;
++
++extern bool mm_iommu_preregistered(void);
++extern long mm_iommu_alloc(unsigned long ua, unsigned long entries,
++		struct mm_iommu_table_group_mem_t **pmem);
++extern struct mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
++		unsigned long entries);
++extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
++extern void mm_iommu_cleanup(mm_context_t *ctx);
++extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
++		unsigned long size);
++extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
++		unsigned long ua, unsigned long *hpa);
++extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
++extern long mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem);
++#endif
+ 
+ extern void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next);
+ extern void switch_slb(struct task_struct *tsk, struct mm_struct *mm);
+diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
+index 9c8770b..e216704 100644
+--- a/arch/powerpc/mm/Makefile
++++ b/arch/powerpc/mm/Makefile
+@@ -36,3 +36,4 @@ obj-$(CONFIG_PPC_SUBPAGE_PROT)	+= subpage-prot.o
+ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
+ obj-$(CONFIG_HIGHMEM)		+= highmem.o
+ obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
++obj-$(CONFIG_SPAPR_TCE_IOMMU)	+= mmu_context_hash64_iommu.o
+diff --git a/arch/powerpc/mm/mmu_context_hash64.c b/arch/powerpc/mm/mmu_context_hash64.c
+index 178876ae..eb3080c 100644
+--- a/arch/powerpc/mm/mmu_context_hash64.c
++++ b/arch/powerpc/mm/mmu_context_hash64.c
+@@ -89,6 +89,9 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
+ #ifdef CONFIG_PPC_64K_PAGES
+ 	mm->context.pte_frag = NULL;
+ #endif
++#ifdef CONFIG_SPAPR_TCE_IOMMU
++	INIT_LIST_HEAD_RCU(&mm->context.iommu_group_mem_list);
++#endif
+ 	return 0;
  }
  
-+static int spapr_tce_find_free_table(struct tce_container *container)
-+{
-+	int i;
-+
-+	for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-+		struct iommu_table *tbl = &container->tables[i];
-+
-+		if (!tbl->it_size)
-+			return i;
-+	}
-+
-+	return -ENOSPC;
-+}
-+
- static long tce_iommu_create_table(struct iommu_table_group *table_group,
- 			int num,
- 			__u32 page_shift,
-@@ -559,11 +573,114 @@ static long tce_iommu_build_v2(struct tce_container *container,
- 	return ret;
- }
- 
-+static long tce_iommu_create_window(struct tce_container *container,
-+		__u32 page_shift, __u64 window_size, __u32 levels,
-+		__u64 *start_addr)
-+{
-+	struct tce_iommu_group *tcegrp;
-+	struct iommu_table_group *table_group;
-+	struct iommu_table *tbl;
-+	long ret, num;
-+
-+	num = spapr_tce_find_free_table(container);
-+	if (num < 0)
-+		return num;
-+
-+	tbl = &container->tables[num];
-+
-+	/* Get the first group for ops::create_table */
-+	tcegrp = list_first_entry(&container->group_list,
-+			struct tce_iommu_group, next);
-+	table_group = iommu_group_get_iommudata(tcegrp->grp);
-+	if (!table_group)
+@@ -132,6 +135,9 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)
+ 
+ void destroy_context(struct mm_struct *mm)
+ {
++#ifdef CONFIG_SPAPR_TCE_IOMMU
++	mm_iommu_cleanup(&mm->context);
++#endif
+ 
+ #ifdef CONFIG_PPC_ICSWX
+ 	drop_cop(mm->context.acop, mm);
+diff --git a/arch/powerpc/mm/mmu_context_hash64_iommu.c b/arch/powerpc/mm/mmu_context_hash64_iommu.c
+new file mode 100644
+index 0000000..002c6c9
+--- /dev/null
++++ b/arch/powerpc/mm/mmu_context_hash64_iommu.c
+@@ -0,0 +1,221 @@
++/*
++ *  IOMMU helpers in MMU context.
++ *
++ *  Copyright (C) 2015 IBM Corp. <aik@ozlabs.ru>
++ *
++ *  This program is free software; you can redistribute it and/or
++ *  modify it under the terms of the GNU General Public License
++ *  as published by the Free Software Foundation; either version
++ *  2 of the License, or (at your option) any later version.
++ *
++ */
++
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/rculist.h>
++#include <linux/vmalloc.h>
++#include <linux/kref.h>
++#include <asm/mmu_context.h>
++
++struct mm_iommu_table_group_mem_t {
++	struct list_head next;
++	struct rcu_head rcu;
++	struct kref kref;	/* one reference per VFIO container */
++	atomic_t mapped;	/* number of currently mapped pages */
++	u64 ua;			/* userspace address */
++	u64 entries;		/* number of entries in hpas[] */
++	u64 *hpas;		/* vmalloc'ed */
++};
++
++bool mm_iommu_preregistered(void)
++{
++	if (!current || !current->mm)
++		return false;
++
++	return !list_empty(&current->mm->context.iommu_group_mem_list);
++}
++EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
++
++long mm_iommu_alloc(unsigned long ua, unsigned long entries,
++		struct mm_iommu_table_group_mem_t **pmem)
++{
++	struct mm_iommu_table_group_mem_t *mem;
++	long i, j;
++	struct page *page = NULL;
++
++	list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
++			next) {
++		if ((mem->ua == ua) && (mem->entries == entries))
++			return -EBUSY;
++
++		/* Overlap? */
++		if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
++				(ua < (mem->ua + (mem->entries << PAGE_SHIFT))))
++			return -EINVAL;
++	}
++
++	mem = kzalloc(sizeof(*mem), GFP_KERNEL);
++	if (!mem)
++		return -ENOMEM;
++
++	mem->hpas = vzalloc(entries * sizeof(mem->hpas[0]));
++	if (!mem->hpas) {
++		kfree(mem);
++		return -ENOMEM;
++	}
++
++	for (i = 0; i < entries; ++i) {
++		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
++					1/* pages */, 1/* iswrite */, &page)) {
++			for (j = 0; j < i; ++j)
++				put_page(pfn_to_page(
++						mem->hpas[j] >> PAGE_SHIFT));
++			vfree(mem->hpas);
++			kfree(mem);
++			return -EFAULT;
++		}
++
++		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
++	}
++
++	kref_init(&mem->kref);
++	atomic_set(&mem->mapped, 1);
++	mem->ua = ua;
++	mem->entries = entries;
++	*pmem = mem;
++
++	list_add_rcu(&mem->next, &current->mm->context.iommu_group_mem_list);
++
++	return 0;
++}
++EXPORT_SYMBOL_GPL(mm_iommu_alloc);
++
++static void mm_iommu_unpin(struct mm_iommu_table_group_mem_t *mem)
++{
++	long i;
++	struct page *page = NULL;
++
++	for (i = 0; i < mem->entries; ++i) {
++		if (!mem->hpas[i])
++			continue;
++
++		page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
++		if (!page)
++			continue;
++
++		put_page(page);
++		mem->hpas[i] = 0;
++	}
++}
++
++static void mm_iommu_free(struct rcu_head *head)
++{
++	struct mm_iommu_table_group_mem_t *mem = container_of(head,
++			struct mm_iommu_table_group_mem_t, rcu);
++
++	mm_iommu_unpin(mem);
++	vfree(mem->hpas);
++	kfree(mem);
++}
++
++static void mm_iommu_release(struct kref *kref)
++{
++	struct mm_iommu_table_group_mem_t *mem = container_of(kref,
++			struct mm_iommu_table_group_mem_t, kref);
++
++	list_del_rcu(&mem->next);
++	call_rcu(&mem->rcu, mm_iommu_free);
++}
++
++struct mm_iommu_table_group_mem_t *mm_iommu_get(unsigned long ua,
++		unsigned long entries)
++{
++	struct mm_iommu_table_group_mem_t *mem;
++
++	list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
++			next) {
++		if ((mem->ua == ua) && (mem->entries == entries)) {
++			kref_get(&mem->kref);
++			return mem;
++		}
++	}
++
++	return NULL;
++}
++EXPORT_SYMBOL_GPL(mm_iommu_get);
++
++long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem)
++{
++	if (1 != atomic_dec_if_positive(&mem->mapped)) {
++		/* There are mappings, exit */
++		atomic_inc(&mem->mapped);
++		return -EBUSY;
++	}
++
++	kref_put(&mem->kref, mm_iommu_release);
++
++	return 0;
++}
++EXPORT_SYMBOL_GPL(mm_iommu_put);
++
++struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
++		unsigned long size)
++{
++	struct mm_iommu_table_group_mem_t *mem, *ret = NULL;
++
++	list_for_each_entry_rcu(mem,
++			&current->mm->context.iommu_group_mem_list,
++			next) {
++		if ((mem->ua <= ua) &&
++				(ua + size <= mem->ua +
++				 (mem->entries << PAGE_SHIFT))) {
++			ret = mem;
++			break;
++		}
++	}
++
++	return ret;
++}
++EXPORT_SYMBOL_GPL(mm_iommu_lookup);
++
++long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
++		unsigned long ua, unsigned long *hpa)
++{
++	const long entry = (ua - mem->ua) >> PAGE_SHIFT;
++	u64 *va = &mem->hpas[entry];
++
++	if (entry >= mem->entries)
 +		return -EFAULT;
 +
-+	if (!(table_group->pgsizes & (1ULL << page_shift)))
-+		return -EINVAL;
-+
-+	if (!table_group->ops->set_window || !table_group->ops->unset_window ||
-+			!table_group->ops->get_table_size ||
-+			!table_group->ops->create_table)
-+		return -EPERM;
-+
-+	/* Create TCE table */
-+	ret = tce_iommu_create_table(table_group, num,
-+			page_shift, window_size, levels, tbl);
-+	if (ret)
-+		return ret;
-+
-+	BUG_ON(!tbl->it_ops->free);
-+
-+	/*
-+	 * Program the table to every group.
-+	 * Groups have been tested for compatibility at the attach time.
-+	 */
-+	list_for_each_entry(tcegrp, &container->group_list, next) {
-+		table_group = iommu_group_get_iommudata(tcegrp->grp);
-+
-+		ret = table_group->ops->set_window(table_group, num, tbl);
-+		if (ret)
-+			goto unset_exit;
-+	}
-+
-+	/* Return start address assigned by platform in create_table() */
-+	*start_addr = tbl->it_offset << tbl->it_page_shift;
++	*hpa = *va | (ua & ~PAGE_MASK);
 +
 +	return 0;
-+
-+unset_exit:
-+	list_for_each_entry(tcegrp, &container->group_list, next) {
-+		table_group = iommu_group_get_iommudata(tcegrp->grp);
-+		table_group->ops->unset_window(table_group, num);
-+	}
-+	tce_iommu_free_table(tbl);
-+
-+	return ret;
-+}
-+
-+static long tce_iommu_remove_window(struct tce_container *container,
-+		__u64 start_addr)
-+{
-+	struct iommu_table_group *table_group = NULL;
-+	struct iommu_table *tbl;
-+	struct tce_iommu_group *tcegrp;
-+	int num;
-+
-+	tbl = spapr_tce_find_table(container, start_addr);
-+	if (!tbl)
-+		return -EINVAL;
-+
-+	/* Detach groups from IOMMUs */
-+	num = tbl - container->tables;
-+	list_for_each_entry(tcegrp, &container->group_list, next) {
-+		table_group = iommu_group_get_iommudata(tcegrp->grp);
-+
-+		/*
-+		 * SPAPR TCE IOMMU exposes the default DMA window to
-+		 * the guest via dma32_window_start/size of
-+		 * VFIO_IOMMU_SPAPR_TCE_GET_INFO. Some platforms allow
-+		 * the userspace to remove this window, some do not so
-+		 * here we check for the platform capability.
-+		 */
-+		if (!table_group->ops || !table_group->ops->unset_window)
-+			return -EPERM;
-+
-+		if (container->tables[num].it_size)
-+			table_group->ops->unset_window(table_group, num);
-+	}
-+
-+	/* Free table */
-+	tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-+	tce_iommu_free_table(tbl);
-+
-+	return 0;
-+}
-+
- static long tce_iommu_ioctl(void *iommu_data,
- 				 unsigned int cmd, unsigned long arg)
- {
- 	struct tce_container *container = iommu_data;
--	unsigned long minsz;
-+	unsigned long minsz, ddwsz;
- 	long ret;
- 
- 	switch (cmd) {
-@@ -607,6 +724,21 @@ static long tce_iommu_ioctl(void *iommu_data,
- 		info.dma32_window_start = table_group->tce32_start;
- 		info.dma32_window_size = table_group->tce32_size;
- 		info.flags = 0;
-+		memset(&info.ddw, 0, sizeof(info.ddw));
-+
-+		if (table_group->max_dynamic_windows_supported &&
-+				container->v2) {
-+			info.flags |= VFIO_IOMMU_SPAPR_INFO_DDW;
-+			info.ddw.pgsizes = table_group->pgsizes;
-+			info.ddw.max_dynamic_windows_supported =
-+				table_group->max_dynamic_windows_supported;
-+			info.ddw.levels = table_group->max_levels;
-+		}
-+
-+		ddwsz = offsetofend(struct vfio_iommu_spapr_tce_info, ddw);
-+
-+		if (info.argsz >= ddwsz)
-+			minsz = ddwsz;
- 
- 		if (copy_to_user((void __user *)arg, &info, minsz))
- 			return -EFAULT;
-@@ -797,6 +929,69 @@ static long tce_iommu_ioctl(void *iommu_data,
- 		return ret;
- 	}
- 
-+	case VFIO_IOMMU_SPAPR_TCE_CREATE: {
-+		struct vfio_iommu_spapr_tce_create create;
-+
-+		if (!container->v2)
-+			break;
-+
-+		if (!tce_groups_attached(container))
-+			return -ENXIO;
-+
-+		minsz = offsetofend(struct vfio_iommu_spapr_tce_create,
-+				start_addr);
-+
-+		if (copy_from_user(&create, (void __user *)arg, minsz))
-+			return -EFAULT;
-+
-+		if (create.argsz < minsz)
-+			return -EINVAL;
-+
-+		if (create.flags)
-+			return -EINVAL;
-+
-+		mutex_lock(&container->lock);
-+
-+		ret = tce_iommu_create_window(container, create.page_shift,
-+				create.window_size, create.levels,
-+				&create.start_addr);
-+
-+		mutex_unlock(&container->lock);
-+
-+		if (!ret && copy_to_user((void __user *)arg, &create, minsz))
-+			ret = -EFAULT;
-+
-+		return ret;
-+	}
-+	case VFIO_IOMMU_SPAPR_TCE_REMOVE: {
-+		struct vfio_iommu_spapr_tce_remove remove;
-+
-+		if (!container->v2)
-+			break;
-+
-+		if (!tce_groups_attached(container))
-+			return -ENXIO;
-+
-+		minsz = offsetofend(struct vfio_iommu_spapr_tce_remove,
-+				start_addr);
-+
-+		if (copy_from_user(&remove, (void __user *)arg, minsz))
-+			return -EFAULT;
-+
-+		if (remove.argsz < minsz)
-+			return -EINVAL;
-+
-+		if (remove.flags)
-+			return -EINVAL;
-+
-+		mutex_lock(&container->lock);
-+
-+		ret = tce_iommu_remove_window(container, remove.start_addr);
-+
-+		mutex_unlock(&container->lock);
-+
-+		return ret;
-+	}
- 	}
- 
- 	return -ENOTTY;
-diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
-index 8fdcfb9..dde0fe5 100644
---- a/include/uapi/linux/vfio.h
-+++ b/include/uapi/linux/vfio.h
-@@ -445,6 +445,23 @@ struct vfio_iommu_type1_dma_unmap {
- /* -------- Additional API for SPAPR TCE (Server POWERPC) IOMMU -------- */
- 
- /*
-+ * The SPAPR TCE DDW info struct provides the information about
-+ * the details of Dynamic DMA window capability.
-+ *
-+ * @pgsizes contains a page size bitmask, 4K/64K/16M are supported.
-+ * @max_dynamic_windows_supported tells the maximum number of windows
-+ * which the platform can create.
-+ * @levels tells the maximum number of levels in multi-level IOMMU tables;
-+ * this allows splitting a table into smaller chunks which reduces
-+ * the amount of physically contiguous memory required for the table.
-+ */
-+struct vfio_iommu_spapr_tce_ddw_info {
-+	__u64 pgsizes;			/* Bitmap of supported page sizes */
-+	__u32 max_dynamic_windows_supported;
-+	__u32 levels;
-+};
-+
-+/*
-  * The SPAPR TCE info struct provides the information about the PCI bus
-  * address ranges available for DMA, these values are programmed into
-  * the hardware so the guest has to know that information.
-@@ -454,14 +471,17 @@ struct vfio_iommu_type1_dma_unmap {
-  * addresses too so the window works as a filter rather than an offset
-  * for IOVA addresses.
-  *
-- * A flag will need to be added if other page sizes are supported,
-- * so as defined here, it is always 4k.
-+ * Flags supported:
-+ * - VFIO_IOMMU_SPAPR_INFO_DDW: informs the userspace that dynamic DMA windows
-+ *   (DDW) support is present. @ddw is only supported when DDW is present.
-  */
- struct vfio_iommu_spapr_tce_info {
- 	__u32 argsz;
--	__u32 flags;			/* reserved for future use */
-+	__u32 flags;
-+#define VFIO_IOMMU_SPAPR_INFO_DDW	(1 << 0)	/* DDW supported */
- 	__u32 dma32_window_start;	/* 32 bit window start (bytes) */
- 	__u32 dma32_window_size;	/* 32 bit window size (bytes) */
-+	struct vfio_iommu_spapr_tce_ddw_info ddw;
- };
- 
- #define VFIO_IOMMU_SPAPR_TCE_GET_INFO	_IO(VFIO_TYPE, VFIO_BASE + 12)
-@@ -522,6 +542,41 @@ struct vfio_iommu_spapr_register_memory {
-  */
- #define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 18)
- 
-+/**
-+ * VFIO_IOMMU_SPAPR_TCE_CREATE - _IOWR(VFIO_TYPE, VFIO_BASE + 19, struct vfio_iommu_spapr_tce_create)
-+ *
-+ * Creates an additional TCE table and programs it (sets a new DMA window)
-+ * to every IOMMU group in the container. It receives page shift, window
-+ * size and number of levels in the TCE table being created.
-+ *
-+ * It allocates and returns an offset on a PCI bus of the new DMA window.
-+ */
-+struct vfio_iommu_spapr_tce_create {
-+	__u32 argsz;
-+	__u32 flags;
-+	/* in */
-+	__u32 page_shift;
-+	__u64 window_size;
-+	__u32 levels;
-+	/* out */
-+	__u64 start_addr;
-+};
-+#define VFIO_IOMMU_SPAPR_TCE_CREATE	_IO(VFIO_TYPE, VFIO_BASE + 19)
-+
-+/**
-+ * VFIO_IOMMU_SPAPR_TCE_REMOVE - _IOW(VFIO_TYPE, VFIO_BASE + 20, struct vfio_iommu_spapr_tce_remove)
-+ *
-+ * Unprograms a TCE table from all groups in the container and destroys it.
-+ * It receives a PCI bus offset as a window id.
-+ */
-+struct vfio_iommu_spapr_tce_remove {
-+	__u32 argsz;
-+	__u32 flags;
-+	/* in */
-+	__u64 start_addr;
-+};
-+#define VFIO_IOMMU_SPAPR_TCE_REMOVE	_IO(VFIO_TYPE, VFIO_BASE + 20)
-+
- /* ***************************************************************** */
- 
- #endif /* _UAPIVFIO_H */
++}
++EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa);
++
++long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem)
++{
++	if (atomic_inc_not_zero(&mem->mapped))
++		return 0;
++
++	/* Last mm_iommu_put() has been called, no more mappings allowed */
++	return -ENXIO;
++}
++EXPORT_SYMBOL_GPL(mm_iommu_mapped_inc);
++
++long mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem)
++{
++	return atomic_dec_if_positive(&mem->mapped);
++}
++EXPORT_SYMBOL_GPL(mm_iommu_mapped_dec);
++
++void mm_iommu_cleanup(mm_context_t *ctx)
++{
++	while (!list_empty(&ctx->iommu_group_mem_list)) {
++		struct mm_iommu_table_group_mem_t *mem;
++
++		mem = list_first_entry(&ctx->iommu_group_mem_list,
++				struct mm_iommu_table_group_mem_t, next);
++		mm_iommu_release(&mem->kref);
++	}
++}
 -- 
-2.0.0
+2.4.0.rc3.8.gfb3e7d5
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help