Inter-revision diff: patch 8

Comparing v11 (message) to v4 (message)

--- v11
+++ v4
@@ -1,104 +1,449 @@
-This makes use of the it_page_size from the iommu_table struct
-as page size can differ.
-
-This replaces missing IOMMU_PAGE_SHIFT macro in commented debug code
-as recently introduced IOMMU_PAGE_XXX macros do not include
-IOMMU_PAGE_SHIFT.
+The existing implementation accounts the whole DMA window in
+the locked_vm counter which is going to be even worse with multiple
+containers and huge DMA windows.
+
+This introduces 2 ioctls to register/unregister DMA memory which
+receive user space address and size of a memory region which
+needs to be pinned/unpinned and counted in locked_vm.
+
+If any memory region was registered, all subsequent DMA map requests
+should address already pinned memory. If no memory was registered,
+then the amount of memory required for a single default DMA window will be
+accounted when the container is enabled and every map/unmap will pin/unpin
+a page (with degraded performance).
+
+Dynamic DMA window and in-kernel acceleration will require memory to
+be preregistered in order to work.
+
+The accounting is done per VFIO container. When the support of
+multiple groups per container is added, we will have accurate locked_vm
+accounting.
 
 Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
-Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
-[aw: for the vfio related changes]
-Acked-by: Alex Williamson <alex.williamson@redhat.com>
-Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
 ---
- drivers/vfio/vfio_iommu_spapr_tce.c | 26 +++++++++++++-------------
- 1 file changed, 13 insertions(+), 13 deletions(-)
-
+Changes:
+v4:
+* updated docs
+* s/kzalloc/vzalloc/
+* in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
+replaced offset with index
+* renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
+and removed duplicating vfio_iommu_spapr_register_memory
+---
+ Documentation/vfio.txt              |  19 +++
+ drivers/vfio/vfio_iommu_spapr_tce.c | 274 +++++++++++++++++++++++++++++++++++-
+ include/uapi/linux/vfio.h           |  25 ++++
+ 3 files changed, 312 insertions(+), 6 deletions(-)
+
+diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
+index 96978ec..791e85c 100644
+--- a/Documentation/vfio.txt
++++ b/Documentation/vfio.txt
+@@ -427,6 +427,25 @@ The code flow from the example above should be slightly changed:
+ 
+ 	....
+ 
++5) PPC64 paravirtualized guests may generate a lot of map/unmap requests,
++and the handling of those includes pinning/unpinning pages and updating
++mm::locked_vm counter to make sure we do not exceed the rlimit. Handling these
++in real-mode is quite expensive and may fail. In order to simplify in-kernel
++acceleration of map/unmap requests, two ioctls have been added to pre-register
++and unregister guest RAM pages where DMA can possibly happen. Having these
++calls, the userspace and in-kernel handlers do not have to take care of
++pinning or accounting.
++
++The ioctls are VFIO_IOMMU_SPAPR_REGISTER_MEMORY and
++VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY.
++These receive a user space address and size of the block to be pinned.
++Bisecting is not supported and VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY is
++expected to be called with the exact address and size used for registering
++the memory block.
++
++The user space is not expected to call these often and the block descriptors
++are stored in a linked list in the kernel.
++
+ -------------------------------------------------------------------------------
+ 
+ [1] VFIO was originally an acronym for "Virtual Function I/O" in its
 diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
-index 735b308..64300cc 100644
+index 7fd60f9..9b884e0 100644
 --- a/drivers/vfio/vfio_iommu_spapr_tce.c
 +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
-@@ -91,7 +91,7 @@ static int tce_iommu_enable(struct tce_container *container)
- 	 * enforcing the limit based on the max that the guest can map.
+@@ -21,6 +21,7 @@
+ #include <linux/uaccess.h>
+ #include <linux/err.h>
+ #include <linux/vfio.h>
++#include <linux/vmalloc.h>
+ #include <asm/iommu.h>
+ #include <asm/tce.h>
+ 
+@@ -93,8 +94,196 @@ struct tce_container {
+ 	struct iommu_table *tbl;
+ 	bool enabled;
+ 	unsigned long locked_pages;
++	struct list_head mem_list;
+ };
+ 
++struct tce_memory {
++	struct list_head next;
++	struct rcu_head rcu;
++	__u64 vaddr;
++	__u64 size;
++	__u64 hpas[];
++};
++
++static inline bool tce_preregistered(struct tce_container *container)
++{
++	return !list_empty(&container->mem_list);
++}
++
++static struct tce_memory *tce_mem_alloc(struct tce_container *container,
++		__u64 vaddr, __u64 size)
++{
++	struct tce_memory *mem;
++	long ret;
++
++	ret = try_increment_locked_vm(size >> PAGE_SHIFT);
++	if (ret)
++		return NULL;
++
++	mem = vzalloc(sizeof(*mem) + (size >> (PAGE_SHIFT - 3)));
++	if (!mem) {
++		decrement_locked_vm(size >> PAGE_SHIFT);
++		return NULL;
++	}
++
++	mem->vaddr = vaddr;
++	mem->size = size;
++
++	list_add_rcu(&mem->next, &container->mem_list);
++
++	return mem;
++}
++
++static void release_tce_memory(struct rcu_head *head)
++{
++	struct tce_memory *mem = container_of(head, struct tce_memory, rcu);
++
++	vfree(mem);
++}
++
++static void tce_mem_free(struct tce_memory *mem)
++{
++	decrement_locked_vm(mem->size);
++	list_del_rcu(&mem->next);
++	call_rcu(&mem->rcu, release_tce_memory);
++}
++
++static struct tce_memory *tce_pinned_desc(struct tce_container *container,
++		__u64 vaddr, __u64 size)
++{
++	struct tce_memory *mem, *ret = NULL;
++
++	rcu_read_lock();
++	vaddr &= ~(TCE_PCI_READ | TCE_PCI_WRITE);
++	list_for_each_entry_rcu(mem, &container->mem_list, next) {
++		if ((mem->vaddr <= vaddr) &&
++				(vaddr + size <= mem->vaddr + mem->size)) {
++			ret = mem;
++			break;
++		}
++	}
++	rcu_read_unlock();
++
++	return ret;
++}
++
++static bool tce_mem_overlapped(struct tce_container *container,
++		__u64 vaddr, __u64 size)
++{
++	struct tce_memory *mem;
++	bool ret = false;
++
++	rcu_read_lock();
++	list_for_each_entry_rcu(mem, &container->mem_list, next) {
++		if ((mem->vaddr < (vaddr + size)) &&
++				(vaddr < (mem->vaddr + mem->size))) {
++			ret = true;
++			break;
++		}
++	}
++	rcu_read_unlock();
++
++	return ret;
++}
++
++static void tce_unpin_pages(struct tce_container *container,
++		struct tce_memory *mem)
++{
++	long i;
++	struct page *page = NULL;
++
++	for (i = 0; i < (mem->size >> PAGE_SHIFT); ++i) {
++		if (!mem->hpas[i])
++			continue;
++
++		page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
++		if (!page)
++			continue;
++
++		put_page(page);
++		mem->hpas[i] = 0;
++	}
++}
++
++static long tce_unregister_pages(struct tce_container *container,
++		__u64 vaddr, __u64 size)
++{
++	struct tce_memory *mem, *memtmp;
++
++	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
++		return -EINVAL;
++
++	list_for_each_entry_safe(mem, memtmp, &container->mem_list, next) {
++		if ((mem->vaddr == vaddr) && (mem->size == size)) {
++			tce_unpin_pages(container, mem);
++			tce_mem_free(mem);
++
++			/* If that was the last region, disable the container */
++			if (!tce_preregistered(container))
++				container->enabled = false;
++
++			return 0;
++		}
++	}
++
++	return -ENOENT;
++}
++
++static void tce_mem_unregister_all(struct tce_container *container)
++{
++	struct tce_memory *mem, *memtmp;
++
++	list_for_each_entry_safe(mem, memtmp, &container->mem_list, next) {
++		tce_unpin_pages(container, mem);
++		tce_mem_free(mem);
++	}
++}
++
++static long tce_pin_pages(struct tce_container *container,
++		struct tce_memory *mem)
++{
++	long i;
++	struct page *page = NULL;
++
++	for (i = 0; i < (mem->size >> PAGE_SHIFT); ++i) {
++		if (1 != get_user_pages_fast(mem->vaddr + (i << PAGE_SHIFT),
++					1/* pages */, 1/* iswrite */, &page)) {
++			tce_unpin_pages(container, mem);
++			return -EFAULT;
++		}
++
++		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
++	}
++
++	return 0;
++}
++
++static long tce_register_pages(struct tce_container *container,
++		__u64 vaddr, __u64 size)
++{
++	struct tce_memory *mem;
++
++	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
++			((vaddr + size) < vaddr))
++		return -EINVAL;
++
++	if (tce_mem_overlapped(container, vaddr, size))
++		return -EBUSY;
++
++	mem = tce_mem_alloc(container, vaddr, size);
++	if (!mem)
++		return -ENOMEM;
++
++	if (tce_pin_pages(container, mem)) {
++		tce_mem_free(mem);
++		return -EFAULT;
++	}
++
++	container->enabled = true;
++
++	return 0;
++}
++
+ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+ {
+ 	unsigned shift;
+@@ -151,12 +340,14 @@ static int tce_iommu_enable(struct tce_container *container)
+ 	 * as this information is only available from KVM and VFIO is
+ 	 * KVM agnostic.
  	 */
- 	down_write(&current->mm->mmap_sem);
--	npages = (tbl->it_size << IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
-+	npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
- 	locked = current->mm->locked_vm + npages;
- 	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- 	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
-@@ -120,7 +120,7 @@ static void tce_iommu_disable(struct tce_container *container)
- 
- 	down_write(&current->mm->mmap_sem);
- 	current->mm->locked_vm -= (container->tbl->it_size <<
--			IOMMU_PAGE_SHIFT_4K) >> PAGE_SHIFT;
-+			container->tbl->it_page_shift) >> PAGE_SHIFT;
- 	up_write(&current->mm->mmap_sem);
+-	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+-	ret = try_increment_locked_vm(locked);
+-	if (ret)
+-		return ret;
++	if (!tce_preregistered(container)) {
++		locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
++		ret = try_increment_locked_vm(locked);
++		if (ret)
++			return ret;
+ 
+-	container->locked_pages = locked;
++		container->locked_pages = locked;
++	}
+ 
+ 	container->enabled = true;
+ 
+@@ -190,6 +381,7 @@ static void *tce_iommu_open(unsigned long arg)
+ 		return ERR_PTR(-ENOMEM);
+ 
+ 	mutex_init(&container->lock);
++	INIT_LIST_HEAD_RCU(&container->mem_list);
+ 
+ 	return container;
  }
- 
-@@ -215,7 +215,7 @@ static long tce_iommu_build(struct tce_container *container,
- 					tce, ret);
+@@ -212,6 +404,9 @@ static void tce_iommu_release(void *iommu_data)
+ 		if (tbl->it_group)
+ 			tce_iommu_detach_group(iommu_data, tbl->it_group);
+ 	}
++
++	tce_mem_unregister_all(container);
++
+ 	mutex_destroy(&container->lock);
+ 
+ 	kfree(container);
+@@ -230,6 +425,9 @@ static void tce_iommu_unuse_page(struct tce_container *container,
+ 	if (oldtce & TCE_PCI_WRITE)
+ 		SetPageDirty(page);
+ 
++	if (tce_preregistered(container))
++		return;
++
+ 	put_page(page);
+ }
+ 
+@@ -280,6 +478,22 @@ static unsigned long tce_get_hva(struct tce_container *container,
+ 	return hva;
+ }
+ 
++static unsigned long tce_get_hva_cached(struct tce_container *container,
++		unsigned page_shift, unsigned long tce)
++{
++	struct tce_memory *mem;
++	unsigned long gfn;
++
++	tce &= PAGE_MASK;
++	mem = tce_pinned_desc(container, tce, 1ULL << page_shift);
++	if (!mem)
++		return -1;
++
++	gfn = (tce - mem->vaddr) >> PAGE_SHIFT;
++
++	return (unsigned long) __va(mem->hpas[gfn]);
++}
++
+ static long tce_iommu_build(struct tce_container *container,
+ 		struct iommu_table *tbl,
+ 		unsigned long entry, unsigned long tce, unsigned long pages)
+@@ -290,7 +504,11 @@ static long tce_iommu_build(struct tce_container *container,
+ 	enum dma_data_direction direction = tce_iommu_direction(tce);
+ 
+ 	for (i = 0; i < pages; ++i) {
+-		hva = tce_get_hva(container, tbl->it_page_shift, tce);
++		if (tce_preregistered(container))
++			hva = tce_get_hva_cached(container, tbl->it_page_shift,
++					tce);
++		else
++			hva = tce_get_hva(container, tbl->it_page_shift, tce);
+ 		if (hva == -1) {
+ 			ret = -EFAULT;
  			break;
- 		}
--		tce += IOMMU_PAGE_SIZE_4K;
-+		tce += IOMMU_PAGE_SIZE(tbl);
+@@ -455,6 +673,50 @@ static long tce_iommu_ioctl(void *iommu_data,
+ 
+ 		return ret;
  	}
- 
- 	if (ret)
-@@ -260,8 +260,8 @@ static long tce_iommu_ioctl(void *iommu_data,
- 		if (info.argsz < minsz)
- 			return -EINVAL;
- 
--		info.dma32_window_start = tbl->it_offset << IOMMU_PAGE_SHIFT_4K;
--		info.dma32_window_size = tbl->it_size << IOMMU_PAGE_SHIFT_4K;
-+		info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
-+		info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
- 		info.flags = 0;
- 
- 		if (copy_to_user((void __user *)arg, &info, minsz))
-@@ -291,8 +291,8 @@ static long tce_iommu_ioctl(void *iommu_data,
- 				VFIO_DMA_MAP_FLAG_WRITE))
- 			return -EINVAL;
- 
--		if ((param.size & ~IOMMU_PAGE_MASK_4K) ||
--				(param.vaddr & ~IOMMU_PAGE_MASK_4K))
-+		if ((param.size & ~IOMMU_PAGE_MASK(tbl)) ||
-+				(param.vaddr & ~IOMMU_PAGE_MASK(tbl)))
- 			return -EINVAL;
- 
- 		/* iova is checked by the IOMMU API */
-@@ -307,8 +307,8 @@ static long tce_iommu_ioctl(void *iommu_data,
- 			return ret;
- 
- 		ret = tce_iommu_build(container, tbl,
--				param.iova >> IOMMU_PAGE_SHIFT_4K,
--				tce, param.size >> IOMMU_PAGE_SHIFT_4K);
-+				param.iova >> tbl->it_page_shift,
-+				tce, param.size >> tbl->it_page_shift);
- 
- 		iommu_flush_tce(tbl);
- 
-@@ -334,17 +334,17 @@ static long tce_iommu_ioctl(void *iommu_data,
- 		if (param.flags)
- 			return -EINVAL;
- 
--		if (param.size & ~IOMMU_PAGE_MASK_4K)
-+		if (param.size & ~IOMMU_PAGE_MASK(tbl))
- 			return -EINVAL;
- 
- 		ret = iommu_tce_clear_param_check(tbl, param.iova, 0,
--				param.size >> IOMMU_PAGE_SHIFT_4K);
-+				param.size >> tbl->it_page_shift);
- 		if (ret)
- 			return ret;
- 
- 		ret = tce_iommu_clear(container, tbl,
--				param.iova >> IOMMU_PAGE_SHIFT_4K,
--				param.size >> IOMMU_PAGE_SHIFT_4K);
-+				param.iova >> tbl->it_page_shift,
-+				param.size >> tbl->it_page_shift);
- 		iommu_flush_tce(tbl);
- 
- 		return ret;
++	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
++		struct vfio_iommu_spapr_register_memory param;
++
++		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
++				size);
++
++		if (copy_from_user(&param, (void __user *)arg, minsz))
++			return -EFAULT;
++
++		if (param.argsz < minsz)
++			return -EINVAL;
++
++		/* No flag is supported now */
++		if (param.flags)
++			return -EINVAL;
++
++		mutex_lock(&container->lock);
++		ret = tce_register_pages(container, param.vaddr, param.size);
++		mutex_unlock(&container->lock);
++
++		return ret;
++	}
++	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
++		struct vfio_iommu_spapr_register_memory param;
++
++		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
++				size);
++
++		if (copy_from_user(&param, (void __user *)arg, minsz))
++			return -EFAULT;
++
++		if (param.argsz < minsz)
++			return -EINVAL;
++
++		/* No flag is supported now */
++		if (param.flags)
++			return -EINVAL;
++
++		mutex_lock(&container->lock);
++		tce_unregister_pages(container, param.vaddr, param.size);
++		mutex_unlock(&container->lock);
++
++		return 0;
++	}
+ 	case VFIO_IOMMU_ENABLE:
+ 		mutex_lock(&container->lock);
+ 		ret = tce_iommu_enable(container);
+diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
+index 29715d2..0f55c08 100644
+--- a/include/uapi/linux/vfio.h
++++ b/include/uapi/linux/vfio.h
+@@ -492,6 +492,31 @@ struct vfio_eeh_pe_op {
+ 
+ #define VFIO_EEH_PE_OP			_IO(VFIO_TYPE, VFIO_BASE + 21)
+ 
++/**
++ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_spapr_register_memory)
++ *
++ * Registers user space memory where DMA is allowed. It pins
++ * user pages and does the locked memory accounting so
++ * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls
++ * get faster.
++ */
++struct vfio_iommu_spapr_register_memory {
++	__u32	argsz;
++	__u32	flags;
++	__u64	vaddr;				/* Process virtual address */
++	__u64	size;				/* Size of mapping (bytes) */
++};
++#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 17)
++
++/**
++ * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_spapr_register_memory)
++ *
++ * Unregisters user space memory registered with
++ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
++ * Uses vfio_iommu_spapr_register_memory for parameters.
++ */
++#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 18)
++
+ /* ***************************************************************** */
+ 
+ #endif /* _UAPIVFIO_H */
 -- 
-2.4.0.rc3.8.gfb3e7d5
+2.0.0
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help