Inter-revision diff: patch 8

Comparing v9 (message) to v4 (message)

--- v9
+++ v4
@@ -1,153 +1,449 @@
-This is a pretty mechanical patch to make next patches simpler.
-
-New tce_iommu_unuse_page() helper does put_page() now but it might skip
-that after the memory registering patch applied.
-
-As we are here, this removes unnecessary checks for a value returned
-by pfn_to_page() as it cannot possibly return NULL.
-
-This moves tce_iommu_disable() later to let tce_iommu_clear() know if
-the container has been enabled because if it has not been, then
-put_page() must not be called on TCEs from the TCE table. This situation
-is not yet possible but it will after KVM acceleration patchset is
-applied.
-
-This changes code to work with physical addresses rather than linear
-mapping addresses for better code readability. Following patches will
-add an xchg() callback for an IOMMU table which will accept/return
-physical addresses (unlike current tce_build()) which will eliminate
-redundant conversions.
+The existing implementation accounts the whole DMA window in
+the locked_vm counter which is going to be even worse with multiple
+containers and huge DMA windows.
+
+This introduces 2 ioctls to register/unregister DMA memory which
+receive user space address and size of a memory region which
+needs to be pinned/unpinned and counted in locked_vm.
+
+If any memory region was registered, all subsequent DMA map requests
+should address already pinned memory. If no memory was registered,
+then the amount of memory required for a single default memory will be
+accounted when the container is enabled and every map/unmap will pin/unpin
+a page (with degraded performance).
+
+Dynamic DMA window and in-kernel acceleration will require memory to
+be preregistered in order to work.
+
+The accounting is done per VFIO container. When the support of
+multiple groups per container is added, we will have accurate locked_vm
+accounting.
 
 Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
-[aw: for the vfio related changes]
-Acked-by: Alex Williamson <alex.williamson@redhat.com>
 ---
 Changes:
-v9:
-* changed helpers to work with physical addresses rather than linear
-(for simplicity - later ::xchg() will receive physical and avoid
-additional convertions)
-
-v6:
-* tce_get_hva() returns hva via a pointer
+v4:
+* updated docs
+* s/kzmalloc/vzalloc/
+* in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
+replaced offset with index
+* renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
+and removed duplicating vfio_iommu_spapr_register_memory
 ---
- drivers/vfio/vfio_iommu_spapr_tce.c | 61 +++++++++++++++++++++++++------------
- 1 file changed, 41 insertions(+), 20 deletions(-)
-
+ Documentation/vfio.txt              |  19 +++
+ drivers/vfio/vfio_iommu_spapr_tce.c | 274 +++++++++++++++++++++++++++++++++++-
+ include/uapi/linux/vfio.h           |  25 ++++
+ 3 files changed, 312 insertions(+), 6 deletions(-)
+
+diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
+index 96978ec..791e85c 100644
+--- a/Documentation/vfio.txt
++++ b/Documentation/vfio.txt
+@@ -427,6 +427,25 @@ The code flow from the example above should be slightly changed:
+ 
+ 	....
+ 
++5) PPC64 paravirtualized guests may generate a lot of map/unmap requests,
++and the handling of those includes pinning/unpinning pages and updating
++mm::locked_vm counter to make sure we do not exceed the rlimit. Handling these
++in real-mode is quite expensive and may fail. In order to simplify in-kernel
++acceleration of map/unmap requests, two ioctls have been added to pre-register
++and unregister guest RAM pages where DMA can possibly happen to. Having these
++calles, the userspace and in-kernel handlers do not have to take care of
++pinning or accounting.
++
++The ioctls are VFIO_IOMMU_SPAPR_REGISTER_MEMORY and
++VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY.
++These receive a user space address and size of the block to be pinned.
++Bisecting is not supported and VFIO_IOMMU_UNREGISTER_MEMORY is expected to
++be called with the exact address and size used for registering
++the memory block.
++
++The user space is not expected to call these often and the block descriptors
++are stored in a linked list in the kernel.
++
+ -------------------------------------------------------------------------------
+ 
+ [1] VFIO was originally an acronym for "Virtual Function I/O" in its
 diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
-index e21479c..115d5e6 100644
+index 7fd60f9..9b884e0 100644
 --- a/drivers/vfio/vfio_iommu_spapr_tce.c
 +++ b/drivers/vfio/vfio_iommu_spapr_tce.c
-@@ -191,69 +191,90 @@ static void tce_iommu_release(void *iommu_data)
- 	struct tce_container *container = iommu_data;
- 
- 	WARN_ON(container->tbl && !container->tbl->it_group);
--	tce_iommu_disable(container);
- 
- 	if (container->tbl && container->tbl->it_group)
- 		tce_iommu_detach_group(iommu_data, container->tbl->it_group);
- 
-+	tce_iommu_disable(container);
+@@ -21,6 +21,7 @@
+ #include <linux/uaccess.h>
+ #include <linux/err.h>
+ #include <linux/vfio.h>
++#include <linux/vmalloc.h>
+ #include <asm/iommu.h>
+ #include <asm/tce.h>
+ 
+@@ -93,8 +94,196 @@ struct tce_container {
+ 	struct iommu_table *tbl;
+ 	bool enabled;
+ 	unsigned long locked_pages;
++	struct list_head mem_list;
+ };
+ 
++struct tce_memory {
++	struct list_head next;
++	struct rcu_head rcu;
++	__u64 vaddr;
++	__u64 size;
++	__u64 hpas[];
++};
++
++static inline bool tce_preregistered(struct tce_container *container)
++{
++	return !list_empty(&container->mem_list);
++}
++
++static struct tce_memory *tce_mem_alloc(struct tce_container *container,
++		__u64 vaddr, __u64 size)
++{
++	struct tce_memory *mem;
++	long ret;
++
++	ret = try_increment_locked_vm(size >> PAGE_SHIFT);
++	if (ret)
++		return NULL;
++
++	mem = vzalloc(sizeof(*mem) + (size >> (PAGE_SHIFT - 3)));
++	if (!mem) {
++		decrement_locked_vm(size >> PAGE_SHIFT);
++		return NULL;
++	}
++
++	mem->vaddr = vaddr;
++	mem->size = size;
++
++	list_add_rcu(&mem->next, &container->mem_list);
++
++	return mem;
++}
++
++static void release_tce_memory(struct rcu_head *head)
++{
++	struct tce_memory *mem = container_of(head, struct tce_memory, rcu);
++
++	vfree(mem);
++}
++
++static void tce_mem_free(struct tce_memory *mem)
++{
++	decrement_locked_vm(mem->size);
++	list_del_rcu(&mem->next);
++	call_rcu(&mem->rcu, release_tce_memory);
++}
++
++static struct tce_memory *tce_pinned_desc(struct tce_container *container,
++		__u64 vaddr, __u64 size)
++{
++	struct tce_memory *mem, *ret = NULL;
++
++	rcu_read_lock();
++	vaddr &= ~(TCE_PCI_READ | TCE_PCI_WRITE);
++	list_for_each_entry_rcu(mem, &container->mem_list, next) {
++		if ((mem->vaddr <= vaddr) &&
++				(vaddr + size <= mem->vaddr + mem->size)) {
++			ret = mem;
++			break;
++		}
++	}
++	rcu_read_unlock();
++
++	return ret;
++}
++
++static bool tce_mem_overlapped(struct tce_container *container,
++		__u64 vaddr, __u64 size)
++{
++	struct tce_memory *mem;
++	bool ret = false;
++
++	rcu_read_lock();
++	list_for_each_entry_rcu(mem, &container->mem_list, next) {
++		if ((mem->vaddr < (vaddr + size)) &&
++				(vaddr < (mem->vaddr + mem->size))) {
++			ret = true;
++			break;
++		}
++	}
++	rcu_read_unlock();
++
++	return ret;
++}
++
++static void tce_unpin_pages(struct tce_container *container,
++		struct tce_memory *mem)
++{
++	long i;
++	struct page *page = NULL;
++
++	for (i = 0; i < (mem->size >> PAGE_SHIFT); ++i) {
++		if (!mem->hpas[i])
++			continue;
++
++		page = pfn_to_page(mem->hpas[i] >> PAGE_SHIFT);
++		if (!page)
++			continue;
++
++		put_page(page);
++		mem->hpas[i] = 0;
++	}
++}
++
++static long tce_unregister_pages(struct tce_container *container,
++		__u64 vaddr, __u64 size)
++{
++	struct tce_memory *mem, *memtmp;
++
++	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
++		return -EINVAL;
++
++	list_for_each_entry_safe(mem, memtmp, &container->mem_list, next) {
++		if ((mem->vaddr == vaddr) && (mem->size == size)) {
++			tce_unpin_pages(container, mem);
++			tce_mem_free(mem);
++
++			/* If that was the last region, disable the container */
++			if (!tce_preregistered(container))
++				container->enabled = false;
++
++			return 0;
++		}
++	}
++
++	return -ENOENT;
++}
++
++static void tce_mem_unregister_all(struct tce_container *container)
++{
++	struct tce_memory *mem, *memtmp;
++
++	list_for_each_entry_safe(mem, memtmp, &container->mem_list, next) {
++		tce_unpin_pages(container, mem);
++		tce_mem_free(mem);
++	}
++}
++
++static long tce_pin_pages(struct tce_container *container,
++		struct tce_memory *mem)
++{
++	long i;
++	struct page *page = NULL;
++
++	for (i = 0; i < (mem->size >> PAGE_SHIFT); ++i) {
++		if (1 != get_user_pages_fast(mem->vaddr + (i << PAGE_SHIFT),
++					1/* pages */, 1/* iswrite */, &page)) {
++			tce_unpin_pages(container, mem);
++			return -EFAULT;
++		}
++
++		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
++	}
++
++	return 0;
++}
++
++static long tce_register_pages(struct tce_container *container,
++		__u64 vaddr, __u64 size)
++{
++	struct tce_memory *mem;
++
++	if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
++			((vaddr + size) < vaddr))
++		return -EINVAL;
++
++	if (tce_mem_overlapped(container, vaddr, size))
++		return -EBUSY;
++
++	mem = tce_mem_alloc(container, vaddr, size);
++	if (!mem)
++		return -ENOMEM;
++
++	if (tce_pin_pages(container, mem)) {
++		tce_mem_free(mem);
++		return -EFAULT;
++	}
++
++	container->enabled = true;
++
++	return 0;
++}
++
+ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+ {
+ 	unsigned shift;
+@@ -151,12 +340,14 @@ static int tce_iommu_enable(struct tce_container *container)
+ 	 * as this information is only available from KVM and VFIO is
+ 	 * KVM agnostic.
+ 	 */
+-	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+-	ret = try_increment_locked_vm(locked);
+-	if (ret)
+-		return ret;
++	if (!tce_preregistered(container)) {
++		locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
++		ret = try_increment_locked_vm(locked);
++		if (ret)
++			return ret;
+ 
+-	container->locked_pages = locked;
++		container->locked_pages = locked;
++	}
+ 
+ 	container->enabled = true;
+ 
+@@ -190,6 +381,7 @@ static void *tce_iommu_open(unsigned long arg)
+ 		return ERR_PTR(-ENOMEM);
+ 
+ 	mutex_init(&container->lock);
++	INIT_LIST_HEAD_RCU(&container->mem_list);
+ 
+ 	return container;
+ }
+@@ -212,6 +404,9 @@ static void tce_iommu_release(void *iommu_data)
+ 		if (tbl->it_group)
+ 			tce_iommu_detach_group(iommu_data, tbl->it_group);
+ 	}
++
++	tce_mem_unregister_all(container);
++
  	mutex_destroy(&container->lock);
  
  	kfree(container);
+@@ -230,6 +425,9 @@ static void tce_iommu_unuse_page(struct tce_container *container,
+ 	if (oldtce & TCE_PCI_WRITE)
+ 		SetPageDirty(page);
+ 
++	if (tce_preregistered(container))
++		return;
++
+ 	put_page(page);
  }
  
-+static void tce_iommu_unuse_page(struct tce_container *container,
-+		unsigned long oldtce)
-+{
-+	struct page *page;
-+
-+	if (!(oldtce & (TCE_PCI_READ | TCE_PCI_WRITE)))
-+		return;
-+
-+	page = pfn_to_page(oldtce >> PAGE_SHIFT);
-+
-+	if (oldtce & TCE_PCI_WRITE)
-+		SetPageDirty(page);
-+
-+	put_page(page);
-+}
-+
- static int tce_iommu_clear(struct tce_container *container,
- 		struct iommu_table *tbl,
- 		unsigned long entry, unsigned long pages)
- {
- 	unsigned long oldtce;
--	struct page *page;
- 
- 	for ( ; pages; --pages, ++entry) {
- 		oldtce = iommu_clear_tce(tbl, entry);
- 		if (!oldtce)
- 			continue;
- 
--		page = pfn_to_page(oldtce >> PAGE_SHIFT);
--		WARN_ON(!page);
--		if (page) {
--			if (oldtce & TCE_PCI_WRITE)
--				SetPageDirty(page);
--			put_page(page);
--		}
-+		tce_iommu_unuse_page(container, oldtce);
- 	}
- 
- 	return 0;
+@@ -280,6 +478,22 @@ static unsigned long tce_get_hva(struct tce_container *container,
+ 	return hva;
  }
  
-+static int tce_iommu_use_page(unsigned long tce, unsigned long *hpa)
-+{
-+	struct page *page = NULL;
-+	enum dma_data_direction direction = iommu_tce_direction(tce);
-+
-+	if (get_user_pages_fast(tce & PAGE_MASK, 1,
-+			direction != DMA_TO_DEVICE, &page) != 1)
-+		return -EFAULT;
-+
-+	*hpa = __pa((unsigned long) page_address(page));
-+
-+	return 0;
++static unsigned long tce_get_hva_cached(struct tce_container *container,
++		unsigned page_shift, unsigned long tce)
++{
++	struct tce_memory *mem;
++	unsigned long gfn;
++
++	tce &= PAGE_MASK;
++	mem = tce_pinned_desc(container, tce, 1ULL << page_shift);
++	if (!mem)
++		return -1;
++
++	gfn = (tce - mem->vaddr) >> PAGE_SHIFT;
++
++	return (unsigned long) __va(mem->hpas[gfn]);
 +}
 +
  static long tce_iommu_build(struct tce_container *container,
  		struct iommu_table *tbl,
  		unsigned long entry, unsigned long tce, unsigned long pages)
- {
- 	long i, ret = 0;
--	struct page *page = NULL;
--	unsigned long hva;
-+	struct page *page;
-+	unsigned long hpa;
- 	enum dma_data_direction direction = iommu_tce_direction(tce);
+@@ -290,7 +504,11 @@ static long tce_iommu_build(struct tce_container *container,
+ 	enum dma_data_direction direction = tce_iommu_direction(tce);
  
  	for (i = 0; i < pages; ++i) {
- 		unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
- 
--		ret = get_user_pages_fast(tce & PAGE_MASK, 1,
--				direction != DMA_TO_DEVICE, &page);
--		if (unlikely(ret != 1)) {
--			ret = -EFAULT;
-+		ret = tce_iommu_use_page(tce, &hpa);
-+		if (ret)
+-		hva = tce_get_hva(container, tbl->it_page_shift, tce);
++		if (tce_preregistered(container))
++			hva = tce_get_hva_cached(container, tbl->it_page_shift,
++					tce);
++		else
++			hva = tce_get_hva(container, tbl->it_page_shift, tce);
+ 		if (hva == -1) {
+ 			ret = -EFAULT;
  			break;
--		}
- 
-+		page = pfn_to_page(hpa >> PAGE_SHIFT);
- 		if (!tce_page_is_contained(page, tbl->it_page_shift)) {
- 			ret = -EPERM;
- 			break;
- 		}
- 
--		hva = (unsigned long) page_address(page) + offset;
--
--		ret = iommu_tce_build(tbl, entry + i, hva, direction);
-+		hpa |= offset;
-+		ret = iommu_tce_build(tbl, entry + i, (unsigned long) __va(hpa),
-+				direction);
- 		if (ret) {
--			put_page(page);
-+			tce_iommu_unuse_page(container, hpa);
- 			pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
- 					__func__, entry << tbl->it_page_shift,
- 					tce, ret);
+@@ -455,6 +673,50 @@ static long tce_iommu_ioctl(void *iommu_data,
+ 
+ 		return ret;
+ 	}
++	case VFIO_IOMMU_SPAPR_REGISTER_MEMORY: {
++		struct vfio_iommu_spapr_register_memory param;
++
++		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
++				size);
++
++		if (copy_from_user(&param, (void __user *)arg, minsz))
++			return -EFAULT;
++
++		if (param.argsz < minsz)
++			return -EINVAL;
++
++		/* No flag is supported now */
++		if (param.flags)
++			return -EINVAL;
++
++		mutex_lock(&container->lock);
++		ret = tce_register_pages(container, param.vaddr, param.size);
++		mutex_unlock(&container->lock);
++
++		return ret;
++	}
++	case VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY: {
++		struct vfio_iommu_spapr_register_memory param;
++
++		minsz = offsetofend(struct vfio_iommu_spapr_register_memory,
++				size);
++
++		if (copy_from_user(&param, (void __user *)arg, minsz))
++			return -EFAULT;
++
++		if (param.argsz < minsz)
++			return -EINVAL;
++
++		/* No flag is supported now */
++		if (param.flags)
++			return -EINVAL;
++
++		mutex_lock(&container->lock);
++		tce_unregister_pages(container, param.vaddr, param.size);
++		mutex_unlock(&container->lock);
++
++		return 0;
++	}
+ 	case VFIO_IOMMU_ENABLE:
+ 		mutex_lock(&container->lock);
+ 		ret = tce_iommu_enable(container);
+diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
+index 29715d2..0f55c08 100644
+--- a/include/uapi/linux/vfio.h
++++ b/include/uapi/linux/vfio.h
+@@ -492,6 +492,31 @@ struct vfio_eeh_pe_op {
+ 
+ #define VFIO_EEH_PE_OP			_IO(VFIO_TYPE, VFIO_BASE + 21)
+ 
++/**
++ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 17, struct vfio_iommu_spapr_register_memory)
++ *
++ * Registers user space memory where DMA is allowed. It pins
++ * user pages and does the locked memory accounting so
++ * subsequent VFIO_IOMMU_MAP_DMA/VFIO_IOMMU_UNMAP_DMA calls
++ * get faster.
++ */
++struct vfio_iommu_spapr_register_memory {
++	__u32	argsz;
++	__u32	flags;
++	__u64	vaddr;				/* Process virtual address */
++	__u64	size;				/* Size of mapping (bytes) */
++};
++#define VFIO_IOMMU_SPAPR_REGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 17)
++
++/**
++ * VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY - _IOW(VFIO_TYPE, VFIO_BASE + 18, struct vfio_iommu_spapr_register_memory)
++ *
++ * Unregisters user space memory registered with
++ * VFIO_IOMMU_SPAPR_REGISTER_MEMORY.
++ * Uses vfio_iommu_spapr_register_memory for parameters.
++ */
++#define VFIO_IOMMU_SPAPR_UNREGISTER_MEMORY	_IO(VFIO_TYPE, VFIO_BASE + 18)
++
+ /* ***************************************************************** */
+ 
+ #endif /* _UAPIVFIO_H */
 -- 
 2.0.0
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help