Inter-revision diff: patch 5

Comparing v10 (message) to v7 (message)

--- v10
+++ v7
@@ -1,29 +1,153 @@
-At the moment iommu_free_table() only releases memory if
-the table was initialized for the platform code use, i.e. it had
-it_map initialized (whose purpose is to track DMA memory space use).
+This moves locked pages accounting to helpers.
+Later they will be reused for Dynamic DMA windows (DDW).
 
-With dynamic DMA windows, we will need to be able to release
-iommu_table even if it was used for VFIO, in which case it_map is NULL,
-so this patch does that.
+This reworks debug messages to show the current value and the limit.
+
+This stores the number of locked pages in the container so that the
+iommu table pointer is not needed when unlocking. This has no effect
+now, but it will with multiple tables per container, as then we will
+allow attaching/detaching groups on the fly and we may end up having
+a container with no group attached but with the counter incremented.
+
+While we are here, update the comment explaining why RLIMIT_MEMLOCK
+might need to be bigger than the guest RAM. This also prints
+the pid of the current process in pr_warn/pr_debug.
 
 Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
 ---
- arch/powerpc/kernel/iommu.c | 3 +--
- 1 file changed, 1 insertion(+), 2 deletions(-)
+Changes:
+v4:
+* new helpers do nothing if @npages == 0
+* tce_iommu_disable() now can decrement the counter if the group was
+detached (not possible now but will be in the future)
+---
+ drivers/vfio/vfio_iommu_spapr_tce.c | 82 ++++++++++++++++++++++++++++---------
+ 1 file changed, 63 insertions(+), 19 deletions(-)
 
-diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
-index 3d47eb3..2c02d4c 100644
---- a/arch/powerpc/kernel/iommu.c
-+++ b/arch/powerpc/kernel/iommu.c
-@@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
- 	unsigned int order;
+diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
+index 8bbee22..9448e39 100644
+--- a/drivers/vfio/vfio_iommu_spapr_tce.c
++++ b/drivers/vfio/vfio_iommu_spapr_tce.c
+@@ -29,6 +29,51 @@
+ static void tce_iommu_detach_group(void *iommu_data,
+ 		struct iommu_group *iommu_group);
  
- 	if (!tbl || !tbl->it_map) {
--		printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
--				node_name);
-+		kfree(tbl);
++static long try_increment_locked_vm(long npages)
++{
++	long ret = 0, locked, lock_limit;
++
++	if (!current || !current->mm)
++		return -ESRCH; /* process exited */
++
++	if (!npages)
++		return 0;
++
++	down_write(&current->mm->mmap_sem);
++	locked = current->mm->locked_vm + npages;
++	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
++	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
++		ret = -ENOMEM;
++	else
++		current->mm->locked_vm += npages;
++
++	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
++			npages << PAGE_SHIFT,
++			current->mm->locked_vm << PAGE_SHIFT,
++			rlimit(RLIMIT_MEMLOCK),
++			ret ? " - exceeded" : "");
++
++	up_write(&current->mm->mmap_sem);
++
++	return ret;
++}
++
++static void decrement_locked_vm(long npages)
++{
++	if (!current || !current->mm || !npages)
++		return; /* process exited */
++
++	down_write(&current->mm->mmap_sem);
++	if (npages > current->mm->locked_vm)
++		npages = current->mm->locked_vm;
++	current->mm->locked_vm -= npages;
++	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
++			npages << PAGE_SHIFT,
++			current->mm->locked_vm << PAGE_SHIFT,
++			rlimit(RLIMIT_MEMLOCK));
++	up_write(&current->mm->mmap_sem);
++}
++
+ /*
+  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+  *
+@@ -45,6 +90,7 @@ struct tce_container {
+ 	struct mutex lock;
+ 	struct iommu_table *tbl;
+ 	bool enabled;
++	unsigned long locked_pages;
+ };
+ 
+ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+@@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+ static int tce_iommu_enable(struct tce_container *container)
+ {
+ 	int ret = 0;
+-	unsigned long locked, lock_limit, npages;
++	unsigned long locked;
+ 	struct iommu_table *tbl = container->tbl;
+ 
+ 	if (!container->tbl)
+@@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container *container)
+ 	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
+ 	 * that would effectively kill the guest at random points, much better
+ 	 * enforcing the limit based on the max that the guest can map.
++	 *
++	 * Unfortunately at the moment it counts whole tables, no matter how
++	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
++	 * each with 2GB DMA window, 8GB will be counted here. The reason for
++	 * this is that we cannot tell here the amount of RAM used by the guest
++	 * as this information is only available from KVM and VFIO is
++	 * KVM agnostic.
+ 	 */
+-	down_write(&current->mm->mmap_sem);
+-	npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+-	locked = current->mm->locked_vm + npages;
+-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+-	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+-		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+-				rlimit(RLIMIT_MEMLOCK));
+-		ret = -ENOMEM;
+-	} else {
++	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
++	ret = try_increment_locked_vm(locked);
++	if (ret)
++		return ret;
+ 
+-		current->mm->locked_vm += npages;
+-		container->enabled = true;
+-	}
+-	up_write(&current->mm->mmap_sem);
++	container->locked_pages = locked;
++
++	container->enabled = true;
+ 
+ 	return ret;
+ }
+@@ -115,13 +162,10 @@ static void tce_iommu_disable(struct tce_container *container)
+ 
+ 	container->enabled = false;
+ 
+-	if (!container->tbl || !current->mm)
++	if (!current->mm)
  		return;
- 	}
  
+-	down_write(&current->mm->mmap_sem);
+-	current->mm->locked_vm -= (container->tbl->it_size <<
+-			container->tbl->it_page_shift) >> PAGE_SHIFT;
+-	up_write(&current->mm->mmap_sem);
++	decrement_locked_vm(container->locked_pages);
+ }
+ 
+ static void *tce_iommu_open(unsigned long arg)
 -- 
-2.4.0.rc3.8.gfb3e7d5
+2.0.0
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help