Inter-revision diff: patch 9

Comparing v3 (message) to v10 (message)

--- v3
+++ v10
@@ -1,98 +1,156 @@
-This adds missing locks in iommu_take_ownership()/
-iommu_release_ownership().
+There moves locked pages accounting to helpers.
+Later they will be reused for Dynamic DMA windows (DDW).
 
-This marks all pages busy in iommu_table::it_map in order to catch
-errors if there is an attempt to use this table while ownership over it
-is taken.
+This reworks debug messages to show the current value and the limit.
 
-This only clears TCE content if there is no page marked busy in it_map.
-Clearing must be done outside of the table locks as iommu_clear_tce()
-called from iommu_clear_tces_and_put_pages() does this.
+This stores the locked pages number in the container so when unlocking
+the iommu table pointer won't be needed. This does not have an effect
+now but it will with the multiple tables per container as then we will
+allow attaching/detaching groups on fly and we may end up having
+a container with no group attached but with the counter incremented.
+
+While we are here, update the comment explaining why RLIMIT_MEMLOCK
+might be required to be bigger than the guest RAM. This also prints
+pid of the current process in pr_warn/pr_debug.
 
 Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
+[aw: for the vfio related changes]
+Acked-by: Alex Williamson <alex.williamson@redhat.com>
+Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
 ---
-Note: we might want to get rid of it as this patchset removes it_map
-from tables passed to VFIO.
+Changes:
+v4:
+* new helpers do nothing if @npages == 0
+* tce_iommu_disable() now can decrement the counter if the group was
+detached (not possible now but will be in the future)
+---
+ drivers/vfio/vfio_iommu_spapr_tce.c | 82 ++++++++++++++++++++++++++++---------
+ 1 file changed, 63 insertions(+), 19 deletions(-)
 
-Changes:
-v5:
-* do not store bit#0 value, it has to be set for zero-based table
-anyway
-* removed test_and_clear_bit
-* only disable bypass if succeeded
----
- arch/powerpc/kernel/iommu.c | 31 +++++++++++++++++++++++++------
- 1 file changed, 25 insertions(+), 6 deletions(-)
-
-diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
-index 952939f..407d0d6 100644
---- a/arch/powerpc/kernel/iommu.c
-+++ b/arch/powerpc/kernel/iommu.c
-@@ -1024,33 +1024,48 @@ EXPORT_SYMBOL_GPL(iommu_tce_build);
+diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
+index 64300cc..40583f9 100644
+--- a/drivers/vfio/vfio_iommu_spapr_tce.c
++++ b/drivers/vfio/vfio_iommu_spapr_tce.c
+@@ -29,6 +29,51 @@
+ static void tce_iommu_detach_group(void *iommu_data,
+ 		struct iommu_group *iommu_group);
  
- int iommu_take_ownership(struct iommu_table *tbl)
++static long try_increment_locked_vm(long npages)
++{
++	long ret = 0, locked, lock_limit;
++
++	if (!current || !current->mm)
++		return -ESRCH; /* process exited */
++
++	if (!npages)
++		return 0;
++
++	down_write(&current->mm->mmap_sem);
++	locked = current->mm->locked_vm + npages;
++	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
++	if (locked > lock_limit && !capable(CAP_IPC_LOCK))
++		ret = -ENOMEM;
++	else
++		current->mm->locked_vm += npages;
++
++	pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
++			npages << PAGE_SHIFT,
++			current->mm->locked_vm << PAGE_SHIFT,
++			rlimit(RLIMIT_MEMLOCK),
++			ret ? " - exceeded" : "");
++
++	up_write(&current->mm->mmap_sem);
++
++	return ret;
++}
++
++static void decrement_locked_vm(long npages)
++{
++	if (!current || !current->mm || !npages)
++		return; /* process exited */
++
++	down_write(&current->mm->mmap_sem);
++	if (npages > current->mm->locked_vm)
++		npages = current->mm->locked_vm;
++	current->mm->locked_vm -= npages;
++	pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
++			npages << PAGE_SHIFT,
++			current->mm->locked_vm << PAGE_SHIFT,
++			rlimit(RLIMIT_MEMLOCK));
++	up_write(&current->mm->mmap_sem);
++}
++
+ /*
+  * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+  *
+@@ -45,6 +90,7 @@ struct tce_container {
+ 	struct mutex lock;
+ 	struct iommu_table *tbl;
+ 	bool enabled;
++	unsigned long locked_pages;
+ };
+ 
+ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+@@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+ static int tce_iommu_enable(struct tce_container *container)
  {
--	unsigned long sz = (tbl->it_size + 7) >> 3;
-+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
-+	int ret = 0;
+ 	int ret = 0;
+-	unsigned long locked, lock_limit, npages;
++	unsigned long locked;
+ 	struct iommu_table *tbl = container->tbl;
+ 
+ 	if (!container->tbl)
+@@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container *container)
+ 	 * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
+ 	 * that would effectively kill the guest at random points, much better
+ 	 * enforcing the limit based on the max that the guest can map.
++	 *
++	 * Unfortunately at the moment it counts whole tables, no matter how
++	 * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
++	 * each with 2GB DMA window, 8GB will be counted here. The reason for
++	 * this is that we cannot tell here the amount of RAM used by the guest
++	 * as this information is only available from KVM and VFIO is
++	 * KVM agnostic.
+ 	 */
+-	down_write(&current->mm->mmap_sem);
+-	npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+-	locked = current->mm->locked_vm + npages;
+-	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+-	if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+-		pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+-				rlimit(RLIMIT_MEMLOCK));
+-		ret = -ENOMEM;
+-	} else {
++	locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
++	ret = try_increment_locked_vm(locked);
++	if (ret)
++		return ret;
+ 
+-		current->mm->locked_vm += npages;
+-		container->enabled = true;
+-	}
+-	up_write(&current->mm->mmap_sem);
++	container->locked_pages = locked;
 +
-+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
-+	for (i = 0; i < tbl->nr_pools; i++)
-+		spin_lock(&tbl->pools[i].lock);
++	container->enabled = true;
  
- 	if (tbl->it_offset == 0)
- 		clear_bit(0, tbl->it_map);
+ 	return ret;
+ }
+@@ -115,13 +162,10 @@ static void tce_iommu_disable(struct tce_container *container)
  
- 	if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
- 		pr_err("iommu_tce: it_map is not empty");
--		return -EBUSY;
-+		ret = -EBUSY;
-+		if (tbl->it_offset == 0)
-+			set_bit(0, tbl->it_map);
-+	} else {
-+		memset(tbl->it_map, 0xff, sz);
- 	}
+ 	container->enabled = false;
  
--	memset(tbl->it_map, 0xff, sz);
-+	for (i = 0; i < tbl->nr_pools; i++)
-+		spin_unlock(&tbl->pools[i].lock);
-+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
+-	if (!container->tbl || !current->mm)
++	if (!current->mm)
+ 		return;
  
- 	/*
- 	 * Disable iommu bypass, otherwise the user can DMA to all of
- 	 * our physical memory via the bypass window instead of just
- 	 * the pages that has been explicitly mapped into the iommu
- 	 */
--	if (tbl->set_bypass)
-+	if (!ret && tbl->set_bypass)
- 		tbl->set_bypass(tbl, false);
+-	down_write(&current->mm->mmap_sem);
+-	current->mm->locked_vm -= (container->tbl->it_size <<
+-			container->tbl->it_page_shift) >> PAGE_SHIFT;
+-	up_write(&current->mm->mmap_sem);
++	decrement_locked_vm(container->locked_pages);
+ }
  
--	return 0;
-+	return ret;
- }
- EXPORT_SYMBOL_GPL(iommu_take_ownership);
- 
- void iommu_release_ownership(struct iommu_table *tbl)
- {
--	unsigned long sz = (tbl->it_size + 7) >> 3;
-+	unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
-+
-+	spin_lock_irqsave(&tbl->large_pool.lock, flags);
-+	for (i = 0; i < tbl->nr_pools; i++)
-+		spin_lock(&tbl->pools[i].lock);
- 
- 	memset(tbl->it_map, 0, sz);
- 
-@@ -1058,6 +1073,10 @@ void iommu_release_ownership(struct iommu_table *tbl)
- 	if (tbl->it_offset == 0)
- 		set_bit(0, tbl->it_map);
- 
-+	for (i = 0; i < tbl->nr_pools; i++)
-+		spin_unlock(&tbl->pools[i].lock);
-+	spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
-+
- 	/* The kernel owns the device now, we can restore the iommu bypass */
- 	if (tbl->set_bypass)
- 		tbl->set_bypass(tbl, true);
+ static void *tce_iommu_open(unsigned long arg)
 -- 
-2.0.0
+2.4.0.rc3.8.gfb3e7d5
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help