--- v3
+++ v10
@@ -1,98 +1,156 @@
-This adds missing locks in iommu_take_ownership()/
-iommu_release_ownership().
+There moves locked pages accounting to helpers.
+Later they will be reused for Dynamic DMA windows (DDW).
-This marks all pages busy in iommu_table::it_map in order to catch
-errors if there is an attempt to use this table while ownership over it
-is taken.
+This reworks debug messages to show the current value and the limit.
-This only clears TCE content if there is no page marked busy in it_map.
-Clearing must be done outside of the table locks as iommu_clear_tce()
-called from iommu_clear_tces_and_put_pages() does this.
+This stores the locked pages number in the container so when unlocking
+the iommu table pointer won't be needed. This does not have an effect
+now but it will with the multiple tables per container as then we will
+allow attaching/detaching groups on fly and we may end up having
+a container with no group attached but with the counter incremented.
+
+While we are here, update the comment explaining why RLIMIT_MEMLOCK
+might be required to be bigger than the guest RAM. This also prints
+pid of the current process in pr_warn/pr_debug.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
+[aw: for the vfio related changes]
+Acked-by: Alex Williamson <alex.williamson@redhat.com>
+Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
-Note: we might want to get rid of it as this patchset removes it_map
-from tables passed to VFIO.
+Changes:
+v4:
+* new helpers do nothing if @npages == 0
+* tce_iommu_disable() now can decrement the counter if the group was
+detached (not possible now but will be in the future)
+---
+ drivers/vfio/vfio_iommu_spapr_tce.c | 82 ++++++++++++++++++++++++++++---------
+ 1 file changed, 63 insertions(+), 19 deletions(-)
-Changes:
-v5:
-* do not store bit#0 value, it has to be set for zero-based table
-anyway
-* removed test_and_clear_bit
-* only disable bypass if succeeded
----
- arch/powerpc/kernel/iommu.c | 31 +++++++++++++++++++++++++------
- 1 file changed, 25 insertions(+), 6 deletions(-)
-
-diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
-index 952939f..407d0d6 100644
---- a/arch/powerpc/kernel/iommu.c
-+++ b/arch/powerpc/kernel/iommu.c
-@@ -1024,33 +1024,48 @@ EXPORT_SYMBOL_GPL(iommu_tce_build);
+diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
+index 64300cc..40583f9 100644
+--- a/drivers/vfio/vfio_iommu_spapr_tce.c
++++ b/drivers/vfio/vfio_iommu_spapr_tce.c
+@@ -29,6 +29,51 @@
+ static void tce_iommu_detach_group(void *iommu_data,
+ struct iommu_group *iommu_group);
- int iommu_take_ownership(struct iommu_table *tbl)
++static long try_increment_locked_vm(long npages)
++{
++ long ret = 0, locked, lock_limit;
++
++ if (!current || !current->mm)
++ return -ESRCH; /* process exited */
++
++ if (!npages)
++ return 0;
++
++ down_write(¤t->mm->mmap_sem);
++ locked = current->mm->locked_vm + npages;
++ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
++ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
++ ret = -ENOMEM;
++ else
++ current->mm->locked_vm += npages;
++
++ pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
++ npages << PAGE_SHIFT,
++ current->mm->locked_vm << PAGE_SHIFT,
++ rlimit(RLIMIT_MEMLOCK),
++ ret ? " - exceeded" : "");
++
++ up_write(¤t->mm->mmap_sem);
++
++ return ret;
++}
++
++static void decrement_locked_vm(long npages)
++{
++ if (!current || !current->mm || !npages)
++ return; /* process exited */
++
++ down_write(¤t->mm->mmap_sem);
++ if (npages > current->mm->locked_vm)
++ npages = current->mm->locked_vm;
++ current->mm->locked_vm -= npages;
++ pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
++ npages << PAGE_SHIFT,
++ current->mm->locked_vm << PAGE_SHIFT,
++ rlimit(RLIMIT_MEMLOCK));
++ up_write(¤t->mm->mmap_sem);
++}
++
+ /*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ *
+@@ -45,6 +90,7 @@ struct tce_container {
+ struct mutex lock;
+ struct iommu_table *tbl;
+ bool enabled;
++ unsigned long locked_pages;
+ };
+
+ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+@@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+ static int tce_iommu_enable(struct tce_container *container)
{
-- unsigned long sz = (tbl->it_size + 7) >> 3;
-+ unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
-+ int ret = 0;
+ int ret = 0;
+- unsigned long locked, lock_limit, npages;
++ unsigned long locked;
+ struct iommu_table *tbl = container->tbl;
+
+ if (!container->tbl)
+@@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container *container)
+ * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
+ * that would effectively kill the guest at random points, much better
+ * enforcing the limit based on the max that the guest can map.
++ *
++ * Unfortunately at the moment it counts whole tables, no matter how
++ * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
++ * each with 2GB DMA window, 8GB will be counted here. The reason for
++ * this is that we cannot tell here the amount of RAM used by the guest
++ * as this information is only available from KVM and VFIO is
++ * KVM agnostic.
+ */
+- down_write(¤t->mm->mmap_sem);
+- npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+- locked = current->mm->locked_vm + npages;
+- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+- if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+- pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+- rlimit(RLIMIT_MEMLOCK));
+- ret = -ENOMEM;
+- } else {
++ locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
++ ret = try_increment_locked_vm(locked);
++ if (ret)
++ return ret;
+
+- current->mm->locked_vm += npages;
+- container->enabled = true;
+- }
+- up_write(¤t->mm->mmap_sem);
++ container->locked_pages = locked;
+
-+ spin_lock_irqsave(&tbl->large_pool.lock, flags);
-+ for (i = 0; i < tbl->nr_pools; i++)
-+ spin_lock(&tbl->pools[i].lock);
++ container->enabled = true;
- if (tbl->it_offset == 0)
- clear_bit(0, tbl->it_map);
+ return ret;
+ }
+@@ -115,13 +162,10 @@ static void tce_iommu_disable(struct tce_container *container)
- if (!bitmap_empty(tbl->it_map, tbl->it_size)) {
- pr_err("iommu_tce: it_map is not empty");
-- return -EBUSY;
-+ ret = -EBUSY;
-+ if (tbl->it_offset == 0)
-+ set_bit(0, tbl->it_map);
-+ } else {
-+ memset(tbl->it_map, 0xff, sz);
- }
+ container->enabled = false;
-- memset(tbl->it_map, 0xff, sz);
-+ for (i = 0; i < tbl->nr_pools; i++)
-+ spin_unlock(&tbl->pools[i].lock);
-+ spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
+- if (!container->tbl || !current->mm)
++ if (!current->mm)
+ return;
- /*
- * Disable iommu bypass, otherwise the user can DMA to all of
- * our physical memory via the bypass window instead of just
- * the pages that has been explicitly mapped into the iommu
- */
-- if (tbl->set_bypass)
-+ if (!ret && tbl->set_bypass)
- tbl->set_bypass(tbl, false);
+- down_write(¤t->mm->mmap_sem);
+- current->mm->locked_vm -= (container->tbl->it_size <<
+- container->tbl->it_page_shift) >> PAGE_SHIFT;
+- up_write(¤t->mm->mmap_sem);
++ decrement_locked_vm(container->locked_pages);
+ }
-- return 0;
-+ return ret;
- }
- EXPORT_SYMBOL_GPL(iommu_take_ownership);
-
- void iommu_release_ownership(struct iommu_table *tbl)
- {
-- unsigned long sz = (tbl->it_size + 7) >> 3;
-+ unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
-+
-+ spin_lock_irqsave(&tbl->large_pool.lock, flags);
-+ for (i = 0; i < tbl->nr_pools; i++)
-+ spin_lock(&tbl->pools[i].lock);
-
- memset(tbl->it_map, 0, sz);
-
-@@ -1058,6 +1073,10 @@ void iommu_release_ownership(struct iommu_table *tbl)
- if (tbl->it_offset == 0)
- set_bit(0, tbl->it_map);
-
-+ for (i = 0; i < tbl->nr_pools; i++)
-+ spin_unlock(&tbl->pools[i].lock);
-+ spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
-+
- /* The kernel owns the device now, we can restore the iommu bypass */
- if (tbl->set_bypass)
- tbl->set_bypass(tbl, true);
+ static void *tce_iommu_open(unsigned long arg)
--
-2.0.0
+2.4.0.rc3.8.gfb3e7d5