--- v10
+++ v7
@@ -1,29 +1,153 @@
-At the moment iommu_free_table() only releases memory if
-the table was initialized for the platform code use, i.e. it had
-it_map initialized (whose purpose is to track DMA memory space use).
+This moves locked pages accounting to helpers.
+Later they will be reused for Dynamic DMA windows (DDW).
-With dynamic DMA windows, we will need to be able to release
-iommu_table even if it was used for VFIO, in which case it_map is NULL;
-this patch makes that possible.
+This reworks debug messages to show the current value and the limit.
+
+This stores the number of locked pages in the container so that the
+iommu table pointer is not needed when unlocking. This has no effect
+now, but it will with multiple tables per container, as then we will
+allow attaching/detaching groups on the fly and we may end up having
+a container with no group attached but with the counter incremented.
+
+While we are here, update the comment explaining why RLIMIT_MEMLOCK
+might be required to be bigger than the guest RAM. This also prints
+the pid of the current process in pr_warn/pr_debug.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
- arch/powerpc/kernel/iommu.c | 3 +--
- 1 file changed, 1 insertion(+), 2 deletions(-)
+Changes:
+v4:
+* new helpers do nothing if @npages == 0
+* tce_iommu_disable() now can decrement the counter if the group was
+detached (not possible now but will be in the future)
+---
+ drivers/vfio/vfio_iommu_spapr_tce.c | 82 ++++++++++++++++++++++++++++---------
+ 1 file changed, 63 insertions(+), 19 deletions(-)
-diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
-index 3d47eb3..2c02d4c 100644
---- a/arch/powerpc/kernel/iommu.c
-+++ b/arch/powerpc/kernel/iommu.c
-@@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
- unsigned int order;
+diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
+index 8bbee22..9448e39 100644
+--- a/drivers/vfio/vfio_iommu_spapr_tce.c
++++ b/drivers/vfio/vfio_iommu_spapr_tce.c
+@@ -29,6 +29,51 @@
+ static void tce_iommu_detach_group(void *iommu_data,
+ struct iommu_group *iommu_group);
- if (!tbl || !tbl->it_map) {
-- printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
-- node_name);
-+ kfree(tbl);
++static long try_increment_locked_vm(long npages)
++{
++ long ret = 0, locked, lock_limit;
++
++ if (!current || !current->mm)
++ return -ESRCH; /* process exited */
++
++ if (!npages)
++ return 0;
++
++ down_write(&current->mm->mmap_sem);
++ locked = current->mm->locked_vm + npages;
++ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
++ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
++ ret = -ENOMEM;
++ else
++ current->mm->locked_vm += npages;
++
++ pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
++ npages << PAGE_SHIFT,
++ current->mm->locked_vm << PAGE_SHIFT,
++ rlimit(RLIMIT_MEMLOCK),
++ ret ? " - exceeded" : "");
++
++ up_write(&current->mm->mmap_sem);
++
++ return ret;
++}
++
++static void decrement_locked_vm(long npages)
++{
++ if (!current || !current->mm || !npages)
++ return; /* process exited */
++
++ down_write(&current->mm->mmap_sem);
++ if (npages > current->mm->locked_vm)
++ npages = current->mm->locked_vm;
++ current->mm->locked_vm -= npages;
++ pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
++ npages << PAGE_SHIFT,
++ current->mm->locked_vm << PAGE_SHIFT,
++ rlimit(RLIMIT_MEMLOCK));
++ up_write(&current->mm->mmap_sem);
++}
++
+ /*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ *
+@@ -45,6 +90,7 @@ struct tce_container {
+ struct mutex lock;
+ struct iommu_table *tbl;
+ bool enabled;
++ unsigned long locked_pages;
+ };
+
+ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+@@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+ static int tce_iommu_enable(struct tce_container *container)
+ {
+ int ret = 0;
+- unsigned long locked, lock_limit, npages;
++ unsigned long locked;
+ struct iommu_table *tbl = container->tbl;
+
+ if (!container->tbl)
+@@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container *container)
+ * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
+ * that would effectively kill the guest at random points, much better
+ * enforcing the limit based on the max that the guest can map.
++ *
++ * Unfortunately at the moment it counts whole tables, no matter how
++ * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
++ * each with 2GB DMA window, 8GB will be counted here. The reason for
++ * this is that we cannot tell here the amount of RAM used by the guest
++ * as this information is only available from KVM and VFIO is
++ * KVM agnostic.
+ */
+- down_write(&current->mm->mmap_sem);
+- npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+- locked = current->mm->locked_vm + npages;
+- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+- if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+- pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+- rlimit(RLIMIT_MEMLOCK));
+- ret = -ENOMEM;
+- } else {
++ locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
++ ret = try_increment_locked_vm(locked);
++ if (ret)
++ return ret;
+
+- current->mm->locked_vm += npages;
+- container->enabled = true;
+- }
+- up_write(&current->mm->mmap_sem);
++ container->locked_pages = locked;
++
++ container->enabled = true;
+
+ return ret;
+ }
+@@ -115,13 +162,10 @@ static void tce_iommu_disable(struct tce_container *container)
+
+ container->enabled = false;
+
+- if (!container->tbl || !current->mm)
++ if (!current->mm)
return;
- }
+- down_write(&current->mm->mmap_sem);
+- current->mm->locked_vm -= (container->tbl->it_size <<
+- container->tbl->it_page_shift) >> PAGE_SHIFT;
+- up_write(&current->mm->mmap_sem);
++ decrement_locked_vm(container->locked_pages);
+ }
+
+ static void *tce_iommu_open(unsigned long arg)
--
-2.4.0.rc3.8.gfb3e7d5
+2.0.0