--- v5
+++ v10
@@ -1,153 +1,29 @@
-This moves locked pages accounting to helpers.
-Later they will be reused for Dynamic DMA windows (DDW).
+At the moment iommu_free_table() only releases memory if
+the table was initialized for the platform code use, i.e. it had
+it_map initialized (whose purpose is to track DMA memory space use).
-This reworks debug messages to show the current value and the limit.
-
-This stores the locked pages number in the container so when unlocking
-the iommu table pointer won't be needed. This does not have an effect
-now but it will with the multiple tables per container as then we will
-allow attaching/detaching groups on the fly and we may end up having
-a container with no group attached but with the counter incremented.
-
-While we are here, update the comment explaining why RLIMIT_MEMLOCK
-might be required to be bigger than the guest RAM. This also prints
-pid of the current process in pr_warn/pr_debug.
+With dynamic DMA windows, we will need to be able to release
+iommu_table even if it was used for VFIO, in which case it_map is NULL,
+so this patch makes iommu_free_table() free the table in that case too.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
-Changes:
-v4:
-* new helpers do nothing if @npages == 0
-* tce_iommu_disable() now can decrement the counter if the group was
-detached (not possible now but will be in the future)
----
- drivers/vfio/vfio_iommu_spapr_tce.c | 82 ++++++++++++++++++++++++++++---------
- 1 file changed, 63 insertions(+), 19 deletions(-)
+ arch/powerpc/kernel/iommu.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
-diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
-index 0e37400..432a0de 100644
---- a/drivers/vfio/vfio_iommu_spapr_tce.c
-+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
-@@ -31,6 +31,51 @@
- static void tce_iommu_detach_group(void *iommu_data,
- struct iommu_group *iommu_group);
+diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
+index 3d47eb3..2c02d4c 100644
+--- a/arch/powerpc/kernel/iommu.c
++++ b/arch/powerpc/kernel/iommu.c
+@@ -714,8 +714,7 @@ void iommu_free_table(struct iommu_table *tbl, const char *node_name)
+ unsigned int order;
-+static long try_increment_locked_vm(long npages)
-+{
-+ long ret = 0, locked, lock_limit;
-+
-+ if (!current || !current->mm)
-+ return -ESRCH; /* process exited */
-+
-+ if (!npages)
-+ return 0;
-+
-+ down_write(&current->mm->mmap_sem);
-+ locked = current->mm->locked_vm + npages;
-+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-+ ret = -ENOMEM;
-+ else
-+ current->mm->locked_vm += npages;
-+
-+ pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
-+ npages << PAGE_SHIFT,
-+ current->mm->locked_vm << PAGE_SHIFT,
-+ rlimit(RLIMIT_MEMLOCK),
-+ ret ? " - exceeded" : "");
-+
-+ up_write(&current->mm->mmap_sem);
-+
-+ return ret;
-+}
-+
-+static void decrement_locked_vm(long npages)
-+{
-+ if (!current || !current->mm || !npages)
-+ return; /* process exited */
-+
-+ down_write(&current->mm->mmap_sem);
-+ if (npages > current->mm->locked_vm)
-+ npages = current->mm->locked_vm;
-+ current->mm->locked_vm -= npages;
-+ pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
-+ npages << PAGE_SHIFT,
-+ current->mm->locked_vm << PAGE_SHIFT,
-+ rlimit(RLIMIT_MEMLOCK));
-+ up_write(&current->mm->mmap_sem);
-+}
-+
- /*
- * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
- *
-@@ -47,6 +92,7 @@ struct tce_container {
- struct mutex lock;
- struct iommu_table *tbl;
- bool enabled;
-+ unsigned long locked_pages;
- };
+ if (!tbl || !tbl->it_map) {
+- printk(KERN_ERR "%s: expected TCE map for %s\n", __func__,
+- node_name);
++ kfree(tbl);
+ return;
+ }
- static bool tce_page_is_contained(struct page *page, unsigned page_shift)
-@@ -68,7 +114,7 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
- static int tce_iommu_enable(struct tce_container *container)
- {
- int ret = 0;
-- unsigned long locked, lock_limit, npages;
-+ unsigned long locked;
- struct iommu_table *tbl = container->tbl;
-
- if (!container->tbl)
-@@ -97,21 +143,22 @@ static int tce_iommu_enable(struct tce_container *container)
- * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
- * that would effectively kill the guest at random points, much better
- * enforcing the limit based on the max that the guest can map.
-+ *
-+ * Unfortunately at the moment it counts whole tables, no matter how
-+ * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
-+ * each with 2GB DMA window, 8GB will be counted here. The reason for
-+ * this is that we cannot tell here the amount of RAM used by the guest
-+ * as this information is only available from KVM and VFIO is
-+ * KVM agnostic.
- */
-- down_write(&current->mm->mmap_sem);
-- npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
-- locked = current->mm->locked_vm + npages;
-- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-- if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
-- pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
-- rlimit(RLIMIT_MEMLOCK));
-- ret = -ENOMEM;
-- } else {
-+ locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
-+ ret = try_increment_locked_vm(locked);
-+ if (ret)
-+ return ret;
-
-- current->mm->locked_vm += npages;
-- container->enabled = true;
-- }
-- up_write(&current->mm->mmap_sem);
-+ container->locked_pages = locked;
-+
-+ container->enabled = true;
-
- return ret;
- }
-@@ -123,13 +170,10 @@ static void tce_iommu_disable(struct tce_container *container)
-
- container->enabled = false;
-
-- if (!container->tbl || !current->mm)
-+ if (!current->mm)
- return;
-
-- down_write(&current->mm->mmap_sem);
-- current->mm->locked_vm -= (container->tbl->it_size <<
-- container->tbl->it_page_shift) >> PAGE_SHIFT;
-- up_write(&current->mm->mmap_sem);
-+ decrement_locked_vm(container->locked_pages);
- }
-
- static void *tce_iommu_open(unsigned long arg)
--
-2.0.0
+2.4.0.rc3.8.gfb3e7d5