--- v9
+++ v10
@@ -1,17 +1,15 @@
-There moves locked pages accounting to helpers.
-Later they will be reused for Dynamic DMA windows (DDW).
-
-This reworks debug messages to show the current value and the limit.
-
-This stores the locked pages number in the container so when unlocking
-the iommu table pointer won't be needed. This does not have an effect
-now but it will with the multiple tables per container as then we will
-allow attaching/detaching groups on fly and we may end up having
-a container with no group attached but with the counter incremented.
-
-While we are here, update the comment explaining why RLIMIT_MEMLOCK
-might be required to be bigger than the guest RAM. This also prints
-pid of the current process in pr_warn/pr_debug.
+This moves page pinning (get_user_pages_fast()/put_page()) code out of
+the platform IOMMU code and puts it to VFIO IOMMU driver where it belongs
+to as the platform code does not deal with page pinning.
+
+This makes iommu_take_ownership()/iommu_release_ownership() deal with
+the IOMMU table bitmap only.
+
+This removes page unpinning from iommu_take_ownership() as the actual
+TCE table might contain garbage and doing put_page() on it is undefined
+behaviour.
+
+Besides the last part, the rest of the patch is mechanical.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
[aw: for the vfio related changes]
@@ -19,138 +17,239 @@
Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
Changes:
+v9:
+* added missing tce_iommu_clear call after iommu_release_ownership()
+* brought @offset (a local variable) back to make patch even more
+mechanical
+
v4:
-* new helpers do nothing if @npages == 0
-* tce_iommu_disable() now can decrement the counter if the group was
-detached (not possible now but will be in the future)
+* s/iommu_tce_build(tbl, entry + 1/iommu_tce_build(tbl, entry + i/
---
- drivers/vfio/vfio_iommu_spapr_tce.c | 82 ++++++++++++++++++++++++++++---------
- 1 file changed, 63 insertions(+), 19 deletions(-)
-
+ arch/powerpc/include/asm/iommu.h | 4 --
+ arch/powerpc/kernel/iommu.c | 55 -------------------------
+ drivers/vfio/vfio_iommu_spapr_tce.c | 80 +++++++++++++++++++++++++++++++------
+ 3 files changed, 67 insertions(+), 72 deletions(-)
+
+diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
+index 8353c86..e94a5e3 100644
+--- a/arch/powerpc/include/asm/iommu.h
++++ b/arch/powerpc/include/asm/iommu.h
+@@ -194,10 +194,6 @@ extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
+ unsigned long hwaddr, enum dma_data_direction direction);
+ extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
+ unsigned long entry);
+-extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
+- unsigned long entry, unsigned long pages);
+-extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
+- unsigned long entry, unsigned long tce);
+
+ extern void iommu_flush_tce(struct iommu_table *tbl);
+ extern int iommu_take_ownership(struct iommu_table *tbl);
+diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
+index 2c02d4c..8673c94 100644
+--- a/arch/powerpc/kernel/iommu.c
++++ b/arch/powerpc/kernel/iommu.c
+@@ -983,30 +983,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
+ }
+ EXPORT_SYMBOL_GPL(iommu_clear_tce);
+
+-int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
+- unsigned long entry, unsigned long pages)
+-{
+- unsigned long oldtce;
+- struct page *page;
+-
+- for ( ; pages; --pages, ++entry) {
+- oldtce = iommu_clear_tce(tbl, entry);
+- if (!oldtce)
+- continue;
+-
+- page = pfn_to_page(oldtce >> PAGE_SHIFT);
+- WARN_ON(!page);
+- if (page) {
+- if (oldtce & TCE_PCI_WRITE)
+- SetPageDirty(page);
+- put_page(page);
+- }
+- }
+-
+- return 0;
+-}
+-EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
+-
+ /*
+ * hwaddr is a kernel virtual address here (0xc... bazillion),
+ * tce_build converts it to a physical address.
+@@ -1036,35 +1012,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
+ }
+ EXPORT_SYMBOL_GPL(iommu_tce_build);
+
+-int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
+- unsigned long tce)
+-{
+- int ret;
+- struct page *page = NULL;
+- unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
+- enum dma_data_direction direction = iommu_tce_direction(tce);
+-
+- ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+- direction != DMA_TO_DEVICE, &page);
+- if (unlikely(ret != 1)) {
+- /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
+- tce, entry << tbl->it_page_shift, ret); */
+- return -EFAULT;
+- }
+- hwaddr = (unsigned long) page_address(page) + offset;
+-
+- ret = iommu_tce_build(tbl, entry, hwaddr, direction);
+- if (ret)
+- put_page(page);
+-
+- if (ret < 0)
+- pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
+- __func__, entry << tbl->it_page_shift, tce, ret);
+-
+- return ret;
+-}
+-EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
+-
+ int iommu_take_ownership(struct iommu_table *tbl)
+ {
+ unsigned long sz = (tbl->it_size + 7) >> 3;
+@@ -1078,7 +1025,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
+ }
+
+ memset(tbl->it_map, 0xff, sz);
+- iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+
+ /*
+ * Disable iommu bypass, otherwise the user can DMA to all of
+@@ -1096,7 +1042,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
+ {
+ unsigned long sz = (tbl->it_size + 7) >> 3;
+
+- iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+ memset(tbl->it_map, 0, sz);
+
+ /* Restore bit#0 set by iommu_init_table() */
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
-index 64300cc..40583f9 100644
+index 730b4ef..b95fa2b 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
-@@ -29,6 +29,51 @@
- static void tce_iommu_detach_group(void *iommu_data,
- struct iommu_group *iommu_group);
-
-+static long try_increment_locked_vm(long npages)
+@@ -147,6 +147,67 @@ static void tce_iommu_release(void *iommu_data)
+ kfree(container);
+ }
+
++static int tce_iommu_clear(struct tce_container *container,
++ struct iommu_table *tbl,
++ unsigned long entry, unsigned long pages)
+{
-+ long ret = 0, locked, lock_limit;
-+
-+ if (!current || !current->mm)
-+ return -ESRCH; /* process exited */
-+
-+ if (!npages)
-+ return 0;
-+
-+ down_write(¤t->mm->mmap_sem);
-+ locked = current->mm->locked_vm + npages;
-+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-+ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
-+ ret = -ENOMEM;
-+ else
-+ current->mm->locked_vm += npages;
-+
-+ pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
-+ npages << PAGE_SHIFT,
-+ current->mm->locked_vm << PAGE_SHIFT,
-+ rlimit(RLIMIT_MEMLOCK),
-+ ret ? " - exceeded" : "");
-+
-+ up_write(¤t->mm->mmap_sem);
++ unsigned long oldtce;
++ struct page *page;
++
++ for ( ; pages; --pages, ++entry) {
++ oldtce = iommu_clear_tce(tbl, entry);
++ if (!oldtce)
++ continue;
++
++ page = pfn_to_page(oldtce >> PAGE_SHIFT);
++ WARN_ON(!page);
++ if (page) {
++ if (oldtce & TCE_PCI_WRITE)
++ SetPageDirty(page);
++ put_page(page);
++ }
++ }
++
++ return 0;
++}
++
++static long tce_iommu_build(struct tce_container *container,
++ struct iommu_table *tbl,
++ unsigned long entry, unsigned long tce, unsigned long pages)
++{
++ long i, ret = 0;
++ struct page *page = NULL;
++ unsigned long hva;
++ enum dma_data_direction direction = iommu_tce_direction(tce);
++
++ for (i = 0; i < pages; ++i) {
++ unsigned long offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
++
++ ret = get_user_pages_fast(tce & PAGE_MASK, 1,
++ direction != DMA_TO_DEVICE, &page);
++ if (unlikely(ret != 1)) {
++ ret = -EFAULT;
++ break;
++ }
++ hva = (unsigned long) page_address(page) + offset;
++
++ ret = iommu_tce_build(tbl, entry + i, hva, direction);
++ if (ret) {
++ put_page(page);
++ pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
++ __func__, entry << tbl->it_page_shift,
++ tce, ret);
++ break;
++ }
++ tce += IOMMU_PAGE_SIZE_4K;
++ }
++
++ if (ret)
++ tce_iommu_clear(container, tbl, entry, i);
+
+ return ret;
+}
+
-+static void decrement_locked_vm(long npages)
-+{
-+ if (!current || !current->mm || !npages)
-+ return; /* process exited */
-+
-+ down_write(¤t->mm->mmap_sem);
-+ if (npages > current->mm->locked_vm)
-+ npages = current->mm->locked_vm;
-+ current->mm->locked_vm -= npages;
-+ pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
-+ npages << PAGE_SHIFT,
-+ current->mm->locked_vm << PAGE_SHIFT,
-+ rlimit(RLIMIT_MEMLOCK));
-+ up_write(¤t->mm->mmap_sem);
-+}
-+
- /*
- * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
- *
-@@ -45,6 +90,7 @@ struct tce_container {
- struct mutex lock;
- struct iommu_table *tbl;
- bool enabled;
-+ unsigned long locked_pages;
- };
-
- static bool tce_page_is_contained(struct page *page, unsigned page_shift)
-@@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
- static int tce_iommu_enable(struct tce_container *container)
+ static long tce_iommu_ioctl(void *iommu_data,
+ unsigned int cmd, unsigned long arg)
{
- int ret = 0;
-- unsigned long locked, lock_limit, npages;
-+ unsigned long locked;
- struct iommu_table *tbl = container->tbl;
-
- if (!container->tbl)
-@@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container *container)
- * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
- * that would effectively kill the guest at random points, much better
- * enforcing the limit based on the max that the guest can map.
-+ *
-+ * Unfortunately at the moment it counts whole tables, no matter how
-+ * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
-+ * each with 2GB DMA window, 8GB will be counted here. The reason for
-+ * this is that we cannot tell here the amount of RAM used by the guest
-+ * as this information is only available from KVM and VFIO is
-+ * KVM agnostic.
- */
-- down_write(¤t->mm->mmap_sem);
-- npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
-- locked = current->mm->locked_vm + npages;
-- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
-- if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
-- pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
-- rlimit(RLIMIT_MEMLOCK));
-- ret = -ENOMEM;
-- } else {
-+ locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
-+ ret = try_increment_locked_vm(locked);
-+ if (ret)
-+ return ret;
-
-- current->mm->locked_vm += npages;
-- container->enabled = true;
-- }
-- up_write(¤t->mm->mmap_sem);
-+ container->locked_pages = locked;
-+
-+ container->enabled = true;
-
- return ret;
- }
-@@ -115,13 +162,10 @@ static void tce_iommu_disable(struct tce_container *container)
-
- container->enabled = false;
-
-- if (!container->tbl || !current->mm)
-+ if (!current->mm)
- return;
-
-- down_write(¤t->mm->mmap_sem);
-- current->mm->locked_vm -= (container->tbl->it_size <<
-- container->tbl->it_page_shift) >> PAGE_SHIFT;
-- up_write(¤t->mm->mmap_sem);
-+ decrement_locked_vm(container->locked_pages);
- }
-
- static void *tce_iommu_open(unsigned long arg)
+@@ -195,7 +256,7 @@ static long tce_iommu_ioctl(void *iommu_data,
+ case VFIO_IOMMU_MAP_DMA: {
+ struct vfio_iommu_type1_dma_map param;
+ struct iommu_table *tbl = container->tbl;
+- unsigned long tce, i;
++ unsigned long tce;
+
+ if (!tbl)
+ return -ENXIO;
+@@ -229,17 +290,9 @@ static long tce_iommu_ioctl(void *iommu_data,
+ if (ret)
+ return ret;
+
+- for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) {
+- ret = iommu_put_tce_user_mode(tbl,
+- (param.iova >> IOMMU_PAGE_SHIFT_4K) + i,
+- tce);
+- if (ret)
+- break;
+- tce += IOMMU_PAGE_SIZE_4K;
+- }
+- if (ret)
+- iommu_clear_tces_and_put_pages(tbl,
+- param.iova >> IOMMU_PAGE_SHIFT_4K, i);
++ ret = tce_iommu_build(container, tbl,
++ param.iova >> IOMMU_PAGE_SHIFT_4K,
++ tce, param.size >> IOMMU_PAGE_SHIFT_4K);
+
+ iommu_flush_tce(tbl);
+
+@@ -273,7 +326,7 @@ static long tce_iommu_ioctl(void *iommu_data,
+ if (ret)
+ return ret;
+
+- ret = iommu_clear_tces_and_put_pages(tbl,
++ ret = tce_iommu_clear(container, tbl,
+ param.iova >> IOMMU_PAGE_SHIFT_4K,
+ param.size >> IOMMU_PAGE_SHIFT_4K);
+ iommu_flush_tce(tbl);
+@@ -357,6 +410,7 @@ static void tce_iommu_detach_group(void *iommu_data,
+ /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
+ iommu_group_id(iommu_group), iommu_group); */
+ container->tbl = NULL;
++ tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
+ iommu_release_ownership(tbl);
+ }
+ mutex_unlock(&container->lock);
--
-2.0.0
+2.4.0.rc3.8.gfb3e7d5