--- v1
+++ v10
@@ -1,108 +1,156 @@
-The previous patch introduced iommu_table_ops::exchange() callback
-which effectively disabled VFIO on pseries. This implements exchange()
-for pseries/lpar so VFIO can work in nested guests.
+This moves locked pages accounting to helpers.
+Later they will be reused for Dynamic DMA windows (DDW).
-Since exchaange() callback returns an old TCE, it has to call H_GET_TCE
-for every TCE being put to the table so VFIO performance in guests
-running under PR KVM is expected to be slower than in guests running under
-HV KVM or bare metal hosts.
+This reworks debug messages to show the current value and the limit.
+
+This stores the locked pages number in the container so when unlocking
+the iommu table pointer won't be needed. This does not have an effect
+now but it will with multiple tables per container, as then we will
+allow attaching/detaching groups on the fly and we may end up having
+a container with no group attached but with the counter incremented.
+
+While we are here, update the comment explaining why RLIMIT_MEMLOCK
+might be required to be bigger than the guest RAM. This also prints
+the pid of the current process in pr_warn/pr_debug.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
+[aw: for the vfio related changes]
+Acked-by: Alex Williamson <alex.williamson@redhat.com>
+Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
---
- arch/powerpc/platforms/pseries/iommu.c | 25 +++++++++++++++++++++++--
- 1 file changed, 23 insertions(+), 2 deletions(-)
+Changes:
+v4:
+* new helpers do nothing if @npages == 0
+* tce_iommu_disable() now can decrement the counter if the group was
+detached (not possible now but will be in the future)
+---
+ drivers/vfio/vfio_iommu_spapr_tce.c | 82 ++++++++++++++++++++++++++++---------
+ 1 file changed, 63 insertions(+), 19 deletions(-)
-diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
-index 9a7364f..ae15b5a 100644
---- a/arch/powerpc/platforms/pseries/iommu.c
-+++ b/arch/powerpc/platforms/pseries/iommu.c
-@@ -138,13 +138,14 @@ static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long);
+diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
+index 64300cc..40583f9 100644
+--- a/drivers/vfio/vfio_iommu_spapr_tce.c
++++ b/drivers/vfio/vfio_iommu_spapr_tce.c
+@@ -29,6 +29,51 @@
+ static void tce_iommu_detach_group(void *iommu_data,
+ struct iommu_group *iommu_group);
- static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
- long npages, unsigned long uaddr,
-+ unsigned long *old_tces,
- enum dma_data_direction direction,
- struct dma_attrs *attrs)
++static long try_increment_locked_vm(long npages)
++{
++ long ret = 0, locked, lock_limit;
++
++ if (!current || !current->mm)
++ return -ESRCH; /* process exited */
++
++ if (!npages)
++ return 0;
++
++ down_write(&current->mm->mmap_sem);
++ locked = current->mm->locked_vm + npages;
++ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
++ if (locked > lock_limit && !capable(CAP_IPC_LOCK))
++ ret = -ENOMEM;
++ else
++ current->mm->locked_vm += npages;
++
++ pr_debug("[%d] RLIMIT_MEMLOCK +%ld %ld/%ld%s\n", current->pid,
++ npages << PAGE_SHIFT,
++ current->mm->locked_vm << PAGE_SHIFT,
++ rlimit(RLIMIT_MEMLOCK),
++ ret ? " - exceeded" : "");
++
++ up_write(&current->mm->mmap_sem);
++
++ return ret;
++}
++
++static void decrement_locked_vm(long npages)
++{
++ if (!current || !current->mm || !npages)
++ return; /* process exited */
++
++ down_write(&current->mm->mmap_sem);
++ if (npages > current->mm->locked_vm)
++ npages = current->mm->locked_vm;
++ current->mm->locked_vm -= npages;
++ pr_debug("[%d] RLIMIT_MEMLOCK -%ld %ld/%ld\n", current->pid,
++ npages << PAGE_SHIFT,
++ current->mm->locked_vm << PAGE_SHIFT,
++ rlimit(RLIMIT_MEMLOCK));
++ up_write(&current->mm->mmap_sem);
++}
++
+ /*
+ * VFIO IOMMU fd for SPAPR_TCE IOMMU implementation
+ *
+@@ -45,6 +90,7 @@ struct tce_container {
+ struct mutex lock;
+ struct iommu_table *tbl;
+ bool enabled;
++ unsigned long locked_pages;
+ };
+
+ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+@@ -60,7 +106,7 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+ static int tce_iommu_enable(struct tce_container *container)
{
- u64 rc = 0;
- u64 proto_tce, tce;
- u64 rpn;
-- int ret = 0;
-+ int ret = 0, i = 0;
- long tcenum_start = tcenum, npages_start = npages;
+ int ret = 0;
+- unsigned long locked, lock_limit, npages;
++ unsigned long locked;
+ struct iommu_table *tbl = container->tbl;
- rpn = __pa(uaddr) >> TCE_SHIFT;
-@@ -154,6 +155,9 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
+ if (!container->tbl)
+@@ -89,21 +135,22 @@ static int tce_iommu_enable(struct tce_container *container)
+ * Also we don't have a nice way to fail on H_PUT_TCE due to ulimits,
+ * that would effectively kill the guest at random points, much better
+ * enforcing the limit based on the max that the guest can map.
++ *
++ * Unfortunately at the moment it counts whole tables, no matter how
++ * much memory the guest has. I.e. for 4GB guest and 4 IOMMU groups
++ * each with 2GB DMA window, 8GB will be counted here. The reason for
++ * this is that we cannot tell here the amount of RAM used by the guest
++ * as this information is only available from KVM and VFIO is
++ * KVM agnostic.
+ */
+- down_write(&current->mm->mmap_sem);
+- npages = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
+- locked = current->mm->locked_vm + npages;
+- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+- if (locked > lock_limit && !capable(CAP_IPC_LOCK)) {
+- pr_warn("RLIMIT_MEMLOCK (%ld) exceeded\n",
+- rlimit(RLIMIT_MEMLOCK));
+- ret = -ENOMEM;
+- } else {
++ locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
++ ret = try_increment_locked_vm(locked);
++ if (ret)
++ return ret;
- while (npages--) {
- tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT;
-+ if (old_tces)
-+ plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12,
-+ &old_tces[i++]);
- rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce);
+- current->mm->locked_vm += npages;
+- container->enabled = true;
+- }
+- up_write(&current->mm->mmap_sem);
++ container->locked_pages = locked;
++
++ container->enabled = true;
- if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) {
-@@ -179,8 +183,9 @@ static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum,
-
- static DEFINE_PER_CPU(__be64 *, tce_page);
-
--static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
-+static int tce_xchg_pSeriesLP(struct iommu_table *tbl, long tcenum,
- long npages, unsigned long uaddr,
-+ unsigned long *old_tces,
- enum dma_data_direction direction,
- struct dma_attrs *attrs)
- {
-@@ -195,6 +200,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
-
- if ((npages == 1) || !firmware_has_feature(FW_FEATURE_MULTITCE)) {
- return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
-+ old_tces,
- direction, attrs);
- }
-
-@@ -211,6 +217,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
- if (!tcep) {
- local_irq_restore(flags);
- return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr,
-+ old_tces,
- direction, attrs);
- }
- __get_cpu_var(tce_page) = tcep;
-@@ -232,6 +239,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
- for (l = 0; l < limit; l++) {
- tcep[l] = cpu_to_be64(proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT);
- rpn++;
-+ if (old_tces)
-+ plpar_tce_get((u64)tbl->it_index,
-+ (u64)(tcenum + l) << 12,
-+ &old_tces[tcenum + l]);
- }
-
- rc = plpar_tce_put_indirect((u64)tbl->it_index,
-@@ -262,6 +273,15 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
return ret;
}
+@@ -115,13 +162,10 @@ static void tce_iommu_disable(struct tce_container *container)
-+static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum,
-+ long npages, unsigned long uaddr,
-+ enum dma_data_direction direction,
-+ struct dma_attrs *attrs)
-+{
-+ return tce_xchg_pSeriesLP(tbl, tcenum, npages, uaddr, NULL,
-+ direction, attrs);
-+}
-+
- static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages)
- {
- u64 rc;
-@@ -637,6 +657,7 @@ static void pci_dma_bus_setup_pSeries(struct pci_bus *bus)
+ container->enabled = false;
- struct iommu_table_ops iommu_table_lpar_multi_ops = {
- .set = tce_buildmulti_pSeriesLP,
-+ .exchange = tce_xchg_pSeriesLP,
- .clear = tce_freemulti_pSeriesLP,
- .get = tce_get_pSeriesLP
- };
+- if (!container->tbl || !current->mm)
++ if (!current->mm)
+ return;
+
+- down_write(&current->mm->mmap_sem);
+- current->mm->locked_vm -= (container->tbl->it_size <<
+- container->tbl->it_page_shift) >> PAGE_SHIFT;
+- up_write(&current->mm->mmap_sem);
++ decrement_locked_vm(container->locked_pages);
+ }
+
+ static void *tce_iommu_open(unsigned long arg)
--
-2.0.0
+2.4.0.rc3.8.gfb3e7d5