--- v12
+++ v7
@@ -1,100 +1,121 @@
-This is a part of moving DMA window programming to an iommu_ops
-callback. pnv_pci_ioda2_set_window() takes an iommu_table_group as
-a first parameter (not pnv_ioda_pe) as it is going to be used as
-a callback for VFIO DDW code.
+In order to support memory pre-registration, we need a way to track
+the use of every registered memory region and only allow unregistration
+if a region is not in use anymore. So we need a way to tell which
+region a just-cleared TCE belonged to.
-This should cause no behavioural change.
+This adds a userspace view of the TCE table to the iommu_table struct.
+It contains userspace addresses, one per TCE entry. The table is only
+allocated when ownership of an IOMMU group is taken, which means
+it is only used from outside of the powernv code (such as VFIO).
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
-Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
-Reviewed-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
---
-Changes:
-v12:
-* removed comment from commit log about pnv_pci_ioda2_tvt_invalidate()/
-pnv_pci_ioda2_invalidate_entire()
+ arch/powerpc/include/asm/iommu.h | 6 ++++++
+ arch/powerpc/kernel/iommu.c | 7 +++++++
+ arch/powerpc/platforms/powernv/pci-ioda.c | 23 ++++++++++++++++++++++-
+ 3 files changed, 35 insertions(+), 1 deletion(-)
-v11:
-* replaced some 1<<it_page_shift with IOMMU_PAGE_SIZE() macro
-
-v9:
-* initialize pe->table_group.tables[0] at the very end when
-tbl is fully initialized
-* moved pnv_pci_ioda2_tvt_invalidate() from earlier patch
----
- arch/powerpc/platforms/powernv/pci-ioda.c | 47 +++++++++++++++++++++++++------
- 1 file changed, 38 insertions(+), 9 deletions(-)
-
+diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
+index 2c08c91..a768a4d 100644
+--- a/arch/powerpc/include/asm/iommu.h
++++ b/arch/powerpc/include/asm/iommu.h
+@@ -106,9 +106,15 @@ struct iommu_table {
+ unsigned long *it_map; /* A simple allocation bitmap for now */
+ unsigned long it_page_shift;/* table iommu page size */
+ struct iommu_table_group *it_group;
++ unsigned long *it_userspace; /* userspace view of the table */
+ struct iommu_table_ops *it_ops;
+ };
+
++#define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \
++ ((tbl)->it_userspace ? \
++ &((tbl)->it_userspace[(entry) - (tbl)->it_offset]) : \
++ NULL)
++
+ /* Pure 2^n version of get_order */
+ static inline __attribute_const__
+ int get_iommu_order(unsigned long size, struct iommu_table *tbl)
+diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
+index 0bcd988..82102d1 100644
+--- a/arch/powerpc/kernel/iommu.c
++++ b/arch/powerpc/kernel/iommu.c
+@@ -38,6 +38,7 @@
+ #include <linux/pci.h>
+ #include <linux/iommu.h>
+ #include <linux/sched.h>
++#include <linux/vmalloc.h>
+ #include <asm/io.h>
+ #include <asm/prom.h>
+ #include <asm/iommu.h>
+@@ -1069,6 +1070,9 @@ static int iommu_table_take_ownership(struct iommu_table *tbl)
+ spin_unlock(&tbl->pools[i].lock);
+ spin_unlock_irqrestore(&tbl->large_pool.lock, flags);
+
++ BUG_ON(tbl->it_userspace);
++ tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
++
+ return 0;
+ }
+
+@@ -1102,6 +1106,9 @@ static void iommu_table_release_ownership(struct iommu_table *tbl)
+ {
+ unsigned long flags, i, sz = (tbl->it_size + 7) >> 3;
+
++ vfree(tbl->it_userspace);
++ tbl->it_userspace = NULL;
++
+ spin_lock_irqsave(&tbl->large_pool.lock, flags);
+ for (i = 0; i < tbl->nr_pools; i++)
+ spin_lock(&tbl->pools[i].lock);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
-index 38d53dc..da14043 100644
+index bc36cf1..036f3c1 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
-@@ -1969,6 +1969,43 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
- }
+@@ -26,6 +26,7 @@
+ #include <linux/iommu.h>
+ #include <linux/mmzone.h>
+ #include <linux/sizes.h>
++#include <linux/vmalloc.h>
+
+ #include <asm/mmzone.h>
+ #include <asm/sections.h>
+@@ -1469,6 +1470,9 @@ static void pnv_pci_free_table(struct iommu_table *tbl)
+ if (!tbl->it_size)
+ return;
+
++ if (tbl->it_userspace)
++ vfree(tbl->it_userspace);
++
+ pnv_free_tce_table(tbl->it_base, size, tbl->it_indirect_levels);
+ iommu_reset_table(tbl, "ioda2");
+ }
+@@ -1656,9 +1660,26 @@ static void pnv_ioda2_set_ownership(struct iommu_table_group *table_group,
+ pnv_pci_ioda2_set_bypass(pe, !enable);
}
-+static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
-+ int num, struct iommu_table *tbl)
++static long pnv_pci_ioda2_create_table_with_uas(
++ struct iommu_table_group *table_group,
++ int num, __u32 page_shift, __u64 window_size, __u32 levels,
++ struct iommu_table *tbl)
+{
-+ struct pnv_ioda_pe *pe = container_of(table_group, struct pnv_ioda_pe,
-+ table_group);
-+ struct pnv_phb *phb = pe->phb;
-+ int64_t rc;
-+ const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
-+ const __u64 win_size = tbl->it_size << tbl->it_page_shift;
++ long ret = pnv_pci_ioda2_create_table(table_group, num,
++ page_shift, window_size, levels, tbl);
+
-+ pe_info(pe, "Setting up window %llx..%llx pg=%x\n",
-+ start_addr, start_addr + win_size - 1,
-+ IOMMU_PAGE_SIZE(tbl));
++ if (ret)
++ return ret;
+
-+ /*
-+ * Map TCE table through TVT. The TVE index is the PE number
-+ * shifted by 1 bit for 32-bits DMA space.
-+ */
-+ rc = opal_pci_map_pe_dma_window(phb->opal_id,
-+ pe->pe_number,
-+ pe->pe_number << 1,
-+ 1,
-+ __pa(tbl->it_base),
-+ tbl->it_size << 3,
-+ IOMMU_PAGE_SIZE(tbl));
-+ if (rc) {
-+ pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
-+ return rc;
-+ }
-+
-+ pnv_pci_link_table_and_group(phb->hose->node, num,
-+ tbl, &pe->table_group);
-+ pnv_pci_ioda2_tce_invalidate_entire(pe);
++ BUG_ON(tbl->it_userspace);
++ tbl->it_userspace = vzalloc(sizeof(*tbl->it_userspace) * tbl->it_size);
+
+ return 0;
+}
+
- static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable)
- {
- uint16_t window_id = (pe->pe_number << 1 ) + 1;
-@@ -2124,21 +2161,13 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
- pe->table_group.ops = &pnv_pci_ioda2_ops;
- #endif
-
-- /*
-- * Map TCE table through TVT. The TVE index is the PE number
-- * shifted by 1 bit for 32-bits DMA space.
-- */
-- rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
-- pe->pe_number << 1, 1, __pa(tbl->it_base),
-- tbl->it_size << 3, 1ULL << tbl->it_page_shift);
-+ rc = pnv_pci_ioda2_set_window(&pe->table_group, 0, tbl);
- if (rc) {
- pe_err(pe, "Failed to configure 32-bit TCE table,"
- " err %ld\n", rc);
- goto fail;
- }
-
-- pnv_pci_ioda2_tce_invalidate_entire(pe);
--
- /* OPAL variant of PHB3 invalidated TCEs */
- if (phb->ioda.tce_inval_reg)
- tbl->it_type |= (TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE);
+ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
+ .set_ownership = pnv_ioda2_set_ownership,
+- .create_table = pnv_pci_ioda2_create_table,
++ .create_table = pnv_pci_ioda2_create_table_with_uas,
+ .set_window = pnv_pci_ioda2_set_window,
+ .unset_window = pnv_pci_ioda2_unset_window,
+ };
--
-2.4.0.rc3.8.gfb3e7d5
+2.0.0