--- v4
+++ v12
@@ -1,398 +1,265 @@
-The existing implementation accounts the whole DMA window in
-the locked_vm counter which is going to be even worse with multiple
-containers and huge DMA windows.
-
-This introduces 2 ioctls to register/unregister DMA memory which
-receive user space address and size of the memory region which
-needs to be pinned/unpinned and counted in locked_vm.
-
-If any memory region was registered, all subsequent DMA map requests
-should address already pinned memory. If no memory was registered,
-then the amount of memory required for a single default memory will be
-accounted when the container is enabled and every map/unmap will pin/unpin
-a page.
-
-Dynamic DMA window and in-kernel acceleration will require memory to
-be registered in order to work.
-
-The accounting is done per VFIO container. When the support of
-multiple groups per container is added, we will have accurate locked_vm
-accounting.
+TCE tables might get too big in case of 4K IOMMU pages and DDW enabled
+on huge guests (hundreds of GB of RAM) so the kernel might be unable to
+allocate a contiguous chunk of physical memory to store the TCE table.
+
+To address this, POWER8 CPU (actually, IODA2) supports multi-level
+TCE tables, up to 5 levels, which split the table into a tree of
+smaller subtables.
+
+This adds multi-level TCE table support to
+pnv_pci_ioda2_table_alloc_pages() and pnv_pci_ioda2_table_free_pages()
+helpers.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
Changes:
-v4:
-* updated docs
-* s/kzmalloc/vzalloc/
-* in tce_pin_pages()/tce_unpin_pages() removed @vaddr, @size and
-replaced offset with index
-* renamed vfio_iommu_type_register_memory to vfio_iommu_spapr_register_memory
-and removed duplicating vfio_iommu_spapr_register_memory
+v12:
+* changed pnv_pci_ioda2_table_do_alloc_pages() to return NULL to
+pnv_pci_ioda2_table_alloc_pages() only if the first level allocation
+failed, otherwise it always returns non zero value
+* pnv_pci_ioda2_table_do_free_pages() now takes __be64* rather than
+unsigned long
+* s/tce_table_allocated/current_offset/
+
+v10:
+* fixed multiple comments received for v9
+
+v9:
+* moved from ioda2 to common powernv pci code
+* fixed cleanup if allocation fails in the middle
+* removed check for the size - all boundary checks happen in the calling code
+anyway
---
- drivers/vfio/vfio_iommu_spapr_tce.c | 222 ++++++++++++++++++++++++------------
- 1 file changed, 148 insertions(+), 74 deletions(-)
-
-diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
-index 4ff8289..ee91d51 100644
---- a/drivers/vfio/vfio_iommu_spapr_tce.c
-+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
-@@ -91,10 +91,16 @@ static void decrement_locked_vm(long npages)
- */
- struct tce_container {
- struct mutex lock;
-- struct iommu_group *grp;
- bool enabled;
- unsigned long locked_pages;
- struct list_head mem_list;
-+ struct iommu_table tables[POWERPC_IOMMU_MAX_TABLES];
-+ struct list_head group_list;
-+};
-+
-+struct tce_iommu_group {
-+ struct list_head next;
-+ struct iommu_group *grp;
- };
-
- struct tce_memory {
-@@ -300,19 +306,20 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
- return false;
- }
-
-+static inline bool tce_groups_attached(struct tce_container *container)
+ arch/powerpc/include/asm/iommu.h | 2 +
+ arch/powerpc/platforms/powernv/pci-ioda.c | 105 +++++++++++++++++++++++++++---
+ arch/powerpc/platforms/powernv/pci.c | 13 ++++
+ 3 files changed, 111 insertions(+), 9 deletions(-)
+
+diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
+index 4636734..706cfc0 100644
+--- a/arch/powerpc/include/asm/iommu.h
++++ b/arch/powerpc/include/asm/iommu.h
+@@ -96,6 +96,8 @@ struct iommu_pool {
+ struct iommu_table {
+ unsigned long it_busno; /* Bus number this table belongs to */
+ unsigned long it_size; /* Size of iommu table in entries */
++ unsigned long it_indirect_levels;
++ unsigned long it_level_size;
+ unsigned long it_offset; /* Offset into global table */
+ unsigned long it_base; /* mapped address of tce table */
+ unsigned long it_index; /* which iommu table this is */
+diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
+index da14043..a253dda 100644
+--- a/arch/powerpc/platforms/powernv/pci-ioda.c
++++ b/arch/powerpc/platforms/powernv/pci-ioda.c
+@@ -50,6 +50,9 @@
+ /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
+ #define TCE32_TABLE_SIZE ((0x10000000 / 0x1000) * 8)
+
++#define POWERNV_IOMMU_DEFAULT_LEVELS 1
++#define POWERNV_IOMMU_MAX_LEVELS 5
++
+ static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl);
+
+ static void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
+@@ -1976,6 +1979,8 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
+ table_group);
+ struct pnv_phb *phb = pe->phb;
+ int64_t rc;
++ const unsigned long size = tbl->it_indirect_levels ?
++ tbl->it_level_size : tbl->it_size;
+ const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
+ const __u64 win_size = tbl->it_size << tbl->it_page_shift;
+
+@@ -1990,9 +1995,9 @@ static long pnv_pci_ioda2_set_window(struct iommu_table_group *table_group,
+ rc = opal_pci_map_pe_dma_window(phb->opal_id,
+ pe->pe_number,
+ pe->pe_number << 1,
+- 1,
++ tbl->it_indirect_levels + 1,
+ __pa(tbl->it_base),
+- tbl->it_size << 3,
++ size << 3,
+ IOMMU_PAGE_SIZE(tbl));
+ if (rc) {
+ pe_err(pe, "Failed to configure TCE table, err %ld\n", rc);
+@@ -2072,11 +2077,16 @@ static void pnv_pci_ioda_setup_opal_tce_kill(struct pnv_phb *phb)
+ phb->ioda.tce_inval_reg = ioremap(phb->ioda.tce_inval_reg_phys, 8);
+ }
+
+-static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
++static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift,
++ unsigned levels, unsigned long limit,
++ unsigned long *current_offset)
+ {
+ struct page *tce_mem = NULL;
+- __be64 *addr;
++ __be64 *addr, *tmp;
+ unsigned order = max_t(unsigned, shift, PAGE_SHIFT) - PAGE_SHIFT;
++ unsigned long allocated = 1UL << (order + PAGE_SHIFT);
++ unsigned entries = 1UL << (shift - 3);
++ long i;
+
+ tce_mem = alloc_pages_node(nid, GFP_KERNEL, order);
+ if (!tce_mem) {
+@@ -2084,31 +2094,79 @@ static __be64 *pnv_pci_ioda2_table_do_alloc_pages(int nid, unsigned shift)
+ return NULL;
+ }
+ addr = page_address(tce_mem);
+- memset(addr, 0, 1UL << (order + PAGE_SHIFT));
++ memset(addr, 0, allocated);
++
++ --levels;
++ if (!levels) {
++ *current_offset += allocated;
++ return addr;
++ }
++
++ for (i = 0; i < entries; ++i) {
++ tmp = pnv_pci_ioda2_table_do_alloc_pages(nid, shift,
++ levels, limit, current_offset);
++ if (!tmp)
++ break;
++
++ addr[i] = cpu_to_be64(__pa(tmp) |
++ TCE_PCI_READ | TCE_PCI_WRITE);
++
++ if (*current_offset >= limit)
++ break;
++ }
+
+ return addr;
+ }
+
++static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
++ unsigned long size, unsigned level);
++
+ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+- __u32 page_shift, __u64 window_size, struct iommu_table *tbl)
++ __u32 page_shift, __u64 window_size, __u32 levels,
++ struct iommu_table *tbl)
+ {
+ void *addr;
++ unsigned long offset = 0, level_shift;
+ const unsigned window_shift = ilog2(window_size);
+ unsigned entries_shift = window_shift - page_shift;
+ unsigned table_shift = max_t(unsigned, entries_shift + 3, PAGE_SHIFT);
+ const unsigned long tce_table_size = 1UL << table_shift;
+
++ if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
++ return -EINVAL;
++
+ if ((window_size > memory_hotplug_max()) || !is_power_of_2(window_size))
+ return -EINVAL;
+
++ /* Adjust direct table size from window_size and levels */
++ entries_shift = (entries_shift + levels - 1) / levels;
++ level_shift = entries_shift + 3;
++ level_shift = max_t(unsigned, level_shift, PAGE_SHIFT);
++
+ /* Allocate TCE table */
+- addr = pnv_pci_ioda2_table_do_alloc_pages(nid, table_shift);
++ addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
++ levels, tce_table_size, &offset);
++
++ /* addr==NULL means that the first level allocation failed */
+ if (!addr)
+ return -ENOMEM;
+
++ /*
++ * First level was allocated but some lower level failed as
++ * we did not allocate as much as we wanted,
++ * release partially allocated table.
++ */
++ if (offset < tce_table_size) {
++ pnv_pci_ioda2_table_do_free_pages(addr,
++ 1ULL << (level_shift - 3), levels - 1);
++ return -ENOMEM;
++ }
++
+ /* Setup linux iommu table */
+ pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, bus_offset,
+ page_shift);
++ tbl->it_level_size = 1ULL << (level_shift - 3);
++ tbl->it_indirect_levels = levels - 1;
+
+ pr_devel("Created TCE table: ws=%08llx ts=%lx @%08llx\n",
+ window_size, tce_table_size, bus_offset);
+@@ -2116,12 +2174,40 @@ static long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
+ return 0;
+ }
+
++static void pnv_pci_ioda2_table_do_free_pages(__be64 *addr,
++ unsigned long size, unsigned level)
+{
-+ return !list_empty(&container->group_list);
++ const unsigned long addr_ul = (unsigned long) addr &
++ ~(TCE_PCI_READ | TCE_PCI_WRITE);
++
++ if (level) {
++ long i;
++ u64 *tmp = (u64 *) addr_ul;
++
++ for (i = 0; i < size; ++i) {
++ unsigned long hpa = be64_to_cpu(tmp[i]);
++
++ if (!(hpa & (TCE_PCI_READ | TCE_PCI_WRITE)))
++ continue;
++
++ pnv_pci_ioda2_table_do_free_pages(__va(hpa), size,
++ level - 1);
++ }
++ }
++
++ free_pages(addr_ul, get_order(size << 3));
+}
+
- static struct iommu_table *spapr_tce_find_table(
- struct tce_container *container,
- phys_addr_t ioba)
- {
- long i;
- struct iommu_table *ret = NULL;
-- struct powerpc_iommu *iommu = iommu_group_get_iommudata(container->grp);
--
-- if (!iommu)
-- return NULL;
-
- for (i = 0; i < POWERPC_IOMMU_MAX_TABLES; ++i) {
-- struct iommu_table *tbl = &iommu->tables[i];
-+ struct iommu_table *tbl = &container->tables[i];
- unsigned long entry = ioba >> tbl->it_page_shift;
- unsigned long start = tbl->it_offset;
- unsigned long end = start + tbl->it_size;
-@@ -330,11 +337,8 @@ static int tce_iommu_enable(struct tce_container *container)
- {
- int ret = 0;
- unsigned long locked;
-- struct iommu_table *tbl;
- struct powerpc_iommu *iommu;
--
-- if (!container->grp)
-- return -ENXIO;
-+ struct tce_iommu_group *tcegrp;
-
- if (!current->mm)
- return -ESRCH; /* process exited */
-@@ -368,12 +372,24 @@ static int tce_iommu_enable(struct tce_container *container)
- * KVM agnostic.
- */
- if (!tce_preregistered(container)) {
-- iommu = iommu_group_get_iommudata(container->grp);
-+ if (!tce_groups_attached(container))
-+ return -ENODEV;
-+
-+ tcegrp = list_first_entry(&container->group_list,
-+ struct tce_iommu_group, next);
-+ iommu = iommu_group_get_iommudata(tcegrp->grp);
- if (!iommu)
- return -ENODEV;
-
-- tbl = &iommu->tables[0];
-- locked = (tbl->it_size << tbl->it_page_shift) >> PAGE_SHIFT;
-+ /*
-+ * We do not allow enabling a group if no DMA-able memory was
-+ * registered as there is no way to know how much we should
-+ * increment the locked_vm counter.
-+ */
-+ if (!iommu->tce32_size)
-+ return -EPERM;
-+
-+ locked = iommu->tce32_size >> PAGE_SHIFT;
- ret = try_increment_locked_vm(locked);
- if (ret)
- return ret;
-@@ -386,6 +402,10 @@ static int tce_iommu_enable(struct tce_container *container)
- return ret;
- }
-
-+static int tce_iommu_clear(struct tce_container *container,
-+ struct iommu_table *tbl,
-+ unsigned long entry, unsigned long pages);
-+
- static void tce_iommu_disable(struct tce_container *container)
- {
- if (!container->enabled)
-@@ -414,6 +434,7 @@ static void *tce_iommu_open(unsigned long arg)
-
- mutex_init(&container->lock);
- INIT_LIST_HEAD_RCU(&container->mem_list);
-+ INIT_LIST_HEAD_RCU(&container->group_list);
-
- return container;
- }
-@@ -427,16 +448,30 @@ static void tce_iommu_release(void *iommu_data)
- struct tce_container *container = iommu_data;
- struct iommu_table *tbl;
- struct powerpc_iommu *iommu;
-+ int i;
-+ struct tce_iommu_group *tcegrp;
-+ struct powerpc_iommu_ops *iommuops = NULL;
-
-- WARN_ON(container->grp);
- tce_iommu_disable(container);
-
-- if (container->grp) {
-- iommu = iommu_group_get_iommudata(container->grp);
-- tbl = &iommu->tables[0];
-- tce_iommu_clear(container, tbl, tbl->it_offset, tbl->it_size);
-- iommu->ops->free_table(tbl);
-- tce_iommu_detach_group(iommu_data, container->grp);
-+ /* Free tables */
-+ if (iommuops) {
-+ for (i = 0; i < POWERPC_IOMMU_MAX_TABLES; ++i) {
-+ tbl = &container->tables[i];
-+
-+ tce_iommu_clear(container, tbl,
-+ tbl->it_offset, tbl->it_size);
-+
-+ iommuops->free_table(tbl);
-+ }
-+ }
-+
-+ while (tce_groups_attached(container)) {
-+ tcegrp = list_first_entry(&container->group_list,
-+ struct tce_iommu_group, next);
-+ iommu = iommu_group_get_iommudata(tcegrp->grp);
-+ iommuops = iommu->ops;
-+ tce_iommu_detach_group(iommu_data, tcegrp->grp);
- }
-
- tce_mem_unregister_all(container);
-@@ -607,16 +642,17 @@ static long tce_iommu_ioctl(void *iommu_data,
-
- case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
- struct vfio_iommu_spapr_tce_info info;
-- struct iommu_table *tbl;
-+ struct tce_iommu_group *tcegrp;
- struct powerpc_iommu *iommu;
-
-- if (WARN_ON(!container->grp))
-+ if (!tce_groups_attached(container))
- return -ENXIO;
-
-- iommu = iommu_group_get_iommudata(container->grp);
-+ tcegrp = list_first_entry(&container->group_list,
-+ struct tce_iommu_group, next);
-+ iommu = iommu_group_get_iommudata(tcegrp->grp);
-
-- tbl = &iommu->tables[0];
-- if (WARN_ON_ONCE(!tbl))
-+ if (!iommu)
- return -ENXIO;
-
- minsz = offsetofend(struct vfio_iommu_spapr_tce_info,
-@@ -628,9 +664,8 @@ static long tce_iommu_ioctl(void *iommu_data,
- if (info.argsz < minsz)
- return -EINVAL;
-
-- info.dma32_window_start = tbl->it_offset << tbl->it_page_shift;
-- info.dma32_window_size = tbl->it_size << tbl->it_page_shift;
-- info.flags = 0;
-+ info.dma32_window_start = iommu->tce32_start;
-+ info.dma32_window_size = iommu->tce32_size;
-
- if (copy_to_user((void __user *)arg, &info, minsz))
- return -EFAULT;
-@@ -779,12 +814,20 @@ static long tce_iommu_ioctl(void *iommu_data,
- tce_iommu_disable(container);
- mutex_unlock(&container->lock);
- return 0;
-- case VFIO_EEH_PE_OP:
-- if (!container->grp)
-- return -ENODEV;
-
-- return vfio_spapr_iommu_eeh_ioctl(container->grp,
-- cmd, arg);
-+ case VFIO_EEH_PE_OP: {
-+ struct tce_iommu_group *tcegrp;
-+
-+ ret = 0;
-+ list_for_each_entry(tcegrp, &container->group_list, next) {
-+ ret = vfio_spapr_iommu_eeh_ioctl(tcegrp->grp,
-+ cmd, arg);
-+ if (ret)
-+ return ret;
-+ }
-+ return ret;
-+ }
-+
- }
-
- return -ENOTTY;
-@@ -793,34 +836,34 @@ static long tce_iommu_ioctl(void *iommu_data,
- static int tce_iommu_attach_group(void *iommu_data,
- struct iommu_group *iommu_group)
- {
-- int ret = 0;
-+ int ret = 0, i;
- struct tce_container *container = iommu_data;
-- struct powerpc_iommu *iommu;
-- struct iommu_table tbltmp = { 0 }, *tbl = &tbltmp;
-+ struct powerpc_iommu *iommu = iommu_group_get_iommudata(iommu_group);
-+ struct tce_iommu_group *tcegrp;
-+ bool first_group = !tce_groups_attached(container);
-
- mutex_lock(&container->lock);
-
- /* pr_debug("tce_vfio: Attaching group #%u to iommu %p\n",
- iommu_group_id(iommu_group), iommu_group); */
-- if (container->grp) {
-- pr_warn("tce_vfio: Only one group per IOMMU container is allowed, existing id=%d, attaching id=%d\n",
-- iommu_group_id(container->grp),
-- iommu_group_id(iommu_group));
-- ret = -EBUSY;
-- goto unlock_exit;
-- }
-
-- if (container->enabled) {
-- pr_err("tce_vfio: attaching group #%u to enabled container\n",
-- iommu_group_id(iommu_group));
-- ret = -EBUSY;
-- goto unlock_exit;
-- }
-+ list_for_each_entry(tcegrp, &container->group_list, next) {
-+ struct powerpc_iommu *iommutmp;
-
-- iommu = iommu_group_get_iommudata(iommu_group);
-- if (WARN_ON_ONCE(!iommu)) {
-- ret = -ENXIO;
-- goto unlock_exit;
-+ if (tcegrp->grp == iommu_group) {
-+ pr_warn("tce_vfio: Group %d is already attached\n",
-+ iommu_group_id(iommu_group));
-+ ret = -EBUSY;
-+ goto unlock_exit;
-+ }
-+ iommutmp = iommu_group_get_iommudata(tcegrp->grp);
-+ if (iommutmp->ops != iommu->ops) {
-+ pr_warn("tce_vfio: Group %d is incompatible with group %d\n",
-+ iommu_group_id(iommu_group),
-+ iommu_group_id(tcegrp->grp));
-+ ret = -EBUSY;
-+ goto unlock_exit;
-+ }
- }
-
- /*
-@@ -835,14 +878,48 @@ static int tce_iommu_attach_group(void *iommu_data,
- goto unlock_exit;
- }
-
-- container->grp = iommu_group;
--
-- /* Create the default window as only now we know the parameters */
-- ret = iommu->ops->create_table(iommu, 0,
-- IOMMU_PAGE_SHIFT_4K,
-- ilog2(iommu->tce32_size), 1, tbl);
-- if (!ret)
-- ret = iommu->ops->set_window(iommu, 0, tbl);
-+ /* Put the group to the list so tce_def_window_create() can succeed */
-+ tcegrp = kzalloc(sizeof(*tcegrp), GFP_KERNEL);
-+ tcegrp->grp = iommu_group;
-+ list_add(&tcegrp->next, &container->group_list);
-+
-+ /*
-+ * If it the first group attached, check if there is any window
-+ * created and create one if none.
-+ */
-+ if (first_group) {
-+ bool found = false;
-+
-+ for (i = 0; i < POWERPC_IOMMU_MAX_TABLES; ++i) {
-+ if (!container->tables[i].it_size)
-+ continue;
-+
-+ found = true;
-+ break;
-+ }
-+ if (!found) {
-+ struct iommu_table *tbl = &container->tables[0];
-+
-+ ret = iommu->ops->create_table(iommu, 0,
-+ IOMMU_PAGE_SHIFT_4K,
-+ ilog2(iommu->tce32_size), 1, tbl);
-+ if (ret)
-+ goto unlock_exit;
-+ }
-+ }
-+
-+ /* Set all windows to the new group */
-+ for (i = 0; i < POWERPC_IOMMU_MAX_TABLES; ++i) {
-+ struct iommu_table *tbl = &container->tables[i];
-+
-+ if (!tbl->it_size)
-+ continue;
-+
-+ /* Set the default window to a new group */
-+ ret = iommu->ops->set_window(iommu, i, tbl);
-+ if (ret)
-+ goto unlock_exit;
-+ }
-
- unlock_exit:
- mutex_unlock(&container->lock);
-@@ -855,24 +932,18 @@ static void tce_iommu_detach_group(void *iommu_data,
- {
- struct tce_container *container = iommu_data;
- struct powerpc_iommu *iommu;
-+ struct tce_iommu_group *tcegrp, *tcegrptmp;
- long i;
-
- mutex_lock(&container->lock);
-- if (iommu_group != container->grp) {
-- pr_warn("tce_vfio: detaching group #%u, expected group is #%u\n",
-- iommu_group_id(iommu_group),
-- iommu_group_id(container->grp));
-- } else {
-- if (container->enabled) {
-- pr_warn("tce_vfio: detaching group #%u from enabled container, forcing disable\n",
-- iommu_group_id(container->grp));
-- tce_iommu_disable(container);
-- }
-
-- /* pr_debug("tce_vfio: detaching group #%u from iommu %p\n",
-- iommu_group_id(iommu_group), iommu_group); */
-- container->grp = NULL;
-+ /* Detach windows from IOMMUs */
-+ list_for_each_entry_safe(tcegrp, tcegrptmp, &container->group_list,
-+ next) {
-+ if (tcegrp->grp != iommu_group)
-+ continue;
-
-+ list_del(&tcegrp->next);
- iommu = iommu_group_get_iommudata(iommu_group);
- BUG_ON(!iommu);
-
-@@ -882,6 +953,9 @@ static void tce_iommu_detach_group(void *iommu_data,
- /* Kernel owns the device now, we can restore bypass */
- if (iommu->ops && iommu->ops->set_ownership)
- iommu->ops->set_ownership(iommu, false);
-+
-+ kfree(tcegrp);
-+ break;
- }
- mutex_unlock(&container->lock);
+ static void pnv_pci_ioda2_table_free_pages(struct iommu_table *tbl)
+ {
++ const unsigned long size = tbl->it_indirect_levels ?
++ tbl->it_level_size : tbl->it_size;
++
+ if (!tbl->it_size)
+ return;
+
+- free_pages(tbl->it_base, get_order(tbl->it_size << 3));
++ pnv_pci_ioda2_table_do_free_pages((__be64 *)tbl->it_base, size,
++ tbl->it_indirect_levels);
+ }
+
+ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+@@ -2149,7 +2235,8 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+
+ /* Setup linux iommu table */
+ rc = pnv_pci_ioda2_table_alloc_pages(pe->phb->hose->node,
+- 0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base, tbl);
++ 0, IOMMU_PAGE_SHIFT_4K, phb->ioda.m32_pci_base,
++ POWERNV_IOMMU_DEFAULT_LEVELS, tbl);
+ if (rc) {
+ pe_err(pe, "Failed to create 32-bit TCE table, err %ld", rc);
+ goto fail;
+diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
+index dce3bfd..d4e59f7 100644
+--- a/arch/powerpc/platforms/powernv/pci.c
++++ b/arch/powerpc/platforms/powernv/pci.c
+@@ -575,6 +575,19 @@ struct pci_ops pnv_pci_ops = {
+ static __be64 *pnv_tce(struct iommu_table *tbl, long idx)
+ {
+ __be64 *tmp = ((__be64 *)tbl->it_base);
++ int level = tbl->it_indirect_levels;
++ const long shift = ilog2(tbl->it_level_size);
++ unsigned long mask = (tbl->it_level_size - 1) << (level * shift);
++
++ while (level) {
++ int n = (idx & mask) >> (level * shift);
++ unsigned long tce = be64_to_cpu(tmp[n]);
++
++ tmp = __va(tce & ~(TCE_PCI_READ | TCE_PCI_WRITE));
++ idx &= ~mask;
++ mask >>= shift;
++ --level;
++ }
+
+ return tmp + idx;
}
--
-2.0.0
+2.4.0.rc3.8.gfb3e7d5