--- v10
+++ v7
@@ -1,100 +1,238 @@
-This relies on the fact that a PCI device always has an IOMMU table,
-which may not be the case when we get dynamic DMA windows, so
-let's use a more reliable check for the IOMMU group here.
-
-As we do not rely on the table presence here, remove the workaround
-from pnv_pci_ioda2_set_bypass(); also remove the @add_to_iommu_group
-parameter from pnv_ioda_setup_bus_dma().
+This moves the page pinning code (get_user_pages_fast()/put_page()) out of
+the platform IOMMU code and into the VFIO IOMMU driver, where it belongs,
+as the platform code does not deal with page pinning.
+
+This makes iommu_take_ownership()/iommu_release_ownership() deal with
+the IOMMU table bitmap only.
+
+This removes page unpinning from iommu_take_ownership() as the actual
+TCE table might contain garbage and doing put_page() on it is undefined
+behaviour.
+
+Apart from that last change, the patch is mechanical.
Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru>
---
- arch/powerpc/kernel/eeh.c | 4 +---
- arch/powerpc/platforms/powernv/pci-ioda.c | 27 +++++----------------------
- 2 files changed, 6 insertions(+), 25 deletions(-)
-
-diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
-index 9ee61d1..defd874 100644
---- a/arch/powerpc/kernel/eeh.c
-+++ b/arch/powerpc/kernel/eeh.c
-@@ -1412,13 +1412,11 @@ static int dev_has_iommu_table(struct device *dev, void *data)
+Changes:
+v4:
+* s/iommu_tce_build(tbl, entry + 1/iommu_tce_build(tbl, entry + i/
+---
+ arch/powerpc/include/asm/iommu.h | 4 --
+ arch/powerpc/kernel/iommu.c | 55 --------------------------
+ drivers/vfio/vfio_iommu_spapr_tce.c | 78 ++++++++++++++++++++++++++++++-------
+ 3 files changed, 65 insertions(+), 72 deletions(-)
+
+diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
+index f1ea597..ed69b7d 100644
+--- a/arch/powerpc/include/asm/iommu.h
++++ b/arch/powerpc/include/asm/iommu.h
+@@ -197,10 +197,6 @@ extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
+ unsigned long hwaddr, enum dma_data_direction direction);
+ extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
+ unsigned long entry);
+-extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
+- unsigned long entry, unsigned long pages);
+-extern int iommu_put_tce_user_mode(struct iommu_table *tbl,
+- unsigned long entry, unsigned long tce);
+
+ extern void iommu_flush_tce(struct iommu_table *tbl);
+ extern int iommu_take_ownership(struct iommu_table *tbl);
+diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
+index b054f33..1b4a178 100644
+--- a/arch/powerpc/kernel/iommu.c
++++ b/arch/powerpc/kernel/iommu.c
+@@ -991,30 +991,6 @@ unsigned long iommu_clear_tce(struct iommu_table *tbl, unsigned long entry)
+ }
+ EXPORT_SYMBOL_GPL(iommu_clear_tce);
+
+-int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
+- unsigned long entry, unsigned long pages)
+-{
+- unsigned long oldtce;
+- struct page *page;
+-
+- for ( ; pages; --pages, ++entry) {
+- oldtce = iommu_clear_tce(tbl, entry);
+- if (!oldtce)
+- continue;
+-
+- page = pfn_to_page(oldtce >> PAGE_SHIFT);
+- WARN_ON(!page);
+- if (page) {
+- if (oldtce & TCE_PCI_WRITE)
+- SetPageDirty(page);
+- put_page(page);
+- }
+- }
+-
+- return 0;
+-}
+-EXPORT_SYMBOL_GPL(iommu_clear_tces_and_put_pages);
+-
+ /*
+ * hwaddr is a kernel virtual address here (0xc... bazillion),
+ * tce_build converts it to a physical address.
+@@ -1044,35 +1020,6 @@ int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
+ }
+ EXPORT_SYMBOL_GPL(iommu_tce_build);
+
+-int iommu_put_tce_user_mode(struct iommu_table *tbl, unsigned long entry,
+- unsigned long tce)
+-{
+- int ret;
+- struct page *page = NULL;
+- unsigned long hwaddr, offset = tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK;
+- enum dma_data_direction direction = iommu_tce_direction(tce);
+-
+- ret = get_user_pages_fast(tce & PAGE_MASK, 1,
+- direction != DMA_TO_DEVICE, &page);
+- if (unlikely(ret != 1)) {
+- /* pr_err("iommu_tce: get_user_pages_fast failed tce=%lx ioba=%lx ret=%d\n",
+- tce, entry << tbl->it_page_shift, ret); */
+- return -EFAULT;
+- }
+- hwaddr = (unsigned long) page_address(page) + offset;
+-
+- ret = iommu_tce_build(tbl, entry, hwaddr, direction);
+- if (ret)
+- put_page(page);
+-
+- if (ret < 0)
+- pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%d\n",
+- __func__, entry << tbl->it_page_shift, tce, ret);
+-
+- return ret;
+-}
+-EXPORT_SYMBOL_GPL(iommu_put_tce_user_mode);
+-
+ int iommu_take_ownership(struct iommu_table *tbl)
{
- struct pci_dev *pdev = to_pci_dev(dev);
- struct pci_dev **ppdev = data;
-- struct iommu_table *tbl;
-
- if (!dev)
- return 0;
-
-- tbl = get_iommu_table_base(dev);
-- if (tbl && tbl->it_group) {
-+ if (dev->iommu_group) {
- *ppdev = pdev;
- return 1;
+ unsigned long sz = (tbl->it_size + 7) >> 3;
+@@ -1086,7 +1033,6 @@ int iommu_take_ownership(struct iommu_table *tbl)
}
-diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
-index f8bc950..2f092bb 100644
---- a/arch/powerpc/platforms/powernv/pci-ioda.c
-+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
-@@ -1654,21 +1654,15 @@ static u64 pnv_pci_ioda_dma_get_required_mask(struct pnv_phb *phb,
+
+ memset(tbl->it_map, 0xff, sz);
+- iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+
+ /*
+ * Disable iommu bypass, otherwise the user can DMA to all of
+@@ -1104,7 +1050,6 @@ void iommu_release_ownership(struct iommu_table *tbl)
+ {
+ unsigned long sz = (tbl->it_size + 7) >> 3;
+
+- iommu_clear_tces_and_put_pages(tbl, tbl->it_offset, tbl->it_size);
+ memset(tbl->it_map, 0, sz);
+
+ /* Restore bit#0 set by iommu_init_table() */
+diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
+index 730b4ef..cefaf05 100644
+--- a/drivers/vfio/vfio_iommu_spapr_tce.c
++++ b/drivers/vfio/vfio_iommu_spapr_tce.c
+@@ -147,6 +147,66 @@ static void tce_iommu_release(void *iommu_data)
+ kfree(container);
}
- static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe,
-- struct pci_bus *bus,
-- bool add_to_iommu_group)
-+ struct pci_bus *bus)
++static int tce_iommu_clear(struct tce_container *container,
++ struct iommu_table *tbl,
++ unsigned long entry, unsigned long pages)
++{
++ unsigned long oldtce;
++ struct page *page;
++
++ for ( ; pages; --pages, ++entry) {
++ oldtce = iommu_clear_tce(tbl, entry);
++ if (!oldtce)
++ continue;
++
++ page = pfn_to_page(oldtce >> PAGE_SHIFT);
++ WARN_ON(!page);
++ if (page) {
++ if (oldtce & TCE_PCI_WRITE)
++ SetPageDirty(page);
++ put_page(page);
++ }
++ }
++
++ return 0;
++}
++
++static long tce_iommu_build(struct tce_container *container,
++ struct iommu_table *tbl,
++ unsigned long entry, unsigned long tce, unsigned long pages)
++{
++ long i, ret = 0;
++ struct page *page = NULL;
++ unsigned long hva;
++ enum dma_data_direction direction = iommu_tce_direction(tce);
++
++ for (i = 0; i < pages; ++i) {
++ ret = get_user_pages_fast(tce & PAGE_MASK, 1,
++ direction != DMA_TO_DEVICE, &page);
++ if (unlikely(ret != 1)) {
++ ret = -EFAULT;
++ break;
++ }
++ hva = (unsigned long) page_address(page) +
++ (tce & IOMMU_PAGE_MASK(tbl) & ~PAGE_MASK);
++
++ ret = iommu_tce_build(tbl, entry + i, hva, direction);
++ if (ret) {
++ put_page(page);
++ pr_err("iommu_tce: %s failed ioba=%lx, tce=%lx, ret=%ld\n",
++ __func__, entry << tbl->it_page_shift,
++ tce, ret);
++ break;
++ }
++ tce += IOMMU_PAGE_SIZE_4K;
++ }
++
++ if (ret)
++ tce_iommu_clear(container, tbl, entry, i);
++
++ return ret;
++}
++
+ static long tce_iommu_ioctl(void *iommu_data,
+ unsigned int cmd, unsigned long arg)
{
- struct pci_dev *dev;
-
- list_for_each_entry(dev, &bus->devices, bus_list) {
-- if (add_to_iommu_group)
-- set_iommu_table_base_and_group(&dev->dev,
-- pe->tce32_table);
-- else
-- set_iommu_table_base(&dev->dev, pe->tce32_table);
-+ set_iommu_table_base_and_group(&dev->dev, pe->tce32_table);
-
- if (dev->subordinate)
-- pnv_ioda_setup_bus_dma(pe, dev->subordinate,
-- add_to_iommu_group);
-+ pnv_ioda_setup_bus_dma(pe, dev->subordinate);
- }
- }
-
-@@ -1845,7 +1839,7 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
- } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
- iommu_register_group(tbl, phb->hose->global_number,
- pe->pe_number);
-- pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
-+ pnv_ioda_setup_bus_dma(pe, pe->pbus);
- } else if (pe->flags & PNV_IODA_PE_VF) {
- iommu_register_group(tbl, phb->hose->global_number,
- pe->pe_number);
-@@ -1882,17 +1876,6 @@ static void pnv_pci_ioda2_set_bypass(struct iommu_table *tbl, bool enable)
- window_id,
- pe->tce_bypass_base,
- 0);
--
-- /*
-- * EEH needs the mapping between IOMMU table and group
-- * of those VFIO/KVM pass-through devices. We can postpone
-- * resetting DMA ops until the DMA mask is configured in
-- * host side.
-- */
-- if (pe->pdev)
-- set_iommu_table_base(&pe->pdev->dev, tbl);
-- else
-- pnv_ioda_setup_bus_dma(pe, pe->pbus, false);
- }
- if (rc)
- pe_err(pe, "OPAL error %lld configuring bypass window\n", rc);
-@@ -1984,7 +1967,7 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
- } else if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) {
- iommu_register_group(tbl, phb->hose->global_number,
- pe->pe_number);
-- pnv_ioda_setup_bus_dma(pe, pe->pbus, true);
-+ pnv_ioda_setup_bus_dma(pe, pe->pbus);
- } else if (pe->flags & PNV_IODA_PE_VF) {
- iommu_register_group(tbl, phb->hose->global_number,
- pe->pe_number);
+@@ -195,7 +255,7 @@ static long tce_iommu_ioctl(void *iommu_data,
+ case VFIO_IOMMU_MAP_DMA: {
+ struct vfio_iommu_type1_dma_map param;
+ struct iommu_table *tbl = container->tbl;
+- unsigned long tce, i;
++ unsigned long tce;
+
+ if (!tbl)
+ return -ENXIO;
+@@ -229,17 +289,9 @@ static long tce_iommu_ioctl(void *iommu_data,
+ if (ret)
+ return ret;
+
+- for (i = 0; i < (param.size >> IOMMU_PAGE_SHIFT_4K); ++i) {
+- ret = iommu_put_tce_user_mode(tbl,
+- (param.iova >> IOMMU_PAGE_SHIFT_4K) + i,
+- tce);
+- if (ret)
+- break;
+- tce += IOMMU_PAGE_SIZE_4K;
+- }
+- if (ret)
+- iommu_clear_tces_and_put_pages(tbl,
+- param.iova >> IOMMU_PAGE_SHIFT_4K, i);
++ ret = tce_iommu_build(container, tbl,
++ param.iova >> IOMMU_PAGE_SHIFT_4K,
++ tce, param.size >> IOMMU_PAGE_SHIFT_4K);
+
+ iommu_flush_tce(tbl);
+
+@@ -273,7 +325,7 @@ static long tce_iommu_ioctl(void *iommu_data,
+ if (ret)
+ return ret;
+
+- ret = iommu_clear_tces_and_put_pages(tbl,
++ ret = tce_iommu_clear(container, tbl,
+ param.iova >> IOMMU_PAGE_SHIFT_4K,
+ param.size >> IOMMU_PAGE_SHIFT_4K);
+ iommu_flush_tce(tbl);
--
-2.4.0.rc3.8.gfb3e7d5
+2.0.0