--- v11
+++ v10
@@ -48,11 +48,12 @@
[aw: for the vfio related changes]
Acked-by: Alex Williamson <alex.williamson@redhat.com>
---
+
+Alex, should I remove your "Acked-by" in cases like this and
+get another one?
+
+---
Changes:
-v11:
-* mm_iommu_put() does not return a code so this does not check it
-* moved "v2" in tce_container to pack the struct
-
v10:
* moved it_userspace allocation to vfio_iommu_spapr_tce as it is a
VFIO-specific thing
@@ -88,9 +89,9 @@
---
Documentation/vfio.txt | 31 ++-
arch/powerpc/include/asm/iommu.h | 6 +
- drivers/vfio/vfio_iommu_spapr_tce.c | 512 ++++++++++++++++++++++++++++++------
+ drivers/vfio/vfio_iommu_spapr_tce.c | 516 ++++++++++++++++++++++++++++++------
include/uapi/linux/vfio.h | 27 ++
- 4 files changed, 487 insertions(+), 89 deletions(-)
+ 4 files changed, 494 insertions(+), 86 deletions(-)
diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index 96978ec..7dcf2b5 100644
@@ -143,14 +144,15 @@
[1] VFIO was originally an acronym for "Virtual Function I/O" in its
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
-index 9d37492..f9957eb 100644
+index c8bad21..763c041 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
-@@ -112,9 +112,15 @@ struct iommu_table {
- unsigned long *it_map; /* A simple allocation bitmap for now */
+@@ -113,10 +113,16 @@ struct iommu_table {
unsigned long it_page_shift;/* table iommu page size */
+ #ifdef CONFIG_IOMMU_API
struct list_head it_group_list;/* List of iommu_table_group_link */
+ unsigned long *it_userspace; /* userspace view of the table */
+ #endif
struct iommu_table_ops *it_ops;
};
@@ -163,7 +165,7 @@
static inline __attribute_const__
int get_iommu_order(unsigned long size, struct iommu_table *tbl)
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
-index 7a84110..cadd9f8 100644
+index 8943b29..e7e8db3 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -19,8 +19,10 @@
@@ -189,14 +191,14 @@
/*
* The container descriptor supports only a single group per container.
* Required by the API as the container is not supplied with the IOMMU group
-@@ -88,11 +95,84 @@ static void decrement_locked_vm(long npages)
+@@ -88,11 +95,98 @@ static void decrement_locked_vm(long npages)
*/
struct tce_container {
struct mutex lock;
- struct iommu_group *grp;
bool enabled;
+ unsigned long locked_pages;
+ bool v2;
- unsigned long locked_pages;
+ struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
+ struct list_head group_list;
};
@@ -204,32 +206,46 @@
+static long tce_iommu_unregister_pages(struct tce_container *container,
+ __u64 vaddr, __u64 size)
+{
++ long ret;
+ struct mm_iommu_table_group_mem_t *mem;
+
+ if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK))
+ return -EINVAL;
+
-+ mem = mm_iommu_lookup(vaddr, size >> PAGE_SHIFT);
++ mem = mm_iommu_get(vaddr, size >> PAGE_SHIFT);
+ if (!mem)
+ return -EINVAL;
+
-+ return mm_iommu_put(mem);
++ ret = mm_iommu_put(mem); /* undo kref_get() from mm_iommu_get() */
++ if (!ret)
++ ret = mm_iommu_put(mem);
++
++ return ret;
+}
+
+static long tce_iommu_register_pages(struct tce_container *container,
+ __u64 vaddr, __u64 size)
+{
+ long ret = 0;
-+ struct mm_iommu_table_group_mem_t *mem = NULL;
++ struct mm_iommu_table_group_mem_t *mem;
+ unsigned long entries = size >> PAGE_SHIFT;
+
+ if ((vaddr & ~PAGE_MASK) || (size & ~PAGE_MASK) ||
+ ((vaddr + size) < vaddr))
+ return -EINVAL;
+
-+ ret = mm_iommu_get(vaddr, entries, &mem);
-+ if (ret)
-+ return ret;
++ mem = mm_iommu_get(vaddr, entries);
++ if (!mem) {
++ ret = try_increment_locked_vm(entries);
++ if (ret)
++ return ret;
++
++ ret = mm_iommu_alloc(vaddr, entries, &mem);
++ if (ret) {
++ decrement_locked_vm(entries);
++ return ret;
++ }
++ }
+
+ container->enabled = true;
+
@@ -275,7 +291,7 @@
static bool tce_page_is_contained(struct page *page, unsigned page_shift)
{
/*
-@@ -103,18 +183,18 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
+@@ -103,18 +197,18 @@ static bool tce_page_is_contained(struct page *page, unsigned page_shift)
return (PAGE_SHIFT + compound_order(compound_head(page))) >= page_shift;
}
@@ -300,7 +316,7 @@
if (tbl) {
unsigned long entry = ioba >> tbl->it_page_shift;
-@@ -136,9 +216,7 @@ static int tce_iommu_enable(struct tce_container *container)
+@@ -136,9 +230,7 @@ static int tce_iommu_enable(struct tce_container *container)
int ret = 0;
unsigned long locked;
struct iommu_table_group *table_group;
@@ -311,7 +327,7 @@
if (!current->mm)
return -ESRCH; /* process exited */
-@@ -175,7 +253,12 @@ static int tce_iommu_enable(struct tce_container *container)
+@@ -175,7 +267,12 @@ static int tce_iommu_enable(struct tce_container *container)
* as there is no way to know how much we should increment
* the locked_vm counter.
*/
@@ -325,7 +341,7 @@
if (!table_group)
return -ENODEV;
-@@ -211,7 +294,7 @@ static void *tce_iommu_open(unsigned long arg)
+@@ -211,7 +308,7 @@ static void *tce_iommu_open(unsigned long arg)
{
struct tce_container *container;
@@ -334,7 +350,7 @@
pr_err("tce_vfio: Wrong IOMMU type\n");
return ERR_PTR(-EINVAL);
}
-@@ -221,18 +304,45 @@ static void *tce_iommu_open(unsigned long arg)
+@@ -221,18 +318,45 @@ static void *tce_iommu_open(unsigned long arg)
return ERR_PTR(-ENOMEM);
mutex_init(&container->lock);
@@ -383,7 +399,7 @@
tce_iommu_disable(container);
mutex_destroy(&container->lock);
-@@ -249,6 +359,47 @@ static void tce_iommu_unuse_page(struct tce_container *container,
+@@ -249,6 +373,47 @@ static void tce_iommu_unuse_page(struct tce_container *container,
put_page(page);
}
@@ -431,7 +447,7 @@
static int tce_iommu_clear(struct tce_container *container,
struct iommu_table *tbl,
unsigned long entry, unsigned long pages)
-@@ -267,6 +418,11 @@ static int tce_iommu_clear(struct tce_container *container,
+@@ -267,6 +432,11 @@ static int tce_iommu_clear(struct tce_container *container,
if (direction == DMA_NONE)
continue;
@@ -443,7 +459,7 @@
tce_iommu_unuse_page(container, oldhpa);
}
-@@ -333,6 +489,64 @@ static long tce_iommu_build(struct tce_container *container,
+@@ -333,6 +503,64 @@ static long tce_iommu_build(struct tce_container *container,
return ret;
}
@@ -508,7 +524,7 @@
static long tce_iommu_create_table(struct tce_container *container,
struct iommu_table_group *table_group,
int num,
-@@ -358,6 +572,12 @@ static long tce_iommu_create_table(struct tce_container *container,
+@@ -358,6 +586,12 @@ static long tce_iommu_create_table(struct tce_container *container,
WARN_ON(!ret && !(*ptbl)->it_ops->free);
WARN_ON(!ret && ((*ptbl)->it_allocated_size != table_size));
@@ -521,7 +537,7 @@
if (ret)
decrement_locked_vm(table_size >> PAGE_SHIFT);
-@@ -368,6 +588,7 @@ static void tce_iommu_free_table(struct iommu_table *tbl)
+@@ -368,6 +602,7 @@ static void tce_iommu_free_table(struct iommu_table *tbl)
{
unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT;
@@ -529,7 +545,7 @@
tbl->it_ops->free(tbl);
decrement_locked_vm(pages);
}
-@@ -383,6 +604,7 @@ static long tce_iommu_ioctl(void *iommu_data,
+@@ -383,6 +618,7 @@ static long tce_iommu_ioctl(void *iommu_data,
case VFIO_CHECK_EXTENSION:
switch (arg) {
case VFIO_SPAPR_TCE_IOMMU:
@@ -537,7 +553,7 @@
ret = 1;
break;
default:
-@@ -394,12 +616,15 @@ static long tce_iommu_ioctl(void *iommu_data,
+@@ -394,12 +630,15 @@ static long tce_iommu_ioctl(void *iommu_data,
case VFIO_IOMMU_SPAPR_TCE_GET_INFO: {
struct vfio_iommu_spapr_tce_info info;
@@ -555,7 +571,7 @@
if (!table_group)
return -ENXIO;
-@@ -468,11 +693,18 @@ static long tce_iommu_ioctl(void *iommu_data,
+@@ -467,11 +706,18 @@ static long tce_iommu_ioctl(void *iommu_data,
if (ret)
return ret;
@@ -579,7 +595,7 @@
iommu_flush_tce(tbl);
-@@ -518,7 +750,61 @@ static long tce_iommu_ioctl(void *iommu_data,
+@@ -517,7 +763,61 @@ static long tce_iommu_ioctl(void *iommu_data,
return ret;
}
@@ -629,10 +645,10 @@
+ return -EINVAL;
+
+ mutex_lock(&container->lock);
-+ ret = tce_iommu_unregister_pages(container, param.vaddr, param.size);
++ tce_iommu_unregister_pages(container, param.vaddr, param.size);
+ mutex_unlock(&container->lock);
+
-+ return ret;
++ return 0;
+ }
case VFIO_IOMMU_ENABLE:
+ if (container->v2)
@@ -641,7 +657,7 @@
mutex_lock(&container->lock);
ret = tce_iommu_enable(container);
mutex_unlock(&container->lock);
-@@ -526,16 +812,27 @@ static long tce_iommu_ioctl(void *iommu_data,
+@@ -525,16 +825,27 @@ static long tce_iommu_ioctl(void *iommu_data,
case VFIO_IOMMU_DISABLE:
@@ -674,7 +690,7 @@
}
return -ENOTTY;
-@@ -547,14 +844,17 @@ static void tce_iommu_release_ownership(struct tce_container *container,
+@@ -546,14 +857,17 @@ static void tce_iommu_release_ownership(struct tce_container *container,
int i;
for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
@@ -693,7 +709,7 @@
}
}
-@@ -569,7 +869,10 @@ static int tce_iommu_take_ownership(struct tce_container *container,
+@@ -568,7 +882,10 @@ static int tce_iommu_take_ownership(struct tce_container *container,
if (!tbl || !tbl->it_map)
continue;
@@ -705,7 +721,7 @@
if (rc) {
for (j = 0; j < i; ++j)
iommu_release_ownership(
-@@ -579,6 +882,9 @@ static int tce_iommu_take_ownership(struct tce_container *container,
+@@ -578,38 +895,57 @@ static int tce_iommu_take_ownership(struct tce_container *container,
}
}
@@ -715,99 +731,15 @@
return 0;
}
-@@ -592,18 +898,8 @@ static void tce_iommu_release_ownership_ddw(struct tce_container *container,
- return;
- }
-
-- for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-- /* Store table pointer as unset_window resets it */
-- struct iommu_table *tbl = table_group->tables[i];
--
-- if (!tbl)
-- continue;
--
-+ for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
- table_group->ops->unset_window(table_group, i);
-- tce_iommu_clear(container, tbl,
-- tbl->it_offset, tbl->it_size);
-- tce_iommu_free_table(tbl);
-- }
-
- table_group->ops->release_ownership(table_group);
- }
-@@ -611,7 +907,7 @@ static void tce_iommu_release_ownership_ddw(struct tce_container *container,
- static long tce_iommu_take_ownership_ddw(struct tce_container *container,
- struct iommu_table_group *table_group)
+ static int tce_iommu_attach_group(void *iommu_data,
+ struct iommu_group *iommu_group)
{
-- long ret;
-+ long i, ret = 0;
- struct iommu_table *tbl = NULL;
-
- if (!table_group->ops->create_table || !table_group->ops->set_window ||
-@@ -622,23 +918,45 @@ static long tce_iommu_take_ownership_ddw(struct tce_container *container,
-
- table_group->ops->take_ownership(table_group);
-
-- ret = tce_iommu_create_table(container,
-- table_group,
-- 0, /* window number */
-- IOMMU_PAGE_SHIFT_4K,
-- table_group->tce32_size,
-- 1, /* default levels */
-- &tbl);
-- if (!ret) {
-- ret = table_group->ops->set_window(table_group, 0, tbl);
-+ /*
-+ * If it is the first group attached, check if there is
-+ * a default DMA window and create one if none as
-+ * the userspace expects it to exist.
-+ */
-+ if (!tce_groups_attached(container) && !container->tables[0]) {
-+ ret = tce_iommu_create_table(container,
-+ table_group,
-+ 0, /* window number */
-+ IOMMU_PAGE_SHIFT_4K,
-+ table_group->tce32_size,
-+ 1, /* default levels */
-+ &tbl);
- if (ret)
-- tce_iommu_free_table(tbl);
-+ goto release_exit;
- else
-- table_group->tables[0] = tbl;
-+ container->tables[0] = tbl;
- }
-
-- if (ret)
-- table_group->ops->release_ownership(table_group);
-+ /* Set all windows to the new group */
-+ for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
-+ tbl = container->tables[i];
-+
-+ if (!tbl)
-+ continue;
-+
-+ /* Set the default window to a new group */
-+ ret = table_group->ops->set_window(table_group, i, tbl);
-+ if (ret)
-+ goto release_exit;
-+ }
-+
-+ return 0;
-+
-+release_exit:
-+ for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
-+ table_group->ops->unset_window(table_group, i);
-+
-+ table_group->ops->release_ownership(table_group);
-
- return ret;
- }
-@@ -649,29 +967,44 @@ static int tce_iommu_attach_group(void *iommu_data,
- int ret;
+- int ret;
++ int ret, i;
struct tce_container *container = iommu_data;
struct iommu_table_group *table_group;
+ struct tce_iommu_group *tcegrp = NULL;
++ bool first_group = !tce_groups_attached(container);
mutex_lock(&container->lock);
@@ -832,7 +764,7 @@
- if (!table_group) {
- ret = -ENXIO;
+
-+ if (tce_groups_attached(container) && (!table_group->ops ||
++ if (!first_group && (!table_group->ops ||
+ !table_group->ops->take_ownership ||
+ !table_group->ops->release_ownership)) {
+ ret = -EBUSY;
@@ -865,16 +797,61 @@
goto unlock_exit;
}
-@@ -681,10 +1014,15 @@ static int tce_iommu_attach_group(void *iommu_data,
- else
- ret = tce_iommu_take_ownership_ddw(container, table_group);
-
-- if (!ret)
-- container->grp = iommu_group;
-+ if (!ret) {
-+ tcegrp->grp = iommu_group;
-+ list_add(&tcegrp->next, &container->group_list);
-+ }
+@@ -628,28 +964,50 @@ static int tce_iommu_attach_group(void *iommu_data,
+ * the pages that has been explicitly mapped into the iommu
+ */
+ table_group->ops->take_ownership(table_group);
+- ret = tce_iommu_create_table(container,
+- table_group,
+- 0, /* window number */
+- IOMMU_PAGE_SHIFT_4K,
+- table_group->tce32_size,
+- 1, /* default levels */
+- &tbl);
+- if (!ret) {
+- ret = table_group->ops->set_window(table_group, 0, tbl);
++
++ /*
++ * If it is the first group attached, check if there is
++ * a default DMA window and create one if none as
++ * the userspace expects it to exist.
++ */
++ if (first_group && !container->tables[0]) {
++ ret = tce_iommu_create_table(container,
++ table_group,
++ 0, /* window number */
++ IOMMU_PAGE_SHIFT_4K,
++ table_group->tce32_size,
++ 1, /* default levels */
++ &tbl);
+ if (ret)
+- tce_iommu_free_table(tbl);
++ goto unlock_exit;
+ else
+- table_group->tables[0] = tbl;
++ container->tables[0] = tbl;
++ }
++
++ /* Set all windows to the new group */
++ for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
++ tbl = container->tables[i];
++
++ if (!tbl)
++ continue;
++
++ /* Set the default window to a new group */
++ ret = table_group->ops->set_window(table_group, i, tbl);
++ if (ret)
++ break;
+ }
+ }
+
+ if (ret)
+ goto unlock_exit;
+
+- container->grp = iommu_group;
++ tcegrp->grp = iommu_group;
++ list_add(&tcegrp->next, &container->group_list);
unlock_exit:
+ if (ret && tcegrp)
@@ -883,12 +860,13 @@
mutex_unlock(&container->lock);
return ret;
-@@ -695,24 +1033,26 @@ static void tce_iommu_detach_group(void *iommu_data,
+@@ -660,25 +1018,27 @@ static void tce_iommu_detach_group(void *iommu_data,
{
struct tce_container *container = iommu_data;
struct iommu_table_group *table_group;
++ struct tce_iommu_group *tcegrp;
+ long i;
+ bool found = false;
-+ struct tce_iommu_group *tcegrp;
mutex_lock(&container->lock);
- if (iommu_group != container->grp) {
@@ -923,6 +901,26 @@
table_group = iommu_group_get_iommudata(iommu_group);
BUG_ON(!table_group);
+@@ -689,18 +1049,8 @@ static void tce_iommu_detach_group(void *iommu_data,
+ else if (!table_group->ops->unset_window)
+ WARN_ON_ONCE(1);
+ else {
+- for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) {
+- /* Store table pointer as unset_window resets it */
+- struct iommu_table *tbl = table_group->tables[i];
+-
+- if (!tbl)
+- continue;
+-
++ for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i)
+ table_group->ops->unset_window(table_group, i);
+- tce_iommu_clear(container, tbl,
+- tbl->it_offset, tbl->it_size);
+- tce_iommu_free_table(tbl);
+- }
+
+ table_group->ops->release_ownership(table_group);
+ }
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index b57b750..8fdcfb9 100644
--- a/include/uapi/linux/vfio.h