--- v7
+++ v2
@@ -1,96 +1,131 @@
-The speculative page fault handler which is run without holding the
-mmap_sem is calling lru_cache_add_active_or_unevictable() but the vm_flags
-is not guaranteed to remain constant.
-Introducing __lru_cache_add_active_or_unevictable() which has the vma flags
-value parameter instead of the vma pointer.
+When dealing with the speculative page fault handler, we may race with a
+VMA being split or merged. In this case the vma->vm_start and vma->vm_end
+fields may not match the address at which the page fault is occurring.
+
+This can only happen when the VMA is split, but in that case the
+anon_vma pointer of the new VMA will be the same as the original one,
+because in __split_vma() new->anon_vma is set to src->anon_vma when
+*new = *vma.
+
+So even if the VMA boundaries are not correct, the anon_vma pointer is
+still valid.
+
+If the VMA has been merged, then the VMA in which it has been merged
+must have the same anon_vma pointer otherwise the merge can't be done.
+
+So in all cases we know that the anon_vma is valid, since we have
+checked before starting the speculative page fault that the anon_vma
+pointer is valid for this VMA. Since there is an anon_vma, this
+means that at one time a page has been backed, and that before the VMA
+is cleaned, the page table lock would have to be grabbed to clean the
+PTE; the anon_vma field is checked once the PTE is locked.
+
+This patch introduces a new __page_add_new_anon_rmap() service which
+doesn't check the VMA boundaries, and creates a new inline one
+which does the check.
+
+When called from a page fault handler, if this is not a speculative one,
+there is a guarantee that vm_start and vm_end match the faulting address,
+so this check is useless. In the context of the speculative page fault
+handler, this check may be wrong but anon_vma is still valid as explained
+above.
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
---
- include/linux/swap.h | 10 ++++++++--
+ include/linux/rmap.h | 12 ++++++++++--
mm/memory.c | 8 ++++----
- mm/swap.c | 6 +++---
- 3 files changed, 15 insertions(+), 9 deletions(-)
+ mm/rmap.c | 5 ++---
+ 3 files changed, 16 insertions(+), 9 deletions(-)
-diff --git a/include/linux/swap.h b/include/linux/swap.h
-index a1a3f4ed94ce..99377b66ea93 100644
---- a/include/linux/swap.h
-+++ b/include/linux/swap.h
-@@ -337,8 +337,14 @@ extern void deactivate_file_page(struct page *page);
- extern void mark_page_lazyfree(struct page *page);
- extern void swap_setup(void);
+diff --git a/include/linux/rmap.h b/include/linux/rmap.h
+index 43ef2c30cb0f..f5cd4dbc78b0 100644
+--- a/include/linux/rmap.h
++++ b/include/linux/rmap.h
+@@ -170,8 +170,16 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, bool);
+ void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
+ unsigned long, int);
+-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+- unsigned long, bool);
++void __page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
++ unsigned long, bool);
++static inline void page_add_new_anon_rmap(struct page *page,
++ struct vm_area_struct *vma,
++ unsigned long address, bool compound)
++{
++ VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
++ __page_add_new_anon_rmap(page, vma, address, compound);
++}
++
+ void page_add_file_rmap(struct page *, bool);
+ void page_remove_rmap(struct page *, bool);
--extern void lru_cache_add_active_or_unevictable(struct page *page,
-- struct vm_area_struct *vma);
-+extern void __lru_cache_add_active_or_unevictable(struct page *page,
-+ unsigned long vma_flags);
-+
-+static inline void lru_cache_add_active_or_unevictable(struct page *page,
-+ struct vm_area_struct *vma)
-+{
-+ return __lru_cache_add_active_or_unevictable(page, vma->vm_flags);
-+}
-
- /* linux/mm/vmscan.c */
- extern unsigned long zone_reclaimable_pages(struct zone *zone);
diff --git a/mm/memory.c b/mm/memory.c
-index 82b943a369d6..2e4c5755cbc1 100644
+index 9f9e5bb7a556..51bc8315281e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
-@@ -2554,7 +2554,7 @@ static int wp_page_copy(struct vm_fault *vmf)
+@@ -2376,7 +2376,7 @@ static int wp_page_copy(struct vm_fault *vmf)
+ * thread doing COW.
+ */
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
- page_add_new_anon_rmap(new_page, vma, vmf->address, false);
+- page_add_new_anon_rmap(new_page, vma, vmf->address, false);
++ __page_add_new_anon_rmap(new_page, vma, vmf->address, false);
mem_cgroup_commit_charge(new_page, memcg, false, false);
-- lru_cache_add_active_or_unevictable(new_page, vma);
-+ __lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
+ __lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
/*
- * We call the notify macro here because, when using secondary
- * mmu page tables (such as kvm shadow page tables), we want the
-@@ -3095,7 +3095,7 @@ int do_swap_page(struct vm_fault *vmf)
- if (unlikely(page != swapcache && swapcache)) {
- page_add_new_anon_rmap(page, vma, vmf->address, false);
+@@ -2847,7 +2847,7 @@ int do_swap_page(struct vm_fault *vmf)
+ mem_cgroup_commit_charge(page, memcg, true, false);
+ activate_page(page);
+ } else { /* ksm created a completely new copy */
+- page_add_new_anon_rmap(page, vma, vmf->address, false);
++ __page_add_new_anon_rmap(page, vma, vmf->address, false);
mem_cgroup_commit_charge(page, memcg, false, false);
-- lru_cache_add_active_or_unevictable(page, vma);
-+ __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
+ __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
+ }
+@@ -2985,7 +2985,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
+ }
+
+ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+- page_add_new_anon_rmap(page, vma, vmf->address, false);
++ __page_add_new_anon_rmap(page, vma, vmf->address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
+ __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
+ setpte:
+@@ -3237,7 +3237,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
+ /* copy-on-write page */
+ if (write && !(vmf->vma_flags & VM_SHARED)) {
+ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+- page_add_new_anon_rmap(page, vma, vmf->address, false);
++ __page_add_new_anon_rmap(page, vma, vmf->address, false);
+ mem_cgroup_commit_charge(page, memcg, false, false);
+ __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
} else {
- do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
- mem_cgroup_commit_charge(page, memcg, true, false);
-@@ -3246,7 +3246,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, vmf->address, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
-- lru_cache_add_active_or_unevictable(page, vma);
-+ __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
- setpte:
- set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
+diff --git a/mm/rmap.c b/mm/rmap.c
+index c1286d47aa1f..0c9f8ded669a 100644
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -1122,7 +1122,7 @@ void do_page_add_anon_rmap(struct page *page,
+ }
-@@ -3498,7 +3498,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
- inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
- page_add_new_anon_rmap(page, vma, vmf->address, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
-- lru_cache_add_active_or_unevictable(page, vma);
-+ __lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
- } else {
- inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
-diff --git a/mm/swap.c b/mm/swap.c
-index 2d337710218f..85fc6e78ca99 100644
---- a/mm/swap.c
-+++ b/mm/swap.c
-@@ -455,12 +455,12 @@ void lru_cache_add(struct page *page)
- * directly back onto it's zone's unevictable list, it does NOT use a
- * per cpu pagevec.
+ /**
+- * page_add_new_anon_rmap - add pte mapping to a new anonymous page
++ * __page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * @page: the page to add the mapping to
+ * @vma: the vm area in which the mapping is added
+ * @address: the user virtual address mapped
+@@ -1132,12 +1132,11 @@ void do_page_add_anon_rmap(struct page *page,
+ * This means the inc-and-test can be bypassed.
+ * Page does not have to be locked.
*/
--void lru_cache_add_active_or_unevictable(struct page *page,
-- struct vm_area_struct *vma)
-+void __lru_cache_add_active_or_unevictable(struct page *page,
-+ unsigned long vma_flags)
+-void page_add_new_anon_rmap(struct page *page,
++void __page_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, bool compound)
{
- VM_BUG_ON_PAGE(PageLRU(page), page);
+ int nr = compound ? hpage_nr_pages(page) : 1;
-- if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
-+ if (likely((vma_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
- SetPageActive(page);
- else if (!TestSetPageMlocked(page)) {
- /*
+- VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+ __SetPageSwapBacked(page);
+ if (compound) {
+ VM_BUG_ON_PAGE(!PageTransHuge(page), page);
--
2.7.4