Re: [RFC][PATCH] memcg: avoid THP split in task migration

From: Hillf Danton <hidden>
Date: 2012-02-29 14:40:13
Also in: lkml

On Wed, Feb 29, 2012 at 8:28 AM, KAMEZAWA Hiroyuki
[off-list ref] wrote:

On Tue, 28 Feb 2012 16:12:32 -0500
Naoya Horiguchi [off-list ref] wrote:

quoted

Currently we can't do task migration among memory cgroups without THP split,
which means processes heavily using THP experience large overhead in task
migration. This patch introduce the code for moving charge of THP and makes
THP more valuable.

Signed-off-by: Naoya Horiguchi <redacted>
Cc: Hillf Danton <redacted>


Thank you!

   ++hd;

A comment below.

quoted

---
 mm/memcontrol.c |   76 ++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 70 insertions(+), 6 deletions(-)

diff --git linux-next-20120228.orig/mm/memcontrol.c linux-next-20120228/mm/memcontrol.c
index c83aeb5..e97c041 100644
--- linux-next-20120228.orig/mm/memcontrol.c
+++ linux-next-20120228/mm/memcontrol.c

@@ -5211,6 +5211,42 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma,

      return ret;
 }

+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * We don't consider swapping or file mapped pages because THP does not
+ * support them for now.
+ */
+static int is_target_huge_pmd_for_mc(struct vm_area_struct *vma,

static int is_target_thp_for_mc(struct vm_area_struct *vma,
or
static int is_target_pmd_for_mc(struct vm_area_struct *vma,
sounds better?

quoted

+             unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+     struct page *page = NULL;
+     struct page_cgroup *pc;
+     int ret = 0;
+
+     if (pmd_present(pmd))
+             page = pmd_page(pmd);
+     if (!page)
+             return 0;
+     VM_BUG_ON(!PageHead(page));

With a huge and stable pmd, the above operations on page could be
compacted into one line?

	page = pmd_page(pmd);

quoted

+     get_page(page);
+     pc = lookup_page_cgroup(page);
+     if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
+             ret = MC_TARGET_PAGE;
+             if (target)

After checking target, looks only get_page() needed?

quoted

+                     target->page = page;
+     }
+     if (!ret || !target)
+             put_page(page);
+     return ret;
+}
+#else
+static inline int is_target_huge_pmd_for_mc(struct vm_area_struct *vma,
+             unsigned long addr, pmd_t pmd, union mc_target *target)
+{
+     return 0;
+}
+#endif
+
 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
                                      unsigned long addr, unsigned long end,
                                      struct mm_walk *walk)

@@ -5219,7 +5255,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,

      pte_t *pte;
      spinlock_t *ptl;

-     split_huge_page_pmd(walk->mm, pmd);
+     if (pmd_trans_huge_lock(pmd, vma) == 1) {
+             if (is_target_huge_pmd_for_mc(vma, addr, *pmd, NULL))

		if (is_target_huge_pmd_for_mc(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
looks clearer

quoted

+                     mc.precharge += HPAGE_PMD_NR;

As HPAGE_PMD_NR is directly used, compiler beeps if THP disabled, I guess.

If yes, please cleanup huge_mm.h with s/BUG()/BUILD_BUG()/ and with
both HPAGE_PMD_ORDER and HPAGE_PMD_NR also defined,
to easy others a bit.

quoted

+             spin_unlock(&walk->mm->page_table_lock);

		spin_unlock(&vma->mm->page_table_lock);
looks clearer

quoted

+             cond_resched();
+             return 0;
+     }

      pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
      for (; addr != end; pte++, addr += PAGE_SIZE)

@@ -5378,16 +5420,38 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,

      struct vm_area_struct *vma = walk->private;
      pte_t *pte;
      spinlock_t *ptl;
+     int type;
+     union mc_target target;
+     struct page *page;
+     struct page_cgroup *pc;
+
+     if (pmd_trans_huge_lock(pmd, vma) == 1) {
+             if (!mc.precharge)
+                     return 0;

Bang, return without page table lock released.

quoted

+             type = is_target_huge_pmd_for_mc(vma, addr, *pmd, &target);
+             if (type == MC_TARGET_PAGE) {
+                     page = target.page;
+                     if (!isolate_lru_page(page)) {
+                             pc = lookup_page_cgroup(page);

Here is a diffuclut point. Please see mem_cgroup_split_huge_fixup(). It splits

Hard and hard point IMO.

updates memcg's status of splitted pages under lru_lock and compound_lock
but not under mm->page_table_lock.

Looking into split_huge_page()

       split_huge_page()  # take anon_vma lock
               __split_huge_page()
                       __split_huge_page_refcount() # take lru_lock, compound_lock.
                               mem_cgroup_split_huge_fixup()
                       __split_huge_page_map() # take page table lock.

[copied from Naoya-san's reply]

I'm afraid this callchain is not correct.

s/correct/complete/

Page table lock seems to be taken before we enter the main split work.

   split_huge_page
       take anon_vma lock
       __split_huge_page
           __split_huge_page_splitting
               lock page_table_lock     <--- *1
               page_check_address_pmd
               unlock page_table_lock

Yeah, splitters are blocked.
Plus from the *ugly* documented lock function(another
cleanup needed), the embedded mmap_sem also blocks splitters.

That said, could we simply wait and see results of test cases?

-hd

/* mmap_sem must be held on entry */
static inline int pmd_trans_huge_lock(pmd_t *pmd,
				      struct vm_area_struct *vma)
{
	VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem));
	if (pmd_trans_huge(*pmd))
		return __pmd_trans_huge_lock(pmd, vma);
	else
		return 0;
}

           __split_huge_page_refcount
               lock lru_lock
               compound_lock
               mem_cgroup_split_huge_fixup
               compound_unlock
               unlock lru_lock
           __split_huge_page_map
               lock page_table_lock
               ... some work
               unlock page_table_lock
       unlock anon_vma lock

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help