Thread (30 messages) 30 messages, 5 authors, 2012-01-19

Re: [patch 1/2] mm: memcg: per-memcg reclaim statistics

From: Ying Han <hidden>
Date: 2012-01-10 23:54:10
Also in: linux-mm, lkml

Thank you for the patch and the stats looks reasonable to me, few
questions as below:

On Tue, Jan 10, 2012 at 7:02 AM, Johannes Weiner [off-list ref] wrote:
With the single per-zone LRU gone and global reclaim scanning
individual memcgs, it's straight-forward to collect meaningful and
accurate per-memcg reclaim statistics.

This adds the following items to memory.stat:
Some of the previous discussions including patches have similar stats
in memory.vmscan_stat API, which collects all the per-memcg vmscan
stats. I would like to understand more why we add into memory.stat
instead, and do we have plan to keep extending memory.stat for those
vmstat like stats?
pgreclaim
Not sure if we want to keep this more consistent to /proc/vmstat, then
it will be "pgsteal"?
pgscan

 Number of pages reclaimed/scanned from that memcg due to its own
 hard limit (or physical limit in case of the root memcg) by the
 allocating task.

kswapd_pgreclaim
kswapd_pgscan
we have "pgscan_kswapd_*" in vmstat, so maybe ?
"pgsteal_kswapd"
"pgscan_kswapd"
 Reclaim activity from kswapd due to the memcg's own limit.  Only
 applicable to the root memcg for now since kswapd is only triggered
 by physical limits, but kswapd-style reclaim based on memcg hard
 limits is being developped.

hierarchy_pgreclaim
hierarchy_pgscan
hierarchy_kswapd_pgreclaim
hierarchy_kswapd_pgscan
"pgsteal_hierarchy"
"pgsteal_kswapd_hierarchy"
..

No strong option on the naming, but try to make it more consistent to
existing API.

quoted hunk ↗ jump to hunk
 Reclaim activity due to limitations in one of the memcg's parents.

Signed-off-by: Johannes Weiner <redacted>
---
 Documentation/cgroups/memory.txt |    4 ++
 include/linux/memcontrol.h       |   10 +++++
 mm/memcontrol.c                  |   84 +++++++++++++++++++++++++++++++++++++-
 mm/vmscan.c                      |    7 +++
 4 files changed, 103 insertions(+), 2 deletions(-)
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index cc0ebc5..eb9e982 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -389,6 +389,10 @@ mapped_file        - # of bytes of mapped file (includes tmpfs/shmem)
 pgpgin         - # of pages paged in (equivalent to # of charging events).
 pgpgout                - # of pages paged out (equivalent to # of uncharging events).
 swap           - # of bytes of swap usage
+pgreclaim      - # of pages reclaimed due to this memcg's limit
+pgscan         - # of pages scanned due to this memcg's limit
+kswapd_*       - # reclaim activity by background daemon due to this memcg's limit
+hierarchy_*    - # reclaim activity due to pressure from parental memcg
 inactive_anon  - # of bytes of anonymous memory and swap cache memory on
               LRU list.
 active_anon    - # of bytes of anonymous and swap cache memory on active
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index bd3b102..6c1d69e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -121,6 +121,8 @@ struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
                                                     struct zone *zone);
 struct zone_reclaim_stat*
 mem_cgroup_get_reclaim_stat_from_page(struct page *page);
+void mem_cgroup_account_reclaim(struct mem_cgroup *, struct mem_cgroup *,
+                               unsigned long, unsigned long, bool);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
                                       struct task_struct *p);
 extern void mem_cgroup_replace_page_cache(struct page *oldpage,
@@ -347,6 +349,14 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
       return NULL;
 }

+static inline void mem_cgroup_account_reclaim(struct mem_cgroup *root,
+                                             struct mem_cgroup *memcg,
+                                             unsigned long nr_reclaimed,
+                                             unsigned long nr_scanned,
+                                             bool kswapd)
+{
+}
+
 static inline void
 mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8e2a80d..170dff4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -91,12 +91,23 @@ enum mem_cgroup_stat_index {
       MEM_CGROUP_STAT_NSTATS,
 };

+#define MEM_CGROUP_EVENTS_KSWAPD 2
+#define MEM_CGROUP_EVENTS_HIERARCHY 4
+
 enum mem_cgroup_events_index {
       MEM_CGROUP_EVENTS_PGPGIN,       /* # of pages paged in */
       MEM_CGROUP_EVENTS_PGPGOUT,      /* # of pages paged out */
       MEM_CGROUP_EVENTS_COUNT,        /* # of pages paged in/out */
       MEM_CGROUP_EVENTS_PGFAULT,      /* # of page-faults */
       MEM_CGROUP_EVENTS_PGMAJFAULT,   /* # of major page-faults */
+       MEM_CGROUP_EVENTS_PGRECLAIM,
+       MEM_CGROUP_EVENTS_PGSCAN,
+       MEM_CGROUP_EVENTS_KSWAPD_PGRECLAIM,
+       MEM_CGROUP_EVENTS_KSWAPD_PGSCAN,
+       MEM_CGROUP_EVENTS_HIERARCHY_PGRECLAIM,
+       MEM_CGROUP_EVENTS_HIERARCHY_PGSCAN,
+       MEM_CGROUP_EVENTS_HIERARCHY_KSWAPD_PGRECLAIM,
+       MEM_CGROUP_EVENTS_HIERARCHY_KSWAPD_PGSCAN,
missing comment here?
quoted hunk ↗ jump to hunk
       MEM_CGROUP_EVENTS_NSTATS,
 };
 /*
@@ -889,6 +900,38 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
       return (memcg == root_mem_cgroup);
 }

+/**
+ * mem_cgroup_account_reclaim - update per-memcg reclaim statistics
+ * @root: memcg that triggered reclaim
+ * @memcg: memcg that is actually being scanned
+ * @nr_reclaimed: number of pages reclaimed from @memcg
+ * @nr_scanned: number of pages scanned from @memcg
+ * @kswapd: whether reclaiming task is kswapd or allocator itself
+ */
+void mem_cgroup_account_reclaim(struct mem_cgroup *root,
+                               struct mem_cgroup *memcg,
+                               unsigned long nr_reclaimed,
+                               unsigned long nr_scanned,
+                               bool kswapd)
+{
+       unsigned int offset = 0;
+
+       if (!root)
+               root = root_mem_cgroup;
+
+       if (kswapd)
+               offset += MEM_CGROUP_EVENTS_KSWAPD;
+       if (root != memcg)
+               offset += MEM_CGROUP_EVENTS_HIERARCHY;
Just to be clear, here root cgroup has hierarchy_* stats always 0 ?
Also, we might want to consider renaming the root here, something like
target? The root is confusing with root_mem_cgroup.

--Ying
quoted hunk ↗ jump to hunk
+
+       preempt_disable();
+       __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGRECLAIM + offset],
+                      nr_reclaimed);
+       __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGSCAN + offset],
+                      nr_scanned);
+       preempt_enable();
+}
+
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
       struct mem_cgroup *memcg;
@@ -1662,6 +1705,8 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
       excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT;

       while (1) {
+               unsigned long nr_reclaimed;
+
               victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
               if (!victim) {
                       loop++;
@@ -1687,8 +1732,11 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
               }
               if (!mem_cgroup_reclaimable(victim, false))
                       continue;
-               total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
-                                                    zone, &nr_scanned);
+               nr_reclaimed = mem_cgroup_shrink_node_zone(victim, gfp_mask, false,
+                                                          zone, &nr_scanned);
+               mem_cgroup_account_reclaim(root_mem_cgroup, victim, nr_reclaimed,
+                                          nr_scanned, current_is_kswapd());
+               total += nr_reclaimed;
               *total_scanned += nr_scanned;
               if (!res_counter_soft_limit_excess(&root_memcg->res))
                       break;
@@ -4023,6 +4071,14 @@ enum {
       MCS_SWAP,
       MCS_PGFAULT,
       MCS_PGMAJFAULT,
+       MCS_PGRECLAIM,
+       MCS_PGSCAN,
+       MCS_KSWAPD_PGRECLAIM,
+       MCS_KSWAPD_PGSCAN,
+       MCS_HIERARCHY_PGRECLAIM,
+       MCS_HIERARCHY_PGSCAN,
+       MCS_HIERARCHY_KSWAPD_PGRECLAIM,
+       MCS_HIERARCHY_KSWAPD_PGSCAN,
       MCS_INACTIVE_ANON,
       MCS_ACTIVE_ANON,
       MCS_INACTIVE_FILE,
@@ -4047,6 +4103,14 @@ struct {
       {"swap", "total_swap"},
       {"pgfault", "total_pgfault"},
       {"pgmajfault", "total_pgmajfault"},
+       {"pgreclaim", "total_pgreclaim"},
+       {"pgscan", "total_pgscan"},
+       {"kswapd_pgreclaim", "total_kswapd_pgreclaim"},
+       {"kswapd_pgscan", "total_kswapd_pgscan"},
+       {"hierarchy_pgreclaim", "total_hierarchy_pgreclaim"},
+       {"hierarchy_pgscan", "total_hierarchy_pgscan"},
+       {"hierarchy_kswapd_pgreclaim", "total_hierarchy_kswapd_pgreclaim"},
+       {"hierarchy_kswapd_pgscan", "total_hierarchy_kswapd_pgscan"},
       {"inactive_anon", "total_inactive_anon"},
       {"active_anon", "total_active_anon"},
       {"inactive_file", "total_inactive_file"},
@@ -4079,6 +4143,22 @@ mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
       s->stat[MCS_PGFAULT] += val;
       val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
       s->stat[MCS_PGMAJFAULT] += val;
+       val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGRECLAIM);
+       s->stat[MCS_PGRECLAIM] += val;
+       val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGSCAN);
+       s->stat[MCS_PGSCAN] += val;
+       val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_KSWAPD_PGRECLAIM);
+       s->stat[MCS_KSWAPD_PGRECLAIM] += val;
+       val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_KSWAPD_PGSCAN);
+       s->stat[MCS_KSWAPD_PGSCAN] += val;
+       val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_HIERARCHY_PGRECLAIM);
+       s->stat[MCS_HIERARCHY_PGRECLAIM] += val;
+       val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_HIERARCHY_PGSCAN);
+       s->stat[MCS_HIERARCHY_PGSCAN] += val;
+       val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_HIERARCHY_KSWAPD_PGRECLAIM);
+       s->stat[MCS_HIERARCHY_KSWAPD_PGRECLAIM] += val;
+       val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_HIERARCHY_KSWAPD_PGSCAN);
+       s->stat[MCS_HIERARCHY_KSWAPD_PGSCAN] += val;

       /* per zone stat */
       val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index c631234..e3fd8a7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2115,12 +2115,19 @@ static void shrink_zone(int priority, struct zone *zone,
       memcg = mem_cgroup_iter(root, NULL, &reclaim);
       do {
+               unsigned long nr_reclaimed = sc->nr_reclaimed;
+               unsigned long nr_scanned = sc->nr_scanned;
               struct mem_cgroup_zone mz = {
                       .mem_cgroup = memcg,
                       .zone = zone,
               };

               shrink_mem_cgroup_zone(priority, &mz, sc);
+
+               mem_cgroup_account_reclaim(root, memcg,
+                                          sc->nr_reclaimed - nr_reclaimed,
+                                          sc->nr_scanned - nr_scanned,
+                                          current_is_kswapd());
               /*
                * Limit reclaim has historically picked one memcg and
                * scanned it with decreasing priority levels until
--
1.7.7.5
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help