[RFC PATCH v4 07/12] mm: memcontrol: make all the callers of page_memcg() safe
From: Muchun Song <hidden>
Date: 2021-05-27 09:34:51
Also in:
lkml
Subsystem:
control group - memory resource controller (memcg), filesystems (vfs and infrastructure), memory management, memory management - memory policy and migration, memory management - swap, the rest · Maintainers:
Johannes Weiner, Michal Hocko, Roman Gushchin, Shakeel Butt, Alexander Viro, Christian Brauner, Andrew Morton, David Hildenbrand, Chris Li, Kairui Song, Linus Torvalds
When we use objcg APIs to charge the LRU pages, the page will not hold a reference to the memcg associated with the page. So the caller of the page_memcg() should hold an rcu read lock or obtain a reference to the memcg associated with the page to protect memcg from being released. So introduce get_mem_cgroup_from_page() to obtain a reference to the memory cgroup associated with the page. In this patch, make all the callers hold an rcu read lock or obtain a reference to the memcg to protect memcg from being released when the LRU pages reparented. We do not need to adjust the callers of page_memcg() during the whole process of mem_cgroup_move_task(). Because the cgroup migration and memory cgroup offlining are serialized by @cgroup_mutex. In this routine, the LRU pages cannot be reparented to its parent memory cgroup. So page_memcg(page) is stable and cannot be released. This is a preparation for reparenting the LRU pages. Signed-off-by: Muchun Song <redacted> --- fs/buffer.c | 3 ++- fs/fs-writeback.c | 23 +++++++++++---------- include/linux/memcontrol.h | 39 ++++++++++++++++++++++++++++++++--- mm/memcontrol.c | 51 ++++++++++++++++++++++++++++++++++++---------- mm/migrate.c | 4 ++++ mm/page_io.c | 5 +++-- 6 files changed, 97 insertions(+), 28 deletions(-)
diff --git a/fs/buffer.c b/fs/buffer.c
index 673cfbef9eec..a542a47f6e27 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c@@ -848,7 +848,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, gfp |= __GFP_NOFAIL; /* The page lock pins the memcg */ - memcg = page_memcg(page); + memcg = get_mem_cgroup_from_page(page); old_memcg = set_active_memcg(memcg); head = NULL;
@@ -868,6 +868,7 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, set_bh_page(bh, page, offset); } out: + mem_cgroup_put(memcg); set_active_memcg(old_memcg); return head; /*
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7c46d1588a19..52b7d4563528 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c@@ -255,15 +255,13 @@ void __inode_attach_wb(struct inode *inode, struct page *page) if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; - if (page) { - memcg_css = mem_cgroup_css_from_page(page); - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); - } else { - /* must pin memcg_css, see wb_get_create() */ + /* must pin memcg_css, see wb_get_create() */ + if (page) + memcg_css = get_mem_cgroup_css_from_page(page); + else memcg_css = task_get_css(current, memory_cgrp_id); - wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); - css_put(memcg_css); - } + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); } if (!wb)
@@ -736,16 +734,16 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, if (!wbc->wb || wbc->no_cgroup_owner) return; - css = mem_cgroup_css_from_page(page); + css = get_mem_cgroup_css_from_page(page); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) - return; + goto out; id = css->id; if (id == wbc->wb_id) { wbc->wb_bytes += bytes; - return; + goto out; } if (id == wbc->wb_lcand_id)
@@ -758,6 +756,9 @@ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, wbc->wb_tcand_bytes += bytes; else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); + +out: + css_put(css); } EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 961ca3126d7f..99f745e23607 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h@@ -380,7 +380,7 @@ static inline bool PageMemcgKmem(struct page *page); * a valid memcg, but can be atomically swapped to the parent memcg. * * The caller must ensure that the returned memcg won't be released: - * e.g. acquire the rcu_read_lock or css_set_lock. + * e.g. acquire the rcu_read_lock or css_set_lock or cgroup_mutex. */ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) {
@@ -458,6 +458,31 @@ static inline struct mem_cgroup *page_memcg(struct page *page) } /* + * get_mem_cgroup_from_page - Obtain a reference on the memory cgroup associated + * with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup (and obtain a reference on it) + * associated with the page, or NULL. This function assumes that the page + * is known to have a proper memory cgroup pointer. It's not safe to call + * this function against some type of pages, e.g. slab pages or ex-slab + * pages. + */ +static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) +{ + struct mem_cgroup *memcg; + + rcu_read_lock(); +retry: + memcg = page_memcg(page); + if (unlikely(memcg && !css_tryget(&memcg->css))) + goto retry; + rcu_read_unlock(); + + return memcg; +} + +/* * page_memcg_rcu - locklessly get the memory cgroup associated with a page * @page: a pointer to the page struct *
@@ -870,7 +895,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm, return match; } -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); +struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg)
@@ -1030,10 +1055,13 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, static inline void count_memcg_page_event(struct page *page, enum vm_event_item idx) { - struct mem_cgroup *memcg = page_memcg(page); + struct mem_cgroup *memcg; + rcu_read_lock(); + memcg = page_memcg(page); if (memcg) count_memcg_events(memcg, idx, 1); + rcu_read_unlock(); } static inline void count_memcg_event_mm(struct mm_struct *mm,
@@ -1107,6 +1135,11 @@ static inline struct mem_cgroup *page_memcg(struct page *page) return NULL; } +static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) +{ + return NULL; +} + static inline struct mem_cgroup *page_memcg_rcu(struct page *page) { WARN_ON_ONCE(!rcu_read_lock_held());
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 06c6bcaa265a..d294504aea0c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c@@ -410,7 +410,7 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); #endif /** - * mem_cgroup_css_from_page - css of the memcg associated with a page + * get_mem_cgroup_css_from_page - get css of the memcg associated with a page * @page: page of interest * * If memcg is bound to the default hierarchy, css of the memcg associated
@@ -420,13 +420,15 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key); * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. */ -struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) +struct cgroup_subsys_state *get_mem_cgroup_css_from_page(struct page *page) { struct mem_cgroup *memcg; - memcg = page_memcg(page); + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return &root_mem_cgroup->css; - if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + memcg = get_mem_cgroup_from_page(page); + if (!memcg) memcg = root_mem_cgroup; return &memcg->css;
@@ -2015,7 +2017,9 @@ void lock_page_memcg(struct page *page) * The RCU lock is held throughout the transaction. The fast * path can get away without acquiring the memcg->move_lock * because page moving starts with an RCU grace period. - */ + * + * The RCU lock also protects the memcg from being freed. + */ rcu_read_lock(); if (mem_cgroup_disabled())
@@ -4591,7 +4595,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, struct bdi_writeback *wb) { - struct mem_cgroup *memcg = page_memcg(page); + struct mem_cgroup *memcg; struct memcg_cgwb_frn *frn; u64 now = get_jiffies_64(); u64 oldest_at = now;
@@ -4600,6 +4604,7 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, trace_track_foreign_dirty(page, wb); + memcg = get_mem_cgroup_from_page(page); /* * Pick the slot to use. If there is already a slot for @wb, keep * using it. If not replace the oldest one which isn't being
@@ -4638,6 +4643,7 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, frn->memcg_id = wb->memcg_css->id; frn->at = now; } + css_put(&memcg->css); } /* issue foreign writeback flushes for recorded foreign dirtying events */
@@ -6168,6 +6174,14 @@ static void mem_cgroup_move_charge(void) atomic_dec(&mc.from->moving_account); } +/* + * The cgroup migration and memory cgroup offlining are serialized by + * @cgroup_mutex. If we reach here, it means that the LRU pages cannot + * be reparented to its parent memory cgroup. So during the whole process + * of mem_cgroup_move_task(), page_memcg(page) is stable. So we do not + * need to worry about the memcg (returned from page_memcg()) being + * released even if we do not hold an rcu read lock. + */ static void mem_cgroup_move_task(void) { if (mc.to) {
@@ -6995,7 +7009,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) if (page_memcg(newpage)) return; - memcg = page_memcg(oldpage); + memcg = get_mem_cgroup_from_page(oldpage); VM_WARN_ON_ONCE_PAGE(!memcg, oldpage); if (!memcg) return;
@@ -7016,6 +7030,8 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) mem_cgroup_charge_statistics(memcg, newpage, nr_pages); memcg_check_events(memcg, newpage); local_irq_restore(flags); + + css_put(&memcg->css); } DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
@@ -7204,6 +7220,10 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; + /* + * Interrupts should be disabled by the caller (see the comments below), + * which can serve as RCU read-side critical sections. + */ memcg = page_memcg(page); VM_WARN_ON_ONCE_PAGE(!memcg, page);
@@ -7271,15 +7291,16 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; + rcu_read_lock(); memcg = page_memcg(page); VM_WARN_ON_ONCE_PAGE(!memcg, page); if (!memcg) - return 0; + goto out; if (!entry.val) { memcg_memory_event(memcg, MEMCG_SWAP_FAIL); - return 0; + goto out; } memcg = mem_cgroup_id_get_online(memcg);
@@ -7289,6 +7310,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); mem_cgroup_id_put(memcg); + rcu_read_unlock(); return -ENOMEM; }
@@ -7298,6 +7320,8 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); VM_BUG_ON_PAGE(oldid, page); mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); +out: + rcu_read_unlock(); return 0; }
@@ -7352,17 +7376,22 @@ bool mem_cgroup_swap_full(struct page *page) if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; + rcu_read_lock(); memcg = page_memcg(page); if (!memcg) - return false; + goto out; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { unsigned long usage = page_counter_read(&memcg->swap); if (usage * 2 >= READ_ONCE(memcg->swap.high) || - usage * 2 >= READ_ONCE(memcg->swap.max)) + usage * 2 >= READ_ONCE(memcg->swap.max)) { + rcu_read_unlock(); return true; + } } +out: + rcu_read_unlock(); return false; }
diff --git a/mm/migrate.c b/mm/migrate.c
index 76cb9b422dc2..c4594b20dcec 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c@@ -465,6 +465,10 @@ int migrate_page_move_mapping(struct address_space *mapping, struct lruvec *old_lruvec, *new_lruvec; struct mem_cgroup *memcg; + /* + * Irq is disabled, which can serve as RCU read-side critical + * sections. + */ memcg = page_memcg(page); old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat); new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
diff --git a/mm/page_io.c b/mm/page_io.c
index c493ce9ebcf5..81744777ab76 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c@@ -269,13 +269,14 @@ static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) struct cgroup_subsys_state *css; struct mem_cgroup *memcg; + rcu_read_lock(); memcg = page_memcg(page); if (!memcg) - return; + goto out; - rcu_read_lock(); css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); bio_associate_blkg_from_css(bio, css); +out: rcu_read_unlock(); } #else
--
2.11.0