mem_cgroup_page_lruvec()
mm/memcontrol.c
/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @zone: zone of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 *
 * Returns the per-memcg lruvec for @page, or the zone's single lruvec
 * when the memory controller is disabled.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
struct mem_cgroup_per_zone *mz;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
/* memcg off: there is only one lruvec per zone, embedded in the zone. */
if (mem_cgroup_disabled()) {
lruvec = &zone->lruvec;
goto out;
}
memcg = page->mem_cgroup;
/*
 * Swapcache readahead pages are added to the LRU - and
 * possibly migrated - before they are charged.
 */
if (!memcg)
memcg = root_mem_cgroup;
/* Locate this memcg's per-zone state for the page's node/zone. */
mz = mem_cgroup_page_zoneinfo(memcg, page);
lruvec = &mz->lruvec;
out:
/*
 * Since a node can be onlined after the mem_cgroup was created,
 * we have to be prepared to initialize lruvec->zone here;
 * and if offlined then reonlined, we need to reinitialize it.
 */
if (unlikely(lruvec->zone != zone))
lruvec->zone = zone;
return lruvec;
}
메모리 cgroup이 disable되어 있는 경우 zone이 가리키는 lruvec 정보를 가져오고 enable 되어 있는 경우 메모리 cgroup에 연결되어 있는 mem_cgroup_per_zone에 존재하는 lruvec 정보를 리턴한다. 리턴되는 lruvec 내의 zone 정보는 인수로 지정한 zone으로 갱신된다.
- if (mem_cgroup_disabled()) { lruvec = &zone->lruvec; goto out; }
- memory control group 서브 시스템이 disable되어 있는 경우 zone에 내장된 zone->lruvec을 lruvec에 대입하고 out: 레이블로 이동한다.
- if (!memcg) memcg = root_mem_cgroup;
- 페이지에 지정된 mem_cgroup이 없는 경우 root_mem_cgroup을 대입한다.
- mz = mem_cgroup_page_zoneinfo(memcg, page);
- 페이지 정보로 mem_cgroup_per_zone 구조체 정보를 알아온다.
- lruvec = &mz->lruvec;
- 찾은 mem_cgroup_per_zone 구조체의 멤버 lruvec의 주소를 리턴할 lruvec으로 지정한다.
- if (unlikely(lruvec->zone != zone)) lruvec->zone = zone;
- lruvec->zone을 갱신한다.
mem_cgroup_disabled()
include/linux/memcontrol.h
/*
 * mem_cgroup_disabled - has the memory cgroup controller been disabled?
 *
 * Returns true when the "memory" cgroup subsystem is marked disabled
 * (memory_cgrp_subsys.disabled is non-zero), false otherwise.
 */
static inline bool mem_cgroup_disabled(void)
{
	/*
	 * The original if (x) return true; return false; dance is
	 * redundant: conversion to bool already yields true/false.
	 */
	return memory_cgrp_subsys.disabled;
}
메모리 Control Group 서브시스템의 disable 여부를 리턴한다.
mem_cgroup_page_zoneinfo()
mm/memcontrol.c
/*
 * Look up the mem_cgroup_per_zone entry of @memcg that covers the
 * node and zone which @page belongs to.
 */
static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int node = page_to_nid(page);
	int zone_idx = page_zonenum(page);

	return &memcg->nodeinfo[node]->zoneinfo[zone_idx];
}
페이지 정보로 mem_cgroup_per_zone 정보를 알아온다.
- int nid = page_to_nid(page);
- Sparse 메모리 시스템이 아닌 경우 page->flags 정보에 있는 노드 id
- Sparse 메모리 시스템인 경우 페이지에 해당하는 섹션에 담긴 노드 id
- int zid = page_zonenum(page);
- 페이지에 담긴 zone 타입
- return &memcg->nodeinfo[nid]->zoneinfo[zid];
- mem_cgroup 구조체 끝에 이어 붙은 nodeinfo[] 가변 배열에서 지정된 노드와 zone에 해당하는 mem_cgroup_per_zone 구조체 주소를 알아온다.
아래 그림은 2 개의 노드와 3개의 zone으로 구성된 memory control group 서브 시스템을 보여준다.
구조체
mem_cgroup 구조체
mm/memcontrol.c
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. May be even add a low water mark, such that
 * no reclaim occurs from a cgroup at it's low water mark, this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
struct cgroup_subsys_state css; /* base cgroup-core state for this group */
/* Accounted resources */
struct page_counter memory; /* pages charged to this group */
struct page_counter memsw; /* memory + swap (see memsw_thresholds below) */
struct page_counter kmem; /* kernel-memory accounting */
/* Normal memory consumption range */
unsigned long low;
unsigned long high;
unsigned long soft_limit;
/* vmpressure notifications */
struct vmpressure vmpressure;
/* css_online() has been completed */
int initialized;
/*
 * Should the accounting and control be hierarchical, per subtree?
 */
bool use_hierarchy;
bool oom_lock;
atomic_t under_oom;
atomic_t oom_wakeups;
int swappiness;
/* OOM-Killer disable */
int oom_kill_disable;
/* protect arrays of thresholds */
struct mutex thresholds_lock;
/* thresholds for memory usage. RCU-protected */
struct mem_cgroup_thresholds thresholds;
/* thresholds for mem+swap usage. RCU-protected */
struct mem_cgroup_thresholds memsw_thresholds;
/* For oom notifier event fd */
struct list_head oom_notify;
/*
 * Should we move charges of a task when a task is moved into this
 * mem_cgroup ? And what type of charges should we move ?
 */
unsigned long move_charge_at_immigrate;
/*
 * set > 0 if pages under this cgroup are moving to other cgroup.
 */
atomic_t moving_account;
/* taken only while moving_account > 0 */
spinlock_t move_lock;
struct task_struct *move_lock_task;
unsigned long move_lock_flags;
/*
 * percpu counter.
 */
struct mem_cgroup_stat_cpu __percpu *stat;
/*
 * used when a cpu is offlined or other synchronizations
 * See mem_cgroup_read_stat().
 */
struct mem_cgroup_stat_cpu nocpu_base;
spinlock_t pcp_counter_lock;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
/* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
bool kmem_acct_activated;
bool kmem_acct_active;
#endif
int last_scanned_node;
#if MAX_NUMNODES > 1
nodemask_t scan_nodes;
atomic_t numainfo_events;
atomic_t numainfo_updating;
#endif
/* List of events which userspace want to receive */
struct list_head event_list;
spinlock_t event_list_lock;
/* Per-node info: C99 flexible-array style trailer, sized at allocation. */
struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
};
mem_cgroup_per_zone 구조체
mm/memcontrol.c
/*
 * per-zone information in memory controller.
 * One instance exists per (memcg, node, zone) triple; see
 * mem_cgroup_page_zoneinfo() for the lookup.
 */
struct mem_cgroup_per_zone {
struct lruvec lruvec; /* this memcg's LRU lists for this zone */
unsigned long lru_size[NR_LRU_LISTS]; /* page count on each LRU list */
struct reclaim_iter iter[DEF_PRIORITY + 1]; /* one iterator per scan priority */
struct rb_node tree_node; /* RB tree node */
unsigned long usage_in_excess;/* Set to the value by which */
/* the soft limit is exceeded*/
bool on_tree; /* currently linked into the RB tree? */
struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */
};
/*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
 * queues ("queue_length >> 12") during an aging round.
 */
#define DEF_PRIORITY 12 /* 1 << 12 == 4096 */
reclaim_iter 구조체
mm/memcontrol.c
/* Iterator state for walking the memcg hierarchy during reclaim. */
struct reclaim_iter {
/* NOTE(review): presumably the last memcg visited by the shared
 * hierarchy walk — confirm against mem_cgroup_iter() users. */
struct mem_cgroup *position;
/* scan generation, increased every round-trip */
unsigned int generation;
};
