mem_cgroup_page_lruvec()
mm/memcontrol.c
/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @zone: zone of the page
 *
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup *memcg;
	struct lruvec *lruvec;

	/* memcg disabled: every zone has a single global lruvec */
	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	memcg = page->mem_cgroup;
	/*
	 * Swapcache readahead pages are added to the LRU - and
	 * possibly migrated - before they are charged.
	 */
	if (!memcg)
		memcg = root_mem_cgroup;

	/* per-memcg, per-zone lruvec for this page's node/zone */
	mz = mem_cgroup_page_zoneinfo(memcg, page);
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}
메모리 cgroup이 disable되어 있는 경우 zone이 가리키는 lruvec 정보를 가져오고 enable 되어 있는 경우 메모리 cgroup에 연결되어 있는 mem_cgroup_per_zone에 존재하는 lruvec 정보를 리턴한다. 리턴되는 lruvec 내의 zone 정보는 인수로 지정한 zone으로 갱신된다.
- if (mem_cgroup_disabled()) { lruvec = &zone->lruvec; goto out; }
- memory control group 서브 시스템이 disable되어 있는 경우 zone->lruvec을 기억하고 out:으로 이동한다.
- if (!memcg) memcg = root_mem_cgroup;
- 페이지에 지정된 mem_cgroup이 없는 경우 root_mem_cgroup을 대입한다.
- mz = mem_cgroup_page_zoneinfo(memcg, page);
- 페이지 정보로 mem_cgroup_per_zone 구조체 정보를 알아온다.
- lruvec = &mz->lruvec;
- 찾은 mem_cgroup_per_zone 구조체의 멤버 lruvec의 주소를 리턴할 lruvec으로 대입한다.
- if (unlikely(lruvec->zone != zone)) lruvec->zone = zone;
- lruvec->zone이 인수로 받은 zone과 다른 경우 zone으로 갱신한다. mem_cgroup 생성 이후에 노드가 online되거나, offline 후 다시 online된 경우 lruvec->zone을 (재)초기화해야 하기 때문이다.
mem_cgroup_disabled()
include/linux/memcontrol.h
/*
 * Return true when the memory cgroup controller has been disabled
 * (memory_cgrp_subsys.disabled is set), false otherwise.
 */
static inline bool mem_cgroup_disabled(void)
{
	if (memory_cgrp_subsys.disabled)
		return true;
	return false;
}
메모리 Control Group 서브시스템의 disable 여부를 리턴한다.
mem_cgroup_page_zoneinfo()
mm/memcontrol.c
/*
 * Look up the mem_cgroup_per_zone for @page within @memcg, indexed by
 * the page's node id and zone number.
 */
static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);	/* node the page belongs to */
	int zid = page_zonenum(page);	/* zone index within that node */

	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
페이지 정보로 mem_cgroup_per_zone 정보를 알아온다.
- int nid = page_to_nid(page);
- Sparse 메모리 시스템이 아닌 경우 page->flags 정보에 있는 노드 id
- Sparse 메모리 시스템인 경우 페이지에 해당하는 섹션에 담긴 노드 id
- int zid = page_zonenum(page);
- 페이지에 담긴 zone 타입
- return &memcg->nodeinfo[nid]->zoneinfo[zid];
- memcg 뒤에 따라 붙은 주소에서 지정된 노드와 zone의 mem_cgroup_per_zone 구조체 주소를 알아온다.
아래 그림은 2 개의 노드와 3개의 zone으로 구성된 memory control group 서브 시스템을 보여준다.
구조체
mem_cgroup 구조체
mm/memcontrol.c
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. May be even add a low water mark, such that
 * no reclaim occurs from a cgroup at it's low water mark, this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;

	/* Accounted resources */
	struct page_counter memory;
	struct page_counter memsw;
	struct page_counter kmem;

	/* Normal memory consumption range */
	unsigned long low;
	unsigned long high;

	unsigned long soft_limit;

	/* vmpressure notifications */
	struct vmpressure vmpressure;

	/* css_online() has been completed */
	int initialized;

	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;

	bool		oom_lock;
	atomic_t	under_oom;
	atomic_t	oom_wakeups;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t		moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t		move_lock;
	struct task_struct	*move_lock_task;
	unsigned long		move_lock_flags;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu __percpu *stat;
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
	struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
	/* Index in the kmem_cache->memcg_params.memcg_caches array */
	int kmemcg_id;
	bool kmem_acct_activated;
	bool kmem_acct_active;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

	/* List of events which userspace want to receive */
	struct list_head event_list;
	spinlock_t event_list_lock;

	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};
mem_cgroup_per_zone 구조체
mm/memcontrol.c
/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;			/* per-memcg LRU lists for this zone */
	unsigned long		lru_size[NR_LRU_LISTS];	/* page count per LRU list */

	struct reclaim_iter	iter[DEF_PRIORITY + 1];	/* reclaim iterator state per priority */

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long		usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};
/* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. */ #define DEF_PRIORITY 12
reclaim_iter 구조체
mm/memcontrol.c
/* Cursor used to resume memcg-hierarchy reclaim where the last round left off. */
struct reclaim_iter {
	struct mem_cgroup *position;	/* last memcg visited in the hierarchy walk */
	/* scan generation, increased every round-trip */
	unsigned int generation;
};