Control Group for Memory




 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @zone: zone of the page
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
        struct mem_cgroup_per_zone *mz;
        struct mem_cgroup *memcg;
        struct lruvec *lruvec; 

        if (mem_cgroup_disabled()) {
                lruvec = &zone->lruvec;
                goto out;

        memcg = page->mem_cgroup;
         * Swapcache readahead pages are added to the LRU - and
         * possibly migrated - before they are charged.
        if (!memcg)
                memcg = root_mem_cgroup;

        mz = mem_cgroup_page_zoneinfo(memcg, page);
        lruvec = &mz->lruvec;
         * Since a node can be onlined after the mem_cgroup was created,
         * we have to be prepared to initialize lruvec->zone here;
         * and if offlined then reonlined, we need to reinitialize it.
        if (unlikely(lruvec->zone != zone))
                lruvec->zone = zone;
        return lruvec;             

메모리 cgroup이 disable되어 있는 경우 zone이 가리키는 lruvec 정보를 가져오고 enable 되어 있는 경우 메모리 cgroup에 연결되어 있는 mem_cgroup_per_zone에 존재하는 lruvec 정보를 리턴한다. 리턴되는 lruvec 내의 zone 정보는 인수로 지정한 zone으로 갱신된다.

  • if (mem_cgroup_disabled()) { lruvec = &zone->lruvec; goto out; }
    • memory control group 서브 시스템이 disable되어 있는 경우 zone->lruvec을 기억하고 out:으로 이동한다.
  • if (!memcg) memcg = root_mem_cgroup;
    • 페이지에 지정된 mem_cgroup이 없는 경우 root_mem_cgroup을 대입한다.
  • mz = mem_cgroup_page_zoneinfo(memcg, page);
    • 페이지 정보로 mem_cgroup_per_zone 구조체 정보를 알아온다.
  • lruvec = &mz->lruvec;
    • 찾은 mem_cgroup_per_zone 구조체의 멤버 lruvec
  • if (unlikely(lruvec->zone != zone)) lruvec->zone = zone;
    • lruvec->zone을 갱신한다.




static inline bool mem_cgroup_disabled(void) 
        if (memory_cgrp_subsys.disabled)
                return true;
        return false;

메모리 Control Group 서브시스템의 disable 여부를 리턴한다.




static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
        int nid = page_to_nid(page);
        int zid = page_zonenum(page);

        return &memcg->nodeinfo[nid]->zoneinfo[zid];

페이지 정보로 mem_cgroup_per_zone 정보를 알아온다.

  •  int nid = page_to_nid(page);
    • Sparse 메모리 시스템이 아닌 경우 page->flags 정보에 있는 노드 id
    • Sparse 메모리 시스템인 경우 페이지에 해당하는 섹션에 담긴 노드 id
  • int zid = page_zonenum(page);
    • 페이지에 담긴 zone 타입
  • return &memcg->nodeinfo[nid]->zoneinfo[zid];
    • memcg 뒤에 따라 붙은 주소에서 지정된 노드와 zone의 mem_cgroup_per_zone 구조체 주소를 알아온다.

아래 그림은 2 개의 노드와 3개의 zone으로 구성된 memory control group 서브 시스템을 보여준다.




mem_cgroup 구조체


 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. May be even add a low water mark, such that
 * no reclaim occurs from a cgroup at it's low water mark, this is
 * a feature that will be implemented much later in the future.
struct mem_cgroup {
        struct cgroup_subsys_state css;

        /* Accounted resources */
        struct page_counter memory;
        struct page_counter memsw;
        struct page_counter kmem;

        /* Normal memory consumption range */
        unsigned long low;
        unsigned long high;

        unsigned long soft_limit;

        /* vmpressure notifications */
        struct vmpressure vmpressure;

        /* css_online() has been completed */
        int initialized;

         * Should the accounting and control be hierarchical, per subtree?
        bool use_hierarchy;

        bool            oom_lock;
        atomic_t        under_oom;
        atomic_t        oom_wakeups;

        int     swappiness;
        /* OOM-Killer disable */
        int             oom_kill_disable;

        /* protect arrays of thresholds */
        struct mutex thresholds_lock;

        /* thresholds for memory usage. RCU-protected */
        struct mem_cgroup_thresholds thresholds;

        /* thresholds for mem+swap usage. RCU-protected */
        struct mem_cgroup_thresholds memsw_thresholds;

        /* For oom notifier event fd */
        struct list_head oom_notify;


         * Should we move charges of a task when a task is moved into this
         * mem_cgroup ? And what type of charges should we move ?
        unsigned long move_charge_at_immigrate;
         * set > 0 if pages under this cgroup are moving to other cgroup.
        atomic_t                moving_account;
        /* taken only while moving_account > 0 */
        spinlock_t              move_lock;
        struct task_struct      *move_lock_task;
        unsigned long           move_lock_flags;
         * percpu counter.
        struct mem_cgroup_stat_cpu __percpu *stat;
         * used when a cpu is offlined or other synchronizations
         * See mem_cgroup_read_stat().
        struct mem_cgroup_stat_cpu nocpu_base;
        spinlock_t pcp_counter_lock;

#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
        struct cg_proto tcp_mem;
#if defined(CONFIG_MEMCG_KMEM)
        /* Index in the kmem_cache->memcg_params.memcg_caches array */
        int kmemcg_id;
        bool kmem_acct_activated;
        bool kmem_acct_active;

        int last_scanned_node;
        nodemask_t      scan_nodes;
        atomic_t        numainfo_events;
        atomic_t        numainfo_updating;

        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;

        struct mem_cgroup_per_node *nodeinfo[0];
        /* WARNING: nodeinfo must be the last member here */


mem_cgroup_per_zone 구조체


 * per-zone information in memory controller.
struct mem_cgroup_per_zone {
        struct lruvec           lruvec;
        unsigned long           lru_size[NR_LRU_LISTS];

        struct reclaim_iter     iter[DEF_PRIORITY + 1];

        struct rb_node          tree_node;      /* RB tree node */
        unsigned long           usage_in_excess;/* Set to the value by which */
                                                /* the soft limit is exceeded*/
        bool                    on_tree;
        struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
                                                /* use container_of        */


 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
 * queues ("queue_length >> 12") during an aging round.
#define DEF_PRIORITY 12


reclaim_iter 구조체


/*
 * Cursor into a memcg hierarchy used by reclaim; presumably lets successive
 * reclaim rounds resume from the last visited group — confirm against
 * mem_cgroup_iter() in memcontrol.c.
 */
struct reclaim_iter {
        struct mem_cgroup *position;
        /* scan generation, increased every round-trip */
        unsigned int generation;
};



댓글 남기기