Control Group for Memory

mem_cgroup_page_lruvec()

mm/memcontrol.c

/**
 * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 * @page: the page
 * @zone: zone of the page
 *                                                  
 * This function is only safe when following the LRU page isolation
 * and putback protocol: the LRU lock must be held, and the page must
 * either be PageLRU() or the caller must have isolated/allocated it.
 */
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
{                                                   
        struct mem_cgroup_per_zone *mz;
        struct mem_cgroup *memcg;
        struct lruvec *lruvec; 

        if (mem_cgroup_disabled()) {
                lruvec = &zone->lruvec;
                goto out;
        }

        memcg = page->mem_cgroup;
        /*
         * Swapcache readahead pages are added to the LRU - and
         * possibly migrated - before they are charged.
         */
        if (!memcg)
                memcg = root_mem_cgroup;

        mz = mem_cgroup_page_zoneinfo(memcg, page);
        lruvec = &mz->lruvec;
out:
        /*
         * Since a node can be onlined after the mem_cgroup was created,
         * we have to be prepared to initialize lruvec->zone here;
         * and if offlined then reonlined, we need to reinitialize it.
         */
        if (unlikely(lruvec->zone != zone))
                lruvec->zone = zone;
        return lruvec;             
}

메모리 cgroup이 disable되어 있는 경우 인수로 받은 zone에 내장된 lruvec(&zone->lruvec)을 가져오고, enable되어 있는 경우 페이지에 지정된 메모리 cgroup(지정되지 않았으면 root_mem_cgroup)의 mem_cgroup_per_zone에 존재하는 lruvec을 리턴한다. 리턴되는 lruvec 내의 zone 정보가 인수로 지정한 zone과 다르면 해당 zone으로 갱신된다.

if (mem_cgroup_disabled()) { lruvec = &zone->lruvec; goto out; }
- memory control group 서브 시스템이 disable되어 있는 경우 zone->lruvec을 기억하고 out:으로 이동한다.
if (!memcg) memcg = root_mem_cgroup;
- 페이지에 지정된 mem_cgroup이 없는 경우 root_mem_cgroup을 대입한다.
mz = mem_cgroup_page_zoneinfo(memcg, page);
- 페이지 정보로 mem_cgroup_per_zone 구조체 정보를 알아온다.
lruvec = &mz->lruvec;
- 찾은 mem_cgroup_per_zone 구조체의 멤버 lruvec의 주소를 기억한다.
if (unlikely(lruvec->zone != zone)) lruvec->zone = zone;
- lruvec->zone이 인수로 받은 zone과 다른 경우 lruvec->zone을 인수로 받은 zone으로 갱신한다.

mem_cgroup_disabled()

include/linux/memcontrol.h

/* Report whether memory_cgrp_subsys is currently flagged as disabled. */
static inline bool mem_cgroup_disabled(void)
{
        return memory_cgrp_subsys.disabled ? true : false;
}

메모리 Control Group 서브시스템의 disable 여부를 리턴한다.

mem_cgroup_page_zoneinfo()

mm/memcontrol.c

/*
 * Find the per-zone memcg data that @page maps to: @memcg's nodeinfo[]
 * is indexed by the page's node id, and that node's zoneinfo[] by the
 * page's zone number.
 */
static struct mem_cgroup_per_zone *
mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
        const int node = page_to_nid(page);
        const int zone_idx = page_zonenum(page);
        struct mem_cgroup_per_zone *mz;

        mz = &memcg->nodeinfo[node]->zoneinfo[zone_idx];
        return mz;
}

페이지 정보로 mem_cgroup_per_zone 정보를 알아온다.

int nid = page_to_nid(page);
- Sparse 메모리 시스템이 아닌 경우 page->flags 정보에 있는 노드 id
- Sparse 메모리 시스템인 경우 페이지에 해당하는 섹션에 담긴 노드 id
int zid = page_zonenum(page);
- 페이지에 담긴 zone 타입
return &memcg->nodeinfo[nid]->zoneinfo[zid];
- memcg 구조체의 마지막 멤버인 nodeinfo[] 포인터 배열에서 지정된 노드의 mem_cgroup_per_node를 찾고, 그 안의 zoneinfo[] 배열에서 지정된 zone에 해당하는 mem_cgroup_per_zone 구조체 주소를 알아온다.

아래 그림은 2 개의 노드와 3개의 zone으로 구성된 memory control group 서브 시스템을 보여준다.

구조체

mem_cgroup 구조체

mm/memcontrol.c

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. May be even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark, this is
 * a feature that will be implemented much later in the future.
 *
 * Layout note: the structure ends in the zero-length nodeinfo[] array of
 * per-node pointers, so the per-node slots live directly behind the fixed
 * part of the structure and nodeinfo must stay the last member.
 */
struct mem_cgroup {
        /* cgroup subsystem state embedded as the first member */
        struct cgroup_subsys_state css;

        /* Accounted resources */
        struct page_counter memory;
        /* presumably memory+swap (cf. memsw_thresholds below) - confirm */
        struct page_counter memsw;
        /* presumably kernel memory - confirm */
        struct page_counter kmem;

        /* Normal memory consumption range */
        unsigned long low;
        unsigned long high;

        unsigned long soft_limit;

        /* vmpressure notifications */
        struct vmpressure vmpressure;

        /* css_online() has been completed */
        int initialized;

        /*
         * Should the accounting and control be hierarchical, per subtree?
         */
        bool use_hierarchy;

        /* OOM handling state */
        bool            oom_lock;
        atomic_t        under_oom;
        atomic_t        oom_wakeups;

        int     swappiness;
        /* OOM-Killer disable */
        int             oom_kill_disable;

        /* protect arrays of thresholds */
        struct mutex thresholds_lock;

        /* thresholds for memory usage. RCU-protected */
        struct mem_cgroup_thresholds thresholds;

        /* thresholds for mem+swap usage. RCU-protected */
        struct mem_cgroup_thresholds memsw_thresholds;

        /* For oom notifier event fd */
        struct list_head oom_notify;

        /*
         * Should we move charges of a task when a task is moved into this
         * mem_cgroup ? And what type of charges should we move ?
         */
        unsigned long move_charge_at_immigrate;
        /*
         * set > 0 if pages under this cgroup are moving to other cgroup.
         */
        atomic_t                moving_account;
        /* taken only while moving_account > 0 */
        spinlock_t              move_lock;
        struct task_struct      *move_lock_task;
        unsigned long           move_lock_flags;
        /*
         * percpu counter.
         */
        struct mem_cgroup_stat_cpu __percpu *stat;
        /*
         * used when a cpu is offlined or other synchronizations
         * See mem_cgroup_read_stat().
         */
        struct mem_cgroup_stat_cpu nocpu_base;
        spinlock_t pcp_counter_lock;

        /* only built with both CONFIG_MEMCG_KMEM and CONFIG_INET */
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
        struct cg_proto tcp_mem;
#endif
        /* only built with CONFIG_MEMCG_KMEM */
#if defined(CONFIG_MEMCG_KMEM)
        /* Index in the kmem_cache->memcg_params.memcg_caches array */
        int kmemcg_id;
        bool kmem_acct_activated;
        bool kmem_acct_active;
#endif

        int last_scanned_node;
        /* the following fields exist only when MAX_NUMNODES > 1 */
#if MAX_NUMNODES > 1
        nodemask_t      scan_nodes;
        atomic_t        numainfo_events;
        atomic_t        numainfo_updating;
#endif

        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;

        /*
         * Per-node info pointers, indexed by node id (see
         * mem_cgroup_page_zoneinfo()). Declared with zero length; the
         * actual number of slots is not visible in this definition.
         */
        struct mem_cgroup_per_node *nodeinfo[0];
        /* WARNING: nodeinfo must be the last member here */
};

mem_cgroup_per_zone 구조체

mm/memcontrol.c

/*
 * per-zone information in memory controller.
 *
 * One instance is reached per (memcg, node, zone) through
 * memcg->nodeinfo[nid]->zoneinfo[zid].
 */
struct mem_cgroup_per_zone {
        /* lruvec handed out by mem_cgroup_page_lruvec() when memcg is enabled */
        struct lruvec           lruvec;
        /* one counter per LRU list; presumably the size of each list - confirm */
        unsigned long           lru_size[NR_LRU_LISTS];

        /* DEF_PRIORITY + 1 iterators; presumably one per reclaim priority - confirm */
        struct reclaim_iter     iter[DEF_PRIORITY + 1];

        struct rb_node          tree_node;      /* RB tree node */
        unsigned long           usage_in_excess;/* Set to the value by which */
                                                /* the soft limit is exceeded*/
        /* presumably whether tree_node is currently linked into the RB tree - confirm */
        bool                    on_tree;
        struct mem_cgroup       *memcg;         /* Back pointer, we cannot */
                                                /* use container_of        */
};

/*
 * The "priority" of VM scanning is how much of the queues we will scan in one
 * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th
 * (1 / 2^12) of the queues ("queue_length >> 12") during an aging round.
 *
 * struct mem_cgroup_per_zone sizes its iter[] array as DEF_PRIORITY + 1.
 */
#define DEF_PRIORITY 12

reclaim_iter 구조체

mm/memcontrol.c

/*
 * Iterator state kept in mem_cgroup_per_zone.iter[], which holds
 * DEF_PRIORITY + 1 of these.
 */
struct reclaim_iter {
        /* presumably the memcg the iteration last stopped at - confirm */
        struct mem_cgroup *position;
        /* scan generation, increased every round-trip */
        unsigned int generation;
};

mem_cgroup_page_lruvec()

mem_cgroup_disabled()

mem_cgroup_page_zoneinfo()

구조체

mem_cgroup 구조체

mem_cgroup_per_zone 구조체

reclaim_iter 구조체

댓글 남기기 댓글 취소