문c 블로그

커널 코드 영역 확인(func_ptr_is_kernel_text())

2016-04-202016-04-20 문영일 Leave a comment

func_ptr_is_kernel_text()

kernel/extable.c

/*
 * On some architectures (PPC64, IA64) function pointers
 * are actually only tokens to some data that then holds the
 * real function address. As a result, to find if a function
 * pointer is part of the kernel text, we need to do some
 * special dereferencing first.
 *
 * Returns 1 when the dereferenced address passes core_kernel_text();
 * otherwise returns the result of is_module_text_address().
 */
int func_ptr_is_kernel_text(void *ptr)
{
        unsigned long addr;
        /* Resolve a possible function descriptor to the real address. */
        addr = (unsigned long) dereference_function_descriptor(ptr);
        if (core_kernel_text(addr))
                return 1;
        /* Not core kernel text: the module text check decides. */
        return is_module_text_address(addr);
}

ptr 주소가 커널 코드 또는 모듈 코드 영역인지 여부를 알아온다.

__kernel_text_address()

kernel/extable.c

/*
 * __kernel_text_address - test whether an address belongs to kernel code
 * @addr: address to test
 *
 * Returns 1 if @addr passes any of core_kernel_text(),
 * is_module_text_address(), is_ftrace_trampoline() or
 * init_kernel_text() (checked in that order); returns 0 otherwise.
 */
int __kernel_text_address(unsigned long addr)
{
        if (core_kernel_text(addr))
                return 1;
        if (is_module_text_address(addr))
                return 1;
        if (is_ftrace_trampoline(addr))
                return 1;
        /*
         * There might be init symbols in saved stacktraces.
         * Give those symbols a chance to be printed in
         * backtraces (such as lockdep traces).
         *
         * Since we are after the module-symbols check, there's
         * no danger of address overlap:
         */
        if (init_kernel_text(addr))
                return 1;
        return 0;
}

addr 주소가 core 커널 코드, 모듈 코드, ftrace 코드, init 커널 코드 영역인 경우 1을 리턴하고 그렇지 않으면 0을 리턴한다.

core_kernel_text()

kernel/extable.c

/*
 * core_kernel_text - test whether an address is core kernel text
 * @addr: address to test
 *
 * Returns 1 if @addr is in [_stext, _etext), or if system_state is
 * still SYSTEM_BOOTING and @addr passes init_kernel_text();
 * returns 0 otherwise.
 */
int core_kernel_text(unsigned long addr)
{
        if (addr >= (unsigned long)_stext &&
            addr < (unsigned long)_etext)
                return 1;

        /* Init text only counts while the system is still booting. */
        if (system_state == SYSTEM_BOOTING &&
            init_kernel_text(addr))
                return 1;
        return 0;
}

addr 주소가 _stext ~ _etext 범위에 있거나 부팅 중 init kernel 코드 영역인 경우 1을 리턴하고 그렇지 않으면 0을 리턴한다.

core_kernel_data()

kernel/extable.c

/**
 * core_kernel_data - tell if addr points to kernel data
 * @addr: address to test
 *
 * Returns true if @addr passed in is from the core kernel data
 * section.
 *
 * Note: On some archs it may return true for core RODATA, and false
 *  for others. But will always be true for core RW data.
 */
int core_kernel_data(unsigned long addr)
{
        /* Half-open range check: _sdata is included, _edata is not. */
        if (addr >= (unsigned long)_sdata &&
            addr < (unsigned long)_edata)
                return 1;
        return 0;
}

addr 주소가 _sdata ~ _edata 영역에 포함되어 있는지 여부를 리턴한다.

init_kernel_text()

kernel/extable.c

/*
 * init_kernel_text - test whether an address is init kernel text
 * @addr: address to test
 *
 * Returns 1 if @addr is in [_sinittext, _einittext), 0 otherwise.
 */
static inline int init_kernel_text(unsigned long addr)
{
        if (addr >= (unsigned long)_sinittext &&
            addr < (unsigned long)_einittext)
                return 1;
        return 0;
}

addr 주소가 _sinittext ~ _einittext 영역에 포함되어 있는지 여부를 리턴한다.

is_module_text_address()

kernel/module.c

/*
 * is_module_text_address - is this address inside module code?
 * @addr: the address to check.
 *
 * See is_module_address() if you simply want to see if the address is
 * anywhere in a module.  See kernel_text_address() for testing if an
 * address corresponds to kernel or module code.
 *
 * Returns true when __module_text_address() finds a module for @addr.
 */
bool is_module_text_address(unsigned long addr)
{
        bool ret;
                                      
        /*
         * __module_text_address() must be called with preemption disabled
         * (or the module mutex held) so the module is not freed meanwhile.
         */
        preempt_disable();
        ret = __module_text_address(addr) != NULL;
        preempt_enable();

        return ret;
}

addr 주소가 모듈 코드 영역에 포함되어 있는지 여부를 알아온다.

__module_text_address()

kernel/module.c

/*
 * __module_text_address - get the module whose code contains an address.
 * @addr: the address.
 *
 * Must be called with preempt disabled or module mutex held so that
 * module doesn't get freed during this.
 *
 * Returns the module found by __module_address() when @addr also lies
 * within that module's init text or core text range; NULL otherwise.
 */
struct module *__module_text_address(unsigned long addr)
{
        struct module *mod = __module_address(addr);
        if (mod) {
                /* Make sure it's within the text section. */
                if (!within(addr, mod->module_init, mod->init_text_size)
                    && !within(addr, mod->module_core, mod->core_text_size))
                        mod = NULL;
        }
        return mod;
}
EXPORT_SYMBOL_GPL(__module_text_address);

addr 주소가 모듈 초기화 코드 및 모듈 코어 코드에 포함되어 있는지 여부를 알아온다.

__module_address()

kernel/module.c

/*
 * __module_address - get the module which contains an address.
 * @addr: the address.
 *
 * Must be called with preempt disabled or module mutex held so that
 * module doesn't get freed during this.
 *
 * Returns the first module on the list for which within_module() is
 * true, or NULL when no such module exists.
 */
struct module *__module_address(unsigned long addr)
{
        struct module *mod;

        /* Fast reject: outside the module_addr_min..module_addr_max bounds. */
        if (addr < module_addr_min || addr > module_addr_max)
                return NULL;

        list_for_each_entry_rcu(mod, &modules, list) {
                /* Modules still in MODULE_STATE_UNFORMED are skipped. */
                if (mod->state == MODULE_STATE_UNFORMED)
                        continue;
                if (within_module(addr, mod))
                        return mod;
        }
        return NULL;
}
EXPORT_SYMBOL_GPL(__module_address);

addr 주소가 module_addr_min ~ module_addr_max 범위에 포함되어 있고 로드된 모듈(MODULE_STATE_UNFORMED 상태 제외)의 init 또는 core 영역에 있는 경우 해당 module 포인터를 리턴하고 그렇지 않으면 NULL을 리턴한다.

within_module()

include/linux/module.h

/*
 * within_module - test whether an address falls inside a module
 * @addr: address to test
 * @mod: module to test against
 *
 * True when either within_module_init() or within_module_core()
 * is true for @addr and @mod.
 */
static inline bool within_module(unsigned long addr, const struct module *mod)
{
        return within_module_init(addr, mod) || within_module_core(addr, mod);
}

addr 주소가 지정된 모듈의 초기화(init) 영역 또는 코어(core) 영역에 포함되어 있는지 여부를 리턴한다.

within_module_init()

include/linux/module.h

/*
 * within_module_init - test whether an address is in a module's init region
 * @addr: address to test
 * @mod: module to test against
 *
 * True when @addr is in
 * [mod->module_init, mod->module_init + mod->init_size).
 */
static inline bool within_module_init(unsigned long addr,
                                      const struct module *mod)
{                       
        return (unsigned long)mod->module_init <= addr &&
               addr < (unsigned long)mod->module_init + mod->init_size;
}

addr 주소가 지정된 모듈의 초기화 영역(module_init ~ module_init + init_size)에 포함되어 있는지 여부를 리턴한다.

is_ftrace_trampoline()

kernel/trace/ftrace.c

/*
 * This is used by __kernel_text_address() to return true if the
 * address is on a dynamically allocated trampoline that would
 * not return true for either core_kernel_text() or
 * is_module_text_address().
 *
 * Returns true when @addr lies in
 * [op->trampoline, op->trampoline + op->trampoline_size) for some op
 * reached while walking ftrace_ops_list; false otherwise.
 */
bool is_ftrace_trampoline(unsigned long addr)
{
        struct ftrace_ops *op;
        bool ret = false;

        /*
         * Some of the ops may be dynamically allocated,
         * they are freed after a synchronize_sched().
         */
        preempt_disable_notrace();

        do_for_each_ftrace_op(op, ftrace_ops_list) {
                /*
                 * This is to check for dynamically allocated trampolines.
                 * Trampolines that are in kernel text will have
                 * core_kernel_text() return true.
                 */
                if (op->trampoline && op->trampoline_size)
                        if (addr >= op->trampoline &&
                            addr < op->trampoline + op->trampoline_size) {
                                ret = true;
                                goto out; 
                        }       
        } while_for_each_ftrace_op(op);
        
 out:   
        /* Single exit path so preemption is always re-enabled. */
        preempt_enable_notrace(); 

        return ret;
}

ftrace_ops_list에 등록된 각 op를 순회하며 addr 주소가 해당 op의 트램폴린 영역(op->trampoline ~ op->trampoline + op->trampoline_size)에 포함되어 있는지 여부를 알아내어 리턴한다.

do_for_each_ftrace_op()

kernel/trace/ftrace.c

/*
 * Traverse the ftrace_global_list, invoking all entries.  The reason that we
 * can use rcu_dereference_raw_notrace() is that elements removed from this list
 * are simply leaked, so there is no need to interact with a grace-period
 * mechanism.  The rcu_dereference_raw_notrace() calls are needed to handle
 * concurrent insertions into the ftrace_global_list.
 *
 * Silly Alpha and silly pointer-speculation compiler optimizations!
 *
 * The macro loads the list head into @op and opens a "do" statement; the
 * loop body that follows must be closed with while_for_each_ftrace_op().
 */
#define do_for_each_ftrace_op(op, list)                 \
        op = rcu_dereference_raw_notrace(list);                 \
        do

while_for_each_ftrace_op()

kernel/trace/ftrace.c

/*
 * Optimized for just a single item in the list (as that is the normal case).
 *
 * Closes the "do" opened by do_for_each_ftrace_op(): advances @op to
 * op->next and keeps looping while the new @op is non-NULL and is not
 * &ftrace_list_end.
 */
#define while_for_each_ftrace_op(op)                            \
        while (likely(op = rcu_dereference_raw_notrace((op)->next)) &&  \
               unlikely((op) != &ftrace_list_end))

hotcpu_notifier()

2016-04-202016-04-20 문영일 Leave a comment

hotcpu_notifier()를 사용하여 등록하는 callback 함수들

page_alloc_cpu_notify,
percpu_counter_hotcpu_callback
radix_tree_callback
blk_mq_queue_reinit_notify
pfault_cpu_notify
smp_cpu_notify
vgetcpu_cpu_notifier
apbt_cpuhp_notify
hpet_cpuhp_notify
uv_scir_cpu_notify
vfp_hotplug
loongson3_cpu_callback
octeon_cpu_callback
buffer_cpu_notify
cpu_callback
memcg_cpu_hotplug_callback
cpuset_cpu_inactive
cpuset_cpu_active
sched_domains_numa_masks_update
hotplug_hrtick
workqueue_cpu_down_callback
console_cpu_notify
profile_cpu_callback
topology_cpu_callback
cacheinfo_cpu_callback
dev_cpu_callback

hotcpu notifier 등록

hotcpu_notifier()

include/linux/cpu.h

/* hotcpu_notifier() is a plain alias for cpu_notifier(fn, pri). */
#define hotcpu_notifier(fn, pri)        cpu_notifier(fn, pri)

cpu_notifier()

include/linux/cpu.h

/* Need to know about CPUs going up/down? */
/*
 * cpu_notifier(fn, pri): with CONFIG_HOTPLUG_CPU set, or when not built
 * as a module, defines a static notifier_block named fn##_nb (callback
 * @fn, priority @pri) and registers it with register_cpu_notifier().
 * Otherwise it only evaluates (void)(fn) and registers nothing.
 */
#if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE)
#define cpu_notifier(fn, pri) {                                 \
        static struct notifier_block fn##_nb =                  \
                { .notifier_call = fn, .priority = pri };       \
        register_cpu_notifier(&fn##_nb);                        \
}
#else /* #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */
#define cpu_notifier(fn, pri)   do { (void)(fn); } while (0)
#endif /* #else #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE) */

신규 notifier_block 구조체 객체를 만들고 만들어진 객체를 cpu_chain에 추가하되 priority가 가장 높은 값이 선두에 위치한다.

예) hotcpu_notifier(page_alloc_cpu_notify, 0)
- static struct notifier_block page_alloc_cpu_notify_nb = { .notifier_call = page_alloc_cpu_notify, .priority = 0 };
- register_cpu_notifier(&page_alloc_cpu_notify_nb);

register_cpu_notifier()

kernel/cpu.c

/* Need to know about CPUs going up/down? */
/*
 * register_cpu_notifier - add @nb to the cpu_chain raw notifier chain
 *
 * The registration is bracketed by cpu_maps_update_begin() and
 * cpu_maps_update_done(), because raw notifier chains provide no
 * locking of their own. Returns the value of
 * raw_notifier_chain_register().
 */
int __ref register_cpu_notifier(struct notifier_block *nb)
{
        int ret;
        cpu_maps_update_begin();
        ret = raw_notifier_chain_register(&cpu_chain, nb);
        cpu_maps_update_done();
        return ret;
}

mutex lock으로 보호한 후 cpu chain에 신규 nb를 추가한다.

raw_notifier_chain_register()

kernel/notifier.c

/*
 *      Raw notifier chain routines.  There is no protection;
 *      the caller must provide it.  Use at your own risk!
 */

/**
 *      raw_notifier_chain_register - Add notifier to a raw notifier chain
 *      @nh: Pointer to head of the raw notifier chain
 *      @n: New entry in notifier chain
 *
 *      Adds a notifier to a raw notifier chain.
 *      All locking must be provided by the caller.
 *
 *      Thin wrapper: forwards &nh->head and @n to
 *      notifier_chain_register().
 *
 *      Currently always returns zero.
 */
int raw_notifier_chain_register(struct raw_notifier_head *nh,
                struct notifier_block *n)
{
        return notifier_chain_register(&nh->head, n);
}
EXPORT_SYMBOL_GPL(raw_notifier_chain_register);

아래 그림과 같이 raw_notifier_head nh에 신규 notifier_block n을 추가하되 priority가 가장 높은 값이 선두에 위치한다. 동일 priority의 경우 나중에 추가한 블럭은 뒤로 추가된다.

notifier_chain_register()

kernel/notifier.c

/*
 *      Notifier chain core routines.  The exported routines below
 *      are layered on top of these, with appropriate locking added.
 */

/*
 * notifier_chain_register - insert @n into the list headed by *@nl
 *
 * The list is kept in descending priority order. @n is inserted in
 * front of the first entry whose priority is strictly lower, so an
 * entry with equal priority stays ahead of the newly added one.
 * Always returns 0.
 */
static int notifier_chain_register(struct notifier_block **nl,
                struct notifier_block *n)
{
        /* nl walks the chain as a pointer to the current link. */
        while ((*nl) != NULL) {
                if (n->priority > (*nl)->priority)
                        break;
                nl = &((*nl)->next);
        }
        /* Set n->next before publishing n through the link. */
        n->next = *nl;
        rcu_assign_pointer(*nl, n);
        return 0;
}

아래 그림과 같이 신규 n의 우선순위가 비교 블럭 nl의 우선순위보다 높은 경우 신규 n을 비교 블럭 nl 앞에 끼워넣는다.

호출(Notify)

호출 action

/*
 * CPU notifier action codes. NOTE(review): these appear to be the values
 * passed as "val" to cpu_notify()/__cpu_notify() -- confirm against the
 * callers, which are not part of this excerpt.
 */
#define CPU_ONLINE              0x0002 /* CPU (unsigned)v is up */
#define CPU_UP_PREPARE          0x0003 /* CPU (unsigned)v coming up */
#define CPU_UP_CANCELED         0x0004 /* CPU (unsigned)v NOT coming up */
#define CPU_DOWN_PREPARE        0x0005 /* CPU (unsigned)v going down */
#define CPU_DOWN_FAILED         0x0006 /* CPU (unsigned)v NOT going down */
#define CPU_DEAD                0x0007 /* CPU (unsigned)v dead */
#define CPU_DYING               0x0008 /* CPU (unsigned)v not running any task,
                                        * not handling interrupts, soon dead.
                                        * Called on the dying cpu, interrupts
                                        * are already disabled. Must not
                                        * sleep, must not fail */
#define CPU_POST_DEAD           0x0009 /* CPU (unsigned)v dead, cpu_hotplug
                                        * lock is dropped */
#define CPU_STARTING            0x000A /* CPU (unsigned)v soon running.
                                        * Called on the new cpu, just before
                                        * enabling interrupts. Must not sleep,
                                        * must not fail */

cpu_notify()

kernel/cpu.c

/*
 * cpu_notify - run the cpu_chain callbacks with action @val and data @v
 *
 * Calls __cpu_notify() with nr_to_call = -1 and nr_calls = NULL, i.e.
 * no call limit is requested and no call count is reported back.
 */
static int cpu_notify(unsigned long val, void *v)
{
        return __cpu_notify(val, v, -1, NULL);
}

cpu action val과 data v로 cpu_chain에 등록되어 있는 모든 콜백함수를 호출한다.

__cpu_notify()

kernel/cpu.c

/*
 * __cpu_notify - run up to @nr_to_call callbacks registered on cpu_chain
 * @val: action value handed to each callback
 * @v: data pointer handed to each callback
 * @nr_to_call: forwarded to __raw_notifier_call_chain()
 * @nr_calls: forwarded to __raw_notifier_call_chain(); may be NULL
 *
 * Returns the chain result converted with notifier_to_errno().
 */
static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
                        int *nr_calls)
{
        int ret;

        ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
                                        nr_calls);

        return notifier_to_errno(ret);
}

cpu_chain에 등록된 콜백함수를 nr_to_call 수만큼 순서대로 호출하되 인수로 cpu action val과 데이터 v를 사용한다. 출력 인수 nr_calls에 호출된 수를 저장하고 에러 여부를 리턴한다.

__raw_notifier_call_chain()

kernel/notifier.c

/**
 *      __raw_notifier_call_chain - Call functions in a raw notifier chain
 *      @nh: Pointer to head of the raw notifier chain
 *      @val: Value passed unmodified to notifier function
 *      @v: Pointer passed unmodified to notifier function
 *      @nr_to_call: See comment for notifier_call_chain.
 *      @nr_calls: See comment for notifier_call_chain
 *
 *      Calls each function in a notifier chain in turn.  The functions
 *      run in an undefined context.
 *      All locking must be provided by the caller.
 *
 *      If the return value of the notifier can be and'ed
 *      with %NOTIFY_STOP_MASK then raw_notifier_call_chain()
 *      will return immediately, with the return value of
 *      the notifier function which halted execution.
 *      Otherwise the return value is the return value
 *      of the last notifier function called.
 *
 *      Thin wrapper: forwards &nh->head and the remaining arguments
 *      unchanged to notifier_call_chain().
 */
int __raw_notifier_call_chain(struct raw_notifier_head *nh,
                              unsigned long val, void *v,
                              int nr_to_call, int *nr_calls)
{               
        return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls);
}
EXPORT_SYMBOL_GPL(__raw_notifier_call_chain);

nh가 가리키는 첫번째 콜백함수 부터 nr_to_call 수만큼 순서대로 호출하되 인수로 cpu action val과 데이터 v를 사용한다. 출력 인수 nr_calls에 호출된 수를 저장하고 에러 여부를 리턴한다.

notifier_call_chain()

kernel/notifier.c

/**
 * notifier_call_chain - Informs the registered notifiers about an event.
 *      @nl:            Pointer to head of the blocking notifier chain
 *      @val:           Value passed unmodified to notifier function
 *      @v:             Pointer passed unmodified to notifier function
 *      @nr_to_call:    Number of notifier functions to be called. Don't care
 *                      value of this parameter is -1.
 *      @nr_calls:      Records the number of notifications sent. Don't care
 *                      value of this field is NULL.
 *      @returns:       notifier_call_chain returns the value returned by the
 *                      last notifier function called.
 *
 * If no callback is invoked at all, the initial value NOTIFY_DONE is
 * returned.
 */
static int notifier_call_chain(struct notifier_block **nl,
                               unsigned long val, void *v,
                               int nr_to_call, int *nr_calls)
{
        int ret = NOTIFY_DONE;
        struct notifier_block *nb, *next_nb;

        nb = rcu_dereference_raw(*nl);

        /* Loop ends at the end of the list or when nr_to_call reaches 0. */
        while (nb && nr_to_call) {
                /* The next pointer is read before the callback is invoked. */
                next_nb = rcu_dereference_raw(nb->next);

#ifdef CONFIG_DEBUG_NOTIFIERS
                /*
                 * A callback outside kernel/module text is warned about and
                 * skipped; on this path neither nr_to_call nor *nr_calls
                 * is updated.
                 */
                if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
                        WARN(1, "Invalid notifier called!");
                        nb = next_nb;
                        continue;
                }
#endif
                ret = nb->notifier_call(nb, val, v);

                if (nr_calls)
                        (*nr_calls)++;

                /* A callback whose result has all NOTIFY_STOP_MASK bits set ends the walk. */
                if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
                        break;
                nb = next_nb;
                nr_to_call--;
        }
        return ret;
}
NOKPROBE_SYMBOL(notifier_call_chain);

첫번째 콜백함수 nl 부터 nr_to_call 수만큼 순서대로 호출하되 인수로 cpu action val과 데이터 v를 사용한다. 출력 인수 nr_calls에 호출된 수를 저장하고 에러 발생 시 NOTIFY_STOP_MASK 비트를 포함하는 경우 루프를 탈출하고 에러 값을 리턴한다.

참고: 커널 코드 영역 확인(func_ptr_is_kernel_text()) | 문c

구조체

include/linux/notifier.h

/*      
 * Notifier chains are of four types:
 *
 *      Atomic notifier chains: Chain callbacks run in interrupt/atomic
 *              context. Callouts are not allowed to block.
 *      Blocking notifier chains: Chain callbacks run in process context.
 *              Callouts are allowed to block.
 *      Raw notifier chains: There are no restrictions on callbacks,
 *              registration, or unregistration.  All locking and protection
 *              must be provided by the caller.
 *      SRCU notifier chains: A variant of blocking notifier chains, with
 *              the same restrictions.
 *      
 * atomic_notifier_chain_register() may be called from an atomic context,
 * but blocking_notifier_chain_register() and srcu_notifier_chain_register()
 * must be called from a process context.  Ditto for the corresponding
 * _unregister() routines.
 *
 * atomic_notifier_chain_unregister(), blocking_notifier_chain_unregister(),
 * and srcu_notifier_chain_unregister() _must not_ be called from within
 * the call chain.
 *
 * SRCU notifier chains are an alternative form of blocking notifier chains.
 * They use SRCU (Sleepable Read-Copy Update) instead of rw-semaphores for
 * protection of the chain links.  This means there is _very_ low overhead
 * in srcu_notifier_call_chain(): no cache bounces and no memory barriers.
 * As compensation, srcu_notifier_chain_unregister() is rather expensive.
 * SRCU notifier chains should be used when the chain will be called very
 * often but notifier_blocks will seldom be removed.  Also, SRCU notifier
 * chains are slightly more difficult to use because they require special
 * runtime initialization.
 */

notifier_fn_t 타입

/*
 * Callback type stored in notifier_block.notifier_call: receives the
 * notifier_block it was registered with, an action value and an opaque
 * data pointer, and returns an int result.
 */
typedef int (*notifier_fn_t)(struct notifier_block *nb,
                        unsigned long action, void *data);

notifier_block 구조체

/* One entry of a notifier chain (singly linked through ->next). */
struct notifier_block {
        notifier_fn_t notifier_call;            /* callback to invoke */
        struct notifier_block __rcu *next;      /* next entry in the chain */
        int priority;                           /* higher values sit earlier in the chain */
};

atomic_notifier_head 구조체

/* Head of an atomic notifier chain: a spinlock plus the first entry. */
struct atomic_notifier_head {
        spinlock_t lock;
        struct notifier_block __rcu *head;      /* first entry, NULL when empty */
};

blocking_notifier_head 구조체

/* Head of a blocking notifier chain: an rw-semaphore plus the first entry. */
struct blocking_notifier_head {
        struct rw_semaphore rwsem;
        struct notifier_block __rcu *head;      /* first entry, NULL when empty */
};

raw_notifier_head 구조체

/* Head of a raw notifier chain: no lock member, callers provide locking. */
struct raw_notifier_head {
        struct notifier_block __rcu *head;      /* first entry, NULL when empty */
};

srcu_notifier_head 구조체

/* Head of an SRCU notifier chain: a mutex, SRCU state and the first entry. */
struct srcu_notifier_head {
        struct mutex mutex;
        struct srcu_struct srcu;
        struct notifier_block __rcu *head;      /* first entry of the chain */
};

전역 cpu_chain 구조체

/* Raw notifier chain that register_cpu_notifier() adds to and __cpu_notify() walks. */
static RAW_NOTIFIER_HEAD(cpu_chain);

static struct raw_notifier_head cpu_chain = { .head = NULL };

/* srcu_notifier_heads cannot be initialized statically */

/*
 * Each *_NOTIFIER_HEAD(name) macro defines a chain head variable called
 * @name and initializes it with the matching *_NOTIFIER_INIT(name).
 */
#define ATOMIC_NOTIFIER_HEAD(name)                              \
        struct atomic_notifier_head name =                      \
                ATOMIC_NOTIFIER_INIT(name)
#define BLOCKING_NOTIFIER_HEAD(name)                            \
        struct blocking_notifier_head name =                    \
                BLOCKING_NOTIFIER_INIT(name)
#define RAW_NOTIFIER_HEAD(name)                                 \
        struct raw_notifier_head name =                         \
                RAW_NOTIFIER_INIT(name)

/* srcu_notifier_heads must be initialized and cleaned up dynamically */
extern void srcu_init_notifier_head(struct srcu_notifier_head *nh);
#define srcu_cleanup_notifier_head(name)        \
                cleanup_srcu_struct(&(name)->srcu);

/*
 * Static initializers: every variant starts with an empty chain
 * (.head = NULL); the atomic and blocking variants also initialize
 * their lock member.
 */
#define ATOMIC_NOTIFIER_INIT(name) {                            \
                .lock = __SPIN_LOCK_UNLOCKED(name.lock),        \
                .head = NULL }
#define BLOCKING_NOTIFIER_INIT(name) {                          \
                .rwsem = __RWSEM_INITIALIZER((name).rwsem),     \
                .head = NULL }
#define RAW_NOTIFIER_INIT(name) {                               \
                .head = NULL }

build_all_zonelists()

2016-04-192019-08-24 문영일 Leave a comment

이 함수는 부팅 시 호출이 되는 경우 전체 노드에 대해 zonelist를 구성하고 현재 cpu에 대해 모든 노드의 메모리를 access 할 수 있도록 설정한다. 또한 운영 중에 핸들러에 의해 호출(hotplug memory)되는 경우 전체 cpu를 멈추고 zone에 대한 boot pageset 테이블 구성 및 전체 노드에 대해 zonelist를 다시 구성한다. 구성 후 free 페이지가 적은 경우 전역 변수 page_group_by_mobility_disabled를 1로 하여 mobility 기능을 잠시 disable하고 나중에 enable한다.

zone 구성

zone은 하나 이상 구성할 수 있다. 32bit 시스템에서는 최대 4개까지 조합이 가능하다.
- ZONE_DMA
- ZONE_DMA32
- ZONE_NORMAL
- ZONE_HIGHMEM
- ZONE_MOVABLE
- ZONE_DEVICE

zonelists

fallback
- 각 노드에 존들이 구성되어 있고 특정 노드에서 메모리를 할당해야 할 때 할당할 영역(노드 + 존)을 선택하는데, 그 영역에서 메모리 부족(out of memory)으로 인해 할당이 실패하는 경우 대체(fallback) 영역에서 할당을 재시도해야 한다. 그때마다 영역을 찾지 않고 일정한 규칙에 의해 할당 우선순위 리스트를 만들어두어 사용하는데, 이것을 zonelist라고 한다.
zonelist order
- 우선순위를 두어 단계적인 fallback용 zonelist를 구성한다. 시스템 설계에 따라 노드 우선(node order) 또는 존 우선(zone order)을 결정하여 순서를 정할 수 있다. 디폴트로 32비트 시스템은 존 우선으로 되어 있고, 64비트 시스템은 노드 우선으로 구성한다. 단 커널 버전 4.14-rc1부터 존 우선(zone order)으로 zonelist를 생성하는 루틴을 삭제하고 노드 우선(node order)만 사용한다.
  - 참고: mm, page_alloc: rip out ZONELIST_ORDER_ZONE
- 리눅스 시스템은 zonelist order와 관련하여 다음과 같이 두 가지 중 하나를 선택하여 동작한다.
  - 노드 우선(node order)
    - 베스트 노드부터 역순으로 정렬한 존 타입으로 zonelist를 구성한다. 노드 우선으로 동작하더라도 ZONE_DMA 또는 ZONE_DMA32에 대해서는 아키텍처에 따라 할당을 제한하기도 한다.
  - 존 우선(zone order)
    - 역순으로 정렬한 존 타입부터 베스트 노드 순서대로 zonelist를 구성한다. ZONE_NORMAL 영역에 대해 충분히 free 영역을 확보할 필요가 있기 때문에, 보통 메모리가 적은 32비트 시스템에서는 존 우선 방식을 디폴트로 사용하여 ZONE_HIGHMEM(ZONE_MOVABLE)부터 우선 사용할 수 있게 한다. 32비트 시스템에서는 보통 메모리가 충분하지 않으므로 ZONE_DMA 및 ZONE_NORMAL 메모리를 보호하기 위해 존 우선을 사용한다.

node와 zone의 검색 방향
- 노드(순방향)
  - 노드 간 접근 속도가 다르므로 현재 노드부터 가장 빠른 노드(best)를 우선순위로 둔다.
- 존(역방향)
  - 존에 대해 높은 번호부터 아래 번호 순서로 할당 시도를 한다. DMA 영역은 가장 낮은 순위로 메모리 할당을 하여 보호받는다.
    - ZONE_DEVICE(최상위 우선 순위)
    - ZONE_MOVABLE
    - ZONE_HIGHMEM(64비트 시스템에서는 사용되지 않는다.)
    - ZONE_NORMAL
    - ZONE_DMA32
    - ZONE_DMA
NUMA 시스템에서는 2개의 zonelists를 사용한다.
- zonelists[0]은 전체 노드를 대상으로 만들어진다.
- zonelists[1]은 현재 노드만을 대상으로 만들어진다(NUMA only). __GFP_THISNODE 사용 시 활용한다.

MOVABLE 존

이 영역은 특별히 버디 시스템으로 구현된 페이지 할당자가 메모리 단편화를 막기 위해 전용으로 사용하는 영역이며, 동시에 메모리 핫플러그를 지원하기 위해 구성돼야 하는 영역이다. 버디 시스템에서 메모리 단편화를 막기 위해 사용하는 방법으로 가능하면 마이그레이션 타입별로 관리를 하는데, 좀 더 확실한 영역을 지정하여 더 효율적으로 사용할 수 있다.

두 가지 구성 방법을 알아보자.

크기로 설정
- 커널 파라미터로 ‘kernelcore=’ 또는 ‘movablecore=’를 사용해 크기를 지정하여 사용한다. kernelcore=를 사용하는 경우 전체 메모리에서 지정된 크기를 제외한 나머지 크기가 ZONE_MOVABLE로 설정된다. movablecore=를 사용하는 경우 지정된 크기가 ZONE_MOVABLE로 설정된다.
특정 노드 메모리를 지정
- CONFIG_MOVABLE_NODE 커널 옵션을 사용하면서 커널 파라미터로 movable_node를 사용하고 memory memblock regions의 flags에 MEMBLOCK_HOTPLUG 플래그 비트를 설정한 메모리 블록을 통째로 ZONE_MOVABLE로 지정할 수 있다. 메모리 핫플러그가 가능한 메모리 영역을 지정하여 사용한다.

리눅스에서 메모리 핫플러그 구현은 완료되었고, 이 기능을 응용하여 유연하게 메모리의 동적 구성이 가능하다. 실제 하드웨어로 메모리 핫플러그가 시험되었는지 여부는 미지수다. 구현 당시에는 하드웨어가 준비되지 않은 상태였다.

movable(__GFP_MOVABLE) 속성을 가진 페이지들만 ZONE_MOVABLE에 들어갈 수 있다. 버디 시스템으로 구현된 페이지 할당자에서 연속된 페이지 요청을 수행하기 힘든 경우 메모리 컴팩션이 일어나는데, 이때 movable 속성을 가진 페이지를 다른 주소로 마이그레이션(migration) 시킨다.

메모리 컴팩션은 버디 시스템에서 연속된 free 메모리가 부족하여 요청한 오더 페이지에 대한 할당이 불가능할 때 해당 오더의 앞쪽에 있는 사용된 페이지를 뒤쪽으로 옮겨서 앞부분에 필요한 공간을 만드는 메커니즘이라고 할 수 있다.

ZONE_MOVABLE은 highest 존의 메모리를 분할하여 사용한다. 또한 아키텍처와 메모리 크기에 따라 해당 시스템의 highest 존이 다르다.

ARM32 예) ZONE_HIGHMEM을 선택하고, 만일 없으면 ZONE_NORMAL을 선택함
ARM64 예) ZONE_NORMAL을 선택하고, 만일 없으면 ZONE_DMA32를 선택함
x86_32 예) ZONE_HIGHMEM을 선택하고, 만일 없으면 ZONE_NORMAL을 선택함
x86_64 예) ZONE_NORMAL을 선택하고, 만일 없으면 ZONE_DMA32를 선택함

zonelist 초기화

이 함수는 부팅 시에 호출되는 경우 전체 노드에 대해 zonelist를 구성하고 현재 cpu에 대해 모든 노드의 메모리를 액세스할 수 있도록 설정한다. 또한 운영 중에 핸들러에 의해 호출(핫플러그 메모리)되는 경우 전체 cpu를 멈추고 존에 대한 boot pageset 테이블 구성 및 전체 노드에 대해 zonelist를 다시 구성한다. 구성 후 free 페이지가 적은 경우 전역 변수 page_group_by_mobility_disabled를 1로 하여 mobility 기능을 잠시 disable하고 나중에 enable한다.

다음 그림은 zonelist를 구성할 때의 함수 호출 관계이다.

build_all_zonelists()

mm/page_alloc.c

/*
 * build_all_zonelists - build the zonelists and refresh mobility grouping
 * @pgdat: forwarded to __build_all_zonelists() when the system is past
 *         SYSTEM_BOOTING; not used on the boot path
 *
 * After the zonelists are built, vm_total_pages is recomputed from
 * nr_free_pagecache_pages() and page_group_by_mobility_disabled is set
 * to 1 when vm_total_pages < pageblock_nr_pages * MIGRATE_TYPES, else 0.
 * A summary line is then logged (plus the policy zone under CONFIG_NUMA).
 *
 * unless system_state == SYSTEM_BOOTING.
 *
 * __ref due to call of __init annotated helper build_all_zonelists_init
 * [protected by SYSTEM_BOOTING].
 */

void __ref build_all_zonelists(pg_data_t *pgdat)
{
        if (system_state == SYSTEM_BOOTING) {
                build_all_zonelists_init();
        } else {
                __build_all_zonelists(pgdat);
                /* cpuset refresh routine should be here */
        }
        vm_total_pages = nr_free_pagecache_pages();
        /*
         * Disable grouping by mobility if the number of pages in the
         * system is too low to allow the mechanism to work. It would be
         * more accurate, but expensive to check per-zone. This check is
         * made on memory-hotadd so a system can start with mobility
         * disabled and enable it later
         */
        if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
                page_group_by_mobility_disabled = 1;
        else
                page_group_by_mobility_disabled = 0;

        pr_info("Built %i zonelists, mobility grouping %s.  Total pages: %ld\n",
                nr_online_nodes,
                page_group_by_mobility_disabled ? "off" : "on",
                vm_total_pages);
#ifdef CONFIG_NUMA
        pr_info("Policy zone: %s\n", zone_names[policy_zone]);
#endif
}

각 노드의 활성화된 존에 fallback list인 zonelist를 생성한다. 첫 번째 인자로 주어지는 pgdat의 타입 pg_data_t는 노드 정보를 담고 있는 pglist_data 구조체를 타입 정의(typedef)하여 사용했다.

코드 라인 3~8에서 최초 부팅 시에 호출된 경우 곧바로 최초 zonelist를 구성한다. 최초 부팅 시가 아니면 zonelist를 재구성한다. 만일 핫플러그 메모리가 지원되는 경우 존에 대한 per-cpu 페이지 세트 프레임 캐시를 할당받고 구성한다.
코드 라인 9에서 zonelist에서 high 워터마크까지의 페이지 개수를 제외한 free 페이지를 재계산한다.
코드 라인 17~20에서 vm_total_pages가 migrate 타입 수(최대 6개) * 페이지 블록 크기보다 작으면 free 메모리가 매우 부족한 상태이므로 전역 page_group_by_mobility_disabled 변수를 true로 설정하여 페이지 블록에 migratetype 저장 시 항상 unmovable 타입을 사용하여 mobility 기능을 제한한다.
코드 라인 22~25에서 구성된 zonelist 수(온라인 노드 수)와 모빌리티 그루핑 동작 여부, 전체 페이지 수 정보를 출력한다.
- 예) “Built 1 zonelists, mobility grouping on.  Total pages: 999432”
코드 라인 27에서 NUMA 시스템인 경우 policy 존 정보를 출력한다.
- 예) “Policy zone: Normal”

policy_zone

vma_migratable() 함수를 통해 사용되는 movable을 제외한 최상위 존을 담은 변수로, policy 존 미만의 존을 사용하는 경우 migrate를 제한할 목적으로 사용된다.
zonelist를 만드는 과정에서 build_zonerefs_node() -> check_highest_zone() 함수를 통해 policy_zone이 설정된다.

system_states

include/linux/kernel.h

/*
 * Values used for system_state. Ordering of the states must not be changed
 * as code checks for <, <=, >, >= STATE.
 *
 * SYSTEM_BOOTING is the first enumerator; core_kernel_text() and
 * build_all_zonelists() compare system_state against it.
 */

extern enum system_states {
        SYSTEM_BOOTING,
        SYSTEM_SCHEDULING,
        SYSTEM_RUNNING,
        SYSTEM_HALT,
        SYSTEM_POWER_OFF,
        SYSTEM_RESTART,
        SYSTEM_SUSPEND,
} system_state;

build_all_zonelists_init()

mm/page_alloc.c

/*
 * build_all_zonelists_init - boot-time zonelist construction
 *
 * Builds the zonelists via __build_all_zonelists(NULL), sets up the
 * boot_pageset of every possible cpu, verifies the zonelists with
 * mminit_verify_zonelist() and finally calls
 * cpuset_init_current_mems_allowed().
 */
static noinline void __init
build_all_zonelists_init(void)
{
        int cpu;

        __build_all_zonelists(NULL);

        /*
         * Initialize the boot_pagesets that are going to be used
         * for bootstrapping processors. The real pagesets for
         * each zone will be allocated later when the per cpu
         * allocator is available.
         *
         * boot_pagesets are used also for bootstrapping offline
         * cpus if the system is already booted because the pagesets
         * are needed to initialize allocators on a specific cpu too.
         * F.e. the percpu allocator needs the page allocator which
         * needs the percpu allocator in order to allocate its pagesets
         * (a chicken-egg dilemma).
         */
        for_each_possible_cpu(cpu)
                setup_pageset(&per_cpu(boot_pageset, cpu), 0);

        mminit_verify_zonelist();
        cpuset_init_current_mems_allowed();
}

노드별로 zonelist를 생성하고 현재 태스크의 mems_allowed 노드마스크를 모두 설정하여 전체 메모리 노드에서 할당받을 수 있음을 나타낸다.

__build_all_zonelists()

mm/page_alloc.c

/*
 * __build_all_zonelists - (re)build zonelists for one node or all nodes
 * @data: NULL, or a pg_data_t pointer naming a specific node
 *
 * When @data names a node that is not online, only that node's
 * zonelists are built. Otherwise the zonelists of every online node are
 * built and, with CONFIG_HAVE_MEMORYLESS_NODES, numa_mem is set for each
 * online cpu. The whole operation runs under a function-local static
 * spinlock.
 */
static void __build_all_zonelists(void *data)
{
        int nid;
        int __maybe_unused cpu;
        pg_data_t *self = data;
        static DEFINE_SPINLOCK(lock);

        spin_lock(&lock);

#ifdef CONFIG_NUMA
        memset(node_load, 0, sizeof(node_load));
#endif

        /*
         * This node is hotadded and no memory is yet present.   So just
         * building zonelists is fine - no need to touch other nodes.
         */
        if (self && !node_online(self->node_id)) {
                build_zonelists(self);
        } else {
                for_each_online_node(nid) {
                        pg_data_t *pgdat = NODE_DATA(nid);

                        build_zonelists(pgdat);
                }

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
                /*
                 * We now know the "local memory node" for each node--
                 * i.e., the node of the first zone in the generic zonelist.
                 * Set up numa_mem percpu variable for on-line cpus.  During
                 * boot, only the boot cpu should be on-line;  we'll init the
                 * secondary cpus' numa_mem as they come on-line.  During
                 * node/memory hotplug, we'll fixup all on-line cpus.
                 */
                for_each_online_cpu(cpu)
                        set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
#endif
        }

        spin_unlock(&lock);
}

인자를 통해 지정한 노드 또는 모든 노드에 대해 zonelist와 per-cpu 페이지 프레임 캐시를 구성한다.

코드 라인 5에서 부팅 시 build_all_zonelists_init() 함수를 통해 호출된 경우 data 인자로 NULL이 주어진다.
코드 라인 18~19에서 인자로 요청한 노드가 아직 온라인 노드가 아니면(핫애드된 노드) 해당 노드에 대해서만 zonelists를 구성한다.
코드 라인 21~25에서 모든 온라인 노드를 순회하며 각 노드의 zonelists를 구성한다.
코드 라인 36~37에서 커널이 메모리리스 노드를 지원하는 경우 온라인 cpu별로 메모리가 있는 인접 노드를 지정한다.

build_zonelists()

mm/page_alloc.c

/*
 * Build zonelists ordered by zone and nodes within zones.
 * This results in conserving DMA zone[s] until all Normal memory is
 * exhausted, but results in overflowing to remote node while memory
 * may still exist in local DMA zone.
 *
 * NOTE(review): the paragraph above describes zone ordering, but the
 * body below collects nodes with find_next_best_node() and then calls
 * build_zonelists_in_node_order(); the description looks stale.
 *
 * What the body does: starting from @pgdat's own node, record the order
 * in which find_next_best_node() returns nodes in node_order[], assign
 * node_load[] to the first node of each new distance group, then build
 * the node-ordered zonelist and the this-node zonelist for @pgdat.
 */

static void build_zonelists(pg_data_t *pgdat)
{
        static int node_order[MAX_NUMNODES];
        int node, load, nr_nodes = 0;
        nodemask_t used_mask;
        int local_node, prev_node;

        /* NUMA-aware ordering of nodes */
        local_node = pgdat->node_id;
        load = nr_online_nodes;
        prev_node = local_node;
        nodes_clear(used_mask);

        memset(node_order, 0, sizeof(node_order));
        while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
                /*
                 * We don't want to pressure a particular node.
                 * So adding penalty to the first node in same
                 * distance group to make it round-robin.
                 */
                if (node_distance(local_node, node) !=
                    node_distance(local_node, prev_node))
                        node_load[node] = load;

                node_order[nr_nodes++] = node;
                prev_node = node;
                load--;
        }

        build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
        build_thisnode_zonelists(pgdat);
}

인자로 주어진 노드(pgdat)를 기준으로 인접한 순서대로 노드 순서를 결정한 후 노드 우선(node order)으로 zonelist를 만들어낸다. (커널 v4.14-rc1부터 존 우선 방식은 제거되었다.)

코드 라인 15에서 used_mask 비트맵에 포함되지 않은 노드 중 가장 인접한 노드 id를 구한다.
코드 라인 21~23에서 현재 노드와 알아온 인접 노드의 거리가 현재 노드와 기존 노드의 거리와 다르다면, 즉 이전 노드와 거리가 다르다면 node_load[node]에 load 값을 설정한다.
코드 라인 26~27에서 prev_node에 방금 처리한 현재 노드 id를 설정하고 load 값을 감소시킨다.
코드 라인 30에서 노드 오더용 zonelist를 구성하여 zonelists[0]에 추가한다.
코드 라인 31에서 현재 노드에 대해서만 활성화된 존을 zonelists[1]에 추가한다.
- GFP_THISNODE 플래그 옵션이 주어지는 경우 노드 fallback 기능을 사용하지 않도록 이 zonelists[1]을 사용한다.

“numa_zonelist_order” 커널 파라메터 & “/proc/sys/vm/numa_zonelist_order”

setup_numa_zonelist_order()

mm/page_alloc.c

/*
 * setup_numa_zonelist_order - early_param handler for "numa_zonelist_order"
 * @s: parameter value, may be NULL
 *
 * Returns 0 when @s is NULL; otherwise returns the result of
 * __parse_numa_zonelist_order(@s).
 */
static __init int setup_numa_zonelist_order(char *s)
{
        if (!s)
                return 0;

        return __parse_numa_zonelist_order(s);
}
early_param("numa_zonelist_order", setup_numa_zonelist_order);

누마 시스템에서 zonelist order를 다음 중 하나로 선택하였었으나 커널 v4.14-rc1 이후로는 이러한 커널 파라미터를 사용하는 경우 경고 메시지를 출력한다.

“numa_zonelist_order=d”
- automatic configuration
“numa_zonelist_order=n”
- node order (노드 거리에 따른 가장 가까운 best 노드 순)
“numa_zonelist_order=z”
- zone order (zone type 역순)

__parse_numa_zonelist_order()

mm/page_alloc.c

static int __parse_numa_zonelist_order(char *s)
{
        /*
         * We used to support different zonlists modes but they turned
         * out to be just not useful. Let's keep the warning in place
         * if somebody still use the cmd line parameter so that we do
         * not fail it silently
         */
        if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
                pr_warn("Ignoring unsupported numa_zonelist_order value:  %s\n", s);
                return -EINVAL;
        }
        return 0;
}

문자열이 Default 및 Node 약자인 d, D, n, N 문자로 시작하면 아무런 메시지 없이 0을 반환한다.

그 외의 문자(“Z”, “z” 등)로 시작하면 지원하지 않는 값이라는 경고 메시지를 출력하고 -EINVAL을 반환한다.

node_load[] 및 node_order[]

다음 그림은 NUMA 시스템에서 node_load[] 및 node_order[]를 알아본 사례이다.

node_order[]
- 현재 부팅한 cpu가 있는 로컬 노드를 대상으로 가장 빠른(가까운) 순서의 노드 번호를 저장한다.
node_load[]
- 다음 best 노드를 찾을 때 사용할 노드별 가중치 값

build_zonelists_in_node_order()

mm/page_alloc.c

/*
 * Build zonelists ordered by node and zones within node.
 * This results in maximum locality--normal zone overflows into local
 * DMA zone, if any--but risks exhausting DMA zone.
 */

static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
                unsigned nr_nodes)
{
        struct zoneref *zonerefs;
        int i;

        zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;

        for (i = 0; i < nr_nodes; i++) {
                int nr_zones;

                pg_data_t *node = NODE_DATA(node_order[i]);

                nr_zones = build_zonerefs_node(node, zonerefs);
                zonerefs += nr_zones;
        }
        zonerefs->zone = NULL;
        zonerefs->zone_idx = 0;
}

zonelist를 노드 우선으로 만든다.

아래 그림은 4개 노드에 대해 node order로 구축한 zonelists를 보여준다.

경우에 따라서는 DMA(DMA32)만 특별히 가장 밑으로 옮긴 경우도 있다.

아래 그림은 4개 노드에 대해 zone order로 구축한 zonelists를 보여준다. (커널 4.14-rc1 부터 사용하지 않음)

build_zonerefs_node()

mm/page_alloc.c

/*
 * Builds allocation fallback zone lists.
 *
 * Add all populated zones of a node to the zonelist.
 */

static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
{
        struct zone *zone;
        enum zone_type zone_type = MAX_NR_ZONES;
        int nr_zones = 0;

        do {
                zone_type--;
                zone = pgdat->node_zones + zone_type;
                if (managed_zone(zone)) {
                        zoneref_set_zone(zone, &zonerefs[nr_zones++]);
                        check_highest_zone(zone_type);
                }
        } while (zone_type);

        return nr_zones;
}

노드에 관련된 zone 정보를 구성하고, 구성한 존 수를 반환한다.

코드 라인 4~9에서 최상위 존(MAX_NR_ZONES - 1)부터 최하위 0번 존까지 역순으로 순회한다. (do ~ while 문이므로 zone_type이 0이 된 경우에도 0번 존을 처리한 후 루프를 빠져나간다)
코드 라인 10~13에서 실제 사용할 페이지가 있는 zone인 경우 출력 인자 @zonerefs 배열에 zone 정보를 추가하고 최상위 존인 경우 policy zone을 갱신한다.

build_thisnode_zonelists()

mm/page_alloc.c

/*
 * Build gfp_thisnode zonelists
 */

static void build_thisnode_zonelists(pg_data_t *pgdat)
{
        struct zoneref *zonerefs;
        int nr_zones;

        zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
        nr_zones = build_zonerefs_node(pgdat, zonerefs);
        zonerefs += nr_zones;
        zonerefs->zone = NULL;
        zonerefs->zone_idx = 0;
}

요청한 단일 노드용 zonelist를 구성한다.

Zonelist 캐시

zonelist 캐시는 커널 v4.4-rc1에서 제거되었다.

참고: mm, page_alloc: delete the zonelist_cache

아래 그림은 zonelist_cache를 구성한 모습이다. fullzones 비트맵은 0으로 clear한다.

mminit_verify_zonelist()

mm/mm_init.c

/*
 * Print the zonelists of every online node to the kernel log.
 *
 * The zonelists are simply reported, validation is manual. Nothing is
 * printed unless mminit_loglevel is at least MMINIT_VERIFY.
 */
void __init mminit_verify_zonelist(void)
{
        int nid;

        if (mminit_loglevel < MMINIT_VERIFY)
                return;

        for_each_online_node(nid) {
                pg_data_t *pgdat = NODE_DATA(nid);
                struct zone *zone;
                struct zoneref *z;
                struct zonelist *zonelist;
                int i, listid, zoneid;

                BUG_ON(MAX_ZONELISTS > 2);
                for (i = 0; i < MAX_ZONELISTS * MAX_NR_ZONES; i++) {

                        /* Identify the zone and nodelist */
                        zoneid = i % MAX_NR_ZONES;
                        listid = i / MAX_NR_ZONES;
                        zonelist = &pgdat->node_zonelists[listid];
                        zone = &pgdat->node_zones[zoneid];
                        if (!populated_zone(zone))
                                continue;

                        /* Print information about the zonelist */
                        printk(KERN_DEBUG "mminit::zonelist %s %d:%s = ",
                                listid > 0 ? "thisnode" : "general", nid,
                                zone->name);

                        /* Iterate the zonelist */
                        for_each_zone_zonelist(zone, z, zonelist, zoneid)
                                pr_cont("%d:%s ", zone_to_nid(zone), zone->name);

                        /*
                         * Finish the line as a continuation of the record
                         * started above; a bare printk("\n") would open a
                         * new record instead of terminating this one.
                         */
                        pr_cont("\n");
                }
        }
}

모든 노드의 존에 대해서 zonelists를 출력한다.

cpuset_init_current_mems_allowed()

kernel/cpuset.c

void __init cpuset_init_current_mems_allowed(void)
{
        nodes_setall(current->mems_allowed);
}

현재 태스크가 모든 노드의 메모리를 사용할 수 있도록 설정한다.

현재 태스크의 mems_allowed 노드마스크 비트맵에 대해 모든 비트를 1로 설정한다.

nodes_setall()

include/linux/nodemask.h

#define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES)
static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits)
{
        bitmap_fill(dstp->bits, nbits);
}

지정된 dst 노드 비트맵 마스크에 대해 모든 노드의 메모리를 사용할 수 있도록 설정한다.

dstp 노드마스크 비트맵에 대해 MAX_NUMNODES 수 만큼의 비트를 1로 설정한다

managed_zone()

include/linux/mmzone.h

/*
 * Returns true if a zone has pages managed by the buddy allocator.
 * All the reclaim decisions have to use this function rather than
 * populated_zone(). If the whole zone is reserved then we can easily
 * end up with populated_zone() && !managed_zone().
 */

static inline bool managed_zone(struct zone *zone)
{
        return zone_managed_pages(zone);
}

해당 존에 버디 할당자가 관리하는 페이지가 있는지 여부를 반환한다.

zone_managed_pages()

include/linux/mmzone.h

static inline unsigned long zone_managed_pages(struct zone *zone)
{
        return (unsigned long)atomic_long_read(&zone->managed_pages);
}

해당 존에서 버디 할당자가 관리하는 페이지 수를 반환한다.

zoneref_set_zone()

mm/page_alloc.c

static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
        zoneref->zone = zone;
        zoneref->zone_idx = zone_idx(zone);
}

zoneref 구조체에 zone 정보를 기록한다.

check_highest_zone()

include/linux/mempolicy.h

static inline void check_highest_zone(enum zone_type k)
{
        if (k > policy_zone && k != ZONE_MOVABLE)
                policy_zone = k;
}

인자로 요청한 존이 최상인 경우 policy 존으로 갱신한다. movable 존을 제외하고, device 존 -> highmem 존 -> normal 존 순서이다.

빈 페이지 캐시 페이지 산출

nr_free_pagecache_pages()

mm/page_alloc.c

/**
 * nr_free_pagecache_pages - count number of pages beyond high watermark
 *
 * nr_free_pagecache_pages() counts the number of pages which are beyond the
 * high watermark within all zones.
 */

unsigned long nr_free_pagecache_pages(void)
{
        return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
}

유저 페이지 캐시용(movable 가능한 페이지)으로 사용할 수 있는 빈 페이지 수를 반환한다. 단 high 워터마크 미만 페이지들은 제외한다.

highmem 존 또는 movable 존 이하 대상

nr_free_zone_pages()

mm/page_alloc.c

/**
 * nr_free_zone_pages - count number of pages beyond high watermark
 * @offset: The zone index of the highest zone
 *
 * nr_free_zone_pages() counts the number of counts pages which are beyond the
 * high watermark within all zones at or below a given zone index.  For each
 * zone, the number of pages is calculated as:
 *
 *     nr_free_zone_pages = managed_pages - high_pages
 */

static unsigned long nr_free_zone_pages(int offset)
{
        struct zoneref *z;
        struct zone *zone;

        /* Just pick one node, since fallback list is circular */
        unsigned long sum = 0;

        struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);

        for_each_zone_zonelist(zone, z, zonelist, offset) {
                unsigned long size = zone_managed_pages(zone);
                unsigned long high = high_wmark_pages(zone);
                if (size > high)
                        sum += size - high;
        }

        return sum;
}

zonelist의 존들 중 offset 존 이하의 zone에 대해 free 페이지 수를 알아오는데 high 워터마크 페이지 수를 제외한다.

코드 라인 9에서 현재 cpu의 노드용 zonelist이다.
코드 라인 11~16에서 zonelist에 포함된 모든 zone들 중 offset 이하의 존을 순회하며 버디 시스템이 관리하는 managed 페이지 수에서 high 워터마크 페이지 수를 뺀 값을 누적한다. managed 페이지 수가 high 워터마크 이하인 존은 누적하지 않는다.

numa_node_id()

include/linux/topology.h

#ifndef numa_node_id
/* Returns the number of the current Node. */
static inline int numa_node_id(void)
{
        return raw_cpu_read(numa_node);
}
#endif

per-cpu 데이터인 numa_node 값을 알아온다. 즉 해당 cpu에 대한 numa_node id 값을 리턴한다.

이 값은 set_numa_node() 또는 set_cpu_numa_node() 함수에 의해 설정된다.

node_zonelist()

include/linux/gfp.h

/*
 * We get the zone list from the current node and the gfp_mask.
 * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones.
 * There are two zonelists per node, one for all zones with memory and
 * one containing just zones from the node the zonelist belongs to.
 *
 * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
 * optimized to &contig_page_data at compile-time.
 */

static inline struct zonelist *node_zonelist(int nid, gfp_t flags)
{
        return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags);
}

노드에 대응하는 2 개의 zonelists 중 하나를 다음 조건으로 리턴한다.

NUMA 시스템에서 __GFP_THISNODE 플래그 비트가 있는 경우 오직 자신의 노드에 대한 zone 만 포함된 zonelists[1]
otherwise, 모든 노드에 대한 zone이 포함된 zonelists[0]

gfp_zonelist()

include/linux/gfp.h

/*
 * There is only one page-allocator function, and two main namespaces to
 * it. The alloc_page*() variants return 'struct page *' and as such
 * can allocate highmem pages, the *get*page*() variants return
 * virtual kernel addresses to the allocated page(s).
 */

static inline int gfp_zonelist(gfp_t flags)
{
#ifdef CONFIG_NUMA
        if (unlikely(flags & __GFP_THISNODE))
                return ZONELIST_NOFALLBACK;
#endif
        return ZONELIST_FALLBACK;
}

요청 플래그에 따른 zonelist 인덱스를 반환한다. ZONELIST_FALLBACK(0) 또는 ZONELIST_NOFALLBACK(1)

__GFP_THISNODE 플래그 유무에 따라
- 플래그가 있으면 zonelist의 fallback을 허용하지 않고 현재 노드에서만 할당하도록 1번 인덱스를 반환한다.
- 플래그가 없으면 fallback 가능한 여러 노드에서 할당할 수 있게 0번 인덱스를 반환한다.

구조체

zonelist 구조체

include/linux/mmzone.h

/*
 * One allocation request operates on a zonelist. A zonelist
 * is a list of zones, the first one is the 'goal' of the
 * allocation, the other zones are fallback zones, in decreasing
 * priority.
 *
 * To speed the reading of the zonelist, the zonerefs contain the zone index
 * of the entry being read. Helper functions to access information given
 * a struct zoneref are
 *
 * zonelist_zone()      - Return the struct zone * for an entry in _zonerefs
 * zonelist_zone_idx()  - Return the index of the zone for an entry
 * zonelist_node_idx()  - Return the index of the node for an entry
 */

struct zonelist {
        struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1];
};

_zonerefs[]
- 메모리 할당을 위해 fallback을 위한 zone들로 구성된다.

zoneref 구조체

include/linux/mmzone.h

/*
 * This struct contains information about a zone in a zonelist. It is stored
 * here to avoid dereferences into large structures and lookups of tables
 */

struct zoneref {
        struct zone *zone;      /* Pointer to actual zone */
        int zone_idx;           /* zone_idx(zoneref->zone) */
};

*zone
- 존 포인터
zone_idx
- 존에 대한 인덱스(0~3)

참고

Bit Operations | 문c
Bitmap Operations | 문c
NODE 비트맵 (API) | 문c
CPU 비트맵 (API) | 문c
ZONE 비트맵 (API) | 문c
setup_per_cpu_pageset() | 문c
Per-CPU Page Frame Cache (zone->pageset) | 문c

ZONE 비트맵 (API)

2016-04-192016-11-05 문영일 Leave a comment

for_each_zone_zonelist()

include/linux/mmzone.h

/**             
 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
 * @zone - The current zone in the iterator
 * @z - The current pointer within zonelist->zones being iterated
 * @zlist - The zonelist being iterated
 * @highidx - The zone index of the highest zone to return
 *
 * This iterator iterates though all zones at or below a given zone index.
 */
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
        for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)

zonelist에서 highest_zoneidx 이하의 zone 엔트리들에 대해 루프를 돌며 zone과 z(zoneref)를 반환한다.

for_each_zone_zonelist_nodemask()

include/linux/mmzone.h

/**
 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
 * @zone - The current zone in the iterator
 * @z - The current pointer within zonelist->zones being iterated
 * @zlist - The zonelist being iterated
 * @highidx - The zone index of the highest zone to return
 * @nodemask - Nodemask allowed by the allocator
 *
 * This iterator iterates though all zones at or below a given zone index and
 * within a given nodemask
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
        for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
                zone;                                                   \
                z = next_zones_zonelist(++z, highidx, nodemask),        \
                        zone = zonelist_zone(z))                        \

zonelist에서 highest_zoneidx 이하의 zone이면서 nodes 비트맵에 설정된 노드들인 zone 엔트리들에 대해 루프를 돌며 zone과 z(zoneref)를 반환한다.

/**
 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
 * @zonelist - The zonelist to search for a suitable zone
 * @highest_zoneidx - The zone index of the highest zone to return
 * @nodes - An optional nodemask to filter the zonelist with
 * @zone - The first suitable zone found is returned via this parameter
 *
 * This function returns the first zone at or below a given zone index that is
 * within the allowed nodemask. The zoneref returned is a cursor that can be
 * used to iterate the zonelist with next_zones_zonelist by advancing it by
 * one before calling.
 */
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes,
                                        struct zone **zone)
{
        struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs,
                                                        highest_zoneidx, nodes);
        *zone = zonelist_zone(z);
        return z;
}

zonelist에서 highest_zoneidx 이하의 zone이면서 nodes 비트맵에 설정된 노드들에서 가장 처음 발견된 zone을 반환한다.

아래 그림은 zonelist의 처음 부터 검색을 하여 ZONE_NORMAL 타입을 초과하지 않는 zone에 대해 적합한 zone을 알아온다.

next_zones_zonelist()

mm/mmzone.c

/**
 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
 * @z - The cursor used as a starting point for the search
 * @highest_zoneidx - The zone index of the highest zone to return
 * @nodes - An optional nodemask to filter the zonelist with
 *
 * This function returns the next zone at or below a given zone index that is
 * within the allowed nodemask using a cursor as the starting point for the
 * search. The zoneref returned is a cursor that represents the current zone
 * being examined. It should be advanced by one before calling
 * next_zones_zonelist again.
 */
/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *next_zones_zonelist(struct zoneref *z,
                                        enum zone_type highest_zoneidx,
                                        nodemask_t *nodes)
{
        /*
         * Find the next suitable zone to use for the allocation.
         * Only filter based on nodemask if it's set
         */
        if (likely(nodes == NULL))
                while (zonelist_zone_idx(z) > highest_zoneidx)
                        z++;
        else
                while (zonelist_zone_idx(z) > highest_zoneidx ||
                                (z->zone && !zref_in_nodemask(z, nodes)))
                        z++;

        return z;
}

zonelist에서 인수로 받은 커서 z 위치부터 검색하여 zone이 highest_zoneidx 이하의 zone인 경우 그 zoneref를 리턴하되 인수 nodes에 대해 아래와 같이 처리한다.

인수 nodes가 지정된 경우 zonelists의 각 zone은 nodes에 포함된 zone으로 한정한다.
인수 nodes가 지정되지 않은 경우 node에 대해 제한 없다.

if (likely(nodes == NULL))
- 높은 확률로 nodes가 null인 경우
while (zonelist_zone_idx(z) > highest_zoneidx) z++;
- zonelist에서 zone 타입이 highest_zoneidx를 초과하는 경우 다음 존을 계속 진행한다.
while (zonelist_zone_idx(z) > highest_zoneidx || (z->zone && !zref_in_nodemask(z, nodes))) z++;
- zonelist에 zone 타입이 highest_zoneidx를 초과하거나 인수 nodes에 포함되어 있지 않은 경우 다음 존을 계속 진행한다.

아래 그림은 z 부터 검색을 하여 ZONE_NORMAL 타입을 초과하지 않는 zone에 대해 적합한 zone을 알아온다.

zonelist_zone()

include/linux/mmzone.h

static inline struct zone *zonelist_zone(struct zoneref *zoneref)
{
        return zoneref->zone;
}

zonelist에서 zoneref의 zone 을 리턴한다.

zonelist_zone_idx()

include/linux/mmzone.h

static inline int zonelist_zone_idx(struct zoneref *zoneref)
{                                       
        return zoneref->zone_idx;
}

zonelist에서 zoneref의 zone 타입을 리턴한다. (based 0)

zref_in_nodemask()

mm/mmzone.c

static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
{
#ifdef CONFIG_NUMA
        return node_isset(zonelist_node_idx(zref), *nodes);
#else
        return 1;
#endif /* CONFIG_NUMA */
}

nodes 노드 비트맵에 zref 의 노드가 포함 여부를 리턴한다. NUMA 시스템이 아닌 경우는 항상 1을 리턴한다.

ZONE과 관련한 GFP

gfp_zone()

include/linux/gfp.h

static inline enum zone_type gfp_zone(gfp_t flags)
{               
        enum zone_type z;
        int bit = (__force int) (flags & GFP_ZONEMASK);
                        
        z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) &
                                         ((1 << ZONES_SHIFT) - 1);
        VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1);
        return z;
}

ZONE 비트 정보가 포함된 flags를 사용하여 zone_type (based 0)을 리턴한다.

int bit = (__force int) (flags & GFP_ZONEMASK);
- ZONE에 대한 __GFP_**** 플래그를 추출한다.
z = (GFP_ZONE_TABLE >> (bit * ZONES_SHIFT)) & ((1 << ZONES_SHIFT) - 1);
- zone type(based 0)을 알아와서 리턴한다.

/*
 * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the
 * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long
 * and there are 16 of them to cover all possible combinations of
 * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM.
 *
 * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA.
 * But GFP_MOVABLE is not only a zone specifier but also an allocation
 * policy. Therefore __GFP_MOVABLE plus another zone selector is valid.
 * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1".
 *
 *       bit       result
 *       =================
 *       0x0    => NORMAL
 *       0x1    => DMA or NORMAL
 *       0x2    => HIGHMEM or NORMAL
 *       0x3    => BAD (DMA+HIGHMEM)
 *       0x4    => DMA32 or DMA or NORMAL
 *       0x5    => BAD (DMA+DMA32)
 *       0x6    => BAD (HIGHMEM+DMA32)
 *       0x7    => BAD (HIGHMEM+DMA32+DMA)
 *       0x8    => NORMAL (MOVABLE+0)
 *       0x9    => DMA or NORMAL (MOVABLE+DMA)
 *       0xa    => MOVABLE (Movable is valid only if HIGHMEM is set too)
 *       0xb    => BAD (MOVABLE+HIGHMEM+DMA)
 *       0xc    => DMA32 (MOVABLE+DMA32)
 *       0xd    => BAD (MOVABLE+DMA32+DMA)
 *       0xe    => BAD (MOVABLE+DMA32+HIGHMEM)
 *       0xf    => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
 *
 * ZONES_SHIFT must be <= 2 on 32 bit platforms.
 */

#if 16 * ZONES_SHIFT > BITS_PER_LONG
#error ZONES_SHIFT too large to create GFP_ZONE_TABLE integer
#endif

#define GFP_ZONE_TABLE ( \
        (ZONE_NORMAL << 0 * ZONES_SHIFT)                                      \
        | (OPT_ZONE_DMA << ___GFP_DMA * ZONES_SHIFT)                          \
        | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * ZONES_SHIFT)                  \
        | (OPT_ZONE_DMA32 << ___GFP_DMA32 * ZONES_SHIFT)                      \
        | (ZONE_NORMAL << ___GFP_MOVABLE * ZONES_SHIFT)                       \
        | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * ZONES_SHIFT)       \
        | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * ZONES_SHIFT)   \
        | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * ZONES_SHIFT)   \
)

/*
 * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32
 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per
 * entry starting with bit 0. Bit is set if the combination is not
 * allowed.
 */
#define GFP_ZONE_BAD ( \
        1 << (___GFP_DMA | ___GFP_HIGHMEM)                                    \
        | 1 << (___GFP_DMA | ___GFP_DMA32)                                    \
        | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM)                                \
        | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM)                   \
        | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA)                 \
        | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA)                   \
        | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM)               \
        | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM)  \
)

ZONES_SHIFT

include/linux/page-flags-layout.h

/*
 * When a memory allocation must conform to specific limitations (such
 * as being suitable for DMA) the caller will pass in hints to the
 * allocator in the gfp_mask, in the zone modifier bits.  These bits
 * are used to select a priority ordered list of memory zones which
 * match the requested limits. See gfp_zone() in include/linux/gfp.h
 */
#if MAX_NR_ZONES < 2
#define ZONES_SHIFT 0
#elif MAX_NR_ZONES <= 2
#define ZONES_SHIFT 1
#elif MAX_NR_ZONES <= 4
#define ZONES_SHIFT 2
#else
#error ZONES_SHIFT -- too many zones configured adjust calculation
#endif

zone 을 표현하기 위해 좌측 쉬프트를 해야 하는 비트 수

zone이 1개인 경우 0 비트 (zone 비트가 필요 없음)
zone이 2개인 경우 1 비트
zone이 3~4개인 경우 2비트
zone이 5개 이상인 경우 에러

gfpflags_to_migratetype()

include/linux/gfp.h

/* Convert GFP flags to their corresponding migrate type */
static inline int gfpflags_to_migratetype(const gfp_t gfp_flags)
{               
        WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK);
        
        if (unlikely(page_group_by_mobility_disabled))
                return MIGRATE_UNMOVABLE;

        /* Group based on mobility */
        return (((gfp_flags & __GFP_MOVABLE) != 0) << 1) |
                ((gfp_flags & __GFP_RECLAIMABLE) != 0);
}

gfp_flags에서 migrate type을 알아내어 반환한다.

참고

CPU 비트맵 (API)

2016-04-182019-10-07 문영일 Leave a comment

for_each_cpu()

include/linux/cpumask.h

/**
 * for_each_cpu - iterate over every cpu in a mask
 * @cpu: the (optionally unsigned) integer iterator
 * @mask: the cpumask pointer
 *
 * After the loop, cpu is >= nr_cpu_ids.
 */
#define for_each_cpu(cpu, mask)                         \
        for ((cpu) = -1;                                \
                (cpu) = cpumask_next((cpu), (mask)),    \
                (cpu) < nr_cpu_ids;)

mask가 가리키는 비트맵에서 각 비트가 1로 기록된 cpu만큼 루프를 돈다.
- nr_cpu_ids는 초기에 NR_CPUS 값이 설정되었다가 나중에 possible cpu id + 1 값으로 설정된다.

for_each_possible_cpu()

include/linux/cpumask.h

#define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)

cpu_possible_mask가 가리키는 비트맵에서 각 비트가 1로 기록된 cpu만큼 루프를 돈다.

for_each_online_cpu()

include/linux/cpumask.h

#define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)

cpu_online_mask가 가리키는 비트맵에서 각 비트가 1로 기록된 cpu만큼 루프를 돈다.

for_each_present_cpu()

include/linux/cpumask.h

#define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)

cpu_present_mask가 가리키는 비트맵에서 각 비트가 1로 기록된 cpu만큼 루프를 돈다.

on_each_cpu_mask()

/**
 * on_each_cpu_mask(): Run a function on processors specified by
 * cpumask, which may include the local processor.
 * @mask: The set of cpus to run on (only runs on online subset).
 * @func: The function to run. This must be fast and non-blocking.
 * @info: An arbitrary pointer to pass to the function.
 * @wait: If true, wait (atomically) until function has completed
 *        on other CPUs.
 *
 * If @wait is true, then returns once @func has returned.
 *
 * You must not call this function with disabled interrupts or from a
 * hardware interrupt handler or from a bottom half handler.  The
 * exception is that it may be used during early boot while
 * early_boot_irqs_disabled is set.
 */                     
void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
                        void *info, bool wait)
{
        int cpu = get_cpu();    

        smp_call_function_many(mask, func, info, wait);
        if (cpumask_test_cpu(cpu, mask)) {
                unsigned long flags;
                local_irq_save(flags);
                func(info);
                local_irq_restore(flags);
        }               
        put_cpu();
}        
EXPORT_SYMBOL(on_each_cpu_mask);

func 함수를 다른 cpu에서 모두 수행시키고 현재 cpu에서도 수행시킨다. 인수 wait이 true인 경우 모든 cpu의 수행이 완료될 때까지 기다린다.

smp_call_function_many()
- cpu mask 중 현재 cpu를 제외한 cpu에서 func을 수행한다.
- 참고: IPI cross call – 소프트 인터럽트 | 문c

cpumask_first()

include/linux/cpumask.h

/**
 * cpumask_first - get the first cpu in a cpumask
 * @srcp: the cpumask pointer
 *
 * Returns >= nr_cpu_ids if no cpus set.
 */
static inline unsigned int cpumask_first(const struct cpumask *srcp)
{
        return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
}

srcp가 가리키는 cpu 상태 관련 비트맵의 nr_cpumask_bits 까지에서 1로 설정된 첫 cpu id(based 0)를 알아온다.

cpumask_next()

include/linux/cpumask.h

/**
 * cpumask_next - get the next cpu in a cpumask
 * @n: the cpu prior to the place to search (ie. return will be > @n)
 * @srcp: the cpumask pointer
 *
 * Returns >= nr_cpu_ids if no further cpus set.
 */
static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
        /* -1 is a legal arg here. */
        if (n != -1)
                cpumask_check(n);
        return find_next_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
}

srcp가 가리키는 cpu 상태 관련 비트맵의 nr_cpumask_bits 까지에서 n+1 번째 비트부터 1로 설정된 cpu id(based 0)를 찾아 알아온다.

cpumask_next_zero()

include/linux/cpumask.h

/**
 * cpumask_next_zero - get the next unset cpu in a cpumask
 * @n: the cpu prior to the place to search (ie. return will be > @n)
 * @srcp: the cpumask pointer
 *
 * Returns >= nr_cpu_ids if no further cpus unset.
 */
static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
{
        /* -1 is a legal arg here. */
        if (n != -1)
                cpumask_check(n);
        return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
}

srcp가 가리키는 cpu 상태 관련 비트맵의 nr_cpumask_bits 까지에서 n+1 번째 비트부터 0으로 설정된 cpu id(based 0)를 찾아 알아온다.

cpumask_next_and()

lib/cpumask.c

/**
 * cpumask_next_and - get the next cpu in *src1p & *src2p
 * @n: the cpu prior to the place to search (ie. return will be > @n)
 * @src1p: the first cpumask pointer
 * @src2p: the second cpumask pointer
 *
 * Returns >= nr_cpu_ids if no further cpus set in both.
 */ 
int cpumask_next_and(int n, const struct cpumask *src1p,
                     const struct cpumask *src2p)
{
        while ((n = cpumask_next(n, src1p)) < nr_cpu_ids)
                if (cpumask_test_cpu(n, src2p))
                        break;
        return n;
}
EXPORT_SYMBOL(cpumask_next_and);

src1p와 src2p가 가리키는 두 개의 cpu 상태 관련 비트맵의 n+1 번째 비트부터 두 비트맵이 동시에 1로 설정된 cpu id(based 0)를 찾아 알아온다.

cpumask_first_and()

include/linux/cpumask.h

/**
 * cpumask_first_and - return the first cpu from *srcp1 & *srcp2
 * @src1p: the first input
 * @src2p: the second input
 *       
 * Returns >= nr_cpu_ids if no cpus set in both.  See also cpumask_next_and().
 */
#define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p))

src1p와 src2p가 가리키는 두 개의 cpu 상태 관련 비트맵의 첫 번째 비트(0번 비트)부터 두 비트맵이 동시에 1로 설정된 cpu id(based 0)를 찾아 알아온다.

CPU 상태

cpumask_t 타입

include/linux/cpumask.h

typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t;

cpu 비트맵 배열

다음은 unsigned long 배열 형태의 비트맵이다.

cpu_all_bits[]
cpu_possible_bits[]
cpu_online_bits[]
cpu_present_bits[]
cpu_active_bits[]

다음은 위 비트맵 중 cpu_all_bits[]를 제외한 4개의 비트맵을 가리키는 const 포인터이다.

*cpu_possible_mask
*cpu_online_mask
*cpu_present_mask
*cpu_active_mask

kernel/cpu.c

const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL;
EXPORT_SYMBOL(cpu_all_bits);

#ifdef CONFIG_INIT_ALL_POSSIBLE
static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly
        = CPU_BITS_ALL;
#else
static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly;
#endif
const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits);
EXPORT_SYMBOL(cpu_possible_mask);

static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits);
EXPORT_SYMBOL(cpu_online_mask);

static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
EXPORT_SYMBOL(cpu_present_mask);

static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
EXPORT_SYMBOL(cpu_active_mask);

전역 변수

nr_cpu_ids

kernel/smp.c

/* Setup number of possible processor ids */
int nr_cpu_ids __read_mostly = NR_CPUS;
EXPORT_SYMBOL(nr_cpu_ids);

초기 값은 NR_CPUS로 설정됨

setup_max_cpus

kernel/smp.c

/* Setup configured maximum number of CPUs to activate */
unsigned int setup_max_cpus = NR_CPUS;
EXPORT_SYMBOL(setup_max_cpus);

초기 값은 NR_CPUS로 설정됨

커널 파라메터

“nosmp”

kernel/smp.c

static int __init nosmp(char *str)
{
        setup_max_cpus = 0;
        arch_disable_smp_support();

        return 0;
}

early_param("nosmp", nosmp);

커널 파라메터 “nosmp”가 입력되면 setup_max_cpus에 0을 대입하고 arch_disable_smp_support() 함수를 통해 smp 기능이 동작하지 않도록 한다.

현재 이 커널 파라메터는 x86 시스템에만 구현되어 있다.

arch_disable_smp_support()

kernel/smp.c

/*
 * Setup routine for controlling SMP activation
 *
 * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
 * activation entirely (the MPS table probe still happens, though).
 *
 * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
 * greater than 0, limits the maximum number of CPUs activated in
 * SMP mode to <NUM>.
 */

void __weak arch_disable_smp_support(void) { }

각 아키텍처에서 정의된 함수를 사용하며 없는 경우 weak 함수로 정의된 빈 함수가 동작한다.

“nr_cpus”

kernel/smp.c

/*
 * "nr_cpus=" early parameter handler. This is a hard limit: it can only
 * lower nr_cpu_ids, never raise it.
 *
 * @str: the text following "nr_cpus=" on the kernel command line.
 *
 * Always returns 0; an unparsable or out-of-range value is ignored.
 */
static int __init nrcpus(char *str)
{
        int nr_cpus;

        /*
         * get_option() returns 0 without storing anything in nr_cpus when
         * @str is NULL or empty, so only inspect nr_cpus after a successful
         * parse to avoid reading an uninitialized value.
         */
        if (get_option(&str, &nr_cpus) &&
            nr_cpus > 0 && nr_cpus < nr_cpu_ids)
                nr_cpu_ids = nr_cpus;

        return 0;
}

early_param("nr_cpus", nrcpus);

커널 파라메터 “nr_cpus”로 입력 받은 cpu 수를 nr_cpu_ids에 대입한다. 대입 가능 범위는 1 ~ (현재 nr_cpu_ids – 1)이며, 이 범위를 벗어난 값은 무시한다.

예) “nr_cpus=2”

get_option()

lib/cmdline.c

/**
 *      get_option - Parse integer from an option string 
 *      @str: option string
 *      @pint: (output) integer value parsed from @str
 *
 *      Read an int from an option string; if available accept a subsequent
 *      comma as well.
 *
 *      Return values:
 *      0 - no int in string
 *      1 - int found, no subsequent comma
 *      2 - int found including a subsequent comma
 *      3 - hyphen found to denote a range
 */

int get_option(char **str, int *pint)
{
        char *cur = *str;

        if (!cur || !(*cur))
                return 0;
        *pint = simple_strtol(cur, str, 0);
        if (cur == *str)
                return 0;
        if (**str == ',') {
                (*str)++; 
                return 2;
        }
        if (**str == '-')
                return 3;
        
        return 1;
}
EXPORT_SYMBOL(get_option);

str 문자열을 정수로 변환하여 pint에 저장하고 리턴 되는 값은 다음과 같다.

정수 변환이 되지 않는 경우 0
정수 변환 후, 옵션 문자열이 없는 경우 1
정수 변환 후, 옵션 문자열이 ‘,’인 경우 2
정수 변환 후, 옵션 문자열이 ‘-‘인 경우 3

if (!cur || !(*cur)) return 0;
- cur가 null이거나 cur 문자열이 “”인 경우 0을 리턴한다.
*pint = simple_strtol(cur, str, 0);
- cur를 파싱하고 숫자로 변환하여 pint에 저장한다. 변환된 문자열의 다음 문자를 가리키는 주소를 출력 인수 str에 저장한다.
if (cur == *str) return 0;
- 문자열을 숫자로 변환되지 않는 경우 0을 리턴한다.
if (**str == ',') { (*str)++; return 2; }
- 변환 후 다음 문자가 ','인 경우 str을 증가시키고 2를 리턴한다.
if (**str == '-') return 3;
- 변환 후 다음 문자가 '-'인 경우 str을 증가시키지 않고 그대로 둔 채 3을 리턴한다.
변환 후 문자가 ',' 또는 '-'가 아닌 경우 1을 리턴한다.

“maxcpus”

kernel/smp.c

/*
 * "maxcpus=" early parameter handler.
 *
 * Parses the value into setup_max_cpus; when setup_max_cpus ends up
 * as zero, arch_disable_smp_support() is invoked.
 *
 * Always returns 0.
 */
static int __init maxcpus(char *str)
{
        get_option(&str, &setup_max_cpus);

        if (!setup_max_cpus)
                arch_disable_smp_support();

        return 0;
}

early_param("maxcpus", maxcpus);

커널 파라메터 “maxcpus”로 입력 받은 값을 setup_max_cpus에 대입한다.

만일 0이 입력되면 smp 기능이 동작하지 않게 한다.
- 현재 0 값의 허용은 x86 시스템에만 구현되어 있다.
예) “maxcpus=2”

싱글 비트 설정된 cpumask

get_cpu_mask()

include/linux/cpumask.h

/*
 * Return a constant cpumask pointer derived from cpu_bit_bitmap for @cpu.
 *
 * The row is selected by the bit position of @cpu within one long
 * (offset by one row), and the pointer is then moved back by the index
 * of the long that would hold @cpu.
 */
static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
{
        unsigned int bit_in_long = cpu % BITS_PER_LONG;
        unsigned int long_index = cpu / BITS_PER_LONG;
        const unsigned long *row = cpu_bit_bitmap[1 + bit_in_long];

        return to_cpumask(row - long_index);
}

@cpu에 해당하는 싱글비트 설정된 cpumask 비트맵을 반환한다.

NR_CPUS가 어떤 값이든 요청 cpu에 대해 싱글 비트가 설정된 cpumask를 만들어내기 위해 컴파일 타임에 특수한 형태의 static 배열 cpu_bit_bitmap[][]을 만든다.
예) NR_CPUS=192일 때, 각 cpu에 대해 반환되는 cpumask는 다음과 같다. (cpu 번호는 0부터 시작하며 cpu N에 대해 비트 N이 설정된다. 유효한 cpu 번호는 0~191이다.)
- get_cpu_mask(0) = 0x0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0001
- get_cpu_mask(1) = 0x0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0002
- get_cpu_mask(4) = 0x0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0010
- get_cpu_mask(65) = 0x0000_0000_0000_0000_0000_0000_0000_0002_0000_0000_0000_0000
- get_cpu_mask(99) = 0x0000_0000_0000_0000_0000_0008_0000_0000_0000_0000_0000_0000
- get_cpu_mask(129) = 0x0000_0000_0000_0002_0000_0000_0000_0000_0000_0000_0000_0000
- get_cpu_mask(191) = 0x8000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000

cpu_bit_bitmap 배열

kernel/cpu.c

/*
 * Table backing the "single bit set only" constant CPU masks.
 *
 * One row is pre-generated per possible bit position inside a long
 * (64 of them, or 32 on 32-bit longs), plus one extra leading row,
 * giving BITS_PER_LONG+1 rows in total. Each row is padded out to
 * BITS_TO_LONGS(NR_CPUS) longs so that a constant pointer, suitably
 * offset into the table, can be handed out as a mask.
 */

const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {

        MASK_DECLARE_8(0),
        MASK_DECLARE_8(8),
        MASK_DECLARE_8(16),
        MASK_DECLARE_8(24),
#if BITS_PER_LONG > 32
        MASK_DECLARE_8(32),
        MASK_DECLARE_8(40),
        MASK_DECLARE_8(48),
        MASK_DECLARE_8(56),
#endif
};
EXPORT_SYMBOL_GPL(cpu_bit_bitmap);

get_cpu_mask() 함수에서 사용하기 위한 특수한 형태의 싱글 비트 설정된 cpumask를 컴파일 타임에 준비한다.

예) NR_CPUS=192인 경우 long 값이 64비트를 처리할 수 있으므로, 192개의 cpu를 처리하려면 3개의 배열을 사용해야 한다.

cpu_bit_bitmap[64+1][3] ={
- {0x0, 0x0, 0x0},
- {0x1, 0x0, 0x0},
- {0x2, 0x0, 0x0},
- {0x4, 0x0, 0x0},
- {0x8, 0x0, 0x0},
- {0x10, 0x0, 0x0},
- {0x20, 0x0, 0x0},
- {0x40, 0x0, 0x0},
- {0x80, 0x0, 0x0},
- {0x100, 0x0, 0x0},
- {0x200, 0x0, 0x0},
- {0x400, 0x0, 0x0},
- {0x800, 0x0, 0x0},
- {0x1000, 0x0, 0x0},
- {0x2000, 0x0, 0x0},
- {0x4000, 0x0, 0x0},
- {0x8000, 0x0, 0x0},
- …
- {0x8000_0000_0000_0000, 0x0, 0x0}

unsigned long cpu_bit_bitmap[NR_CPUS+1];과 같은 형태로 선언하고 배열 내의 각 cpumask 값을 런타임에 준비할 수도 있지만, NR_CPUS 값이 클 때 메모리가 많이 낭비되며, 런타임에 별도의 초기화를 수행해야 하는 단점이 있다. 따라서 이 루틴을 설계한 개발자는 특수한 형태의 배열을 컴파일 타임에 준비해두고 이를 교묘하게 활용하여 get_cpu_mask() 함수를 통해 요청한 cpu에 대한 싱글 비트가 설정된 값을 담은 cpumask를 반환할 수 있게 설계하였다.

NR_CPUS=192일 때 little 엔디안 cpu을 사용하는 시스템에서 cpu_bit_bitmap을 덤프하면 다음과 같다.

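cpu=5일 때 오렌지색에 해당하는 cpumask를 반환한다.(cpu_bit_bitmap[1 + 5]의 시작 주소를 반환한다.)
- =0x20
cpu=73일 때 파란색에 해당하는 cpumask를 반환한다.(cpu_bit_bitmap[1 + 73 % 64]에서 unsigned long 단위를 cpu/64=1만큼 뒤로 이동한 포인터를 반환한다.)
- =0x200_0000_0000_0000_0000
cpu=191일 때 초록색에 해당하는 cpumask를 반환한다.(NR_CPUS=192에서 유효한 마지막 cpu 번호는 191이다.)
- =0x8000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000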

0x00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[0]
0x01 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[1]
0x02 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[2]
0x04 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[3]
0x08 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[4]
0x10 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[5]
0x20 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[6]
0x40 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[7]  
0x80 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[8]  
0x00 01 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[9]  
0x00 02 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[10]  
0x00 04 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[11]  
0x00 08 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[12]  
0x00 10 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[13]  
0x00 20 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[14]  
0x00 40 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[15]  
0x00 80 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[16]
(...생략...)
0x00 00 00 00 00 00 00 40 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[63]
0x00 00 00 00 00 00 00 80 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[64]

NR_CPUS=192일 때 big 엔디안 cpu를 사용하는 시스템에서 cpu_bit_bitmap을 덤프하면 다음과 같다.

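cpu=5일 때 오렌지색에 해당하는 cpumask를 반환한다.(cpu_bit_bitmap[1 + 5]의 시작 주소를 반환한다.)
- =0x20
cpu=73일 때 파란색에 해당하는 cpumask를 반환한다.(cpu_bit_bitmap[1 + 73 % 64]에서 unsigned long 단위를 cpu/64=1만큼 뒤로 이동한 포인터를 반환한다.)
- =0x200_0000_0000_0000_0000
cpu=191일 때 초록색에 해당하는 cpumask를 반환한다.(NR_CPUS=192에서 유효한 마지막 cpu 번호는 191이다.)
- =0x8000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000_0000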

0x00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[0]
0x00 00 00 00 00 00 00 01 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[1] 
0x00 00 00 00 00 00 00 02 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[2] 
0x00 00 00 00 00 00 00 04 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[3] 
0x00 00 00 00 00 00 00 08 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[4] 
0x00 00 00 00 00 00 00 10 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[5] 
0x00 00 00 00 00 00 00 20 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[6] 
0x00 00 00 00 00 00 00 40 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[7] 
0x00 00 00 00 00 00 00 80 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[8] 
0x00 00 00 00 00 00 01 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[9] 
0x00 00 00 00 00 00 02 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[10] 
0x00 00 00 00 00 00 04 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[11] 
0x00 00 00 00 00 00 08 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[12] 
0x00 00 00 00 00 00 10 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[13] 
0x00 00 00 00 00 00 20 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[14] 
0x00 00 00 00 00 00 40 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[15] 
0x00 00 00 00 00 00 80 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[16] 
(...생략...)
0x40 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[63]
0x80 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 - 00 00 00 00 00 00 00 00 : cpu_bit_bitmap[64]

MASK_DECLARE_x 매크로

kernel/cpu.c

/*
 * cpu_bit_bitmap[] is a special, "compressed" data structure that
 * represents all NR_CPUS bits binary values of 1<<nr.
 *
 * It is used by cpumask_of() to get a constant address to a CPU
 * mask value that has a single bit set only.
 */

/* cpu_bit_bitmap[0] is empty - so we can back into it */

/*
 * MASK_DECLARE_1(x) expands to a single designated initializer that
 * sets bit x in word 0 of row x+1. Each larger macro expands to two
 * copies of the next-smaller one, so MASK_DECLARE_8(x) initializes the
 * eight consecutive rows x+1 .. x+8 with bits x .. x+7.
 */
#define MASK_DECLARE_1(x)       [x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x)       MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x)       MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x)       MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)

MASK_DECLARE_8(x)은 다음과 같이 변화한다.

[x+1][0] = 1UL << x,
[x+1+1][0] = 1UL << x+1,
[x+2+1][0] = 1UL << x+2,
[x+3+1][0] = 1UL << x+3,
[x+4+1][0] = 1UL << x+4,
[x+5+1][0] = 1UL << x+5,
[x+6+1][0] = 1UL << x+6,
[x+7+1][0] = 1UL << x+7,