문c 블로그

NODE 비트맵 (API)

2016-04-182019-04-19 문영일 Leave a comment

NODE 비트맵 (API)

노드 관리용 자료 구조

node_data

pglist_data 구조체를 통해 노드 관리를 수행한다. 디폴트로 4개의 멀티 노드를 사용한다.

arch/arm64/mm/numa.c

/* Per-node pglist_data pointers, indexed by node id; nodes are managed through these. */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

노드 비트맵

arch/arm64/mm/numa.c

/* Bitmap of the nodes to be used on a NUMA system. */
nodemask_t numa_nodes_parsed __initdata;

NUMA 시스템에서 사용할 노드 비트맵이다.

include/linux/nodemask.h

/* Shorthand names for the online and the possible node bitmaps in node_states[]. */
#define node_online_map         node_states[N_ONLINE]
#define node_possible_map       node_states[N_POSSIBLE]

온라인 노드 비트맵과 possible 노드 비트맵이다.

NUMA 노드가 발견되면 node_set_online(nid) 함수를 통해 node_states[N_ONLINE] 비트맵에 해당 노드를 설정한다.

노드 상태

enum node_states

include/linux/nodemask.h

/*
 * Bitmasks that are kept for all the nodes.
 *
 * Each enumerator is used as an index into node_states[]
 * (e.g. node_states[N_ONLINE] is the online-node bitmap).
 */
enum node_states {
        N_POSSIBLE,             /* The node could become online at some point */
        N_ONLINE,               /* The node is online */
        N_NORMAL_MEMORY,        /* The node has regular memory */
#ifdef CONFIG_HIGHMEM
        N_HIGH_MEMORY,          /* The node has regular or high memory */
#else
        N_HIGH_MEMORY = N_NORMAL_MEMORY,        /* no separate state: alias of N_NORMAL_MEMORY */
#endif
        N_MEMORY,               /* The node has memory(regular, high, movable) */
        N_CPU,          /* The node has one or more cpus */
        NR_NODE_STATES          /* number of states; sizes the node_states[] array */
};

노드의 상태 인덱스 값으로 사용한다.

nodemask_t 타입

include/linux/nodemask.h

/* A node bitmap: a bitmap named bits declared with MAX_NUMNODES bits. */
typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;

node_states[] 배열

mm/page_alloc.c

/*
 * Array of node states.
 *
 * One node bitmap per enum node_states value. Initially every node is
 * marked possible and only node 0 (bit 0 of the first word) is marked
 * online. Builds without CONFIG_NUMA also set node 0 in the memory and
 * cpu state bitmaps below, some of them only under their own config
 * option.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
        [N_POSSIBLE] = NODE_MASK_ALL,
        [N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
        [N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
        [N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
#ifdef CONFIG_MOVABLE_NODE
        [N_MEMORY] = { { [0] = 1UL } },
#endif
        [N_CPU] = { { [0] = 1UL } },
#endif  /* NUMA */
};
EXPORT_SYMBOL(node_states);

cpu_to_node_map[]

다음은 cpu -> node 관계를 매핑하는 배열이다.

arch/arm64/mm/numa.c

/* cpu -> node mapping table; every entry is initialised to NUMA_NO_NODE. */
static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };

numa_distance[]

다음은 노드 간 거리(distance)를 저장하는 numa_distance[] 배열로, 동적으로 할당하여 관리한다.

배열의 크기는 numa_distance_cnt * numa_distance_cnt를 사용한다.

arch/arm64/mm/numa.c

/*
 * Inter-node distance table, created dynamically.
 * It holds numa_distance_cnt * numa_distance_cnt entries.
 */
static int numa_distance_cnt;
static u8 *numa_distance;

노드 관련 API들

노드 관련 순회(iterator)

for_each_node()

include/linux/nodemask.h

/* Iterate node over every node whose bit is set in node_states[N_POSSIBLE]. */
#define for_each_node(node)        for_each_node_state(node, N_POSSIBLE)

node_states[N_POSSIBLE] 노드 비트맵에서 각 비트가 1로 기록된 노드만큼 루프를 돈다.

for_each_online_node()

include/linux/nodemask.h

/* Iterate node over every node whose bit is set in node_states[N_ONLINE]. */
#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)

node_states[N_ONLINE] 노드 비트맵에서 각 비트가 1로 기록된 노드만큼 루프를 돈다.

for_each_node_state()

include/linux/nodemask.h

/* Iterate __node over every node whose bit is set in node_states[__state]. */
#define for_each_node_state(__node, __state) \
        for_each_node_mask((__node), node_states[__state])

node_states[__state] 노드 비트맵에서 각 비트가 1로 기록된 노드만큼 루프를 돈다.

for_each_node_mask()

/*
 * for_each_node_mask - iterate node over each node set in mask.
 *
 * With more than one possible node, the set bits are walked with
 * first_node()/next_node() while the result stays below MAX_NUMNODES.
 * With a single node, the body runs once for node 0, and only when the
 * mask is not empty.
 */
#if MAX_NUMNODES > 1
#define for_each_node_mask(node, mask)                  \
        for ((node) = first_node(mask);                 \
                (node) < MAX_NUMNODES;                  \
                (node) = next_node((node), (mask)))
#else /* MAX_NUMNODES == 1 */
#define for_each_node_mask(node, mask)                  \
        if (!nodes_empty(mask))                         \
                for ((node) = 0; (node) < 1; (node)++)
#endif /* MAX_NUMNODES */

NUMA 시스템에서는 전체 노드 중 mask 노드 비트맵에서 1로 기록된 노드만큼 루프를 돈다.
UMA 시스템에서는 node 번호 0에 대해 1번만 for 루프를 수행한다.

상태 관련 함수

static inline int node_state(int node, enum node_states state)
- node_states[state] 비트맵에서 node 번째 비트의 설정 여부를 리턴한다.
static inline void node_set_state(int node, enum node_states state)
- node_states[state] 비트맵에 node 번째 비트를 1로 설정한다.
static inline void node_clear_state(int node, enum node_states state)
- node_states[state] 비트맵에 node 번째 비트를 0으로 클리어한다.
static inline int num_node_state(enum node_states state)
- node_states[state] 비트맵에서 1로 설정된 비트의 수를 알아온다.

first_node()

include/linux/nodemask.h

/*
 * first_node - number (0-based) of the first node set in the src bitmap.
 * The result is clamped with min_t() so it never exceeds MAX_NUMNODES.
 */
#define first_node(src) __first_node(&(src))
static inline int __first_node(const nodemask_t *srcp)
{
        return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}

노드를 나타내는 src 비트맵에서 첫 번째 노드 번호(based 0)를 알아온다. 알아온 노드 번호는 MAX_NUMNODES를 초과하지 않도록 제한된다.

next_node()

include/linux/nodemask.h

/*
 * next_node - number of the next node set in the src bitmap, searching
 * from bit n+1 onwards.
 * The result is clamped with min_t() so it never exceeds MAX_NUMNODES.
 */
#define next_node(n, src) __next_node((n), &(src))
static inline int __next_node(int n, const nodemask_t *srcp)
{
        return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}

노드를 나타내는 src 비트맵에서 n+1 번째 부터 비트를 검색하여 1이 발견되는 위치의 노드 번호를 알아온다. 알아온 노드 번호는 MAX_NUMNODES를 초과하지 않도록 제한된다.

nodes_empty()

include/linux/nodemask.h

/* nodes_empty - report whether the src node bitmap is empty over MAX_NUMNODES bits. */
#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
static inline int __nodes_empty(const nodemask_t *srcp, unsigned int nbits)
{
        return bitmap_empty(srcp->bits, nbits);
}

src 노드 비트맵이 비어 있는지 여부를 알아온다.

find_next_best_node()

mm/page_alloc.c

/**
 * find_next_best_node - find the next node that should appear in a given node's fallback list
 * @node: node whose fallback list we're appending
 * @used_node_mask: nodemask_t of already used nodes
 *
 * We use a number of factors to determine which is the next node that should
 * appear on a given node's fallback list.  The node should not have appeared
 * already in @node's fallback list, and it should be the next closest node
 * according to the distance array (which contains arbitrary distance values
 * from each node to each node in the system), and should also prefer nodes
 * with no CPUs, since presumably they'll have very little allocation pressure
 * on them otherwise.
 * It returns -1 if no node is found.
 *
 * The chosen node is also marked in @used_node_mask before returning.
 * NOTE(review): the "not found" value in the code is NUMA_NO_NODE;
 * presumably that is -1 -- confirm against its definition.
 */
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
        int n, val;
        int min_val = INT_MAX;
        int best_node = NUMA_NO_NODE;   /* stays unchanged when no candidate is found */
        const struct cpumask *tmp = cpumask_of_node(0); /* reassigned per candidate in the loop */

        /* Use the local node if we haven't already */
        if (!node_isset(node, *used_node_mask)) {
                node_set(node, *used_node_mask);
                return node;
        }

        /* Score every node that has memory; the lowest score wins */
        for_each_node_state(n, N_MEMORY) {

                /* Don't want a node to appear more than once */
                if (node_isset(n, *used_node_mask))
                        continue;

                /* Use the distance array to find the distance */
                val = node_distance(node, n);

                /* Penalize nodes under us ("prefer the next node") */
                val += (n < node);      /* adds 1 when n is numbered below node */

                /* Give preference to headless and unused nodes */
                tmp = cpumask_of_node(n);
                if (!cpumask_empty(tmp))
                        val += PENALTY_FOR_NODE_WITH_CPUS;

                /* Slight preference for less loaded node */
                /*
                 * The score so far is scaled up before node_load[n] is added.
                 * NOTE(review): presumably so the load only breaks ties
                 * between otherwise equal candidates -- confirm the range of
                 * node_load[].
                 */
                val *= (MAX_NODE_LOAD*MAX_NUMNODES);
                val += node_load[n];

                if (val < min_val) {
                        min_val = val;
                        best_node = n;
                }
        }

        /* Record the winner so it is not picked again */
        if (best_node >= 0)
                node_set(best_node, *used_node_mask);

        return best_node;
}

사용된 노드를 제외하고 현재 노드의 가장 인접한 다음 노드 id를 찾아 리턴한다.

const struct cpumask *tmp = cpumask_of_node(0);
- 0번 노드에 대응하는 cpumask
if (!node_isset(node, *used_node_mask)) { node_set(node, *used_node_mask); return node; }
- 요청 노드가 used_node_mask 비트맵에 포함되어 있지 않으면 used_node_mask 비트맵에 노드에 대응하는 비트를 설정하고 노드 id를 리턴한다.
for_each_node_state(n, N_MEMORY) {
- N_MEMORY 노드 상태 비트맵에 대해 루프를 돈다.
if (node_isset(n, *used_node_mask)) continue;
- 이미 사용 중으로 표기된 노드인 경우 다음 노드를 진행한다.
val = node_distance(node, n);
- 노드간 거리값을 알아온다.
  - NUMA가 아닌 경우 같은 노드인 경우 10, 아닌 경우 20이다.
  - NUMA인 경우 노드 설계마다 다르며 노드 간에 속도가 빠를 수록 수치가 작다. (인접 노드)
val += (n < node);
- n이 node 번호보다 작은 경우 val에 1을 더한다. 요청 노드보다 번호가 낮은 노드에 페널티를 주어 다음 번호의 노드를 선호하도록 한다.
tmp = cpumask_of_node(n);
- n번 노드에 대한 cpu 비트맵 주소를 알아온다.
if (!cpumask_empty(tmp)) val += PENALTY_FOR_NODE_WITH_CPUS;
- tmp가 가리키는 cpu 비트맵이 비어있지 않은 경우(해당 노드에 cpu가 있는 경우) PENALTY_FOR_NODE_WITH_CPUS(1)를 더한다. cpu가 없는 노드를 우선 선택하기 위함이다.
val *= (MAX_NODE_LOAD*MAX_NUMNODES);
- val 값에 (MAX_NODE_LOAD * MAX_NUMNODES)를 곱한다.
  - MAX_NODE_LOAD는 nr_online_nodes 값이다.
val += node_load[n];
- node_load[n]을 추가한다.
if (val < min_val) { min_val = val; best_node = n; }
- 가장 작은 값인 경우 min_val과 best_node를 갱신한다.
if (best_node >= 0) node_set(best_node, *used_node_mask);
- 노드가 선택되었으면 used_node_mask가 가리키는 비트맵에 해당 노드 비트를 설정한다.

cpumask_of_node()

arch/arm64/include/asm/numa.h

/*
 * Returns a pointer to the cpumask of CPUs on Node 'node'.
 * The mask is looked up in node_to_cpumask_map[] by node id; the index is
 * used as given, with no range check in this function.
 */
static inline const struct cpumask *cpumask_of_node(int node)
{
        return node_to_cpumask_map[node];
}

node_to_cpumask_map[] 배열에서 노드 id에 대응하는 cpu 비트맵을 가리키는 cpumask를 알아온다.

참고

smp_prepare_boot_cpu()

2016-04-182019-05-07 문영일 Leave a comment

smp_prepare_boot_cpu() – ARM32

arch/arm/kernel/smp.c

/*
 * Mark the boot cpu "online" so that it can call console drivers in
 * printk() and can access its per-cpu storage.
 */
void __init smp_prepare_boot_cpu(void)
{
        /*
         * Store the per-cpu offset of the cpu we are running on so that
         * per-cpu variables can be accessed quickly (the accompanying notes
         * say the TPIDRPRW register holds this value).
         */
        set_my_cpu_offset(per_cpu_offset(smp_processor_id()));
}

smp 시스템에서 부트 cpu에 대한 사전 준비를 수행한다.

TPIDRPRW 레지스터를 사용하여 현재 cpu에 대한 per-cpu offset 값을 저장하여 per-cpu 변수에 대한 빠른 access를 가능하게 한다.
- 참고: Per-cpu | 문c

smp_prepare_boot_cpu() – ARM64

arch/arm64/kernel/smp.c

void __init smp_prepare_boot_cpu(void)  /* early preparation of the boot cpu on SMP */
{
        set_my_cpu_offset(per_cpu_offset(smp_processor_id()));  /* record the boot cpu's per-cpu offset before per-cpu data is used */
        /*
         * Initialise the static keys early as they may be enabled by the
         * cpufeature code.
         */
        jump_label_init();
        cpuinfo_store_boot_cpu();       /* read the boot cpu's arm64 cpu information */
}

smp 시스템에서 부트 cpu에 대한 사전 준비를 수행한다.

코드 라인 3에서 per-cpu를 사용하기 전에 부트 cpu에 대한 offset 값을 tpidr에 기록한다.
- 참고: smp_setup_processor_id() | 문c
코드 라인 8에서 static 키를 사용하기 위해 jump 라벨 엔트리들에 대한 초기화를 수행한다.
코드 라인 9에서 부트 cpu에 대한 arm64 cpu 정보를 읽어온다.

ARM64 CPU 정보

cpuinfo_store_boot_cpu() – ARM64

arch/arm64/kernel/cpuinfo.c

/*
 * Read the boot cpu's system registers into its cpuinfo_arm64 record,
 * keep a copy in boot_cpu_data and initialise the cpu features from
 * that copy.
 */
void __init cpuinfo_store_boot_cpu(void)
{
        struct cpuinfo_arm64 *info = &per_cpu(cpu_data, 0);     /* per-cpu cpu_data of cpu 0 */
        __cpuinfo_store_cpu(info);

        boot_cpu_data = *info;
        init_cpu_features(&boot_cpu_data);
}

부트업 cpu에 대해 각종 시스템 레지스터 값들을 읽어 cpuinfo_arm64에 저장하고, cpu features들을 초기화한다.

참고: CPU Capabilities – ARM64 | 문c

__cpuinfo_store_cpu()

arch/arm64/kernel/cpuinfo.c

/*
 * Read the system registers of the current cpu and store their values in
 * @info, then detect the instruction cache policy from the stored CTR
 * value.
 */
static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
{
        info->reg_cntfrq = arch_timer_get_cntfrq();
        /*
         * Use the effective value of the CTR_EL0 than the raw value
         * exposed by the CPU. CTR_EL0.IDC field value must be interpreted
         * with the CLIDR_EL1 fields to avoid triggering false warnings
         * when there is a mismatch across the CPUs. Keep track of the
         * effective value of the CTR_EL0 in our internal records for
         * accurate sanity check and feature enablement.
         */
        info->reg_ctr = read_cpuid_effective_cachetype();
        info->reg_dczid = read_cpuid(DCZID_EL0);
        info->reg_midr = read_cpuid_id();
        info->reg_revidr = read_cpuid(REVIDR_EL1);

        /* AArch64 ID registers */
        info->reg_id_aa64dfr0 = read_cpuid(ID_AA64DFR0_EL1);
        info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1);
        info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1);
        info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1);
        info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1);
        info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1);
        info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1);
        info->reg_id_aa64pfr0 = read_cpuid(ID_AA64PFR0_EL1);
        info->reg_id_aa64pfr1 = read_cpuid(ID_AA64PFR1_EL1);
        info->reg_id_aa64zfr0 = read_cpuid(ID_AA64ZFR0_EL1);

        /* Update the 32bit ID registers only if AArch32 is implemented */
        if (id_aa64pfr0_32bit_el0(info->reg_id_aa64pfr0)) {
                info->reg_id_dfr0 = read_cpuid(ID_DFR0_EL1);
                info->reg_id_isar0 = read_cpuid(ID_ISAR0_EL1);
                info->reg_id_isar1 = read_cpuid(ID_ISAR1_EL1);
                info->reg_id_isar2 = read_cpuid(ID_ISAR2_EL1);
                info->reg_id_isar3 = read_cpuid(ID_ISAR3_EL1);
                info->reg_id_isar4 = read_cpuid(ID_ISAR4_EL1);
                info->reg_id_isar5 = read_cpuid(ID_ISAR5_EL1);
                info->reg_id_mmfr0 = read_cpuid(ID_MMFR0_EL1);
                info->reg_id_mmfr1 = read_cpuid(ID_MMFR1_EL1);
                info->reg_id_mmfr2 = read_cpuid(ID_MMFR2_EL1);
                info->reg_id_mmfr3 = read_cpuid(ID_MMFR3_EL1);
                info->reg_id_pfr0 = read_cpuid(ID_PFR0_EL1);
                info->reg_id_pfr1 = read_cpuid(ID_PFR1_EL1);

                info->reg_mvfr0 = read_cpuid(MVFR0_EL1);
                info->reg_mvfr1 = read_cpuid(MVFR1_EL1);
                info->reg_mvfr2 = read_cpuid(MVFR2_EL1);
        }

        /*
         * Record the ZCR features only when CONFIG_ARM64_SVE is enabled and
         * id_aa64pfr0_sve() accepts the stored ID_AA64PFR0 value.
         */
        if (IS_ENABLED(CONFIG_ARM64_SVE) &&
            id_aa64pfr0_sve(info->reg_id_aa64pfr0))
                info->reg_zcr = read_zcr_features();

        /* Uses info->reg_ctr, so it must run after reg_ctr is filled in above */
        cpuinfo_detect_icache_policy(info);
}

현재 cpu에 대한 각종 시스템 레지스터 값들을 읽어 cpuinfo_arm64에 저장한다.

read_cpuid_effective_cachetype()

arch/arm64/include/asm/cache.h

/*
 * Read the effective value of CTR_EL0.
 *
 * According to ARM ARM for ARMv8-A (ARM DDI 0487C.a),
 * section D10.2.33 "CTR_EL0, Cache Type Register" :
 *
 * CTR_EL0.IDC reports the data cache clean requirements for
 * instruction to data coherence.
 *
 *  0 - dcache clean to PoU is required unless :
 *     (CLIDR_EL1.LoC == 0) || (CLIDR_EL1.LoUIS == 0 && CLIDR_EL1.LoUU == 0)
 *  1 - dcache clean to PoU is not required for i-to-d coherence.
 *
 * This routine provides the CTR_EL0 with the IDC field updated to the
 * effective state.
 *
 * Returns the cache type register value, with the IDC bit forced to 1
 * when the CLIDR_EL1 condition above holds.
 */

static inline u32 __attribute_const__ read_cpuid_effective_cachetype(void)
{
        u32 ctr = read_cpuid_cachetype();

        /* IDC already set: nothing to adjust, CLIDR_EL1 is not read */
        if (!(ctr & BIT(CTR_IDC_SHIFT))) {
                u64 clidr = read_sysreg(clidr_el1);

                /* The "unless" case above: report the clean as not required */
                if (CLIDR_LOC(clidr) == 0 ||
                    (CLIDR_LOUIS(clidr) == 0 && CLIDR_LOUU(clidr) == 0))
                        ctr |= BIT(CTR_IDC_SHIFT);
        }

        return ctr;
}

Cache 타입 레지스터 값을 읽어온다. (실제 PoU 동작이 필요 없는 경우 읽어온 idc 필드를 수정한다.)

CTR 레지스터의 IDC 필드가 0이면 PoU를 위해 데이터 캐시의 clean이 필요한 경우이다. 단, CLIDR 레지스터를 읽어 LoC=0 이거나 LoUIS=0 이면서 LoUU=0 인 경우에는 idc 비트를 1로 설정하여 데이터 캐시의 clean이 필요 없다고 표시한다.
참고: ARM64 시스템 주요 레지스터 | 문c

read_cpuid_cachetype()

arch/arm64/include/asm/cputype.h

/* Read and return the raw value of the cache type register (CTR_EL0). */
static inline u32 __attribute_const__ read_cpuid_cachetype(void)
{
        return read_cpuid(CTR_EL0);
}

Cache 타입 레지스터 값을 읽어온다.

cpuinfo_detect_icache_policy()

arch/arm64/kernel/cpuinfo.c

/*
 * Decode the L1 instruction cache policy from info->reg_ctr, set the
 * matching bit in __icache_flags and log the detected policy together
 * with the current cpu number.
 *
 * PIPT sets no flag, VPIPT sets ICACHEF_VPIPT, and VIPT as well as any
 * other value sets ICACHEF_ALIASING. Nothing is returned.
 */
static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info)
{
        unsigned int cpu = smp_processor_id();
        u32 l1ip = CTR_L1IP(info->reg_ctr);

        switch (l1ip) {
        case ICACHE_POLICY_PIPT:
                break;
        case ICACHE_POLICY_VPIPT:
                set_bit(ICACHEF_VPIPT, &__icache_flags);
                break;
        default:
                /* Fallthrough */
        case ICACHE_POLICY_VIPT:
                /* Assume aliasing */
                set_bit(ICACHEF_ALIASING, &__icache_flags);
        }

        /* The policy name is taken from icache_policy_str[], indexed by l1ip */
        pr_info("Detected %s I-cache on CPU%d\n", icache_policy_str[l1ip], cpu);
}

인스트럭션 캐시 타입을 알아와서 __icache_flags에 해당 플래그를 설정하고 로그로 출력한다. (반환 값은 없다.) 캐시 타입은 PIPT, VPIPT 또는 VIPT 중 하나이며, 그 외의 값은 VIPT와 동일하게 aliasing으로 취급한다.

다음은 Cortex-A72 코어를 가진 ARM64 시스템에서 명령 캐시 타입을 인식하여 보여주는 로그이다.

[    0.000000] Detected VIPT I-cache on CPU0

참고

ARM64 시스템 주요 레지스터 | 문c
CPU Capabilities – ARM64 | 문c

setup_nr_cpu_ids()

2016-04-162019-05-02 문영일 Leave a comment

전역 변수 nr_cpu_ids에 최종 cpu 번호 + 1을 설정한다.

일반적인 머신에서는 NR_CPUS값과 nr_cpu_ids 값이 동일하지만 NR_CPUS가 수천으로 설정이 되는 시스템에서는 실제 운영가능한 최고 cpu 번호를 알아내어 저장할 필요가 있다.

setup_nr_cpu_ids()

kernel/smp.c

/*
 * Set the global nr_cpu_ids to the highest possible cpu id + 1, found as
 * the last set bit of cpu_possible_mask within NR_CPUS bits.
 *
 * An arch may set nr_cpu_ids earlier if needed, so this would be redundant
 */
void __init setup_nr_cpu_ids(void)
{
        nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
}

전역 nr_cpu_ids 변수에 가장 높은 possible cpu id + 1 값이 담긴다.

nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
- 전역 nr_cpu_ids에 possible cpu id 중 가장 높은 cpu id + 1이 저장된다.
- nr_cpu_ids는 아키텍처에 따라 먼저 설정되어 있을 수도 있다.

참고

Bit Operations | 문c
Bitmap Operations | 문c

setup_command_line()

2016-04-152019-05-02 문영일 Leave a comment

setup_command_line()

init/main.c

/*
 * We need to store the untouched command line for future reference.
 * We also need to store the touched command line since the parameter
 * parsing is performed in place, and we should allow a component to
 * store reference of name/value for future reference.
 *
 * Three buffers are allocated and their addresses stored in globals:
 *  - saved_command_line:    copy of boot_command_line
 *  - initcall_command_line: same size as boot_command_line, allocated
 *                           here but not filled in
 *  - static_command_line:   copy of @command_line
 *
 * NOTE(review): the memblock_alloc() results are used without a check
 * in this function -- confirm whether an allocation failure is handled
 * elsewhere.
 */

static void __init setup_command_line(char *command_line)
{
        saved_command_line =
                memblock_alloc(strlen(boot_command_line) + 1, SMP_CACHE_BYTES);
        initcall_command_line =
                memblock_alloc(strlen(boot_command_line) + 1, SMP_CACHE_BYTES);
        static_command_line = memblock_alloc(strlen(command_line) + 1,
                                             SMP_CACHE_BYTES);
        /* Each buffer was sized strlen() + 1, so the copies include the terminator */
        strcpy(saved_command_line, boot_command_line);
        strcpy(static_command_line, command_line);
}

다음 3 개의 cmdline용 메모리를 할당하여 그 주소를 전역 변수에 대입한다.

saved_command_line
- boot_command_line을 복사하여 놓는다.
- 이 문자열은 변경되지 않는다.
initcall_command_line
- boot_command_line 크기만큼 일단 공간만 확보해 놓는다.
- 이 문자열은 per-initcall 파라메터 파싱을 위해 사용된다.
static_command_line
- 인수로 받은 command_line을 복사하여 놓는다.
- 이 문자열은 파라메터 파싱용으로 사용되며 변경될 수 있다.

mm_init_cpumask()

2016-04-152019-05-02 문영일 Leave a comment

mm_init_cpumask()

include/linux/mm_types.h

/*
 * Clear the cpu bitmap embedded in @mm.
 *
 * Pointer magic because the dynamic array size confuses some compilers.
 */
static inline void mm_init_cpumask(struct mm_struct *mm)
{
        unsigned long cpu_bitmap = (unsigned long)mm;

        /* Compute the address of mm->cpu_bitmap by hand from its offset */
        cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
        cpumask_clear((struct cpumask *)cpu_bitmap);
}

cpu mask 비트맵을 cpu 수만큼 비트 clear한다.

cpumask_clear()

include/linux/cpumask.h

/**
 * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask
 * @dstp: the cpumask pointer
 *
 * Zeroes nr_cpumask_bits bits of the bitmap behind @dstp.
 */

static inline void cpumask_clear(struct cpumask *dstp)
{       
        bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits);
}

dstp->bits 비트맵에서 nr_cpumask_bits 수 만큼의 비트를 clear 한다.

CONFIG_CPUMASK_OFFSTACK

config CPUMASK_OFFSTACK
        bool "Force CPU masks off stack" if DEBUG_PER_CPU_MAPS
        help
          Use dynamic allocation for cpumask_var_t, instead of putting
          them on the stack.  This is a bit more expensive, but avoids
          stack overflow.

cpu 수가 많은 경우 cpu 수 만큼의 비트맵으로 처리되는 cpumask의 크기가 커진다. CONFIG_CPUMASK_OFFSTACK 커널 옵션을 사용하면 cpumask_var_t를 스택에 두지 않고 별도의 메모리를 동적으로 할당받아 사용한다. 비용은 조금 더 들지만 stack overflow를 피할 수 있다.

참고

Bitmap Operations | 문c