NODE 비트맵 (API)

NODE 비트맵 (API)

노드 관리용 자료 구조

node_data

pglist_data 구조체를 통해 노드 관리를 수행한다. 디폴트로 4개의 멀티 노드를 사용한다.

arch/arm64/mm/numa.c

struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
EXPORT_SYMBOL(node_data);

노드 비트맵

arch/arm64/mm/numa.c

nodemask_t numa_nodes_parsed __initdata;

NUMA 시스템에서 사용할 노드 비트맵이다.

include/linux/nodemask.h

#define node_online_map         node_states[N_ONLINE]
#define node_possible_map       node_states[N_POSSIBLE]

온라인 노드 비트맵과 possible 노드 비트맵이다.

NUMA 노드가 발견되면 node_set_online(nid) 함수를 통해 node_states[N_ONLINE] 비트맵에 해당 노드를 설정한다.

노드 상태

enum node_states

include/linux/nodemask.h

/*
 * Bitmasks that are kept for all the nodes.
 */
enum node_states {
        N_POSSIBLE,             /* The node could become online at some point */
        N_ONLINE,               /* The node is online */
        N_NORMAL_MEMORY,        /* The node has regular memory */
#ifdef CONFIG_HIGHMEM
        N_HIGH_MEMORY,          /* The node has regular or high memory */
#else
        N_HIGH_MEMORY = N_NORMAL_MEMORY,
#endif
        N_MEMORY,               /* The node has memory(regular, high, movable) */
        N_CPU,          /* The node has one or more cpus */
        NR_NODE_STATES
};

노드의 상태 인덱스 값으로 사용한다.

nodemask_t 타입

include/linux/nodemask.h

typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t;

node_states[] 배열

mm/page_alloc.c

/*
 * Array of node states.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
        [N_POSSIBLE] = NODE_MASK_ALL,
        [N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
        [N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
        [N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
#ifdef CONFIG_MOVABLE_NODE
        [N_MEMORY] = { { [0] = 1UL } },
#endif
        [N_CPU] = { { [0] = 1UL } },
#endif  /* NUMA */
};
EXPORT_SYMBOL(node_states);

cpu_to_node_map[]

다음은 cpu -> node 관계를 매핑하는 배열이다.

arch/arm64/mm/numa.c

static int cpu_to_node_map[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };

numa_distance[]

다음은 동적으로 노드 간 거리(distance)를 저장한 numa_distance[] 배열을 만들어 관리한다.

배열의 크기는 numa_distance_cnt * numa_distance_cnt를 사용한다.

arch/arm64/mm/numa.c

static int numa_distance_cnt;
static u8 *numa_distance;

노드 관련 API들

노드 관련 순회(iterator)

for_each_node()

include/linux/nodemask.h

#define for_each_node(node)        for_each_node_state(node, N_POSSIBLE)

node_states[N_POSSIBLE] 노드 비트맵에서 각 비트가 1로 기록된 노드만큼 루프를 돈다.

for_each_online_node()

include/linux/nodemask.h

#define for_each_online_node(node) for_each_node_state(node, N_ONLINE)

node_states[N_ONLINE] 노드 비트맵에서 각 비트가 1로 기록된 노드만큼 루프를 돈다.

for_each_node_state()

include/linux/nodemask.h

#define for_each_node_state(__node, __state) \
        for_each_node_mask((__node), node_states[__state])

node_states[__state] 노드 비트맵에서 각 비트가 1로 기록된 노드만큼 루프를 돈다.

for_each_node_mask()

#if MAX_NUMNODES > 1
#define for_each_node_mask(node, mask)                  \
        for ((node) = first_node(mask);                 \
                (node) < MAX_NUMNODES;                  \
                (node) = next_node((node), (mask)))
#else /* MAX_NUMNODES == 1 */
#define for_each_node_mask(node, mask)                  \
        if (!nodes_empty(mask))                         \
                for ((node) = 0; (node) < 1; (node)++)
#endif /* MAX_NUMNODES */

NUMA 시스템에서는 전체 노드 중 mask 노드 비트맵에서 1로 기록된 노드만큼 루프를 돈다.
UMA 시스템에서는 node 번호 0에 대해 1번만 for 루프를 수행한다.

상태 관련 함수

static inline int node_state(int node, enum node_states state)
- node_states[state] 비트맵에서 node 번째 비트 유무를 리턴한다.
static inline void node_set_state(int node, enum node_states state)
- node_states[state] 비트맵에 node 번째 비트를 1로 설정한다.
static inline void node_clear_state(int node, enum node_states state)
- node_states[state] 비트맵에 node 번째 비트를 0으로 클리어한다.
static inline int num_node_state(enum node_states state)
- node_states[state] 비트맵에서 1로 설정된 비트의 수를 알아온다.

first_node()

include/linux/nodemask.h

#define first_node(src) __first_node(&(src))
static inline int __first_node(const nodemask_t *srcp)
{
        return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES));
}

노드를 나타내는 src 비트맵에서 첫 번째 노드 번호(based 0)를 알아온다. 알아온 노드 번호는 MAX_NUMNODES를 초과하지 않도록 제한되다.

next_node()

include/linux/nodemask.h

#define next_node(n, src) __next_node((n), &(src))
static inline int __next_node(int n, const nodemask_t *srcp)
{
        return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
}

노드를 나타내는 src 비트맵에서 n+1 번째 부터 비트를 검색하여 1이 발견되는 위치의 노드 번호를 알아온다. 알아온 노드 번호는 MAX_NUMNODES를 초과하지 않도록 제한된다.

nodes_empty()

include/linux/nodemask.h

#define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES)
static inline int __nodes_empty(const nodemask_t *srcp, unsigned int nbits)
{
        return bitmap_empty(srcp->bits, nbits);
}

src 노드 비트맵이 비어 있는지 여부를 알아온다.

find_next_best_node()

mm/page_alloc.c

/**
 * find_next_best_node - find the next node that should appear in a given node's fallback list
 * @node: node whose fallback list we're appending
 * @used_node_mask: nodemask_t of already used nodes
 *
 * We use a number of factors to determine which is the next node that should
 * appear on a given node's fallback list.  The node should not have appeared
 * already in @node's fallback list, and it should be the next closest node
 * according to the distance array (which contains arbitrary distance values
 * from each node to each node in the system), and should also prefer nodes
 * with no CPUs, since presumably they'll have very little allocation pressure
 * on them otherwise.
 * It returns -1 if no node is found.
 */
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
        int n, val;
        int min_val = INT_MAX;
        int best_node = NUMA_NO_NODE;
        const struct cpumask *tmp = cpumask_of_node(0);

        /* Use the local node if we haven't already */
        if (!node_isset(node, *used_node_mask)) {
                node_set(node, *used_node_mask);
                return node;
        }

        for_each_node_state(n, N_MEMORY) {

                /* Don't want a node to appear more than once */
                if (node_isset(n, *used_node_mask))
                        continue;

                /* Use the distance array to find the distance */
                val = node_distance(node, n);

                /* Penalize nodes under us ("prefer the next node") */
                val += (n < node);

                /* Give preference to headless and unused nodes */
                tmp = cpumask_of_node(n);
                if (!cpumask_empty(tmp))
                        val += PENALTY_FOR_NODE_WITH_CPUS;

                /* Slight preference for less loaded node */
                val *= (MAX_NODE_LOAD*MAX_NUMNODES);
                val += node_load[n];

                if (val < min_val) {
                        min_val = val;
                        best_node = n;
                }
        }

        if (best_node >= 0)
                node_set(best_node, *used_node_mask);

        return best_node;
}

사용된 노드를 제외하고 현재 노드의 가장 인접한 다음 노드 id를 찾아 리턴한다.

const struct cpumask *tmp = cpumask_of_node(0);
- 0번 노드에 대응하는 cpumask
if (!node_isset(node, *used_node_mask)) { node_set(node, *used_node_mask); return node; }
- 요청 노드가 used_node_mask 비트맵에 포함되어 있지 않으면 used_node_mask 비트맵에 노드에 대응하는 비트를 설정하고 노드 id를 리턴한다.
for_each_node_state(n, N_MEMORY) {
- N_MEMORY 노드 상태 비트맵에 대해 루프를 돈다.
if (node_isset(n, *used_node_mask)) continue;
- 이미 사용 중으로 표기된 노드인 경우 다음 노드를 진행한다.
val = node_distance(node, n);
- 노드간 거리값을 알아온다.
  - NUMA가 아닌 경우 같은 노드인 경우 10, 아닌 경우 20이다.
  - NUMA인 경우 노드 설계마다 다르며 노드 간에 속도가 빠를 수록 수치가 작다. (인접 노드)
val += (n < node);
- node 번호보다 작은 노드에 대해 모두 거리를 합산한다.
tmp = cpumask_of_node(n);
- n번 노드에 대한 cpu 비트맵 주소를 알아온다.
if (!cpumask_empty(tmp)) val += PENALTY_FOR_NODE_WITH_CPUS;
- tmp가 가리키는 비트맵이 비어있는 경우 1을 더한다.
val *= (MAX_NODE_LOAD*MAX_NUMNODES);
- MAX_NODE_LOAD
  - nr_online_nodes
val += node_load[n];
- node_load[n]을 추가한다.
if (val < min_val) { min_val = val; best_node = n; }
- 가장 작은 값인 경우 min_val과 best_node를 갱신한다.
if (best_node >= 0) node_set(best_node, *used_node_mask);
- 노드가 선택되었으면 used_node_mask가 가리키는 비트맵에 해당 노드 비트를 설정한다.

cpumask_of_node()

arch/arm64/include/asm/numa.h

/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
static inline const struct cpumask *cpumask_of_node(int node)
{
        return node_to_cpumask_map[node];
}

node_to_cpumask_map[] 배열에서 노드 id에 대응하는 cpu 비트맵을 가리키는 cpumask를 알아온다.

NODE 비트맵 (API)