numa_policy_init()
mm/mempolicy.c
void __init numa_policy_init(void)
{
	nodemask_t interleave_nodes;
	unsigned long largest = 0;
	int nid, prefer = 0;

	policy_cache = kmem_cache_create("numa_policy",
					 sizeof(struct mempolicy),
					 0, SLAB_PANIC, NULL);

	sn_cache = kmem_cache_create("shared_policy_node",
				     sizeof(struct sp_node),
				     0, SLAB_PANIC, NULL);

	for_each_node(nid) {
		preferred_node_policy[nid] = (struct mempolicy) {
			.refcnt = ATOMIC_INIT(1),
			.mode = MPOL_PREFERRED,
			.flags = MPOL_F_MOF | MPOL_F_MORON,
			.v = { .preferred_node = nid, },
		};
	}

	/*
	 * Set interleaving policy for system init. Interleaving is only
	 * enabled across suitably sized nodes (default is >= 16MB), or
	 * fall back to the largest node if they're all smaller.
	 */
	nodes_clear(interleave_nodes);
	for_each_node_state(nid, N_MEMORY) {
		unsigned long total_pages = node_present_pages(nid);

		/* Preserve the largest node */
		if (largest < total_pages) {
			largest = total_pages;
			prefer = nid;
		}

		/* Interleave this node? */
		if ((total_pages << PAGE_SHIFT) >= (16 << 20))
			node_set(nid, interleave_nodes);
	}

	/* All too small, use the largest */
	if (unlikely(nodes_empty(interleave_nodes)))
		node_set(prefer, interleave_nodes);

	if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
		pr_err("%s: interleaving failed\n", __func__);

	check_numabalancing_enable();
}
- The two kmem_cache_create() calls create the global policy_cache slab cache for struct mempolicy objects and the global sn_cache slab cache for struct sp_node objects.
- The for_each_node() loop initializes the preferred_node_policy[] array with an MPOL_PREFERRED policy for every node.
- The for_each_node_state(nid, N_MEMORY) loop builds the interleave nodemask: every memory node with at least 16 MB of present memory joins it (see the sketch after this list).
- If no node qualified, the single largest node is added to the interleave nodemask instead.
- do_set_mempolicy() installs MPOL_INTERLEAVE over that nodemask as the init task's memory policy.
- Finally, check_numabalancing_enable() configures automatic NUMA balancing.
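The interleave-set selection is easy to test in isolation. Below is a userspace sketch of the same logic with made-up node sizes (toy present-page counts and a 4 KB page size stand in for node_present_pages() and PAGE_SHIFT):

#include <stdio.h>

#define NR_TOY_NODES	3
#define TOY_PAGE_SHIFT	12		/* assume 4 KB pages */

int main(void)
{
	/* toy data: present pages per node (node 1 has only 4 MB) */
	unsigned long present[NR_TOY_NODES] = { 8192, 1024, 16384 };
	unsigned long interleave = 0, largest = 0;
	int nid, prefer = 0;

	for (nid = 0; nid < NR_TOY_NODES; nid++) {
		/* preserve the largest node as a fallback */
		if (largest < present[nid]) {
			largest = present[nid];
			prefer = nid;
		}
		/* only nodes with >= 16 MB join the interleave set */
		if ((present[nid] << TOY_PAGE_SHIFT) >= (16 << 20))
			interleave |= 1UL << nid;
	}
	if (!interleave)		/* all too small: use the largest */
		interleave |= 1UL << prefer;

	printf("interleave mask: 0x%lx\n", interleave);	/* prints 0x5 */
	return 0;
}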
mm/mempolicy.c
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
do_set_mempolicy()
mm/mempolicy.c
/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
			     nodemask_t *nodes)
{
	struct mempolicy *new, *old;
	NODEMASK_SCRATCH(scratch);
	int ret;

	if (!scratch)
		return -ENOMEM;

	new = mpol_new(mode, flags, nodes);
	if (IS_ERR(new)) {
		ret = PTR_ERR(new);
		goto out;
	}

	task_lock(current);
	ret = mpol_set_nodemask(new, nodes, scratch);
	if (ret) {
		task_unlock(current);
		mpol_put(new);
		goto out;
	}
	old = current->mempolicy;
	current->mempolicy = new;
	if (new && new->mode == MPOL_INTERLEAVE &&
	    nodes_weight(new->v.nodes))
		current->il_next = first_node(new->v.nodes);
	task_unlock(current);
	mpol_put(old);
	ret = 0;
out:
	NODEMASK_SCRATCH_FREE(scratch);
	return ret;
}
Builds a new memory policy for the requested mode, sets up its nodemask, and installs it as the current task's memory policy, dropping the reference on the old policy.
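do_set_mempolicy() is also the kernel-side backend of the set_mempolicy(2) system call. A minimal userspace sketch requesting the same MPOL_INTERLEAVE policy over nodes 0 and 1 (assumes libnuma's <numaif.h> and that both nodes exist; build with gcc -lnuma):

#include <numaif.h>		/* set_mempolicy(), MPOL_* (libnuma) */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* single-word nodemask selecting nodes 0 and 1 */
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	/* maxnode is the number of bits in the nodemask */
	if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, sizeof(nodemask) * 8)) {
		perror("set_mempolicy");
		return EXIT_FAILURE;
	}

	/* later anonymous allocations now interleave across nodes 0 and 1 */
	return EXIT_SUCCESS;
}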
mpol_new()
mm/mempolicy.c
/*
 * This function just creates a new policy, does some check and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
				  nodemask_t *nodes)
{
	struct mempolicy *policy;

	pr_debug("setting mode %d flags %d nodes[0] %lx\n",
		 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

	if (mode == MPOL_DEFAULT) {
		if (nodes && !nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		return NULL;
	}
	VM_BUG_ON(!nodes);

	/*
	 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
	 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
	 * All other modes require a valid pointer to a non-empty nodemask.
	 */
	if (mode == MPOL_PREFERRED) {
		if (nodes_empty(*nodes)) {
			if (((flags & MPOL_F_STATIC_NODES) ||
			     (flags & MPOL_F_RELATIVE_NODES)))
				return ERR_PTR(-EINVAL);
		}
	} else if (mode == MPOL_LOCAL) {
		if (!nodes_empty(*nodes))
			return ERR_PTR(-EINVAL);
		mode = MPOL_PREFERRED;
	} else if (nodes_empty(*nodes))
		return ERR_PTR(-EINVAL);

	policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
	if (!policy)
		return ERR_PTR(-ENOMEM);
	atomic_set(&policy->refcnt, 1);
	policy->mode = mode;
	policy->flags = flags;

	return policy;
}
Creates a new memory policy after validating the mode/flags/nodemask combination: MPOL_DEFAULT returns NULL rather than a policy object, MPOL_LOCAL is converted to MPOL_PREFERRED with an empty nodemask (local allocation), and every other mode requires a non-empty nodemask.
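These validation rules are observable from userspace through set_mempolicy(2). A hedged sketch exercising each branch, with the expected results taken from the set_mempolicy(2) man page (assumes <numaif.h> and a NUMA-enabled kernel):

#include <numaif.h>
#include <errno.h>
#include <assert.h>

int main(void)
{
	unsigned long mask = 1UL << 0;		/* node 0 only */

	/* MPOL_DEFAULT with a non-empty nodemask is rejected */
	assert(set_mempolicy(MPOL_DEFAULT, &mask, sizeof(mask) * 8) == -1
	       && errno == EINVAL);

	/* MPOL_DEFAULT with no nodemask reverts to the default policy */
	assert(set_mempolicy(MPOL_DEFAULT, NULL, 0) == 0);

	/* MPOL_PREFERRED with an empty nodemask means local allocation */
	assert(set_mempolicy(MPOL_PREFERRED, NULL, 0) == 0);

	/* MPOL_BIND requires a non-empty nodemask */
	assert(set_mempolicy(MPOL_BIND, NULL, 0) == -1 && errno == EINVAL);
	return 0;
}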
mpol_set_nodemask()
mm/mempolicy.c
/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.  But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
		     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
	int ret;

	/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
	if (pol == NULL)
		return 0;
	/* Check N_MEMORY */
	nodes_and(nsc->mask1,
		  cpuset_current_mems_allowed, node_states[N_MEMORY]);

	VM_BUG_ON(!nodes);
	if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
		nodes = NULL;	/* explicit local allocation */
	else {
		if (pol->flags & MPOL_F_RELATIVE_NODES)
			mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
		else
			nodes_and(nsc->mask2, *nodes, nsc->mask1);

		if (mpol_store_user_nodemask(pol))
			pol->w.user_nodemask = *nodes;
		else
			pol->w.cpuset_mems_allowed =
						cpuset_current_mems_allowed;
	}

	if (nodes)
		ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
	else
		ret = mpol_ops[pol->mode].create(pol, NULL);
	return ret;
}
mpol_relative_nodemask()
mm/mempolicy.c
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
				   const nodemask_t *rel)
{
	nodemask_t tmp;

	nodes_fold(tmp, *orig, nodes_weight(*rel));
	nodes_onto(*ret, tmp, *rel);
}
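mpol_relative_nodemask() remaps a user nodemask that was given relative to the allowed set: nodes_fold() first wraps every node number in *orig modulo the weight (number of set bits) of *rel, and nodes_onto() then maps folded bit n onto the n-th set bit of *rel. A toy userspace sketch of the same two steps on single-word bitmasks (local fold()/onto() helpers written for illustration, not the kernel macros):

#include <stdio.h>

/* toy fold: wrap every set bit of orig into the range [0, sz) */
static unsigned long fold(unsigned long orig, int sz)
{
	unsigned long out = 0;

	for (int bit = 0; bit < 64; bit++)
		if (orig & (1UL << bit))
			out |= 1UL << (bit % sz);
	return out;
}

/* toy onto: bit n of tmp turns into the n-th set bit of rel */
static unsigned long onto(unsigned long tmp, unsigned long rel)
{
	unsigned long out = 0;
	int n = 0;

	for (int bit = 0; bit < 64; bit++) {
		if (rel & (1UL << bit)) {
			if (tmp & (1UL << n))
				out |= 1UL << bit;
			n++;
		}
	}
	return out;
}

int main(void)
{
	unsigned long orig = 0x5;	/* user asked for relative nodes 0 and 2 */
	unsigned long rel  = 0xc;	/* allowed nodes are 2 and 3 (weight 2) */
	unsigned long tmp  = fold(orig, 2);	/* 0,2 -> 0,0 -> 0x1 */

	printf("0x%lx\n", onto(tmp, rel));	/* relative node 0 -> real node 2: 0x4 */
	return 0;
}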
mpol_ops[] structure array

mm/mempolicy.c
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
	[MPOL_DEFAULT] = {
		.rebind = mpol_rebind_default,
	},
	[MPOL_INTERLEAVE] = {
		.create = mpol_new_interleave,
		.rebind = mpol_rebind_nodemask,
	},
	[MPOL_PREFERRED] = {
		.create = mpol_new_preferred,
		.rebind = mpol_rebind_preferred,
	},
	[MPOL_BIND] = {
		.create = mpol_new_bind,
		.rebind = mpol_rebind_nodemask,
	},
};
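For reference, the create callbacks wired up above are tiny. The following is a paraphrased sketch of what mpol_new_interleave() and mpol_new_preferred() do in kernels of this era, not verbatim source; check mm/mempolicy.c in your tree for the exact code:

/* paraphrased sketch, not verbatim kernel code */
static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
{
	pol->v.nodes = *nodes;			/* remember the interleave set */
	return 0;
}

static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
{
	if (!nodes)
		pol->flags |= MPOL_F_LOCAL;	/* explicit local allocation */
	else if (nodes_empty(*nodes))
		return -EINVAL;
	else
		pol->v.preferred_node = first_node(*nodes);
	return 0;
}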
check_numabalancing_enable()
mm/mempolicy.c
#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;

static void __init check_numabalancing_enable(void)
{
	bool numabalancing_default = false;

	if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
		numabalancing_default = true;

	/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
	if (numabalancing_override)
		set_numabalancing_state(numabalancing_override == 1);

	if (num_online_nodes() > 1 && !numabalancing_override) {
		pr_info("%s automatic NUMA balancing. "
			"Configure with numa_balancing= or the "
			"kernel.numa_balancing sysctl",
			numabalancing_default ? "Enabling" : "Disabling");
		set_numabalancing_state(numabalancing_default);
	}
}
#endif
Configures automatic NUMA balancing.

- If the numa_balancing= boot parameter was given, numabalancing_override holds 1 (enable) or -1 (disable), and set_numabalancing_state() applies that choice.
- Otherwise, on systems with two or more online nodes, balancing follows the CONFIG_NUMA_BALANCING_DEFAULT_ENABLED default and an informational message is logged (see the sketch after this list).
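At runtime the same switch is exposed as the kernel.numa_balancing sysctl. A minimal read-only sketch via procfs (assumes CONFIG_NUMA_BALANCING; the file does not exist otherwise):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/numa_balancing", "r");
	int enabled;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%d", &enabled) != 1)
		enabled = -1;		/* unreadable */
	fclose(f);

	printf("automatic NUMA balancing: %s\n",
	       enabled == 1 ? "on" : enabled == 0 ? "off" : "unknown");
	return 0;
}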
Structures

mempolicy structure

include/linux/mempolicy.h
/*
 * Describe a memory policy.
 *
 * A mempolicy can be either associated with a process or with a VMA.
 * For VMA related allocations the VMA policy is preferred, otherwise
 * the process policy is used. Interrupts ignore the memory policy
 * of the current process.
 *
 * Locking policy for interlave:
 * In process context there is no locking because only the process accesses
 * its own state. All vma manipulation is somewhat protected by a down_read on
 * mmap_sem.
 *
 * Freeing policy:
 * Mempolicy objects are reference counted.  A mempolicy will be freed when
 * mpol_put() decrements the reference count to zero.
 *
 * Duplicating policy objects:
 * mpol_dup() allocates a new mempolicy and copies the specified mempolicy
 * to the new storage.  The reference count of the new object is initialized
 * to 1, representing the caller of mpol_dup().
 */
struct mempolicy {
	atomic_t refcnt;
	unsigned short mode; 	/* See MPOL_* above */
	unsigned short flags;	/* See set_mempolicy() MPOL_F_* above */
	union {
		short 		 preferred_node; /* preferred */
		nodemask_t	 nodes;		/* interleave/bind */
		/* undefined for default */
	} v;
	union {
		nodemask_t cpuset_mems_allowed;	/* relative to these nodes */
		nodemask_t user_nodemask;	/* nodemask passed by user */
	} w;
};
mempolicy_operations structure
mm/mempolicy.c
static const struct mempolicy_operations {
	int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
	/*
	 * If read-side task has no lock to protect task->mempolicy, write-side
	 * task will rebind the task->mempolicy by two step. The first step is
	 * setting all the newly nodes, and the second step is cleaning all the
	 * disallowed nodes. In this way, we can avoid finding no node to alloc
	 * page.
	 * If we have a lock to protect task->mempolicy in read-side, we do
	 * rebind directly.
	 *
	 * step:
	 *	MPOL_REBIND_ONCE  - do rebind work at once
	 *	MPOL_REBIND_STEP1 - set all the newly nodes
	 *	MPOL_REBIND_STEP2 - clean all the disallowed nodes
	 */
	void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
			enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
Policy-related enums

include/uapi/linux/mempolicy.h
/*
 * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
 * passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
 * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
 */

/* Policies */
enum {
	MPOL_DEFAULT,
	MPOL_PREFERRED,
	MPOL_BIND,
	MPOL_INTERLEAVE,
	MPOL_LOCAL,
	MPOL_MAX,	/* always last member of enum */
};
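These mode values are exactly what userspace passes to set_mempolicy(2) and mbind(2). A hedged sketch that binds one anonymous mapping to node 0 with MPOL_BIND (assumes <numaif.h> from libnuma and that node 0 exists; build with gcc -lnuma):

#include <numaif.h>		/* mbind(), MPOL_BIND */
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
	size_t len = 16 * 4096;
	unsigned long nodemask = 1UL << 0;	/* node 0 only */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* pages faulted into this VMA must come from node 0 */
	if (mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0)) {
		perror("mbind");
		return 1;
	}
	return 0;
}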