numa_policy_init()
mm/mempolicy.c
void __init numa_policy_init(void)
{
nodemask_t interleave_nodes;
unsigned long largest = 0;
int nid, prefer = 0;
policy_cache = kmem_cache_create("numa_policy",
sizeof(struct mempolicy),
0, SLAB_PANIC, NULL);
sn_cache = kmem_cache_create("shared_policy_node",
sizeof(struct sp_node),
0, SLAB_PANIC, NULL);
for_each_node(nid) {
preferred_node_policy[nid] = (struct mempolicy) {
.refcnt = ATOMIC_INIT(1),
.mode = MPOL_PREFERRED,
.flags = MPOL_F_MOF | MPOL_F_MORON,
.v = { .preferred_node = nid, },
};
}
/*
* Set interleaving policy for system init. Interleaving is only
* enabled across suitably sized nodes (default is >= 16MB), or
* fall back to the largest node if they're all smaller.
*/
nodes_clear(interleave_nodes);
for_each_node_state(nid, N_MEMORY) {
unsigned long total_pages = node_present_pages(nid);
/* Preserve the largest node */
if (largest < total_pages) {
largest = total_pages;
prefer = nid;
}
/* Interleave this node? */
if ((total_pages << PAGE_SHIFT) >= (16 << 20))
node_set(nid, interleave_nodes);
}
/* All too small, use the largest */
if (unlikely(nodes_empty(interleave_nodes)))
node_set(prefer, interleave_nodes);
if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
pr_err("%s: interleaving failed\n", __func__);
check_numabalancing_enable();
}
- 코드 라인 07~13에서 mempolicy 구조체 타입으로 전역 policy_cache kmem 캐시와 sp_node 구조체 타입으로 전역 sn_cache kmem 캐시를 구성한다.
- 코드 라인 15~22에서 전체 노드에 대해 preferred_node_policy[] 배열을 초기화한다.
- 코드 라인 29~41에서 interleave 노드를 초기화한다. 16M 이상의 모든 메모리 노드를 interleave 노드에 참여 시킨다.
- 코드 라인 44~45에서 만일 참여한 노드가 하나도 없는 경우 가장 큰 노드를 interleave 노드에 참여시킨다.
- 코드 라인 47~48에서 interleave 메모리 정책을 설정한다.
- 코드 라인 50에서 NUMA 밸런싱을 설정한다.
mm/mempolicy.c
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
do_set_mempolicy()
mm/mempolicy.c
/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *new, *old;
NODEMASK_SCRATCH(scratch);
int ret;
if (!scratch)
return -ENOMEM;
new = mpol_new(mode, flags, nodes);
if (IS_ERR(new)) {
ret = PTR_ERR(new);
goto out;
}
task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
task_unlock(current);
mpol_put(new);
goto out;
}
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE &&
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
task_unlock(current);
mpol_put(old);
ret = 0;
out:
NODEMASK_SCRATCH_FREE(scratch);
return ret;
}
요청한 메모리 정책 모드를 새로 만들고 설정한 후 현재 태스크의 메모리 정책에 대입시킨다.
mpol_new()
mm/mempolicy.c
/*
* This function just creates a new policy, does some check and simple
* initialization. You must invoke mpol_set_nodemask() to set nodes.
*/
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *policy;
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);
if (mode == MPOL_DEFAULT) {
if (nodes && !nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
return NULL;
}
VM_BUG_ON(!nodes);
/*
* MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
* MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
* All other modes require a valid pointer to a non-empty nodemask.
*/
if (mode == MPOL_PREFERRED) {
if (nodes_empty(*nodes)) {
if (((flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
}
} else if (mode == MPOL_LOCAL) {
if (!nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
mode = MPOL_PREFERRED;
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!policy)
return ERR_PTR(-ENOMEM);
atomic_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
return policy;
}
새로운 메모리 정책을 생성한다.
mpol_set_nodemask()
mm/mempolicy.c
/*
* mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
* any, for the new policy. mpol_new() has already validated the nodes
* parameter with respect to the policy mode and flags. But, we need to
* handle an empty nodemask with MPOL_PREFERRED here.
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
* and mempolicy. May also be called holding the mmap_semaphore for write.
*/
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
int ret;
/* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
if (pol == NULL)
return 0;
/* Check N_MEMORY */
nodes_and(nsc->mask1,
cpuset_current_mems_allowed, node_states[N_MEMORY]);
VM_BUG_ON(!nodes);
if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
nodes = NULL; /* explicit local allocation */
else {
if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
else
nodes_and(nsc->mask2, *nodes, nsc->mask1);
if (mpol_store_user_nodemask(pol))
pol->w.user_nodemask = *nodes;
else
pol->w.cpuset_mems_allowed =
cpuset_current_mems_allowed;
}
if (nodes)
ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
else
ret = mpol_ops[pol->mode].create(pol, NULL);
return ret;
}
mpol_relative_nodemask()
mm/mempolicy.c
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
const nodemask_t *rel)
{
nodemask_t tmp;
nodes_fold(tmp, *orig, nodes_weight(*rel));
nodes_onto(*ret, tmp, *rel);
}
mpol_op[] 구조체 배열
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
[MPOL_DEFAULT] = {
.rebind = mpol_rebind_default,
},
[MPOL_INTERLEAVE] = {
.create = mpol_new_interleave,
.rebind = mpol_rebind_nodemask,
},
[MPOL_PREFERRED] = {
.create = mpol_new_preferred,
.rebind = mpol_rebind_preferred,
},
[MPOL_BIND] = {
.create = mpol_new_bind,
.rebind = mpol_rebind_nodemask,
},
};
check_numabalancing_enable()
mm/mempolicy.c
#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;
static void __init check_numabalancing_enable(void)
{
bool numabalancing_default = false;
if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
numabalancing_default = true;
/* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
if (numabalancing_override)
set_numabalancing_state(numabalancing_override == 1);
if (num_online_nodes() > 1 && !numabalancing_override) {
pr_info("%s automatic NUMA balancing. "
"Configure with numa_balancing= or the "
"kernel.numa_balancing sysctl",
numabalancing_default ? "Enabling" : "Disabling");
set_numabalancing_state(numabalancing_default);
}
}
#endif
NUMA 밸런싱을 설정한다.
- 코드 라인 12~13에서 “numa_balancing=enable” 커널 파라메터가 설정된 경우 전역 numabalancing_enabled를 true로 설정한다.
- 코드 라인 15~21에서 CONFIG_NUMA_BALANCING_DEFAULT_ENABLED 커널 옵션이 사용된 커널에서 온라인 노드가 2개 이상인 경우도 전역 numabalancing_enabled를 true로 설정한다.
구조체
mempolicy 구조체
/*
* Describe a memory policy.
*
* A mempolicy can be either associated with a process or with a VMA.
* For VMA related allocations the VMA policy is preferred, otherwise
* the process policy is used. Interrupts ignore the memory policy
* of the current process.
*
* Locking policy for interlave:
* In process context there is no locking because only the process accesses
* its own state. All vma manipulation is somewhat protected by a down_read on
* mmap_sem.
*
* Freeing policy:
* Mempolicy objects are reference counted. A mempolicy will be freed when
* mpol_put() decrements the reference count to zero.
*
* Duplicating policy objects:
* mpol_dup() allocates a new mempolicy and copies the specified mempolicy
* to the new storage. The reference count of the new object is initialized
* to 1, representing the caller of mpol_dup().
*/
struct mempolicy {
atomic_t refcnt;
unsigned short mode; /* See MPOL_* above */
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
union {
short preferred_node; /* preferred */
nodemask_t nodes; /* interleave/bind */
/* undefined for default */
} v;
union {
nodemask_t cpuset_mems_allowed; /* relative to these nodes */
nodemask_t user_nodemask; /* nodemask passed by user */
} w;
};
mempolicy_operations 구조체
mm/mempolicy.c
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
/*
* If read-side task has no lock to protect task->mempolicy, write-side
* task will rebind the task->mempolicy by two step. The first step is
* setting all the newly nodes, and the second step is cleaning all the
* disallowed nodes. In this way, we can avoid finding no node to alloc
* page.
* If we have a lock to protect task->mempolicy in read-side, we do
* rebind directly.
*
* step:
* MPOL_REBIND_ONCE - do rebind work at once
* MPOL_REBIND_STEP1 - set all the newly nodes
* MPOL_REBIND_STEP2 - clean all the disallowed nodes
*/
void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
Policy 관련 enum
/*
* Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
* passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
* The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
*/
/* Policies */
enum {
MPOL_DEFAULT,
MPOL_PREFERRED,
MPOL_BIND,
MPOL_INTERLEAVE,
MPOL_LOCAL,
MPOL_MAX, /* always last member of enum */
};
참고
- NUMA with Linux | Lunatine