numa_policy_init()

mm/mempolicy.c

void __init numa_policy_init(void)
{
        nodemask_t interleave_nodes; 
        unsigned long largest = 0;
        int nid, prefer = 0;

        policy_cache = kmem_cache_create("numa_policy",
                                         sizeof(struct mempolicy),
                                         0, SLAB_PANIC, NULL);

        sn_cache = kmem_cache_create("shared_policy_node",
                                     sizeof(struct sp_node),
                                     0, SLAB_PANIC, NULL);

        for_each_node(nid) {
                preferred_node_policy[nid] = (struct mempolicy) {
                        .refcnt = ATOMIC_INIT(1),
                        .mode = MPOL_PREFERRED,
                        .flags = MPOL_F_MOF | MPOL_F_MORON,
                        .v = { .preferred_node = nid, },
                };
        }
                
        /*
         * Set interleaving policy for system init. Interleaving is only
         * enabled across suitably sized nodes (default is >= 16MB), or
         * fall back to the largest node if they're all smaller.
         */
        nodes_clear(interleave_nodes);
        for_each_node_state(nid, N_MEMORY) {
                unsigned long total_pages = node_present_pages(nid);

                /* Preserve the largest node */
                if (largest < total_pages) {
                        largest = total_pages;
                        prefer = nid;
                }

                /* Interleave this node? */
                if ((total_pages << PAGE_SHIFT) >= (16 << 20))
                        node_set(nid, interleave_nodes);
        }

        /* All too small, use the largest */
        if (unlikely(nodes_empty(interleave_nodes)))
                node_set(prefer, interleave_nodes);

        if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
                pr_err("%s: interleaving failed\n", __func__);

        check_numabalancing_enable();
}

코드 라인 07~13에서 mempolicy 구조체 타입으로 전역 policy_cache kmem 캐시와 sp_node 구조체 타입으로 전역 sn_cache kmem 캐시를 구성한다.
코드 라인 15~22에서 전체 노드에 대해 preferred_node_policy[] 배열을 초기화한다.
코드 라인 29~41에서 interleave 노드를 초기화한다. 16M 이상의 모든 메모리 노드를 interleave 노드에 참여 시킨다.
코드 라인 44~45에서 만일 참여한 노드가 하나도 없는 경우 가장 큰 노드를 interleave 노드에 참여시킨다.
코드 라인 47~48에서 interleave 메모리 정책을 설정한다.
코드 라인 50에서 NUMA 밸런싱을 설정한다.

mm/mempolicy.c

static struct mempolicy preferred_node_policy[MAX_NUMNODES];

do_set_mempolicy()

mm/mempolicy.c

/* Set the process memory policy */
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
                             nodemask_t *nodes)
{
        struct mempolicy *new, *old;
        NODEMASK_SCRATCH(scratch);
        int ret;

        if (!scratch)
                return -ENOMEM;

        new = mpol_new(mode, flags, nodes);
        if (IS_ERR(new)) {
                ret = PTR_ERR(new);
                goto out;
        }

        task_lock(current);
        ret = mpol_set_nodemask(new, nodes, scratch);
        if (ret) {
                task_unlock(current);
                mpol_put(new);
                goto out;
        }
        old = current->mempolicy;
        current->mempolicy = new;
        if (new && new->mode == MPOL_INTERLEAVE &&
            nodes_weight(new->v.nodes))
                current->il_next = first_node(new->v.nodes);
        task_unlock(current);
        mpol_put(old);
        ret = 0;
out:
        NODEMASK_SCRATCH_FREE(scratch);
        return ret;
}

요청한 메모리 정책 모드를 새로 만들고 설정한 후 현재 태스크의 메모리 정책에 대입시킨다.

mpol_new()

mm/mempolicy.c

/*
 * This function just creates a new policy, does some check and simple
 * initialization. You must invoke mpol_set_nodemask() to set nodes.
 */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
                                  nodemask_t *nodes)
{
        struct mempolicy *policy;

        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
                 mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE);

        if (mode == MPOL_DEFAULT) {
                if (nodes && !nodes_empty(*nodes))
                        return ERR_PTR(-EINVAL);
                return NULL;
        }
        VM_BUG_ON(!nodes);

        /*
         * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
         * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
         * All other modes require a valid pointer to a non-empty nodemask.
         */
        if (mode == MPOL_PREFERRED) {
                if (nodes_empty(*nodes)) {
                        if (((flags & MPOL_F_STATIC_NODES) ||
                             (flags & MPOL_F_RELATIVE_NODES)))
                                return ERR_PTR(-EINVAL);
                }
        } else if (mode == MPOL_LOCAL) {
                if (!nodes_empty(*nodes))
                        return ERR_PTR(-EINVAL);
                mode = MPOL_PREFERRED;
        } else if (nodes_empty(*nodes))
                return ERR_PTR(-EINVAL);
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);
        atomic_set(&policy->refcnt, 1);
        policy->mode = mode;
        policy->flags = flags;

        return policy;
}

새로운 메모리 정책을 생성한다.

mpol_set_nodemask()

mm/mempolicy.c

/*
 * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 * any, for the new policy.  mpol_new() has already validated the nodes
 * parameter with respect to the policy mode and flags.  But, we need to
 * handle an empty nodemask with MPOL_PREFERRED here.
 *
 * Must be called holding task's alloc_lock to protect task's mems_allowed
 * and mempolicy.  May also be called holding the mmap_semaphore for write.
 */
static int mpol_set_nodemask(struct mempolicy *pol,
                     const nodemask_t *nodes, struct nodemask_scratch *nsc)
{
        int ret;

        /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
        if (pol == NULL)
                return 0;
        /* Check N_MEMORY */
        nodes_and(nsc->mask1,
                  cpuset_current_mems_allowed, node_states[N_MEMORY]);

        VM_BUG_ON(!nodes);
        if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
                nodes = NULL;   /* explicit local allocation */
        else {
                if (pol->flags & MPOL_F_RELATIVE_NODES)
                        mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
                else
                        nodes_and(nsc->mask2, *nodes, nsc->mask1);

                if (mpol_store_user_nodemask(pol))
                        pol->w.user_nodemask = *nodes;
                else
                        pol->w.cpuset_mems_allowed =
                                                cpuset_current_mems_allowed;
        }

        if (nodes)
                ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
        else
                ret = mpol_ops[pol->mode].create(pol, NULL);
        return ret;
}

mpol_relative_nodemask()

mm/mempolicy.c

static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
                                   const nodemask_t *rel)
{
        nodemask_t tmp;
        nodes_fold(tmp, *orig, nodes_weight(*rel));
        nodes_onto(*ret, tmp, *rel);
}

mpol_op[] 구조체 배열

static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
        [MPOL_DEFAULT] = {
                .rebind = mpol_rebind_default,
        },
        [MPOL_INTERLEAVE] = {
                .create = mpol_new_interleave,
                .rebind = mpol_rebind_nodemask,
        },
        [MPOL_PREFERRED] = {
                .create = mpol_new_preferred,
                .rebind = mpol_rebind_preferred,
        },
        [MPOL_BIND] = {
                .create = mpol_new_bind,
                .rebind = mpol_rebind_nodemask,
        },
};

check_numabalancing_enable()

mm/mempolicy.c

#ifdef CONFIG_NUMA_BALANCING
static int __initdata numabalancing_override;

static void __init check_numabalancing_enable(void)
{
        bool numabalancing_default = false;

        if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED))
                numabalancing_default = true;

        /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */
        if (numabalancing_override)
                set_numabalancing_state(numabalancing_override == 1);

        if (num_online_nodes() > 1 && !numabalancing_override) {
                pr_info("%s automatic NUMA balancing. "
                        "Configure with numa_balancing= or the "
                        "kernel.numa_balancing sysctl",
                        numabalancing_default ? "Enabling" : "Disabling");
                set_numabalancing_state(numabalancing_default);
        }
} 
#endif

NUMA 밸런싱을 설정한다.

코드 라인 12~13에서 “numa_balancing=enable” 커널 파라메터가 설정된 경우 전역 numabalancing_enabled를 true로 설정한다.
코드 라인 15~21에서 CONFIG_NUMA_BALANCING_DEFAULT_ENABLED 커널 옵션이 사용된 커널에서 온라인 노드가 2개 이상인 경우도 전역 numabalancing_enabled를 true로 설정한다.

구조체

mempolicy 구조체

/*
 * Describe a memory policy.
 *
 * A mempolicy can be either associated with a process or with a VMA.
 * For VMA related allocations the VMA policy is preferred, otherwise
 * the process policy is used. Interrupts ignore the memory policy
 * of the current process.
 *
 * Locking policy for interlave:
 * In process context there is no locking because only the process accesses
 * its own state. All vma manipulation is somewhat protected by a down_read on
 * mmap_sem.
 *
 * Freeing policy:
 * Mempolicy objects are reference counted.  A mempolicy will be freed when
 * mpol_put() decrements the reference count to zero.
 *
 * Duplicating policy objects:
 * mpol_dup() allocates a new mempolicy and copies the specified mempolicy
 * to the new storage.  The reference count of the new object is initialized
 * to 1, representing the caller of mpol_dup().
 */
struct mempolicy {
        atomic_t refcnt;
        unsigned short mode;    /* See MPOL_* above */
        unsigned short flags;   /* See set_mempolicy() MPOL_F_* above */
        union {
                short            preferred_node; /* preferred */
                nodemask_t       nodes;         /* interleave/bind */
                /* undefined for default */
        } v;
        union {
                nodemask_t cpuset_mems_allowed; /* relative to these nodes */
                nodemask_t user_nodemask;       /* nodemask passed by user */
        } w; 
};

mempolicy_operations 구조체

mm/mempolicy.c

static const struct mempolicy_operations {
        int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
        /*
         * If read-side task has no lock to protect task->mempolicy, write-side
         * task will rebind the task->mempolicy by two step. The first step is
         * setting all the newly nodes, and the second step is cleaning all the
         * disallowed nodes. In this way, we can avoid finding no node to alloc
         * page.
         * If we have a lock to protect task->mempolicy in read-side, we do
         * rebind directly.
         *
         * step:
         *      MPOL_REBIND_ONCE - do rebind work at once
         *      MPOL_REBIND_STEP1 - set all the newly nodes
         *      MPOL_REBIND_STEP2 - clean all the disallowed nodes
         */
        void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
                        enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];

Policy 관련 enum

/*
 * Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
 * passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
 * The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
 */

/* Policies */
enum {
        MPOL_DEFAULT,
        MPOL_PREFERRED,
        MPOL_BIND, 
        MPOL_INTERLEAVE,
        MPOL_LOCAL,
        MPOL_MAX,       /* always last member of enum */
};

참고

NUMA with Linux | Lunatine

numa_policy_init()

do_set_mempolicy()

mpol_op[] 구조체 배열

check_numabalancing_enable()

구조체

mempolicy 구조체

mempolicy_operations 구조체

Policy 관련 enum

참고

댓글 남기기 댓글 취소