문c 블로그

Radix Tree

2016-09-262016-09-26 문영일 6 Comments

Radix Tree

Dynamic하게 정수 index key에 해당하는 slot에 포인터 값을 저장할 수 있다.
처음부터 큰 index key를 사용하면 트리 단계가 확장되어 느려지므로 작은 정수 index key를 사용하는 것이 좋다.
커널 버전 2.6.17에서 lockless한 구현을 하였다.
- 참고: radix-tree: updates and lockless

다음 그림은 2단계 Radix Tree의 구조를 표현하였다.

radix_tree-2

다음 그림은 Radix Tree의 최소 단계인 0단계를 보여주며, 이 상태에서는 index key 0번만 등록할 수 있다. 여기에 다른 번호의 index key를 추가하면 radix 트리 단계(height)가 필요한 만큼 확장된다.

다음 그림은 index 값 크기에 따라 Radix Tree의 단계가 결정되는 것을 보여준다. (단계별 6bit 사용)

Radix 트리 선언

Radix 트리를 선언하고 초기화하는 방법은 다음과 같이 두 가지가 준비되어 있다.

RADIX_TREE(name, mask);
struct radix_tree_root my_tree; INIT_RADIX_TREE(&my_tree, gfp_mask);

include/linux/radix-tree.h

/* Define a struct radix_tree_root variable called @name, initialised via RADIX_TREE_INIT(@mask). */
#define RADIX_TREE(name, mask) struct radix_tree_root name = RADIX_TREE_INIT(mask)

요청한 name으로 radix_tree_root 구조체를 선언하고 gfp_mask를 대입하여 초기화한다.

include/linux/radix-tree.h

/*
 * Static initialiser for a struct radix_tree_root: an empty tree
 * (height 0, no root node) whose gfp_mask field is set to @mask.
 */
#define RADIX_TREE_INIT(mask)                                           \
        { .height = 0, .gfp_mask = (mask), .rnode = NULL }

include/linux/radix-tree.h

/*
 * INIT_RADIX_TREE - run-time initialisation of an existing radix_tree_root
 * @root: pointer to the struct radix_tree_root to initialise
 * @mask: value stored in the root's gfp_mask field
 *
 * Leaves the tree empty: height 0 and no root node.  @root is evaluated
 * exactly once, so an argument with side effects (e.g. "p++") touches a
 * single object instead of three different ones.
 */
#define INIT_RADIX_TREE(root, mask)                                     \
do {                                                                    \
        struct radix_tree_root *__rt_root = (root);                     \
                                                                        \
        __rt_root->height = 0;                                          \
        __rt_root->gfp_mask = (mask);                                   \
        __rt_root->rnode = NULL;                                        \
} while (0)

Radix 트리 추가 및 삭제

Radix 트리에 항목을 추가하고 삭제하는 명령이 준비되어 있다.

radix_tree_insert(root, index, item)
radix_tree_delete(root, index)

radix_tree_insert()

lib/radix-tree.c

/**
 *      radix_tree_insert    -    insert into a radix tree
 *      @root:          radix tree root
 *      @index:         index key
 *      @item:          item to insert
 *
 *      Insert an item into the radix tree at position @index.
 *
 *      @item must not carry the indirect-pointer marker; this is
 *      enforced with BUG_ON().
 *
 *      Returns 0 on success, -EEXIST if the slot for @index already
 *      holds a non-NULL entry, or the error returned by
 *      __radix_tree_create().
 */
int radix_tree_insert(struct radix_tree_root *root,
                        unsigned long index, void *item)
{
        struct radix_tree_node *node;
        void **slot;
        int error;

        /* An item that would be mistaken for an indirect pointer is a caller bug. */
        BUG_ON(radix_tree_is_indirect_ptr(item));

        /* Locate the slot for @index, building any missing levels on the way. */
        error = __radix_tree_create(root, index, &node, &slot); 
        if (error)
                return error;
        /* Never overwrite an existing entry. */
        if (*slot != NULL)
                return -EEXIST;
        rcu_assign_pointer(*slot, item);

        if (node) {
                /* The item lives in a node: account for it and check tags 0 and 1 are clear. */
                node->count++;
                BUG_ON(tag_get(node, 0, index & RADIX_TREE_MAP_MASK));
                BUG_ON(tag_get(node, 1, index & RADIX_TREE_MAP_MASK));
        } else {
                /* No node: the slot is root->rnode itself, so check the root tags instead. */
                BUG_ON(root_tag_get(root, 0));
                BUG_ON(root_tag_get(root, 1));
        }

        return 0;
}
EXPORT_SYMBOL(radix_tree_insert);

radix 트리의 index에 해당하는 슬롯에 item 포인터를 대입한다.

error = __radix_tree_create(root, index, &node, &slot);
- index 키 번호로 radix 트리 slot을 준비한다.
if (error) return error;
- 에러 시 에러를 반환한다.
if (*slot != NULL) return -EEXIST;
- slot이 null이 아니면 이미 존재한다고 에러를 반환한다.
rcu_assign_pointer(*slot, item);
- slot에 item을 대입한다.

__radix_tree_create()

lib/radix-tree.c

/**
 *      __radix_tree_create     -       create a slot in a radix tree
 *      @root:          radix tree root
 *      @index:         index key
 *      @nodep:         returns node (may be NULL if the caller does not need it)
 *      @slotp:         returns slot (may be NULL if the caller does not need it)
 *
 *      Create, if necessary, and return the node and slot for an item
 *      at position @index in the radix tree @root.
 *
 *      Until there is more than one item in the tree, no nodes are
 *      allocated and @root->rnode is used as a direct slot instead of
 *      pointing to a node, in which case *@nodep will be NULL.
 *
 *      Returns -ENOMEM, or 0 for success.
 */
int __radix_tree_create(struct radix_tree_root *root, unsigned long index,
                        struct radix_tree_node **nodep, void ***slotp)
{
        struct radix_tree_node *node = NULL, *slot;
        unsigned int height, shift, offset;
        int error;

        /* Make sure the tree is high enough.  */
        if (index > radix_tree_maxindex(root->height)) {
                error = radix_tree_extend(root, index);
                if (error)
                        return error;
        }

        /* Start at the root with the indirect marker stripped. */
        slot = indirect_to_ptr(root->rnode);

        height = root->height;
        /*
         * For height == 0 this unsigned expression wraps, but the loop
         * below is skipped in that case so the value is never used.
         */
        shift = (height-1) * RADIX_TREE_MAP_SHIFT;

        offset = 0;                     /* uninitialised var warning */
        while (height > 0) {
                if (slot == NULL) {
                        /* Have to add a child node.  */
                        if (!(slot = radix_tree_node_alloc(root)))
                                return -ENOMEM;
                        /* path starts out as the node's height ... */
                        slot->path = height;
                        slot->parent = node;
                        if (node) {
                                /* ... and, for non-root nodes, also records the offset in the parent. */
                                rcu_assign_pointer(node->slots[offset], slot);
                                node->count++;
                                slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
                        } else
                                /* First node of the tree: publish it as an indirect root pointer. */
                                rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
                }

                /* Go a level down */
                offset = (index >> shift) & RADIX_TREE_MAP_MASK;
                node = slot;
                slot = node->slots[offset];
                shift -= RADIX_TREE_MAP_SHIFT;
                height--;
        }

        if (nodep)
                *nodep = node;
        /* With no node at all, root->rnode itself is the slot. */
        if (slotp)
                *slotp = node ? node->slots + offset : (void **)&root->rnode;
        return 0;
}

index 키에 해당하는 노드와 슬롯을 알아온다. 만일 확장이 필요한 경우 확장도 수행한다.

if (index > radix_tree_maxindex(root->height)) { error = radix_tree_extend(root, index); if (error) return error; }
- index 키가 최대 값을 넘어가는 경우 radix 트리를 확장시킨다. 만일 에러인 경우 에러를 반환한다.
slot = indirect_to_ptr(root->rnode);
- ptr에서 RADIX_TREE_INDIRECT_PTR(1)이 위치한 비트(bit0)를 제거한다.
height = root->height; shift = (height-1) * RADIX_TREE_MAP_SHIFT;
- 현재 radix tree의 height(레벨)로 shift 값을 정한다.
  - 예) height=3, RADIX_TREE_MAP_SHIFT=6
    - shift=12
while (height > 0) {
- height가 0보다 큰 경우 루프를 돈다.
if (slot == NULL) {
- slot이 비어 있는 경우
if (!(slot = radix_tree_node_alloc(root))) return -ENOMEM;
- radix 트리 노드를 할당받는다. 에러가 발생하면 메모리 부족 에러를 반환한다.
slot->path = height; slot->parent = node;
- slot의 path에 height를 대입하고, parent에 노드를 대입한다.
if (node) { rcu_assign_pointer(node->slots[offset], slot); node->count++; slot->path |= offset << RADIX_TREE_HEIGHT_SHIFT;
- 루트 노드가 아닌 경우 노드의 slots[offset]에 slot을 대입하고, count를 증가시키며, path에 offset 값을 RADIX_TREE_HEIGHT_SHIFT만큼 좌로 쉬프트한 값을 대입한다.
} else rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
- 루트 노드인 경우 root->rnode에 slot 포인터에 RADIX_TREE_INDIRECT_PTR(1)을 더한 값을 대입한다.
offset = (index >> shift) & RADIX_TREE_MAP_MASK;
- index 값에서 다음 레벨에 처리할 index bit 만큼을 offset에 대입한다.
node = slot; slot = node->slots[offset];
- 다음 레벨의 노드를 알아온다.
shift -= RADIX_TREE_MAP_SHIFT; height--;
- shift를 RADIX_TREE_MAP_SHIFT 만큼 감소시키고 height도 1 감소시킨다.
if (nodep) *nodep = node;
- 인수 nodep가 주어진 경우 node를 대입한다.
if (slotp) *slotp = node ? node->slots + offset : (void **)&root->rnode;
- 인수 slotp가 주어진 경우 슬롯을 대입한다.
  - node가 null인 경우는 radix_tree_root 노드가 radix_tree_node 없이 직접 leaf를 관리하는 경우이다.

다음 그림은 필요한 index key를 추가하기 위해 필요로하는 중간 노드들을 만들고 구성하는 과정을 보여준다.

radix_tree_extend()

lib/radix-tree.c

/*
 *      Extend a radix tree so it can store key @index.
 *
 *      New root nodes are stacked on top of the current root, one level
 *      per loop iteration, with the previous root hanging off slot 0 of
 *      each new node.
 *
 *      Returns 0 on success or -ENOMEM if a node cannot be allocated.
 */
static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
{
        struct radix_tree_node *node;
        struct radix_tree_node *slot;
        unsigned int height;
        int tag;

        /* Figure out what the height should be.  */
        height = root->height + 1;
        while (index > radix_tree_maxindex(height))
                height++;

        /* Empty tree: nothing to re-parent, just record the new height. */
        if (root->rnode == NULL) {
                root->height = height;
                goto out;
        }

        do {
                unsigned int newheight;
                if (!(node = radix_tree_node_alloc(root)))
                        return -ENOMEM;

                /* Propagate the aggregated tag info into the new root */
                for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
                        if (root_tag_get(root, tag))
                                tag_set(node, tag, 0);
                }

                /* Increase the height.  */
                newheight = root->height+1;
                BUG_ON(newheight & ~RADIX_TREE_HEIGHT_MASK);
                node->path = newheight;
                node->count = 1;
                node->parent = NULL;
                slot = root->rnode;
                /*
                 * Above height 1 the old root is handled as a node: strip
                 * the indirect marker and re-parent it.  Otherwise the old
                 * root value is stored in slot 0 unchanged.
                 */
                if (newheight > 1) {
                        slot = indirect_to_ptr(slot);
                        slot->parent = node;
                }
                node->slots[0] = slot;
                /* Publish the fully set up node as the new (indirect) root. */
                node = ptr_to_indirect(node);
                rcu_assign_pointer(root->rnode, node);
                root->height = newheight;
        } while (height > root->height);
out:
        return 0;
}

Radix 트리 노드를 확장하기 위해 새로운 루트 노드를 추가하고 기존 노드를 새로 만든 노드의 첫 번째 슬롯에 연결한다. 이러한 과정을 확장이 필요한 단계만큼 수행한다.

height = root->height + 1; while (index > radix_tree_maxindex(height)) height++;
- 요청한 index 키를 수용할 수 있는 단계가 될 때까지 height 값을 증가시켜, 확장 후 필요한 height 값을 정한다.
if (root->rnode == NULL) { root->height = height; goto out; }
- 만일 슬롯이 비어 있는 경우 root->height값을 설정하고 함수를 빠져나간다.
do { unsigned int newheight; if (!(node = radix_tree_node_alloc(root))) return -ENOMEM;
- 새 루트 노드를 생성한다.
for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { if (root_tag_get(root, tag)) tag_set(node, tag, 0); }
- 최대 태그 수만큼 루프를 돌며 루트의 해당 태그 비트가 1인 경우 새 루트 노드의 0번 슬롯에 해당하는 태그 비트를 설정(set)한다. 기존 트리는 새 노드의 slots[0]에 연결되므로 루트의 태그 정보를 그대로 전달하는 것이다.
- 태그 비트는 gfp_mask의 비트들 중 __GFP_BITS_SHIFT(25) 비트부터 차례대로 사용된다.
newheight = root->height+1; node->path = newheight;
- 새로운 루트 노드의 path에 height 값을 1 증가시켜 대입한다.
node->count = 1; node->parent = NULL;
- 루트 노드의 count에 1을 대입하고 부모 노드가 없다고 지정한다.
slot = root->rnode; if (newheight > 1) { slot = indirect_to_ptr(slot); slot->parent = node; } node->slots[0] = slot;
- 루트 노드의 첫 번째 슬롯에 기존 루트 노드를 연결한다.
node = ptr_to_indirect(node); rcu_assign_pointer(root->rnode, node);
- root->rnode에 새 루트 노드 주소에 RADIX_TREE_INDIRECT_PTR(1) 비트를 더한 값을 대입한다.
root->height = newheight;
- 루트의 height 값을 새로 늘어난 height(newheight)로 설정한다.
} while (height > root->height);
- height가 현재 루트 노드의 height보다 큰 경우 새로운 루트 노드를 생성하기 위해 루프를 반복한다.

다음 그림은 1단계의 radix 트리가 2단계로 확장되는 모습을 보여준다.

radix_tree_node_alloc()

lib/radix-tree.c

/*
 * Allocate one radix tree node for @root.
 *
 * This assumes that the caller has performed appropriate preallocation, and
 * that the caller has pinned this thread of control to the current CPU.
 *
 * Returns the node, or the (NULL) result of kmem_cache_alloc() when
 * neither the preload buffer nor the slab cache can provide one; callers
 * in this file turn that into -ENOMEM.
 */
static struct radix_tree_node *
radix_tree_node_alloc(struct radix_tree_root *root)
{
        struct radix_tree_node *ret = NULL;
        gfp_t gfp_mask = root_gfp_mask(root);

        /*
         * Preload code isn't irq safe and it doesn't make sense to use
         * preloading in the interrupt anyway as all the allocations have to
         * be atomic. So just do normal allocation when in interrupt.
         */
        if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) {
                struct radix_tree_preload *rtp;

                /*
                 * Provided the caller has preloaded here, we will always
                 * succeed in getting a node here (and never reach
                 * kmem_cache_alloc)
                 */
                rtp = this_cpu_ptr(&radix_tree_preloads);
                if (rtp->nr) {
                        /* Pop the most recently stored node off this CPU's buffer. */
                        ret = rtp->nodes[rtp->nr - 1];
                        rtp->nodes[rtp->nr - 1] = NULL;
                        rtp->nr--;
                }
                /*
                 * Update the allocation stack trace as this is more useful
                 * for debugging.
                 */
                kmemleak_update_trace(ret);
        }
        /* Nothing taken from the preload buffer: fall back to the slab cache. */
        if (ret == NULL)
                ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);

        /* A node address must never look like an indirect pointer. */
        BUG_ON(radix_tree_is_indirect_ptr(ret));
        return ret;
}

radix 트리 노드를 할당받아온다.

radix_tree_preloads에 준비된 노드를 반환한다.
만일 radix_tree_preloads에 준비된 노드가 없으면 slub 캐시에서 할당 받아온다.

gfp_t gfp_mask = root_gfp_mask(root);
- 루트 노드의 gfp_mask에서 태그를 제외한 값을 알아온다.
if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) {
- 인터럽트 처리중이 아니면서 __GFP_WAIT 요청도 없는 경우
rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; rtp->nodes[rtp->nr - 1] = NULL; rtp->nr--; }
- 전역 radix_tree_preloads 구조체가 관리하는 노드가 존재하는 경우 노드 하나를 빼온다.
if (ret == NULL) ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
- 노드가 preload에 의해 준비되지 않은 경우 slub 캐시로부터 할당받아온다.

radix_tree_delete()

lib/radix-tree.c

/**
 *      radix_tree_delete    -    delete an item from a radix tree
 *      @root:          radix tree root
 *      @index:         index key
 *
 *      Remove whatever item is stored at @index in the radix tree rooted
 *      at @root.
 *
 *      Returns the address of the deleted item, or NULL if it was not present.
 */
void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
{
        /* A NULL expected item means "delete regardless of what is stored". */
        void *removed = radix_tree_delete_item(root, index, NULL);

        return removed;
}
EXPORT_SYMBOL(radix_tree_delete);

Radix 트리에서 요청 index 키 항목을 제거한다.

radix_tree_delete_item()

lib/radix-tree.c

/**
 *      radix_tree_delete_item    -    delete an item from a radix tree
 *      @root:          radix tree root
 *      @index:         index key
 *      @item:          expected item (NULL to delete whatever is stored)
 *
 *      Remove @item at @index from the radix tree rooted at @root.
 *
 *      Returns the address of the deleted item, or NULL if it was not present
 *      or the entry at the given @index was not @item.
 */
void *radix_tree_delete_item(struct radix_tree_root *root,
                             unsigned long index, void *item)
{
        struct radix_tree_node *node;
        unsigned int offset;
        void **slot;
        void *entry;
        int tag;

        entry = __radix_tree_lookup(root, index, &node, &slot);
        if (!entry)
                return NULL;

        /* Only enforce the match when the caller supplied an expected item. */
        if (item && entry != item)
                return NULL;

        /* No node: the entry is stored directly in root->rnode. */
        if (!node) {
                root_tag_clear_all(root);
                root->rnode = NULL;
                return entry;
        }

        /* Position of the entry inside its (bottom-level) node. */
        offset = index & RADIX_TREE_MAP_MASK;

        /*
         * Clear all tags associated with the item to be deleted.
         * This way of doing it would be inefficient, but seldom is any set.
         */
        for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
                if (tag_get(node, tag, offset))
                        radix_tree_tag_clear(root, index, tag);
        }

        node->slots[offset] = NULL;
        node->count--;

        /* Let the helper free the node (and shrink the tree) if it became possible. */
        __radix_tree_delete_node(root, node);

        return entry;
}
EXPORT_SYMBOL(radix_tree_delete_item);

Radix 트리에서 요청 index 키 항목을 제거하고 제거한 항목을 반환한다.

entry = __radix_tree_lookup(root, index, &node, &slot);
- Radix 트리에서 요청 index 키 항목을 검색한다.
if (item && entry != item) return NULL;
- 검색하여 찾은 entry 주소와 item 주소가 다른(mismatch) 경우 null을 반환한다.
if (!node) { root_tag_clear_all(root); root->rnode = NULL; return entry; }
- 노드가 아닌 경우, 즉 키 인덱스가 0인 경우 루트에서 태그와 ptr 값을 지우고 해당 데이터 ptr 값을 반환한다.
offset = index & RADIX_TREE_MAP_MASK;
- 현재 노드에서 요청한 인덱스 키에 해당하는 offset
- 0~RADIX_TREE_MAP_MASK(63)
for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { if (tag_get(node, tag, offset)) radix_tree_tag_clear(root, index, tag); }
- 최대 3개인 태그 종류별로 루프를 돌며, offset 위치에 해당 태그가 설정되어 있는 경우 요청한 인덱스 키의 그 태그를 clear한다. 같은 노드가 관리하는 나머지 태그 비트들도 모두 없는 경우 상위 노드로 진행하며 태그를 clear해 나간다.
node->slots[offset] = NULL; node->count--;
- 슬롯을 비우고 사용 카운터를 1 감소시킨다.
__radix_tree_delete_node(root, node);
- Radix 트리 노드가 필요 없는 경우 삭제한다.
return entry;
- 삭제한 엔트리를 반환한다.

__radix_tree_lookup()

lib/radix-tree.c

/**
 *      __radix_tree_lookup     -       lookup an item in a radix tree
 *      @root:          radix tree root
 *      @index:         index key
 *      @nodep:         returns node (may be NULL if the caller does not need it)
 *      @slotp:         returns slot (may be NULL if the caller does not need it)
 *
 *      Lookup and return the item at position @index in the radix
 *      tree @root.
 *
 *      Until there is more than one item in the tree, no nodes are
 *      allocated and @root->rnode is used as a direct slot instead of
 *      pointing to a node, in which case *@nodep will be NULL.
 *
 *      Returns NULL when nothing is stored at @index; *@nodep and
 *      *@slotp are left untouched in that case.
 */
void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index,
                          struct radix_tree_node **nodep, void ***slotp)
{
        struct radix_tree_node *node, *parent;
        unsigned int height, shift;
        void **slot;

        node = rcu_dereference_raw(root->rnode);
        if (node == NULL)
                return NULL;

        /* Direct entry in the root: it can only be the item for index 0. */
        if (!radix_tree_is_indirect_ptr(node)) {
                if (index > 0)
                        return NULL;

                if (nodep)
                        *nodep = NULL;
                if (slotp)
                        *slotp = (void **)&root->rnode;
                return node;
        }
        node = indirect_to_ptr(node);

        /* The height is taken from the top node's path, not from root->height. */
        height = node->path & RADIX_TREE_HEIGHT_MASK;
        if (index > radix_tree_maxindex(height))
                return NULL;

        shift = (height-1) * RADIX_TREE_MAP_SHIFT;

        /* Walk down one level per iteration, remembering the node that owns the slot. */
        do {
                parent = node;
                slot = node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK);
                node = rcu_dereference_raw(*slot);
                if (node == NULL)
                        return NULL;

                shift -= RADIX_TREE_MAP_SHIFT;
                height--;
        } while (height > 0);

        /* After the last level "node" holds the stored item, "parent" the bottom node. */
        if (nodep)
                *nodep = parent;
        if (slotp)
                *slotp = slot;
        return node;
}

Radix 트리에서 요청 index 키 항목을 검색한다. 발견되지 않으면 null을 반환한다.

node = rcu_dereference_raw(root->rnode); if (node == NULL) return NULL;
- 루트에 연결된 노드 주소를 알아온다.
if (!radix_tree_is_indirect_ptr(node)) {
- 노드가 아니라 직접 데이터 값이 있는 경우
if (index > 0) return NULL;
- 요청 index가 0보다 크면 못찾은 경우이므로 null을 반환한다.
- 루트가 직접 데이터를 갖는 경우는 key 인덱스가 0인 경우 밖에 없다.
if (nodep) *nodep = NULL;
- 루트에서 발견된 경우이므로 Radix 트리 노드 없어서 null을 출력인수 nodep에 대입한다.
if (slotp) *slotp = (void **)&root->rnode; return node;
- 출력인수 slotp에 슬롯 주소(rnode가 단일 슬롯으로 동작) 값을 대입하고 데이터 ptr 값을 반환한다.
node = indirect_to_ptr(node);
- 불필요한 플래그 비트를 제거하고 실제 노드 주소만 남긴다.
- 현재 node 값은 가장 상위 Radix 트리 노드 주소값이다.
height = node->path & RADIX_TREE_HEIGHT_MASK; if (index > radix_tree_maxindex(height)) return NULL;
- 요청 인덱스 키값이 최상위 노드의 height 단계가 관리하는 값을 초과하는 경우 null을 반환한다.
shift = (height-1) * RADIX_TREE_MAP_SHIFT;
- 가장 최상위 노드를 처리하기 위해 쉬프트할 비트 수를 결정한다.
do { parent = node; slot = node->slots + ((index >> shift) & RADIX_TREE_MAP_MASK); node = rcu_dereference_raw(*slot); if (node == NULL) return NULL; shift -= RADIX_TREE_MAP_SHIFT; height--; } while (height > 0);
- 가장 상위 노드부터 가장 바닥 노드를 거쳐 leaf까지 루프를 돌며 연결된 노드를 찾아간다.
if (nodep) *nodep = parent;
- 출력인수 nodep에 leaf를 관리하는 가장 마지막 노드를 대입한다.
if (slotp) *slotp = slot; return node;
- 출력인수 slotp에 슬롯 주소를 대입하고 데이터 ptr 값을 반환한다.

radix_tree_tag_clear()

lib/radix-tree.c

/**
 *      radix_tree_tag_clear - clear a tag on a radix tree node
 *      @root:          radix tree root
 *      @index:         index key
 *      @tag:           tag index
 *
 *      Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
 *      corresponding to @index in the radix tree.  If
 *      this causes the leaf node to have no tags set then clear the tag in the
 *      next-to-leaf node, etc.
 *
 *      Returns the address of the tagged item on success, else NULL.  ie:
 *      has the same return value and semantics as radix_tree_lookup().
 */
void *radix_tree_tag_clear(struct radix_tree_root *root,
                        unsigned long index, unsigned int tag)
{
        struct radix_tree_node *node = NULL;
        struct radix_tree_node *slot = NULL;
        unsigned int height, shift;
        int uninitialized_var(offset);

        height = root->height;
        /* @index lies outside the tree: return with slot still NULL. */
        if (index > radix_tree_maxindex(height))
                goto out;

        /* Start one level above the top; the loop subtracts before using shift. */
        shift = height * RADIX_TREE_MAP_SHIFT;
        slot = indirect_to_ptr(root->rnode);

        /* Descend to the bottom: node ends up as the last node, slot as the item. */
        while (shift) {
                if (slot == NULL)
                        goto out;

                shift -= RADIX_TREE_MAP_SHIFT;
                offset = (index >> shift) & RADIX_TREE_MAP_MASK;
                node = slot;
                slot = slot->slots[offset];
        }

        if (slot == NULL)
                goto out;

        /*
         * Climb back up, clearing the tag at each level.  Stop as soon as
         * the tag was not set, or the node still has the tag set on
         * another slot.
         */
        while (node) {
                if (!tag_get(node, tag, offset))
                        goto out;
                tag_clear(node, tag, offset);
                if (any_tag_set(node, tag))
                        goto out;

                index >>= RADIX_TREE_MAP_SHIFT;
                offset = index & RADIX_TREE_MAP_MASK;
                node = node->parent;
        }

        /* clear the root's tag bit */
        if (root_tag_get(root, tag))
                root_tag_clear(root, tag);

out:
        return slot;
}
EXPORT_SYMBOL(radix_tree_tag_clear);

요청한 인덱스 키에 해당하는 태그를 clear한다. 같은 노드가 관리하는 64개의 태그들도 모두 없는 경우 상위노드로 진행하며 태그를 clear해 나간다.

height = root->height; if (index > radix_tree_maxindex(height)) goto out;
- 요청 index 키 값이 Radix 트리가 관리하는 단계를 초과하는 경우 null을 반환한다.
shift = height * RADIX_TREE_MAP_SHIFT;
- 잠시 후에 루프를 돌며 index 키에서 각 단계별로 필요한 비트만큼을 쉬프트하여 사용할 것이므로 미리 가장 상위보다 한 단계 더 높은 단계로 쉬프트 값을 정해 놓는다.
slot = indirect_to_ptr(root->rnode);
- 슬롯은 최상위 노드를 가리킨다.
while (shift) { if (slot == NULL) goto out; shift -= RADIX_TREE_MAP_SHIFT; offset = (index >> shift) & RADIX_TREE_MAP_MASK; node = slot; slot = slot->slots[offset]; }
- 가장 바닥 단계까지 루프를 돌아 node와 slot에 가장 최하단 노드 주소와 슬롯 값을 대입하게 한다.
if (slot == NULL) goto out;
- slot이 이미 비어 있는 경우 null을 반환한다.
while (node) { if (!tag_get(node, tag, offset)) goto out; tag_clear(node, tag, offset); if (any_tag_set(node, tag)) goto out; index >>= RADIX_TREE_MAP_SHIFT; offset = index & RADIX_TREE_MAP_MASK; node = node->parent; }
- 가장 하위 노드부터 최상위 노드까지 루프를 돌며 index 키와 관련된 태그를 clear한다.
- 진행도중 현재 노드의 index 키와 관련된 태그가 이미 비어 있는 경우 함수를 빠져나간다.
- 진행도중 현재 노드와 관련된 64개의 다른 태그 비트가 여전히 존재하는 경우 상위로 진행하지 않고 함수를 빠져나간다.
if (root_tag_get(root, tag)) root_tag_clear(root, tag);
- 여기까지 진행이 되었다는 의미는 최상위 노드마저도 모든 태그가 지워졌다는 의미이므로 루트에 있는 태그도 삭제한다.

다음 그림은 index 129번에 대한 0번 태그를 삭제할 때 하위 노드의 태그를 먼저 지운후 같은 노드의 태그가 모두 없는 경우 그 상위 노드의 태그마저 삭제하는 모습을 보여준다.

__radix_tree_delete_node()

lib/radix-tree.c

/**
 *      __radix_tree_delete_node    -    try to free node after clearing a slot
 *      @root:          radix tree root
 *      @node:          node containing @index
 *
 *      After clearing the slot at @index in @node from radix tree
 *      rooted at @root, call this function to attempt freeing the
 *      node and shrinking the tree.
 *
 *      Returns %true if @node was freed, %false otherwise.
 */
bool __radix_tree_delete_node(struct radix_tree_root *root,
                              struct radix_tree_node *node)
{
        bool deleted = false;

        do {
                struct radix_tree_node *parent;

                /*
                 * A node that still has entries stays.  If it is the top
                 * node, try to reduce the tree height instead.
                 */
                if (node->count) {
                        if (node == indirect_to_ptr(root->rnode)) {
                                radix_tree_shrink(root);
                                /* Height 0 after shrinking is reported as a deletion. */
                                if (root->height == 0)
                                        deleted = true;
                        }
                        return deleted;
                }

                /* Empty node: unhook it from its parent, or from the root. */
                parent = node->parent;
                if (parent) {
                        unsigned int offset;

                        /* The upper bits of path give this node's offset in the parent. */
                        offset = node->path >> RADIX_TREE_HEIGHT_SHIFT;
                        parent->slots[offset] = NULL;
                        parent->count--;
                } else {
                        root_tag_clear_all(root);
                        root->height = 0;
                        root->rnode = NULL;
                }

                radix_tree_node_free(node);
                deleted = true;

                /* The parent may have become empty as well: repeat one level up. */
                node = parent;
        } while (node);

        return deleted;
}

요청 노드에 대해 shrink를 해본 후 사용 슬롯이 없는 경우 삭제하고 루프를 돌며 상위 노드로 이동하여 반복한다. 하나 이상 삭제된 경우 true를 반환한다.

do { struct radix_tree_node *parent; if (node->count) { if (node == indirect_to_ptr(root->rnode)) { radix_tree_shrink(root); if (root->height == 0) deleted = true; } return deleted; }
- 요청한 노드의 count가 0보다 큰 경우 shrink를 시도한 후 결과를 반환한다.
parent = node->parent; if (parent) { unsigned int offset; offset = node->path >> RADIX_TREE_HEIGHT_SHIFT; parent->slots[offset] = NULL; parent->count--;
- 부모 노드가 있는 경우 부모 노드에서 현재 노드로의 연결을 끊고 count를 줄인다.
} else { root_tag_clear_all(root); root->height = 0; root->rnode = NULL; }
- 최상위 노드를 제거한다.
- 부모 노드가 없는 경우 루트 태그를 모두 clear하고 height를 0으로 만들고 item 연결을 끊는다.
radix_tree_node_free(node); deleted = true; node = parent; } while (node);
- 현재 노드를 제거하고 그 상위 노드를 선택한 후 계속 진행한다.
  - 상위 노드도 카운트가 0이 된 경우 제거한다.

radix_tree_shrink()

lib/radix-tree.c

/**
 *      radix_tree_shrink    -    shrink height of a radix tree to minimal
 *      @root:          radix tree root
 *
 *      Repeatedly removes the top node for as long as it has exactly one
 *      child and that child sits in slot 0, lowering @root->height by one
 *      each time.
 */
static inline void radix_tree_shrink(struct radix_tree_root *root)
{
        /* try to shrink tree height */
        while (root->height > 0) {
                struct radix_tree_node *to_free = root->rnode;
                struct radix_tree_node *slot;

                /* With height > 0 the root must be an indirect (node) pointer. */
                BUG_ON(!radix_tree_is_indirect_ptr(to_free));
                to_free = indirect_to_ptr(to_free);

                /*
                 * The candidate node has more than one child, or its child
                 * is not at the leftmost slot, we cannot shrink.
                 */
                if (to_free->count != 1)
                        break;
                if (!to_free->slots[0])
                        break;

                /*
                 * We don't need rcu_assign_pointer(), since we are simply
                 * moving the node from one part of the tree to another: if it
                 * was safe to dereference the old pointer to it
                 * (to_free->slots[0]), it will be safe to dereference the new
                 * one (root->rnode) as far as dependent read barriers go.
                 */
                slot = to_free->slots[0];
                /* Above height 1 the child is itself a node and becomes the new top node. */
                if (root->height > 1) {
                        slot->parent = NULL;
                        slot = ptr_to_indirect(slot);
                }
                root->rnode = slot;
                root->height--;

                /*
                 * We have a dilemma here. The node's slot[0] must not be
                 * NULLed in case there are concurrent lookups expecting to
                 * find the item. However if this was a bottom-level node,
                 * then it may be subject to the slot pointer being visible
                 * to callers dereferencing it. If item corresponding to
                 * slot[0] is subsequently deleted, these callers would expect
                 * their slot to become empty sooner or later.
                 *
                 * For example, lockless pagecache will look up a slot, deref
                 * the page pointer, and if the page is 0 refcount it means it
                 * was concurrently deleted from pagecache so try the deref
                 * again. Fortunately there is already a requirement for logic
                 * to retry the entire slot lookup -- the indirect pointer
                 * problem (replacing direct root node with an indirect pointer
                 * also results in a stale slot). So tag the slot as indirect
                 * to force callers to retry.
                 */
                if (root->height == 0)
                        *((unsigned long *)&to_free->slots[0]) |=
                                                RADIX_TREE_INDIRECT_PTR;

                radix_tree_node_free(to_free);
        }
}

Radix 트리 단계를 줄일 수 있는 경우 불필요한 Radix 트리 노드를 삭제하고 단계를 줄인다.

루프를 돌며 최상위 노드를 제거할 수 있는 경우 제거하여 Radix 트리 단계를 줄인다.
최상위 노드의 0번 슬롯만 있는 경우 현재 노드를 삭제하고 그 다음 노드를 최상위 노드로 변경한다.

while (root->height > 0) { struct radix_tree_node *to_free = root->rnode;
- Radix 트리 단계가 1단계 이상인 경우 루트에 연결된 최상위 노드를 가져온다.
to_free = indirect_to_ptr(to_free);
- 최상위 노드 포인터에서 RADIX_TREE_INDIRECT_PTR 비트를 제거한다.
if (to_free->count != 1) break;
- 노드가 관리하는 슬롯이 하나가 아니면 그만 shrink를 중지하고 빠져나간다.
if (!to_free->slots[0]) break;
- 남은 슬롯이 0번이 아닌 경우 그만 shrink를 중지하고 빠져나간다.
slot = to_free->slots[0]; if (root->height > 1) { slot->parent = NULL; slot = ptr_to_indirect(slot); } root->rnode = slot; root->height--;
- 첫 슬롯에 연결된 다음 노드를 최상위 노드로 만든다.
- parent에 null을 넣고, 루트가 다음 노드를 가리키게 하고 height 값을 감소시킨다.
if (root->height == 0) *((unsigned long *)&to_free->slots[0]) |= RADIX_TREE_INDIRECT_PTR;
- 루트 height값이 0이면 삭제할 노드의 첫 슬롯에 연결된 값은 item이더라도 RADIX_TREE_INDIRECT_PTR 비트를 더한다.
radix_tree_node_free(to_free);
- rcu 방식을 사용하여 노드를 제거한다.

radix_tree_node_free()

lib/radix-tree.c

/*
 * Queue @node for release through RCU: radix_tree_node_rcu_free() is
 * registered as the callback for the node's embedded rcu_head.
 */
static inline void radix_tree_node_free(struct radix_tree_node *node)
{
        struct rcu_head *head = &node->rcu_head;

        call_rcu(head, radix_tree_node_rcu_free);
}

Radix 트리 노드를 RCU 방식으로 제거한 후 slub 캐시로 반환한다(free).

radix_tree_node_rcu_free()

lib/radix-tree.c

/*
 * RCU callback: scrub the node that embeds @head and return it to
 * radix_tree_node_cachep.
 */
static void radix_tree_node_rcu_free(struct rcu_head *head)
{
        struct radix_tree_node *node;
        int tag = 0;

        node = container_of(head, struct radix_tree_node, rcu_head);

        /*
         * Only zeroed nodes may go back into the slab.  radix_tree_shrink
         * can leave a non-NULL entry in the first slot, so wipe slot 0's
         * tag bits, its pointer and the count before freeing.
         */
        while (tag < RADIX_TREE_MAX_TAGS) {
                tag_clear(node, tag, 0);
                tag++;
        }

        node->slots[0] = NULL;
        node->count = 0;

        kmem_cache_free(radix_tree_node_cachep, node);
}

Radix 트리 노드의 태그를 제거하고 slots[0]에 null을 대입하고, count를 0으로 만든 후 Radix 트리 노드 slub 캐시에 반환한다.(free)

Preload Radix 트리 노드

radix_tree_preload()

lib/radix-tree.c

/*
 * Fill this CPU's radix_tree_node buffer with enough nodes that adding a
 * single element to a tree cannot fail.
 *
 * Returns zero on success, with preemption disabled.  Returns -ENOMEM on
 * error, with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_WAIT being passed to INIT_RADIX_TREE().
 */
int radix_tree_preload(gfp_t gfp_mask)
{
        int ret;

        /* Preloading with a mask that lacks __GFP_WAIT is non-sensical: warn once. */
        WARN_ON_ONCE((gfp_mask & __GFP_WAIT) == 0);

        ret = __radix_tree_preload(gfp_mask);
        return ret;
}
EXPORT_SYMBOL(radix_tree_preload);

전역 per-cpu 타입의 radix tree preload 구조체에 빈 radix 트리 노드를 미리 할당받아 가득 채워 준비한다.

__radix_tree_preload()

lib/radix-tree.c

/*
 * Load up this CPU's radix_tree_node buffer with sufficient objects to
 * ensure that the addition of a single element in the tree cannot fail.  On
 * success, return zero, with preemption disabled.  On error, return -ENOMEM
 * with preemption not disabled.
 *
 * To make use of this facility, the radix tree must be initialised without
 * __GFP_WAIT being passed to INIT_RADIX_TREE().
 */
static int __radix_tree_preload(gfp_t gfp_mask)
{
        struct radix_tree_preload *rtp;
        struct radix_tree_node *node;
        int ret = -ENOMEM;

        preempt_disable();
        rtp = this_cpu_ptr(&radix_tree_preloads);
        while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
                /* The allocation itself runs with preemption enabled. */
                preempt_enable();
                node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
                /* Failure path: leave with preemption still enabled. */
                if (node == NULL)
                        goto out;
                preempt_disable();
                /*
                 * Re-read the per-CPU buffer and re-check for space after
                 * the preemptible window (presumably because the task may
                 * now be on another CPU or the buffer may have been
                 * refilled meanwhile).
                 */
                rtp = this_cpu_ptr(&radix_tree_preloads);
                if (rtp->nr < ARRAY_SIZE(rtp->nodes))
                        rtp->nodes[rtp->nr++] = node;
                else
                        kmem_cache_free(radix_tree_node_cachep, node);
        }
        /* Buffer is full; preemption stays disabled for the caller. */
        ret = 0;
out:
        return ret;
}

전역 per-cpu 타입의 radix tree preload 구조체에 빈 radix 트리 노드를 미리 할당 받아 가득 채워 준비한다.

중간에 slub 캐시로 부터 할당이 실패한 경우 preemption이 enable된 채로 빠져나온다. 성공한 경우 preemption이 disable된 채로 빠져나온다.
시스템 크기에 따라 최대 radix 트리 노드 수가 정해진다.
- 32bit=11
- 64bit=21

lib/radix-tree.c

/*
 * Number of nodes kept in each per-CPU preload buffer.
 *
 * Because the tree height is variable, one insert may have to build the
 * branch leading to the new item and, when radix_tree_extend() must grow
 * the tree, the branch leading to the existing items as well.
 *
 * Worst case: a zero height tree holding a single item at index 0 receives
 * an item at index ULONG_MAX.  Two branches of RADIX_TREE_MAX_PATH nodes
 * are then needed, sharing only the root node.  Hence:
 */
#define RADIX_TREE_PRELOAD_SIZE ((2 * RADIX_TREE_MAX_PATH) - 1)

RADIX_TREE_PRELOAD_SIZE
- Radix 트리는 가변 단계(레벨, 높이)로 구성되는데 insert 한 번 수행 시 최악의 경우 여러 개의 radix 트리 노드의 할당이 필요하다. 따라서 최대로 필요한 수 만큼 미리 radix 트리 프리로드 버퍼에 빈 radix 트리 노드들을 할당 받아 둔다.
  - 예) 최악의 32bit 시스템 case
    - height가 0인 상태에서 index 키 0xffffffff를 사용하는 경우 최대 11개의 radix 트리 노드가 필요하다.
      - Radix 트리 노드를 6단계까지 6번 확장 시키면서 6개가 필요하다.
      - index 키에 맞는 radix 트리 노드를 만들기 위해 6단계를 제외한 1~5단계 각각에 하나씩 하여 5개가 필요하다.
- 따라서 위와 같은 최대로 필요한 수에 맞추어 시스템에 따라 다음과 같이 크기가 결정된다.
  - 32bit=11
  - 64bit=21

다음 그림은 radix_tree_root가 0단계로 동작중에 long 최고 값을 index 키로 요청한 경우 32bit 시스템의 최대 단계인 6단계로 확장되면서 총 11개의 radix_tree_node가 필요한 경우를 보여준다.

기타 함수

ptr_to_indirect()

lib/radix-tree.c

/*
 * Return @ptr with the RADIX_TREE_INDIRECT_PTR bit(s) set, marking it as
 * an indirect pointer.
 */
static inline void *ptr_to_indirect(void *ptr)
{
        unsigned long tagged = (unsigned long)ptr;

        tagged |= RADIX_TREE_INDIRECT_PTR;
        return (void *)tagged;
}

leaf를 가리키지 않고 radix 트리 노드를 가리키는 경우 ptr에 RADIX_TREE_INDIRECT_PTR 비트를 설정(OR)하여 저장한다.

indirect_to_ptr()

lib/radix-tree.c

/*
 * Return @ptr with the RADIX_TREE_INDIRECT_PTR bit(s) cleared, i.e. the
 * plain pointer value.
 */
static inline void *indirect_to_ptr(void *ptr)
{
        unsigned long raw = (unsigned long)ptr;

        raw &= ~RADIX_TREE_INDIRECT_PTR;
        return (void *)raw;
}

ptr에서 RADIX_TREE_INDIRECT_PTR을 제외한다.

root_gfp_mask()

lib/radix-tree.c

/*
 * Return @root's gfp_mask restricted to __GFP_BITS_MASK, i.e. the
 * allocation flags without the root tag bits kept in the same word.
 */
static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
{
        return root->gfp_mask & __GFP_BITS_MASK;
}

루트에 저장된 gfp_mask에서 태그 비트를 제외한 순수 gfp_mask를 반환한다.

tag_set()

lib/radix-tree.c

/* Set bit @offset in the @tag bitmap of @node via __set_bit(). */
static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        __set_bit(offset, node->tags[tag]);
}

Radix 트리 노드의 tags[tag]의 offset 비트를 set 한다.

최대 태그 배열은 3개

tag_clear()

lib/radix-tree.c

/* Clear bit @offset in the @tag bitmap of @node via __clear_bit(). */
static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        __clear_bit(offset, node->tags[tag]);
}

Radix 트리 노드의 tags[tag]의 offset 비트를 clear 한다.

최대 태그 배열은 3개

tag_get()

lib/radix-tree.c

/* Return test_bit() of bit @offset in the @tag bitmap of @node. */
static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
                int offset)
{
        return test_bit(offset, node->tags[tag]);
}

Radix 트리 노드의 tags[tag]의 offset 비트 상태를 가져온다.

최대 태그 배열은 3개

root_tag_set()

lib/radix-tree.c

/*
 * Set the root-level bit for @tag.  Root tags live in root->gfp_mask at
 * bit position (@tag + __GFP_BITS_SHIFT).
 */
static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
{
        unsigned int shift = tag + __GFP_BITS_SHIFT;

        root->gfp_mask |= (__force gfp_t)(1 << shift);
}

Radix 트리의 루트에 요청한 tag 비트를 설정한다.

root_tag_clear()

lib/radix-tree.c

static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
{
        root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
}

Radix 트리의 루트에 요청한 tag 비트를 clear 한다.

root_tag_clear_all()

lib/radix-tree.c

/*
 * Clear all root tags at once by keeping only the __GFP_BITS_MASK bits
 * of root->gfp_mask.
 */
static inline void root_tag_clear_all(struct radix_tree_root *root)
{
        root->gfp_mask &= __GFP_BITS_MASK;
}

Radix 트리의 루트에 전체 tag 비트(총 3개)를 clear 한다.

root_tag_get()

lib/radix-tree.c

static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
{
        return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
}

Radix 트리의 루트의 요청한 tag 비트 상태를 가져온다.

구조체

radix_tree_root 구조체

include/linux/radix-tree.h

/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
struct radix_tree_root {
        /* Number of levels in the tree; 0 means no node is attached. */
        unsigned int            height;
        /* Allocation flags for nodes, plus the root tag bits. */
        gfp_t                   gfp_mask;
        /* Topmost node, or the single item itself while height is 0. */
        struct radix_tree_node  __rcu *rnode;
};

height
- radix 트리가 관리하는 단계 수 (0~N)
- 0 단계에서는 Radix 트리 노드없이 오직 index 키 0 번 1개에 대한 슬롯을 직접 제공한다.
- 시스템 크기에 따라 최대 단계 수가 다르다.
  - 32bit: 6
  - 64bit: 11
- height가 0인 경우 index key가 하나도 등록이 되지 않았거나 하나의 0번 index key만을 등록한 경우이다.
gfp_mask
- radix_tree_node 할당을 받을 때 마다 slub 캐시에 메모리 할당을 요청하는데 이 때 사용할 gfp_mask를 담고 있다.
- 추가로 3개의 tag 비트를 사용한다.
rnode
- 노드를 가리키거나 한 개의 0번 index key에 해당하는 슬롯으로 동작하여 leaf의 포인터를 저장한다.
  - 가장 상위 노드인 radix_tree_node를 가리키게 할 경우 RADIX_TREE_INDIRECT_PTR 비트를 추가하여 사용한다.
  - 하나의 0번 index key만 사용된 경우 radix_tree_node를 만들지 않고 직접 rnode가 단일 슬롯으로 동작하여 item을 직접 저장한다.

radix_tree_node 구조체

include/linux/radix-tree.h

/* One node of the radix tree: a fixed-size slot array plus per-tag bitmaps. */
struct radix_tree_node {
        unsigned int    path;   /* Offset in parent & height from the bottom */
        unsigned int    count;  /* Number of slots currently in use */
        union {
                struct {
                        /* Used when ascending tree */
                        struct radix_tree_node *parent;
                        /* For tree user */
                        void *private_data;
                };
                /* Used when freeing node */
                struct rcu_head rcu_head;
        };
        /* For tree user */
        struct list_head private_list;
        /* Each slot holds either a lower-level node or a leaf item. */
        void __rcu      *slots[RADIX_TREE_MAP_SIZE];
        /* One bitmap per tag; tag_set()/tag_get() index it by slot offset. */
        unsigned long   tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
};

path
- 현재 노드의 레벨
- 가장 바닥 레벨은 1부터 시작한다.
count
- 사용되고 있는 슬롯 수
- 0~최대 RADIX_TREE_MAP_SIZE(64)개까지
*parent
- 상위 노드를 가리킨다.
*private_data
rcu_head
- rcu를 이용하여 노드를 삭제할 때 사용한다.
private_list
*slots[]
- RADIX_TREE_MAP_SIZE(64)개까지 다음 노드를 가리키거나 leaf에 해당하는 item을 저장한다.
  - 노드를 가리키게 할 경우 RADIX_TREE_INDIRECT_PTR을 추가하여 사용한다.
tags[]
- 총 3개의 태그로 구성된 비트맵
- 각 비트맵은 RADIX_TREE_MAP_SIZE(64)개의 비트를 처리할 수 있는 공간을 가졌다.
  - tags[][]의 이중 배열중 마지막은 실제 선언 시에만 사용되고 실제 처리 루틴에서는 tags[] 일차원 배열로만 이용한다.
- slot에 item이 저장되었는지 여부를 비트맵을 사용하여 표현한다.
  - 태그 비트가 1이면 해당 비트 위치의 슬롯이 사용되었음을 의미한다.

radix_tree_preload 구조체

lib/radix-tree.c

/*
 * Per-cpu pool of preloaded nodes
 *
 * __radix_tree_preload() fills nodes[] until nr reaches the array size.
 */
struct radix_tree_preload {
        /* Number of valid entries currently held in nodes[]. */
        int nr;
        /* Preallocated, not yet used nodes; sized for the worst-case insert. */
        struct radix_tree_node *nodes[RADIX_TREE_PRELOAD_SIZE];
};

nr
- 현재 cpu에 할당받은 빈 radix_tree_node 구조체의 수
*nodes
- 할당 받은 radix_tree_node 포인터를 순서대로 배열에 가지고 있다.

참고

Trees I: Radix trees | LWN.net
radix_tree_init() | 문c

context_tracking_init()

2016-09-222016-09-22 문영일 Leave a comment

context_tracking_init()

kernel/context_tracking.c

#ifdef CONFIG_CONTEXT_TRACKING_FORCE
/*
 * Only built with CONFIG_CONTEXT_TRACKING_FORCE: call
 * context_tracking_cpu_set() for every possible CPU at boot.
 */
void __init context_tracking_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                context_tracking_cpu_set(cpu);
}
#endif

CONFIG_CONTEXT_TRACKING_FORCE 커널 옵션을 사용한 경우 동작되며 각 cpu에 대해 context 트래킹을 enable한다.

성능상의 이유로 production 커널을 빌드시에는 이 옵션을 사용하면 안된다.
CONFIG_CONTEXT_TRACKING_FORCE 옵션은 CONFIG_RCU_USER_QS 옵션 또는 CONFIG_VIRT_CPU_ACCOUNTING_GEN을 사용한 경우 지원된다.
- CONFIG_RCU_USER_QS
  - userspace에서 확장된 quiescent 상태를 가진 RCU를 사용하게 한다.
    - kernelspace에서 userspace로 넘어갈 때 이 cpu에 대해 GP 내부에 있는지 확인할 필요가 없으므로 이 상태를 quiescent 상태로 바꾼다.
- CONFIG_VIRT_CPU_ACCOUNTING_GEN
  - full dynticks 시스템에서 task와 cpu 타임을 재기 위해 동작시킨다.

context_tracking_cpu_set()

/*
 * Mark context tracking active on @cpu.
 *
 * Does nothing if the per-cpu context_tracking.active flag is already
 * set; otherwise sets it and increments the context_tracking_enabled
 * static key.
 */
void context_tracking_cpu_set(int cpu)
{
        if (per_cpu(context_tracking.active, cpu))
                return;

        per_cpu(context_tracking.active, cpu) = true;
        static_key_slow_inc(&context_tracking_enabled);
}

요청 cpu에 대한 context_tracking.active가 false인 경우 true로 변경하고 context_tracking_enabled static key 변수를 증가시킨다.

trace_init()

2016-09-222016-09-22 문영일 Leave a comment

trace_init()

kernel/trace/trace.c

/*
 * Boot-time tracing setup.
 *
 * When tracepoint_printk is set, allocate tracepoint_print_iter; if that
 * allocation fails, warn and clear tracepoint_printk again.  Then call
 * tracer_alloc_buffers() and trace_event_init() unconditionally.
 */
void __init trace_init(void)
{
        if (tracepoint_printk) {
                tracepoint_print_iter =
                        kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
                /* Without the iterator, turn tracepoint_printk back off. */
                if (WARN_ON(!tracepoint_print_iter))
                        tracepoint_printk = 0;
        }
        tracer_alloc_buffers();
        trace_event_init();
}

CONFIG_TRACING 커널 옵션이 설정된 경우에 빌드되어 동작되며 트레이스용 버퍼 및 이벤트를 초기화한다.

성능상의 이유로 production 커널을 빌드시에는 이 옵션을 사용하면 안된다.

if (tracepoint_printk) {
- “tp_printk=on” 등의 커널 파라메터가 동작된 경우
tracepoint_print_iter = kmalloc(sizeof(*tracepoint_print_iter), GFP_KERNEL);
- trace_iterator 구조체 공간을 할당받는다.
tracer_alloc_buffers();
- trace용 per-cpu 버퍼를 할당받고 초기화한다.
  - (생략)
trace_event_init();
- CONFIG_EVENT_TRACING 커널 옵션을 사용하는 경우 이벤트 트레이스를 위해 초기화한다.
  - (생략)

“tp_printk=” 커널 파라메터

/*
 * Handler for the "tp_printk" kernel parameter.
 *
 * @str is the text following "tp_printk".  Unless it is exactly "=0" or
 * "=off", tracepoint_printk is set to 1.  Always returns 1.
 */
static int __init set_tracepoint_printk(char *str)
{
        int turned_off = !strcmp(str, "=0") || !strcmp(str, "=off");

        if (!turned_off)
                tracepoint_printk = 1;
        return 1;
}
__setup("tp_printk", set_tracepoint_printk);

“tp_printk=0” 및 “tp_printk=off”가 아닌 경우 전역 tracepoint_printk를 1로 설정하여 trace 출력이 가능하게한다.

rcu_init()

2016-09-222021-04-01 문영일 Leave a comment

RCU 초기화

rcu의 초기화는 rcu_init() 함수 및 rcu_nohz_init() 함수에서 이루어진다.

rcu_init()

kernel/rcu/tree.c

/*
 * Boot-time RCU initialisation.
 *
 * In order: run the early-boot self tests, announce the configuration,
 * compute the rcu_node tree geometry, initialise the rcu_state structure,
 * optionally dump the tree and register the RCU softirq handler, register
 * the PM notifier, run the prepare/starting/online steps for every online
 * CPU, then create the two RCU workqueues and initialise SRCU.
 */
void __init rcu_init(void)
{
        int cpu;

        rcu_early_boot_tests();

        rcu_bootup_announce();
        rcu_init_geometry();
        rcu_init_one();
        if (dump_tree)
                rcu_dump_rcu_node_tree();
        if (use_softirq)
                open_softirq(RCU_SOFTIRQ, rcu_core_si);

        /*
         * We don't need protection against CPU-hotplug here because
         * this is called early in boot, before either interrupts
         * or the scheduler are operational.
         */
        pm_notifier(rcu_pm_notify, 0);
        for_each_online_cpu(cpu) {
                rcutree_prepare_cpu(cpu);
                rcu_cpu_starting(cpu);
                rcutree_online_cpu(cpu);
        }

        /* Create workqueue for expedited GPs and for Tree SRCU. */
        rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
        WARN_ON(!rcu_gp_wq);
        rcu_par_gp_wq = alloc_workqueue("rcu_par_gp", WQ_MEM_RECLAIM, 0);
        WARN_ON(!rcu_par_gp_wq);
        srcu_init();
}

rcu를 사용하기 위해 rcu 관련 구조체들을 초기화하고, cpu pm 변화에 따라 호출되는 콜백함수를 등록한다.

코드 라인 5에서 “Running RCU self tests” 메시지를 출력하고 모듈 파라미터 “rcutree.rcu_self_test”가 설정된 경우 테스트 콜백 하나를 call_rcu()와 call_srcu() 두 API를 사용하여 등록시켜 “RCU test callback executed %d” 메시지가 출력되게 한다.
코드 라인 7에서 RCU 설정들이 동작하는지 관련 메시지 로그를 출력한다. “Preemptible hierarchical RCU implementation.” 메시지를 시작으로 여러 정보들이 출력된다.
코드 라인 8에서 rcu_state 구조체 내부의 rcu_node 들에 대한 트리 기하를 구성하기 위한 설정 값들을 산출한다.
코드 라인 9에서 rcu_state이하 모든 rcu 노드 및 rcu_data들을 초기화하고 구성한다.
코드 라인 10~11에서 “dump_tree” 커널 파라미터가 설정된 경우 “rcu_node tree layout dump” 메시지 출력 이후에 노드 구성을 출력한다.
코드 라인 12~13에서 rcu softirq 벡터에 rcu_core_si() 함수가 호출되도록 대입한다.
코드 라인 20에서 pm(power management)의 suspend/resume 동작시 호출되도록 rcu_pm_notify() 함수를 등록한다.
코드 라인 21~25에서 각 online cpu 만큼 순회하며 rcu용 cpu 정보들을 초기화하고 시작시킨다.
코드 라인 28~31에서 급행 gp와 srcu tree용으로 워크큐를 할당한다.
코드 라인 32에서 srcu를 초기화한다.

early 부트에서의 테스트

rcu_early_boot_tests()

kernel/rcu/update.c

/*
 * Log that the RCU self tests are running, queue the test callbacks when
 * rcu_self_test is set, and then call rcu_test_sync_prims().
 */
void rcu_early_boot_tests(void)
{
        pr_info("Running RCU self tests\n");

        if (rcu_self_test)
                early_boot_test_call_rcu();
        rcu_test_sync_prims();
}

“Running RCU self tests” 메시지를 출력하고 모듈 파라미터 “rcutree.rcu_self_test”가 설정된 경우 테스트 콜백 하나를 call_rcu()와 call_srcu() 두 API를 사용하여 등록시켜 “RCU test callback executed %d” 메시지가 출력되게 한다.

early_boot_test_call_rcu()

kernel/rcu/update.c

/*
 * Queue test_callback() through call_rcu() and, when CONFIG_SRCU is
 * enabled, also through call_srcu() on early_srcu.  The rcu_head objects
 * have static storage, so they stay valid after this function returns.
 */
static void early_boot_test_call_rcu(void)
{
        static struct rcu_head head;
        static struct rcu_head shead;

        call_rcu(&head, test_callback);
        if (IS_ENABLED(CONFIG_SRCU))
                call_srcu(&early_srcu, &shead, test_callback);
}

test_callback()

kernel/rcu/update.c

/*
 * Callback queued by the early-boot self test: increments
 * rcu_self_test_counter and logs its new value.  @r is not used.
 */
static void test_callback(struct rcu_head *r)
{
        rcu_self_test_counter++;
        pr_info("RCU test callback executed %d\n", rcu_self_test_counter);
}

“RCU test callback executed %d” 메시지가 출력되는 테스트용 콜백이다.

rcu_test_sync_prims()

kernel/rcu/update.c

/*
 * Exercise each non-SRCU synchronous grace-period wait primitive.  Handy
 * right after these primitives change mode, and during early boot.
 *
 * Does nothing unless CONFIG_PROVE_RCU is enabled.
 */

void rcu_test_sync_prims(void)
{
        if (IS_ENABLED(CONFIG_PROVE_RCU)) {
                synchronize_rcu();
                synchronize_rcu_expedited();
        }
}

싱크용 rcu 명령이 early boot에서 어떻게 동작하는지 호출해본다.

early 부트에서 gp 대기 없이 아무런 처리를 하지 않아야 한다.

부트업 어나운스

rcu_bootup_announce()

kernel/rcu/tree_plugin.h

/*
 * Tell them what RCU they are running.
 *
 * Logs the implementation banner, then lets
 * rcu_bootup_announce_oddness() report the non-default settings.
 */

static void __init rcu_bootup_announce(void)
{
        pr_info("Preemptible hierarchical RCU implementation.\n");
        rcu_bootup_announce_oddness();
}

RCU 설정들이 동작하는지 관련 메시지 로그를 출력한다. “Preemptible hierarchical RCU implementation.” 메시지를 시작으로 여러 정보들이 출력된다.

rcu_bootup_announce_oddness()

kernel/rcu/tree_plugin.h

/*
 * Check the RCU kernel configuration parameters and print informative
 * messages about anything out of the ordinary.
 *
 * Each check is independent: a message is logged for every build-time
 * option or boot-time parameter that differs from its default.  The
 * function finishes by calling rcupdate_announce_bootup_oddness().
 */

static void __init rcu_bootup_announce_oddness(void)
{
        /* Build-time configuration and tree shape. */
        if (IS_ENABLED(CONFIG_RCU_TRACE))
                pr_info("\tRCU event tracing is enabled.\n");
        if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
            (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
                pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n",
                        RCU_FANOUT);
        if (rcu_fanout_exact)
                pr_info("\tHierarchical RCU autobalancing is disabled.\n");
        if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
                pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
        if (IS_ENABLED(CONFIG_PROVE_RCU))
                pr_info("\tRCU lockdep checking is enabled.\n");
        if (RCU_NUM_LVLS >= 4)
                pr_info("\tFour(or more)-level hierarchy is enabled.\n");
        /* Build-time leaf fanout versus the value in effect at boot. */
        if (RCU_FANOUT_LEAF != 16)
                pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
                        RCU_FANOUT_LEAF);
        if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
                pr_info("\tBoot-time adjustment of leaf fanout to %d.\n",
                        rcu_fanout_leaf);
        if (nr_cpu_ids != NR_CPUS)
                pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_BOOST
        pr_info("\tRCU priority boosting: priority %d delay %d ms.\n",
                kthread_prio, CONFIG_RCU_BOOST_DELAY);
#endif
        /* Callback batching limits. */
        if (blimit != DEFAULT_RCU_BLIMIT)
                pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
        if (qhimark != DEFAULT_RCU_QHIMARK)
                pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
        if (qlowmark != DEFAULT_RCU_QLOMARK)
                pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
        /* ULONG_MAX means "not specified" for the jiffies_till_* parameters. */
        if (jiffies_till_first_fqs != ULONG_MAX)
                pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
        if (jiffies_till_next_fqs != ULONG_MAX)
                pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
        if (jiffies_till_sched_qs != ULONG_MAX)
                pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs);
        if (rcu_kick_kthreads)
                pr_info("\tKick kthreads if too-long grace period.\n");
        if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
                pr_info("\tRCU callback double-/use-after-free debug enabled.\n");
        /* Debug slow-down knobs for the grace-period phases. */
        if (gp_preinit_delay)
                pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
        if (gp_init_delay)
                pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
        /*
         * NOTE(review): this message says "init" although it reports
         * gp_cleanup_delay; it looks like a copy-paste slip.  Left
         * unchanged so the logged text stays as documented.
         */
        if (gp_cleanup_delay)
                pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay);
        if (!use_softirq)
                pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
        if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
                pr_info("\tRCU debug extended QS entry/exit.\n");
        rcupdate_announce_bootup_oddness();
}

커널 config RCU 설정들에 대해 다음과 같은 로그를 출력한다.

CONFIG_RCU_TRACE를 설정한 경우 “RCU event tracing is enabled.”
CONFIG_RCU_FANOUT 값이 default(32bit=32, 64bit=64) 값과 다른 경우 “CONFIG_RCU_FANOUT set to non-default value of %d”
rcu_fanout_exact가 설정된 경우 “Hierarchical RCU autobalancing is disabled.”
CONFIG_RCU_FAST_NO_HZ가 설정된 경우 “RCU dyntick-idle grace-period acceleration is enabled.”
CONFIG_PROVE_RCU가 설정된 경우 “RCU lockdep checking is enabled.”
rcu 노드 레벨이 4 이상(0~4까지 가능) 설정된 경우 “Four(or more)-level hierarchy is enabled.”
빌드 타임의 CONFIG_RCU_FANOUT_LEAF 값이 default(16) 값과 다른 경우 “Build-time adjustment of leaf fanout to %d.”
모듈 파라미터 rcu_fanout_leaf 값이 빌드 타임의 RCU_FANOUT_LEAF 값과 다른 경우 “Boot-time adjustment of leaf fanout to %d.”
nr_cpu_ids가 커널 컴파일 시 설정된 NR_CPUS와 다른 경우 “RCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.”
CONFIG_RCU_BOOST가 설정된 경우 “RCU priority boosting: priority %d delay %d ms.”
blimit가 DEFAULT_RCU_BLIMIT(10)와 다른 경우 “Boot-time adjustment of callback invocation limit to %ld.”
qhimark가 DEFAULT_RCU_QHIMARK(10000)와 다른 경우 “Boot-time adjustment of callback high-water mark to %ld.”
qlowmark가 DEFAULT_RCU_QLOMARK(100)와 다른 경우 “Boot-time adjustment of callback low-water mark to %ld.”
jiffies_till_first_fqs가 설정된 경우 “Boot-time adjustment of first FQS scan delay to %ld jiffies.”
jiffies_till_next_fqs가 설정된 경우 “Boot-time adjustment of subsequent FQS scan delay to %ld jiffies.”
jiffies_till_sched_qs가 설정된 경우 “Boot-time adjustment of scheduler-enlistment delay to %ld jiffies.”
rcu_kick_kthreads가 설정된 경우 “Kick kthreads if too-long grace period.”
CONFIG_DEBUG_OBJECTS_RCU_HEAD가 설정된 경우 “RCU callback double-/use-after-free debug enabled.”
gp_preinit_delay가 설정된 경우 “RCU debug GP pre-init slowdown %d jiffies.”
gp_init_delay가 설정된 경우 “RCU debug GP init slowdown %d jiffies.”
gp_cleanup_delay가 설정된 경우 “RCU debug GP init slowdown %d jiffies.”
use_softirq(디폴트=1)가 off된 경우 “RCU_SOFTIRQ processing moved to rcuc kthreads.”
CONFIG_RCU_EQS_DEBUG가 설정된 경우 “RCU debug extended QS entry/exit.”

rcupdate_announce_bootup_oddness()

kernel/rcu/update.c

/*
 * Report the boot-time RCU settings that differ from their defaults.
 *
 * At most one grace-period mode line is logged (rcu_normal takes
 * precedence, then rcu_normal_after_boot, then rcu_expedited), followed
 * by the CPU stall-warning settings and the Tasks RCU report.
 */

void __init rcupdate_announce_bootup_oddness(void)
{
        if (rcu_normal) {
                pr_info("\tNo expedited grace period (rcu_normal).\n");
        } else if (rcu_normal_after_boot) {
                pr_info("\tNo expedited grace period (rcu_normal_after_boot).\n");
        } else if (rcu_expedited) {
                pr_info("\tAll grace periods are expedited (rcu_expedited).\n");
        }

        if (rcu_cpu_stall_suppress) {
                pr_info("\tRCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).\n");
        }

        if (rcu_cpu_stall_timeout != CONFIG_RCU_CPU_STALL_TIMEOUT) {
                pr_info("\tRCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).\n",
                        rcu_cpu_stall_timeout);
        }

        rcu_tasks_bootup_oddness();
}

boot 타임 RCU 설정들에 대해 다음과 같은 로그를 출력한다.

rcu_normal이 설정된 경우 “No expedited grace period (rcu_normal).”
rcu_normal_after_boot가 설정된 경우 “No expedited grace period (rcu_normal_after_boot).”
rcu_expedited가 설정된 경우 “All grace periods are expedited (rcu_expedited).”
rcu_cpu_stall_suppress가 설정된 경우 “RCU CPU stall warnings suppressed (rcu_cpu_stall_suppress).”
rcu_cpu_stall_timeout가 CONFIG_RCU_CPU_STALL_TIMEOUT(21초)와 다르게 설정된 경우 “RCU CPU stall warnings timeout set to %d (rcu_cpu_stall_timeout).”

rcu_tasks_bootup_oddness()

kernel/rcu/update.c

/*
 * Print any non-default Tasks RCU settings.
 *
 * Compiles to an empty function unless CONFIG_TASKS_RCU is defined.  With
 * it, logs the stall-warning timeout when rcu_task_stall_timeout differs
 * from RCU_TASK_STALL_TIMEOUT, and otherwise just notes that Tasks RCU is
 * enabled.
 */

static void __init rcu_tasks_bootup_oddness(void)
{
#ifdef CONFIG_TASKS_RCU
        if (rcu_task_stall_timeout != RCU_TASK_STALL_TIMEOUT)
                pr_info("\tTasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).\n", rcu_task_stall_timeout);
        else
                pr_info("\tTasks RCU enabled.\n");
#endif /* #ifdef CONFIG_TASKS_RCU */
}

Tasks RCU 설정들에 대해 다음과 같은 로그를 출력한다.

rcu_task_stall_timeout 가 RCU_TASK_STALL_TIMEOUT(600초)와 다르게 설정된 경우 “Tasks-RCU CPU stall warnings timeout set to %d (rcu_task_stall_timeout).”

cpu hot-plug 동작

rcu_pm_notify()

kernel/rcu/tree.c

/*
 * PM notifier: use expedited RCU grace periods so that suspend and
 * hibernation run faster on non-huge systems.
 *
 * Expediting is switched on for the *_PREPARE actions and switched back
 * off for the POST_* actions; any other @action is ignored.  @self and
 * @hcpu are not used.  Always returns NOTIFY_OK.
 */

static int rcu_pm_notify(struct notifier_block *self,
                         unsigned long action, void *hcpu)
{
        if (action == PM_HIBERNATION_PREPARE || action == PM_SUSPEND_PREPARE)
                rcu_expedite_gp();
        else if (action == PM_POST_HIBERNATION || action == PM_POST_SUSPEND)
                rcu_unexpedite_gp();

        return NOTIFY_OK;
}

pm(suspend/hibernation) 상태 변화에 따른 통지를 받았을 때 action에 따라 급행(expedited) gp 모드를 설정하거나 해제하는 함수를 호출한다.

트리 기하 초기화

rcu_init_geometry()

kernel/rcu/tree.c

/*
 * Compute the rcu_node tree geometry from kernel parameters.  This cannot
 * replace the definitions in tree.h because those are needed to size
 * the ->node array in the rcu_state structure.
 *
 * Outline:
 *  - give jiffies_till_first_fqs and jiffies_till_next_fqs their default
 *    when they are still ULONG_MAX, then call
 *    adjust_jiffies_till_sched_qs();
 *  - return early when rcu_fanout_leaf and nr_cpu_ids match the
 *    compile-time RCU_FANOUT_LEAF and NR_CPUS;
 *  - otherwise validate rcu_fanout_leaf, restoring RCU_FANOUT_LEAF with a
 *    warning if it is out of range or the tree cannot hold nr_cpu_ids
 *    CPUs;
 *  - finally recompute rcu_num_lvls, num_rcu_lvl[] and rcu_num_nodes.
 */

static void __init rcu_init_geometry(void)
{
        ulong d;
        int i;
        int rcu_capacity[RCU_NUM_LVLS];

        /*
         * Initialize any unspecified boot parameters.
         * The default values of jiffies_till_first_fqs and
         * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS
         * value, which is a function of HZ, then adding one for each
         * RCU_JIFFIES_FQS_DIV CPUs that might be on the system.
         */
        d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
        if (jiffies_till_first_fqs == ULONG_MAX)
                jiffies_till_first_fqs = d;
        if (jiffies_till_next_fqs == ULONG_MAX)
                jiffies_till_next_fqs = d;
        adjust_jiffies_till_sched_qs();

        /* If the compile-time values are accurate, just leave. */
        if (rcu_fanout_leaf == RCU_FANOUT_LEAF &&
            nr_cpu_ids == NR_CPUS)
                return;
        pr_info("Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%u\n",
                rcu_fanout_leaf, nr_cpu_ids);

        /*
         * The boot-time rcu_fanout_leaf parameter must be at least two
         * and cannot exceed the number of bits in the rcu_node masks.
         * Complain and fall back to the compile-time values if this
         * limit is exceeded.
         */
        if (rcu_fanout_leaf < 2 ||
            rcu_fanout_leaf > sizeof(unsigned long) * 8) {
                rcu_fanout_leaf = RCU_FANOUT_LEAF;
                WARN_ON(1);
                return;
        }

        /*
         * Compute number of nodes that can be handled an rcu_node tree
         * with the given number of levels.
         */
        rcu_capacity[0] = rcu_fanout_leaf;
        for (i = 1; i < RCU_NUM_LVLS; i++)
                rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;

        /*
         * The tree must be able to accommodate the configured number of CPUs.
         * If this limit is exceeded, fall back to the compile-time values.
         */
        if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) {
                rcu_fanout_leaf = RCU_FANOUT_LEAF;
                WARN_ON(1);
                return;
        }

        /* Calculate the number of levels in the tree. */
        for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
        }
        rcu_num_lvls = i + 1;

        /* Calculate the number of rcu_nodes at each level of the tree. */
        for (i = 0; i < rcu_num_lvls; i++) {
                int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
                num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
        }

        /* Calculate the total number of rcu_node structures. */
        rcu_num_nodes = 0;
        for (i = 0; i < rcu_num_lvls; i++)
                rcu_num_nodes += num_rcu_lvl[i];
}

노드 구성을 위한 트리 기하를 산출한다.

전역 jiffies_till_first_fqs 및 jiffies_till_next_fqs 산출
전역 rcu_num_lvls 산출
전역 num_rcu_lvl[] 배열에 각 rcu 노드 레벨별로 rcu_node 갯수 산출
전역 rcu_num_nodes 산출
- num_rcu_lvl[]을 모두 더한다

코드 라인 14에서 d 값으로 RCU_JIFFIES_TILL_FORCE_QS를 배정하지만 online cpu 수가 256개 단위를 초과할 때 마다 delay 값을 추가 한다.
- rpi2: RCU_JIFFIES_TILL_FORCE_QS(1) + nr_cpu_ids(4) / RCU_JIFFIES_FQS_DIV(256) = 1
- RCU_JIFFIES_TILL_FORCE_QS 딜레이 값을 디폴트로 대입하되 시스템의 HZ가 250을 넘어가는 케이스 및 online cpu 수가 256개를 초과하는 케이스에 대해 추가로 delay값을 증가하여 설정한다.
코드 라인 15~16에서 모듈 파라미터 jiffies_till_first_fqs가 설정되어 있지 않은 경우 d 값으로 설정된다.
코드 라인 17~18에서 모듈 파라미터 jiffies_till_next_fqs 가 설정되어 있지 않은 경우 d 값으로 설정된다.
코드 라인 19에서 위에서 산출된 값으로 jiffies_to_sched_qs 값을 결정한다.
코드 라인 22~24에서 모듈 파라미터 rcu_fanout_leaf 값과 nr_cpu_ids가 커널 설정 시와 다르지 않은 경우 변동이 없어 함수를 빠져나간다.
코드 라인 25~26에서 변동이 생긴 경우이다. “RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d” 메시지를 출력한다.
코드 라인 34~39에서 rcu_fanout_leaf 값이 2보다 작거나 시스템 한계(32bits=32, 64bits=64)를 초과하는 경우 CONFIG_RCU_FANOUT_LEAF(디폴트=16)으로 변경하고 함수를 빠져나간다.
코드 라인 45~47에서 rcu_capacity[] 배열에 각 레벨별 최대 노드 수가 산출된다.
코드 라인 53~57에서 산출된 하위 레벨의 rcu_capacity[]가 cpu 보다 작은 경우 rcu_fanout_leaf 값을 CONFIG_RCU_FANOUT_LEAF(디폴트=16)으로 변경하고 함수를 빠져나간다.
코드 라인 60~62에서 트리 레벨을 결정한다.
코드 라인 65~68에서 online cpu 수에 맞게 각 레벨에서 필요로 하는 rcu_node의 수를 num_rcu_lvl[]에 대입한다.
코드 라인 71~73에서 num_rcu_lvl[]을 모두 더해 rcu_num_nodes를 산출한다.

kernel/rcu/tree.h

/* One jiffy, plus one more when HZ > 250 and another when HZ > 500. */
#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
                                        /* For jiffies_till_first_fqs and */
                                        /*  jiffies_till_next_fqs. */

#define RCU_JIFFIES_FQS_DIV     256     /* Very large systems need more */
                                        /*  delay between bouts of */
                                        /*  quiescent-state forcing. */

RCU_JIFFIES_TILL_FORCE_QS
- fqs 대기 시간(틱)
RCU_JIFFIES_FQS_DIV
- cpu가 많은 시스템에서 이 값 만큼의 cpu 마다 1틱씩 추가 delay를 준다.
- 예) cpus=512 -> fqs 대기 시간에 2틱 추가 delay

다음 그림은 jiffies_till_first_fqs 및 jiffies_till_next_fqs를 산출하는 과정을 보여준다.

다음 그림은 rcu_capacity[] 배열이 산출된 후의 모습을 보여준다. (32bits)

jiffies_till_sched_qs 조정

adjust_jiffies_till_sched_qs()

kernel/rcu/tree.c

/*
 * Make sure that we give the grace-period kthread time to detect any
 * idle CPUs before taking active measures to force quiescent states.
 * However, don't go below 100 milliseconds, adjusted upwards for really
 * large systems.
 *
 * The result is written to jiffies_to_sched_qs.  An explicitly given
 * jiffies_till_sched_qs (anything other than ULONG_MAX) is used as is;
 * otherwise the value is jiffies_till_first_fqs +
 * 2 * jiffies_till_next_fqs, raised to at least
 * HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV, and the computed value is
 * logged.
 */

static void adjust_jiffies_till_sched_qs(void)
{
        unsigned long j;

        /* If jiffies_till_sched_qs was specified, respect the request. */
        if (jiffies_till_sched_qs != ULONG_MAX) {
                WRITE_ONCE(jiffies_to_sched_qs, jiffies_till_sched_qs);
                return;
        }
        /* Otherwise, set to third fqs scan, but bound below on large system. */
        j = READ_ONCE(jiffies_till_first_fqs) +
                      2 * READ_ONCE(jiffies_till_next_fqs);
        if (j < HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV)
                j = HZ / 10 + nr_cpu_ids / RCU_JIFFIES_FQS_DIV;
        pr_info("RCU calculated value of scheduler-enlistment delay is %ld jiffies.\n", j);
        WRITE_ONCE(jiffies_to_sched_qs, j);
}

jiffies_to_sched_qs 값을 산출한다.

코드 라인 6~9에서 모듈 파라미터 jiffies_till_sched_qs가 지정된 경우 이 값으로 jiffies_to_sched_qs를 갱신하고 함수를 빠져나간다.
코드 라인 11~16에서 jiffies_till_first_fqs + jiffies_till_next_fqs * 2 값을 jiffies_to_sched_qs에 기록하되 최소 값을 다음과 같이 제한한다.
- 0.1초에 해당하는 틱 수 + cpu 수 / 256
- rpi4 = 25

다음은 rpi4에서 알아본 초기 rcu에 대한 모듈 파라메터 값이다.

$ cat /sys/module/rcutree/parameters/jiffies_till_first_fqs
1
$ cat /sys/module/rcutree/parameters/jiffies_till_next_fqs
1
$ cat /sys/module/rcutree/parameters/jiffies_till_sched_qs
18446744073709551615
$ cat /sys/module/rcutree/parameters/jiffies_to_sched_qs
25
$ cat /sys/module/rcutree/parameters/rcu_fanout_leaf
16
$ cat /sys/module/rcutree/parameters/qlowmark
100
$ cat /sys/module/rcutree/parameters/qhimark
10000
$ cat /sys/module/rcutree/parameters/blimit
10
$ cat /sys/module/rcutree/parameters/kthread_prio
0
$ cat /sys/module/rcutree/parameters/gp_cleanup_delay
0
$ cat /sys/module/rcutree/parameters/gp_init_delay
0
$ cat /sys/module/rcutree/parameters/gp_preinit_delay
0
$ cat /sys/module/rcutree/parameters/rcu_divisior
7
$ cat /sys/module/rcutree/parameters/rcu_fanout_exact
N
$ cat /sys/module/rcutree/parameters/rcu_resched_ns
3000000
$ cat /sys/module/rcutree/parameters/sysrq_rcu
N
$ cat /sys/module/rcutree/parameters/use_softirq
Y

빌드 타임 RCU 트리 구성

빌드 타임에 NR_CPUS에 해당하는 cpu 수로 rcu 노드 크기를 결정하여 사용한다.

1개로 구성된 rcu_state 구조체 내부에 rcu_node 구조체 배열이 static하게 구성된다. (기존에는 3개의 rcu_state가 사용되었었다)
rcu_node 구조체 배열은 NR_CPUS 크기에 따라 1레벨부터 최대 4레벨 까지의 tree 구조를 지원한다.
RCU 노드 트리는 최소 1레벨부터 최대 4레벨까지 구성된다.
- 32bit 시스템(CONFIG_RCU_FANOUT_LEAF=16(디폴트)기준)
  - 1레벨에서 최대 16개 cpu 지원 가능
  - 2레벨에서 최대 16×32개 cpu 지원 가능
  - 3레벨에서 최대 16x32x32개 cpu 지원 가능
  - 4레벨에서 최대 16x32x32x32개 cpu 지원 가능
- 64bit 시스템(CONFIG_RCU_FANOUT_LEAF=16(디폴트)기준)
  - 1레벨에서 최대 16개 cpu 지원 가능
  - 2레벨에서 최대 16×64개 cpu 지원 가능
  - 3레벨에서 최대 16x64x64개 cpu 지원 가능
  - 4레벨에서 최대 16x64x64x64개 cpu 지원 가능
hotplug cpu를 지원하여 상태가 변화함에 따라 노드 구성이 바뀌게 설계되어 있다.
rcu_node의 sub rcu 노드들은 최대 CONFIG_RCU_FANOUT까지 구성된다.
- 32bit 시스템에서 2~32개까지, 64bit 시스템에서 2~64개까지 설정 가능하다.
- default: 32bit에서 32, 64bit에서 64
최하단 leaf 노드의 경우 rcu_data(cpu)와의 구성에서 rcu_fanout_leaf까지 연결될 수 있다.
- CONFIG_RCU_FANOUT_LEAF
  - 디폴트로 16개의 cpu(rcu_data)를 관리할 수 있고, 2~RCU_FANOUT 범위까지 설정 가능하다.
  - 각 cpu에 대한 노드 락을 contention을 회피하기 위해 16개를 디폴트로 사용하고 있다.

다음 그림은 최소 1 레벨과 최대 4 레벨의 구성 차이를 보여준다.

다음 그림은 최대 4 레벨에서 관리 가능한 cpu 수를 보여준다.

다음 그림은 컴파일 타임에 NR_CPUS 크기에 따라 사용할 레벨이 결정되고 각 레벨별로 rcu 노드 수가 결정되는 것을 보여준다.

다음 그림은 20개의 CPU를 지원하는 설정으로 컴파일 시 구성되는 rcu 노드들의 수를 산출하는 것을 보여준다.

CONFIG_RCU_FANOUT=64, CONFIG_RCU_FANOUT_LEAF=16 사용

다음 그림은 rcu_state 구조체 내부에서 4레벨로 구성된 rcu_node가 구성된 순서를 보여준다.

rcu_data들은 최 하위 leaf 노드들과 직접 연결된다.

RCU 구조체 초기화

rcu_init_one()

kernel/rcu/tree.c

/*
 * Helper function for rcu_init() that initializes the rcu_state structure.
 *
 * Outline:
 *  - set rcu_state.level[i] so that each level starts right after the
 *    num_rcu_lvl[i - 1] nodes of the level above it;
 *  - compute the per-level fanout with rcu_init_levelspread();
 *  - initialise every rcu_node from the leaf level up to the root: locks
 *    and their lockdep classes, the gp_seq fields, the CPU range
 *    grplo..grphi (grphi clamped to nr_cpu_ids - 1), the link to the
 *    parent (the root has none) and the wait queues;
 *  - initialise the two swait queues in rcu_state;
 *  - for each possible CPU, point its rcu_data ->mynode at the leaf node
 *    whose grphi covers that CPU and call rcu_boot_init_percpu_data().
 */

static void __init rcu_init_one(void)
{
        static const char * const buf[] = RCU_NODE_NAME_INIT;
        static const char * const fqs[] = RCU_FQS_NAME_INIT;
        static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
        static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];

        int levelspread[RCU_NUM_LVLS];          /* kids/node in each level. */
        int cpustride = 1;
        int i;
        int j;
        struct rcu_node *rnp;

        BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf));  /* Fix buf[] init! */

        /* Silence gcc 4.8 false positive about array index out of range. */
        if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
                panic("rcu_init_one: rcu_num_lvls out of range");

        /* Initialize the level-tracking arrays. */

        for (i = 1; i < rcu_num_lvls; i++)
                rcu_state.level[i] =
                        rcu_state.level[i - 1] + num_rcu_lvl[i - 1];
        rcu_init_levelspread(levelspread, num_rcu_lvl);

        /* Initialize the elements themselves, starting from the leaves. */

        for (i = rcu_num_lvls - 1; i >= 0; i--) {
                cpustride *= levelspread[i];
                rnp = rcu_state.level[i];
                for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) {
                        raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
                        lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
                                                   &rcu_node_class[i], buf[i]);
                        raw_spin_lock_init(&rnp->fqslock);
                        lockdep_set_class_and_name(&rnp->fqslock,
                                                   &rcu_fqs_class[i], fqs[i]);
                        rnp->gp_seq = rcu_state.gp_seq;
                        rnp->gp_seq_needed = rcu_state.gp_seq;
                        rnp->completedqs = rcu_state.gp_seq;
                        rnp->qsmask = 0;
                        rnp->qsmaskinit = 0;
                        rnp->grplo = j * cpustride;
                        rnp->grphi = (j + 1) * cpustride - 1;
                        if (rnp->grphi >= nr_cpu_ids)
                                rnp->grphi = nr_cpu_ids - 1;
                        if (i == 0) {
                                rnp->grpnum = 0;
                                rnp->grpmask = 0;
                                rnp->parent = NULL;
                        } else {
                                rnp->grpnum = j % levelspread[i - 1];
                                rnp->grpmask = BIT(rnp->grpnum);
                                rnp->parent = rcu_state.level[i - 1] +
                                              j / levelspread[i - 1];
                        }
                        rnp->level = i;
                        INIT_LIST_HEAD(&rnp->blkd_tasks);
                        rcu_init_one_nocb(rnp);
                        init_waitqueue_head(&rnp->exp_wq[0]);
                        init_waitqueue_head(&rnp->exp_wq[1]);
                        init_waitqueue_head(&rnp->exp_wq[2]);
                        init_waitqueue_head(&rnp->exp_wq[3]);
                        spin_lock_init(&rnp->exp_lock);
                }
        }

        init_swait_queue_head(&rcu_state.gp_wq);
        init_swait_queue_head(&rcu_state.expedited_wq);
        rnp = rcu_first_leaf_node();
        for_each_possible_cpu(i) {
                while (i > rnp->grphi)
                        rnp++;
                per_cpu_ptr(&rcu_data, i)->mynode = rnp;
                rcu_boot_init_percpu_data(i);
        }
}

1개의 rcu_state에 포함된 rcu_node와 rcu_data를 초기화한다. (예전 커널에서 rcu_state가 3가지가 존재하여 이 함수가 3번 호출되었었다)

코드 라인 17~18에서 rcu 노드들의 하이라키(hierarchy)는 1~4 레벨로 제한되어 있다.
코드 라인 22~24에서 각 레벨별로 첫 rcu_node를 가리키게 한다.
코드 라인 25에서 각 rcu 레벨이 관리하는 sub 노드의 수를 산출한다. 모듈 파라미터 rcu_fanout_exact(디폴트=0) 값이 0일 때 nr_cpu_ids 수에 맞춰 spread 한다.
코드 라인 29~67에서 leaf 노드부터 최상위 노드까지 초기화한다.
- grplo와 grphi에는 각 노드가 관리하는 cpu 번호 범위가 지정된다.
코드 라인 69~70에서 두 개의 swait_queue를 초기화한다.
코드 라인 71~77에서 각 cpu에 해당하는 rcu 데이터를 초기화한다. 또한 ->mynode가 담당 leaf 노드를 가리키게 한다.

다음 그림은 rcu_node의 grplo 및 grphi를 산출하는 과정을 보여준다.

다음 그림은 rcu_node의 grpnum, grpmask 및 level을 산출하는 과정을 보여준다.

다음 그림은 3단계로 구성된 rcu_node 구성을 트리 구조로 보여준 사례이다.

rcu_init_levelspread()

kernel/rcu/rcu.h

/*
 * Compute the per-level fanout, either using the exact fanout specified
 * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
 *
 * @levelspread: output array, children per node at each level.
 * @levelcnt:    input array, number of nodes at each level.
 *
 * Exact mode: the last (leaf) level gets rcu_fanout_leaf and every other
 * level gets RCU_FANOUT.  Balanced mode: walking from the leaf level up,
 * levelspread[i] is the rounded-up quotient cprv / levelcnt[i], where
 * cprv starts as nr_cpu_ids and afterwards is the node count of the
 * level just processed.
 */

static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt)
{
        int i;

        if (rcu_fanout_exact) {
                levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
                for (i = rcu_num_lvls - 2; i >= 0; i--)
                        levelspread[i] = RCU_FANOUT;
        } else {
                int ccur;
                int cprv;

                cprv = nr_cpu_ids;
                for (i = rcu_num_lvls - 1; i >= 0; i--) {
                        ccur = levelcnt[i];
                        levelspread[i] = (cprv + ccur - 1) / ccur;
                        cprv = ccur;
                }
        }
}

각 rcu 레벨이 관리하는 sub 노드의 수를 산출한다. rcu_fanout_exact=0(디폴트)을 사용하는 경우 노드 락을 최소화하기 위해 nr_cpu_ids 수에 맞춰 노드 배치를 spread하여 구성한다.

코드 라인 5~8에서 모듈 파라미터 rcu_fanout_exact가 설정된 경우 leaf 노드에서는 rcu_fanout_leaf(디폴트=16)로 설정하고, 나머지 노드는 RCU_FANOUT(디폴트=32/64 bits) 값으로 설정한다.
코드 라인 9~19에서 그 외의 경우 nr_cpu_ids 수에 맞게 각 레벨의 노드가 관리하는 sub 노드 수를 spread 배치하여 구성한다. (각 레벨의 값은 하위 단계의 수를 해당 레벨의 노드 수로 나누어 올림한 값이다)

다음 그림은 모듈 파라미터 rcu_fanout_exact 설정된 경우 노드 배치가 spread 되는 모습을 보여준다.

rcu_init_one_nocb()

kernel/rcu/tree_plugin.h

/*
 * Initialize elements 0 and 1 of @rnp->nocb_gp_wq[] as wait-queue heads.
 * Each element is initialized through its own init_waitqueue_head() call.
 */
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
        init_waitqueue_head(&rnp->nocb_gp_wq[0]);
        init_waitqueue_head(&rnp->nocb_gp_wq[1]);
}

CONFIG_RCU_NOCB_CPU 커널 옵션이 사용되는 경우 nocb_gp_wq[]에 있는 대기큐 두 개를 초기화한다.

rcu_boot_init_percpu_data()

kernel/rcu/tree.c

/*
 * Boot-time setup of the rcu_data instance belonging to @cpu.
 *
 * Fills in the CPU's bit within its leaf node, sanity-checks the dynticks
 * state, seeds the online/offline grace-period bookkeeping from the global
 * rcu_state, records the CPU number and finally hands the structure to the
 * no-CBs initialization.
 */
static void __init rcu_boot_init_percpu_data(int cpu)
{
        struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);

        /* Bit representing this CPU inside its leaf rcu_node. */
        rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);

        /* Complain once if the dynticks state is not what boot expects. */
        WARN_ON_ONCE(rdp->dynticks_nesting != 1);
        WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)));

        /* Both offline and online records start from the current gp_seq. */
        rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
        rdp->rcu_onl_gp_seq = rcu_state.gp_seq;

        /* ... and from the RCU_GP_CLEANED flag value. */
        rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
        rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;

        rdp->cpu = cpu;

        rcu_boot_init_nocb_percpu_data(rdp);
}

cpu별로 구성되는 rcu_data 구조체의 멤버를 부트타임에 모두 초기화한다.

nohz 및 no-cb용 콜백 처리 커널 스레드 구성

rcu_init_nohz()

kernel/rcu/tree_plugin.h

/*
 * Boot-time setup of the set of CPUs whose RCU callbacks are offloaded.
 *
 * Steps, in order:
 *  - With CONFIG_NO_HZ_FULL, note that a mask is required when
 *    tick_nohz_full_running is set and tick_nohz_full_mask is non-empty.
 *  - Allocate rcu_nocb_mask if it is not available yet but required; on
 *    allocation failure log a message and return without offloading.
 *  - Return early when no mask is available at all.
 *  - With CONFIG_NO_HZ_FULL and tick_nohz_full_running, OR
 *    tick_nohz_full_mask into rcu_nocb_mask.
 *  - If rcu_nocb_mask is not a subset of cpu_possible_mask, log a note and
 *    keep only the CPUs that are also in cpu_possible_mask.
 *  - Log the resulting CPU list and whether rcu_nocb_poll is set.
 *  - For every CPU in the mask, initialize its cblist if it is empty and
 *    mark it offloaded via rcu_segcblist_offload().
 *  - Call rcu_organize_nocb_kthreads().
 *
 * The "nonexistent CPUs" note is terminated by a single newline; a trailing
 * "\nn" would leave a stray 'n' at the start of the next log line.
 */
void __init rcu_init_nohz(void)
{
        int cpu;
        bool need_rcu_nocb_mask = false;
        struct rcu_data *rdp;

#if defined(CONFIG_NO_HZ_FULL)
        if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
                need_rcu_nocb_mask = true;
#endif /* #if defined(CONFIG_NO_HZ_FULL) */

        if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
                if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
                        pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
                        return;
                }
        }
        if (!cpumask_available(rcu_nocb_mask))
                return;

#if defined(CONFIG_NO_HZ_FULL)
        if (tick_nohz_full_running)
                cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
#endif /* #if defined(CONFIG_NO_HZ_FULL) */

        if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
                pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
                cpumask_and(rcu_nocb_mask, cpu_possible_mask,
                            rcu_nocb_mask);
        }
        if (cpumask_empty(rcu_nocb_mask))
                pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
        else
                pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
                        cpumask_pr_args(rcu_nocb_mask));
        if (rcu_nocb_poll)
                pr_info("\tPoll for callbacks from no-CBs CPUs.\n");

        for_each_cpu(cpu, rcu_nocb_mask) {
                rdp = per_cpu_ptr(&rcu_data, cpu);
                if (rcu_segcblist_empty(&rdp->cblist))
                        rcu_segcblist_init(&rdp->cblist);
                rcu_segcblist_offload(&rdp->cblist);
        }
        rcu_organize_nocb_kthreads();
}

rcu nohz 처리를 위한 초기화를 수행한다.

코드 라인 7~10에서 “nohz_full=” 커널 파라미터로 지정된 cpu들이 있는 경우 임시 변수 need_rcu_nocb_mask에 true를 대입해둔다.
코드 라인 12~19에서 “rcu_nocbs=” 커널 파라미터로 지정되는 rcu_nocb_mask 비트마스크가 할당되지 않은 경우 할당한다.
코드 라인 21~24에서 nohz full이 지원되는 시스템인 경우 rcu_nocb_mask에 nohz full cpu들을 추가한다.
코드 라인 26~30에서 nocb용 cpu들이 possible cpu에 포함되지 않은 경우 경고 메시지를 출력하고, rcu_nocb_mask 비트마스크에서 possible cpu에 포함되지 않은 cpu들을 모두 제거한다. (possible cpu와 AND 연산)
코드 라인 31~35에서 offload된(no-cb) cpu들을 출력한다.
코드 라인 36~37에서 “rcu_nocb_poll=” 커널 파라미터가 설정된 경우 no-cb 스레드가 polling을 지원한다고 해당 정보를 출력한다.
코드 라인 39~44에서 offload cpu들에 대해 콜백리스트의 offloaded=1을 설정한다.
코드 라인 45에서 no-cb용 cpu들 각각에 대해 no-cb용 gp 커널 스레드가 동작하는 cpu를 지정한다.
- no-cb로 동작할 때 각 cpu들은 그룹으로 나뉘어 관리되며, 각 그룹당 대표 cpu는 no-cb용 gp 커널 스레드도 생성한다.

참고

RCU(Read Copy Update) -1- (Basic) | 문c
RCU(Read Copy Update) -2- (Callback process) | 문c
RCU(Read Copy Update) -3- (RCU threads) | 문c
RCU(Read Copy Update) -4- (NOCB process) | 문c
RCU(Read Copy Update) -5- (Callback list) | 문c
RCU(Read Copy Update) -6- (Expedited GP) | 문c
RCU(Read Copy Update) -7- (Preemptible RCU) | 문c
rcu_init() | 문c – 현재글
wait_for_completion() | 문c

wait_for_completion()

2016-09-082017-12-04 문영일 Leave a comment

작업 완료 시그널을 받는 wait_for_completion() 함수와 작업 완료 시그널을 보내는 complete() 함수의 처리 흐름도이다.

선언 및 초기화

DECLARE_COMPLETION()

include/linux/completion.h

/**
 * DECLARE_COMPLETION - declare and initialize a completion structure
 * @work:  identifier for the completion structure
 *
 * This macro declares and initializes a completion structure. Generally used
 * for static declarations. You should use the _ONSTACK variant for automatic
 * variables.
 *
 * Expands to a definition of "struct completion @work" whose initializer is
 * COMPLETION_INITIALIZER(@work).
 */
#define DECLARE_COMPLETION(work) \
        struct completion work = COMPLETION_INITIALIZER(work)

주어진 이름의 completion 구조체에 대해 초기화를 한다.

예) DECLARE_COMPLETION(abc)
- abc 라는 이름의 completion 구조체 초기화

COMPLETION_INITIALIZER()

include/linux/completion.h

/*
 * Static initializer for the struct completion named @work: the done
 * counter starts at zero and the embedded wait-queue head is set up by
 * __WAIT_QUEUE_HEAD_INITIALIZER() applied to (@work).wait.
 */
#define COMPLETION_INITIALIZER(work)                                    \
        {                                                               \
                .done = 0,                                              \
                .wait = __WAIT_QUEUE_HEAD_INITIALIZER((work).wait),     \
        }

주어진 이름의 completion 구조체의 FIFO 대기 큐를 초기화하고 done 이라는 멤버의 초기값을 0으로 클리어한다.

APIs

wait_for_completion()

kernel/sched/completion.c

/**
 * wait_for_completion: - waits for completion of a task
 * @x:  holds the state of this particular completion
 *
 * This waits to be signaled for completion of a specific task. It is NOT
 * interruptible and there is no timeout.
 *
 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
 * and interrupt capability. Also see complete().
 *
 * Nothing is returned: the value produced by wait_for_common() is discarded.
 */
void __sched wait_for_completion(struct completion *x)
{
        wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(wait_for_completion);

작업의 완료를 기다린다. 반환 값은 없으며(void), 타임아웃 없이(MAX_SCHEDULE_TIMEOUT) TASK_UNINTERRUPTIBLE 상태로 대기하므로 시그널에 의해 중단되지 않는다.

wait_for_common()

kernel/sched/completion.c

/*
 * Wait on @x with schedule_timeout() as the sleeping function.
 *
 * @x:       completion to wait on
 * @timeout: timeout value forwarded to __wait_for_common()
 * @state:   task state forwarded to __wait_for_common()
 *
 * Returns whatever __wait_for_common() returns.
 */
static long __sched
wait_for_common(struct completion *x, long timeout, int state)
{
        return __wait_for_common(x, schedule_timeout, timeout, state);
}

작업의 완료를 주어진 시간 만큼 기다린다. 정상 완료 시 남은 timeout jiffies 값(최소 1)을, 타임아웃 시 0을, 시그널에 의해 중단된 경우 -ERESTARTSYS를 반환한다.

__wait_for_common()

kernel/sched/completion.c

/*
 * Take x->wait.lock (interrupts disabled), run the wait loop in
 * do_wait_for_common() and release the lock again.
 *
 * @x:       completion to wait on
 * @action:  sleeping function handed through to do_wait_for_common()
 * @timeout: initial timeout handed through to do_wait_for_common()
 * @state:   task state handed through to do_wait_for_common()
 *
 * Returns the value produced by do_wait_for_common().
 */
static inline long __sched
__wait_for_common(struct completion *x,
                  long (*action)(long), long timeout, int state)
{
        long remaining;

        might_sleep();

        spin_lock_irq(&x->wait.lock);
        remaining = do_wait_for_common(x, action, timeout, state);
        spin_unlock_irq(&x->wait.lock);

        return remaining;
}

do_wait_for_common()

kernel/sched/completion.c

/*
 * Core wait loop of a completion.
 *
 * The loop releases and re-takes x->wait.lock around the call to @action,
 * so the function expects to be entered with that lock held.
 *
 * @x:       completion to wait on
 * @action:  function called to sleep; it receives the current timeout and
 *           its return value becomes the new timeout
 * @timeout: initial timeout
 * @state:   task state set before each call to @action and used for the
 *           pending-signal check
 *
 * Return:
 *  - if x->done is still zero after the loop: the current timeout value,
 *    i.e. -ERESTARTSYS when a signal was pending, otherwise the last value
 *    returned by @action (zero when the loop ended on timeout);
 *  - otherwise x->done is decremented and the current timeout is returned,
 *    with 1 substituted when it is zero.
 */
static inline long __sched
do_wait_for_common(struct completion *x,
                   long (*action)(long), long timeout, int state)
{
        if (!x->done) {
                DECLARE_WAITQUEUE(wait, current);

                __add_wait_queue_tail_exclusive(&x->wait, &wait); /* queue at the tail */
                do {
                        if (signal_pending_state(state, current)) {
                                timeout = -ERESTARTSYS;
                                break;
                        }
                        __set_current_state(state);
                        spin_unlock_irq(&x->wait.lock); /* lock is not held across @action */
                        timeout = action(timeout);
                        spin_lock_irq(&x->wait.lock);
                } while (!x->done && timeout); /* stop once done is set or timeout hits 0 */
                __remove_wait_queue(&x->wait, &wait);
                if (!x->done)
                        return timeout; /* not completed: 0 or -ERESTARTSYS */
        }
        x->done--; /* consume one completion */
        return timeout ?: 1; /* never report 0 for a successful wait */
}

현재 태스크를 completion 구조체에 추가한 후 완료 시그널을 기다린다.

if (!x->done) {
- 아직 완료(done)되지 않아 대기가 필요한 경우
DECLARE_WAITQUEUE(wait, current);
- 현재 태스크로 wait 노드를 생성한다.
__add_wait_queue_tail_exclusive(&x->wait, &wait);
- x 인수로 받은 completion 구조체의 wait 큐에 조금 전에 생성한 wait 노드를 마지막에 exclusive 플래그로 추가한다.
do { if (signal_pending_state(state, current)) { timeout = -ERESTARTSYS; break; }
- 루프를 돌며 지연된 시그널이 있는 경우 루프를 빠져나간다.
__set_current_state(state); spin_unlock_irq(&x->wait.lock); timeout = action(timeout); spin_lock_irq(&x->wait.lock);
- 현재 태스크 상태를 state 값으로 설정한 후 락을 풀고 action 인수로 받은 함수를 호출한다. 함수에서 복귀하면 다시 락을 획득한다.
  - 기본 함수로 schedule_timeout()을 사용한다.
} while (!x->done && timeout);
- 완료 시그널을 받았거나 타임 아웃된 경우가 아니면 루프를 계속 돈다.
__remove_wait_queue(&x->wait, &wait);
- 대기 큐에서 추가하였던 wait 노드를 제거한다.
if (!x->done) return timeout;
- 완료 시그널을 받지 않은 경우 timeout 값을 반환한다.
x->done--; return timeout ?: 1;
- 완료 카운터 done을 1 감소시키고 timeout 값을 반환하거나 0인 경우 1을 반환한다.

complete()

kernel/sched/completion.c

/**
 * complete: - signals a single thread waiting on this completion
 * @x:  holds the state of this particular completion
 *
 * This will wake up a single thread waiting on this completion. Threads will be
 * awakened in the same order in which they were queued.
 *
 * See also complete_all(), wait_for_completion() and related routines.
 *
 * It may be assumed that this function implies a write memory barrier before
 * changing the task state if and only if any tasks are woken up.
 *
 * The counter update and the wakeup both happen under x->wait.lock, taken
 * with the interrupt state saved and restored.
 */
void complete(struct completion *x)
{
        unsigned long flags;

        spin_lock_irqsave(&x->wait.lock, flags);
        x->done++; /* one more completion available to a waiter */
        __wake_up_locked(&x->wait, TASK_NORMAL, 1); /* nr = 1 */
        spin_unlock_irqrestore(&x->wait.lock, flags);
}
EXPORT_SYMBOL(complete);

작업 완료를 기다리는 하나의 태스크를 깨우고 완료 신호를 보내 대기중인 함수(wait_for_completion())에서 탈출하게 한다.

대기큐에 하나 이상의 스레드들이 등록되어 있어 모두 깨어나게 할 필요가 있는 경우 complete_all() 함수를 사용한다.
TASK_NORMAL:
- TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE

__wake_up_locked()

kernel/sched/wait.c

/*
 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
 *
 * @q:    wait queue head to scan
 * @mode: passed through to __wake_up_common()
 * @nr:   passed as nr_exclusive to __wake_up_common()
 *
 * wake_flags is passed as 0 and key as NULL.
 */
void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
{
        __wake_up_common(q, mode, nr, 0, NULL);
}
EXPORT_SYMBOL_GPL(__wake_up_locked);

대기큐에 등록된 하나의 태스크가 슬립된 경우 깨어나게 한다.

__wake_up_common()

kernel/sched/wait.c

/*
 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
 * number) then we wake all the non-exclusive tasks and one exclusive task.
 *
 * There are circumstances in which we can try to wake a task which has already
 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
 * zero in this (rare) case, and we handle it by continuing to scan the queue.
 *
 * @q:            wait queue head whose task_list is walked
 * @mode:         passed to each entry's func callback
 * @nr_exclusive: number of exclusive entries to wake before stopping
 * @wake_flags:   passed to each entry's func callback
 * @key:          passed to each entry's func callback
 *
 * The walk stops as soon as a callback returned non-zero for an entry that
 * carried WQ_FLAG_EXCLUSIVE and the pre-decremented nr_exclusive reached 0.
 */
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, int wake_flags, void *key)
{
        wait_queue_t *curr, *next;

        list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
                unsigned flags = curr->flags; /* snapshot before the callback; presumably the callback may unlink curr - confirm */

                if (curr->func(curr, mode, wake_flags, key) &&
                                (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
                        break;
        }
}

작업 완료(complete)를 기다리는 스레드들을 순회하며 깨우는 함수(func)를 호출한다. non-exclusive 태스크들은 모두 깨우고, exclusive 설정된 태스크들은 nr_exclusive 수 만큼만 깨운다.

func()
- schedule_timeout()
- io_schedule_timeout()

구조체

completion 구조체

include/linux/completion.h

/*
 * struct completion - structure used to maintain state for a "completion"
 *
 * This is the opaque structure used to maintain the state for a "completion".
 * Completions currently use a FIFO to queue threads that have to wait for
 * the "completion" event.
 *
 * See also:  complete(), wait_for_completion() (and friends _timeout,
 * _interruptible, _interruptible_timeout, and _killable), init_completion(),
 * reinit_completion(), and macros DECLARE_COMPLETION(),
 * DECLARE_COMPLETION_ONSTACK().
 *
 * Field order matters: COMPLETION_INITIALIZER() initializes the members
 * positionally.
 */
struct completion {
        unsigned int done;      /* counter: incremented by complete(), decremented by a waiter that consumes it */
        wait_queue_head_t wait; /* queue of waiters; its lock also protects done */
};

done
- 초기 값은 0이며, complete() 호출 시마다 1 증가하고 대기자가 완료를 소비할 때 1 감소하는 카운터이다.
wait
- FIFO 대기 큐

Radix Tree

Radix 트리 선언

Radix 트리 추가 및 삭제

radix_tree_insert()

__radix_tree_create()

radix_tree_extend()

radix_tree_node_alloc()

radix_tree_delete()

radix_tree_delete_item()

__radix_tree_lookup()

radix_tree_tag_clear()

__radix_tree_delete_node()

radix_tree_shrink()

radix_tree_node_free()

radix_tree_node_rcu_free()

Preload Radix 트리 노드

radix_tree_preload()

__radix_tree_preload()

기타 함수

ptr_to_indirect()

indirect_to_ptr()

root_gfp_mask()

tag_set()

tag_clear()

tag_get()

root_tag_set()

root_tag_clear()

root_tag_clear_all()

root_tag_get()

관련 상수

구조체

radix_tree_root 구조체

radix_tree_node 구조체

radix_tree_preload 구조체

참고

context_tracking_init()

context_tracking_cpu_set()

trace_init()

“tp_printk=” 커널 파라메터

RCU 초기화

rcu_init()

early 부트에서의 테스트

rcu_early_boot_tests()

early_boot_test_call_rcu()

test_callback()

rcu_test_sync_prims()

부트업 어나운스

rcu_bootup_announce()

rcu_bootup_announce_oddness()

rcupdate_announce_bootup_oddness()

rcu_tasks_bootup_oddness()

cpu hot-plug 동작

rcu_pm_notify()

트리 기하 초기화

rcu_init_geometry()

jiffies_till_sched_qs 조정

adjust_jiffies_till_sched_qs()

빌드 타임 RCU 트리 구성

RCU 구조체 초기화

rcu_init_one()

rcu_init_levelspread()

rcu_init_one_nocb()

rcu_boot_init_percpu_data()

nohz 및 no-cb용 콜백 처리 커널 스레드 구성

rcu_init_nohz()

참고

선언 및 초기화

DECLARE_COMPLETION()

APIs

wait_for_completion()

wait_for_common()

__wait_for_common()

do_wait_for_common()

complete()

__wake_up_locked()

__wake_up_common()

구조체

completion 구조체