문c 블로그

함수선언부 관련 매크로 (attribute)

2015-11-302016-03-21 문영일 Leave a comment

__init

__section(.init.text) __cold notrace
- __section(S) __attribute__ ((__section__(#S)))
init.text 섹션에 해당 코드를 배치한다.

__cold

__attribute__((__cold__))
호출될 가능성이 희박한 함수를 뜻함.
속도보다 사이즈에 더 최적화를 수행한다.
unlikely()의 사용을 줄일 수 있게 된다. 다만 unlikely()는 오래된 컴파일러와의 호환성을 위해 그대로 함께 사용한다.
text 섹션의 한 쪽에 __cold 펑션들을 모아두는 지역성(locality)도 있다. 당연히 이로 인한 cache 효율성도 좋아진다.

notrace

__attribute__((no_instrument_function))
컴파일러에서 -finstrument-functions 컴파일 옵션을 사용할 때에도 해당 함수에 대한 profiling을 비활성화한다.
참고:
- Trace and profile function calls with GCC
- 함수 호출 시각화 하기 | JaPa2
- Profiling | 문c

__weak

__attribute__((weak))
해당 심볼을 weak symbol로 만든다.
링커가 링크를 수행 시 다른곳에 같은 이름으로 만든 strong symbol이 존재하면 weak symbol 대신 strong symbol을 사용한다.
참고: GCC Weak Function Attributes

__attribute_const__

__attribute__((__const__))
전달 받은 인수외에 global 변수에 접근할 수 없다.
side effect가 생기지 않는다.
참고: Implications of pure and constant functions | LWN.net

__pure

__attribute__((pure))
전달 받은 인수 외에 global 변수로의 access는 읽기만 가능하다.
side effect가 생기지 않는다.

__read_mostly

__attribute__((__section__(".data..read_mostly")))
읽기 위주의 데이터들만을 위한 섹션으로 캐시 라인 바운싱을 회피하기 위한 솔루션
- SMP 머신에서 cache eviction이 최소화될 수 있는 데이터들끼리 모여있도록 함으로 성능향상을 목표로 하였다.
- 캐시 라인 바운싱 참고: Exclusive loads and store | 문c
참고: Short subjects: kerneloops, read-mostly, and port 80 | LWN.net
.data..read_mostly 섹션은 RO_DATA_SECTION 다음에 위치한 RW_DATA_SECTION 에 정의되어 있다.

include/asm-generic/vmlinux.lds.h

/*
 * Helper macros to support writing architecture specific
 * linker scripts.
 *
 * A minimal linker scripts has following content:
 * [This is a sample, architectures may have special requiriements]
 *
 * OUTPUT_FORMAT(...)
 * OUTPUT_ARCH(...)
 * ENTRY(...)
 * SECTIONS
 * {
 *      . = START;
 *      __init_begin = .;
 *      HEAD_TEXT_SECTION
 *      INIT_TEXT_SECTION(PAGE_SIZE)
 *      INIT_DATA_SECTION(...)
 *      PERCPU_SECTION(CACHELINE_SIZE)
 *      __init_end = .;
 *
 *      _stext = .;
 *      TEXT_SECTION = 0
 *      _etext = .;
 *
 *      _sdata = .;
 *      RO_DATA_SECTION(PAGE_SIZE)
 *      RW_DATA_SECTION(...)
 *      _edata = .;
 *
 *      EXCEPTION_TABLE(...)
 *      NOTES
 *
 *      BSS_SECTION(0, 0, 0)
 *      _end = .;
 *
 *      STABS_DEBUG
 *      DWARF_DEBUG
 *
 *      DISCARDS                // must be the last
 * }
 *
 * [__init_begin, __init_end] is the init section that may be freed after init
 *      // __init_begin and __init_end should be page aligned, so that we can
 *      // free the whole .init memory
 * [_stext, _etext] is the text section
 * [_sdata, _edata] is the data section
 *
 * Some of the included output section have their own set of constants.
 * Examples are: [__initramfs_start, __initramfs_end] for initramfs and
 *               [__nosave_begin, __nosave_end] for the nosave data
 */

/*
 * RW_DATA_SECTION - emit the writeable .data output section.
 *
 * All writeable input sections are combined in a single .data output
 * section.  The location counter is first aligned to PAGE_SIZE and the
 * section is loaded at ADDR(.data) - LOAD_OFFSET.  The pieces are
 * emitted in this order: INIT_TASK_DATA, NOSAVE_DATA, PAGE_ALIGNED_DATA,
 * CACHELINE_ALIGNED_DATA, READ_MOSTLY_DATA, DATA_DATA, CONSTRUCTORS.
 *
 * The sections are arranged so that their typical alignments match.
 * NOSAVE_DATA starts and ends with a PAGE_SIZE alignment, which matches
 * the requirement of PAGE_ALIGNED_DATA.  A cacheline is typically/always
 * smaller than PAGE_SIZE.
 *
 * @cacheline:   alignment handed to CACHELINE_ALIGNED_DATA and
 *               READ_MOSTLY_DATA
 * @pagealigned: alignment handed to PAGE_ALIGNED_DATA; use 0 if
 *               page-aligned data is not used
 * @inittask:    argument handed to INIT_TASK_DATA
 */
#define RW_DATA_SECTION(cacheline, pagealigned, inittask)               \
        . = ALIGN(PAGE_SIZE);                                           \
        .data : AT(ADDR(.data) - LOAD_OFFSET) {                         \
                INIT_TASK_DATA(inittask)                                \
                NOSAVE_DATA                                             \
                PAGE_ALIGNED_DATA(pagealigned)                          \
                CACHELINE_ALIGNED_DATA(cacheline)                       \
                READ_MOSTLY_DATA(cacheline)                             \
                DATA_DATA                                               \
                CONSTRUCTORS                                            \
        }

/*
 * READ_MOSTLY_DATA - collect the .data..read_mostly input sections.
 *
 * The location counter is aligned to @align both before and after the
 * collected sections, so the group begins and ends on an @align boundary.
 *
 * @align: alignment applied on both sides of the group
 */
#define READ_MOSTLY_DATA(align)                                         \
        . = ALIGN(align);                                               \
        *(.data..read_mostly)                                           \
        . = ALIGN(align);

__used

__attribute__((used))
해당 객체 또는 함수가 참조되지 않아도 사용하는 것처럼 컴파일러로 하여금 삭제되지 않도록 한다.

__visible

__attribute__((externally_visible))
LTO(Link Time Optimization) 기능을 사용하는 경우 caller(호출측)와 callee(피호출측)의 관계에서 링커가 callee가 한 번만 사용된다고 판단되는 경우 caller에 callee를 inline화 하여 집어 넣는다.
externally_visible 속성을 사용하는 경우 LTO 옵션을 사용하여 링크를 하는 경우에도 하나의 완전한 함수나 객체로 외부에 보여질 수 있도록 심볼화하여 해당 함수나 객체가 inline화 되지 않도록 막는다.
-flto 또는 -fwhole-program을 사용하여 LTO 기능을 동작시킨다.
참고: Enable link-time optimization (after switching to avr-gcc 4.5 or greater)

asmlinkage

어셈블리 코드에서 C 함수를 호출할 때 함수 인자의 전달을 레지스터가 아닌 스택을 이용하도록 해주는 속성지정 매크로이다.
extern "C"로 정의되어 있다.
참고: [Linux] asmlinkage – F/OSS

참고

Declaring Attributes of Functions | gcc.gnu.org
Options That Control Optimization | gcc.gnu.org

lockdep_init()

2015-11-302016-02-11 문영일 Leave a comment

lockdep_init()

lockdep
- lock dependency의 약자로 커널이 lock을 모니터링하고 디버깅하기 위한 것으로 dead-lock 검출도 한다.

/*
 * lockdep_init - one-time setup of the lockdep hash tables.
 *
 * Turns every bucket of classhash_table and chainhash_table into an
 * empty list and then sets lockdep_initialized, so later calls return
 * immediately.
 */
void lockdep_init(void)
{
        int idx;

        /*
         * Some architectures call lockdep_init() from their own
         * start-up code and start_kernel() calls it as well; only the
         * first call may initialize the hashes.
         */
        if (lockdep_initialized)
                return;

        /* Empty every bucket of the lock-class hash. */
        for (idx = 0; idx < CLASSHASH_SIZE; idx++)
                INIT_LIST_HEAD(&classhash_table[idx]);

        /* Empty every bucket of the dependency-chain hash. */
        for (idx = 0; idx < CHAINHASH_SIZE; idx++)
                INIT_LIST_HEAD(&chainhash_table[idx]);

        lockdep_initialized = 1;
}

if (lockdep_initialized)
- lockdep_initialized는 lockdep_init()에 의한 초기화가 이미 완료되었음을 나타내는 플래그이다. 따라서 lockdep_initialized가 true(1)일 경우에는 초기화 코드를 다시 수행하지 않는다.
INIT_LIST_HEAD(classhash_table + i);
- lock class를 빠르게 검색하기 위한 해시 테이블(classhash_table)의 각 버킷(list_head)을 빈 리스트로 초기화한다.
- 해시 테이블의 버킷 수는 4096개(CLASSHASH_SIZE)이다.
- classhash_table을 CLASSHASH_SIZE(4096)개 만큼 초기화한다.
INIT_LIST_HEAD(chainhash_table + i);
- chainhash_table을 CHAINHASH_SIZE(32768)개 만큼 초기화한다.

classhash_table & chainhash_table

/*
 * The lockdep classes are in a hash-table as well, for fast lookup:
 */
static struct list_head classhash_table[CLASSHASH_SIZE];

/*
 * We put the lock dependency chains into a hash-table as well, to cache
 * their existence:
 */
static struct list_head chainhash_table[CHAINHASH_SIZE];

list_head의 구조

/* Link node of a doubly linked list. */
struct list_head {
    struct list_head *next;     /* following entry */
    struct list_head *prev;     /* preceding entry */
};

INIT_LIST_HEAD()

/*
 * INIT_LIST_HEAD - make @list an empty list.
 * @list: list head to initialize
 *
 * Both links are pointed back at @list itself.
 */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
        list->prev = list->next = list;
}

참고

Lockdep | 문c

소스 코드 출력(Syntax highlight) 방법

2015-11-272016-01-08 문영일 Leave a comment

글을 쓸 때 화면 하단부에 Syntax-highlighter++ 창을 이용하여 코드를 추가할 수 있다.

<pre class="brush:c; highlight:3">static inline void sync_boot_mode(void)
{
sync_cache_r(&amp;__boot_cpu_mode);
}</pre>

static inline void sync_boot_mode(void) 
{ 
   sync_cache_r(&amp;__boot_cpu_mode); 
}

shortcode 플러그인 지원

2015-11-242016-01-04 문영일 1 Comment

shortcode

shortcode를 통해 많은 기능을 사용할 수 있으며, Highlighted 문자열도 출력할 수 있습니다.

kernel/head.S – v7_flush_dcache_louis:

2015-11-242016-02-11 문영일 Leave a comment

compressed/head.S 에서 사용하였던 __armv7_mmu_cache_flush: 루틴과 거의 동일하다.
데이터 캐시를 지우라고 요청하면 SoC 정보를 확인하여 지원하는 최종 캐시레벨을 확인한 후 L1 부터 해당 캐시 레벨까지 flush한다.
해당 캐시 레벨에서는 index와 way를 사용하여 하나씩 삭제한다.

hierarchical cache

ARMv6 아키텍처까지는 ARM 아키텍처에서 L1 캐시만 지원하였었다.
ARMv7 아키텍처부터 다단계의 캐시를 지원하게되었다.
다단계 캐시, ARM에서는 hierarchical cache 구조 라고 한다.

v7_flush_dcache_louis()

arch/arm/mm/cache-v7.S

 /*
 *     v7_flush_dcache_louis()
 *
 *     Flush the D-cache up to the Level of Unification Inner Shareable
 *
 *     Reads CLIDR into r0 and extracts the LoUIS field (SMP) or the LoUU
 *     field (UP) into r3, then shifts it so that r3 = level * 2.  If the
 *     level is 0 there is nothing to flush and the routine returns;
 *     otherwise r10 (starting level) is set to 0 and control branches to
 *     flush_levels.
 *
 *     With CONFIG_ARM_ERRATA_643719, when the extracted field is zero the
 *     SMP path reads the main ID register, clears the minor revision
 *     number and compares the result against 0x410fc090 (ARM Cortex A9
 *     r0p?); on a match the LoUIS value is fixed up by setting bit 21,
 *     which also leaves the flags in the 'ne' state.  The UP path simply
 *     returns when LoUU is zero.
 *
 *     Corrupted registers: r0-r7, r9-r11 (r6 only in Thumb mode)
 */

ENTRY(v7_flush_dcache_louis)
        dmb                                     @ ensure ordering with previous memory accesses
        mrc     p15, 1, r0, c0, c0, 1           @ read clidr, r0 = clidr
        ALT_SMP(ands    r3, r0, #(7 << 21))     @ extract LoUIS from clidr
        ALT_UP(ands     r3, r0, #(7 << 27))     @ extract LoUU from clidr
#ifdef CONFIG_ARM_ERRATA_643719
        ALT_SMP(mrceq   p15, 0, r2, c0, c0, 0)  @ read main ID register
        ALT_UP(reteq    lr)                     @ LoUU is zero, so nothing to do
        ldreq   r1, =0x410fc090                 @ ID of ARM Cortex A9 r0p?
        biceq   r2, r2, #0x0000000f             @ clear minor revision number
        teqeq   r2, r1                          @ test for errata affected core and if so...
        orreqs  r3, #(1 << 21)                  @   fix LoUIS value (and set flags state to 'ne')
#endif
        ALT_SMP(mov     r3, r3, lsr #20)        @ r3 = LoUIS * 2
        ALT_UP(mov      r3, r3, lsr #26)        @ r3 = LoUU * 2
        reteq   lr                              @ return if level == 0
        mov     r10, #0                         @ r10 (starting level) = 0
        b       flush_levels                    @ start flushing cache levels
ENDPROC(v7_flush_dcache_louis)

mrc p15, 1, r0, c0, c0, 1
- LoUU/LoUIS를 추출하기 위해 CLIDR을 읽어온다.
ALT_SMP(ands r3, r0, #(7 << 21))
- SMP 시스템에서 CLIDR의 LoUIS 필드를 추출해온다.
ERRATA_643719
- 특정 프로세서의 CLIDR.LOUIS가 잘못 기록이 되어 있어서 이를 보정해주는 코드
- Cortex-A9 r1p0 이전 버전에서 LoUIS 값이 1이 아닌 0으로 기록된 것을 잡아준다.
ALT_SMP(mov r3, r3, lsr #20)
- r3: 읽어온 값을 우측으로 쉬프트하여 LoUIS x 2와 같은 값으로 만든다.
  - d-cache를 어느 캐시 레벨까지 flush할지 결정하기 위함.
reteq lr
- 읽어온 LoUIS가 0이면 d-cache의 flush를 포기하고 루틴을 빠져나간다.
mov r10, #0
- 시작 캐시 레벨을 0(L1)부터 준비한다.
b flush_levels
- v7_flush_dcache_all() 루틴 중간에 있는 flush_levels 레이블을 같이 사용한다.

v7_flush_dcache_all()

/*
 *      v7_flush_dcache_all()
 *
 *      Flush the whole D-cache.
 *
 *      Reads CLIDR into r0 and extracts the LoC field (mask 0x7000000,
 *      i.e. bits 24-26); after the shift by 23, r3 holds LoC * 2.  If LoC
 *      is 0 the routine jumps straight to finished.  Otherwise the levels
 *      are walked by flush_levels, starting at level 0, and each data or
 *      unified cache found is cleaned and invalidated by set/way.
 *
 *      Corrupted registers: r0-r7, r9-r11 (r6 only in Thumb mode)
 *
 *      - mm    - mm_struct describing address space
 *        NOTE(review): the code below does not read any mm argument (r0
 *        is overwritten by the CLIDR read); this line looks stale --
 *        confirm.
 */
ENTRY(v7_flush_dcache_all)
        dmb                                     @ ensure ordering with previous memory accesses
        mrc     p15, 1, r0, c0, c0, 1           @ read clidr
        ands    r3, r0, #0x7000000              @ extract loc from clidr
        mov     r3, r3, lsr #23                 @ left align loc bit field
        beq     finished                        @ if loc is 0, then no need to clean
        mov     r10, #0                         @ start clean at cache level 0

/*
 * flush_levels - per-level loop (also the branch target used by
 * v7_flush_dcache_louis).
 *
 * On entry: r0 = CLIDR, r3 = (number of levels to flush) * 2,
 * r10 = (current level) * 2.  r10 advances by 2 per pass and the loop
 * repeats while r3 > r10.
 *
 * For each level the 3-bit cache type is taken from CLIDR (shift by
 * 3 * level); levels with a type below 2 (no cache, or i-cache only) are
 * skipped.  Otherwise the level is selected and its size ID register is
 * read -- with IRQs disabled under CONFIG_PREEMPT so the select/read
 * pair is atomic -- and the following are derived from it:
 *   r2 = line-length field + 4   (shift applied to the index number)
 *   r4 = maximum way number
 *   r5 = clz(r4)                 (shift applied to the way number)
 *   r7 = maximum index number
 * loop1 steps the way (r4) down to 0 and loop2 steps the index (r9) down
 * to 0, issuing one clean & invalidate by set/way for each combination.
 *
 * finished: selects cache level 0 again, then dsb st / isb and return.
 */
flush_levels:
        add     r2, r10, r10, lsr #1            @ work out 3x current cache level
        mov     r1, r0, lsr r2                  @ extract cache type bits from clidr
        and     r1, r1, #7                      @ mask of the bits for current cache only
        cmp     r1, #2                          @ see what cache we have at this level
        blt     skip                            @ skip if no cache, or just i-cache
#ifdef CONFIG_PREEMPT
        save_and_disable_irqs_notrace r9        @ make cssr&csidr read atomic
#endif
        mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr
        isb                                     @ isb to sych the new cssr&csidr
        mrc     p15, 1, r1, c0, c0, 0           @ read the new csidr
#ifdef CONFIG_PREEMPT
        restore_irqs_notrace r9
#endif
        and     r2, r1, #7                      @ extract the length of the cache lines
        add     r2, r2, #4                      @ add 4 (line length offset)
        ldr     r4, =0x3ff
        ands    r4, r4, r1, lsr #3              @ find maximum number on the way size
        clz     r5, r4                          @ find bit position of way size increment
        ldr     r7, =0x7fff
        ands    r7, r7, r1, lsr #13             @ extract max number of the index size
loop1:
        mov     r9, r7                          @ create working copy of max index
loop2:
 ARM(   orr     r11, r10, r4, lsl r5    )       @ factor way and cache number into r11
 THUMB( lsl     r6, r4, r5              )
 THUMB( orr     r11, r10, r6            )       @ factor way and cache number into r11
 ARM(   orr     r11, r11, r9, lsl r2    )       @ factor index number into r11
 THUMB( lsl     r6, r9, r2              )
 THUMB( orr     r11, r11, r6            )       @ factor index number into r11
        mcr     p15, 0, r11, c7, c14, 2         @ clean & invalidate by set/way
        subs    r9, r9, #1                      @ decrement the index
        bge     loop2
        subs    r4, r4, #1                      @ decrement the way
        bge     loop1
skip:
        add     r10, r10, #2                    @ increment cache number
        cmp     r3, r10
        bgt     flush_levels
finished:
        mov     r10, #0                         @ swith back to cache level 0
        mcr     p15, 2, r10, c0, c0, 0          @ select current cache level in cssr
        dsb     st
        isb
        ret     lr
ENDPROC(v7_flush_dcache_all)

compressed/head.S에서 d-cache를 flush한 로직과 거의 흡사하다.
- way와 index 루프 순서만 기존과 바뀌었다.