문c 블로그

TRACE_EVENT

2015-12-162020-01-07 문영일 Leave a comment

TRACE_EVENT 특징

– Static Kernel Tracing: 커널 코드안에서 static probe point가 있다.
– TRACE_EVENT 매크로는 low overhead 성능을 위해 만들어졌다.
– ftrace는 물론 perf, LTTng와 SystemTap에서도 사용된다.

TRACE_EVENT 매크로 사용

커널 코드내부에 tracepoint를 만들어야 한다.
tracepoint를 hook하는 callback 펑션을 만들어야 한다.
callback 펑션은 tracer ring buffer를 통해 가능한 빨리 데이터 기록.
기록된 데이터는 사람이 읽을 수 있는 데이터 형태를 가져야 한다.

TRACE_EVENT(name, proto, args, struct, assign, print)

TRACE_EVENT 매크로 인수

.name

tracepoint 이름
trace_ 접두어로 함수가 만들어진다.
```
TRACE_EVENT(sched_switch,
```

.prototype

tracepoint callbacks을 위한 prototype

TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next)

trace_sched_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next);

.args

prototype에 매치되는 실제 데이터 인수

TP_ARGS(rq, prev, next),

  #define TRACE_POINT(name, proto, args) \
       void trace_##name(proto)            \
       {                                   \
               if (trace_##name##_active)  \
                       callback(args);     \
       }

.struct (optional)

tracepoint 로 전달될 데이터가 저장될 구조체

TP_STRUCT__entry(
		__array(	char,	prev_comm,	TASK_COMM_LEN	)
		__field(	pid_t,	prev_pid			)
		__field(	int,	prev_prio			)
		__field(	long,	prev_state			)
		__array(	char,	next_comm,	TASK_COMM_LEN	)
		__field(	pid_t,	next_pid			)
		__field(	int,	next_prio			)
    ),

  struct {
	      char   prev_comm[TASK_COMM_LEN];
	      pid_t  prev_pid;
	      int    prev_prio;
	      long   prev_state;
	      char   next_comm[TASK_COMM_LEN];
	      pid_t  next_pid;
	      int    next_prio;
    };

.assign

ring buffer에 연결된 구조체에 데이터를 옮기기 위해 사용

 TP_fast_assign(
		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
		__entry->prev_pid	= prev->pid;
		__entry->prev_prio	= prev->prio;
		__entry->prev_state	= prev->state;
		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
		__entry->next_pid	= next->pid;
		__entry->next_prio	= next->prio;
    ),

.print

사람이 읽을 수 있는 ASCII 형태 출력

TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> " \
 		  "next_comm=%s next_pid=%d next_prio=%d",
		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
		__entry->prev_state ?
		  __print_flags(__entry->prev_state, "|",
				{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
				{ 16, "Z" }, { 32, "X" }, { 64, "x" },
				{ 128, "W" }) : "R",
		__entry->next_comm, __entry->next_pid, __entry->next_prio)

format 화일

#cat /sys/kernel/debug/tracing/events/sched/sched_switch/format
name: sched_switch
ID: 264
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1;

        field:char prev_comm[16];       offset:8;       size:16;        signed:1;
        field:pid_t prev_pid;   offset:24;      size:4; signed:1;
        field:int prev_prio;    offset:28;      size:4; signed:1;
        field:long prev_state;  offset:32;      size:8; signed:1;
        field:char next_comm[16];       offset:40;      size:16;        signed:1;
        field:pid_t next_pid;   offset:56;      size:4; signed:1;
        field:int next_prio;    offset:60;      size:4; signed:1;

print fmt: "prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", REC->prev_comm, REC->prev_pid, REC->prev_prio, REC->prev_state & (1024-1) ? __print_flags(REC->prev_state & (1024-1), "|", { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, { 16, "Z" }, { 32, "X" }, { 64, "x" }, { 128, "K" }, { 256, "W" }, { 512, "P" }) : "R", REC->prev_state & 1024 ? "+" : "", REC->next_comm, REC->next_pid, REC->next_prio

trace 함수 호출 예)

  static inline void
   context_switch(struct rq *rq, struct task_struct *prev,
	          struct task_struct *next)
   {
	   struct mm_struct *mm, *oldmm;

	   prepare_task_switch(rq, prev, next);
	   trace_sched_switch(rq, prev, next);
	   mm = next->mm;
	   oldmm = prev->active_mm;

[04:37:30.629091416] (+0.000050732) sched_switch: { cpu_id = 2 }, { prev_comm = "swapper/2", prev_tid = 0, prev_prio = 20,
prev_state = 0, next_comm = "lttng", next_tid = 8347, next_prio = 20 }

TRACE_EVENT 정의된 헤더 화일

/*
 * sched_switch trace event.
 *
 * Probe arguments: the outgoing task (prev) and the incoming task (next).
 * Recorded fields: comm, pid and prio of both tasks, plus prev_state as
 * returned by __trace_sched_switch_state(prev).
 * Output: the state bits below TASK_STATE_MAX are decoded to letters with
 * __print_flags() ("R" when none is set), and "+" is appended when the
 * TASK_STATE_MAX bit itself is set.
 */
TRACE_EVENT(sched_switch,

        TP_PROTO(struct task_struct *prev,
                 struct task_struct *next),

        TP_ARGS(prev, next),

        TP_STRUCT__entry(
                __array(        char,   prev_comm,      TASK_COMM_LEN   )
                __field(        pid_t,  prev_pid                        )
                __field(        int,    prev_prio                       )
                __field(        long,   prev_state                      )
                __array(        char,   next_comm,      TASK_COMM_LEN   )
                __field(        pid_t,  next_pid                        )
                __field(        int,    next_prio                       )
        ),

        TP_fast_assign(
                memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
                __entry->prev_pid       = prev->pid;
                __entry->prev_prio      = prev->prio;
                __entry->prev_state     = __trace_sched_switch_state(prev);
                memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                __entry->next_pid       = next->pid;
                __entry->next_prio      = next->prio;
        ),

        TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
                __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
                __entry->prev_state & (TASK_STATE_MAX-1) ?
                  __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
                                { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
                                { 16, "Z" }, { 32, "X" }, { 64, "x" },
                                { 128, "K" }, { 256, "W" }, { 512, "P" }) : "R",
                __entry->prev_state & TASK_STATE_MAX ? "+" : "",
                __entry->next_comm, __entry->next_pid, __entry->next_prio)
);

get_random_bytes() 에서의 사용 예)

drivers/char/random.c – get_random_bytes()

trace_get_random_bytes(nbytes, _RET_IP_);

include/trace/events/random.h

/*
 * Declare the get_random_bytes event on top of the random__get_random_bytes
 * template (the template definition is not part of this excerpt). Probes
 * receive a byte count and an instruction-pointer value.
 */
DEFINE_EVENT(random__get_random_bytes, get_random_bytes,
	TP_PROTO(int nbytes, unsigned long IP),
           TP_ARGS(nbytes, IP)
);

DEFINE_EVENT를 통해 아래의 함수들이 생성됨

extern struct tracepoint __tracepoint_get_random_bytes;
static inline void trace_get_random_bytes();
static inline void trace_get_random_bytes_rcuidle();
static inline int register_trace_get_random_bytes();
static inline int unregister_trace_get_random_bytes();
static inline void check_trace_callback_type_get_random_bytes();
static inline bool trace_get_random_bytes_enabled();

include/linux/tracepoint.h

/* Per-tracepoint state; the generated helpers refer to it as __tracepoint_<name>. */
struct tracepoint {
        const char *name;               /* Tracepoint name */
        struct static_key key;          /* tested with static_key_false() before any probe is called */
        void (*regfunc)(void);          /* NOTE(review): presumably a hook run on probe registration - confirm */
        void (*unregfunc)(void);        /* NOTE(review): presumably a hook run on probe unregistration - confirm */
        struct tracepoint_func __rcu *funcs;    /* probe array, read via rcu_dereference_sched(); ends at an entry whose func is NULL */
};

/*
 * In this header DEFINE_EVENT() only declares the tracepoint helpers for
 * @name; the @template argument is not used by the expansion.
 */
#define DEFINE_EVENT(template, name, proto, args)		\
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))

/*
 * In this header TRACE_EVENT() only declares the tracepoint helpers for
 * @name; the @struct, @assign and @print arguments are not used by the
 * expansion.
 */
#define TRACE_EVENT(name, proto, args, struct, assign, print)   	\
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))

/*
 * Unconditional tracepoint declaration: forwards to __DECLARE_TRACE() with
 * a constant-true condition (1) and prepends a "void *__data" parameter to
 * the probe prototype and argument list.
 */
#define DECLARE_TRACE(name, proto, args)			\
                	__DECLARE_TRACE(name, PARAMS(proto), 		\
		PARAMS(args), 1,   					\
              	PARAMS(void *__data, proto),            		\
                     PARAMS(__data, args))

include/linux/tracepoint.h

/*
 * Emit the declarations that make up one tracepoint called @name:
 *
 *  - extern struct tracepoint __tracepoint_<name>
 *  - trace_<name>(proto): runs __DO_TRACE() only when the tracepoint's
 *    static key is enabled. With CONFIG_LOCKDEP and a true @cond it also
 *    dereferences ->funcs inside a sched-RCU read section, discarding the
 *    result (NOTE(review): presumably an RCU usage check - confirm).
 *  - whatever __DECLARE_TRACE_RCU() expands to (defined elsewhere)
 *  - register_trace_<name>() / unregister_trace_<name>(): forward the probe
 *    and its data pointer to tracepoint_probe_register() /
 *    tracepoint_probe_unregister() and return their result
 *  - check_trace_callback_type_<name>(): empty body; it only takes a probe
 *    pointer of the expected type
 *  - trace_<name>_enabled(): returns the static key test result
 *
 * @proto/@args are the caller-visible prototype and arguments;
 * @data_proto/@data_args are the versions handed to the probes.
 */
#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
        extern struct tracepoint __tracepoint_##name;          		\
        static inline void trace_##name(proto)                          	\
        {                                                               			\
                if (static_key_false(&__tracepoint_##name.key))         	\
                        __DO_TRACE(&__tracepoint_##name,                	\
                                TP_PROTO(data_proto),                   		\
                                TP_ARGS(data_args),                     		\
                                TP_CONDITION(cond),,);                  	\
                if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {         	\
                        rcu_read_lock_sched_notrace();                  	\
                        rcu_dereference_sched(__tracepoint_##name.funcs);\
                        rcu_read_unlock_sched_notrace();                	\
                }                                                       			\
        }                                                               			\
        __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \
                PARAMS(cond), PARAMS(data_proto),PARAMS(data_args)) \
        static inline int                                               		\
        register_trace_##name(void (*probe)(data_proto), void *data)	\
        {                                                               			\
                return tracepoint_probe_register(&__tracepoint_##name, \
                                                (void *)probe, data);   		\
        }                                                               			\
        static inline int                                               		\
        unregister_trace_##name(void (*probe)(data_proto), void *data)  \
        {                                                               			\
                return tracepoint_probe_unregister(&__tracepoint_##name,\
                                                (void *)probe, data);   		\
        }                                                               			\
        static inline void                                              		\
        check_trace_callback_type_##name(void (*cb)(data_proto)) 	\
        {                                                               			\
        }                                                               			\
        static inline bool                                              		\
        trace_##name##_enabled(void)                                    	\
        {                                                               			\
                return static_key_false(&__tracepoint_##name.key);  	\
        }

CONFIG_TRACEPOINTS 옵션을 사용하여 커널을 빌드 시 위와 같은 함수로 동작하고 그렇지 않은 경우 빈함수가 만들어지고 에러를 리턴한다.

/*
 * Call every probe attached to tracepoint @tp.
 *
 * If @cond is false the macro executes "return", i.e. it returns from the
 * function it is expanded in. Otherwise it runs @prercu, enters a sched-RCU
 * read section, fetches (tp)->funcs and, when that is non-NULL, calls each
 * entry's func cast to "void (*)(proto)" with @args until it reaches an
 * entry whose func is NULL. It then leaves the read section and runs
 * @postrcu.
 *
 * The local __data is loaded from each entry before the call so that an
 * @args list naming __data passes the per-probe data pointer.
 */
#define __DO_TRACE(tp, proto, args, cond, prercu, postrcu)      	\
        do {                                                            		\
                struct tracepoint_func *it_func_ptr;                    		\
                void *it_func;                                          		\
                void *__data;                                           		\
                                                                        			\
                if (!(cond))                                            		\
                        return;                                         		\
                prercu;                                                 		\
                rcu_read_lock_sched_notrace();                          	\
                it_func_ptr = rcu_dereference_sched((tp)->funcs);       	\
                if (it_func_ptr) {                                      		\
                        do {                                            		\
                                it_func = (it_func_ptr)->func;          		\
                                __data = (it_func_ptr)->data;           		\
                                ((void(*)(proto))(it_func))(args);      		\
                        } while ((++it_func_ptr)->func);                		\
                }                                                       			\
                rcu_read_unlock_sched_notrace();                        	\
                postrcu;                                                		\
        } while (0)

이벤트 분석

# cat events/sched/enable
0

# echo 1 > events/sched/enable
# cat set_event
sched:sched_wake_idle_without_ipi
sched:sched_swap_numa
sched:sched_stick_numa
sched:sched_move_numa
sched:sched_process_hang
sched:sched_pi_setprio
sched:sched_stat_runtime
sched:sched_stat_blocked
sched:sched_stat_iowait
sched:sched_stat_sleep
sched:sched_stat_wait
sched:sched_process_exec
sched:sched_process_fork
sched:sched_process_wait
sched:sched_wait_task
sched:sched_process_exit
sched:sched_process_free
sched:sched_migrate_task
sched:sched_switch
sched:sched_wakeup_new
sched:sched_wakeup
sched:sched_kthread_stop_ret
sched:sched_kthread_stop

# ls events/sched/
enable                  sched_process_exec  sched_stat_iowait   sched_wait_task
filter                  sched_process_exit  sched_stat_runtime  sched_wake_idle_without_ipi
sched_kthread_stop      sched_process_fork  sched_stat_sleep    sched_wakeup
sched_kthread_stop_ret  sched_process_free  sched_stat_wait     sched_wakeup_new
sched_migrate_task      sched_process_hang  sched_stick_numa
sched_move_numa         sched_process_wait  sched_swap_numa
sched_pi_setprio        sched_stat_blocked  sched_switch

사용 가능한 이벤트 조회

# cat /sys/kernel/debug/tracing/available_events
mac80211:drv_return_void
mac80211:drv_return_int
(...)
irq_vectors:thermal_apic_exit
irq_vectors:thermal_apic_entry
irq_vectors:threshold_apic_exit
irq_vectors:threshold_apic_entry
irq_vectors:call_function_single_exit
irq_vectors:call_function_single_entry
irq_vectors:call_function_exit
irq_vectors:call_function_entry
irq_vectors:irq_work_exit
irq_vectors:irq_work_entry
irq_vectors:x86_platform_ipi_exit
irq_vectors:x86_platform_ipi_entry
irq_vectors:error_apic_exit
irq_vectors:error_apic_entry
irq_vectors:spurious_apic_exit
irq_vectors:spurious_apic_entry
irq_vectors:reschedule_exit
irq_vectors:reschedule_entry
irq_vectors:local_timer_exit
irq_vectors:local_timer_entry
nmi:nmi_handler
syscalls:sys_exit_mmap
syscalls:sys_enter_mmap
vsyscall:emulate_vsyscall
raw_syscalls:sys_exit
raw_syscalls:sys_enter
mce:mce_record
tlb:tlb_flush
exceptions:page_fault_kernel
exceptions:page_fault_user
syscalls:sys_exit_unshare
syscalls:sys_enter_unshare
syscalls:sys_exit_set_tid_address
syscalls:sys_enter_set_tid_address
task:task_rename
task:task_newtask
(...)

참고

Kernel Tracing

2015-12-162018-01-30 문영일 Leave a comment

Tools

Trace Tools

SystemTap
LTTng
GDB
ktap
dtrace4linux
Oracle Linux Dtrace
sysdig

Trace frameworks

Ftrace
perf_events (perf)
eBPF

Trace source

tracepoints – kernel static tracing
kprobes – kernel dynamic tracing
uprobes – user-level dynamic tracing

Trace history

초기: Static tracers, prototype dynamic tracers
2004: Linux kprobes (2.6.9)
2005: Solaris DTrace (s10)
2008: Linux ftrace (2.6.27)
2009: Linux perf (2.6.31)
2009: Linux tracepoints (2.6.32)
2010~2014: ftrace & perf_events enhancements
2014: eBPF patches
(참고: Brendan Gregg – Linux Performance Analysis: New Tools and Old Secrets)

Kernel Option

Kernel hacking  --->
	[*] Tracers  --->
		-*-   Kernel Function Tracer                     
		[*]     Kernel Function Graph Tracer             
		[*]   Interrupts-off Latency Tracer              
		[ ]   Preemption-off Latency Tracer              
		[*]   Scheduling Latency Tracer                  
		[ ]   Trace syscalls                             
		-*-   Create a snapshot trace buffer             
		-*-     Allow snapshot to swap per CPU            
   	            Branch Profiling (No branch profiling)  --->
		[*]   Trace max stack                            
		[*]   Support for tracing block IO actions       
		[ ]   Enable kprobes-based dynamic events        
		[ ]   Enable uprobes-based dynamic events        
		[*]   enable/disable function tracing dynamically
		[*]   Kernel function profiler                  
		[ ]   Perform a startup test on ftrace          
		[ ]   Add tracepoint that benchmarks tracepoints
		< >   Ring buffer benchmark stress tester
		[ ]   Ring buffer startup self test

Dependency

프로파일링: 커널 빌드 시 gcc의 -pg 옵션을 사용하여 추가적인 프로파일링 코드를 생성하고 이를 통해 함수들의 분석을 돕는다.
mount -t debugfs none /sys/kernel/debug
CONFIG_DEBUG_FS
CONFIG_FUNCTION_PROFILER
CONFIG_FTRACE
CONFIG_KPROBES (dynamic kernel tracing)

Visual Tools

KernelShark

Trace Compass

visual_tools2

perf CPU Flame Graph

visual_tools3

perf Block I/O Latency Heat Map

visual_tools4

LTTviewer

visual_tools5

참고

Function Tracer Technology
Kernel Trace Systems
Linux Profiling tools and techniques | pixelbeat.org
NETFLIX – Linux Profiling at Netflix, Linux Performance Tools – 다운로드 pdf
Brendan Gregg – Linux Performance Analysis and Tools – 다운로드 pdf
Linux Kernel Debugging And Profiling Tools | Adeneo Embedded – 다운로드 pdf
Linux Systems Performance (2016) | NETFLIX – 다운로드 pdf
Velocity 2015 linux perf tools (2015) | Brendan Gregg – Slideshare

Kernel Debugging

2015-12-162016-01-04 문영일 Leave a comment

kernel menuconfig

Kernel hacking
    printk and dmesg options  --->
    Compile-time checks and compiler options  --->
-*- Magic SysRq key
(0x1) Enable magic SysRq key functions by default
-*- Kernel debugging
    Memory Debugging  --->
[ ] Debug shared IRQ handlers
    Debug Lockups and Hangs  --->
[ ] Panic on Oops
(0) panic timeout
[*] Collect scheduler debugging info
[ ] Collect scheduler statistics
[ ] Detect stack corruption on calls to schedule()
[*] Collect kernel timers statistics
[ ] Debug preemptible kernel
    Lock Debugging (spinlocks, mutexes, etc...)  --->
-*- Stack backtrace support
[ ] kobject debugging
[*] Verbose BUG() reporting (adds 70K)
[ ] Debug linked list manipulation
[ ] Debug priority linked list manipulation
[ ] Debug SG table operations
[ ] Debug notifier call chains
[ ] Debug credential management
    RCU Debugging  --->
[ ] Force extended block device numbers and spread them
< > Notifier error injection
[ ] Fault-injection framework
[*] Tracers  --->
    Runtime Testing  --->
[ ] Enable debugging of DMA-API usage
< > Test module loading with 'hello world' module
< > Test user/kernel boundary protections
< > Test BPF filter functionality
< > Test firmware loading via userspace interface
< > udelay test driver
[ ] Sample kernel code  ----
[*] KGDB: kernel debugger  --->
[ ] Export kernel pagetable layout to userspace via debugfs
[ ] Filter access to /dev/mem
[*] Enable stack unwinding support (EXPERIMENTAL)
[ ] Verbose user fault messages
[ ] Kernel low-level debugging functions (read help!)
< > Kprobes test module
[ ] Write the current PID to the CONTEXTIDR register
[ ] Set loadable kernel module data as NX and text as RO
[ ] CoreSight Tracing Support  ----

Debug File System (CONFIG_DEBUG_FS)

커널 메뉴 설정에서 아래가 체크되어야 한다.
Kernel hacking --->
→ Compile-time checks and compiler options --->
→ [*] Debug filesystem
Debugfs 는 마운트되어야 사용할 수 있다.
(mount -t debugfs none /sys/kernel/debug)
/sys/kernel/debug/debug_objects/stats을 통해 통계 정보를 알 수 있다.

참고

debug_objects_early_init()

2015-12-162019-03-14 문영일 Leave a comment

Debug Objects

커널에서 사용하는 객체를 트래킹하기 위해 별도로 Debug Object를 할당하고 상태 값을 기록해두어 life time을 트래킹할 수 있도록 한다.

객체를 커널에서 할당하여 사용 시 종종 다음과 같은 실수를 반복한다.

사용중인 객체의 할당 해제
사용중인 객체의 재초기화

Debug Object를 사용하면 다음을 수행할 때마다 트래킹한다.

객체 초기화
객체 추가
객체 삭제

참고: infrastructure to debug (dynamic) objects

특징

Debug Object 코드를 사용하려면 다음 커널 옵션을 설정해야 한다.
- CONFIG_DEBUG_KERNEL
- CONFIG_DEBUG_OBJECTS
Debug Object 기능을 enable 하기 위해서는 부트업 타임에 다음을 준비하여야 한다.
- CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT 커널 옵션을 사용하거나
- “debug_objects=1” 커널 파라메터를 사용한다.
문제가 발생하여 로그를 출력할 때 최대 5번까지로 제한한다.
/sys/kernel/debug/debug_objects 디렉토리에서 트래킹을 사용한다.
- 예) cat stats

적용된 커널 소스

타이머
- CONFIG_DEBUG_OBJECTS_TIMERS 커널 옵션 필요
워크큐
- CONFIG_DEBUG_OBJECTS_WORK 커널 옵션 필요
RCU
- CONFIG_DEBUG_OBJECTS_RCU_HEAD 커널 옵션 필요
per-cpu 카운터
- CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER 커널 옵션 필요
kfree() & vfree()
- CONFIG_DEBUG_OBJECTS_FREE 옵션 필요
- 오브젝트의 deactivation 과정을 감시할 수 있다. (leak 감시)

Debug Object 상태 전환

다음은 Debug Object의 상태들이다.

none
- 객체를 할당 받지 않은 상태이다.
- Debug Object가 처음 할당되어 객체 풀에서 대기한다.
init
- 객체를 할당 받은 초기 상태이다.
- Debug Object는 해시 리스트에서 관리된다.
active
- 객체에 접근 가능한 상태이다.
- Debug Object는 해시 리스트에서 관리된다.
inactive
- 객체에 접근을 허용하지 않은 상태이다.
- Debug Object는 해시 리스트에서 관리된다.
destroyed
- 객체가 파괴된 상태이다.
- Debug Object는 해시 리스트에서 관리된다.
notavailable
- 객체가 할당 해제된 상태이다.
- Debug Object는 재사용을 위해 객체 풀에서 대기한다.

Debug Object의 상태가 바뀌는 과정을 보여준다.

Debug Object 초기화

Debug Object를 사용하기 위해 커널은 다음과 같이 두 단계에 걸쳐 초기화를 수행한다.

부트업 초반에 debug_objects_early_init()를 통해 early 초기화
- 처음 슬랩을 사용하는 kmem 캐시를 사용하기 전까지 임시로 사용할 static debug object를 사용한다.
부트업 후반에 debug_objects_mem_init()을 통해 정규 초기화
- 슬랩을 사용하는 kmem 캐시가 준비된 후 기존 static debug object를 모두 kmem 캐시에서 할당한 객체로 migration 한다.

다음 그림은 debug object의 2 단계 초기화를 보여준다.

Debug Object 주요 API

Debug Object의 주요 API는 다음과 같다.

debug_object_init()
debug_object_init_on_stack()
debug_object_activate()
debug_object_deactivate()
debug_object_destroy()
debug_object_free()

Debug Object Life-time

Debug object의 할당은 아래 두 함수에서 요청되고 Debug Pool에서 할당해준다. Debug Pool의 Debug Object가 최소 레벨(default: 256) 보다 부족해지는 경우 kmem 캐시를 통해 refill 한다.

debug_object_init()
debug_object_init_on_stack()

Debug Object의 할당 해제는 아래 함수에서 요청되고 Debug Pool로 이동시킨다. 만일 Debug Pool의 Debug Object가 pool size(default: 1024)를 초과하는 경우 할당 해제하여 kmem 캐시로 돌려보낸다.

debug_object_free()

초기화

debug_objects_early_init()

lib/debugobjects.c

/*
 * Called during early boot to initialize the hash buckets and link
 * the static object pool objects into the pool list. After this call
 * the object tracker is fully operational.
 *
 * Takes no arguments and returns nothing; it only touches obj_hash[],
 * obj_static_pool[] and obj_pool, which are defined outside this function.
 */

void __init debug_objects_early_init(void)
{
        int i;

        for (i = 0; i < ODEBUG_HASH_SIZE; i++) /* one raw spinlock per hash bucket */
                raw_spin_lock_init(&obj_hash[i].lock);

        for (i = 0; i < ODEBUG_POOL_SIZE; i++) /* put every static object on the obj_pool list */
                hlist_add_head(&obj_static_pool[i].node, &obj_pool);
}

kmem 캐시가 준비가되기 직전의 커널 부트업에서 사용할 Debug Object 사용을 위해 초기화를 수행한다. 커널 빌드 시 준비해 둔 1024개의 static debug object를 임시로 사용한다.

코드 라인 5~6에서 Debug Object 해시를 초기화한다.
- ODEBUG_HASH_SIZE=16K
코드 라인 8~9에서 Debug Object 풀을 초기화한다.
- ODEBUG_POOL_SIZE=1024
  - s390 아키텍처를 위해 512 -> 1024로 증가시켰다.

debug_objects_mem_init()

lib/debugobjects.c

/*
 * Called after the kmem_caches are functional to setup a dedicated
 * cache pool, which has the SLAB_DEBUG_OBJECTS flag set. This flag
 * prevents that the debug code is called on kmem_cache_free() for the
 * debug tracker objects to avoid recursive calls.
 *
 * Returns immediately when debug_objects_enabled is zero. If the cache
 * cannot be created or debug_objects_replace_static_objects() returns
 * non-zero, tracking is switched off and a warning is printed; otherwise
 * the self test runs. In both of those cases the pool thresholds are then
 * raised in proportion to num_possible_cpus().
 */

void __init debug_objects_mem_init(void)
{
        if (!debug_objects_enabled)     /* tracking disabled: nothing to set up */
                return;

        obj_cache = kmem_cache_create("debug_objects_cache",
                                      sizeof (struct debug_obj), 0,
                                      SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE,
                                      NULL);

        if (!obj_cache || debug_objects_replace_static_objects()) {
                debug_objects_enabled = 0;      /* give up on object tracking */
                kmem_cache_destroy(obj_cache);  /* obj_cache is NULL here when creation itself failed */
                pr_warn("out of memory.\n");
        } else
                debug_objects_selftest();

        /*
         * Increase the thresholds for allocating and freeing objects
         * according to the number of possible CPUs available in the system.
         */
        debug_objects_pool_size += num_possible_cpus() * 32;
        debug_objects_pool_min_level += num_possible_cpus() * 4;
}

Debug Object 기능이 enable된 경우 사용을 위해 kmem 캐시등을 준비하고 기존 static debug object를 kmem 캐시에서 할당받은 debug object로 migration 한다. 그 후 사용할 pool 사이즈와 최소 개수를 cpu 수에 맞게 적절히 조절한다.

코드 라인 3~4에서 Debug Object 기능을 enable 하지 않은 경우 함수를 빠져나간다.
코드 라인 6~9에서 debug_obj 구조체를 위해 kmem 캐시를 준비하고
코드 라인 11~16에서 기존 static debug object를 kmem 캐시에서 할당받은 debug object로 migration 한다.
코드 라인 22에서 pool size를 possible cpu 개수 * 32 만큼 증가시킨다.
코드 라인 23에서 pool 최소 레벨도 possible cpu 개수 * 4 만큼 증가시킨다.

주요 함수들

debug_object_init()

lib/debugobjects.c

/**
 * debug_object_init - debug checks when an object is initialized
 * @addr:       address of the object
 * @descr:      pointer to an object specific debug description structure
 *
 * Does nothing when debug_objects_enabled is zero; otherwise forwards to
 * __debug_object_init() with onstack set to 0.
 */

void debug_object_init(void *addr, struct debug_obj_descr *descr)
{
        if (!debug_objects_enabled)     /* tracking is switched off */
                return;

        __debug_object_init(addr, descr, 0);    /* 0: caller says the object is not on the stack */
}
EXPORT_SYMBOL_GPL(debug_object_init);

객체를 초기화 시 디버그 체크를 수행한다.

/**
 * debug_object_init_on_stack - debug checks when an object on stack is
 *                              initialized
 * @addr:       address of the object
 * @descr:      pointer to an object specific debug description structure
 *
 * Does nothing when debug_objects_enabled is zero; otherwise forwards to
 * __debug_object_init() with onstack set to 1.
 */

void debug_object_init_on_stack(void *addr, struct debug_obj_descr *descr)
{
        if (!debug_objects_enabled)     /* tracking is switched off */
                return;

        __debug_object_init(addr, descr, 1);    /* 1: caller says the object lives on the stack */
}
EXPORT_SYMBOL_GPL(debug_object_init_on_stack);

스택위에 있는 객체를 초기화 시 디버그 체크를 수행한다.

기존 상태가 active 및 destroyed에서 진입한 경우 에러 메시지를 출력한다.

__debug_object_init()

lib/debugobjects.c

/*
 * __debug_object_init - common body of debug_object_init() and
 *                       debug_object_init_on_stack()
 * @addr:       address of the object
 * @descr:      pointer to an object specific debug description structure
 * @onstack:    0 or 1 from the two wrappers; passed on to
 *              debug_object_is_on_stack()
 *
 * Looks the object up in its hash bucket and allocates a tracking object
 * when none exists; if that allocation fails, tracking is disabled and
 * debug_objects_oom() is called. A tracked object in state NONE, INIT or
 * INACTIVE is set to INIT. An ACTIVE object is reported and passed to the
 * descriptor's fixup_init callback; a DESTROYED object is only reported.
 *
 * debug_objects_enabled is not tested here; both wrappers test it first.
 */
static void
__debug_object_init(void *addr, struct debug_obj_descr *descr, int onstack)
{
        enum debug_obj_state state;
        struct debug_bucket *db;
        struct debug_obj *obj;
        unsigned long flags;

        fill_pool();    /* NOTE(review): presumably tops up the free-object pool - confirm in fill_pool() */

        db = get_bucket((unsigned long) addr);

        raw_spin_lock_irqsave(&db->lock, flags);

        obj = lookup_object(addr, db);
        if (!obj) {     /* address not tracked yet */
                obj = alloc_object(addr, db, descr);
                if (!obj) {     /* no tracking object available */
                        debug_objects_enabled = 0;
                        raw_spin_unlock_irqrestore(&db->lock, flags);
                        debug_objects_oom();    /* called after the bucket lock is dropped */
                        return;
                }
                debug_object_is_on_stack(addr, onstack);        /* only reached for newly allocated tracking objects */
        }

        switch (obj->state) {
        case ODEBUG_STATE_NONE:
        case ODEBUG_STATE_INIT:
        case ODEBUG_STATE_INACTIVE:
                obj->state = ODEBUG_STATE_INIT; /* legal (re)initialization */
                break;

        case ODEBUG_STATE_ACTIVE:       /* initializing an active object */
                debug_print_object(obj, "init");
                state = obj->state;     /* copy the state while the lock is still held */
                raw_spin_unlock_irqrestore(&db->lock, flags);
                debug_object_fixup(descr->fixup_init, addr, state);     /* fixup runs without the bucket lock */
                return;

        case ODEBUG_STATE_DESTROYED:    /* reported only; the state is left unchanged */
                debug_print_object(obj, "init");
                break;
        default:
                break;
        }

        raw_spin_unlock_irqrestore(&db->lock, flags);
}

객체를 초기화 시 디버그 체크를 수행한다.

코드 라인 8에서 object pool에 준비되어 있는 오브젝트가 min level 이하인 경우 추가 할당을 해둔다.
코드 라인 10에서 객체 주소의 pfn을 이용한 해시를 통해 debug bucket을 가져온다.
코드 라인 12~24에서 debug bucket에 락을 걸고 객체 주소에 해당하는 Debug Object를 검색한다. 만일 존재하지 않는 경우 Debug Object를 할당한다.
코드 라인 26~45에서 Debug Object의 상태가 active나 destroyed이면 문제가 발생하였으므로 에러 메시지를 출력하고, active 상태인 경우는 추가 fixup 코드를 수행한다.
- 예) ODEBUG: assert_init not available (active state 0) object type: timer_list hint: stub_timer+0x0/0x20

다음 그림은 debug_object_init()을 수행할 때 debug_object의 이동 또는 추가 할당되는 모습을 보여준다.

debug_object_activate()

lib/debugobjects.c

/**
 * debug_object_activate - debug checks when an object is activated
 * @addr:       address of the object
 * @descr:      pointer to an object specific debug description structure
 * Returns 0 for success, -EINVAL for check failed.
 *
 * Returns 0 without any check when debug_objects_enabled is zero.
 * A tracked object in state INIT or INACTIVE becomes ACTIVE. An already
 * ACTIVE object is reported and handed to descr->fixup_activate; the result
 * is 0 when the fixup returns non-zero and -EINVAL otherwise. A DESTROYED
 * object is reported and yields -EINVAL; any other state yields 0.
 *
 * An untracked address is accepted when descr->is_static_object exists and
 * returns true for it: the object is then initialized and activated, and 0
 * is returned. Otherwise it is reported as NOTAVAILABLE and handed to
 * descr->fixup_activate as above.
 */

int debug_object_activate(void *addr, struct debug_obj_descr *descr)
{
        enum debug_obj_state state;
        struct debug_bucket *db;
        struct debug_obj *obj;
        unsigned long flags;
        int ret;
        struct debug_obj o = { .object = addr,
                               .state = ODEBUG_STATE_NOTAVAILABLE,
                               .descr = descr };        /* placeholder, only used to report an untracked object */

        if (!debug_objects_enabled)
                return 0;

        db = get_bucket((unsigned long) addr);

        raw_spin_lock_irqsave(&db->lock, flags);

        obj = lookup_object(addr, db);
        if (obj) {
                switch (obj->state) {
                case ODEBUG_STATE_INIT:
                case ODEBUG_STATE_INACTIVE:
                        obj->state = ODEBUG_STATE_ACTIVE;       /* legal transition */
                        ret = 0;
                        break;

                case ODEBUG_STATE_ACTIVE:       /* activating an already active object */
                        debug_print_object(obj, "activate");
                        state = obj->state;     /* copy the state while the lock is still held */
                        raw_spin_unlock_irqrestore(&db->lock, flags);
                        ret = debug_object_fixup(descr->fixup_activate, addr, state);
                        return ret ? 0 : -EINVAL;       /* non-zero fixup result counts as success */

                case ODEBUG_STATE_DESTROYED:
                        debug_print_object(obj, "activate");
                        ret = -EINVAL;
                        break;
                default:        /* remaining states: no change, no error */
                        ret = 0;
                        break;
                }
                raw_spin_unlock_irqrestore(&db->lock, flags);
                return ret;
        }

        raw_spin_unlock_irqrestore(&db->lock, flags);
        /*
         * We are here when a static object is activated. We
         * let the type specific code confirm whether this is
         * true or not. if true, we just make sure that the
         * static object is tracked in the object tracker. If
         * not, this must be a bug, so we try to fix it up.
         */
        if (descr->is_static_object && descr->is_static_object(addr)) {
                /* track this static object */
                debug_object_init(addr, descr);
                debug_object_activate(addr, descr);     /* recursive call; its return value is not checked */
        } else {
                debug_print_object(&o, "activate");
                ret = debug_object_fixup(descr->fixup_activate, addr,
                                        ODEBUG_STATE_NOTAVAILABLE);
                return ret ? 0 : -EINVAL;
        }
        return 0;
}
EXPORT_SYMBOL_GPL(debug_object_activate);

객체를 활성화 시 디버그 체크를 수행한다.

코드 라인 12~13에서 Debug Object 기능을 enable 하지 않은 경우 함수를 빠져나간다.
코드 라인 15에서 객체 주소의 pfn을 이용한 해시를 통해 debug bucket을 가져온다.
코드 라인 17~19에서 debug bucket에 락을 걸고 객체 주소에 해당하는 Debug Object를 검색한다.
코드 라인 20~45에서 Debug Object의 상태가 이미 active 이거나 destroyed 이면 문제가 발생하였으므로 에러 메시지를 출력하고, active 상태인 경우는 추가 fixup 코드를 수행한다.
코드 라인 55~64에서 Debug Object 검색이 안된 경우이다. Debug Object의 디스크립터가 static 객체인경우에만 Debug Object를 새로 초기화하고 activate상태로 변경하여 static 객체를 트래킹하게 한다. 그렇지 않고 dynamic 객체인 경우 에러 메시지를 출력하고 fixup 코드를 수행한다.

debug_object_deactivate()

lib/debugobjects.c

/**
 * debug_object_deactivate - debug checks when an object is deactivated
 * @addr:       address of the object
 * @descr:      pointer to an object specific debug description structure
 *
 * Does nothing when debug_objects_enabled is zero. A tracked object in
 * state INIT, INACTIVE or ACTIVE is set to INACTIVE when its astate field
 * is zero; a non-zero astate is reported instead. A DESTROYED object and
 * an untracked address are reported as well. No fixup callback is invoked.
 */

void debug_object_deactivate(void *addr, struct debug_obj_descr *descr)
{
        struct debug_bucket *db;
        struct debug_obj *obj;
        unsigned long flags;

        if (!debug_objects_enabled)
                return;

        db = get_bucket((unsigned long) addr);

        raw_spin_lock_irqsave(&db->lock, flags);

        obj = lookup_object(addr, db);
        if (obj) {
                switch (obj->state) {
                case ODEBUG_STATE_INIT:
                case ODEBUG_STATE_INACTIVE:
                case ODEBUG_STATE_ACTIVE:
                        if (!obj->astate)       /* NOTE(review): astate looks like an extra per-object state - confirm its meaning */
                                obj->state = ODEBUG_STATE_INACTIVE;
                        else
                                debug_print_object(obj, "deactivate");
                        break;

                case ODEBUG_STATE_DESTROYED:    /* reported only; the state is left unchanged */
                        debug_print_object(obj, "deactivate");
                        break;
                default:
                        break;
                }
        } else {        /* address is not tracked */
                struct debug_obj o = { .object = addr,
                                       .state = ODEBUG_STATE_NOTAVAILABLE,
                                       .descr = descr };        /* placeholder used only for the report */

                debug_print_object(&o, "deactivate");
        }

        raw_spin_unlock_irqrestore(&db->lock, flags);
}
EXPORT_SYMBOL_GPL(debug_object_deactivate);

객체를 비활성화 시 디버그 체크를 수행한다.

코드 라인 7~8에서 Debug Object 기능을 enable 하지 않은 경우 함수를 빠져나간다.
코드 라인 10에서 객체 주소의 pfn을 이용한 해시를 통해 debug bucket을 가져온다.
코드 라인 12~14에서 debug bucket에 락을 걸고 객체 주소에 해당하는 Debug Object를 검색한다.
코드 라인 15~31에서 Debug Object의 상태가 init, inactive, active 중 하나인 경우 astate가 0이면 inactive 상태로 변경하고, 0이 아니면 에러 메시지를 출력한다. destroyed 상태인 경우에도 문제가 발생하였으므로 에러 메시지를 출력한다.
코드 라인 32~38에서 Debug Object 검색이 안된 경우이다. 이 경우에도 에러 메시지를 출력한다.

debug_object_destroy()

lib/debugobjects.c

/**
 * debug_object_destroy - debug checks when an object is destroyed
 * @addr:       address of the object
 * @descr:      pointer to an object specific debug description structure
 */

void debug_object_destroy(void *addr, struct debug_obj_descr *descr)
{
        enum debug_obj_state state;
        struct debug_bucket *db;
        struct debug_obj *obj;
        unsigned long flags;

        if (!debug_objects_enabled)     /* tracking disabled: nothing to check */
                return;

        db = get_bucket((unsigned long) addr);  /* bucket selected from the object address */

        raw_spin_lock_irqsave(&db->lock, flags);

        obj = lookup_object(addr, db);
        if (!obj)       /* address is not tracked: just drop the lock */
                goto out_unlock;

        switch (obj->state) {
        case ODEBUG_STATE_NONE:
        case ODEBUG_STATE_INIT:
        case ODEBUG_STATE_INACTIVE:     /* these states simply become destroyed */
                obj->state = ODEBUG_STATE_DESTROYED;
                break;
        case ODEBUG_STATE_ACTIVE:       /* destroying a still-active object: report, then fix up */
                debug_print_object(obj, "destroy");
                state = obj->state;     /* snapshot the state before the lock is dropped */
                raw_spin_unlock_irqrestore(&db->lock, flags);   /* fixup runs with the bucket lock released */
                debug_object_fixup(descr->fixup_destroy, addr, state);
                return;

        case ODEBUG_STATE_DESTROYED:    /* already destroyed: report only */
                debug_print_object(obj, "destroy");
                break;
        default:
                break;
        }
out_unlock:
        raw_spin_unlock_irqrestore(&db->lock, flags);
}
EXPORT_SYMBOL_GPL(debug_object_destroy);

객체를 소멸 시 디버그 체크를 수행한다.

코드 라인 8~9에서 Debug Object 기능을 enable 하지 않은 경우 함수를 빠져나간다.
코드 라인 11에서 객체 주소의 pfn을 이용한 해시를 통해 debug bucket을 가져온다.
코드 라인 13~17에서 debug bucket에 락을 걸고 객체 주소에 해당하는 Debug Object를 검색한다.
코드 라인 19~37에서 Debug Object의 상태가 none, init 또는 inactive 상태인 경우 destroyed 상태로 변경한다. active 또는 destroyed 상태인 경우 문제가 발생하였으므로 에러 메시지를 출력하고, active 상태인 경우 fixup 코드도 수행한다.

debug_object_free()

lib/debugobjects.c

/**
 * debug_object_free - debug checks when an object is freed
 * @addr:       address of the object
 * @descr:      pointer to an object specific debug description structure
 */

void debug_object_free(void *addr, struct debug_obj_descr *descr)
{
        enum debug_obj_state state;
        struct debug_bucket *db;
        struct debug_obj *obj;
        unsigned long flags;

        if (!debug_objects_enabled)     /* tracking disabled: nothing to check */
                return;

        db = get_bucket((unsigned long) addr);  /* bucket selected from the object address */

        raw_spin_lock_irqsave(&db->lock, flags);

        obj = lookup_object(addr, db);
        if (!obj)       /* address is not tracked: just drop the lock */
                goto out_unlock;

        switch (obj->state) {
        case ODEBUG_STATE_ACTIVE:       /* freeing a still-active object: report, then fix up */
                debug_print_object(obj, "free");
                state = obj->state;     /* snapshot the state before the lock is dropped */
                raw_spin_unlock_irqrestore(&db->lock, flags);   /* fixup runs with the bucket lock released */
                debug_object_fixup(descr->fixup_free, addr, state);
                return;
        default:        /* any other state: stop tracking the object */
                hlist_del(&obj->node);  /* unlink from the bucket list while the lock is held */
                raw_spin_unlock_irqrestore(&db->lock, flags);
                free_object(obj);       /* called after the bucket lock is released */
                return;
        }
out_unlock:     /* reached only through the goto above; both switch arms return */
        raw_spin_unlock_irqrestore(&db->lock, flags);
}
EXPORT_SYMBOL_GPL(debug_object_free);

객체의 할당 해제 시 디버그 체크를 수행한다.

코드 라인 8~9에서 Debug Object 기능을 enable 하지 않은 경우 함수를 빠져나간다.
코드 라인 11에서 객체 주소의 pfn을 이용한 해시를 통해 debug bucket을 가져온다.
코드 라인 13~17에서 debug bucket에 락을 걸고 객체 주소에 해당하는 Debug Object를 검색한다.
코드 라인 19~31에서 Debug Object의 상태가 active 상태인 경우 문제가 발생하였으므로 에러 메시지를 출력하고, fixup 코드도 수행한다. 그 외의 상태인 경우 해시 리스트에서 Debug Object를 제거한 후 할당 해제한다.

다음 그림은 debug_object_free() 함수 호출 시 Debug object가 할당 해제되어 객체 풀로 되돌아가거나 kmem 캐시로 회수되는 모습을 보여준다.

Fixup Operations

아래 API들을 사용할 때 이미 Debug Object가 active 상태인 경우 에러가 발생하고, 디스크립터에 구현된 fixup 후크 함수를 호출한다.

debug_object_init() -> active 상태를 만나면 fixup 후크 함수를 호출한다.
debug_object_activate() -> active 상태를 만나면 fixup 후크 함수를 호출한다.
debug_object_deactivate() -> 없음
debug_object_destroy() -> active 상태를 만나면 fixup 후크 함수를 호출한다.
debug_object_free() -> active 상태를 만나면 fixup 후크 함수를 호출한다.

다음 그림은 Debug Object를 deactivate 상태 변환 없이 destroy 상태로 변경할 때 fixup 후크 함수가 호출되는 모습을 보여준다.

구조체

debug_object 구조체

include/linux/debugobjects.h

/**
 * struct debug_obj - representation of a tracked object
 * @node:       hlist node to link the object into the tracker list
 * @state:      tracked object state
 * @astate:     current active state
 * @object:     pointer to the real object
 * @descr:      pointer to an object type specific debug description structure
 */

struct debug_obj {
        struct hlist_node       node;
        enum debug_obj_state    state;
        unsigned int            astate;
        void                    *object;
        struct debug_obj_descr  *descr;
};

node
- 객체 pool이나 해시리스트에 연결될 때 사용되는 노드이다.
state
- 트래킹할 객체 상태 값이다.
astate
- 현재 active 상태 값을 나타낸다. (초기화 시 0)
- debug_object_active_state() API를 통해서 설정되며, 현재는 특정 gpu 드라이버에서만 사용되고 있다.
object
- 실제 객체 주소
descr
- debug object descriptor를 가리킨다.

debug_obj_descr 구조체

include/linux/debugobjects.h

/**
 * struct debug_obj_descr - object type specific debug description structure
 *
 * @name:               name of the object type
 * @debug_hint:         function returning an address that has an associated
 *                      kernel symbol, to help identify the object
 * @is_static_object:   return true if the obj is static, otherwise return false
 * @fixup_init:         fixup function, which is called when the init check
 *                      fails. All fixup functions must return true if fixup
 *                      was successful, otherwise return false
 * @fixup_activate:     fixup function, which is called when the activate check
 *                      fails
 * @fixup_destroy:      fixup function, which is called when the destroy check
 *                      fails
 * @fixup_free:         fixup function, which is called when the free check
 *                      fails
 * @fixup_assert_init:  fixup function, which is called when the assert_init
 *                      check fails
 */

struct debug_obj_descr {
        const char              *name;
        void *(*debug_hint)(void *addr);
        bool (*is_static_object)(void *addr);
        bool (*fixup_init)(void *addr, enum debug_obj_state state);
        bool (*fixup_activate)(void *addr, enum debug_obj_state state);
        bool (*fixup_destroy)(void *addr, enum debug_obj_state state);
        bool (*fixup_free)(void *addr, enum debug_obj_state state);
        bool (*fixup_assert_init)(void *addr, enum debug_obj_state state);
};

Debug Object 디스크립터에는 이름과 몇 개의 fixup용 후크 함수들이 지정된다.

*name
- 디스크립터를 설명하고 보여줄 수 있는 이름
(*is_static_object)
- 객체가 static 상태인지 여부를 판단할 수 있는 후크 함수와 연결된다.
  - 예) timer_is_static_object(), work_is_static_object(), rcuhead_is_static_object()
(*debug_hint)
- 에러 메시지를 출력할 때 사용 위치를 보여주는 기능이다.
- 사용 위치가 커널에 심볼로 export 되어있는 함수의 경우 다음 사용예와 같이 함수 명과 함수에서의 Debug Object를 사용한 상대 위치가 표시된다.
  - 예) ODEBUG: free active (active state 0) object type: timer_list hint: process_timeout+0x0/0x10

debug_bucket 구조체

lib/debugobjects.c

struct debug_bucket {
        struct hlist_head       list;   /* head of the list of debug objects in this bucket */
        raw_spinlock_t          lock;   /* taken around lookups and list changes on this bucket */
};

static struct debug_bucket      obj_hash[ODEBUG_HASH_SIZE];     /* table of ODEBUG_HASH_SIZE buckets */

list
- 사용 중 상태의 Debug Object가 연결될 리스트이다.
lock
- 위의 list 추가 삭제 시 필요한 lock이다.

obj_hash[] 배열

lib/debugobjects.c

static struct debug_bucket      obj_hash[ODEBUG_HASH_SIZE];     /* table of ODEBUG_HASH_SIZE buckets */

사용 중 상태의 Debug Object가 모여 있는 해시리스트이다.

obj_static_pool[] 배열

lib/debugobjects.c

static struct debug_obj         obj_static_pool[ODEBUG_POOL_SIZE] __initdata;   /* statically allocated pool; marked __initdata */

부트 업 과정 중 kmem 캐시가 활성화 되기 전까지 Debug Object를 사용해야 하는 경우 static pool을 활용한다.

부트업이 완료되면 사용하지 않는다.

참고

Kernel Debugging | 문c
The object-lifetime debugging infrastructure | Kernel.org

smp_setup_processor_id()

2015-12-162021-09-13 문영일 Leave a comment

리눅스는 물리 cpu 번호(id)를 사용하지 않고 로지컬 cpu 번호(id)를 사용하여 관리한다. 현재 부트된 물리 cpu를 로지컬 cpu id 0번으로 배치하여 사용한다.

DTB를 사용하는 경우 setup_arch() -> arm_dt_init_cpu_maps() 함수에서 로지컬 cpu id가 재조정된다.

NR_CPUS

가능한 CPU의 최대 개수로 configuration 할 때 static하게 정해지는 값.
- nr_cpu_ids
  - 런타임에 사용하는 cpu의 수
  - 런타임에 설정되기 전까지는 NR_CPUS 값과 동일하다.
커널 버전에 따라 범위와 default 값이 약간씩 다르다.
커널 버전이 높아질 때마다 조금씩 증가되는 추세이다.
arm
- 2 ~ 32의 범위에 default는 4이다.
arm64
- 2 ~ 4096의 범위에 default는 256이다.
- 참고:
  - arm64: default NR_CPUS to 256 (2019, v5.1-rc1)
  - arm64: kconfig: increase NR_CPUS range to 2-4096 (2015, v4.1-rc1)
  - arm64: defconfig: increase NR_CPUS default to 64 (2014, v3.18-rc1)
x86_32
- 2 ~ 8의 범위에 default는 8이다.
- 추가로 X86_BIGSMP 설정을 사용하는 경우 2 ~ 32의 범위에 default는 32이다.
x86_64
- 2 ~ 64의 범위에 default는 64이다.
- 추가로 MAXSMP 설정을 사용하는 경우 2 ~ 8192 범위에 default는 8192이다.

smp_setup_processor_id() – ARM64

arch/arm64/kernel/setup.c

void __init smp_setup_processor_id(void)
{
        u64 mpidr = read_cpuid_mpidr() & MPIDR_HWID_BITMASK;    /* MPIDR value masked with MPIDR_HWID_BITMASK */
        set_cpu_logical_map(0, mpidr);                          /* record it as the entry for logical cpu 0 */

        /*
         * clear __my_cpu_offset on boot CPU to avoid hang caused by
         * using percpu variable early, for example, lockdep will
         * access percpu variable inside lock_release
         */
        set_my_cpu_offset(0);
        pr_info("Booting Linux on physical CPU 0x%010lx [0x%08x]\n",
                (unsigned long)mpidr, read_cpuid_id());         /* prints the masked MPIDR and read_cpuid_id() */
}

부트 cpu인 로지컬 cpu 0번에 대한 mpidr 값을 읽어 매핑한다. (처음 부팅 시 로지컬 cpu는 항상 0번이다.)

코드 라인 3~4에서 MPIDR의 Aff3, Aff2, Aff1, Aff0 필드들만을 가져오기 위해 비트마스킹을 하여 읽어온다. 그런 후 이를 로지컬 cpu 0번에 해당하는 cpu_logical_map에 저장한다.
- CPU Affinity는 여러 개의 CPU core 중에서 각각의 cpu가 가지는 고유 번호같은 것.
- Affinity는 계층적으로 표현된다.
  - 아래 별도 섹션에서 다룬다.
코드 라인 11에서 현재 cpu가 로지컬 cpu 0 이므로 per-cpu에서 사용할 현재 부트 cpu에 대한 offset을 0으로 설정한다.
코드 라인 12~13에서 부팅 cpu에 대한 mpidr 값과 midr 정보를 출력한다.
- 예) “Booting Linux on physical CPU 0x0000000000 [0x410fd083]“

다음 그림은 arm64 시스템에서 8개의 cpu가 있는 경우를 가정하여 위의 함수가 처음 동작한 경우이다.

arm64에서는 로지컬 cpu 0번에 해당하는 매핑만 설정해둔다.

__cpu_logical_map[] – ARM64

arch/arm64/kernel/setup.c

u64 __cpu_logical_map[NR_CPUS] = { [0 ... NR_CPUS-1] = INVALID_HWID};   /* every slot starts out as INVALID_HWID */

Designated Initializers 라고 불리는 배열 초기화 방법.
- 참고: Designated Initializers

MPIDR 관련 매크로 – ARM64

#define INVALID_HWID            ULONG_MAX

#define MPIDR_UP_BITMASK        (0x1 << 30)     /* bit 30 */
#define MPIDR_MT_BITMASK        (0x1 << 24)     /* bit 24 */
#define MPIDR_HWID_BITMASK      UL(0xff00ffffff)        /* bits [39:32] and [23:0] */

#define MPIDR_LEVEL_BITS_SHIFT  3
#define MPIDR_LEVEL_BITS        (1 << MPIDR_LEVEL_BITS_SHIFT)   /* 8 */
#define MPIDR_LEVEL_MASK        ((1 << MPIDR_LEVEL_BITS) - 1)   /* 0xff */

#define MPIDR_LEVEL_SHIFT(level) \
        (((1 << level) >> 1) << MPIDR_LEVEL_BITS_SHIFT) /* level 0, 1, 2, 3 -> shift 0, 8, 16, 32 */

#define MPIDR_AFFINITY_LEVEL(mpidr, level) \
        ((mpidr >> MPIDR_LEVEL_SHIFT(level)) & MPIDR_LEVEL_MASK)        /* the 8-bit field at that shift */

MPIDR_LEVEL_BITS
- 하나의 affinity level 값이 몇 비트로 이루어져 있는지 나타내고 8로 설정되어 있다 (1바이트).
MPIDR_LEVEL_MASK
- 0xff

다음은 2개의 cpu가 리틀 클러스터를 이루고, 4개의 cpu가 빅 클러스터를 가진 odroid-N2 시스템의 mpidr 및 midr 값들을 보여준다.

클러스터 id가 0과 1로 구분되고, cpu id가 0~3으로 구분됨을 알 수 있다.

   Booting Linux on physical CPU 0x0000000000 [0x410fd034]
CPU1: Booted secondary processor 0x0000000001 [0x410fd034]
CPU2: Booted secondary processor 0x0000000100 [0x410fd092]
CPU3: Booted secondary processor 0x0000000101 [0x410fd092]
CPU4: Booted secondary processor 0x0000000102 [0x410fd092]
CPU5: Booted secondary processor 0x0000000103 [0x410fd092]

smp_setup_processor_id() – ARM32

arch/arm/kernel/setup.c

void __init smp_setup_processor_id(void)
{
        int i;
        u32 mpidr = is_smp() ? read_cpuid_mpidr() & MPIDR_HWID_BITMASK : 0;     /* 0 when is_smp() is false */
        u32 cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);       /* affinity level 0 field of the masked MPIDR */

        cpu_logical_map(0) = cpu;       /* logical cpu 0 maps to this value */
        for (i = 1; i < nr_cpu_ids; ++i)
                cpu_logical_map(i) = i == cpu ? 0 : i;  /* identity mapping, except index 'cpu' which gets 0 */

        /*
         * clear __my_cpu_offset on boot CPU to avoid hang caused by
         * using percpu variable early, for example, lockdep will
         * access percpu variable inside lock_release
         */
        set_my_cpu_offset(0);

        pr_info("Booting Linux on physical CPU 0x%x\n", mpidr);
}

현재 로지컬 cpu 0번에 대한 물리 cpu 번호를 읽어 매핑한다. (처음 부팅 시 로지컬 cpu는 항상 0번이다.)

코드 라인 4~7에서 MPIDR의 하위 3바이트(Aff0, Aff1, Aff2)를 가져오기 위해 비트마스킹을 한 후 가장 단계가 낮은 affinity level 0번 값을 읽어온다. 읽어온 값은 물리 cpu 번호이며 이를 로지컬 cpu 0번에 해당하는 cpu_logical_map에 저장한다.
- CPU Affinity는 여러 개의 CPU core 중에서 각각의 cpu가 가지는 고유 번호같은 것.
- Affinity는 계층적으로 표현된다.
- x86의 하이퍼스레딩과 같이 arm에서도 가상 코어를 상용화하려 하였다가 포기하였다.
코드 라인 8~9에서 임시로 로지컬 cpu 1번 부터 나머지 로지컬 cpu에 대해 물리 cpu 번호와 동일하게 구성한다.
코드 라인 16에서 현재 cpu가 로지컬 cpu 0 이므로 per-cpu에서 사용할 현재 cpu에 대한 offset을 0으로 설정한다.
코드 라인 18에서 어떤 물리 CPU로 리눅스 부팅이 되었는지 안내하는 정보를 출력한다.

MPIDR 관련 매크로 – ARM32

arch/arm/include/asm/cputype.h

#define MPIDR_AFFINITY_LEVEL(mpidr, level) \
        ((mpidr >> (MPIDR_LEVEL_BITS * level)) & MPIDR_LEVEL_MASK)      /* the 8-bit field at bit offset 8 * level */

#define MPIDR_LEVEL_BITS 8
#define MPIDR_LEVEL_MASK ((1 << MPIDR_LEVEL_BITS) - 1)  /* 0xff */

MPIDR_LEVEL_BITS
- 하나의 affinity level 값이 몇 비트로 이루어져 있는지 나타내고 8로 설정되어 있다 (1바이트).
MPIDR_LEVEL_MASK
- 하위 MPIDR_LEVEL_BITS개의 비트만 1이고 나머지는 0인 값이다.

다음 그림은 arm 시스템에서 8개의 cpu가 있는 경우를 가정하여 위의 함수가 처음 동작한 경우이다.

MPIDR(Multiprocessor Affinity Register)

멀티프로세서 시스템의 스케줄링을 위해 어떠한 코어들간에 친화력(affinity)이 있는지 레벨별로 제공한다.

Process Affinity(프로세스 친화력)

프로세스 스케쥴링시 한 번 배정되었던 프로세스를 어떤 CPU 코어를 사용하게 할 지 결정하기 위해 필요.

리눅스 스케줄러는 가능하면 캐시 데이터 재활용을 위해 같은 코어에 배정
로드 밸런스를 위해 같은 코어에 배정을 하지 않는 경우 affinity 0 레벨에서 검토하고 배정할 core가 busy한 경우 점차 상향하여 위로 올라간다.

1) ARM64(AArch64)

다음 그림은 ARM64의 MPIDR 레지스터를 보여준다.

최대 4단계의 affinity 레벨을 제공한다.

U(Uniprocessor):

cpu가 UP 및 SMP 용도 두 가지로 표현되어 있다.

0=Multiprocessor
1=Uniprocessor

MT(Multi-Thread):

virtual core(hw 멀티스레드) 지원 여부 (멀티 스레딩 타입 접근으로 구현된 논리 프로세서의 밀결합 최소 레벨)

0
- 최소 affinity 레벨에서의 프로세스 성능이 최대 독립적
  - affinity 레벨 0은 독립적인 core id를 사용하므로 각각의 core 성능은 최대한 독립적으로 운영된다.
1
- 최소 affinity 레벨에서의 프로세스 성능이 매우 의존적
  - affinity 레벨 0은 virtual core id를 사용하므로 각각의 virtual core 성능은 같은 affinity 레벨의 virtual core의 성능에 영향을 끼친다.
  - x86의 하이퍼스레드와 동일한 개념이다.

ARM64 CPU Topology

리눅스 커널 구현에 따라 보통 2개 중 하나를 사용한다. ARM64의 경우도 virtual core를 개발하여 상용화하려다 계획을 포기하였다. 따라서 현재 모든 arm64 cpu들은 MT=0 모드만 사용한다. 향후 2021년 ARMv9 아키텍처 출시하고, 2022년 이후 어느 시점에서 추가 개발되어 출시할 예정이다.

3/4단계 affinity 레벨 사용 (MT=1)
- affinity 2 + (affinity 3 << 8):
  - package id
- affinity 1:
  - core id
- affinity 0:
  - thread id
2단계 affinity 레벨 사용 (MT=0)
- affinity 1 + (affinity 2 << 8) + (affinity 3 << 16) :
  - package id
- affinity 0:
  - core id

다음 그림은 ARM64 CPU topology에서 MT(Multi Thread) 지원 여부에 따라 구성된 모습을 보여준다.

리눅스는 MT=1일 때 8개의 cpu로 인식하고, MT=0일 때 4개의 cpu로 인식한다.

2) ARM(AArch32)

다음 그림은 32bit ARM의 MPIDR 레지스터를 보여준다.

cpu가 UP 및 SMP 용도 두 가지로 표현되어 있다.
최대 3단계의 affinity 레벨을 제공한다.

mpidr

ARM CPU Topology

구현에 따라 보통 2개 중 하나를 사용한다. ARM의 경우 virtual core를 개발하여 상용화하려다 계획을 포기하였다. 따라서 현재 모든 arm cpu들은 MT=0 모드만 사용한다.

3단계 affinity 레벨 사용 (MT=1)
- affinity 2:
  - socket id
- affinity 1:
  - core id
- affinity 0:
  - thread id
2단계 affinity 레벨 사용 (MT=0)
- affinity 2:
  - (reserved)
- affinity 1:
  - socket id
- affinity 0:
  - core id

다음 그림은 ARM CPU topology에서 MT(MultiThread) 지원 여부에 따라 구성된 모습을 보여준다.

리눅스는 MT=1일 때 8개의 cpu로 인식하고, MT=0일 때 4개의 cpu로 인식한다.

per-cpu 관련 offset 설정

set_my_cpu_offset() – ARM64

arch/arm64/include/asm/percpu.h

static inline void set_my_cpu_offset(unsigned long off)
{
        asm volatile(ALTERNATIVE("msr tpidr_el1, %0",   /* first form: write off to TPIDR_EL1 */
                                 "msr tpidr_el2, %0",   /* second form: write off to TPIDR_EL2 */
                                 ARM64_HAS_VIRT_HOST_EXTN)      /* capability that ALTERNATIVE() is keyed on */
                        :: "r" (off) : "memory");       /* "memory" clobber: compiler must not reorder memory accesses across this */
}

현재 cpu의 per-cpu offset을 @off 값으로 설정한다.

TPIDR 레지스터를 사용하여 현재 cpu에 대한 per-cpu offset 값을 저장하여 per-cpu 변수에 대한 빠른 access를 가능하게 한다.
부팅 cpu를 제외한 나머지 cpu들에 대해 VHE(ARM64_HAS_VIRT_HOST_EXTN) 기능 지원 여부에 따라 VHE를 지원하는 경우 tpidr_el2 레지스터를 사용하고, VHE를 지원하지 않는(nVHE) 경우 tpidr_el1 레지스터를 사용한다.

TPIDR_EL1 또는 TPIDR_EL2 – ARM64

TPID(스레드 ID) 정보가 기록된 레지스터
- 현재 리눅스에서는 TPID를 저장하는 목적으로 사용하지 않고 각 cpu의 per-cpu offset를 저장하여 더 빠른 per-cpu data의 access를 위해 사용된다.

set_my_cpu_offset() – ARM32

arch/arm/include/asm/percpu.h

/*
 * Same as asm-generic/percpu.h, except that we store the per cpu offset 
 * in the TPIDRPRW. TPIDRPRW only exists on V6K and V7
 */
#if defined(CONFIG_SMP) && !defined(CONFIG_CPU_V6)
static inline void set_my_cpu_offset(unsigned long off)
{
        /* Set TPIDRPRW */

        asm volatile("mcr p15, 0, %0, c13, c0, 4" : : "r" (off) : "memory");    /* "memory" clobber: compiler must not reorder memory accesses across this */
}

현재 cpu의 per-cpu offset을 @off 값으로 설정한다.

per-cpu 자료구조에서 사용되는 cpu마다 개별적으로 가지는 offset 값.
ARMv7에서는 속도 향상을 위해서 TPIDRPRW register를 사용함.
- ARMv7 이전 아키텍처에서는 이 레지스터를 사용하지 않고 메모리를 사용하여 연산하느라 메모리에 대한 접근이 2번 필요하여 느렸었고 이를 극복하기 위해 본래의 목적으로 사용하지 않는 레지스터인 TPIDRPRW를 사용하여 메모리 접근을 한 번으로 줄이기 위해 사용한다.
- 참고: ARM: implement optimized percpu variable | LWN.net

TPIDRPRW (Thread ID-R) – ARM32

Multiprocessor Extension에서 사용하며 PL1(Privilege Level 1으로 커널 레벨) 이상에서만 사용가능하다.
Security Extension에서는 레지스터는 뱅크된다.
TPID(스레드 ID) 정보가 기록된 레지스터
- 현재 리눅스에서는 TPID를 저장하는 목적으로 사용하지 않고 각 cpu의 per-cpu offset를 저장하여 더 빠른 per-cpu data의 access를 위해 사용된다.