TRACE_EVENT

TRACE_EVENT 특징

– Static Kernel Tracing: 커널 코드안에서 static probe point가 있다.
– TRACE_EVENT 매크로는 low overhead 성능을 위해 만들어졌다.
– ftrace는 물론 perf, LTTng와 SystemTap에서도 사용된다.

TRACE_EVENT 매크로 사용

  • 커널 코드내부에 tracepoint를 만들어야 한다.
  • tracepoint를 hook하는 callback 펑션을 만들어야 한다.
  • callback 펑션은 tracer ring buffer를 통해 가능한 빨리 데이터 기록.
  • 기록된 데이터는 사람이 읽을 수 있는 데이터 형태를 갖어야 한다.
TRACE_EVENT(name, proto, args, struct, assign, print)

TRACE_EVENT 매크로 인수

.name

  • tracepoint 이름
  • trace_ 접두어로 함수가 만들어진다.
    TRACE_EVENT(sched_switch,

     

.prototype

  • tracepoint callbacks을 위한 prototype
    TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next)
    trace_sched_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next);

     

.args

  • prototype에 매치되는 실제 데이터 인수
    TP_ARGS(rq, prev, next),
      #define TRACE_POINT(name, proto, args) \
           void trace_##name(proto)            \
           {                                   \
                   if (trace_##name##_active)  \
                           callback(args);     \
           }

     

.struct (optional)

  • tracepoint 로 전달될 데이터가 저장될 구조체
    TP_STRUCT__entry(
    		__array(	char,	prev_comm,	TASK_COMM_LEN	)
    		__field(	pid_t,	prev_pid			)
    		__field(	int,	prev_prio			)
    		__field(	long,	prev_state			)
    		__array(	char,	next_comm,	TASK_COMM_LEN	)
    		__field(	pid_t,	next_pid			)
    		__field(	int,	next_prio			)
        ),
      struct {
    	      char   prev_comm[TASK_COMM_LEN];
    	      pid_t  prev_pid;
    	      int    prev_prio;
    	      long   prev_state;
    	      char   next_comm[TASK_COMM_LEN];
    	      pid_t  next_pid;
    	      int    next_prio;
        };

     

.assign

  • ring buffer에 연결된 구조체에 데이터를 옮기기 위해 사용
     TP_fast_assign(
    		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
    		__entry->prev_pid	= prev->pid;
    		__entry->prev_prio	= prev->prio;
    		__entry->prev_state	= prev->state;
    		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
    		__entry->next_pid	= next->pid;
    		__entry->next_prio	= next->prio;
        ),

     

.print

  • 사람이 읽을 수 있는 ASCII 형태 출력
    TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> " \
     		  "next_comm=%s next_pid=%d next_prio=%d",
    		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
    		__entry->prev_state ?
    		  __print_flags(__entry->prev_state, "|",
    				{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
    				{ 16, "Z" }, { 32, "X" }, { 64, "x" },
    				{ 128, "W" }) : "R",
    		__entry->next_comm, __entry->next_pid, __entry->next_prio)

 format 화일

#cat /sys/kernel/debug/tracing/events/sched/sched_switch/format
name: sched_switch
ID: 264
format:
        field:unsigned short common_type;       offset:0;       size:2; signed:0;
        field:unsigned char common_flags;       offset:2;       size:1; signed:0;
        field:unsigned char common_preempt_count;       offset:3;       size:1; signed:0;
        field:int common_pid;   offset:4;       size:4; signed:1;

        field:char prev_comm[16];       offset:8;       size:16;        signed:1;
        field:pid_t prev_pid;   offset:24;      size:4; signed:1;
        field:int prev_prio;    offset:28;      size:4; signed:1;
        field:long prev_state;  offset:32;      size:8; signed:1;
        field:char next_comm[16];       offset:40;      size:16;        signed:1;
        field:pid_t next_pid;   offset:56;      size:4; signed:1;
        field:int next_prio;    offset:60;      size:4; signed:1;

print fmt: "prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", REC->prev_comm, REC->prev_pid, REC->prev_prio, REC->prev_state & (1024-1) ? __print_flags(REC->prev_state & (1024-1), "|", { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, { 16, "Z" }, { 32, "X" }, { 64, "x" }, { 128, "K" }, { 256, "W" }, { 512, "P" }) : "R", REC->prev_state & 1024 ? "+" : "", REC->next_comm, REC->next_pid, REC->next_prio

 

trace 함수 호출 예)

  static inline void
   context_switch(struct rq *rq, struct task_struct *prev,
	          struct task_struct *next)
   {
	   struct mm_struct *mm, *oldmm;

	   prepare_task_switch(rq, prev, next);
	   trace_sched_switch(rq, prev, next);
	   mm = next->mm;
	   oldmm = prev->active_mm;
[04:37:30.629091416] (+0.000050732) sched_switch: { cpu_id = 2 }, { prev_comm = "swapper/2", prev_tid = 0, prev_prio = 20,
prev_state = 0, next_comm = "lttng", next_tid = 8347, next_prio = 20 }

 

TRACE_EVENT 정의된 헤더 화일

TRACE_EVENT(sched_switch,

        TP_PROTO(struct task_struct *prev,
                 struct task_struct *next),

        TP_ARGS(prev, next),

        TP_STRUCT__entry(
                __array(        char,   prev_comm,      TASK_COMM_LEN   )
                __field(        pid_t,  prev_pid                        )
                __field(        int,    prev_prio                       )
                __field(        long,   prev_state                      )
                __array(        char,   next_comm,      TASK_COMM_LEN   )
                __field(        pid_t,  next_pid                        )
                __field(        int,    next_prio                       )
        ),

        TP_fast_assign(
                memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
                __entry->prev_pid       = prev->pid;
                __entry->prev_prio      = prev->prio;
                __entry->prev_state     = __trace_sched_switch_state(prev);
                memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
                __entry->next_pid       = next->pid;
                __entry->next_prio      = next->prio;
        ),

        TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
                __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
                __entry->prev_state & (TASK_STATE_MAX-1) ?
                  __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
                                { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
                                { 16, "Z" }, { 32, "X" }, { 64, "x" },
                                { 128, "K" }, { 256, "W" }, { 512, "P" }) : "R",
                __entry->prev_state & TASK_STATE_MAX ? "+" : "",
                __entry->next_comm, __entry->next_pid, __entry->next_prio)
);

 

get_random_bytes() 에서의 사용 예)

drivers/char/random.c – get_random_bytes()

trace_get_random_bytes(nbytes, _RET_IP_);

 

include/trace/events/random.h

DEFINE_EVENT(random__get_random_bytes, get_random_bytes,
	TP_PROTO(int nbytes, unsigned long IP),
           TP_ARGS(nbytes, IP)
);

DEFINE_EVENT를 통해 아래의 함수들이 생성됨

extern struct tracepoint __tracepoint_get_random_bytes;
static inline void trace_get_random_bytes();
static inline void trace_get_random_bytes_rcuidle();
static inline int register_trace_get_random_bytes();
static inline int unregister_trace_get_random_bytes();
static inline void check_trace_callback_type_get_random_bytes();
static inline bool trace_get_random_bytes_enabled();

 

include/linux/tracepoint.h

struct tracepoint {
        const char *name;               /* Tracepoint name */
        struct static_key key;
        void (*regfunc)(void);
        void (*unregfunc)(void);
        struct tracepoint_func __rcu *funcs;
};
#define DEFINE_EVENT(template, name, proto, args)		\
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
#define TRACE_EVENT(name, proto, args, struct, assign, print)   	\
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
#define DECLARE_TRACE(name, proto, args)			\
                	__DECLARE_TRACE(name, PARAMS(proto), 		\
		PARAMS(args), 1,   					\
              	PARAMS(void *__data, proto),            		\
                     PARAMS(__data, args))

 

include/linux/tracepoint.h

#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \
        extern struct tracepoint __tracepoint_##name;          		\
        static inline void trace_##name(proto)                          	\
        {                                                               			\
                if (static_key_false(&__tracepoint_##name.key))         	\
                        __DO_TRACE(&__tracepoint_##name,                	\
                                TP_PROTO(data_proto),                   		\
                                TP_ARGS(data_args),                     		\
                                TP_CONDITION(cond),,);                  	\
                if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) {         	\
                        rcu_read_lock_sched_notrace();                  	\
                        rcu_dereference_sched(__tracepoint_##name.funcs);\
                        rcu_read_unlock_sched_notrace();                	\
                }                                                       			\
        }                                                               			\
        __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \
                PARAMS(cond), PARAMS(data_proto),PARAMS(data_args)) \
        static inline int                                               		\
        register_trace_##name(void (*probe)(data_proto), void *data)	\
        {                                                               			\
                return tracepoint_probe_register(&__tracepoint_##name, \
                                                (void *)probe, data);   		\
        }                                                               			\
        static inline int                                               		\
        unregister_trace_##name(void (*probe)(data_proto), void *data)  \
        {                                                               			\
                return tracepoint_probe_unregister(&__tracepoint_##name,\
                                                (void *)probe, data);   		\
        }                                                               			\
        static inline void                                              		\
        check_trace_callback_type_##name(void (*cb)(data_proto)) 	\
        {                                                               			\
        }                                                               			\
        static inline bool                                              		\
        trace_##name##_enabled(void)                                    	\
        {                                                               			\
                return static_key_false(&__tracepoint_##name.key);  	\
        }

CONFIG_TRACEPOINTS 옵션을 사용하여 커널을 빌드 시 위와 같은 함수로 동작하고 그렇지 않은 경우 빈함수가 만들어지고 에러를 리턴한다.

#define __DO_TRACE(tp, proto, args, cond, prercu, postrcu)      	\
        do {                                                            		\
                struct tracepoint_func *it_func_ptr;                    		\
                void *it_func;                                          		\
                void *__data;                                           		\
                                                                        			\
                if (!(cond))                                            		\
                        return;                                         		\
                prercu;                                                 		\
                rcu_read_lock_sched_notrace();                          	\
                it_func_ptr = rcu_dereference_sched((tp)->funcs);       	\
                if (it_func_ptr) {                                      		\
                        do {                                            		\
                                it_func = (it_func_ptr)->func;          		\
                                __data = (it_func_ptr)->data;           		\
                                ((void(*)(proto))(it_func))(args);      		\
                        } while ((++it_func_ptr)->func);                		\
                }                                                       			\
                rcu_read_unlock_sched_notrace();                        	\
                postrcu;                                                		\
        } while (0)

이벤트 분석

# cat events/sched/enable
0

# echo 1 > events/sched/enable
# cat set_event
sched:sched_wake_idle_without_ipi
sched:sched_swap_numa
sched:sched_stick_numa
sched:sched_move_numa
sched:sched_process_hang
sched:sched_pi_setprio
sched:sched_stat_runtime
sched:sched_stat_blocked
sched:sched_stat_iowait
sched:sched_stat_sleep
sched:sched_stat_wait
sched:sched_process_exec
sched:sched_process_fork
sched:sched_process_wait
sched:sched_wait_task
sched:sched_process_exit
sched:sched_process_free
sched:sched_migrate_task
sched:sched_switch
sched:sched_wakeup_new
sched:sched_wakeup
sched:sched_kthread_stop_ret
sched:sched_kthread_stop

# ls events/sched/
enable                  sched_process_exec  sched_stat_iowait   sched_wait_task
filter                  sched_process_exit  sched_stat_runtime  sched_wake_idle_without_ipi
sched_kthread_stop      sched_process_fork  sched_stat_sleep    sched_wakeup
sched_kthread_stop_ret  sched_process_free  sched_stat_wait     sched_wakeup_new
sched_migrate_task      sched_process_hang  sched_stick_numa
sched_move_numa         sched_process_wait  sched_swap_numa
sched_pi_setprio        sched_stat_blocked  sched_switch

사용 가능한 이벤트 조회

# cat /sys/kernel/debug/tracing/available_events
mac80211:drv_return_void
mac80211:drv_return_int
(...)
irq_vectors:thermal_apic_exit
irq_vectors:thermal_apic_entry
irq_vectors:threshold_apic_exit
irq_vectors:threshold_apic_entry
irq_vectors:call_function_single_exit
irq_vectors:call_function_single_entry
irq_vectors:call_function_exit
irq_vectors:call_function_entry
irq_vectors:irq_work_exit
irq_vectors:irq_work_entry
irq_vectors:x86_platform_ipi_exit
irq_vectors:x86_platform_ipi_entry
irq_vectors:error_apic_exit
irq_vectors:error_apic_entry
irq_vectors:spurious_apic_exit
irq_vectors:spurious_apic_entry
irq_vectors:reschedule_exit
irq_vectors:reschedule_entry
irq_vectors:local_timer_exit
irq_vectors:local_timer_entry
nmi:nmi_handler
syscalls:sys_exit_mmap
syscalls:sys_enter_mmap
vsyscall:emulate_vsyscall
raw_syscalls:sys_exit
raw_syscalls:sys_enter
mce:mce_record
tlb:tlb_flush
exceptions:page_fault_kernel
exceptions:page_fault_user
syscalls:sys_exit_unshare
syscalls:sys_enter_unshare
syscalls:sys_exit_set_tid_address
syscalls:sys_enter_set_tid_address
task:task_rename
task:task_newtask
(...)

 

참고

 

 

댓글 남기기