Linux tracing and profiling

#undef TRACE_SYSTEM
#define TRACE_SYSTEM mysubsys

#if !defined(_TRACE_MYSUBSYS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MYSUBSYS_H

#include <linux/tracepoint.h>

/*
 * Template trace event "mysubsys_myevent" in TRACE_SYSTEM "mysubsys".
 *
 * Each hit records the comm, pid and prio of both the outgoing (prev) and
 * incoming (next) task, plus the state of prev. The rq argument is part of
 * the prototype but is not referenced by TP_fast_assign or TP_printk below.
 *
 * Sections, in order:
 *   TP_PROTO         - C prototype of the trace call
 *   TP_ARGS          - argument names, matching TP_PROTO
 *   TP_STRUCT__entry - layout of the recorded entry
 *   TP_fast_assign   - code that fills the entry (accessed via __entry)
 *   TP_printk        - format string and arguments used to render the entry
 */
TRACE_EVENT(mysubsys_myevent,

    TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next),

    TP_ARGS(rq, prev, next),

    TP_STRUCT__entry(
        __array(    char,    prev_comm,    TASK_COMM_LEN    )
        __field(    pid_t,    prev_pid            )
        __field(    int,    prev_prio            )
        __field(    long,    prev_state            )
        __array(    char,    next_comm,    TASK_COMM_LEN    )
        __field(    pid_t,    next_pid            )
        __field(    int,    next_prio            )
    ),
    // Entry member declarators (only __field and __array are used above):
    // __field(type, name) => scalar member
    // __array(type, name, len) => fixed-size array member
    // __string => NUL-terminated variable-length string
    // __dynamic_array => dynamically-sized array
    // __field_struct => struct or union member
    // __bitmask => array of longs used as a bit mask
    // See more in samples/trace_events/*.h (verify the path against your kernel tree).
    // checkpatch.pl will complain about the spacing above; that is OK.

    TP_fast_assign(
        memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
        __entry->prev_pid    = prev->pid;
        __entry->prev_prio    = prev->prio;
        __entry->prev_state    = prev->state;
        memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
        __entry->next_pid    = next->pid;
        __entry->next_prio    = next->prio;
    ),
    // NOTE(review): prev->state matches older kernels; the task_struct member
    // name may differ on the kernel you build against - confirm.
    // Helpers for the variable-length member kinds (not used in this event):
    // __assign_str => copy a variable-length string into its __string member
    // __get_dynamic_array => access a dynamically-sized array for copying (e.g. with memcpy)
    // __assign_bitmask => copy a bit mask into its __bitmask member

    TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> " \
         "next_comm=%s next_pid=%d next_prio=%d",
        __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
        __entry->prev_state ?
        __print_flags(__entry->prev_state, "|",
                { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
                { 16, "Z" }, { 32, "X" }, { 64, "x" },
                { 128, "W" }) : "R",
        __entry->next_comm, __entry->next_pid, __entry->next_prio)
    // prev_state above is rendered as "R" when it is zero; otherwise it is
    // decoded by __print_flags using the bit => letter table, joined with "|".
    // Output helpers for the other member kinds (not used in this event):
    // __get_str => get a variable-length string
    // __get_dynamic_array, __get_dynamic_array_len => get a dynamically-sized array and its length
    // __print_symbolic(val, values) => convert a selector value to a string
    // __get_bitmask => render a __bitmask member
    // __print_array => render an array member
    // See more in samples/trace_events/*.h (verify the path against your kernel tree).
);

#endif /* _TRACE_MYSUBSYS_H */

/* This part must be outside protection */

// If header file is located in local directory (i.e. not include/trace/events), then do:
// #undef TRACE_INCLUDE_PATH
// #undef TRACE_INCLUDE_FILE
// #define TRACE_INCLUDE_PATH .
// #define TRACE_INCLUDE_FILE mysubsys    // if the same as TRACE_SYSTEM, then may be omitted
// then also add to Makefile:    EXTRA_CFLAGS = -I$(src)
// or:                           CFLAGS_xxx.o = -I$(src)

#include <trace/define_trace.h>

README => brief help

tracing_on - 1/0 to enable/disable tracing

trace - contents of the buffer, immutable on reads, to clear: echo > trace
trace_pipe - contents of the buffer, clears when read

see format in https://www.kernel.org/doc/Documentation/trace/ftrace.txt

buffer_size_kb - size of trace buffer per cpu
buffer_total_size_kb - total size of trace buffers for all cpus

current_tracer - currently selected function and latency tracer
available_tracers - available tracers (blk function_graph wakeup_dl wakeup_rt wakeup function nop etc.)

function	function entry (who calls whom)
function_graph	function entry & exit (nested call tree)
blk	block io (see blktrace)
irqsoff	how long interrupts are disabled (requires CONFIG_IRQSOFF_TRACER), longest interrupt-disabled section (see tracing_max_latency)
preemptoff	how long preemption is disabled (requires CONFIG_PREEMPT_TRACER)
preemptirqsoff	how long preemption and/or irqs are disabled
wakeup	how long does it take for a process to run after being woken, latency for highest-priority task to be scheduled after wakeup
wakeup_rt	wakeup-to-sched latency just for RT tasks
wakeup_dl	wakeup-to-sched latency just for DL tasks
nop	remove all tracers

max_graph_depth - limit depth of nested calls in function graph (0 = unlimited)

saved_cmdlines - pid to command line
saved_cmdlines_size - number of entries (lines) in saved_cmdlines

trace_marker - userland can add to the trace by writing here
tracing_cpumask - mask of CPUs to trace

options/<name> - options to control tracers
trace_options

available_events - list of available events (subsys:eventname)

set_event - event selector (can also use events/.../enable)

events/enable - enable all events (0/1)
events/<subsys>/enable - enable all events in subsystem (0/1)
events/<subsys>/filter - filter for the events (see <event>/filter below, but may use only vars defined by all events)
events/<subsys>/<event>/enable - enable event (0/1)
events/<subsys>/<event>/format - lists event variable names (from struct) and formatting data
events/<subsys>/<event>/filter - filter for the event

see more in https://www.kernel.org/doc/Documentation/trace/events.txt

see variable names in format

numeric ops:     ==, !=, <, <=, >, >=, &
string ops:             ==, !=, ~ (~ is limited pattern matching: "xxx*", "*xxx", "*xxx*")
boolean:               (expr1) && ((expr2) || (expr3))
sample:                 echo "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter

to clear:                 echo 0 > filter

events/<subsys>/<event>/trigger - command to perform on event

see more in https://www.kernel.org/doc/Documentation/trace/events.txt

Format: <trigger>[:count][if <filter>]

trigger:

traceon, traceoff
enable_event:<system>:<event>
disable_event:<system>:<event>
stacktrace
snapshot

example:       echo traceoff > events/block/block_unplug/trigger
                echo traceoff:3 > events/block/block_unplug/trigger
                    echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' >events/block/block_unplug/trigger

The first disables tracing every time block_unplug is hit.
The second disables tracing the first 3 times block_unplug is hit.
The third enables the kmalloc event the first 3 times block_unplug is hit with the 'nr_rq' event field greater than 1.

Like function triggers, the counter is only decremented if it enabled or disabled tracing.

To remove a trigger without a count: echo '!<trigger>' > <system>/<event>/trigger
To remove a trigger with a count: echo '!<trigger>:0' > <system>/<event>/trigger

Filters can be ignored when removing a trigger.

example: echo 'stacktrace:5 if bytes_req >= 65536' > /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger

uprobe_events - probes in userspace, see https://www.kernel.org/doc/Documentation/trace/uprobetracer.txt
uprobe_profile

available_filter_functions - list of functions that can be filtered on with set_ftrace_filter/set_ftrace_notrace/set_graph_function/set_graph_notrace

enabled_functions - list of functions that currently have a callback attached (mostly useful for debugging ftrace)

set_ftrace_filter - for function tracer: trace only listed functions
also can be used with function_graph to force function as a leaf one, and display time for it

accepts:	func_full_name, *func_end, func_begin*, *func_middle*
modules:	Can select a group via module
Format:	:mod:<module-name>
example:	echo :mod:ext3 > set_ftrace_filter
to remove function(s):	echo '!xyz' >> set_ftrace_filter
triggers:	a command to perform when function is hit
Format:	<function>:<trigger>[:count]
trigger:	traceon, traceoff enable_event:<system>:<event> disable_event:<system>:<event> stacktrace snapshot dump cpudump
example:	echo do_fault:traceoff > set_ftrace_filter
	echo do_trap:traceoff:3 > set_ftrace_filter
The first one will disable tracing every time do_fault is hit.
The second will disable tracing at most 3 times when do_trap is hit.
To remove a trigger without a count:	echo '!<function>:<trigger>' > set_ftrace_filter
To remove a trigger with a count:	echo '!<function>:<trigger>:0' > set_ftrace_filter

set_ftrace_notrace - for function tracer: list functions to never trace

accepts: func_full_name, *func_end, func_begin*, *func_middle*
modules: Can select a group via module command :mod:
Does not accept triggers

Example:

echo '*lock*' > set_ftrace_notrace

cat set_ftrace_notrace | head -10

set_ftrace_pid - for function tracer: trace only this pid

Helper script:

#!/bin/sh
# Run a command with the function tracer limited to that command's PID.
# Usage: <this-script> command [args...]

# Mount point of debugfs: the second field of the first /proc/mounts entry
# whose filesystem type (third field) is exactly "debugfs".
DEBUGFS=$(awk '$3 == "debugfs" { print $2; exit }' /proc/mounts)

# $$ is this shell's PID. The exec below replaces the shell without changing
# the PID, so the PID written here is the one the command runs under.
echo $$ > "$DEBUGFS/tracing/set_ftrace_pid"
echo function > "$DEBUGFS/tracing/current_tracer"

# "$@" passes every argument through unchanged; an unquoted $* would re-split
# arguments that contain whitespace.
exec "$@"

set_graph_function - for function_graph tracer: trace only listed functions (similar to set_ftrace_filter)
set_graph_notrace - for function_graph tracer: list functions to never trace (similar to set_ftrace_notrace)

Example:

echo SyS_read SyS_write > set_graph_function
echo SyS_open >> set_graph_function
echo function_graph > current_tracer

options/func_stack_trace - trace stack and task id

Example: what calls a specific function (task id & stack trace)

echo function > current_tracer
echo kfree > set_ftrace_filter
echo 1 > options/func_stack_trace
... do something ...
echo 0 > options/func_stack_trace
echo > set_ftrace_filter

function_profile_enabled - function profiling
trace_stat/function<CPU>

Example (including preempted time):

echo nop > current_tracer
echo 1 > function_profile_enabled
cat trace_stat/function0 | head

Example (excluding preempted time):

echo 0 > options/sleep-time
echo 1 > function_profile_enabled
cat trace_stat/function0 | head

Example (excluding time for child function, only measure the function body):

echo 0 > options/graph-time
echo 1 > function_profile_enabled
cat trace_stat/function0 | head

Reset data:

echo 0 > function_profile_enabled
echo 1 > function_profile_enabled

echo 1 > /proc/sys/kernel/stack_tracer_enabled - enables stack tracing
(also "stacktrace" command in triggers/filters)

cat stack_max_size - max stack usage
cat stack_trace - stack trace at max stack usage point

stack_trace_filter - similar to set_ftrace_filter but limits what stacktrace traces

per_cpu/cpu<n>/trace - per-cpu version of trace
per_cpu/cpu<n>/trace_pipe - per-cpu version of trace_pipe
per_cpu/cpu<n>/snapshot - similar to regular snapshot

tracing_max_latency - some tracers will record max. latency here (irqsoff, preemptoff, preemptirqsoff, wakeup, wakeup_rt, wakeup_dl)
tracing_thresh - will record latency only if it exceeds @tracing_thresh usec

kprobe_events - kprobes to install/intercept as events (use echo '...' >> to add)

probe:             p:ev1 symbol (can also specify module, offset, addr, and data to add to the event)
return probe:     r:grp2/ev2 symbol
                        r:grp2/ev3 symbol $retval (records return value)
to clear:             -:ev1
                        -:grp2/ev2
                        -:grp2/ev3
                        blank (clear all)

https://www.kernel.org/doc/Documentation/trace/kprobetrace.txt

events/kprobes/enable - 1/0 to enable/disable all kprobes
events/kprobes/filter - similar to events/<subsys>/filter
events/kprobes/<evname>/enable - 1/0 to enable/disable specific kprobe
events/kprobes/<evname>/format - lists event variable names and formatting data
events/kprobes/<evname>/filter - similar to events/<subsys>/<event>/filter
events/kprobes/<evname>/trigger - similar to events/<subsys>/<event>/trigger

https://github.com/brendangregg/perf-tools/blob/master/kernel/kprobe

snapshot - take/read snapshot of trace

echo 1 > snapshot         // take snapshot of trace
cat snapshot                 // use as trace
echo 0 > snapshot         // free snapshot

instances - parallel tracing: multiple configs of traces

mkdir instances/foo // creates dir similar to debug/tracing
... config trace in instances/foo ...
mkdir instances/bar
... config trace in instances/bar ...

trace_options - what and how to output to the trace,
see https://www.kernel.org/doc/Documentation/trace/ftrace.txt
options/<optname>

echo stacktrace >trace_options

echo nostacktrace >trace_options

trace_clock - clock used to timestamp the events

local:	Per cpu clock but may not be synced across CPUs
global:	Synced across CPUs but slows tracing down
counter:	Not a clock, but just an incremented counter, sync across CPUs
uptime:	Jiffy counter from time of boot
perf:	Same clock that perf events use
x86-tsc:	TSC cycle counter

Listing Events

Counting Events

# CPU counter statistics for the specified command:
perf stat command

# Detailed CPU counter statistics (includes extras) for the specified command:
perf stat -d command

# CPU counter statistics for the specified PID, until Ctrl-C:
perf stat -p PID

# CPU counter statistics for the entire system, for 5 seconds:
perf stat -a sleep 5

# Various basic CPU statistics, system wide, for 10 seconds:
perf stat -e cycles,instructions,cache-references,cache-misses,bus-cycles -a sleep 10

# Various CPU level 1 data cache statistics for the specified command:
perf stat -e L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores command

# Various CPU data TLB statistics for the specified command:
perf stat -e dTLB-loads,dTLB-load-misses,dTLB-prefetch-misses command

# Various CPU last level cache statistics for the specified command:
perf stat -e LLC-loads,LLC-load-misses,LLC-stores,LLC-prefetches command

# Count system calls for the specified PID, until Ctrl-C:
perf stat -e 'syscalls:sys_enter_*' -p PID

# Count system calls for the entire system, for 5 seconds:
perf stat -e 'syscalls:sys_enter_*' -a sleep 5

# Count scheduler events for the specified PID, until Ctrl-C:
perf stat -e 'sched:*' -p PID

# Count scheduler events for the specified PID, for 10 seconds:
perf stat -e 'sched:*' -p PID sleep 10

# Count ext4 events for the entire system, for 10 seconds:
perf stat -e 'ext4:*' -a sleep 10

# Count block device I/O events for the entire system, for 10 seconds:
perf stat -e 'block:*' -a sleep 10

# Show system calls by process, refreshing every 2 seconds:
perf top -e raw_syscalls:sys_enter -ns comm

Profiling

# Sample on-CPU functions for the specified command, at 99 Hertz:
perf record -F 99 command

# Sample on-CPU functions for the specified PID, at 99 Hertz, until Ctrl-C:
perf record -F 99 -p PID

# Sample on-CPU functions for the specified PID, at 99 Hertz, for 10 seconds:
perf record -F 99 -p PID sleep 10

# Sample CPU stack traces for the specified PID, at 99 Hertz, for 10 seconds:
perf record -F 99 -p PID -g -- sleep 10

# Sample CPU stack traces for the PID, using dwarf to unwind stacks, at 99 Hertz, for 10 seconds:
perf record -F 99 -p PID --call-graph dwarf sleep 10

# Sample CPU stack traces for the entire system, at 99 Hertz, for 10 seconds:
perf record -F 99 -ag -- sleep 10

# Sample CPU stack traces for the entire system, with dwarf stacks, at 99 Hertz, for 10 seconds:
perf record -F 99 -a --call-graph dwarf sleep 10

# Sample CPU stack traces, once every 10,000 Level 1 data cache misses, for 5 seconds:
perf record -e L1-dcache-load-misses -c 10000 -ag -- sleep 5

# Sample CPU stack traces, once every 100 last level cache misses, for 5 seconds:
perf record -e LLC-load-misses -c 100 -ag -- sleep 5

# Sample on-CPU kernel instructions, for 5 seconds:
perf record -e cycles:k -a -- sleep 5

# Sample on-CPU user instructions, for 5 seconds:
perf record -e cycles:u -a -- sleep 5

# Sample on-CPU instructions precisely (using PEBS), for 5 seconds:
perf record -e cycles:p -a -- sleep 5

# Perform branch tracing (needs HW support), for 1 second:
perf record -b -a sleep 1

Static Tracing

# Trace new processes, until Ctrl-C:
perf record -e sched:sched_process_exec -a

# Trace all context-switches, until Ctrl-C:
perf record -e context-switches -a

# Trace all context-switches with stack traces, until Ctrl-C:
perf record -e context-switches -ag

# Trace all context-switches with stack traces, for 10 seconds:
perf record -e context-switches -ag -- sleep 10

# Trace CPU migrations, for 10 seconds:
perf record -e migrations -a -- sleep 10

# Trace all connect()s with stack traces (outbound connections), until Ctrl-C:
perf record -e syscalls:sys_enter_connect -ag

# Trace all accept()s with stack traces (inbound connections), until Ctrl-C:
perf record -e 'syscalls:sys_enter_accept*' -ag

# Trace all block device (disk I/O) requests with stack traces, until Ctrl-C:
perf record -e block:block_rq_insert -ag

# Trace all block device issues and completions (has timestamps), until Ctrl-C:
perf record -e block:block_rq_issue -e block:block_rq_complete -a

# Trace all block completions, of size at least 100 Kbytes, until Ctrl-C:
perf record -e block:block_rq_complete --filter 'nr_sector > 200'

# Trace all block completions, synchronous writes only, until Ctrl-C:
perf record -e block:block_rq_complete --filter 'rwbs == "WS"'

# Trace all block completions, all types of writes, until Ctrl-C:
perf record -e block:block_rq_complete --filter 'rwbs ~ "*W*"'

# Trace all minor faults (RSS growth) with stack traces, until Ctrl-C:
perf record -e minor-faults -ag

# Trace all page faults with stack traces, until Ctrl-C:
perf record -e page-faults -ag

# Trace all ext4 calls, and write to a non-ext4 location, until Ctrl-C:
perf record -e 'ext4:*' -o /tmp/perf.data -a

# Trace kswapd wakeup events, until Ctrl-C:
perf record -e vmscan:mm_vmscan_wakeup_kswapd -ag

Dynamic Tracing

# Add a tracepoint for the kernel tcp_sendmsg() function entry ("--add" is optional):
perf probe --add tcp_sendmsg

# Remove the tcp_sendmsg() tracepoint (or use "--del"):
perf probe -d tcp_sendmsg

# Add a tracepoint for the kernel tcp_sendmsg() function return:
perf probe 'tcp_sendmsg%return'

# Show available variables for the kernel tcp_sendmsg() function (needs debuginfo):
perf probe -V tcp_sendmsg

# Show available variables for the kernel tcp_sendmsg() function, plus external vars (needs debuginfo):
perf probe -V tcp_sendmsg --externs

# Show available line probes for tcp_sendmsg() (needs debuginfo):
perf probe -L tcp_sendmsg

# Show available variables for tcp_sendmsg() at line number 81 (needs debuginfo):
perf probe -V tcp_sendmsg:81

# Add a tracepoint for tcp_sendmsg(), with three entry argument registers (platform specific):
perf probe 'tcp_sendmsg %ax %dx %cx'

# Add a tracepoint for tcp_sendmsg(), with an alias ("bytes") for the %cx register (platform specific):
perf probe 'tcp_sendmsg bytes=%cx'

# Trace previously created probe when the bytes (alias) variable is greater than 100:
perf record -e probe:tcp_sendmsg --filter 'bytes > 100'

# Add a tracepoint for tcp_sendmsg() return, and capture the return value:
perf probe 'tcp_sendmsg%return $retval'

# Add a tracepoint for tcp_sendmsg(), and "size" entry argument (reliable, but needs debuginfo):
perf probe 'tcp_sendmsg size'

# Add a tracepoint for tcp_sendmsg(), with size and socket state (needs debuginfo):
perf probe 'tcp_sendmsg size sk->__sk_common.skc_state'

# Tell me how on Earth you would do this, but don't actually do it (needs debuginfo):
perf probe -nv 'tcp_sendmsg size sk->__sk_common.skc_state'

# Trace previous probe when size is non-zero, and state is not TCP_ESTABLISHED(1) (needs debuginfo):
perf record -e probe:tcp_sendmsg --filter 'size > 0 && skc_state != 1' -a

# Add a tracepoint for tcp_sendmsg() line 81 with local variable seglen (needs debuginfo):
perf probe 'tcp_sendmsg:81 seglen'

# Add a tracepoint for do_sys_open() with the filename as a string (needs debuginfo):
perf probe 'do_sys_open filename:string'

# Add a tracepoint for myfunc() return, and include the retval as a string:
perf probe 'myfunc%return +0($retval):string'

# Add a tracepoint for the user-level malloc() function from libc:
perf probe -x /lib64/libc.so.6 malloc

# List currently available dynamic probes:
perf probe -l

Reporting

# Show perf.data in an ncurses browser (TUI) if possible:
perf report

# Show perf.data with a column for sample count:
perf report -n

# Show perf.data as a text report, with data coalesced and percentages:
perf report --stdio

# List all raw events from perf.data:
perf script

# List all raw events from perf.data, with customized fields:
perf script -F time,event,trace

# Dump raw contents from perf.data as hex (for debugging):
perf script -D

# Disassemble and annotate instructions with percentages (needs some debuginfo):
perf annotate --stdio

CONFIG_DEBUG_INFO
CONFIG_KPROBES
CONFIG_RELAY
CONFIG_DEBUG_FS
CONFIG_MODULES
CONFIG_MODULE_UNLOAD
// CONFIG_UTRACE (no longer needed in recent kernels)

kernel: make headers_install

Ubuntu:    apt-get install systemtap systemtap-runtime
Fedora:    dnf install -y systemtap systemtap-runtime

For stock kernels (Fedora):
    dnf install -y kernel-debuginfo kernel-devel kernel-debuginfo-common-<arch>
    e.g.   kernel-debuginfo-2.6.32-53.el6.i686.rpm
           kernel-devel-2.6.32-53.el6.i686.rpm
           kernel-debuginfo-common-i686-2.6.32-53.el6.i686.rpm
// yum install kernel-devel yum-utils debuginfo-install kernel

Ubuntu:

apt-get install elfutils libelf1 libelf-dev libdw1 libdw-dev
apt-get install latex2html libnss3 libnss3-dev libnss3-tools

Fedora:

dnf install -y kernel-devel yum-utils kernel
dnf install -y elfutils elfutils-libelf elfutils-devel
dnf install -y latex2html nss-tools nss-devel libvirt-devel

openSUSE:

zypper install -y systemtap systemtap-runtime
zypper install -y kernel-devel
zypper install -y elfutils libelf0 libelf1 libelf-devel
zypper install -y libebl1 libebl-devel libdw-devel
zypper install -y latex2html libvirt-devel
zypper install -y mozilla-nss mozilla-nss-tools mozilla-nss-devel

git clone git://sourceware.org/git/systemtap.git
https://sourceware.org/systemtap/ftp/releases

./configure --prefix=/usr
make [-j8] all
sudo make install

Ubuntu:

adduser sergey stapusr
adduser sergey stapsys
adduser sergey stapdev

Fedora:

usermod -a --groups stapusr sergey
usermod -a --groups stapsys sergey
usermod -a --groups stapdev sergey

openSUSE:

groupadd -r stapusr
groupadd -r stapsys
groupadd -r stapdev
usermod -a --groups stapusr sergey
usermod -a --groups stapsys sergey
usermod -a --groups stapdev sergey

re-login

Perf commands
perf	list	list available events
	stat	measure total event count for a single program or for system for some time
	top	top-like dynamic view of hottest functions
	record	run a command and record data to a file
	report	analyze file generated by perf record; can generate flat, or graph profile
	annotate	annotate sources or assembly
	sched	trace/measure scheduler actions and latencies
	mem	profile/display memory accesses
	lock	analyze lock events
	timechart	visualize system behavior during a workload (CPU, disk, network IO)
	kmem	trace kernel memory usage (SLAB and page allocators)
	kvm	trace/measure KVM host and guest statistics
	probe	define new dynamic tracepoints (functions, source code lines etc.)
	inject	add additional info to the events stream
	bench	execute benchmark suites
	script	dump perf.data
	script-python	process perf.data with Python script
	script-perl	process perf.dat with Perl script