Code sample: LINUX_ROOT/samples/trace_events
https://www.kernel.org/doc/Documentation/trace/tracepoints.txt
(obsolete)
http://lwn.net/Articles/379903
http://lwn.net/Articles/381064
http://lwn.net/Articles/383362
https://www.kernel.org/doc/htmldocs/tracepoint/index.html
Declare single tracepoint:
TRACE_EVENT(event_name,
proto, args, struct, assign, print)
event_name - tracepoint (event) name
prototype - args declaration
args - args list
struct - describe the structure to be stored in the tracer ring buffer
assign - assignment statements for the struct
print - formatting of the struct
Declare multiple tracepoints with the same proto, args, struct, assign and print.
Saves data space.
DECLARE_EVENT_CLASS(template_name,
proto, args, struct, assign, print);
DECLARE_EVENT(template_name,
event1_name, proto, args);
...
DECLARE_EVENT(template_name,
event2_name, proto, args);
Conditional tracepoint:
TRACE_EVENT_CONDITION(name,
proto, args, cond, struct, assign, print)
or: DEFINE_EVENT_CONDITION(template, name, proto, args, cond)
for cond,
use TP_CONDITION(boolean-expression)
Header file, typically located in include/trace/events
#undef
TRACE_SYSTEM
#define TRACE_SYSTEM
mysubsys
#if
!defined(_TRACE_MYSUBSYS_H) || defined(TRACE_HEADER_MULTI_READ)
#define
_TRACE_MYSUBSYS_H
#include
<linux/tracepoint.h>
TRACE_EVENT(mysubsys_myevent,
TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct
*next),
TP_ARGS(rq, prev, next),
TP_STRUCT__entry(
__array(
char,
prev_comm,
TASK_COMM_LEN )
__field(
pid_t,
prev_pid
)
__field(
int,
prev_prio
)
__field(
long,
prev_state
)
__array(
char,
next_comm,
TASK_COMM_LEN )
__field(
pid_t,
next_pid
)
__field(
int,
next_prio
)
),
// __field => scalar member
// __array => static array member
// __string => nul-terminated variable-length string
// __dynamic_array => dynamically-sized array
// __field_struct => struct or union
// __bitmask => array of longs, with bits
// see more in samples/trace_event/*.h
// checkpatch.pl will complain of spacing, that's ok
TP_fast_assign(
memcpy(__entry->next_comm,
next->comm, TASK_COMM_LEN);
__entry->prev_pid =
prev->pid;
__entry->prev_prio =
prev->prio;
__entry->prev_state =
prev->state;
memcpy(__entry->prev_comm,
prev->comm, TASK_COMM_LEN);
__entry->next_pid =
next->pid;
__entry->next_prio =
next->prio;
),
// __assign_str => copy variable-length string
// __get_dynamic_array => to access
dynamically-sized array for copying (e.g. memcpy)
// __assign_bitmask
TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s
==> " \
"next_comm=%s next_pid=%d
next_prio=%d",
__entry->prev_comm,
__entry->prev_pid, __entry->prev_prio,
__entry->prev_state ?
__print_flags(__entry->prev_state, "|",
{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8,
"t" },
{ 16, "Z" }, { 32, "X" }, { 64, "x" },
{ 128, "W" }) : "R",
__entry->next_comm,
__entry->next_pid, __entry->next_prio)
// __get_str => get variable-length string
// __get_dynamic_array, __get_dynamic_array_len => get
dynamically-sized array
// __print_symbolic(val, values) => convert
selector value to string
// __get_bitmask
// __print_array
// see more in samples/trace_event/*.h
);
#endif /*
_TRACE_MYSUBSYS_H */
/* This part must be
outside protection */
// If header file is
located in local directory (i.e. not include/trace/events), then do:
// #undef
TRACE_INCLUDE_PATH
// #undef
TRACE_INCLUDE_FILE
// #define
TRACE_INCLUDE_PATH .
// #define
TRACE_INCLUDE_FILE mysubsys //
if the same as TRACE_SYSTEM, then may be omitted
// then also add to
Makefile: EXTRA_CFLAGS = -I$(src)
//
or:
CFLAGS_xxx.o = -I$(src)
#include
<trace/define_trace.h>
Source file (use of tracepoint)
// declare CREATE_TRACE_POINTS only in one source
file,
// otherwise will generate duplicate tracepoint routines/data,
// and linker will fail
#define CREATE_TRACE_POINTS
#include <trace/events/mysubsys.h>
... code ...
trace_mysubsys_myevent(rq, prev, next);
... code ...
if (trace_mysubsys_myevent_enabled()) {
... do some calculations ...
trace_mysubsys_myevent(rq, prev, next);
}
In loadable module:
EXPORT_TRACEPOINT_SYMBOL(mysubsys_myevent);
EXPORT_TRACEPOINT_SYMBOL_GPL(mysubsys_myevent);
// on load:
register_trace_mysubsys_myevent(###...);
// obsolete?
// on unload:
unregister_trace_mysubsys_myevent(###...);
// obsolete?
tracepoint_synchronize_unregister();
Declares:
trace_mysubsys_myevent(struct rq *rq, struct
task_struct *prev, struct task_struct *next);
struct {
char prev_comm[TASK_COMM_LEN];
pid_t
prev_pid;
int prev_prio;
long prev_state;
char next_comm[TASK_COMM_LEN];
pid_t
next_pid;
int next_prio;
};
Creates:
/sys/kernel/debug/tracing/events/sched/mysubsys_myevent/...
enable
=> 0/1
format => lists variable names and
formats
filter => filter expression (see
below)
trigger => trigger operation (see below)
To use:
cd
/sys/kernel/debug/tracing
echo 1 >
events/mysubsys/myevent/enable
... do something ...
cat trace
cd
/sys/kernel/debug/tracing
echo
mysubsys_myevent other1 other2 > set_event
... do something ...
cat trace
https://www.kernel.org/doc/Documentation/trace/ftrace.txt
https://lwn.net/Articles/370423
mem events: https://www.kernel.org/doc/Documentation/trace/events-kmem.txt
NMI events: https://www.kernel.org/doc/Documentation/trace/events-nmi.txt
power events: https://www.kernel.org/doc/Documentation/trace/events-power.txt
https://www.kernel.org/doc/Documentation/trace/tracepoint-analysis.txt
https://lwn.net/Articles/365835
https://lwn.net/Articles/370423
Under /sys/kernel/debug/tracing
README
=> brief help
tracing_on
- 1/0 to enable/disable tracing
trace
- contents of the buffer, immutable on reads, to clear: echo >
trace
trace_pipe
- contents of the buffer, clears when read
buffer_size_kb
- size of trace buffer per cpu
buffer_total_size_kb
- total size of trace buffers for all cpus
current_tracer
- currently selected function and latency tracer
available_tracers
- available tracers (blk function_graph wakeup_dl wakeup_rt wakeup
function nop etc.)
function |
function entry (who calls whom) |
function_graph |
function entry & exit (nested call tree) |
blk |
block io (see blktrace |
irqsoff |
how long interrupts are disabled (requires
CONFIG_IRQSOFF_TRACER),
longest interrupt-disabled section (see tracing_max_latency) |
preemptoff |
how long preemption is disabled (requires
CONFIG_PREEMPT_TRACER) |
preemptirqsoff |
how long preemption and/or irqs are disabled |
wakeup |
how long does it take for a process to run after being
woken,
latency for highest-priority task to be scheduled after wakeup |
wakeup_rt |
wakeup-to-sched latency just for RT tasks |
wakeup_dl |
wakeup-to-sched latency just for DL tasks |
nop |
remove all tracers |
max_graph_depth
- limit depth of nested calls in function graph (0 =
unlimited)
saved_cmdlines
- pid to command line
saved_cmdlines_size
- number of entries (lines) in
saved_cmdlines
trace_marker
- userland can add to the trace by writing here
tracing_cpumask
- mask of CPUs to trace
options/<name>
- options to control tracers
trace_options
available_events
- list of available events (subsys:eventname)
set_event
- event selector (can also use events/.../enable)
to enable:
echo
'subsys:eventname' > set_event
echo
eventname > set_event
to disable:
echo '!subsys:eventname'
> set_event
echo
'!eventname' > set_event
to enable all:
echo '*:*' >set_event
to enable subsys:
echo 'subsys:*'
>set_event
to disable all:
echo >set_event
can also specify on boot: parameter:
trace_event=s1:e1,s2:e2,...,sx:ex
https://www.kernel.org/doc/Documentation/trace/events.txt
events/enable
- enable all events (0/1)
events/<subsys>/enable
- enable all events in sybsystem
events/<subsys>/filter
- filter for the events (see
<event>/filter below, but may use only vars defined by
all events)
events/<subsys>/<event>/enable
- enable
event (0/1)
events/<subsys>/<event>/format
- lists
event variable names (from
struct)
and formatting data
events/<subsys>/<event>/filter
- filter for
the event
see more in
https://www.kernel.org/doc/Documentation/trace/events.txt
see variable names in
format
numeric ops:
==, !=, <,
<=, >, >=, &
string ops:
==, !=, ~ (~ is
limited pattern matching: "xxx*", "*xxx",
"*xxx*")
boolean:
(expr1) &&
((expr2) || (expr3))
sample:
echo "((sig >= 10
&& sig < 15) ||
sig == 17) && comm != bash" > filter
to clear:
echo 0 > filter
events/<subsys>/<event>/trigger
- command
to perform on event
Format:
<trigger>[:count][if
<filter>]
trigger:
traceon, traceoff
enable_event:<system>:<event>
disable_event:<system>:<event>
stacktrace
snapshot
example:
echo traceoff >
events/block/block_unplug/trigger
echo traceoff:3
> events/block/block_unplug/trigger
echo
'enable_event:kmem:kmalloc:3 if nr_rq > 1'
>events/block/block_unplug/trigger
The first disables tracing
every time block_unplug is hit.
The second disables tracing the first 3 times block_unplug is hit.
The third enables the kmalloc event the first 3 times block_unplug is
hit and has value of greater than 1 for the 'nr_rq' event field.
Like function triggers, the counter is only decremented if it enabled
or disabled tracing.
To remove a trigger without a count:
echo
'!<trigger> >
<system>/<event>/trigger
To remove a trigger with a count:
echo
'!<trigger>:0 >
<system>/<event>/trigger
Filters can be ignored when removing a trigger.
example:
echo 'stacktrace:5 if bytes_req >= 65536' >
/sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
uprobe_events
- probes in userspace, see
https://www.kernel.org/doc/Documentation/trace/uprobetracer.txt
uprobe_profile
available_filter_functions
- list of functions that can be filtered on
with
set_ftrace_filter/
set_ftrace_notrace/
set_graph_function/
set_graph_notrace
enabled_functions
- list of hookable functions
set_ftrace_filter
- for function tracer: trace only listed functions
also can
be used with function_graph to force function as a leaf one,
and display time for it
accepts: |
func_full_name, *func_end, func_begin*, *func_middle* |
modules: |
Can select a group via module |
Format: |
:mod:<module-name> |
example: |
echo :mod:ext3
> set_ftrace_filter |
to
remove function(s): |
echo '!*xyz*'
>> set_ftrace_filter |
triggers: |
a command to perform when function is hit |
Format: |
<function>:<trigger>[:count] |
trigger: |
traceon,
traceoff
enable_event:<system>:<event>
disable_event:<system>:<event>
stacktrace
snapshot
dump
cpudump
|
example: |
echo
do_fault:traceoff > set_ftrace_filter
echo
do_trap:traceoff:3 > set_ftrace_filter
|
The first one will
disable tracing every time do_fault is hit
The second will disable tracing at most 3 times when do_trap is hit
To remove trigger without count:
echo
'!<function>:<trigger> >
set_ftrace_filter
To remove trigger with a count:
echo
'!<function>:<trigger>:0 >
set_ftrace_filter
|
set_ftrace_notrace
- for function tracer: list functions to never trace
accepts: func_full_name,
*func_end, func_begin*, *func_middle*
modules: Can select a group via module command :mod:
Does not accept triggers
Example:
echo
'*lock*' > set_ftrace_notrace
cat set_ftrace_notrace | head -10
set_ftrace_pid
- for function tracer: trace only this pid
Helper script:
#!/bin/sh
DEBUGFS=`grep
debugfs /proc/mounts | awk '{ print $2; }'`
echo $$ >
$DEBUGFS/tracing/set_ftrace_pid
echo function
> $DEBUGFS/tracing/current_tracer
exec $*
set_graph_function
- for function_graph tracer: trace only listed
functions (similar to set_ftrace_filter)
set_graph_notrace
- for function_graph tracer: list functions to never
trace (similar to set_ftrace_notrace)
Example:
echo SyS_read SyS_write
> set_graph_function
echo SyS_open
>> set_graph_function
echo function_graph
> current_tracer
options/func_stack_trace
- trace stack and task id
Example: what calls a
specific function (task id & stack trace)
echo
function > current_tracer
echo kfree >
set_ftrace_filter
echo 1 >
options/func_stack_trace
... do something ...
echo 0 >
options/func_stack_trace
echo >
set_ftrace_filter
function_profile_enabled
- function profiling
trace_stat/function<CPU>
Example (including
preempted time):
echo nop >
current_tracer
echo 1 >
function_profile_enabled
cat
trace_stat/function0 | head
Example (excluding preempted time):
echo 0 >
options/sleep-time
echo 1 >
function_profile_enabled
cat
trace_stat/function0 | head
Example (excluding time for child function, only measure the function
body):
echo 0 >
options/graph-time
echo 1 >
function_profile_enabled
cat
trace_stat/function0 | head
Reset data:
echo 0 >
function_profile_enabled
echo 1 >
function_profile_enabled
echo 1 > /proc/sys/kernel/
stack_tracer_enabled
- enables stack
tracing
(also "stacktrace" command in triggers/filters)
cat
stack_max_size - max stack usage
cat
stack_trace - stack trace at max stack usage point
stack_trace_filter
- similar to set_ftrace_filter but limits what
stacktrace traces
per_cpu/cpu<n>/trace
- per-cpu version of
trace
per_cpu/cpu<n>/trace_pipe
- per-cpu version of
trace_pipe
per_cpu/cpu<n>/snapshot
- similar to regular
snapshot
tracing_max_latency
- some tracers will record max. latency here (irqsoff, preemptoff,
preemptirqsoff, wakeup, wakeup_rt, wakeup_dl)
tracing_thresh
- will record latency only if it exceeds @tracing_thresh
usec
kprobe_events
- kprobes to install/intercept as events (use echo '...'
>> to add)
probe:
p:ev1
symbol (can also specify module, offset, addr, and data to
add to the event)
return probe: r:grp2/ev2 symbol
r:grp2/ev3 symbol $retval
(records return value)
to clear:
-:ev1
-:grp2/ev2
-:grp2/ev3
blank (clear all)
events/kprobes/enable
- 1/0 to enable/disable all kprobes
events/kprobes/filter
- similar to events/<subsys>/filter
events/kprobes/<evname>/enable
- 1/0 to enable/disable
specific kprobe
events/kprobes/<evname>/format
- lists event variable
names and formatting data
events/kprobes/<evname>/filter
- similar to
events/<subsys>/<event>/filter
events/kprobes/<evname>/trigger
- similar to
events/<subsys>/<event>/trigger
snapshot
- take/read snapshot of
trace
echo 1 > snapshot
// take
snapshot of trace
cat snapshot
// use as trace
echo 0 > snapshot
// free snapshot
instances
- parallel tracing: multiple configs of traces
mkdir instances/foo
//
creates dir similar to debug/tracing
... config trace in instances/foo ...
mkdir instances/bar
... config trace in instances/bar ...
trace_options
- what and how to output to the trace,
see
https://www.kernel.org/doc/Documentation/trace/ftrace.txt
options/<optname>
echo stacktrace >trace_options
echo nostacktrace >trace_options
trace_clock
- clock used to timestamp the events
local: |
Per cpu clock but may not be synced across CPUs |
global: |
Synced across CPUs but slows tracing down |
counter: |
Not a clock, but just an incremented counter, sync
across CPUs |
uptime: |
Jiffy counter from time of boot |
perf: |
Same clock that perf
events use |
x86-tsc: |
TSC cycle counter |
Function graph tracer columns:
cpu - duration - marks - code
- can filter by-cpu with grep or use
per_cpu/cpu<n>/trace
- duration is reported inline for leaf functions,
for composite functions reported at the line of closing } symbol
- marks: "+"
means > 10 us
"!" means > 100 us
echo
funcgraph-tail > trace_options //
display func-name after
the closing bracket
To dump ftrace buffer to console on oops:
echo 50 >
buffer_size_kb //
limit the size of the dump
echo 1 >
/proc/sys/kernel/ftrace_dump_on_oops
or boot with:
ftrace_dump_on_oops=1
and optional
selection of:
ftrace=[tracer]
ftrace_filter=[function
list]
ftrace_notrace=[function
list]
ftrace_graph_filter=[function
list]
ftrace_graph_notrace=[function
list]
trace_buf_size=nn[KMG]
trace_event=[event-list]
trace_options=[option-list]
// e.g. trace_options=stacktrace
stacktrace
stacktrace_filter=[function
list]
tp_printk
traceoff_on_warning
To reset ftrace:
Command-line tool to control ftrace:
trace-cmd
http://lwn.net/Articles/410200
http://events.linuxfoundation.org/slides/2010/linuxcon_japan/linuxcon_jp2010_rostedt.pdf
trace-cmd list
trace-cmd record -e sched ls -ltr /usr > /dev/null
trace-cmd record -o func.dat -p function ls -ltr /usr > /dev/null
trace-cmd record -o fgraph.dat -p function_graph ls -ltr /usr >
/dev/null
trace-cmd record -o fgraph-events.dat -e sched -p function_graph ls
-ltr /usr > /dev/null
trace-cmd record -e sched_switch -f 'prev_prio < 100'
trace-cmd record -p function_graph -O nograph-time
trace-cmd record -p function_graph -g sys_read
trace-cmd record -p function_graph -l do_IRQ -l timer_interrupt
trace-cmd record -p function_graph -n '*lock*'
trace-cmd record -p function_graph -l do_IRQ -e irq_handler_entry
trace-cmd report -i func.dat
trace-cmd hist [-P] [-i func.dat] //
histogram
trace-cmd reset
KernelShark (GUI viewer for trace-cmd files)
https://lwn.net/Articles/425583
http://events.linuxfoundation.org/slides/2010/linuxcon_japan/linuxcon_jp2010_rostedt.pdf
kernelshark [func.dat]
Install perf
for a stock kernel:
apt-get install
linux-tools-common
apt-get
install
linux-tools-generic
Build perf
for custom kernel (Ubuntu):
First, under standard
distro kernel:
apt-get
linux-tools (to install man pages etc.)
Then:
apt-get install flex bison
apt-get install
libelf-dev libaudit-dev libnuma-dev libslang2-dev libdw-dev
binutils-dev
apt-get install
libunwind8-dev perl-modules libperl-dev asciidoc xmlto libiberty-dev
apt-get install
python-dev libpython-dev
cd
LINUX_ROOT/tools/perf
make
mkdir
/usr/lib/linux-tools/`uname -r`
install perf
/usr/lib/linux-tools/`uname -r`
or: make
prefix=/myown install
Build perf
for custom kernel (Fedora):
dnf install -y perf
dnf install -y gcc glibc-devel glibc-headers
dnf install -y binutils-devel elfutils-libelf-devel elfutils-devel libunwind-devel
dnf install -y audit-libs-devel slang-devel gtk2-devel
dnf install -y python-devel perl-devel perl-ExtUtils-Embed numactl-devel
dnf install -y asciidoc xmlto flex bison
cd
LINUX_ROOT/tools/perf
make
install into /usr/bin
Note: kernel traces may be incomplete if
frame pointer omitted,
so for profiling may want to
use CONFIG_FRAME_POINTER=y
Also enable CONFIG_KPROBES=y, CONFIG_KPROBE_EVENTS=y
http://perf.wiki.kernel.org/index.php/Tutorial
http://perf.wiki.kernel.org/index.php/Perf_examples
http://www.brendangregg.com/perf.html
http://lwn.net/Articles/353295
http://www.brendangregg.com/perf.html#OneLiners
http://www.brendangregg.com/perf.html#Links
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation/examples.txt
Perf
commands |
perf |
list |
list available events |
stat |
measure total event count for a single program or for
system for some time |
top |
top-like dynamic view of hottest functions |
record |
run a command and record data to a file |
report |
analyze file generated by perf record; can generate
flat, or graph profile |
annotate |
annotate sources or assemembly |
sched |
trace/measure scheduler actions and latencies |
mem |
profile/display memory accesses |
lock |
analyze lock events |
timechart |
visualize system behavior during a workload (CPU, disk,
network IO) |
kmem |
trace kernel memory usage (SLAB and page allocators) |
kvm |
trace/measure KVM host and guest statistics |
probe |
define new dynamic tracepoints (functions, source code
lines etc.) |
inject |
add additional info to the events stream |
bench |
execute benchmark suites |
script |
dump perf.dat |
script-python |
process
perf.dat
with Python script |
script-perl |
process
perf.dat
with Perl script |
Examples:
perf list
perf stat -e cpu-clock sleep 1
perf top
Listing all currently known events:
perf list
Listing sched tracepoints:
perf list 'sched:*'
Event statistics:
Example commands:
<cmd> = dd if=/dev/zero of=/dev/null
count=1000000
<cmd> = sleep 3
perf stat -B
<cmd>
perf stat -e cycles <cmd>
perf stat -e cycles:u <cmd>
perf stat -e cycles:k <cmd>
perf stat -e instructions:u <cmd>
perf stat -e instructions:k <cmd>
u - user
k - kernel
h - hypervisor
H - host machine
G - guest machine
perf stat -a
<cmd>
# for
all CPUs
Profiling:
perf record [-a] [-F 1000]
<cmd>
# on
all CPUs, 1000 samples/sec
perf report [-k /foo/vmlinux]
--sort=CPU
# break-up by CPU
Call graph:
perf record [...] -ag <cmd>
perf report -g --stdio
How many kmallocs are executed during "sleep 10"?
perf stat -a -e kmem:kmalloc sleep 10
perf record -a -e kmem:kmalloc sleep 10
perf report
https://lwn.net/Articles/346470
https://www.kernel.org/doc/Documentation/trace/tracepoint-analysis.txt
Eclipse plugin: http://www.eclipse.org/linuxtools/projectPages/perf
Visualization:
Example one-liners
http://www.brendangregg.com/perf.html#OneLiners
Listing Events
# Listing all currently known events:
perf list
# Listing sched tracepoints:
perf list 'sched:*'
# Listing sched tracepoints (older syntax):
perf list -e 'sched:*'
Counting Events
# CPU counter statistics for the specified command:
perf stat command
# Detailed CPU counter statistics (includes extras) for the specified command:
perf stat -d command
# CPU counter statistics for the specified PID, until Ctrl-C:
perf stat -p PID
# CPU counter statistics for the entire system, for 5 seconds:
perf stat -a sleep 5
# Various basic CPU statistics, system wide, for 10 seconds:
perf stat -e cycles,instructions,cache-references,cache-misses,bus-cycles -a sleep 10
# Various CPU level 1 data cache statistics for the specified command:
perf stat -e L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores command
# Various CPU data TLB statistics for the specified command:
perf stat -e dTLB-loads,dTLB-load-misses,dTLB-prefetch-misses command
# Various CPU last level cache statistics for the specified command:
perf stat -e LLC-loads,LLC-load-misses,LLC-stores,LLC-prefetches command
# Count system calls for the specified PID, until Ctrl-C:
perf stat -e 'syscalls:sys_enter_*' -p PID
# Count system calls for the entire system, for 5 seconds:
perf stat -e 'syscalls:sys_enter_*' -a sleep 5
# Count scheduler events for the specified PID, until Ctrl-C:
perf stat -e 'sched:*' -p PID
# Count scheduler events for the specified PID, for 10 seconds:
perf stat -e 'sched:*' -p PID sleep 10
# Count ext4 events for the entire system, for 10 seconds:
perf stat -e 'ext4:*' -a sleep 10
# Count block device I/O events for the entire system, for 10 seconds:
perf stat -e 'block:*' -a sleep 10
# Show system calls by process, refreshing every 2 seconds:
perf top -e raw_syscalls:sys_enter -ns comm
Profiling
# Sample on-CPU functions for the specified command, at 99 Hertz:
perf record -F 99 command
# Sample on-CPU functions for the specified PID, at 99 Hertz, until Ctrl-C:
perf record -F 99 -p PID
# Sample on-CPU functions for the specified PID, at 99 Hertz, for 10 seconds:
perf record -F 99 -p PID sleep 10
# Sample CPU stack traces for the specified PID, at 99 Hertz, for 10 seconds:
perf record -F 99 -p PID -g -- sleep 10
# Sample CPU stack traces for the PID, using dwarf to unwind stacks, at 99 Hertz, for 10 seconds:
perf record -F 99 -p PID -g dwarf sleep 10
# Sample CPU stack traces for the entire system, at 99 Hertz, for 10 seconds:
perf record -F 99 -ag -- sleep 10
# Sample CPU stack traces for the entire system, with dwarf stacks, at 99 Hertz, for 10 seconds:
perf record -F 99 -ag dwarf sleep 10
# Sample CPU stack traces, once every 10,000 Level 1 data cache misses, for 5 seconds:
perf record -e L1-dcache-load-misses -c 10000 -ag -- sleep 5
# Sample CPU stack traces, once every 100 last level cache misses, for 5 seconds:
perf record -e LLC-load-misses -c 100 -ag -- sleep 5
# Sample on-CPU kernel instructions, for 5 seconds:
perf record -e cycles:k -a -- sleep 5
# Sample on-CPU user instructions, for 5 seconds:
perf record -e cycles:u -a -- sleep 5
# Sample on-CPU instructions precisely (using PEBS), for 5 seconds:
perf record -e cycles:p -a -- sleep 5
# Perform branch tracing (needs HW support), for 1 second:
perf record -b -a sleep 1
Static Tracing
# Trace new processes, until Ctrl-C:
perf record -e sched:sched_process_exec -a
# Trace all context-switches, until Ctrl-C:
perf record -e context-switches -a
# Trace all context-switches with stack traces, until Ctrl-C:
perf record -e context-switches -ag
# Trace all context-switches with stack traces, for 10 seconds:
perf record -e context-switches -ag -- sleep 10
# Trace CPU migrations, for 10 seconds:
perf record -e migrations -a -- sleep 10
# Trace all connect()s with stack traces (outbound connections), until Ctrl-C:
perf record -e syscalls:sys_enter_connect -ag
# Trace all accepts()s with stack traces (inbound connections), until Ctrl-C:
perf record -e syscalls:sys_enter_accept* -ag
# Trace all block device (disk I/O) requests with stack traces, until Ctrl-C:
perf record -e block:block_rq_insert -ag
# Trace all block device issues and completions (has timestamps), until Ctrl-C:
perf record -e block:block_rq_issue -e block:block_rq_complete -a
# Trace all block completions, of size at least 100 Kbytes, until Ctrl-C:
perf record -e block:block_rq_complete --filter 'nr_sector > 200'
# Trace all block completions, synchronous writes only, until Ctrl-C:
perf record -e block:block_rq_complete --filter 'rwbs == "WS"'
# Trace all block completions, all types of writes, until Ctrl-C:
perf record -e block:block_rq_complete --filter 'rwbs ~ "*W*"'
# Trace all minor faults (RSS growth) with stack traces, until Ctrl-C:
perf record -e minor-faults -ag
# Trace all page faults with stack traces, until Ctrl-C:
perf record -e page-faults -ag
# Trace all ext4 calls, and write to a non-ext4 location, until Ctrl-C:
perf record -e 'ext4:*' -o /tmp/perf.data -a
# Trace kswapd wakeup events, until Ctrl-C:
perf record -e vmscan:mm_vmscan_wakeup_kswapd -ag
Dynamic Tracing
# Add a tracepoint for the kernel tcp_sendmsg() function entry ("--add" is optional):
perf probe --add tcp_sendmsg
# Remove the tcp_sendmsg() tracepoint (or use "--del"):
perf probe -d tcp_sendmsg
# Add a tracepoint for the kernel tcp_sendmsg() function return:
perf probe 'tcp_sendmsg%return'
# Show available variables for the kernel tcp_sendmsg() function (needs debuginfo):
perf probe -V tcp_sendmsg
# Show available variables for the kernel tcp_sendmsg() function, plus external vars (needs debuginfo):
perf probe -V tcp_sendmsg --externs
# Show available line probes for tcp_sendmsg() (needs debuginfo):
perf probe -L tcp_sendmsg
# Show available variables for tcp_sendmsg() at line number 81 (needs debuginfo):
perf probe -V tcp_sendmsg:81
# Add a tracepoint for tcp_sendmsg(), with three entry argument registers (platform specific):
perf probe 'tcp_sendmsg %ax %dx %cx'
# Add a tracepoint for tcp_sendmsg(), with an alias ("bytes") for the %cx register (platform specific):
perf probe 'tcp_sendmsg bytes=%cx'
# Trace previously created probe when the bytes (alias) variable is greater than 100:
perf record -e probe:tcp_sendmsg --filter 'bytes > 100'
# Add a tracepoint for tcp_sendmsg() return, and capture the return value:
perf probe 'tcp_sendmsg%return $retval'
# Add a tracepoint for tcp_sendmsg(), and "size" entry argument (reliable, but needs debuginfo):
perf probe 'tcp_sendmsg size'
# Add a tracepoint for tcp_sendmsg(), with size and socket state (needs debuginfo):
perf probe 'tcp_sendmsg size sk->__sk_common.skc_state'
# Tell me how on Earth you would do this, but don't actually do it (needs debuginfo):
perf probe -nv 'tcp_sendmsg size sk->__sk_common.skc_state'
# Trace previous probe when size is non-zero, and state is not TCP_ESTABLISHED(1) (needs debuginfo):
perf record -e probe:tcp_sendmsg --filter 'size > 0 && skc_state != 1' -a
# Add a tracepoint for tcp_sendmsg() line 81 with local variable seglen (needs debuginfo):
perf probe 'tcp_sendmsg:81 seglen'
# Add a tracepoint for do_sys_open() with the filename as a string (needs debuginfo):
perf probe 'do_sys_open filename:string'
# Add a tracepoint for myfunc() return, and include the retval as a string:
perf probe 'myfunc%return +0($retval):string'
# Add a tracepoint for the user-level malloc() function from libc:
perf probe -x /lib64/libc.so.6 malloc
# List currently available dynamic probes:
perf probe -l
Reporting
# Show perf.data in an ncurses browser (TUI) if possible:
perf report
# Show perf.data with a column for sample count:
perf report -n
# Show perf.data as a text report, with data coalesced and percentages:
perf report --stdio
# List all raw events from perf.data:
perf script
# List all raw events from perf.data, with customized fields:
perf script -f time,event,trace
# Dump raw contents from perf.data as hex (for debugging):
perf script -D
# Disassemble and annotate instructions with percentages (needs some debuginfo):
perf annotate --stdio
Documentation: https://sourceware.org/systemtap/documentation.html
Eclipse plugin: http://www.eclipse.org/linuxtools/projectPages/systemtap
Plugin docs: http://help.eclipse.org
-> SystemTap User Guide
To build:
CONFIG_DEBUG_INFO
CONFIG_KPROBES
CONFIG_RELAY
CONFIG_DEBUG_FS
CONFIG_MODULES
CONFIG_MODULE_UNLOAD
// CONFIG_UTRACE (no longer needed in recent kernels)
kernel: make headers_install
Ubuntu: apt-get install systemtap systemtap-runtime
Fedora: dnf install -y systemtap systemtap-runtime
For stock kernels (Fedora):
dnf install -y kernel-debuginfo kernel-devel kernel-debuginfo-common-<arch>
e.g. kernel-debuginfo-2.6.32-53.el6.i686.rpm
kernel-devel-2.6.32-53.el6.i686.rpm
kernel-debuginfo-common-i686-2.6.32-53.el6.i686.rpm
// yum install kernel-devel yum-utils debuginfo-install kernel
Ubuntu:
apt-get install elfutils libelf1 libelf-dev libdw1 libdw-dev
apt-get install latex2html libnss3 libnss3-dev libnss3-tools
Fedora:
dnf install -y kernel-devel yum-utils kernel
dnf install -y elfutils elfutils-libelf elfutils-devel
dnf install -y latex2html nss-tools nss-devel libvirt-devel
openSUSE:
zypper install -y systemtap systemtap-runtime
zypper install -y kernel-devel
zypper install -y elfutils libelf0 libelf1 libelf-devel
zypper install -y libebl1 libebl-devel libdw-devel
zypper install -y latex2html libvirt-devel
zypper install -y mozilla-nss mozilla-nss-tools mozilla-nss-devel
git clone git://sourceware.org/git/systemtap.git
https://sourceware.org/systemtap/ftp/releases
./configure --prefix=/usr
make [-j8] all
sudo make install
Ubuntu:
adduser sergey stapusr
adduser sergey stapsys
adduser sergey stapdev
Fedora:
usermod -a --groups stapusr sergey
usermod -a --groups stapsys sergey
usermod -a --groups stapdev sergey
openSUSE:
groupadd -r stapusr
groupadd -r stapsys
groupadd -r stapdev
usermod -a --groups stapusr sergey
usermod -a --groups stapsys sergey
usermod -a --groups stapdev sergey
re-login
Quick test:
stap -ve 'probe begin { log("hello world") exit ()
}'
stap -c df -e 'probe syscall.* { if (target()==pid()) log(name."
".argstr) }'
http://lttng.org
https://wiki.eclipse.org/Trace_Compass
oprofile is a legacy tool
In Fedora 22 and recent openSUSE:
Convential oprofile (opcontrol) is no longer available.
Distributed version of oprofile (dnf install -y oprofile) is just a wrapper around perf and has ainterface different from conventional oprofile.
Refer to Fedora System Adminisration Guide.
In Ubuntu:
apt-get
install oprofile
build kernel with
CONFIG_PROFILING=y
CONFIG_OPROFILE=y
boot with idle=poll
to disable cpu sleep, and
to force correct counting of CPU_CLK_UNHALTED etc. cycles
othwerwise profile will not reflect the actual time spent halted
caution: CPU will run hot
Capture data:
opcontrol
--reset
rm -r /var/lib/oprofile
opcontrol --setup --vmlinux=/foo/vmlinux
--event=RETIRED_INSNS:100000 # profile based on
retired instructions
opcontrol --setup --vmlinux=/foo/vmlinux
--event=CPU_CYCLES:1500000 #
profile based on CPU cycles
# can add --kernel-range=0xc00xxxxx,0xc0xxxxxx
# obtain addresses with
# cat /proc/kallsyms | grep "
_text"
# cat /proc/kallsyms | grep "
_etext"
opcontrol --start-daemon
opcontrol --reset
opcontrol --start
... do something ...
opcontrol --stop
[... repeat ...]
opcontrol --shutdown
... analyze ...
opcontrol --reset
rm -r /var/lib/oprofile
Analyze:
op_time -l
oprofpp -l -i
/foo/vmlinux
op_to_source -a -i
/foo/vmlinux
Eclipse plugin: http://www.eclipse.org/linuxtools/projectPages/oprofile
#
perf list | grep sched:
[...]
sched:sched_stat_wait
[Tracepoint event]
sched:sched_stat_sleep
[Tracepoint event]
sched:sched_stat_iowait
[Tracepoint event]
http://www.iometer.org
http://glandium.org/blog/?p=1476
blktrace:
blktrace -d /dev/sdc2 -o - |
blkparse -i -
https://lwn.net/Articles/608497
http://www.brendangregg.com/blog/2014-07-16/iosnoop-for-linux.html
https://raw.githubusercontent.com/brendangregg/perf-tools/master/iosnoop
"Exploring
the Linux Storage Path - Tracing block I/O kernel events"
LINUX_ROOT/include/trace/events/block.h