Static tracepoints

Code sample: LINUX_ROOT/samples/trace_events

https://www.kernel.org/doc/Documentation/trace/tracepoints.txt (obsolete)
http://lwn.net/Articles/379903
http://lwn.net/Articles/381064
http://lwn.net/Articles/383362
https://www.kernel.org/doc/htmldocs/tracepoint/index.html

Declare single tracepoint:

TRACE_EVENT(event_name, proto, args, struct, assign, print)

event_name - tracepoint (event) name
prototype - args declaration
args - args list
struct - describe the structure to be stored in the tracer ring buffer
assign - assignment statements for the struct
print - formatting of the struct
   
Declare multiple tracepoints with the same protoargsstructassign and print.
Saves data space.
   
DECLARE_EVENT_CLASS(template_name, proto, args, struct, assign, print);
DECLARE_EVENT(template_name, event1_name, proto, args);
...
DECLARE_EVENT(template_name, event2_name, proto, args);


Conditional tracepoint:

TRACE_EVENT_CONDITION(name, proto, args, cond, struct, assign, print)
or: DEFINE_EVENT_CONDITION(template, name, proto, args, cond)

for cond, use TP_CONDITION(boolean-expression)
   
Header file, typically located in include/trace/events

#undef TRACE_SYSTEM
#define TRACE_SYSTEM mysubsys

#if !defined(_TRACE_MYSUBSYS_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_MYSUBSYS_H

#include <linux/tracepoint.h>
  
TRACE_EVENT(mysubsys_myevent,   

    TP_PROTO(struct rq *rq, struct task_struct *prev, struct task_struct *next),

    TP_ARGS(rq, prev, next),
   
    TP_STRUCT__entry(
        __array(    char,    prev_comm,    TASK_COMM_LEN    )
        __field(    pid_t,    prev_pid            )
        __field(    int,    prev_prio            )
        __field(    long,    prev_state            )
        __array(    char,    next_comm,    TASK_COMM_LEN    )
        __field(    pid_t,    next_pid            )
        __field(    int,    next_prio            )
    ),
    // __field  =>  scalar member
    // __array  =>  static array member
    // __string =>  nul-terminated variable-length string
    // __dynamic_array => dynamically-sized array
    // __field_struct => struct or union
    // __bitmask => array of longs, with bits
    // see more in samples/trace_event/*.h
    // checkpatch.pl will complain of spacing, that's ok
   
    TP_fast_assign(
        memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
        __entry->prev_pid    = prev->pid;
        __entry->prev_prio    = prev->prio;
        __entry->prev_state    = prev->state;
        memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
        __entry->next_pid    = next->pid;
        __entry->next_prio    = next->prio;
    ),
    // __assign_str  => copy variable-length string
    // __get_dynamic_array  =>  to access dynamically-sized array for copying (e.g. memcpy)
    // __assign_bitmask

    TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> " \
           "next_comm=%s next_pid=%d next_prio=%d",
        __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
        __entry->prev_state ?
          __print_flags(__entry->prev_state, "|",
                { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
                { 16, "Z" }, { 32, "X" }, { 64, "x" },
                { 128, "W" }) : "R",
        __entry->next_comm, __entry->next_pid, __entry->next_prio)
    // __get_str  =>  get variable-length string
    // __get_dynamic_array, __get_dynamic_array_len => get dynamically-sized array
    // __print_symbolic(val, values)  =>  convert selector value to string
    // __get_bitmask
    // __print_array
    // see more in samples/trace_event/*.h
);       

#endif /* _TRACE_MYSUBSYS_H */

/* This part must be outside protection */

// If header file is located in local directory (i.e. not include/trace/events), then do:
// #undef TRACE_INCLUDE_PATH
// #undef TRACE_INCLUDE_FILE
// #define TRACE_INCLUDE_PATH .
// #define TRACE_INCLUDE_FILE mysubsys     // if the same as TRACE_SYSTEM, then may be omitted
// then also add to Makefile:    EXTRA_CFLAGS = -I$(src)
// or:                           CFLAGS_xxx.o = -I$(src)

#include <trace/define_trace.h>


Source file (use of tracepoint)

// declare CREATE_TRACE_POINTS only in one source file,
// otherwise will generate duplicate tracepoint routines/data,
// and linker will fail
#define CREATE_TRACE_POINTS
#include <trace/events/mysubsys.h>

... code ...
trace_mysubsys_myevent(rq, prev, next);
... code ...
if (trace_mysubsys_myevent_enabled()) {
    ... do some calculations ...
    trace_mysubsys_myevent(rq, prev, next);
}

In loadable module:

EXPORT_TRACEPOINT_SYMBOL(mysubsys_myevent);
EXPORT_TRACEPOINT_SYMBOL_GPL(mysubsys_myevent);

// on load:
register_trace_mysubsys_myevent(###...);        // obsolete?

// on unload:
unregister_trace_mysubsys_myevent(###...);    // obsolete?
tracepoint_synchronize_unregister();



Declares:

trace_mysubsys_myevent(struct rq *rq, struct task_struct *prev, struct task_struct *next);
              
struct {
          char   prev_comm[TASK_COMM_LEN];
          pid_t  prev_pid;
          int    prev_prio;
          long   prev_state;
          char   next_comm[TASK_COMM_LEN];
          pid_t  next_pid;
          int    next_prio;
};              
   
Creates:

/sys/kernel/debug/tracing/events/sched/mysubsys_myevent/...

enable    => 0/1
format    => lists variable names and formats
filter    => filter expression (see below)
trigger => trigger operation (see below)
   
To use:

cd /sys/kernel/debug/tracing
echo 1 > events/mysubsys/myevent/enable
... do something ...
cat trace

cd /sys/kernel/debug/tracing
echo mysubsys_myevent other1 other2 > set_event
... do something ...
cat trace


ftrace

https://www.kernel.org/doc/Documentation/trace/ftrace.txt
https://lwn.net/Articles/370423

mem events: https://www.kernel.org/doc/Documentation/trace/events-kmem.txt
NMI events: https://www.kernel.org/doc/Documentation/trace/events-nmi.txt
power events: https://www.kernel.org/doc/Documentation/trace/events-power.txt

https://www.kernel.org/doc/Documentation/trace/tracepoint-analysis.txt

https://lwn.net/Articles/365835
https://lwn.net/Articles/370423

Under /sys/kernel/debug/tracing

README => brief help

tracing_on - 1/0 to enable/disable tracing

trace - contents of the buffer, immutable on reads, to clear: echo > trace
trace_pipe - contents of the buffer, clears when read

see format in https://www.kernel.org/doc/Documentation/trace/ftrace.txt

buffer_size_kb - size of trace buffer per cpu
buffer_total_size_kb - total size of trace buffers for all cpus

current_tracer - currently selected function and latency tracer
available_tracers - available tracers (blk function_graph wakeup_dl wakeup_rt wakeup function nop etc.)

function function entry (who calls whom)
function_graph function entry & exit (nested call tree)
blk block io (see blktrace
irqsoff how long interrupts are disabled (requires CONFIG_IRQSOFF_TRACER),
longest interrupt-disabled section (see tracing_max_latency)
preemptoff how long preemption is disabled (requires CONFIG_PREEMPT_TRACER)
preemptirqsoff how long preemption and/or irqs are disabled
wakeup how long does it take for a process to run after being woken,
latency for highest-priority task to be scheduled after wakeup
wakeup_rt wakeup-to-sched latency just for RT tasks
wakeup_dl wakeup-to-sched latency just for DL tasks
nop remove all tracers

max_graph_depth - limit depth of nested calls in function graph (0 = unlimited)

saved_cmdlines - pid to command line
saved_cmdlines_size - number of entries (lines) in saved_cmdlines

trace_marker - userland can add to the trace by writing here
tracing_cpumask - mask of CPUs to trace

options/<name> - options to control tracers
trace_options

available_events - list of available events (subsys:eventname)

set_event - event selector (can also use events/.../enable)

to enable:             echo 'subsys:eventname' > set_event
                            echo eventname > set_event

to disable:             echo '!subsys:eventname' > set_event
                            echo '!eventname' > set_event

to enable all:             echo '*:*' >set_event
to enable subsys:      echo 'subsys:*' >set_event
to disable all:            echo >set_event

can also specify on boot:  parameter:  trace_event=s1:e1,s2:e2,...,sx:ex

https://www.kernel.org/doc/Documentation/trace/events.txt

events/enable - enable all events (0/1)
events/<subsys>/enable - enable all events in sybsystem
events/<subsys>/filter - filter for the events (see <event>/filter below, but may use only vars defined by all events)
events/<subsys>/<event>/enable - enable event (0/1)
events/<subsys>/<event>/format - lists event variable names (from struct) and formatting data
events/<subsys>/<event>/filter - filter for the event

see more in https://www.kernel.org/doc/Documentation/trace/events.txt

see variable names in format

numeric ops:        ==, !=, <, <=, >, >=, &
string ops:             ==, !=, ~ (~ is limited pattern matching: "xxx*", "*xxx", "*xxx*")
boolean:               (expr1) && ((expr2) || (expr3))
sample:                 echo "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter

to clear:                 echo 0 > filter

events/<subsys>/<event>/trigger - command to perform on event

see more in https://www.kernel.org/doc/Documentation/trace/events.txt

Format: <trigger>[:count][if <filter>]

trigger:
traceon, traceoff
enable_event:<system>:<event>
disable_event:<system>:<event>
stacktrace
snapshot

example:       echo traceoff > events/block/block_unplug/trigger
                  echo traceoff:3 > events/block/block_unplug/trigger
                    echo 'enable_event:kmem:kmalloc:3 if nr_rq > 1' >events/block/block_unplug/trigger

The first disables tracing every time block_unplug is hit.
The second disables tracing the first 3 times block_unplug is hit.
The third enables the kmalloc event the first 3 times block_unplug is hit and has value of greater than 1 for the 'nr_rq' event field.

Like function triggers, the counter is only decremented if it enabled or disabled tracing.

To remove a trigger without a count:    echo '!<trigger> > <system>/<event>/trigger
To remove a trigger with a count:         echo '!<trigger>:0 > <system>/<event>/trigger

Filters can be ignored when removing a trigger.

example:   echo 'stacktrace:5 if bytes_req >= 65536' > /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger

uprobe_events - probes in userspace, see https://www.kernel.org/doc/Documentation/trace/uprobetracer.txt
uprobe_profile

available_filter_functions - list of functions that can be filtered on with set_ftrace_filter/set_ftrace_notrace/set_graph_function/set_graph_notrace

enabled_functions - list of hookable functions

set_ftrace_filter - for function tracer: trace only listed functions
                               also can be used with function_graph to force function as a leaf one, and display time for it

accepts:  func_full_name, *func_end, func_begin*, *func_middle*
modules:  Can select a group via module
Format:  :mod:<module-name>
example:  echo :mod:ext3 > set_ftrace_filter
to remove function(s):  echo '!*xyz*' >> set_ftrace_filter
triggers:  a command to perform when function is hit
Format:  <function>:<trigger>[:count]
trigger: traceon, traceoff
enable_event:<system>:<event>
disable_event:<system>:<event>
stacktrace
snapshot
dump
cpudump
example: echo do_fault:traceoff > set_ftrace_filter
echo do_trap:traceoff:3 > set_ftrace_filter
The first one will disable tracing every time do_fault is hit
The second will disable tracing at most 3 times when do_trap is hit

To remove trigger without count:
echo '!<function>:<trigger> > set_ftrace_filter

To remove trigger with a count:
echo '!<function>:<trigger>:0 > set_ftrace_filter

set_ftrace_notrace - for function tracer: list functions to never trace

accepts: func_full_name, *func_end, func_begin*, *func_middle*
modules: Can select a group via module command :mod:
Does not accept triggers

Example:

echo '*lock*' > set_ftrace_notrace
cat set_ftrace_notrace | head -10

set_ftrace_pid - for function tracer: trace only this pid

Helper script:

#!/bin/sh
DEBUGFS=`grep debugfs /proc/mounts | awk '{ print $2; }'`
echo $$ > $DEBUGFS/tracing/set_ftrace_pid
echo function > $DEBUGFS/tracing/current_tracer
exec $*


set_graph_function - for function_graph tracer: trace only listed functions (similar to set_ftrace_filter)
set_graph_notrace - for function_graph tracer: list functions to never trace (similar to set_ftrace_notrace)

Example:

echo SyS_read SyS_write > set_graph_function
echo SyS_open >> set_graph_function
echo function_graph > current_tracer

options/func_stack_trace - trace stack and task id

Example: what calls a specific function (task id & stack trace)

echo function > current_tracer
echo kfree > set_ftrace_filter
echo 1 > options/func_stack_trace
... do something ...
echo 0 > options/func_stack_trace
echo > set_ftrace_filter

function_profile_enabled - function profiling
trace_stat/function<CPU>

Example (including preempted time):

echo nop > current_tracer
echo 1 > function_profile_enabled
cat trace_stat/function0 | head


Example (excluding preempted time):

echo 0 > options/sleep-time
echo 1 > function_profile_enabled
cat trace_stat/function0 | head


Example (excluding time for child function, only measure the function body):

echo 0 > options/graph-time
echo 1 > function_profile_enabled
cat trace_stat/function0 | head


Reset data:

echo 0 > function_profile_enabled
echo 1 > function_profile_enabled


echo 1 > /proc/sys/kernel/stack_tracer_enabled - enables stack tracing
(also "stacktrace" command in triggers/filters)

cat stack_max_size - max stack usage
cat stack_trace - stack trace at max stack usage point

stack_trace_filter - similar to set_ftrace_filter but limits what stacktrace traces

per_cpu/cpu<n>/trace - per-cpu version of trace
per_cpu/cpu<n>/trace_pipe - per-cpu version of trace_pipe
per_cpu/cpu<n>/snapshot - similar to regular snapshot

tracing_max_latency - some tracers will record max. latency here (irqsoff, preemptoff, preemptirqsoff, wakeup, wakeup_rt, wakeup_dl)
tracing_thresh - will record latency only if it exceeds @tracing_thresh usec

kprobe_events - kprobes to install/intercept as events (use echo '...' >> to add)

probe:              p:ev1 symbol (can also specify module, offset, addr, and data to add to the event)
return probe:     r:grp2/ev2 symbol
                        r:grp2/ev3 symbol $retval (records return value)
to clear:             -:ev1
                        -:grp2/ev2
                        -:grp2/ev3
                        blank (clear all)

https://www.kernel.org/doc/Documentation/trace/kprobetrace.txt

events/kprobes/enable - 1/0 to enable/disable all kprobes
events/kprobes/filter - similar to events/<subsys>/filter
events/kprobes/<evname>/enable - 1/0 to enable/disable specific kprobe
events/kprobes/<evname>/format - lists event variable names and formatting data
events/kprobes/<evname>/filter - similar to events/<subsys>/<event>/filter
events/kprobes/<evname>/trigger - similar to events/<subsys>/<event>/trigger

https://github.com/brendangregg/perf-tools/blob/master/kernel/kprobe

snapshot - take/read snapshot of trace

echo 1 > snapshot         // take snapshot of trace
cat snapshot                  // use as trace
echo 0 > snapshot         // free snapshot

instances - parallel tracing: multiple configs of traces

mkdir instances/foo         // creates dir similar to debug/tracing
... config trace in instances/foo ...
mkdir instances/bar
... config trace in instances/bar ...

trace_options - what and how to output to the trace,
                          see https://www.kernel.org/doc/Documentation/trace/ftrace.txt
options/<optname>

echo stacktrace >trace_options
echo nostacktrace >trace_options


trace_clock - clock used to timestamp the events

local:  Per cpu clock but may not be synced across CPUs
global: Synced across CPUs but slows tracing down
counter: Not a clock, but just an incremented counter, sync across CPUs
uptime: Jiffy counter from time of boot
perf: Same clock that perf events use
x86-tsc: TSC cycle counter
           

Function graph tracer columns:

cpu - duration - marks - code
echo funcgraph-tail > trace_options // display func-name after the closing bracket

To dump ftrace buffer to console on oops:

echo 50 > buffer_size_kb     // limit the size of the dump
echo 1 > /proc/sys/kernel/ftrace_dump_on_oops

or boot with:

ftrace_dump_on_oops=1

and optional selection of:

ftrace=[tracer]
ftrace_filter=[function list]
ftrace_notrace=[function list]
ftrace_graph_filter=[function list]
ftrace_graph_notrace=[function list]
trace_buf_size=nn[KMG]
trace_event=[event-list]
trace_options=[option-list] // e.g. trace_options=stacktrace
stacktrace
stacktrace_filter=[function list]
tp_printk
traceoff_on_warning

To reset ftrace:

echo > current_tracer nop
echo > set_ftrace_filter
echo > set_graph_function
echo > set_ftrace_pid
echo 0 > events/enable
echo 0 > tracing_thresh
echo > kprobe_events
echo 1 > tracing_on


https://github.com/brendangregg/perf-tools/blob/master/tools/reset-ftrace

trace-cmd

Command-line tool to control ftrace:

trace-cmd

http://lwn.net/Articles/410200
http://events.linuxfoundation.org/slides/2010/linuxcon_japan/linuxcon_jp2010_rostedt.pdf

trace-cmd list

trace-cmd record -e sched ls -ltr /usr > /dev/null
trace-cmd record -o func.dat -p function ls -ltr /usr > /dev/null
trace-cmd record -o fgraph.dat -p function_graph ls -ltr /usr > /dev/null
trace-cmd record -o fgraph-events.dat -e sched -p function_graph ls -ltr /usr > /dev/null
trace-cmd record -e sched_switch -f 'prev_prio < 100'
trace-cmd record -p function_graph -O nograph-time
trace-cmd record -p function_graph -g sys_read
trace-cmd record -p function_graph -l do_IRQ -l timer_interrupt
trace-cmd record -p function_graph -n '*lock*'
trace-cmd record -p function_graph -l do_IRQ -e irq_handler_entry

trace-cmd report -i func.dat

trace-cmd hist [-P] [-i func.dat]    // histogram

trace-cmd reset
       
KernelShark  (GUI viewer for trace-cmd files)

https://lwn.net/Articles/425583
http://events.linuxfoundation.org/slides/2010/linuxcon_japan/linuxcon_jp2010_rostedt.pdf

kernelshark [func.dat]

perf

Install perf for a stock kernel:

apt-get install linux-tools-common
apt-get install linux-tools-generic

Build perf for custom kernel (Ubuntu):

First, under standard distro kernel:

apt-get linux-tools  (to install man pages etc.)

Then:

apt-get install flex bison
apt-get install libelf-dev libaudit-dev libnuma-dev libslang2-dev libdw-dev binutils-dev
apt-get install libunwind8-dev perl-modules libperl-dev asciidoc xmlto libiberty-dev
apt-get install python-dev libpython-dev

cd LINUX_ROOT/tools/perf
make
mkdir /usr/lib/linux-tools/`uname -r`
install perf /usr/lib/linux-tools/`uname -r`


or: make prefix=/myown install

Build perf for custom kernel (Fedora):

dnf install -y perf

dnf install -y gcc glibc-devel glibc-headers
dnf install -y binutils-devel elfutils-libelf-devel elfutils-devel libunwind-devel
dnf install -y audit-libs-devel slang-devel gtk2-devel
dnf install -y python-devel perl-devel perl-ExtUtils-Embed numactl-devel
dnf install -y asciidoc xmlto flex bison

cd LINUX_ROOT/tools/perf
make

install into /usr/bin

Note:     kernel traces may be incomplete if frame pointer omitted,
              so for profiling may want to use CONFIG_FRAME_POINTER=y

Also enable CONFIG_KPROBES=y, CONFIG_KPROBE_EVENTS=y

http://perf.wiki.kernel.org/index.php/Tutorial
http://perf.wiki.kernel.org/index.php/Perf_examples
http://www.brendangregg.com/perf.html
http://lwn.net/Articles/353295
http://www.brendangregg.com/perf.html#OneLiners
http://www.brendangregg.com/perf.html#Links
http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/tools/perf/Documentation/examples.txt

Perf commands
perf list list available events
stat measure total event count for a single program or for system for some time
top top-like dynamic view of hottest functions
record run a command and record data to a file
report analyze file generated by perf record; can generate flat, or graph profile
annotate annotate sources or assemembly
sched trace/measure scheduler actions and latencies
mem profile/display memory accesses
lock analyze lock events
timechart visualize system behavior during a workload (CPU, disk, network IO)
kmem trace kernel memory usage (SLAB and page allocators)
kvm trace/measure KVM host and guest statistics
probe define new dynamic tracepoints (functions, source code lines etc.)
inject add additional info to the events stream
bench execute benchmark suites
script dump perf.dat
script-python process perf.dat with Python script
script-perl process perf.dat with Perl script

Examples:

perf list
perf stat -e cpu-clock sleep 1
perf top

Listing all currently known events:

perf list

Listing sched tracepoints:

perf list 'sched:*'

Event statistics:

Example commands:

<cmd> = dd if=/dev/zero of=/dev/null count=1000000
<cmd> = sleep 3

perf stat -B <cmd>   
perf stat -e cycles <cmd>
perf stat -e cycles:u <cmd>
perf stat -e cycles:k <cmd>
perf stat -e instructions:u  <cmd>
perf stat -e instructions:k  <cmd>

u - user
k - kernel
h - hypervisor
H - host machine
G - guest machine

perf stat -a <cmd>    # for all CPUs

Profiling:

perf record [-a] [-F 1000] <cmd>   # on all CPUs, 1000 samples/sec
perf report [-k /foo/vmlinux]

--sort=CPU    # break-up by CPU
   
Call graph:

perf record [...] -ag <cmd>
perf report -g --stdio

How many kmallocs are executed during "sleep 10"?

perf stat -a -e kmem:kmalloc sleep 10

perf record -a -e kmem:kmalloc sleep 10
perf report

https://lwn.net/Articles/346470
https://www.kernel.org/doc/Documentation/trace/tracepoint-analysis.txt

Eclipse plugin: http://www.eclipse.org/linuxtools/projectPages/perf

Visualization:

http://www.brendangregg.com/flamegraphs.html
http://www.brendangregg.com/perf.html#FlameGraphs
http://www.brendangregg.com/perf.html#HeatMaps

Example one-liners
http://www.brendangregg.com/perf.html#OneLiners

Listing Events

# Listing all currently known events:
perf list

# Listing sched tracepoints:
perf list 'sched:*'

# Listing sched tracepoints (older syntax):
perf list -e 'sched:*'

Counting Events

# CPU counter statistics for the specified command:
perf stat command

# Detailed CPU counter statistics (includes extras) for the specified command:
perf stat -d command

# CPU counter statistics for the specified PID, until Ctrl-C:
perf stat -p PID

# CPU counter statistics for the entire system, for 5 seconds:
perf stat -a sleep 5

# Various basic CPU statistics, system wide, for 10 seconds:
perf stat -e cycles,instructions,cache-references,cache-misses,bus-cycles -a sleep 10

# Various CPU level 1 data cache statistics for the specified command:
perf stat -e L1-dcache-loads,L1-dcache-load-misses,L1-dcache-stores command

# Various CPU data TLB statistics for the specified command:
perf stat -e dTLB-loads,dTLB-load-misses,dTLB-prefetch-misses command

# Various CPU last level cache statistics for the specified command:
perf stat -e LLC-loads,LLC-load-misses,LLC-stores,LLC-prefetches command

# Count system calls for the specified PID, until Ctrl-C:
perf stat -e 'syscalls:sys_enter_*' -p PID

# Count system calls for the entire system, for 5 seconds:
perf stat -e 'syscalls:sys_enter_*' -a sleep 5

# Count scheduler events for the specified PID, until Ctrl-C:
perf stat -e 'sched:*' -p PID

# Count scheduler events for the specified PID, for 10 seconds:
perf stat -e 'sched:*' -p PID sleep 10

# Count ext4 events for the entire system, for 10 seconds:
perf stat -e 'ext4:*' -a sleep 10

# Count block device I/O events for the entire system, for 10 seconds:
perf stat -e 'block:*' -a sleep 10

# Show system calls by process, refreshing every 2 seconds:
perf top -e raw_syscalls:sys_enter -ns comm

Profiling

# Sample on-CPU functions for the specified command, at 99 Hertz:
perf record -F 99 command

# Sample on-CPU functions for the specified PID, at 99 Hertz, until Ctrl-C:
perf record -F 99 -p PID

# Sample on-CPU functions for the specified PID, at 99 Hertz, for 10 seconds:
perf record -F 99 -p PID sleep 10

# Sample CPU stack traces for the specified PID, at 99 Hertz, for 10 seconds:
perf record -F 99 -p PID -g -- sleep 10

# Sample CPU stack traces for the PID, using dwarf to unwind stacks, at 99 Hertz, for 10 seconds:
perf record -F 99 -p PID -g dwarf sleep 10

# Sample CPU stack traces for the entire system, at 99 Hertz, for 10 seconds:
perf record -F 99 -ag -- sleep 10

# Sample CPU stack traces for the entire system, with dwarf stacks, at 99 Hertz, for 10 seconds:
perf record -F 99 -ag dwarf sleep 10

# Sample CPU stack traces, once every 10,000 Level 1 data cache misses, for 5 seconds:
perf record -e L1-dcache-load-misses -c 10000 -ag -- sleep 5

# Sample CPU stack traces, once every 100 last level cache misses, for 5 seconds:
perf record -e LLC-load-misses -c 100 -ag -- sleep 5

# Sample on-CPU kernel instructions, for 5 seconds:
perf record -e cycles:k -a -- sleep 5

# Sample on-CPU user instructions, for 5 seconds:
perf record -e cycles:u -a -- sleep 5

# Sample on-CPU instructions precisely (using PEBS), for 5 seconds:
perf record -e cycles:p -a -- sleep 5

# Perform branch tracing (needs HW support), for 1 second:
perf record -b -a sleep 1

Static Tracing

# Trace new processes, until Ctrl-C:
perf record -e sched:sched_process_exec -a

# Trace all context-switches, until Ctrl-C:
perf record -e context-switches -a

# Trace all context-switches with stack traces, until Ctrl-C:
perf record -e context-switches -ag

# Trace all context-switches with stack traces, for 10 seconds:
perf record -e context-switches -ag -- sleep 10

# Trace CPU migrations, for 10 seconds:
perf record -e migrations -a -- sleep 10

# Trace all connect()s with stack traces (outbound connections), until Ctrl-C:
perf record -e syscalls:sys_enter_connect -ag

# Trace all accepts()s with stack traces (inbound connections), until Ctrl-C:
perf record -e syscalls:sys_enter_accept* -ag

# Trace all block device (disk I/O) requests with stack traces, until Ctrl-C:
perf record -e block:block_rq_insert -ag

# Trace all block device issues and completions (has timestamps), until Ctrl-C:
perf record -e block:block_rq_issue -e block:block_rq_complete -a

# Trace all block completions, of size at least 100 Kbytes, until Ctrl-C:
perf record -e block:block_rq_complete --filter 'nr_sector > 200'

# Trace all block completions, synchronous writes only, until Ctrl-C:
perf record -e block:block_rq_complete --filter 'rwbs == "WS"'

# Trace all block completions, all types of writes, until Ctrl-C:
perf record -e block:block_rq_complete --filter 'rwbs ~ "*W*"'

# Trace all minor faults (RSS growth) with stack traces, until Ctrl-C:
perf record -e minor-faults -ag

# Trace all page faults with stack traces, until Ctrl-C:
perf record -e page-faults -ag

# Trace all ext4 calls, and write to a non-ext4 location, until Ctrl-C:
perf record -e 'ext4:*' -o /tmp/perf.data -a

# Trace kswapd wakeup events, until Ctrl-C:
perf record -e vmscan:mm_vmscan_wakeup_kswapd -ag

Dynamic Tracing

# Add a tracepoint for the kernel tcp_sendmsg() function entry ("--add" is optional):
perf probe --add tcp_sendmsg

# Remove the tcp_sendmsg() tracepoint (or use "--del"):
perf probe -d tcp_sendmsg

# Add a tracepoint for the kernel tcp_sendmsg() function return:
perf probe 'tcp_sendmsg%return'

# Show available variables for the kernel tcp_sendmsg() function (needs debuginfo):
perf probe -V tcp_sendmsg

# Show available variables for the kernel tcp_sendmsg() function, plus external vars (needs debuginfo):
perf probe -V tcp_sendmsg --externs

# Show available line probes for tcp_sendmsg() (needs debuginfo):
perf probe -L tcp_sendmsg

# Show available variables for tcp_sendmsg() at line number 81 (needs debuginfo):
perf probe -V tcp_sendmsg:81

# Add a tracepoint for tcp_sendmsg(), with three entry argument registers (platform specific):
perf probe 'tcp_sendmsg %ax %dx %cx'

# Add a tracepoint for tcp_sendmsg(), with an alias ("bytes") for the %cx register (platform specific):
perf probe 'tcp_sendmsg bytes=%cx'

# Trace previously created probe when the bytes (alias) variable is greater than 100:
perf record -e probe:tcp_sendmsg --filter 'bytes > 100'

# Add a tracepoint for tcp_sendmsg() return, and capture the return value:
perf probe 'tcp_sendmsg%return $retval'

# Add a tracepoint for tcp_sendmsg(), and "size" entry argument (reliable, but needs debuginfo):
perf probe 'tcp_sendmsg size'

# Add a tracepoint for tcp_sendmsg(), with size and socket state (needs debuginfo):
perf probe 'tcp_sendmsg size sk->__sk_common.skc_state'

# Tell me how on Earth you would do this, but don't actually do it (needs debuginfo):
perf probe -nv 'tcp_sendmsg size sk->__sk_common.skc_state'

# Trace previous probe when size is non-zero, and state is not TCP_ESTABLISHED(1) (needs debuginfo):
perf record -e probe:tcp_sendmsg --filter 'size > 0 && skc_state != 1' -a

# Add a tracepoint for tcp_sendmsg() line 81 with local variable seglen (needs debuginfo):
perf probe 'tcp_sendmsg:81 seglen'

# Add a tracepoint for do_sys_open() with the filename as a string (needs debuginfo):
perf probe 'do_sys_open filename:string'

# Add a tracepoint for myfunc() return, and include the retval as a string:
perf probe 'myfunc%return +0($retval):string'

# Add a tracepoint for the user-level malloc() function from libc:
perf probe -x /lib64/libc.so.6 malloc

# List currently available dynamic probes:
perf probe -l

Reporting

# Show perf.data in an ncurses browser (TUI) if possible:
perf report

# Show perf.data with a column for sample count:
perf report -n

# Show perf.data as a text report, with data coalesced and percentages:
perf report --stdio

# List all raw events from perf.data:
perf script

# List all raw events from perf.data, with customized fields:
perf script -f time,event,trace

# Dump raw contents from perf.data as hex (for debugging):
perf script -D

# Disassemble and annotate instructions with percentages (needs some debuginfo):
perf annotate --stdio


SystemTap

Documentation: https://sourceware.org/systemtap/documentation.html

Eclipse plugin: http://www.eclipse.org/linuxtools/projectPages/systemtap
Plugin docs: http://help.eclipse.org -> SystemTap User Guide

To build:

CONFIG_DEBUG_INFO
CONFIG_KPROBES
CONFIG_RELAY
CONFIG_DEBUG_FS
CONFIG_MODULES
CONFIG_MODULE_UNLOAD
// CONFIG_UTRACE  (no longer needed in recent kernels)

kernel:  make headers_install

Ubuntu:    apt-get install systemtap systemtap-runtime
Fedora:    dnf install -y systemtap systemtap-runtime

For stock kernels (Fedora):
    dnf install -y kernel-debuginfo kernel-devel kernel-debuginfo-common-<arch>
    e.g.   kernel-debuginfo-2.6.32-53.el6.i686.rpm
           kernel-devel-2.6.32-53.el6.i686.rpm
           kernel-debuginfo-common-i686-2.6.32-53.el6.i686.rpm
// yum install kernel-devel yum-utils debuginfo-install kernel

Ubuntu:
apt-get install elfutils libelf1 libelf-dev libdw1 libdw-dev
apt-get install latex2html libnss3 libnss3-dev libnss3-tools

Fedora:
dnf install -y kernel-devel yum-utils kernel      
dnf install -y elfutils elfutils-libelf elfutils-devel
dnf install -y latex2html nss-tools nss-devel libvirt-devel

openSUSE:
zypper install -y systemtap systemtap-runtime
zypper install -y kernel-devel
zypper install -y elfutils libelf0 libelf1 libelf-devel
zypper install -y libebl1 libebl-devel libdw-devel
zypper install -y latex2html libvirt-devel
zypper install -y mozilla-nss mozilla-nss-tools mozilla-nss-devel

git clone git://sourceware.org/git/systemtap.git
https://sourceware.org/systemtap/ftp/releases

./configure --prefix=/usr
make [-j8] all
sudo make install

Ubuntu:
adduser sergey stapusr
adduser sergey stapsys
adduser sergey stapdev

Fedora:
usermod -a --groups stapusr sergey
usermod -a --groups stapsys sergey
usermod -a --groups stapdev sergey

openSUSE:
groupadd -r stapusr
groupadd -r stapsys
groupadd -r stapdev
usermod -a --groups stapusr sergey
usermod -a --groups stapsys sergey
usermod -a --groups stapdev sergey

re-login

Quick test:

stap -ve 'probe begin { log("hello world") exit () }'
stap -c df -e 'probe syscall.* { if (target()==pid()) log(name." ".argstr) }'

LTTng + TraceCompass

http://lttng.org
https://wiki.eclipse.org/Trace_Compass

OProfile

oprofile is a legacy tool

In Fedora 22 and recent openSUSE:

Convential oprofile (opcontrol) is no longer available.
Distributed version of oprofile (dnf install -y oprofile) is just a wrapper around perf and has ainterface different from conventional oprofile.
Refer to Fedora System Adminisration Guide.

In Ubuntu:

apt-get install oprofile

build kernel with

CONFIG_PROFILING=y
CONFIG_OPROFILE=y

   
boot with idle=poll

to disable cpu sleep, and to force correct counting of CPU_CLK_UNHALTED etc. cycles
othwerwise profile will not reflect the actual time spent halted
   
caution: CPU will run hot

Capture data:

opcontrol --reset
rm -r /var/lib/oprofile
   
opcontrol --setup --vmlinux=/foo/vmlinux --event=RETIRED_INSNS:100000   # profile based on retired instructions
opcontrol --setup --vmlinux=/foo/vmlinux --event=CPU_CYCLES:1500000     # profile based on CPU cycles
# can add --kernel-range=0xc00xxxxx,0xc0xxxxxx
# obtain addresses with
#     cat /proc/kallsyms | grep " _text"
#     cat /proc/kallsyms | grep " _etext"
opcontrol --start-daemon
opcontrol --reset

opcontrol --start
... do something ...
opcontrol --stop
[... repeat ...]

opcontrol --shutdown
   
... analyze ...
   
opcontrol --reset
rm -r /var/lib/oprofile

   
Analyze:

op_time -l
oprofpp -l -i /foo/vmlinux
op_to_source -a -i /foo/vmlinux

   
Eclipse plugin:  http://www.eclipse.org/linuxtools/projectPages/oprofile

IO metering

# perf list | grep sched:
 [...]
 sched:sched_stat_wait                      [Tracepoint event]
 sched:sched_stat_sleep                     [Tracepoint event]
 sched:sched_stat_iowait                    [Tracepoint event]
 
http://www.iometer.org
http://glandium.org/blog/?p=1476

blktrace:

blktrace -d /dev/sdc2 -o - | blkparse -i -

https://lwn.net/Articles/608497
http://www.brendangregg.com/blog/2014-07-16/iosnoop-for-linux.html
https://raw.githubusercontent.com/brendangregg/perf-tools/master/iosnoop
"Exploring the Linux Storage Path - Tracing block I/O kernel events"

LINUX_ROOT/include/trace/events/block.h