根据以下两个 commit,可以看到,自 5.1 内核起,bpf prog 可以统计自己的运行情况了。
bpf: enable program stats[1] 5.1 kernel. bpf: Sharing bpf runtime stats with BPF_ENABLE_STATS[2] 5.8 kernel.
stats 例子
以 bpf2bpf demo[3] 为例,
# sysctl -w kernel.bpf_stats_enabled=1
kernel.bpf_stats_enabled = 1
# ls -l /proc/$(pidof bpf2bpf)/fd
total 0
...
lrwx------ 1 root root 64 May 13 04:41 8 -> anon_inode:bpf-prog
lrwx------ 1 root root 64 May 13 04:41 9 -> anon_inode:bpf-prog
# cat /proc/140775/fdinfo/8
pos: 0
flags: 02000002
mnt_id: 15
ino: 4112
prog_type: 2
prog_jited: 1
prog_tag: 26a3bc86e887bd13
memlock: 4096
prog_id: 1619
run_time_ns: 115664
run_cnt: 2
recursion_misses: 0
verified_insns: 52
# cat /proc/140775/fdinfo/9
pos: 0
flags: 02000002
mnt_id: 15
ino: 4112
prog_type: 2
prog_jited: 1
prog_tag: d45d3e8749eeb769
memlock: 4096
prog_id: 1620
run_time_ns: 1009673
run_cnt: 4
recursion_misses: 0
verified_insns: 52
其中,run_time_ns 表示 bpf prog 累计运行时间(单位:纳秒),run_cnt 表示 bpf prog 运行的次数;用 run_time_ns/run_cnt 即可算出平均每次运行的耗时。例如上面 fd 8 对应的 prog,平均耗时为 115664/2 = 57832 纳秒。
fdinfo
/proc/[pid]/fdinfo/[fd]
里的内容,是由 bpf_prog_show_fdinfo()
函数填充的,该函数定义在 kernel/bpf/syscall.c
文件中。
// ${KERNEL}/kernel/bpf/syscall.c
/*
 * Print the fdinfo fields of a bpf-prog file into the seq_file @m.
 *
 * @m:    seq_file that receives the formatted text.
 * @filp: file whose private_data points to the struct bpf_prog to describe.
 *
 * Emits prog_type, prog_jited, prog_tag, memlock and prog_id, followed by
 * the runtime statistics (run_time_ns, run_cnt, recursion_misses) obtained
 * from bpf_prog_get_stats(), and finally verified_insns.
 */
static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
{
const struct bpf_prog *prog = filp->private_data;
/* Two hex characters per tag byte plus a terminating NUL; zero-initialised. */
char prog_tag[sizeof(prog->tag) * 2 + 1] = { };
struct bpf_prog_kstats stats;
/* Collect the nsecs/cnt/misses totals that are printed below. */
bpf_prog_get_stats(prog, &stats);
/* Render the binary tag as a hex string for the "prog_tag" field. */
bin2hex(prog_tag, prog->tag, sizeof(prog->tag));
seq_printf(m,
"prog_type:\t%u\n"
"prog_jited:\t%u\n"
"prog_tag:\t%s\n"
"memlock:\t%llu\n"
"prog_id:\t%u\n"
"run_time_ns:\t%llu\n"
"run_cnt:\t%llu\n"
"recursion_misses:\t%llu\n"
"verified_insns:\t%u\n",
prog->type,
prog->jited,
prog_tag,
/* memlock: pages scaled by PAGE_SHIFT; 1ULL widens the value for %llu. */
prog->pages * 1ULL << PAGE_SHIFT,
prog->aux->id,
stats.nsecs,
stats.cnt,
stats.misses,
prog->aux->verified_insns);
}
/*
 * Sum the per-CPU statistics of @prog into @stats.
 *
 * @prog:  program whose per-CPU stats (prog->stats) are read.
 * @stats: output; receives the totals of nsecs, cnt and misses taken over
 *         every possible CPU.
 */
static void bpf_prog_get_stats(const struct bpf_prog *prog,
struct bpf_prog_kstats *stats)
{
u64 nsecs = 0, cnt = 0, misses = 0;
int cpu;
for_each_possible_cpu(cpu) {
const struct bpf_prog_stats *st;
unsigned int start;
u64 tnsecs, tcnt, tmisses;
st = per_cpu_ptr(prog->stats, cpu);
/*
 * Read the three counters of this CPU as one snapshot: repeat the
 * reads for as long as u64_stats_fetch_retry_irq() asks for a retry
 * (presumably because a writer updated them in between -- see the
 * u64_stats_sync API).
 */
do {
start = u64_stats_fetch_begin_irq(&st->syncp);
tnsecs = u64_stats_read(&st->nsecs);
tcnt = u64_stats_read(&st->cnt);
tmisses = u64_stats_read(&st->misses);
} while (u64_stats_fetch_retry_irq(&st->syncp, start));
/* Accumulate this CPU's snapshot into the running totals. */
nsecs += tnsecs;
cnt += tcnt;
misses += tmisses;
}
stats->nsecs = nsecs;
stats->cnt = cnt;
stats->misses = misses;
}
BPF_PROG_RUN
with stats
如果 sysctl -w kernel.bpf_stats_enabled=1
启用了 stats,kernel 在运行 bpf prog 时就会统计运行时间和运行次数。
// ${KERNEL}/include/linux/filter.h
/*
 * Run @prog on @ctx through the dispatcher @dfunc and return its result.
 *
 * @prog:  program to run; its insnsi and bpf_func are handed to @dfunc.
 * @ctx:   context pointer passed through to the program.
 * @dfunc: dispatcher function that actually invokes the program.
 *
 * When the bpf_stats_enabled_key static branch is enabled, the run is
 * additionally timed with sched_clock() and the current CPU's stats are
 * updated: cnt is incremented and the elapsed time is added to nsecs.
 * The misses counter is not touched in this function.
 */
static __always_inline u32 __bpf_prog_run(const struct bpf_prog *prog,
const void *ctx,
bpf_dispatcher_fn dfunc)
{
u32 ret;
cant_migrate();
if (static_branch_unlikely(&bpf_stats_enabled_key)) {
struct bpf_prog_stats *stats;
/* Timestamp taken before the program runs; the delta is computed after. */
u64 start = sched_clock();
unsigned long flags;
ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
stats = this_cpu_ptr(prog->stats);
/* Both counters are updated inside one update_begin/update_end section. */
flags = u64_stats_update_begin_irqsave(&stats->syncp);
u64_stats_inc(&stats->cnt);
u64_stats_add(&stats->nsecs, sched_clock() - start);
u64_stats_update_end_irqrestore(&stats->syncp, flags);
} else {
/* Stats disabled: run the program without any timing overhead. */
ret = dfunc(ctx, prog->insnsi, prog->bpf_func);
}
return ret;
}
在运行 bpf prog 时进行统计,在读取 fdinfo 时获取统计数据。
小结
自 5.1 内核起,bpf prog 可以统计自己的运行情况了。 自 5.8 内核起,可以通过 BPF_ENABLE_STATS
BPF 系统调用子命令来开启 stats。推荐在进行 benchmark/stress-test 时,开启 stats,以便获取更多的信息。
参考资料
[1]bpf: enable program stats: https://github.com/torvalds/linux/commit/492ecee892c2a4ba6a14903d5d586ff750b7e805
[2]bpf: Sharing bpf runtime stats with BPF_ENABLE_STATS: https://github.com/torvalds/linux/commit/d46edd671a147032e22cfeb271a5734703093649
[3]bpf2bpf demo: https://github.com/Asphaltt/learn-by-example/tree/main/ebpf-bpf2bpf