cilium/ebpf perf map NPE bug 排查

文摘 2024-07-09 18:31 湖北

BPF_MAP_TYPE_PERF_EVENT_ARRAY NPE

源码

eBPF code

//go:build ignore

#include <common.h>
#include <vmlinux.h>
#include <bpf_helpers.h>
#include <bpf_core_read.h>
#include <bpf_endian.h>

// License string placed in the ELF "license" section.
char __license[] SEC("license") = "Dual MIT/GPL";
// struct event is the record emitted to user space for every execve call.
// The Go loader decodes the raw sample into bpfEvent with encoding/binary,
// so the field order and sizes here must stay in sync with that type.
struct event {
    u32 pid;        // upper 32 bits of bpf_get_current_pid_tgid()
    u32 ppid;       // tgid of the current task's real_parent
    u32 uid;        // lower 32 bits of bpf_get_current_uid_gid()
    char comm[100]; // filled by bpf_get_current_comm()
};

// events is the perf event array that kprobe_execve writes its samples to.
// Only the map type is declared; max_entries is left unset here and is
// filled in by the cilium/ebpf loader (see the fixupMagicFields excerpt
// quoted later in this write-up).
struct {
    __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
} events SEC(".maps");

// Force emitting struct event into the ELF.
// Nothing else in this file references the type by pointer.
const struct event *unused __attribute__((unused));

// kprobe_execve is attached as a kprobe on sys_execve. For each hit it
// gathers the caller's PID, parent PID, UID and command name into a
// struct event and pushes it to the events perf event array on the
// current CPU. It always returns 0.
SEC("kprobe/sys_execve")
int kprobe_execve(struct pt_regs *ctx) {
    struct event evt;
    struct task_struct *cur;

    // The tgid lives in the upper half of the combined pid_tgid value.
    evt.pid = bpf_get_current_pid_tgid() >> 32;
    // The uid lives in the lower half of the combined uid_gid value.
    evt.uid = bpf_get_current_uid_gid() & 0xFFFFFFFF;

    // Follow current->real_parent->tgid with a CO-RE relocatable read.
    cur = (struct task_struct *)bpf_get_current_task();
    evt.ppid = BPF_CORE_READ(cur, real_parent, tgid);

    bpf_get_current_comm(&evt.comm, sizeof(evt.comm));

    bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &evt, sizeof(evt));
    return 0;
}

GO code

package main

import (
    "bytes"
    "encoding/binary"
    "errors"
    "fmt"
    "log"
    "os"
    "os/signal"
    "syscall"

    "github.com/cilium/ebpf"
    "github.com/cilium/ebpf/btf"
    "github.com/cilium/ebpf/link"
    "github.com/cilium/ebpf/perf"
    "github.com/cilium/ebpf/rlimit"
)

// bpfEvent is the user-space counterpart of struct event in the eBPF C
// code. Its fields are declared in the same order and with the same sizes
// so that a raw perf sample can be decoded into it with encoding/binary.
type bpfEvent struct {
    // Process ID, parent process ID and user ID, in wire order.
    Pid, Ppid, Uid uint32
    // Command name as written by the eBPF program (fixed 100-byte buffer).
    Comm [100]byte
}

//go:generate go run github.com/cilium/ebpf/cmd/bpf2go bpf exec.c -- -I../headers_fake
func main() {
    // Remove memory limit for eBPF programs
    if err := rlimit.RemoveMemlock(); err != nil {
        fmt.Fprintf(os.Stderr, "Failed to remove memlock limit: %v\n", err)
        os.Exit(1)
    }

    spec, err := btf.LoadSpec("/data/gb/external.btf")
    if err != nil {
        log.Fatalf("loading BTF: %v", err)
    }
    opts := &ebpf.CollectionOptions{
        Programs: ebpf.ProgramOptions{
            KernelTypes: spec,
        },
    }

    objs := bpfObjects{}
    if err := loadBpfObjects(&objs, opts); err != nil {
        fmt.Fprintf(os.Stderr, "Loading objects failed: %v\n", err)
        os.Exit(1)
    }
    defer objs.KprobeExecve.Close()
    defer objs.Events.Close()

    kp, err := link.Kprobe("sys_execve", objs.KprobeExecve, nil)
    if err != nil {
        log.Fatalf("opening kprobe: %s", err)
    }
    defer kp.Close()

    // Set up a perf reader to read events from the eBPF program
    rd, err := perf.NewReader(objs.Events, os.Getpagesize())
    if err != nil {
        fmt.Fprintf(os.Stderr, "Creating perf reader failed: %v\n", err)
        os.Exit(1)
    }
    defer rd.Close()

    // Set up a channel to receive signals
    sig := make(chan os.Signal, 1)
    signal.Notify(sig, os.Interrupt, syscall.SIGTERM)

    fmt.Println("Listening for events..")

    // Loop to read events
    go func() {
        for {
            record, err := rd.Read()
            if err != nil {
                fmt.Fprintf(os.Stderr, "Reading from perf reader failed: %v\n", err)
                os.Exit(1)
            }

            // Parse event data
            var e bpfEvent
            if err := binary.Read(bytes.NewBuffer(record.RawSample), binary.LittleEndian, &e); err != nil {
                fmt.Fprintf(os.Stderr, "Parsing event data failed: %v\n", err)
                os.Exit(1)
            }

            fmt.Printf("PID: %d, PPID: %d, UID: %d, Comm: %s\n", e.Pid, e.Ppid, e.Uid, string(e.Comm[:]))
        }
    }()

    // Wait for a signal to exit
    <-sig
    fmt.Println("Exiting..")
}

header pkg: headers_fake.tar.gz

BUG 复现

构建

1. 将 eBPF 和 Go 代码添加到目录 cilium/ebpf/examples/kprobe-exec
2. 将 C 代码头文件添加到目录 cilium/ebpf/examples/headers_fake
3. 使用 cd ./cilium/ebpf/examples; make -C .. 进行 C 编译以及 Go 代码生成
4. 使用 cd cilium/ebpf/examples/kprobe-exec; CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -gcflags "all=-N -l" 进行 GO 代码构建
5. 更新编译后 kprobe-exec 二进制文件到 arm 环境 (os: KY10 SP1.1 aarch64,内核: 4.19.90-23.15.v2101)

tips: The Go code loads an external BTF file, so your test VM must either have kernel BTF enabled or use an external BTF file generated with pahole. If your VM already has BTF enabled, remove the following Go code:

    spec, err := btf.LoadSpec("/data/gb/external.btf")
    if err != nil {
        log.Fatalf("loading BTF: %v", err)
    }
    opts := &ebpf.CollectionOptions{
        Programs: ebpf.ProgramOptions{
            KernelTypes: spec,
        },
    }

错误信息

1. NPE error

# ./kprobe-exec 
Listening for events..
panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x0 pc=0x1bd1f8]

goroutine 25 [running]:
github.com/cilium/ebpf/perf.(*Reader).ReadInto(0x40012b95e0, 0x4002358c98)
        /Users/zhuhuijun/go/src/github.com/cilium/ebpf/perf/reader.go:389 +0x6c8
github.com/cilium/ebpf/perf.(*Reader).Read(0x40012b95e0)
        /Users/zhuhuijun/go/src/github.com/cilium/ebpf/perf/reader.go:337 +0x48
main.main.func1()
        /Users/zhuhuijun/go/src/github.com/cilium/ebpf/examples/kprobe-exec/main.go:75 +0x54
created by main.main in goroutine 1
        /Users/zhuhuijun/go/src/github.com/cilium/ebpf/examples/kprobe-exec/main.go:73 +0xa04

2. Code location that triggers the error: https://github.com/cilium/ebpf/blob/f2b2f6d61aaf1fa8c0eb14c2a58f7e199fbd6086/perf/reader.go#L389

问题排查

从错误信息来看，eBPF MAP 和代码应该已经挂载（使用 bpftool prog list 和 bpftool map list 确认），从 perf 中读取数据的时候 ring 有 nil 值。

ring.loadHead() 被调用来加载性能事件环（perf event ring）的头部指针，目前从 ring 为 nil 的情况来看，这可能是由于几个原因导致的：

• CPU 离线：在创建性能事件环时，如果某个 CPU 当时是离线状态，那么为该 CPU 创建的 ring 可能会被设置为 nil。这是因为代码中有检查，如果在尝试为 CPU 创建性能事件环时遇到 unix.ENODEV 错误（表示设备不存在），则会跳过该 CPU 并将对应的 ring 设置为 nil。
• 创建性能事件环失败：在尝试为每个 CPU 创建性能事件环时，如果遇到除 unix.ENODEV 外的任何错误，整个 NewReaderWithOptions 函数会提前返回错误，不会为任何 CPU 创建 ring。但如果错误发生在为部分 CPU 创建 ring 之后，那么已经创建的 ring 可能不会被清理，导致部分 ring 为 nil。
• 资源释放：如果 Reader 对象的 Close 方法被调用，所有的 ring 资源都会被释放，并且 rings 切片中的所有元素都会被设置为 nil。如果在调用 Close 方法之后尝试访问 ring，就会遇到 nil。

基于上面的分析，那目前最直接的解决方法，就是增加 ring 的判断，如果为 nil 跳过处理即可。调整代码之后顺便提交了 PR（PR: https://github.com/cilium/ebpf/pull/1503），但是这样问题就彻底解决了吗？随着 PR review 的推进，发现问题不仅仅是 ring 的校验问题。

错误的 nCPU

随着 @florianl 的留言注意到 rings 的初始化过程，发现 rings 的大小跟 nCPU 参数的大小一致，以下为 nCPU 初始化的代码：

var (
        nCPU     = int(array.MaxEntries())
        rings    = make([]*perfEventRing, 0, nCPU)
        eventFds = make([]*sys.FD, 0, nCPU)
    )

此处 nCPU 大小跟 PerfEventArray 类型 MAP 的 MaxEntries 一样，在 eBPF 中，PerfEventArray 类型的 ebpf.Map 通常用于跟踪每个 CPU 的性能事件。每个 CPU 都有自己的条目，因此 MaxEntries() 方法返回的最大条目数通常等于系统上的 CPU 核心数。基于上面的认知 nCPU 应该是等于当前操作系统的 CPU 核数，最终导致 ring 出现 nil 的代码如下：

    for i := 0; i < nCPU; i++ {
        event, ring, err := newPerfEventRing(i, perCPUBuffer, opts)
        if errors.Is(err, unix.ENODEV) {
            fmt.Printf("cpu: %d,err: %v\n", i, err)
            // The requested CPU is currently offline, skip it.
            rings = append(rings, nil)
            eventFds = append(eventFds, nil)
            continue
        }
    ...
  }

至此找到 ring 为 nil 的直接原因，上面的代码表示当前操作系统的 CPU 有出现不在线的情况？通过检查 /sys/devices/system/cpu/online配置发现所有 CPU 均在线，难道是 nCPU 取错了？

# cat /sys/devices/system/cpu/online
0-15

那就通过 gdb 调试下可执行文件看下，执行过程中 nCPU 大小是多少？

# gdb -tui kprobe-exec
(gdb) break github.com/cilium/ebpf/perf.NewReaderWithOptions
(gdb) run 
(gdb) layout asm
(gdb) n
(gdb) info locals
eventFds = {array = 0x3225f0 <github.com/cilium/ebpf/rlimit.rlimitMu>, len = 274878742608, cap = 274878742608}
rings = {array = 0x0, len = 262144, cap = 3286512}
&err = 0x400009e9c0
closeOnError = {void (io.Closer)} 0x4000187338
poller = 0x180c70 <github.com/cilium/ebpf.newMapWithOptions.func1>
bufferSize = 274878837448
nCPU = 64
(gdb)

结果很明显，nCPU 获取错了，但是这个 64 是从哪里来的？而且 64 这个值很眼熟，对，当前虚拟机的物理宿主机总物理 CPU 为 64核。

源码排查

导致问题代码：https://github.com/cilium/ebpf/blob/main/map.go#L114C1-L153C2

// fixupMagicFields adjusts MapSpec fields whose value depends on the map
// type and returns the resulting spec. This listing is an excerpt: the
// parts marked "..." are elided.
//
// For PerfEventArray it asks PossibleCPU for n and sets MaxEntries to n when
// MaxEntries is zero or larger than n; a smaller non-zero MaxEntries is kept
// as is. An error from PossibleCPU is returned wrapped, with a nil spec.
func (spec *MapSpec) fixupMagicFields() (*MapSpec, error) {
    switch spec.Type {
    case ArrayOfMaps, HashOfMaps:
        ...
    case PerfEventArray:
        ...

        n, err := PossibleCPU()
        if err != nil {
            return nil, fmt.Errorf("fixup perf event array: %w", err)
        }

        if n := uint32(n); spec.MaxEntries == 0 || spec.MaxEntries > n {
            // MaxEntries should be zero most of the time, but there is code
            // out there which hardcodes large constants. Clamp the number
            // of entries to the number of CPUs at most. Allow creating maps with
            // less than n items since some kernel selftests relied on this
            // behaviour in the past.
            spec.MaxEntries = n
        }
    }

    return spec, nil
}

cilium/ebpf 在初始化 map 时会根据类型调整 MapSpec 某些字段，当类型为 BPF_MAP_TYPE_PERF_EVENT_ARRAY 时，会从 /sys/devices/system/cpu/possible 获取当前服务器可能存在的 CPU 数量，PossibleCPU 方法的具体实现代码(https://github.com/cilium/ebpf/blob/main/cpu.go#L10)如下：

// possibleCPU passes /sys/devices/system/cpu/possible to parseCPUsFromFile
// and returns its (int, error) result. It is wrapped in sync.OnceValues, so
// the file is parsed at most once and every later call returns the same pair.
// NOTE(review): "possible" can list CPUs that are not online; on the machine
// in this write-up it reads 0-63 while only 0-15 are online.
var possibleCPU = sync.OnceValues(func() (int, error) {
    return parseCPUsFromFile("/sys/devices/system/cpu/possible")
})

那到服务器上查询下具体值：

# cat /sys/devices/system/cpu/possible
0-63

至此，已经找到 nCPU 为 64 的根本原因，但是为什么 /sys/devices/system/cpu/possible 和 /sys/devices/system/cpu/online 会出现不一致的情况呢？

CPU 信息

当前环境完整 CPU 信息如下：

# grep . /sys/devices/system/cpu/{online,offline,possible,present}
/sys/devices/system/cpu/online:0-15
/sys/devices/system/cpu/offline:16-63
/sys/devices/system/cpu/possible:0-63
/sys/devices/system/cpu/present:0-15

possible: 表示系统中所有可能存在的 CPU。这包括系统支持的最大 CPU 数量，无论这些 CPU 当前是否已启用或实际存在。

present: 表示当前存在于系统中的 CPU。这些 CPU 是物理上存在并可能被启用的 CPU，但不一定都在使用。

online: 表示当前已经启用并正在使用的 CPU。这些 CPU 是操作系统实际分配和调度任务的 CPU。

offline: 表示当前处于关闭状态的 CPU。这些 CPU 可能存在于系统中，但当前未被启用。

在ARM64架构中，cpu_possible_bits 包含系统初始化时确定的所有可能的CPU核。在系统启动时，内核会从DTS（设备树源）文件中读取CPU节点信息。所有格式正确的CPU核心节点将被认为是可能的核心，即它们属于 cpu_possible_mask。

设备树（DTS）

1. DTS文件定义：

• DTS文件中定义了系统中存在的CPU核心信息。
• 每个CPU节点包含其属性，如CPU ID、兼容性等。

2. 系统初始化：

• 系统启动时，内核解析DTS文件。
• 内核将所有格式正确的CPU核心节点标记为可能的核心。

3. **cpu_possible_mask**：

• cpu_possible_mask 包含所有可能存在的CPU核心，包括当前在线和离线的核心。

假设DTS文件中定义了8个CPU核心，系统初始化时，内核将这些核心全部标记为可能的核心，即使其中一些核心当前未启用。

在 KY10 SP1.1 系统中 DTB 文件存在于目录 /boot/dtb-$(uname -r)，具体目录如下：

# ls /boot/dtb-$(uname -r)
amd  apm  arm  cavium  hisilicon  phytium  qcom

这些子目录通常对应不同厂商和架构的设备树二进制文件（DTB）。每个子目录中包含了特定厂商或架构的设备树文件，用于支持相应的硬件平台。

• amd/: Advanced Micro Devices (AMD)，以 x86 和 x86_64 架构的 CPU 和 APU 为主；该目录下存放的是其 ARM64 SoC（如 Opteron A1100 “Seattle”）的设备树文件。
• apm/: Applied Micro Circuits Corporation (APM)，主要提供ARM架构的处理器。
• arm/: ARM Holdings，ARM架构的处理器设计。
• cavium/: Cavium Networks（现为Marvell Technology Group的一部分），提供ARM架构的多核处理器。
• hisilicon/: HiSilicon Technologies，华为的半导体子公司，主要提供ARM架构的SoC。
• phytium/: Phytium Technology，主要提供ARM架构的CPU。
• qcom/: Qualcomm，主要提供ARM架构的处理器和通信芯片。

可以通过 lscpu 命令查看当前服务器的 CPU 硬件厂商为 Phytium ：

# lscpu
Architecture:                    aarch64
CPU op-mode(s):                  64-bit
Byte Order:                      Little Endian
CPU(s):                          16
On-line CPU(s) list:             0-15
Thread(s) per core:              1
Core(s) per socket:              4
Socket(s):                       4
NUMA node(s):                    1
Vendor ID:                       Phytium
Model:                           2
Model name:                      FT-2000+/64
Stepping:                        0x1
BogoMIPS:                        100.00
NUMA node0 CPU(s):               0-15
Vulnerability Itlb multihit:     Not affected
Vulnerability L1tf:              Not affected
Vulnerability Mds:               Not affected
Vulnerability Meltdown:          Mitigation; PTI
Vulnerability Spec store bypass: Vulnerable
Vulnerability Spectre v1:        Mitigation; __user pointer sanitization
Vulnerability Spectre v2:        Not affected
Vulnerability Srbds:             Not affected
Vulnerability Tsx async abort:   Not affected
Flags:                           fp asimd evtstrm crc32 cpuid

通过以下命令找到当前操作系统内核启动时加载的是哪个 DTB 文件：

# grep dtb /etc/grub.d/*
/etc/grub.d/10_linux:       x0x660) GRUB_DEFAULT_DTB="u-boot-general.dtb" ;;
/etc/grub.d/10_linux:       x0x662)  GRUB_DEFAULT_DTB="ft2000plus.dtb" ;;
/etc/grub.d/10_linux:  for i in "dtb-${version}" "dtb-${alt_version}" "dtb-${version}/phytium" "dtb-${alt_version}/phytium"; do

那看下 Phytium 厂家提供 DTB 文件，我们通过以下命令将 DTB 文件转换为更加阅读友好的 DTS 格式：

dtc -I dtb -O dts -o output.dts /boot/dtb-4.19.90-23.15.v2101.ky10.aarch64/phytium/ft2000plus.dtb

检查下 output.dts 中可用 CPU 具体配置信息：

# cat output.dts | grep -A 10 -i 'cpu@'
                cpu@0 {
                        device_type = "cpu";
                        compatible = "arm,armv8";
                        reg = < 0x00 0x00 >;
                        enable-method = "psci";
                        numa-node-id = < 0x00 >;
                        linux,phandle = < 0x03 >;
                        phandle = < 0x03 >;
                };

                ...
        };

统计下 CPU 总数正好是 64 ，那就解释了为什么 /sys/devices/system/cpu/possible 为 64 核。

# cat output.dts | grep -i 'cpu@'|wc -l
64

朱慧君

大龄yaml工程师逼逼叨