DevOps · #ebpf#observability#linux#networking

eBPF技术解析与可观测性应用

2024.08.21 9 min 3.6k
// 目录 · contents

前言

eBPF(extended Berkeley Packet Filter)是Linux内核中的革命性技术,它允许在内核空间安全地运行沙箱化程序,无需修改内核源码或加载内核模块。eBPF正在重塑可观测性、网络和安全领域。本文将深入解析eBPF的架构原理和实际应用。

eBPF架构

graph TB
    subgraph UserSpace["用户空间"]
        App["eBPF程序<br>(C/Go/Rust)"]
        Loader["加载器<br>(libbpf/cilium/ebpf)"]
        Reader["数据读取<br>(Map/Perf/Ring Buffer)"]
    end

    subgraph Kernel["内核空间"]
        Verifier["验证器<br>(Verifier)"]
        JIT["JIT编译器"]
        VM["eBPF虚拟机"]

        subgraph HookPoints["挂载点"]
            KP["kprobe/kretprobe<br>内核函数"]
            TP["tracepoint<br>内核跟踪点"]
            XDP["XDP<br>网络数据包"]
            TC["TC<br>流量控制"]
            Socket["Socket Filter"]
            LSM["LSM<br>安全模块"]
            Sched["Scheduler"]
        end

        Maps["eBPF Maps<br>(键值存储)"]
    end

    App --> |"编译"| Loader
    Loader --> |"加载"| Verifier
    Verifier --> |"验证通过"| JIT
    JIT --> VM
    VM --> HookPoints
    VM <--> Maps
    Maps <--> Reader

    style Verifier fill:#f44336,color:#fff
    style JIT fill:#4CAF50,color:#fff

工作流程

sequenceDiagram
    participant Dev as 开发者
    participant Clang as Clang/LLVM
    participant Loader as 加载器
    participant Verifier as 验证器
    participant JIT as JIT编译器
    participant Hook as 挂载点

    Dev->>Clang: 编写eBPF C程序
    Clang->>Clang: 编译为BPF字节码
    Clang-->>Loader: .o文件(ELF格式)
    Loader->>Verifier: bpf()系统调用加载
    Verifier->>Verifier: 安全性验证
    alt 验证失败
        Verifier-->>Loader: 拒绝加载 + 错误信息
    else 验证通过
        Verifier->>JIT: 传递字节码
        JIT->>JIT: 编译为本机指令
        JIT->>Hook: 挂载到内核钩子点
        Note over Hook: 每次事件触发时执行
    end

验证器(Verifier)

验证器是eBPF安全性的核心保障,确保程序不会崩溃内核:

graph TB
    subgraph Checks["验证器检查项"]
        DAG["DAG检查<br>无循环/无不可达代码"]
        Bounds["边界检查<br>内存访问合法性"]
        Stack["栈大小检查<br>最大512字节"]
        Insn["指令数检查<br>最大100万条"]
        Helper["辅助函数检查<br>只能调用允许的函数"]
        Type["类型检查<br>指针类型安全"]
    end

验证器确保的安全保证:

  1. 程序一定终止:不允许无界循环(有界循环在5.3+支持)
  2. 内存安全:所有内存访问经过边界检查
  3. 资源有限:栈最大512字节,指令数有上限
  4. 权限控制:不同程序类型有不同的辅助函数权限

eBPF Maps

Maps是eBPF程序与用户空间之间通信的核心数据结构:

graph LR
    subgraph MapTypes["常用Map类型"]
        Hash["BPF_MAP_TYPE_HASH<br>哈希表"]
        Array["BPF_MAP_TYPE_ARRAY<br>数组"]
        Perf["BPF_MAP_TYPE_PERF_EVENT_ARRAY<br>Perf事件"]
        Ring["BPF_MAP_TYPE_RINGBUF<br>环形缓冲区"]
        LRU["BPF_MAP_TYPE_LRU_HASH<br>LRU哈希表"]
        LPM["BPF_MAP_TYPE_LPM_TRIE<br>最长前缀匹配"]
        PerCPU["BPF_MAP_TYPE_PERCPU_HASH<br>每CPU哈希表"]
    end
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
// Example eBPF map definitions

// Hash map keyed by PID, holding a 64-bit counter per process.
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __type(key, u32);   // PID is the key
    __type(value, u64); // counter is the value
    __uint(max_entries, 10240);
} syscall_count SEC(".maps");

// Per-CPU hash map: each CPU has its own value slot for a key.
struct {
    __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
    __type(key, struct event_key);
    __type(value, u64);
    __uint(max_entries, 10240);
} event_counter SEC(".maps");

// Ring buffer used to stream events to user space.
struct {
    __uint(type, BPF_MAP_TYPE_RINGBUF);
    __uint(max_entries, 1 << 18); // 256 KiB
} events SEC(".maps");

eBPF程序类型

kprobe - 内核函数跟踪

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
// 跟踪进程创建
#include <vmlinux.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>

/*
 * Record written to the `events` ring buffer for each traced execve call.
 * The field order and sizes are the wire format read by user space, so they
 * must not be changed independently of the reader.
 */
struct event {
u32 pid; /* upper 32 bits of bpf_get_current_pid_tgid() */
u32 ppid; /* tgid of the task's real_parent */
u32 uid; /* lower 32 bits of bpf_get_current_uid_gid() */
char comm[16]; /* filled by bpf_get_current_comm() */
char filename[256]; /* first execve argument, copied as a NUL-terminated string */
};

/* Ring buffer map through which trace_execve submits `struct event` records. */
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 256 * 1024); /* buffer size in bytes (256 KiB) */
} events SEC(".maps");

/*
 * kprobe on the execve syscall entry: emits one `struct event` per call with
 * the caller's pid, parent pid, uid, command name and the requested filename.
 *
 * Always returns 0. If the ring buffer is full the event is dropped.
 */
SEC("kprobe/sys_execve")
int trace_execve(struct pt_regs *ctx)
{
    struct pt_regs *syscall_regs;
    struct task_struct *task;
    const char *filename;
    struct event *e;

    e = bpf_ringbuf_reserve(&events, sizeof(*e), 0);
    if (!e)
        return 0;

    task = (struct task_struct *)bpf_get_current_task();

    e->pid = bpf_get_current_pid_tgid() >> 32;
    e->ppid = BPF_CORE_READ(task, real_parent, tgid);
    e->uid = bpf_get_current_uid_gid() & 0xFFFFFFFF;
    bpf_get_current_comm(&e->comm, sizeof(e->comm));

    /*
     * On kernels built with syscall wrappers (e.g. __x64_sys_execve) the
     * kprobe's first argument is a pointer to the pt_regs that hold the real
     * syscall arguments, not the filename itself. Reading PT_REGS_PARM1(ctx)
     * directly would hand a kernel pointer to the user-string helper and the
     * filename would come back empty.
     * NOTE(review): assumes a syscall-wrapper kernel; on kernels without
     * wrappers the arguments live directly in ctx.
     */
    syscall_regs = PT_REGS_SYSCALL_REGS(ctx);
    filename = (const char *)PT_REGS_PARM1_CORE_SYSCALL(syscall_regs);
    bpf_probe_read_user_str(&e->filename, sizeof(e->filename), filename);

    bpf_ringbuf_submit(e, 0);
    return 0;
}

char LICENSE[] SEC("license") = "GPL";

XDP - 高速网络数据处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
// XDP程序:简单的DDoS防护
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/tcp.h>
#include <bpf/bpf_helpers.h>

// Packets seen from each source IP inside its current rate-limit window
struct {
    __uint(type, BPF_MAP_TYPE_LRU_HASH);
    __uint(max_entries, 100000);
    __type(key, __u32);   // IPv4 source address as read from the IP header
    __type(value, __u64); // packet count in the current window
} ip_counter SEC(".maps");

// Start time (bpf_ktime_get_ns() value) of each source IP's current window
struct {
    __uint(type, BPF_MAP_TYPE_LRU_HASH);
    __uint(max_entries, 100000);
    __type(key, __u32);
    __type(value, __u64);
} ip_window SEC(".maps");

// IP blacklist: any entry for a source address causes the packet to be dropped
struct {
    __uint(type, BPF_MAP_TYPE_HASH);
    __uint(max_entries, 10000);
    __type(key, __u32);
    __type(value, __u8);
} blacklist SEC(".maps");

#define RATE_LIMIT 1000              // max packets per source IP per window
#define RATE_WINDOW_NS 1000000000ULL // window length: one second

/*
 * XDP program: simple DDoS protection for IPv4 traffic.
 *
 * Non-IPv4 and truncated frames are passed untouched. Packets from
 * blacklisted sources are dropped. Every other source may send at most
 * RATE_LIMIT packets per RATE_WINDOW_NS; the excess is dropped until the
 * window expires.
 *
 * Returns XDP_PASS or XDP_DROP.
 */
SEC("xdp")
int xdp_firewall(struct xdp_md *ctx)
{
    void *data = (void *)(long)ctx->data;
    void *data_end = (void *)(long)ctx->data_end;

    // Parse the Ethernet header (bounds check is required by the verifier)
    struct ethhdr *eth = data;
    if ((void *)(eth + 1) > data_end)
        return XDP_PASS;

    if (eth->h_proto != __constant_htons(ETH_P_IP))
        return XDP_PASS;

    // Parse the IP header
    struct iphdr *ip = (void *)(eth + 1);
    if ((void *)(ip + 1) > data_end)
        return XDP_PASS;

    __u32 src_ip = ip->saddr;

    // Blacklist check
    if (bpf_map_lookup_elem(&blacklist, &src_ip))
        return XDP_DROP;

    // Rate limiting over a fixed one-second window
    __u64 now = bpf_ktime_get_ns();
    __u64 *window_start = bpf_map_lookup_elem(&ip_window, &src_ip);
    __u64 *count = bpf_map_lookup_elem(&ip_counter, &src_ip);

    if (!window_start || !count || now - *window_start >= RATE_WINDOW_NS) {
        /*
         * First packet from this source, one of its entries was evicted by
         * the LRU, or the previous window has expired: start a new window.
         * Without this reset the counter only ever grows and a source would
         * stay blocked forever once it had sent RATE_LIMIT packets in total.
         * The reset is not atomic across CPUs; the limit is approximate.
         */
        __u64 first = 1;

        bpf_map_update_elem(&ip_window, &src_ip, &now, BPF_ANY);
        bpf_map_update_elem(&ip_counter, &src_ip, &first, BPF_ANY);
        return XDP_PASS;
    }

    __sync_fetch_and_add(count, 1);
    if (*count > RATE_LIMIT)
        return XDP_DROP;

    return XDP_PASS;
}

char LICENSE[] SEC("license") = "GPL";
graph LR
    NIC["网卡"] --> XDP["XDP程序"]
    XDP --> |"XDP_PASS"| Stack["网络协议栈"]
    XDP --> |"XDP_DROP"| Drop["丢弃"]
    XDP --> |"XDP_TX"| TX["原网卡发回"]
    XDP --> |"XDP_REDIRECT"| Redirect["转发到其他网卡"]

    style XDP fill:#FF9800,color:#fff
    style Drop fill:#f44336,color:#fff

tracepoint - 内核跟踪点

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
 * Tracepoint handler for sock:inet_sock_set_state.
 *
 * Emits one event to the `events` ring buffer each time a TCP socket enters
 * TCP_ESTABLISHED; all other protocols and state changes are ignored.
 * Always returns 0. If the ring buffer is full the event is dropped.
 *
 * NOTE(review): the `struct event` used here (with saddr/daddr/sport/dport
 * fields) is not shown in this snippet; saddr and daddr are assumed to be at
 * least 4 bytes wide.
 */
SEC("tracepoint/sock/inet_sock_set_state")
int trace_tcp_state(struct trace_event_raw_inet_sock_set_state *ctx)
{
    struct event *e;

    if (ctx->protocol != IPPROTO_TCP)
        return 0;

    // Only connection establishment is of interest
    if (ctx->newstate != TCP_ESTABLISHED)
        return 0;

    e = bpf_ringbuf_reserve(&events, sizeof(*e), 0);
    if (!e)
        return 0;

    e->pid = bpf_get_current_pid_tgid() >> 32;
    /*
     * saddr/daddr are 4-byte arrays in the tracepoint record, so they must be
     * copied byte-wise; a plain assignment would store the array's address
     * (or fail to compile) instead of the IPv4 address.
     */
    __builtin_memcpy(&e->saddr, ctx->saddr, sizeof(ctx->saddr));
    __builtin_memcpy(&e->daddr, ctx->daddr, sizeof(ctx->daddr));
    e->sport = ctx->sport;
    e->dport = ctx->dport;
    bpf_get_current_comm(&e->comm, sizeof(e->comm));

    bpf_ringbuf_submit(e, 0);
    return 0;
}

BCC工具集

BCC(BPF Compiler Collection)提供了大量开箱即用的eBPF工具:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# 跟踪文件打开操作
opensnoop

# 跟踪磁盘I/O延迟
biolatency

# 跟踪TCP连接
tcpconnect
tcplife

# 跟踪DNS查询
gethostlatency

# 跟踪内存分配
memleak

# 跟踪文件系统延迟
ext4slower 1 # 显示超过1ms的ext4操作
xfsslower 1

# 函数延迟分析
funclatency 'vfs_read'

# CPU缓存命中分析
llcstat

# 跟踪TCP重传
tcpretrans
graph TB
    subgraph BCCTools["BCC工具分类"]
        subgraph CPU["CPU分析"]
            execsnoop["execsnoop<br>新进程跟踪"]
            runqlat["runqlat<br>调度延迟"]
            profile["profile<br>CPU采样"]
            cpudist["cpudist<br>CPU分布"]
        end

        subgraph IO["I/O分析"]
            biolatency["biolatency<br>块I/O延迟"]
            biotop["biotop<br>块I/O top"]
            filetop["filetop<br>文件I/O top"]
            cachestat["cachestat<br>缓存命中"]
        end

        subgraph Network["网络分析"]
            tcpconnect["tcpconnect<br>TCP连接"]
            tcplife["tcplife<br>TCP生命周期"]
            tcpretrans["tcpretrans<br>TCP重传"]
            tcpdrop["tcpdrop<br>TCP丢包"]
        end
    end

bpftrace

bpftrace是高级的eBPF跟踪语言,类似awk:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# 统计系统调用频率
bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }'

# 跟踪进程的read延迟分布(直方图)
bpftrace -e '
kprobe:vfs_read { @start[tid] = nsecs; }
kretprobe:vfs_read /@start[tid]/ {
@usecs = hist((nsecs - @start[tid]) / 1000);
delete(@start[tid]);
}
'

# 跟踪TCP连接建立
bpftrace -e '
kprobe:tcp_connect {
printf("%-6d %-16s TCP connect\n", pid, comm);
}
'

# 分析文件系统写延迟
bpftrace -e '
kprobe:vfs_write {
@start[tid] = nsecs;
}
kretprobe:vfs_write /@start[tid]/ {
$dur = (nsecs - @start[tid]) / 1000;
if ($dur > 1000) {
printf("%-6d %-16s slow write: %d us\n", pid, comm, $dur);
}
@write_latency_us = hist($dur);
delete(@start[tid]);
}
'

# 跟踪页错误
bpftrace -e '
software:page-faults:1 {
@[comm, kstack] = count();
}
'

# Count hardware interrupts per CPU and per IRQ handler name
# (hard IRQ entry is exposed as the irq:irq_handler_entry tracepoint)
bpftrace -e '
tracepoint:irq:irq_handler_entry {
    @[cpu, str(args->name)] = count();
}
'

网络可观测性

Cilium

Cilium是基于eBPF的容器网络和安全方案:

graph TB
    subgraph Cilium["Cilium架构"]
        Agent["Cilium Agent"]
        Operator["Cilium Operator"]
        Hubble["Hubble<br>网络可观测性"]

        subgraph DataPlane["eBPF数据平面"]
            XDP["XDP<br>L3/L4负载均衡"]
            TC["TC<br>Pod网络策略"]
            Socket["cgroup/socket<br>L7策略"]
            Connect["cgroup/connect<br>透明代理"]
        end
    end

    subgraph HubbleUI["Hubble可观测性"]
        FlowLog["网络流日志"]
        ServiceMap["服务拓扑图"]
        Metrics["Prometheus指标"]
    end

    Agent --> DataPlane
    Agent --> Hubble
    Hubble --> FlowLog
    Hubble --> ServiceMap
    Hubble --> Metrics
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# 查看Hubble网络流
hubble observe --namespace production

# 查看特定Pod的流量
hubble observe --pod production/api-server --protocol TCP

# Show HTTP flows whose response status is 500
hubble observe --http-status 500

# 查看DNS查询
hubble observe --protocol DNS

# 查看丢弃的流量
hubble observe --verdict DROPPED

# 查看服务拓扑
hubble observe --to-service production/api-server

Cilium L7可观测性

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
# CiliumNetworkPolicy for the api-server pods in the production namespace:
# egress to the database on 5432/TCP and ingress from the frontend on 8080/TCP,
# each with L7 rules attached to the port.
apiVersion: cilium.io/v2
kind: CiliumNetworkPolicy
metadata:
  name: l7-visibility
  namespace: production
spec:
  endpointSelector:
    matchLabels:
      app: api-server
  egress:
    - toEndpoints:
        - matchLabels:
            app: database
      toPorts:
        - ports:
            - port: "5432"
              protocol: TCP
          rules:
            l7proto: postgres
            l7:
              - action: "log"
  ingress:
    - fromEndpoints:
        - matchLabels:
            app: frontend
      toPorts:
        - ports:
            - port: "8080"
              protocol: TCP
          rules:
            http:
              - method: "GET"
              - method: "POST"

安全监控

Falco

Falco使用eBPF(或内核模块)实现运行时安全检测:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Falco rule examples
- rule: Terminal Shell in Container
  desc: 检测在容器内打开的终端shell
  condition: >
    spawned_process and container
    and shell_procs
    and proc.tty != 0
    and container_entrypoint
  output: >
    Shell spawned in container
    (user=%user.name container=%container.name
    shell=%proc.name pid=%proc.pid)
  priority: WARNING
  tags: [container, shell]

- rule: Suspicious Network Connection
  desc: 检测可疑的出站网络连接
  condition: >
    outbound and container
    and fd.l4proto=tcp
    and not allowed_outbound
  output: >
    Suspicious outbound connection
    (command=%proc.cmdline connection=%fd.name
    container=%container.name)
  priority: NOTICE
  tags: [network, container]

- rule: Read Sensitive File
  desc: 检测读取敏感文件
  condition: >
    open_read and container
    and sensitive_files
    and not proc.name in (allowed_sensitive_readers)
  output: >
    Sensitive file read in container
    (file=%fd.name command=%proc.cmdline
    container=%container.name)
  priority: WARNING
  tags: [filesystem, container]
graph TB
    subgraph Falco["Falco架构"]
        eBPF["eBPF探针<br>(内核事件采集)"]
        Engine["规则引擎<br>(条件匹配)"]
        Output["输出通道"]
    end

    Kernel["Linux内核<br>系统调用"] --> eBPF
    eBPF --> Engine
    Engine --> |"匹配"| Output

    Output --> Stdout["标准输出"]
    Output --> Syslog["Syslog"]
    Output --> Slack["Slack"]
    Output --> Falcosidekick["Falcosidekick"]

    Falcosidekick --> Loki["Loki"]
    Falcosidekick --> ES["Elasticsearch"]
    Falcosidekick --> Webhook["Webhook"]

Pixie

Pixie是基于eBPF的Kubernetes自动可观测性平台:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# 安装Pixie
px deploy

# 查看HTTP请求
px run px/http_data

# 查看服务拓扑
px run px/service_stats

# 查看DNS查询
px run px/dns_data

# 跟踪MySQL查询
px run px/mysql_data

# 自定义PxL脚本
px run -f my_script.pxl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
# PxL script example - HTTP latency distribution per path and status
import px

# Fetch the last five minutes of traced HTTP requests
df = px.DataFrame(table='http_events', start_time='-5m')

# Keep only traffic from the production namespace
df = df[df.ctx['namespace'] == 'production']

# http_events reports latency in nanoseconds; convert to milliseconds
df.latency_ms = df.latency / 1.0E6
# Boolean column so the error rate can be computed with px.mean
df.failure = df.resp_status >= 400

stats = df.groupby(['req_path', 'resp_status']).agg(
    count=('latency_ms', px.count),
    # px.quantiles takes no quantile argument; it returns all quantiles at once
    latency_quantiles=('latency_ms', px.quantiles),
    error_rate=('failure', px.mean),
)

# Pull the individual percentiles out of the quantiles result
stats.p50 = px.pluck_float64(stats.latency_quantiles, 'p50')
stats.p99 = px.pluck_float64(stats.latency_quantiles, 'p99')
stats = stats.drop(['latency_quantiles'])

px.display(stats, 'HTTP Stats')

使用Go开发eBPF程序

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
// 使用cilium/ebpf库
package main

import (
"bytes"
"encoding/binary"
"fmt"
"log"
"os"
"os/signal"

"github.com/cilium/ebpf"
"github.com/cilium/ebpf/link"
"github.com/cilium/ebpf/ringbuf"
)

//go:generate go run github.com/cilium/ebpf/cmd/bpf2go -cc clang bpf trace.bpf.c

// Event is the user-space view of one ring buffer record. It is decoded with
// binary.Read, so field order and sizes define the expected byte layout of the
// sample and must match what the eBPF program writes.
type Event struct {
PID uint32
PPID uint32
UID uint32
Comm [16]byte // NUL-terminated command name
Filename [256]byte // NUL-terminated path
}

// main loads the eBPF objects, attaches the execve kprobe and prints one line
// per event read from the ring buffer until the process is interrupted.
func main() {
	// Load the eBPF programs and maps generated by bpf2go
	objs := bpfObjects{}
	if err := loadBpfObjects(&objs, nil); err != nil {
		log.Fatalf("loading objects: %v", err)
	}
	defer objs.Close()

	// Attach the program to the kprobe
	kp, err := link.Kprobe("sys_execve", objs.TraceExecve, nil)
	if err != nil {
		log.Fatalf("opening kprobe: %v", err)
	}
	defer kp.Close()

	// Open a reader on the ring buffer map
	rd, err := ringbuf.NewReader(objs.Events)
	if err != nil {
		log.Fatalf("opening ringbuf reader: %v", err)
	}
	defer rd.Close()

	// Closing the reader on Ctrl-C makes the blocking Read below return
	sig := make(chan os.Signal, 1)
	signal.Notify(sig, os.Interrupt)
	go func() {
		<-sig
		rd.Close()
	}()

	fmt.Printf("%-8s %-8s %-16s %s\n", "PID", "PPID", "COMM", "FILENAME")
	for {
		record, err := rd.Read()
		if err != nil {
			// Any read error (including the reader being closed) ends the loop
			return
		}

		var event Event
		if err := binary.Read(bytes.NewReader(record.RawSample), binary.LittleEndian, &event); err != nil {
			// Report malformed samples instead of dropping them silently
			log.Printf("decoding event: %v", err)
			continue
		}

		comm := cString(event.Comm[:])
		filename := cString(event.Filename[:])
		fmt.Printf("%-8d %-8d %-16s %s\n", event.PID, event.PPID, comm, filename)
	}
}

// cString returns the contents of a fixed-size C string buffer up to, but not
// including, the first NUL byte. Ring buffer memory is reused without being
// cleared, so bytes after the terminator may be leftovers from an earlier
// record; trimming only trailing NULs would let them leak into the output.
func cString(buf []byte) string {
	if end := bytes.IndexByte(buf, 0); end >= 0 {
		buf = buf[:end]
	}
	return string(buf)
}

eBPF生态全景

graph TB
    subgraph Ecosystem["eBPF生态"]
        subgraph Networking["网络"]
            Cilium2["Cilium<br>CNI + Service Mesh"]
            Katran["Katran<br>L4负载均衡"]
            Cloudflare["Cloudflare<br>DDoS防护"]
        end

        subgraph Observability["可观测性"]
            Pixie2["Pixie<br>自动可观测性"]
            Hubble2["Hubble<br>网络可观测性"]
            Parca["Parca<br>持续剖析"]
            Pyroscope["Pyroscope<br>持续剖析"]
        end

        subgraph Security["安全"]
            Falco2["Falco<br>运行时安全"]
            Tetragon["Tetragon<br>安全可观测性"]
            Tracee["Tracee<br>安全跟踪"]
        end

        subgraph Tools["开发工具"]
            BCC2["BCC<br>工具集"]
            bpftrace2["bpftrace<br>跟踪语言"]
            libbpf2["libbpf<br>C库"]
            ciliumebpf["cilium/ebpf<br>Go库"]
            aya["Aya<br>Rust库"]
        end
    end

总结

eBPF技术的核心价值:

  1. 安全性:验证器确保程序不会崩溃内核,可在生产环境安全使用
  2. 高性能:在内核空间运行,JIT编译为原生指令,避免内核-用户空间切换
  3. 可编程:无需修改内核或加载模块,动态注入观测和控制逻辑
  4. 广泛应用:网络(Cilium)、安全(Falco/Tetragon)、可观测性(Pixie/Hubble)

eBPF正在成为Linux内核的“JavaScript”——一个可编程的扩展平台。随着工具生态的成熟,eBPF将成为每个DevOps工程师必备的技能。从BCC工具和bpftrace入门,逐步深入到使用Go/Rust开发自定义eBPF程序,是推荐的学习路径。

作者 · authorzt
发布 · date2024-08-21
篇幅 · length3.6k 字 · 9 min
许可 · licenseCC BY-SA 4.0
$ echo "comments" · 评论