前言
eBPF(extended Berkeley Packet Filter)是Linux内核中的革命性技术,它允许在内核空间安全地运行沙箱化程序,无需修改内核源码或加载内核模块。eBPF正在重塑可观测性、网络和安全领域。本文将深入解析eBPF的架构原理和实际应用。
eBPF架构
graph TB
subgraph UserSpace["用户空间"]
App["eBPF程序<br>(C/Go/Rust)"]
Loader["加载器<br>(libbpf/cilium/ebpf)"]
Reader["数据读取<br>(Map/Perf/Ring Buffer)"]
end
subgraph Kernel["内核空间"]
Verifier["验证器<br>(Verifier)"]
JIT["JIT编译器"]
VM["eBPF虚拟机"]
subgraph HookPoints["挂载点"]
KP["kprobe/kretprobe<br>内核函数"]
TP["tracepoint<br>内核跟踪点"]
XDP["XDP<br>网络数据包"]
TC["TC<br>流量控制"]
Socket["Socket Filter"]
LSM["LSM<br>安全模块"]
Sched["Scheduler"]
end
Maps["eBPF Maps<br>(键值存储)"]
end
App --> |"编译"| Loader
Loader --> |"加载"| Verifier
Verifier --> |"验证通过"| JIT
JIT --> VM
VM --> HookPoints
VM <--> Maps
Maps <--> Reader
style Verifier fill:#f44336,color:#fff
style JIT fill:#4CAF50,color:#fff
工作流程
sequenceDiagram
participant Dev as 开发者
participant Clang as Clang/LLVM
participant Loader as 加载器
participant Verifier as 验证器
participant JIT as JIT编译器
participant Hook as 挂载点
Dev->>Clang: 编写eBPF C程序
Clang->>Clang: 编译为BPF字节码
Clang-->>Loader: .o文件(ELF格式)
Loader->>Verifier: bpf()系统调用加载
Verifier->>Verifier: 安全性验证
alt 验证失败
Verifier-->>Loader: 拒绝加载 + 错误信息
else 验证通过
Verifier->>JIT: 传递字节码
JIT->>JIT: 编译为本机指令
JIT->>Hook: 挂载到内核钩子点
Note over Hook: 每次事件触发时执行
end
验证器(Verifier)
验证器是eBPF安全性的核心保障,确保程序不会崩溃内核:
graph TB
subgraph Checks["验证器检查项"]
DAG["DAG检查<br>无循环/无不可达代码"]
Bounds["边界检查<br>内存访问合法性"]
Stack["栈大小检查<br>最大512字节"]
Insn["指令数检查<br>最大100万条"]
Helper["辅助函数检查<br>只能调用允许的函数"]
Type["类型检查<br>指针类型安全"]
end
验证器确保的安全保证:
程序一定终止 :不允许无界循环(有界循环在5.3+支持)
内存安全 :所有内存访问经过边界检查
资源有限 :栈最大512字节,指令数有上限
权限控制 :不同程序类型有不同的辅助函数权限
eBPF Maps
Maps是eBPF程序与用户空间之间通信的核心数据结构:
graph LR
subgraph MapTypes["常用Map类型"]
Hash["BPF_MAP_TYPE_HASH<br>哈希表"]
Array["BPF_MAP_TYPE_ARRAY<br>数组"]
Perf["BPF_MAP_TYPE_PERF_EVENT_ARRAY<br>Perf事件"]
Ring["BPF_MAP_TYPE_RINGBUF<br>环形缓冲区"]
LRU["BPF_MAP_TYPE_LRU_HASH<br>LRU哈希表"]
LPM["BPF_MAP_TYPE_LPM_TRIE<br>最长前缀匹配"]
PerCPU["BPF_MAP_TYPE_PERCPU_HASH<br>每CPU哈希表"]
end
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 10240 ); __type(key, u32); __type(value, u64); } syscall_count SEC (".maps" ) ;struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(max_entries, 10240 ); __type(key, struct event_key); __type(value, u64); } event_counter SEC (".maps" ) ;struct { __uint(type, BPF_MAP_TYPE_RINGBUF); __uint(max_entries, 256 * 1024 ); } events SEC (".maps" ) ;
eBPF程序类型
kprobe - 内核函数跟踪
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 #include <vmlinux.h> #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> struct event { u32 pid; u32 ppid; u32 uid; char comm[16 ]; char filename[256 ]; };struct { __uint(type, BPF_MAP_TYPE_RINGBUF); __uint(max_entries, 256 * 1024 ); } events SEC (".maps" ) ; SEC("kprobe/sys_execve" )int trace_execve (struct pt_regs *ctx) { struct event *e ; struct task_struct *task ; e = bpf_ringbuf_reserve(&events, sizeof (*e), 0 ); if (!e) return 0 ; task = (struct task_struct *)bpf_get_current_task(); e->pid = bpf_get_current_pid_tgid() >> 32 ; e->ppid = BPF_CORE_READ(task, real_parent, tgid); e->uid = bpf_get_current_uid_gid() & 0xFFFFFFFF ; bpf_get_current_comm(&e->comm, sizeof (e->comm)); bpf_probe_read_user_str(&e->filename, sizeof (e->filename), (void *)PT_REGS_PARM1(ctx)); bpf_ringbuf_submit(e, 0 ); return 0 ; }char LICENSE[] SEC("license" ) = "GPL" ;
XDP - 高速网络数据处理
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 #include <linux/bpf.h> #include <linux/if_ether.h> #include <linux/ip.h> #include <linux/tcp.h> #include <bpf/bpf_helpers.h> struct { __uint(type, BPF_MAP_TYPE_LRU_HASH); __uint(max_entries, 100000 ); __type(key, __u32); __type(value, __u64); } ip_counter SEC (".maps" ) ;struct { __uint(type, BPF_MAP_TYPE_HASH); __uint(max_entries, 10000 ); __type(key, __u32); __type(value, __u8); } blacklist SEC (".maps" ) ;#define RATE_LIMIT 1000 SEC("xdp" )int xdp_firewall (struct xdp_md *ctx) { void *data = (void *)(long )ctx->data; void *data_end = (void *)(long )ctx->data_end; struct ethhdr *eth = data; if ((void *)(eth + 1 ) > data_end) return XDP_PASS; if (eth->h_proto != __constant_htons(ETH_P_IP)) return XDP_PASS; struct iphdr *ip = (void *)(eth + 1 ); if ((void *)(ip + 1 ) > data_end) return XDP_PASS; __u32 src_ip = ip->saddr; if (bpf_map_lookup_elem(&blacklist, &src_ip)) return XDP_DROP; __u64 *count = bpf_map_lookup_elem(&ip_counter, &src_ip); if (count) { __sync_fetch_and_add(count, 1 ); if (*count > RATE_LIMIT) return XDP_DROP; } else { __u64 init_val = 1 ; bpf_map_update_elem(&ip_counter, &src_ip, &init_val, BPF_ANY); } return XDP_PASS; }char LICENSE[] SEC("license" ) = "GPL" ;
graph LR
NIC["网卡"] --> XDP["XDP程序"]
XDP --> |"XDP_PASS"| Stack["网络协议栈"]
XDP --> |"XDP_DROP"| Drop["丢弃"]
XDP --> |"XDP_TX"| TX["原网卡发回"]
XDP --> |"XDP_REDIRECT"| Redirect["转发到其他网卡"]
style XDP fill:#FF9800,color:#fff
style Drop fill:#f44336,color:#fff
tracepoint - 内核跟踪点
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 SEC("tracepoint/sock/inet_sock_set_state" )int trace_tcp_state (struct trace_event_raw_inet_sock_set_state *ctx) { if (ctx->protocol != IPPROTO_TCP) return 0 ; if (ctx->newstate != TCP_ESTABLISHED) return 0 ; struct event *e ; e = bpf_ringbuf_reserve(&events, sizeof (*e), 0 ); if (!e) return 0 ; e->pid = bpf_get_current_pid_tgid() >> 32 ; e->saddr = ctx->saddr; e->daddr = ctx->daddr; e->sport = ctx->sport; e->dport = ctx->dport; bpf_get_current_comm(&e->comm, sizeof (e->comm)); bpf_ringbuf_submit(e, 0 ); return 0 ; }
BCC工具集
BCC(BPF Compiler Collection)提供了大量开箱即用的eBPF工具:
# Trace file opens system-wide
opensnoop

# Block I/O latency as a histogram
biolatency

# Trace outgoing TCP connections
tcpconnect

# TCP session lifespans
tcplife

# Latency of host name lookups
gethostlatency

# Outstanding memory allocations (possible leaks)
memleak

# ext4 / XFS operations slower than 1 ms
ext4slower 1
xfsslower 1

# Latency distribution of a kernel function
funclatency 'vfs_read'

# Last-level cache hit ratio per process
llcstat

# TCP retransmissions
tcpretrans
graph TB
subgraph BCCTools["BCC工具分类"]
subgraph CPU["CPU分析"]
execsnoop["execsnoop<br>新进程跟踪"]
runqlat["runqlat<br>调度延迟"]
profile["profile<br>CPU采样"]
cpudist["cpudist<br>CPU分布"]
end
subgraph IO["I/O分析"]
biolatency["biolatency<br>块I/O延迟"]
biotop["biotop<br>块I/O top"]
filetop["filetop<br>文件I/O top"]
cachestat["cachestat<br>缓存命中"]
end
subgraph Network["网络分析"]
tcpconnect["tcpconnect<br>TCP连接"]
tcplife["tcplife<br>TCP生命周期"]
tcpretrans["tcpretrans<br>TCP重传"]
tcpdrop["tcpdrop<br>TCP丢包"]
end
end
bpftrace
bpftrace是高级的eBPF跟踪语言,类似awk:
# Count system calls per process name
bpftrace -e 'tracepoint:raw_syscalls:sys_enter { @[comm] = count(); }'

# Histogram of vfs_read() latency in microseconds
bpftrace -e '
kprobe:vfs_read { @start[tid] = nsecs; }
kretprobe:vfs_read /@start[tid]/ {
    @usecs = hist((nsecs - @start[tid]) / 1000);
    delete(@start[tid]);
}
'

# Print every outgoing TCP connect with PID and process name
bpftrace -e '
kprobe:tcp_connect {
    printf("%-6d %-16s TCP connect\n", pid, comm);
}
'

# Report vfs_write() calls slower than 1 ms and build a latency histogram
bpftrace -e '
kprobe:vfs_write { @start[tid] = nsecs; }
kretprobe:vfs_write /@start[tid]/ {
    $dur = (nsecs - @start[tid]) / 1000;
    if ($dur > 1000) {
        printf("%-6d %-16s slow write: %d us\n", pid, comm, $dur);
    }
    @write_latency_us = hist($dur);
    delete(@start[tid]);
}
'

# Count page faults by process name and kernel stack
bpftrace -e '
software:page-faults:1 {
    @[comm, kstack] = count();
}
'

# Count hardware interrupts by handler name
# (irq_handler_entry is a tracepoint in the "irq" category)
bpftrace -e '
tracepoint:irq:irq_handler_entry {
    @[str(args->name)] = count();
}
'
网络可观测性
Cilium
Cilium是基于eBPF的容器网络和安全方案:
graph TB
subgraph Cilium["Cilium架构"]
Agent["Cilium Agent"]
Operator["Cilium Operator"]
Hubble["Hubble<br>网络可观测性"]
subgraph DataPlane["eBPF数据平面"]
XDP["XDP<br>L3/L4负载均衡"]
TC["TC<br>Pod网络策略"]
Socket["cgroup/socket<br>L7策略"]
Connect["cgroup/connect<br>透明代理"]
end
end
subgraph HubbleUI["Hubble可观测性"]
FlowLog["网络流日志"]
ServiceMap["服务拓扑图"]
Metrics["Prometheus指标"]
end
Agent --> DataPlane
Agent --> Hubble
Hubble --> FlowLog
Hubble --> ServiceMap
Hubble --> Metrics
# All flows in the production namespace
hubble observe --namespace production

# TCP flows of one pod
hubble observe --pod production/api-server --protocol TCP

# HTTP responses with status 500 (the filter flag is --http-status)
hubble observe --http-status 500

# DNS traffic
hubble observe --protocol DNS

# Dropped flows
hubble observe --verdict DROPPED

# Flows destined for a service
hubble observe --to-service production/api-server
Cilium L7可观测性
# L7 policy for the api-server pods in the production namespace.
apiVersion: cilium.io/v2
kind: CiliumNetworkPolicy
metadata:
  name: l7-visibility
  namespace: production
spec:
  endpointSelector:
    matchLabels:
      app: api-server
  # Outbound: api-server -> database on 5432/TCP, with L7 logging.
  egress:
    - toEndpoints:
        - matchLabels:
            app: database
      toPorts:
        - ports:
            - port: "5432"
              protocol: TCP
          rules:
            l7proto: postgres
            l7:
              - action: "log"
  # Inbound: frontend -> api-server on 8080/TCP, HTTP GET and POST only.
  ingress:
    - fromEndpoints:
        - matchLabels:
            app: frontend
      toPorts:
        - ports:
            - port: "8080"
              protocol: TCP
          rules:
            http:
              - method: "GET"
              - method: "POST"
安全监控
Falco
Falco使用eBPF(或内核模块)实现运行时安全检测:
# Interactive shell started inside a container.
- rule: Terminal Shell in Container
  desc: 检测在容器内打开的终端shell
  condition: >
    spawned_process and container
    and shell_procs and proc.tty != 0
    and container_entrypoint
  output: >
    Shell spawned in container
    (user=%user.name container=%container.name
    shell=%proc.name pid=%proc.pid)
  priority: WARNING
  tags: [container, shell]

# Outbound TCP connection from a container that is not on the allow list.
- rule: Suspicious Network Connection
  desc: 检测可疑的出站网络连接
  condition: >
    outbound and container
    and fd.l4proto=tcp
    and not allowed_outbound
  output: >
    Suspicious outbound connection
    (command=%proc.cmdline connection=%fd.name
    container=%container.name)
  priority: NOTICE
  tags: [network, container]

# Sensitive file opened for reading by a process that is not allow-listed.
- rule: Read Sensitive File
  desc: 检测读取敏感文件
  condition: >
    open_read and container
    and sensitive_files
    and not proc.name in (allowed_sensitive_readers)
  output: >
    Sensitive file read in container
    (file=%fd.name command=%proc.cmdline
    container=%container.name)
  priority: WARNING
  tags: [filesystem, container]
graph TB
subgraph Falco["Falco架构"]
eBPF["eBPF探针<br>(内核事件采集)"]
Engine["规则引擎<br>(条件匹配)"]
Output["输出通道"]
end
Kernel["Linux内核<br>系统调用"] --> eBPF
eBPF --> Engine
Engine --> |"匹配"| Output
Output --> Stdout["标准输出"]
Output --> Syslog["Syslog"]
Output --> Slack["Slack"]
Output --> Falcosidekick["Falcosidekick"]
Falcosidekick --> Loki["Loki"]
Falcosidekick --> ES["Elasticsearch"]
Falcosidekick --> Webhook["Webhook"]
Pixie
Pixie是基于eBPF的Kubernetes自动可观测性平台:
# Deploy Pixie to the current Kubernetes cluster
px deploy

# Run bundled scripts
px run px/http_data
px run px/service_stats
px run px/dns_data
px run px/mysql_data

# Run a custom PxL script from a local file
px run -f my_script.pxl
import px

# HTTP events from the last five minutes, restricted to one namespace.
df = px.DataFrame(table='http_events', start_time='-5m')
df = df[df.ctx['namespace'] == 'production']

df.latency_ms = df.resp_latency_ns / 1e6
# Boolean column; its mean over a group is that group's error rate.
df.failure = df.resp_status >= 400

# Each aggregate is a (column, function) pair. Arbitrary Python callables
# such as lambdas are not accepted, and px.quantiles takes no percentile
# argument: it returns all quantiles as one JSON value.
# NOTE(review): resp_status is part of the grouping key, so error_rate is
# 0 or 1 within each group; group by req_path alone for a per-path rate.
stats = df.groupby(['req_path', 'resp_status']).agg(
    count=('latency_ms', px.count),
    latency_quantiles=('latency_ms', px.quantiles),
    error_rate=('failure', px.mean),
)

# Extract the individual percentiles from the quantiles JSON.
stats.p50 = px.pluck_float64(stats.latency_quantiles, 'p50')
stats.p99 = px.pluck_float64(stats.latency_quantiles, 'p99')

stats = stats[['req_path', 'resp_status', 'count', 'p50', 'p99', 'error_rate']]

px.display(stats, 'HTTP Stats')
使用Go开发eBPF程序
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 package mainimport ( "bytes" "encoding/binary" "fmt" "log" "os" "os/signal" "github.com/cilium/ebpf" "github.com/cilium/ebpf/link" "github.com/cilium/ebpf/ringbuf" )type Event struct { PID uint32 PPID uint32 UID uint32 Comm [16 ]byte Filename [256 ]byte }func main () { objs := bpfObjects{} if err := loadBpfObjects(&objs, nil ); err != nil { log.Fatalf("loading objects: %v" , err) } defer objs.Close() kp, err := link.Kprobe("sys_execve" , objs.TraceExecve, nil ) if err != nil { log.Fatalf("opening kprobe: %v" , err) } defer kp.Close() rd, err := ringbuf.NewReader(objs.Events) if err != nil { log.Fatalf("opening ringbuf reader: %v" , err) } defer rd.Close() sig := make (chan os.Signal, 1 ) signal.Notify(sig, os.Interrupt) go func () { <-sig rd.Close() }() fmt.Printf("%-8s %-8s %-16s %s\n" , "PID" , "PPID" , "COMM" , "FILENAME" ) for { record, err := rd.Read() if err != nil { return } var event Event if err := binary.Read(bytes.NewReader(record.RawSample), binary.LittleEndian, &event); err != nil { continue } comm := string (bytes.TrimRight(event.Comm[:], "\x00" )) filename := string (bytes.TrimRight(event.Filename[:], "\x00" )) fmt.Printf("%-8d %-8d %-16s %s\n" , event.PID, event.PPID, comm, filename) } }
eBPF生态全景
graph TB
subgraph Ecosystem["eBPF生态"]
subgraph Networking["网络"]
Cilium2["Cilium<br>CNI + Service Mesh"]
Katran["Katran<br>L4负载均衡"]
Cloudflare["Cloudflare<br>DDoS防护"]
end
subgraph Observability["可观测性"]
Pixie2["Pixie<br>自动可观测性"]
Hubble2["Hubble<br>网络可观测性"]
Parca["Parca<br>持续剖析"]
Pyroscope["Pyroscope<br>持续剖析"]
end
subgraph Security["安全"]
Falco2["Falco<br>运行时安全"]
Tetragon["Tetragon<br>安全可观测性"]
Tracee["Tracee<br>安全跟踪"]
end
subgraph Tools["开发工具"]
BCC2["BCC<br>工具集"]
bpftrace2["bpftrace<br>跟踪语言"]
libbpf2["libbpf<br>C库"]
ciliumebpf["cilium/ebpf<br>Go库"]
aya["Aya<br>Rust库"]
end
end
总结
eBPF技术的核心价值:
安全性 :验证器确保程序不会崩溃内核,可在生产环境安全使用
高性能 :在内核空间运行,JIT编译为原生指令,避免内核-用户空间切换
可编程 :无需修改内核或加载模块,动态注入观测和控制逻辑
广泛应用 :网络(Cilium)、安全(Falco/Tetragon)、可观测性(Pixie/Hubble)
eBPF正在成为Linux内核的“JavaScript”——一个可编程的扩展平台。随着工具生态的成熟,eBPF将成为每个DevOps工程师必备的技能。从BCC工具和bpftrace入门,逐步深入到使用Go/Rust开发自定义eBPF程序,是推荐的学习路径。