| /* |
| * eBPF kernel space program part |
| * |
| * Toy eBPF program for demonstration purposes, some parts derived from |
| * kernel tree's samples/bpf/sockex2_kern.c example. |
| * |
| * More background on eBPF, kernel tree: Documentation/networking/filter.txt |
| * |
| * Note, this file is rather large; most classifiers and actions are |
| * likely smaller, accomplish one specific use-case and are tailored |
| * for high performance. For performance reasons, you might also have |
| * the action already merged into the classifier. |
| * |
| * In order to show various features it serves as a bigger programming |
| * example, which you should feel free to rip apart and experiment with. |
| * |
| * Compilation, configuration example: |
| * |
| * Note: as long as the BPF backend in LLVM is still experimental, |
| * you need to build LLVM with --enable-experimental-targets=BPF |
| * Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y, |
| * and you have libelf.h and gelf.h headers and can link tc against -lelf. |
| * |
| * In case you need to sync kernel headers, go to your kernel source tree: |
| * # make headers_install INSTALL_HDR_PATH=/usr/ |
| * |
| * $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH |
| * $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o |
| * $ objdump -h bpf.o |
| * [...] |
| * 3 classifier 000007f8 0000000000000000 0000000000000000 00000040 2**3 |
| * CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE |
| * 4 action-mark 00000088 0000000000000000 0000000000000000 00000838 2**3 |
| * CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE |
| * 5 action-rand 00000098 0000000000000000 0000000000000000 000008c0 2**3 |
| * CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE |
| * 6 maps 00000030 0000000000000000 0000000000000000 00000958 2**2 |
| * CONTENTS, ALLOC, LOAD, DATA |
| * 7 license 00000004 0000000000000000 0000000000000000 00000988 2**0 |
| * CONTENTS, ALLOC, LOAD, DATA |
| * [...] |
| * # echo 1 > /proc/sys/net/core/bpf_jit_enable |
| * $ gcc bpf_agent.c -o bpf_agent -Wall -O2 |
| * # ./bpf_agent /tmp/bpf-uds (e.g. on a different terminal) |
| * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \ |
| * action bpf obj bpf.o sec action-mark \ |
| * action bpf obj bpf.o sec action-rand ok |
| * # tc filter show dev em1 |
| * filter parent 1: protocol all pref 49152 bpf |
| * filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier] |
| * action order 1: bpf bpf.o:[action-mark] default-action pipe |
| * index 52 ref 1 bind 1 |
| * |
| * action order 2: bpf bpf.o:[action-rand] default-action pipe |
| * index 53 ref 1 bind 1 |
| * |
| * action order 3: gact action pass |
| * random type none pass val 0 |
| * index 38 ref 1 bind 1 |
| * |
| * The same program can also be installed on ingress side (as opposed to above |
| * egress configuration), e.g.: |
| * |
| * # tc qdisc add dev em1 handle ffff: ingress |
| * # tc filter add dev em1 parent ffff: bpf obj ... |
| * |
| * Notes on BPF agent: |
| * |
| * In the above example, the bpf_agent creates the unix domain socket |
| * natively. "tc exec" can also spawn a shell and hold the sockets there: |
| * |
| * # tc exec bpf imp /tmp/bpf-uds |
| * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \ |
| * action bpf obj bpf.o sec action-mark \ |
| * action bpf obj bpf.o sec action-rand ok |
| * sh-4.2# (shell spawned from tc exec) |
| * sh-4.2# bpf_agent |
| * [...] |
| * |
| * This will read out fds over environment and produce the same data dump |
| * as below. This has the advantage that the spawned shell owns the fds |
| * and thus if the agent is restarted, it can reattach to the same fds, also |
| * various programs can easily read/modify the data simultaneously from user |
| * space side. |
| * |
| * If the shell is unnecessary, the agent can also just be spawned directly |
| * via tc exec: |
| * |
| * # tc exec bpf imp /tmp/bpf-uds run bpf_agent |
| * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \ |
| * action bpf obj bpf.o sec action-mark \ |
| * action bpf obj bpf.o sec action-rand ok |
| * |
| * BPF agent example output: |
| * |
| * ver: 1 |
| * obj: bpf.o |
| * dev: 64770 |
| * ino: 6045133 |
| * maps: 3 |
| * map0: |
| * `- fd: 4 |
| * | serial: 1 |
| * | type: 1 |
| * | max elem: 256 |
| * | size key: 1 |
| * ` size val: 16 |
| * map1: |
| * `- fd: 5 |
| * | serial: 2 |
| * | type: 1 |
| * | max elem: 1024 |
| * | size key: 4 |
| * ` size val: 16 |
| * map2: |
| * `- fd: 6 |
| * | serial: 3 |
| * | type: 2 |
| * | max elem: 64 |
| * | size key: 4 |
| * ` size val: 8 |
| * data, period: 5sec |
| * `- number of drops: cpu0: 0 cpu1: 0 cpu2: 0 cpu3: 0 |
| * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 0, mis: 0] q3:[pkts: 0, mis: 0] |
| * ` protos: tcp:[pkts: 0, bytes: 0] udp:[pkts: 0, bytes: 0] icmp:[pkts: 0, bytes: 0] |
| * data, period: 5sec |
| * `- number of drops: cpu0: 5 cpu1: 0 cpu2: 0 cpu3: 1 |
| * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 24, mis: 14] q3:[pkts: 0, mis: 0] |
| * ` protos: tcp:[pkts: 13, bytes: 1989] udp:[pkts: 10, bytes: 710] icmp:[pkts: 0, bytes: 0] |
| * data, period: 5sec |
| * `- number of drops: cpu0: 5 cpu1: 0 cpu2: 3 cpu3: 3 |
| * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 39, mis: 21] q3:[pkts: 0, mis: 0] |
| * ` protos: tcp:[pkts: 20, bytes: 3549] udp:[pkts: 18, bytes: 1278] icmp:[pkts: 0, bytes: 0] |
| * [...] |
| * |
| * This now means, the below classifier and action pipeline has been loaded |
| * as eBPF bytecode into the kernel, the kernel has verified that the |
| * execution of the bytecode is "safe", and it has JITed the programs |
| * afterwards, so that upon invocation they're running on native speed. tc |
| * has transferred all map file descriptors to the bpf_agent via IPC and |
| * even after tc exits, the agent can read out or modify all map data. |
| * |
| * Note that the export to the uds is done only once in the classifier and |
| * not in the action. It's enough to export the (here) shared descriptors |
| * once. |
| * |
| * If you need to disassemble the generated JIT image (write 2 instead of |
| * 1 to /proc/sys/net/core/bpf_jit_enable), the kernel tree has a small |
| * helper under tools/net/ that you can invoke, e.g. `bpf_jit_disasm -o`. |
| * |
| * Please find in the code below further comments. |
| * |
| * -- Happy eBPF hacking! ;) |
| */ |
| #include <stdint.h> |
| #include <stdbool.h> |
| #include <sys/types.h> |
| #include <sys/socket.h> |
| #include <asm/types.h> |
| #include <linux/in.h> |
| #include <linux/if.h> |
| #include <linux/if_ether.h> |
| #include <linux/ip.h> |
| #include <linux/ipv6.h> |
| #include <linux/if_tunnel.h> |
| #include <linux/filter.h> |
| #include <linux/bpf.h> |
| |
| /* Common, shared definitions with bpf_agent.c. */ |
| #include "bpf_shared.h" |
| /* BPF helper functions for our example. */ |
| #include "../../include/bpf_api.h" |
| |
/* Could be defined here as well, or included from the header.
 *
 * tc opcodes returned by the action programs in this file. Only
 * TC_ACT_UNSPEC (fall back to the default configured tc opcode) and
 * TC_ACT_SHOT (returned right after a drop has been counted) are used
 * below; the others are listed for completeness.
 */
#define TC_ACT_UNSPEC (-1)
#define TC_ACT_OK 0
#define TC_ACT_RECLASSIFY 1
#define TC_ACT_SHOT 2
#define TC_ACT_PIPE 3
#define TC_ACT_STOLEN 4
#define TC_ACT_QUEUED 5
#define TC_ACT_REPEAT 6
| |
/* Other, misc stuff.
 *
 * Masks applied by flow_is_frag() to the IPv4 frag_off field: IP_MF is
 * the "more fragments" flag bit, IP_OFFSET covers the low 13 bits that
 * hold the fragment offset.
 */
#define IP_MF 0x2000
#define IP_OFFSET 0x1FFF
| |
/* eBPF map definitions, all placed in section "maps". */

/* Per-protocol counters, updated by cls_update_proto_map(): the key is
 * the one-byte IP protocol number taken from the dissected flow, the
 * value a struct count_tuple (packets, bytes). 256 entries cover every
 * possible one-byte key.
 */
struct bpf_elf_map __section("maps") map_proto = {
	.type = BPF_MAP_TYPE_HASH,
	.id = BPF_MAP_ID_PROTO,	/* presumably declared in bpf_shared.h */
	.size_key = sizeof(uint8_t),
	.size_value = sizeof(struct count_tuple),
	.max_elem = 256,
};
| |
/* Per-queue counters, updated by cls_update_queue_map(): the key is
 * skb->queue_mapping, the value a struct count_queue (total, mismatch).
 */
struct bpf_elf_map __section("maps") map_queue = {
	.type = BPF_MAP_TYPE_HASH,
	.id = BPF_MAP_ID_QUEUE,	/* presumably declared in bpf_shared.h */
	.size_key = sizeof(uint32_t),
	.size_value = sizeof(struct count_queue),
	.max_elem = 1024,
};
| |
/* Per-CPU drop counters, updated by act_update_drop_map(): an array
 * indexed by the id returned from get_smp_processor_id(), with one
 * long-sized counter slot per index and room for 64 indices.
 */
struct bpf_elf_map __section("maps") map_drops = {
	.type = BPF_MAP_TYPE_ARRAY,
	.id = BPF_MAP_ID_DROPS,	/* presumably declared in bpf_shared.h */
	.size_key = sizeof(uint32_t),
	.size_value = sizeof(long),
	.max_elem = 64,
};
| |
| /* Helper functions and definitions for the flow dissector used by the |
| * example classifier. This resembles the kernel's flow dissector to |
| * some extent and is just used as an example to show what's possible |
| * with eBPF. |
| */ |
| struct sockaddr; |
| |
/* Layout of one VLAN tag as parsed by flow_dissector(). Only the
 * offset of h_vlan_encapsulated_proto and the overall size are used.
 */
struct vlan_hdr {
	__be16 h_vlan_TCI;			/* tag control information, not read here */
	__be16 h_vlan_encapsulated_proto;	/* protocol of the encapsulated payload */
};
| |
/* Result of flow_dissector(). Multi-byte values come from load_word(),
 * i.e. they are stored exactly as that helper returns them.
 */
struct flow_keys {
	__u32 src;		/* IPv4 saddr, or XOR of the four IPv6 saddr words */
	__u32 dst;		/* IPv4 daddr, or XOR of the four IPv6 daddr words */
	union {
		__u32 ports;	/* word loaded at the transport header offset */
		__u16 port16[2];	/* the same word viewed as two 16-bit halves */
	};
	__s32 th_off;		/* offset the ports word was loaded from */
	__u8 ip_proto;		/* innermost IP protocol found (0 for IPv4 fragments) */
};
| |
/* Return the number of bytes flow_dissector() has to skip from the start
 * of the transport header before loading the ports word.
 *
 * Only AH needs a non-zero offset (4). TCP, UDP, DCCP, ESP, SCTP,
 * UDP-Lite and every unknown protocol use offset 0.
 */
static inline int flow_ports_offset(__u8 ip_proto)
{
	if (ip_proto == IPPROTO_AH)
		return 4;

	return 0;
}
| |
| static inline bool flow_is_frag(struct __sk_buff *skb, int nh_off) |
| { |
| return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) & |
| (IP_MF | IP_OFFSET)); |
| } |
| |
| static inline int flow_parse_ipv4(struct __sk_buff *skb, int nh_off, |
| __u8 *ip_proto, struct flow_keys *flow) |
| { |
| __u8 ip_ver_len; |
| |
| if (unlikely(flow_is_frag(skb, nh_off))) |
| *ip_proto = 0; |
| else |
| *ip_proto = load_byte(skb, nh_off + offsetof(struct iphdr, |
| protocol)); |
| if (*ip_proto != IPPROTO_GRE) { |
| flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr)); |
| flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr)); |
| } |
| |
| ip_ver_len = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */); |
| if (likely(ip_ver_len == 0x45)) |
| nh_off += 20; |
| else |
| nh_off += (ip_ver_len & 0xF) << 2; |
| |
| return nh_off; |
| } |
| |
| static inline __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off) |
| { |
| __u32 w0 = load_word(skb, off); |
| __u32 w1 = load_word(skb, off + sizeof(w0)); |
| __u32 w2 = load_word(skb, off + sizeof(w0) * 2); |
| __u32 w3 = load_word(skb, off + sizeof(w0) * 3); |
| |
| return w0 ^ w1 ^ w2 ^ w3; |
| } |
| |
| static inline int flow_parse_ipv6(struct __sk_buff *skb, int nh_off, |
| __u8 *ip_proto, struct flow_keys *flow) |
| { |
| *ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr)); |
| |
| flow->src = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, saddr)); |
| flow->dst = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, daddr)); |
| |
| return nh_off + sizeof(struct ipv6hdr); |
| } |
| |
| static inline bool flow_dissector(struct __sk_buff *skb, |
| struct flow_keys *flow) |
| { |
| int poff, nh_off = BPF_LL_OFF + ETH_HLEN; |
| __be16 proto = skb->protocol; |
| __u8 ip_proto; |
| |
| /* TODO: check for skb->vlan_tci, skb->vlan_proto first */ |
| if (proto == htons(ETH_P_8021AD)) { |
| proto = load_half(skb, nh_off + |
| offsetof(struct vlan_hdr, h_vlan_encapsulated_proto)); |
| nh_off += sizeof(struct vlan_hdr); |
| } |
| if (proto == htons(ETH_P_8021Q)) { |
| proto = load_half(skb, nh_off + |
| offsetof(struct vlan_hdr, h_vlan_encapsulated_proto)); |
| nh_off += sizeof(struct vlan_hdr); |
| } |
| |
| if (likely(proto == htons(ETH_P_IP))) |
| nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow); |
| else if (proto == htons(ETH_P_IPV6)) |
| nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow); |
| else |
| return false; |
| |
| switch (ip_proto) { |
| case IPPROTO_GRE: { |
| struct gre_hdr { |
| __be16 flags; |
| __be16 proto; |
| }; |
| |
| __u16 gre_flags = load_half(skb, nh_off + |
| offsetof(struct gre_hdr, flags)); |
| __u16 gre_proto = load_half(skb, nh_off + |
| offsetof(struct gre_hdr, proto)); |
| |
| if (gre_flags & (GRE_VERSION | GRE_ROUTING)) |
| break; |
| |
| nh_off += 4; |
| if (gre_flags & GRE_CSUM) |
| nh_off += 4; |
| if (gre_flags & GRE_KEY) |
| nh_off += 4; |
| if (gre_flags & GRE_SEQ) |
| nh_off += 4; |
| |
| if (gre_proto == ETH_P_8021Q) { |
| gre_proto = load_half(skb, nh_off + |
| offsetof(struct vlan_hdr, |
| h_vlan_encapsulated_proto)); |
| nh_off += sizeof(struct vlan_hdr); |
| } |
| if (gre_proto == ETH_P_IP) |
| nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow); |
| else if (gre_proto == ETH_P_IPV6) |
| nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow); |
| else |
| return false; |
| break; |
| } |
| case IPPROTO_IPIP: |
| nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow); |
| break; |
| case IPPROTO_IPV6: |
| nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow); |
| default: |
| break; |
| } |
| |
| nh_off += flow_ports_offset(ip_proto); |
| |
| flow->ports = load_word(skb, nh_off); |
| flow->th_off = nh_off; |
| flow->ip_proto = ip_proto; |
| |
| return true; |
| } |
| |
| static inline void cls_update_proto_map(const struct __sk_buff *skb, |
| const struct flow_keys *flow) |
| { |
| uint8_t proto = flow->ip_proto; |
| struct count_tuple *ct, _ct; |
| |
| ct = map_lookup_elem(&map_proto, &proto); |
| if (likely(ct)) { |
| lock_xadd(&ct->packets, 1); |
| lock_xadd(&ct->bytes, skb->len); |
| return; |
| } |
| |
| /* No hit yet, we need to create a new entry. */ |
| _ct.packets = 1; |
| _ct.bytes = skb->len; |
| |
| map_update_elem(&map_proto, &proto, &_ct, BPF_ANY); |
| } |
| |
| static inline void cls_update_queue_map(const struct __sk_buff *skb) |
| { |
| uint32_t queue = skb->queue_mapping; |
| struct count_queue *cq, _cq; |
| bool mismatch; |
| |
| mismatch = skb->queue_mapping != get_smp_processor_id(); |
| |
| cq = map_lookup_elem(&map_queue, &queue); |
| if (likely(cq)) { |
| lock_xadd(&cq->total, 1); |
| if (mismatch) |
| lock_xadd(&cq->mismatch, 1); |
| return; |
| } |
| |
| /* No hit yet, we need to create a new entry. */ |
| _cq.total = 1; |
| _cq.mismatch = mismatch ? 1 : 0; |
| |
| map_update_elem(&map_queue, &queue, &_cq, BPF_ANY); |
| } |
| |
| /* eBPF program definitions, placed in various sections, which can |
| * have custom section names. If custom names are in use, it's |
| * required to point tc to the correct section, e.g. |
| * |
| * tc filter add [...] bpf obj cls.o sec cls-tos [...] |
| * |
| * in case the program resides in __section("cls-tos"). |
| * |
| * Default section for cls_bpf is: "classifier", for act_bpf is: |
| * "action". Naturally, if for example multiple actions are present |
| * in the same file, they need to have distinct section names. |
| * |
| * It is however not required to have multiple programs sharing |
| * a file. |
| */ |
| __section("classifier") |
| int cls_main(struct __sk_buff *skb) |
| { |
| struct flow_keys flow; |
| |
| if (!flow_dissector(skb, &flow)) |
| return 0; /* No match in cls_bpf. */ |
| |
| cls_update_proto_map(skb, &flow); |
| cls_update_queue_map(skb); |
| |
| return flow.ip_proto; |
| } |
| |
| static inline void act_update_drop_map(void) |
| { |
| uint32_t *count, cpu = get_smp_processor_id(); |
| |
| count = map_lookup_elem(&map_drops, &cpu); |
| if (count) |
| /* Only this cpu is accessing this element. */ |
| (*count)++; |
| } |
| |
| __section("action-mark") |
| int act_mark_main(struct __sk_buff *skb) |
| { |
| /* You could also mangle skb data here with the helper function |
| * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could |
| * do that already in the classifier itself as a merged combination |
| * of classifier'n'action model. |
| */ |
| |
| if (skb->mark == 0xcafe) { |
| act_update_drop_map(); |
| return TC_ACT_SHOT; |
| } |
| |
| /* Default configured tc opcode. */ |
| return TC_ACT_UNSPEC; |
| } |
| |
| __section("action-rand") |
| int act_rand_main(struct __sk_buff *skb) |
| { |
| /* Sorry, we're near event horizon ... */ |
| if ((get_prandom_u32() & 3) == 0) { |
| act_update_drop_map(); |
| return TC_ACT_SHOT; |
| } |
| |
| return TC_ACT_UNSPEC; |
| } |
| |
/* Last but not least, the file contains a license. Some future helper
 * functions may only be available with a GPL license.
 *
 * The string presumably ends up in the object's "license" section that
 * the objdump listing in the file header shows.
 */
BPF_LICENSE("GPL");