iproute2/examples/bpf/bpf_prog.c - nest-cam/4320010/iproute2 - Git at Google

 /*
  * eBPF kernel space program part
  *
  * Toy eBPF program for demonstration purposes, some parts derived from
  * kernel tree's samples/bpf/sockex2_kern.c example.
  *
  * More background on eBPF, kernel tree: Documentation/networking/filter.txt
  *
  * Note, this file is rather large, and most classifier and actions are
  * likely smaller to accomplish one specific use-case and are tailored
  * for high performance. For performance reasons, you might also have the
  * classifier and action already merged inside the classifier.
  *
  * In order to show various features it serves as a bigger programming
  * example, which you should feel free to rip apart and experiment with.
  *
  * Compilation, configuration example:
  *
  *  Note: as long as the BPF backend in LLVM is still experimental,
  *  you need to build LLVM with --enable-experimental-targets=BPF
  *  Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y,
  *  and you have libelf.h and gelf.h headers and can link tc against -lelf.
  *
  *  In case you need to sync kernel headers, go to your kernel source tree:
  *  # make headers_install INSTALL_HDR_PATH=/usr/
  *
  *  $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH
  *  $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o
  *  $ objdump -h bpf.o
  *  [...]
  *  3 classifier    000007f8  0000000000000000  0000000000000000  00000040  2**3
  *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
  *  4 action-mark   00000088  0000000000000000  0000000000000000  00000838  2**3
  *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
  *  5 action-rand   00000098  0000000000000000  0000000000000000  000008c0  2**3
  *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
  *  6 maps          00000030  0000000000000000  0000000000000000  00000958  2**2
  *                  CONTENTS, ALLOC, LOAD, DATA
  *  7 license       00000004  0000000000000000  0000000000000000  00000988  2**0
  *                  CONTENTS, ALLOC, LOAD, DATA
  *  [...]
  *  # echo 1 > /proc/sys/net/core/bpf_jit_enable
  *  $ gcc bpf_agent.c -o bpf_agent -Wall -O2
  *  # ./bpf_agent /tmp/bpf-uds      (e.g. on a different terminal)
  *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
  *                             action bpf obj bpf.o sec action-mark            \
  *                             action bpf obj bpf.o sec action-rand ok
  *  # tc filter show dev em1
  *  filter parent 1: protocol all pref 49152 bpf
  *  filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier]
  *    action order 1: bpf bpf.o:[action-mark] default-action pipe
  *    index 52 ref 1 bind 1
  *
  *    action order 2: bpf bpf.o:[action-rand] default-action pipe
  *    index 53 ref 1 bind 1
  *
  *    action order 3: gact action pass
  *    random type none pass val 0
  *    index 38 ref 1 bind 1
  *
  * The same program can also be installed on ingress side (as opposed to above
  * egress configuration), e.g.:
  *
  * # tc qdisc add dev em1 handle ffff: ingress
  * # tc filter add dev em1 parent ffff: bpf obj ...
  *
  * Notes on BPF agent:
  *
  * In the above example, the bpf_agent creates the unix domain socket
  * natively. "tc exec" can also spawn a shell and hold the sockets there:
  *
  *  # tc exec bpf imp /tmp/bpf-uds
  *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
  *                             action bpf obj bpf.o sec action-mark            \
  *                             action bpf obj bpf.o sec action-rand ok
  *  sh-4.2# (shell spawned from tc exec)
  *  sh-4.2# bpf_agent
  *  [...]
  *
  * This will read out fds over environment and produce the same data dump
  * as below. This has the advantage that the spawned shell owns the fds
  * and thus if the agent is restarted, it can reattach to the same fds, also
  * various programs can easily read/modify the data simultaneously from user
  * space side.
  *
  * If the shell is unnecessary, the agent can also just be spawned directly
  * via tc exec:
  *
  *  # tc exec bpf imp /tmp/bpf-uds run bpf_agent
  *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
  *                             action bpf obj bpf.o sec action-mark            \
  *                             action bpf obj bpf.o sec action-rand ok
  *
  * BPF agent example output:
  *
  * ver: 1
  * obj: bpf.o
  * dev: 64770
  * ino: 6045133
  * maps: 3
  * map0:
  *  `- fd: 4
  *   | serial: 1
  *   | type: 1
  *   | max elem: 256
  *   | size key: 1
  *   ` size val: 16
  * map1:
  *  `- fd: 5
  *   | serial: 2
  *   | type: 1
  *   | max elem: 1024
  *   | size key: 4
  *   ` size val: 16
  * map2:
  *  `- fd: 6
  *   | serial: 3
  *   | type: 2
  *   | max elem: 64
  *   | size key: 4
  *   ` size val: 8
  * data, period: 5sec
  *  `- number of drops:	cpu0:     0	cpu1:     0	cpu2:     0	cpu3:     0
  *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 0, mis: 0]	q3:[pkts: 0, mis: 0]
  *   ` protos:	tcp:[pkts: 0, bytes: 0]	udp:[pkts: 0, bytes: 0]	icmp:[pkts: 0, bytes: 0]
  * data, period: 5sec
  *  `- number of drops:	cpu0:     5	cpu1:     0	cpu2:     0	cpu3:     1
  *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 24, mis: 14]	q3:[pkts: 0, mis: 0]
  *   ` protos:	tcp:[pkts: 13, bytes: 1989]	udp:[pkts: 10, bytes: 710]	icmp:[pkts: 0, bytes: 0]
  * data, period: 5sec
  *  `- number of drops:	cpu0:     5	cpu1:     0	cpu2:     3	cpu3:     3
  *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 39, mis: 21]	q3:[pkts: 0, mis: 0]
  *   ` protos:	tcp:[pkts: 20, bytes: 3549]	udp:[pkts: 18, bytes: 1278]	icmp:[pkts: 0, bytes: 0]
  * [...]
  *
  * This now means, the below classifier and action pipeline has been loaded
  * as eBPF bytecode into the kernel, the kernel has verified that the
  * execution of the bytecode is "safe", and it has JITed the programs
  * afterwards, so that upon invocation they're running on native speed. tc
  * has transferred all map file descriptors to the bpf_agent via IPC and
  * even after tc exits, the agent can read out or modify all map data.
  *
  * Note that the export to the uds is done only once in the classifier and
  * not in the action. It's enough to export the (here) shared descriptors
  * once.
  *
  * If you need to disassemble the generated JIT image (echo with 2), the
  * kernel tree has under tools/net/ a small helper, you can invoke e.g.
  * `bpf_jit_disasm -o`.
  *
  * Please find in the code below further comments.
  *
  *   -- Happy eBPF hacking! ;)
  */
 #include <stdint.h>
 #include <stdbool.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <asm/types.h>
 #include <linux/in.h>
 #include <linux/if.h>
 #include <linux/if_ether.h>
 #include <linux/ip.h>
 #include <linux/ipv6.h>
 #include <linux/if_tunnel.h>
 #include <linux/filter.h>
 #include <linux/bpf.h>

 /* Common, shared definitions with bpf_agent.c. */
 #include "bpf_shared.h"
 /* BPF helper functions for our example. */
 #include "../../include/bpf_api.h"

 /* tc action return codes. Could be defined here as well, or included
  * from the header. Only two of them are used in this file: the action
  * programs return TC_ACT_SHOT after accounting a drop, and TC_ACT_UNSPEC
  * to fall back to the default configured tc opcode.
  */
 #define TC_ACT_UNSPEC		(-1)
 #define TC_ACT_OK		0
 #define TC_ACT_RECLASSIFY	1
 #define TC_ACT_SHOT		2
 #define TC_ACT_PIPE		3
 #define TC_ACT_STOLEN		4
 #define TC_ACT_QUEUED		5
 #define TC_ACT_REPEAT		6

 /* Other, misc stuff: bit masks applied to the IPv4 frag_off field by
  * flow_is_frag(). IP_MF is the "more fragments" flag, IP_OFFSET the
  * fragment offset bits; a packet with any of them set counts as fragment.
  */
 #define IP_MF			0x2000
 #define IP_OFFSET		0x1FFF

 /* eBPF map definitions, all placed in section "maps". */
 /* Hash map with per-protocol counters: the key is the one-byte IP
  * protocol found by the flow dissector, the value a struct count_tuple
  * (packets and bytes) maintained by cls_update_proto_map().
  */
 struct bpf_elf_map __section("maps") map_proto = {
 	.id		= BPF_MAP_ID_PROTO,
 	.type		= BPF_MAP_TYPE_HASH,
 	.max_elem	= 256,
 	.size_key	= sizeof(uint8_t),
 	.size_value	= sizeof(struct count_tuple),
 };

 /* Hash map with per-queue counters: the key is skb->queue_mapping, the
  * value a struct count_queue (total and mismatch) maintained by
  * cls_update_queue_map().
  */
 struct bpf_elf_map __section("maps") map_queue = {
 	.id		= BPF_MAP_ID_QUEUE,
 	.type		= BPF_MAP_TYPE_HASH,
 	.max_elem	= 1024,
 	.size_key	= sizeof(uint32_t),
 	.size_value	= sizeof(struct count_queue),
 };

 /* Array map with drop counters: indexed by the id of the CPU that ran
  * the dropping action, one long-sized slot per index, updated by
  * act_update_drop_map().
  */
 struct bpf_elf_map __section("maps") map_drops = {
 	.id		= BPF_MAP_ID_DROPS,
 	.type		= BPF_MAP_TYPE_ARRAY,
 	.max_elem	= 64,
 	.size_key	= sizeof(uint32_t),
 	.size_value	= sizeof(long),
 };

 /* Helper functions and definitions for the flow dissector used by the
  * example classifier. This resembles the kernel's flow dissector to
  * some extent and is just used as an example to show what's possible
  * with eBPF.
  */
 struct sockaddr;

 /* Layout of a VLAN tag as skipped by the flow dissector, which reads
  * h_vlan_encapsulated_proto and then advances by sizeof(struct vlan_hdr).
  */
 struct vlan_hdr {
 	/* Tag control information; not read anywhere in this file. */
 	__be16 h_vlan_TCI;
 	/* Protocol of whatever follows the tag. */
 	__be16 h_vlan_encapsulated_proto;
 };

 /* Output of flow_dissector(): what was learned about a packet's flow. */
 struct flow_keys {
 	/* IPv4 source address, or the XOR of the four IPv6 address words. */
 	__u32 src;
 	/* Destination counterpart of src. */
 	__u32 dst;
 	union {
 		/* 32-bit word loaded at th_off. */
 		__u32 ports;
 		/* The same bytes viewed as two 16-bit halves. */
 		__u16 port16[2];
 	};
 	/* Packet offset at which ports was loaded. */
 	__s32 th_off;
 	/* IP protocol of the parsed header; 0 for IPv4 fragments. */
 	__u8 ip_proto;
 };

 /* Return the offset, relative to the end of the network header, from
  * which flow_dissector() loads flow->ports for the given protocol.
  *
  * Only AH needs an adjustment (4 bytes). TCP, UDP, DCCP, ESP, SCTP,
  * UDP-Lite and every other protocol number yield 0.
  */
 static inline int flow_ports_offset(__u8 ip_proto)
 {
 	if (ip_proto == IPPROTO_AH)
 		return 4;

 	return 0;
 }

 /* Tell whether the IPv4 header at nh_off belongs to a fragment, i.e.
  * whether any of the IP_MF / IP_OFFSET bits is set in its frag_off field.
  */
 static inline bool flow_is_frag(struct __sk_buff *skb, int nh_off)
 {
 	__u16 frag_off = load_half(skb, nh_off + offsetof(struct iphdr, frag_off));

 	return (frag_off & (IP_MF | IP_OFFSET)) != 0;
 }

 /* Parse the IPv4 header located at nh_off.
  *
  * Stores the header's protocol in *ip_proto (0 if the packet is a
  * fragment) and, unless that protocol is GRE, the source/destination
  * addresses in flow->src / flow->dst.
  *
  * Returns the offset of the first byte behind the IPv4 header, derived
  * from the header length field.
  */
 static inline int flow_parse_ipv4(struct __sk_buff *skb, int nh_off,
 				  __u8 *ip_proto, struct flow_keys *flow)
 {
 	__u8 ver_ihl;

 	*ip_proto = unlikely(flow_is_frag(skb, nh_off)) ? 0 :
 		    load_byte(skb, nh_off + offsetof(struct iphdr, protocol));

 	/* For GRE the addresses are left to the encapsulated header. */
 	if (*ip_proto != IPPROTO_GRE) {
 		flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr));
 		flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr));
 	}

 	/* First header byte: version in the high, IHL in the low nibble. */
 	ver_ihl = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */);
 	if (unlikely(ver_ihl != 0x45))
 		return nh_off + ((ver_ihl & 0xF) << 2);

 	/* Common case: IPv4 without options. */
 	return nh_off + 20;
 }

 /* Fold the 16-byte IPv6 address at packet offset off into 32 bits by
  * XOR-ing its four consecutive 32-bit words.
  */
 static inline __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off)
 {
 	__u32 hash;

 	hash  = load_word(skb, off);
 	hash ^= load_word(skb, off + sizeof(hash));
 	hash ^= load_word(skb, off + sizeof(hash) * 2);
 	hash ^= load_word(skb, off + sizeof(hash) * 3);

 	return hash;
 }

 /* Parse the IPv6 header located at nh_off.
  *
  * Stores the next-header value in *ip_proto and the XOR-folded source
  * and destination addresses in flow->src / flow->dst.
  *
  * Returns the offset right behind the fixed-size IPv6 header; extension
  * headers are not walked.
  */
 static inline int flow_parse_ipv6(struct __sk_buff *skb, int nh_off,
 				  __u8 *ip_proto, struct flow_keys *flow)
 {
 	const int saddr_off = nh_off + offsetof(struct ipv6hdr, saddr);
 	const int daddr_off = nh_off + offsetof(struct ipv6hdr, daddr);

 	*ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr));

 	flow->src = flow_addr_hash_ipv6(skb, saddr_off);
 	flow->dst = flow_addr_hash_ipv6(skb, daddr_off);

 	return nh_off + sizeof(struct ipv6hdr);
 }

 /* Dissect the headers of skb and fill in *flow.
  *
  * Starting right behind the Ethernet header, this skips at most one
  * 802.1AD and one 802.1Q tag, parses the IPv4/IPv6 header and then looks
  * one level into GRE, IPIP and IPv6-in-IP encapsulation. On success,
  * flow->ports holds the 32-bit word loaded at the resulting offset,
  * flow->th_off that offset and flow->ip_proto the last parsed IP protocol.
  *
  * Returns false when the (possibly encapsulated) network protocol is
  * neither IPv4 nor IPv6, true otherwise.
  *
  * Byte order: load_half() hands back host byte order (gre_proto below is
  * compared against plain ETH_P_* values), whereas skb->protocol and the
  * GRE_* flag masks from linux/if_tunnel.h are big endian. Loaded values
  * that are compared against those are therefore converted with htons()
  * first. The load result is kept in a local because htons() may be a
  * macro that evaluates its argument more than once.
  */
 static inline bool flow_dissector(struct __sk_buff *skb,
 				  struct flow_keys *flow)
 {
 	int nh_off = BPF_LL_OFF + ETH_HLEN;
 	__be16 proto = skb->protocol;
 	__u8 ip_proto;

 	/* TODO: check for skb->vlan_tci, skb->vlan_proto first */
 	if (proto == htons(ETH_P_8021AD)) {
 		__u16 encap = load_half(skb, nh_off +
 					offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));

 		proto = htons(encap);
 		nh_off += sizeof(struct vlan_hdr);
 	}
 	if (proto == htons(ETH_P_8021Q)) {
 		__u16 encap = load_half(skb, nh_off +
 					offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));

 		proto = htons(encap);
 		nh_off += sizeof(struct vlan_hdr);
 	}

 	if (likely(proto == htons(ETH_P_IP)))
 		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
 	else if (proto == htons(ETH_P_IPV6))
 		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
 	else
 		return false;

 	switch (ip_proto) {
 	case IPPROTO_GRE: {
 		struct gre_hdr {
 			__be16 flags;
 			__be16 proto;
 		};

 		__u16 raw_flags = load_half(skb, nh_off +
 					    offsetof(struct gre_hdr, flags));
 		__u16 gre_proto = load_half(skb, nh_off +
 					    offsetof(struct gre_hdr, proto));
 		/* Same byte order as the GRE_* masks tested below. */
 		__be16 gre_flags = htons(raw_flags);

 		/* Only plain version 0 GRE without routing is dissected. */
 		if (gre_flags & (GRE_VERSION | GRE_ROUTING))
 			break;

 		/* Base header plus one word per optional field present. */
 		nh_off += 4;
 		if (gre_flags & GRE_CSUM)
 			nh_off += 4;
 		if (gre_flags & GRE_KEY)
 			nh_off += 4;
 		if (gre_flags & GRE_SEQ)
 			nh_off += 4;

 		if (gre_proto == ETH_P_8021Q) {
 			gre_proto = load_half(skb, nh_off +
 					      offsetof(struct vlan_hdr,
 						       h_vlan_encapsulated_proto));
 			nh_off += sizeof(struct vlan_hdr);
 		}
 		if (gre_proto == ETH_P_IP)
 			nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
 		else if (gre_proto == ETH_P_IPV6)
 			nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
 		else
 			return false;
 		break;
 	}
 	case IPPROTO_IPIP:
 		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
 		break;
 	case IPPROTO_IPV6:
 		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
 		break;
 	default:
 		break;
 	}

 	nh_off += flow_ports_offset(ip_proto);

 	flow->ports = load_word(skb, nh_off);
 	flow->th_off = nh_off;
 	flow->ip_proto = ip_proto;

 	return true;
 }

 static inline void cls_update_proto_map(const struct __sk_buff *skb,
 					const struct flow_keys *flow)
 {
 	uint8_t proto = flow->ip_proto;
 	struct count_tuple *ct, _ct;

 	ct = map_lookup_elem(&map_proto, &proto);
 	if (likely(ct)) {
 		lock_xadd(&ct->packets, 1);
 		lock_xadd(&ct->bytes, skb->len);
 		return;
 	}

 	/* No hit yet, we need to create a new entry. */
 	_ct.packets = 1;
 	_ct.bytes = skb->len;

 	map_update_elem(&map_proto, &proto, &_ct, BPF_ANY);
 }

 static inline void cls_update_queue_map(const struct __sk_buff *skb)
 {
 	uint32_t queue = skb->queue_mapping;
 	struct count_queue *cq, _cq;
 	bool mismatch;

 	mismatch = skb->queue_mapping != get_smp_processor_id();

 	cq = map_lookup_elem(&map_queue, &queue);
 	if (likely(cq)) {
 		lock_xadd(&cq->total, 1);
 		if (mismatch)
 			lock_xadd(&cq->mismatch, 1);
 		return;
 	}

 	/* No hit yet, we need to create a new entry. */
 	_cq.total = 1;
 	_cq.mismatch = mismatch ? 1 : 0;

 	map_update_elem(&map_queue, &queue, &_cq, BPF_ANY);
 }

 /* eBPF program definitions, placed in various sections, which can
  * have custom section names. If custom names are in use, it's
  * required to point tc to the correct section, e.g.
  *
  *     tc filter add [...] bpf obj cls.o sec cls-tos [...]
  *
  * in case the program resides in __section("cls-tos").
  *
  * Default section for cls_bpf is: "classifier", for act_bpf is:
  * "action". Naturally, if for example multiple actions are present
  * in the same file, they need to have distinct section names.
  *
  * It is however not required to have multiple programs sharing
  * a file.
  */
 /* Classifier entry point: dissect the packet, update the protocol and
  * queue statistics and return the IP protocol as classification result.
  * Packets the dissector cannot handle yield 0.
  */
 __section("classifier")
 int cls_main(struct __sk_buff *skb)
 {
 	struct flow_keys keys;

 	if (flow_dissector(skb, &keys)) {
 		cls_update_proto_map(skb, &keys);
 		cls_update_queue_map(skb);

 		return keys.ip_proto;
 	}

 	/* No match in cls_bpf. */
 	return 0;
 }

 /* Bump the drop counter of the CPU this program currently runs on.
  *
  * The values of map_drops are declared with sizeof(long), so the counter
  * has to be accessed as a long as well: a 32-bit access would only touch
  * half of a 64-bit slot and, on big endian machines, the upper half.
  *
  * If the lookup returns no element for this CPU id (presumably for ids
  * outside the array's max_elem), the drop is silently not counted.
  */
 static inline void act_update_drop_map(void)
 {
 	uint32_t cpu = get_smp_processor_id();
 	long *count;

 	count = map_lookup_elem(&map_drops, &cpu);
 	if (count)
 		/* Only this cpu is accessing this element. */
 		(*count)++;
 }

 /* Action: packets carrying the mark 0xcafe are counted in the drop map
  * and answered with TC_ACT_SHOT; everything else gets TC_ACT_UNSPEC.
  */
 __section("action-mark")
 int act_mark_main(struct __sk_buff *skb)
 {
 	/* You could also mangle skb data here with the helper function
 	 * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could
 	 * do that already in the classifier itself as a merged combination
 	 * of classifier'n'action model.
 	 */

 	/* Default configured tc opcode. */
 	if (skb->mark != 0xcafe)
 		return TC_ACT_UNSPEC;

 	act_update_drop_map();
 	return TC_ACT_SHOT;
 }

 /* Action: whenever the two low bits of a pseudo-random number are both
  * zero, the packet is counted in the drop map and answered with
  * TC_ACT_SHOT; otherwise TC_ACT_UNSPEC is returned.
  */
 __section("action-rand")
 int act_rand_main(struct __sk_buff *skb)
 {
 	if (get_prandom_u32() & 3)
 		return TC_ACT_UNSPEC;

 	/* Sorry, we're near event horizon ... */
 	act_update_drop_map();
 	return TC_ACT_SHOT;
 }

 /* Last but not least, the file contains a license. Some future helper
  * functions may only be available with a GPL license.
  */
 BPF_LICENSE("GPL");
	/*
	* eBPF kernel space program part
	*
	* Toy eBPF program for demonstration purposes, some parts derived from
	* kernel tree's samples/bpf/sockex2_kern.c example.
	*
	* More background on eBPF, kernel tree: Documentation/networking/filter.txt
	*
	* Note, this file is rather large, and most classifier and actions are
	* likely smaller to accomplish one specific use-case and are tailored
	* for high performance. For performance reasons, you might also have the
	* classifier and action already merged inside the classifier.
	*
	* In order to show various features it serves as a bigger programming
	* example, which you should feel free to rip apart and experiment with.
	*
	* Compilation, configuration example:
	*
	* Note: as long as the BPF backend in LLVM is still experimental,
	* you need to build LLVM with --enable-experimental-targets=BPF
	* Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y,
	* and you have libelf.h and gelf.h headers and can link tc against -lelf.
	*
	* In case you need to sync kernel headers, go to your kernel source tree:
	* # make headers_install INSTALL_HDR_PATH=/usr/
	*
	* $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH
	* $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o
	* $ objdump -h bpf.o
	* [...]
	* 3 classifier 000007f8 0000000000000000 0000000000000000 00000040 2**3
	* CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
	* 4 action-mark 00000088 0000000000000000 0000000000000000 00000838 2**3
	* CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
	* 5 action-rand 00000098 0000000000000000 0000000000000000 000008c0 2**3
	* CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
	* 6 maps 00000030 0000000000000000 0000000000000000 00000958 2**2
	* CONTENTS, ALLOC, LOAD, DATA
	* 7 license 00000004 0000000000000000 0000000000000000 00000988 2**0
	* CONTENTS, ALLOC, LOAD, DATA
	* [...]
	* # echo 1 > /proc/sys/net/core/bpf_jit_enable
	* $ gcc bpf_agent.c -o bpf_agent -Wall -O2
	* # ./bpf_agent /tmp/bpf-uds (e.g. on a different terminal)
	* # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
	* action bpf obj bpf.o sec action-mark \
	* action bpf obj bpf.o sec action-rand ok
	* # tc filter show dev em1
	* filter parent 1: protocol all pref 49152 bpf
	* filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier]
	* action order 1: bpf bpf.o:[action-mark] default-action pipe
	* index 52 ref 1 bind 1
	*
	* action order 2: bpf bpf.o:[action-rand] default-action pipe
	* index 53 ref 1 bind 1
	*
	* action order 3: gact action pass
	* random type none pass val 0
	* index 38 ref 1 bind 1
	*
	* The same program can also be installed on ingress side (as opposed to above
	* egress configuration), e.g.:
	*
	* # tc qdisc add dev em1 handle ffff: ingress
	* # tc filter add dev em1 parent ffff: bpf obj ...
	*
	* Notes on BPF agent:
	*
	* In the above example, the bpf_agent creates the unix domain socket
	* natively. "tc exec" can also spawn a shell and hold the sockets there:
	*
	* # tc exec bpf imp /tmp/bpf-uds
	* # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
	* action bpf obj bpf.o sec action-mark \
	* action bpf obj bpf.o sec action-rand ok
	* sh-4.2# (shell spawned from tc exec)
	* sh-4.2# bpf_agent
	* [...]
	*
	* This will read out fds over environment and produce the same data dump
	* as below. This has the advantage that the spawned shell owns the fds
	* and thus if the agent is restarted, it can reattach to the same fds, also
	* various programs can easily read/modify the data simultaneously from user
	* space side.
	*
	* If the shell is unnecessary, the agent can also just be spawned directly
	* via tc exec:
	*
	* # tc exec bpf imp /tmp/bpf-uds run bpf_agent
	* # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
	* action bpf obj bpf.o sec action-mark \
	* action bpf obj bpf.o sec action-rand ok
	*
	* BPF agent example output:
	*
	* ver: 1
	* obj: bpf.o
	* dev: 64770
	* ino: 6045133
	* maps: 3
	* map0:
	* `- fd: 4
	* | serial: 1
	* | type: 1
	* | max elem: 256
	* | size key: 1
	* ` size val: 16
	* map1:
	* `- fd: 5
	* | serial: 2
	* | type: 1
	* | max elem: 1024
	* | size key: 4
	* ` size val: 16
	* map2:
	* `- fd: 6
	* | serial: 3
	* | type: 2
	* | max elem: 64
	* | size key: 4
	* ` size val: 8
	* data, period: 5sec
	* `- number of drops: cpu0: 0 cpu1: 0 cpu2: 0 cpu3: 0
	* | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 0, mis: 0] q3:[pkts: 0, mis: 0]
	* ` protos: tcp:[pkts: 0, bytes: 0] udp:[pkts: 0, bytes: 0] icmp:[pkts: 0, bytes: 0]
	* data, period: 5sec
	* `- number of drops: cpu0: 5 cpu1: 0 cpu2: 0 cpu3: 1
	* | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 24, mis: 14] q3:[pkts: 0, mis: 0]
	* ` protos: tcp:[pkts: 13, bytes: 1989] udp:[pkts: 10, bytes: 710] icmp:[pkts: 0, bytes: 0]
	* data, period: 5sec
	* `- number of drops: cpu0: 5 cpu1: 0 cpu2: 3 cpu3: 3
	* | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 39, mis: 21] q3:[pkts: 0, mis: 0]
	* ` protos: tcp:[pkts: 20, bytes: 3549] udp:[pkts: 18, bytes: 1278] icmp:[pkts: 0, bytes: 0]
	* [...]
	*
	* This now means, the below classifier and action pipeline has been loaded
	* as eBPF bytecode into the kernel, the kernel has verified that the
	* execution of the bytecode is "safe", and it has JITed the programs
	* afterwards, so that upon invocation they're running on native speed. tc
	* has transferred all map file descriptors to the bpf_agent via IPC and
	* even after tc exits, the agent can read out or modify all map data.
	*
	* Note that the export to the uds is done only once in the classifier and
	* not in the action. It's enough to export the (here) shared descriptors
	* once.
	*
	* If you need to disassemble the generated JIT image (echo with 2), the
	* kernel tree has under tools/net/ a small helper, you can invoke e.g.
	* `bpf_jit_disasm -o`.
	*
	* Please find in the code below further comments.
	*
	* -- Happy eBPF hacking! ;)
	*/
	#include <stdint.h>
	#include <stdbool.h>
	#include <sys/types.h>
	#include <sys/socket.h>
	#include <asm/types.h>
	#include <linux/in.h>
	#include <linux/if.h>
	#include <linux/if_ether.h>
	#include <linux/ip.h>
	#include <linux/ipv6.h>
	#include <linux/if_tunnel.h>
	#include <linux/filter.h>
	#include <linux/bpf.h>

	/* Common, shared definitions with bpf_agent.c. */
	#include "bpf_shared.h"
	/* BPF helper functions for our example. */
	#include "../../include/bpf_api.h"

	/* tc action return codes. Could be defined here as well, or included
	* from the header. Only two of them are used in this file: the action
	* programs return TC_ACT_SHOT after accounting a drop, and TC_ACT_UNSPEC
	* to fall back to the default configured tc opcode.
	*/
	#define TC_ACT_UNSPEC (-1)
	#define TC_ACT_OK 0
	#define TC_ACT_RECLASSIFY 1
	#define TC_ACT_SHOT 2
	#define TC_ACT_PIPE 3
	#define TC_ACT_STOLEN 4
	#define TC_ACT_QUEUED 5
	#define TC_ACT_REPEAT 6

	/* Other, misc stuff: bit masks applied to the IPv4 frag_off field by
	* flow_is_frag(). IP_MF is the "more fragments" flag, IP_OFFSET the
	* fragment offset bits; a packet with any of them set counts as fragment.
	*/
	#define IP_MF 0x2000
	#define IP_OFFSET 0x1FFF

	/* eBPF map definitions, all placed in section "maps". */
	/* Hash map with per-protocol counters: the key is the one-byte IP
	 * protocol found by the flow dissector, the value a struct count_tuple
	 * (packets and bytes) maintained by cls_update_proto_map().
	 */
	struct bpf_elf_map __section("maps") map_proto = {
		.id		= BPF_MAP_ID_PROTO,
		.type		= BPF_MAP_TYPE_HASH,
		.max_elem	= 256,
		.size_key	= sizeof(uint8_t),
		.size_value	= sizeof(struct count_tuple),
	};

	/* Hash map with per-queue counters: the key is skb->queue_mapping, the
	 * value a struct count_queue (total and mismatch) maintained by
	 * cls_update_queue_map().
	 */
	struct bpf_elf_map __section("maps") map_queue = {
		.id		= BPF_MAP_ID_QUEUE,
		.type		= BPF_MAP_TYPE_HASH,
		.max_elem	= 1024,
		.size_key	= sizeof(uint32_t),
		.size_value	= sizeof(struct count_queue),
	};

	/* Array map with drop counters: indexed by the id of the CPU that ran
	 * the dropping action, one long-sized slot per index, updated by
	 * act_update_drop_map().
	 */
	struct bpf_elf_map __section("maps") map_drops = {
		.id		= BPF_MAP_ID_DROPS,
		.type		= BPF_MAP_TYPE_ARRAY,
		.max_elem	= 64,
		.size_key	= sizeof(uint32_t),
		.size_value	= sizeof(long),
	};

	/* Helper functions and definitions for the flow dissector used by the
	* example classifier. This resembles the kernel's flow dissector to
	* some extent and is just used as an example to show what's possible
	* with eBPF.
	*/
	struct sockaddr;

	/* Layout of a VLAN tag as skipped by the flow dissector, which reads
	* h_vlan_encapsulated_proto and then advances by sizeof(struct vlan_hdr).
	*/
	struct vlan_hdr {
	/* Tag control information; not read anywhere in this file. */
	__be16 h_vlan_TCI;
	/* Protocol of whatever follows the tag. */
	__be16 h_vlan_encapsulated_proto;
	};

	/* Output of flow_dissector(): what was learned about a packet's flow. */
	struct flow_keys {
	/* IPv4 source address, or the XOR of the four IPv6 address words. */
	__u32 src;
	/* Destination counterpart of src. */
	__u32 dst;
	union {
	/* 32-bit word loaded at th_off. */
	__u32 ports;
	/* The same bytes viewed as two 16-bit halves. */
	__u16 port16[2];
	};
	/* Packet offset at which ports was loaded. */
	__s32 th_off;
	/* IP protocol of the parsed header; 0 for IPv4 fragments. */
	__u8 ip_proto;
	};

	/* Return the offset, relative to the end of the network header, from
	 * which flow_dissector() loads flow->ports for the given protocol.
	 *
	 * Only AH needs an adjustment (4 bytes). TCP, UDP, DCCP, ESP, SCTP,
	 * UDP-Lite and every other protocol number yield 0.
	 */
	static inline int flow_ports_offset(__u8 ip_proto)
	{
		if (ip_proto == IPPROTO_AH)
			return 4;

		return 0;
	}

	/* Tell whether the IPv4 header at nh_off belongs to a fragment, i.e.
	 * whether any of the IP_MF / IP_OFFSET bits is set in its frag_off field.
	 *
	 * The mask is built with a plain bitwise OR; the escaped "\|" form is
	 * not valid C.
	 */
	static inline bool flow_is_frag(struct __sk_buff *skb, int nh_off)
	{
		return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) &
			  (IP_MF | IP_OFFSET));
	}

	static inline int flow_parse_ipv4(struct __sk_buff *skb, int nh_off,
	__u8 ip_proto, struct flow_keys flow)
	{
	__u8 ip_ver_len;

	if (unlikely(flow_is_frag(skb, nh_off)))
	*ip_proto = 0;
	else
	*ip_proto = load_byte(skb, nh_off + offsetof(struct iphdr,
	protocol));
	if (*ip_proto != IPPROTO_GRE) {
	flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr));
	flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr));
	}

	ip_ver_len = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */);
	if (likely(ip_ver_len == 0x45))
	nh_off += 20;
	else
	nh_off += (ip_ver_len & 0xF) << 2;

	return nh_off;
	}

	/* Fold the 16-byte IPv6 address at packet offset off into 32 bits by
	 * XOR-ing its four consecutive 32-bit words.
	 */
	static inline __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off)
	{
		__u32 hash;

		hash  = load_word(skb, off);
		hash ^= load_word(skb, off + sizeof(hash));
		hash ^= load_word(skb, off + sizeof(hash) * 2);
		hash ^= load_word(skb, off + sizeof(hash) * 3);

		return hash;
	}

	static inline int flow_parse_ipv6(struct __sk_buff *skb, int nh_off,
	__u8 ip_proto, struct flow_keys flow)
	{
	*ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr));

	flow->src = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, saddr));
	flow->dst = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, daddr));

	return nh_off + sizeof(struct ipv6hdr);
	}

	/* Dissect the headers of skb and fill in *flow.
	 *
	 * Starting right behind the Ethernet header, this skips at most one
	 * 802.1AD and one 802.1Q tag, parses the IPv4/IPv6 header and then looks
	 * one level into GRE, IPIP and IPv6-in-IP encapsulation. On success,
	 * flow->ports holds the 32-bit word loaded at the resulting offset,
	 * flow->th_off that offset and flow->ip_proto the last parsed IP protocol.
	 *
	 * Returns false when the (possibly encapsulated) network protocol is
	 * neither IPv4 nor IPv6, true otherwise.
	 *
	 * Byte order: load_half() hands back host byte order (gre_proto below is
	 * compared against plain ETH_P_* values), whereas skb->protocol and the
	 * GRE_* flag masks from linux/if_tunnel.h are big endian. Loaded values
	 * that are compared against those are therefore converted with htons()
	 * first. The load result is kept in a local because htons() may be a
	 * macro that evaluates its argument more than once.
	 */
	static inline bool flow_dissector(struct __sk_buff *skb,
					  struct flow_keys *flow)
	{
		int nh_off = BPF_LL_OFF + ETH_HLEN;
		__be16 proto = skb->protocol;
		__u8 ip_proto;

		/* TODO: check for skb->vlan_tci, skb->vlan_proto first */
		if (proto == htons(ETH_P_8021AD)) {
			__u16 encap = load_half(skb, nh_off +
						offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));

			proto = htons(encap);
			nh_off += sizeof(struct vlan_hdr);
		}
		if (proto == htons(ETH_P_8021Q)) {
			__u16 encap = load_half(skb, nh_off +
						offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));

			proto = htons(encap);
			nh_off += sizeof(struct vlan_hdr);
		}

		if (likely(proto == htons(ETH_P_IP)))
			nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
		else if (proto == htons(ETH_P_IPV6))
			nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
		else
			return false;

		switch (ip_proto) {
		case IPPROTO_GRE: {
			struct gre_hdr {
				__be16 flags;
				__be16 proto;
			};

			__u16 raw_flags = load_half(skb, nh_off +
						    offsetof(struct gre_hdr, flags));
			__u16 gre_proto = load_half(skb, nh_off +
						    offsetof(struct gre_hdr, proto));
			/* Same byte order as the GRE_* masks tested below. */
			__be16 gre_flags = htons(raw_flags);

			/* Only plain version 0 GRE without routing is dissected. */
			if (gre_flags & (GRE_VERSION | GRE_ROUTING))
				break;

			/* Base header plus one word per optional field present. */
			nh_off += 4;
			if (gre_flags & GRE_CSUM)
				nh_off += 4;
			if (gre_flags & GRE_KEY)
				nh_off += 4;
			if (gre_flags & GRE_SEQ)
				nh_off += 4;

			if (gre_proto == ETH_P_8021Q) {
				gre_proto = load_half(skb, nh_off +
						      offsetof(struct vlan_hdr,
							       h_vlan_encapsulated_proto));
				nh_off += sizeof(struct vlan_hdr);
			}
			if (gre_proto == ETH_P_IP)
				nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
			else if (gre_proto == ETH_P_IPV6)
				nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
			else
				return false;
			break;
		}
		case IPPROTO_IPIP:
			nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
			break;
		case IPPROTO_IPV6:
			nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
			break;
		default:
			break;
		}

		nh_off += flow_ports_offset(ip_proto);

		flow->ports = load_word(skb, nh_off);
		flow->th_off = nh_off;
		flow->ip_proto = ip_proto;

		return true;
	}

	static inline void cls_update_proto_map(const struct __sk_buff *skb,
	const struct flow_keys *flow)
	{
	uint8_t proto = flow->ip_proto;
	struct count_tuple *ct, _ct;

	ct = map_lookup_elem(&map_proto, &proto);
	if (likely(ct)) {
	lock_xadd(&ct->packets, 1);
	lock_xadd(&ct->bytes, skb->len);
	return;
	}

	/* No hit yet, we need to create a new entry. */
	_ct.packets = 1;
	_ct.bytes = skb->len;

	map_update_elem(&map_proto, &proto, &_ct, BPF_ANY);
	}

	static inline void cls_update_queue_map(const struct __sk_buff *skb)
	{
	uint32_t queue = skb->queue_mapping;
	struct count_queue *cq, _cq;
	bool mismatch;

	mismatch = skb->queue_mapping != get_smp_processor_id();

	cq = map_lookup_elem(&map_queue, &queue);
	if (likely(cq)) {
	lock_xadd(&cq->total, 1);
	if (mismatch)
	lock_xadd(&cq->mismatch, 1);
	return;
	}

	/* No hit yet, we need to create a new entry. */
	_cq.total = 1;
	_cq.mismatch = mismatch ? 1 : 0;

	map_update_elem(&map_queue, &queue, &_cq, BPF_ANY);
	}

	/* eBPF program definitions, placed in various sections, which can
	* have custom section names. If custom names are in use, it's
	* required to point tc to the correct section, e.g.
	*
	* tc filter add [...] bpf obj cls.o sec cls-tos [...]
	*
	* in case the program resides in __section("cls-tos").
	*
	* Default section for cls_bpf is: "classifier", for act_bpf is:
	* "action". Naturally, if for example multiple actions are present
	* in the same file, they need to have distinct section names.
	*
	* It is however not required to have multiple programs sharing
	* a file.
	*/
	/* Classifier entry point: dissect the packet, update the protocol and
	 * queue statistics and return the IP protocol as classification result.
	 * Packets the dissector cannot handle yield 0.
	 */
	__section("classifier")
	int cls_main(struct __sk_buff *skb)
	{
		struct flow_keys keys;

		if (flow_dissector(skb, &keys)) {
			cls_update_proto_map(skb, &keys);
			cls_update_queue_map(skb);

			return keys.ip_proto;
		}

		/* No match in cls_bpf. */
		return 0;
	}

	/* Bump the drop counter of the CPU this program currently runs on.
	 *
	 * The values of map_drops are declared with sizeof(long), so the counter
	 * has to be accessed as a long as well: a 32-bit access would only touch
	 * half of a 64-bit slot and, on big endian machines, the upper half.
	 *
	 * If the lookup returns no element for this CPU id (presumably for ids
	 * outside the array's max_elem), the drop is silently not counted.
	 */
	static inline void act_update_drop_map(void)
	{
		uint32_t cpu = get_smp_processor_id();
		long *count;

		count = map_lookup_elem(&map_drops, &cpu);
		if (count)
			/* Only this cpu is accessing this element. */
			(*count)++;
	}

	/* Action: packets carrying the mark 0xcafe are counted in the drop map
	 * and answered with TC_ACT_SHOT; everything else gets TC_ACT_UNSPEC.
	 */
	__section("action-mark")
	int act_mark_main(struct __sk_buff *skb)
	{
		/* You could also mangle skb data here with the helper function
		 * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could
		 * do that already in the classifier itself as a merged combination
		 * of classifier'n'action model.
		 */

		/* Default configured tc opcode. */
		if (skb->mark != 0xcafe)
			return TC_ACT_UNSPEC;

		act_update_drop_map();
		return TC_ACT_SHOT;
	}

	/* Action: whenever the two low bits of a pseudo-random number are both
	 * zero, the packet is counted in the drop map and answered with
	 * TC_ACT_SHOT; otherwise TC_ACT_UNSPEC is returned.
	 */
	__section("action-rand")
	int act_rand_main(struct __sk_buff *skb)
	{
		if (get_prandom_u32() & 3)
			return TC_ACT_UNSPEC;

		/* Sorry, we're near event horizon ... */
		act_update_drop_map();
		return TC_ACT_SHOT;
	}

	/* Last but not least, the file contains a license. Some future helper
	* functions may only be available with a GPL license.
	*/
	BPF_LICENSE("GPL");