Maps all internal BPF instructions into x86_64 instructions.
This patch replaces the original BPF x64 JIT with an internal BPF x64 JIT.
The sysctl net.core.bpf_jit_enable is reused as the on/off switch.
Performance:
1. The old BPF JIT and the internal BPF JIT generate equivalent x86_64 code.
No performance difference is observed for filters that were JIT-able before.
Example assembler code for the BPF filter "tcpdump port 22":
original BPF -> old JIT: original BPF -> internal BPF -> new JIT:
0: push %rbp 0: push %rbp
1: mov %rsp,%rbp 1: mov %rsp,%rbp
4: sub $0x60,%rsp 4: sub $0x228,%rsp
8: mov %rbx,-0x8(%rbp) b: mov %rbx,-0x228(%rbp) // prologue
12: mov %r13,-0x220(%rbp)
19: mov %r14,-0x218(%rbp)
20: mov %r15,-0x210(%rbp)
27: xor %eax,%eax // clear A
c: xor %ebx,%ebx 29: xor %r13,%r13 // clear X
e: mov 0x68(%rdi),%r9d 2c: mov 0x68(%rdi),%r9d
12: sub 0x6c(%rdi),%r9d 30: sub 0x6c(%rdi),%r9d
16: mov 0xd8(%rdi),%r8 34: mov 0xd8(%rdi),%r10
3b: mov %rdi,%rbx
1d: mov $0xc,%esi 3e: mov $0xc,%esi
22: callq 0xffffffffe1021e15 43: callq 0xffffffffe102bd75
27: cmp $0x86dd,%eax 48: cmp $0x86dd,%rax
2c: jne 0x0000000000000069 4f: jne 0x000000000000009a
2e: mov $0x14,%esi 51: mov $0x14,%esi
33: callq 0xffffffffe1021e31 56: callq 0xffffffffe102bd91
38: cmp $0x84,%eax 5b: cmp $0x84,%rax
3d: je 0x0000000000000049 62: je 0x0000000000000074
3f: cmp $0x6,%eax 64: cmp $0x6,%rax
42: je 0x0000000000000049 68: je 0x0000000000000074
44: cmp $0x11,%eax 6a: cmp $0x11,%rax
47: jne 0x00000000000000c6 6e: jne 0x0000000000000117
49: mov $0x36,%esi 74: mov $0x36,%esi
4e: callq 0xffffffffe1021e15 79: callq 0xffffffffe102bd75
53: cmp $0x16,%eax 7e: cmp $0x16,%rax
56: je 0x00000000000000bf 82: je 0x0000000000000110
58: mov $0x38,%esi 88: mov $0x38,%esi
5d: callq 0xffffffffe1021e15 8d: callq 0xffffffffe102bd75
62: cmp $0x16,%eax 92: cmp $0x16,%rax
65: je 0x00000000000000bf 96: je 0x0000000000000110
67: jmp 0x00000000000000c6 98: jmp 0x0000000000000117
69: cmp $0x800,%eax 9a: cmp $0x800,%rax
6e: jne 0x00000000000000c6 a1: jne 0x0000000000000117
70: mov $0x17,%esi a3: mov $0x17,%esi
75: callq 0xffffffffe1021e31 a8: callq 0xffffffffe102bd91
7a: cmp $0x84,%eax ad: cmp $0x84,%rax
7f: je 0x000000000000008b b4: je 0x00000000000000c2
81: cmp $0x6,%eax b6: cmp $0x6,%rax
84: je 0x000000000000008b ba: je 0x00000000000000c2
86: cmp $0x11,%eax bc: cmp $0x11,%rax
89: jne 0x00000000000000c6 c0: jne 0x0000000000000117
8b: mov $0x14,%esi c2: mov $0x14,%esi
90: callq 0xffffffffe1021e15 c7: callq 0xffffffffe102bd75
95: test $0x1fff,%ax cc: test $0x1fff,%rax
99: jne 0x00000000000000c6 d3: jne 0x0000000000000117
d5: mov %rax,%r14
9b: mov $0xe,%esi d8: mov $0xe,%esi
a0: callq 0xffffffffe1021e44 dd: callq 0xffffffffe102bd91 // MSH
e2: and $0xf,%eax
e5: shl $0x2,%eax
e8: mov %rax,%r13
eb: mov %r14,%rax
ee: mov %r13,%rsi
a5: lea 0xe(%rbx),%esi f1: add $0xe,%esi
a8: callq 0xffffffffe1021e0d f4: callq 0xffffffffe102bd6d
ad: cmp $0x16,%eax f9: cmp $0x16,%rax
b0: je 0x00000000000000bf fd: je 0x0000000000000110
ff: mov %r13,%rsi
b2: lea 0x10(%rbx),%esi 102: add $0x10,%esi
b5: callq 0xffffffffe1021e0d 105: callq 0xffffffffe102bd6d
ba: cmp $0x16,%eax 10a: cmp $0x16,%rax
bd: jne 0x00000000000000c6 10e: jne 0x0000000000000117
bf: mov $0xffff,%eax 110: mov $0xffff,%eax
c4: jmp 0x00000000000000c8 115: jmp 0x000000000000011c
c6: xor %eax,%eax 117: mov $0x0,%eax
c8: mov -0x8(%rbp),%rbx 11c: mov -0x228(%rbp),%rbx // epilogue
cc: leaveq 123: mov -0x220(%rbp),%r13
cd: retq 12a: mov -0x218(%rbp),%r14
131: mov -0x210(%rbp),%r15
138: leaveq
139: retq
On fully cached SKBs both JITed functions take 12 nsec to execute.
BPF interpreter executes the program in 30 nsec.
The difference in generated assembler is due to the following:
Old BPF implements the LDX_MSH instruction via the sk_load_byte_msh() helper
function inside bpf_jit.S.
The new JIT removes the helper and does it explicitly, so the ldx_msh cost
is the same for both JITs, but the generated code looks longer.
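For reference, the LDX_MSH semantics that both JITs implement is, as a C
sketch (with P[k] standing for the byte at packet offset k):

    X = (P[k] & 0xf) << 2;  /* IP header length in bytes */

which shows up in the new JIT output above as the 'and $0xf' plus
'shl $0x2' pair.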
The new JIT has 4 registers to save, so its prologue/epilogue are larger,
but the cost is within noise on x64.
The old JIT checks whether the first insn clears A and, if not, emits
'xor %eax,%eax'. The new JIT clears %rax unconditionally.
2. The old BPF JIT doesn't support the ANC_NLATTR, ANC_PAY_OFFSET and
ANC_RANDOM extensions. The new JIT supports all BPF extensions.
Performance of such filters improves 2-4 times depending on the filter.
The longer the filter, the higher the performance gain.
Synthetic benchmarks with many ancillary loads see a 20x speedup,
which seems to be the maximum gain from JIT.
Notes:
. net.core.bpf_jit_enable=2 + tools/net/bpf_jit_disasm is still functional
and can be used to see the generated assembler
. there are two jit_compile() functions, and the code flow for classic filters is:
sk_attach_filter() - load classic BPF
bpf_jit_compile() - try to JIT from classic BPF
sk_convert_filter() - convert classic to internal
bpf_int_jit_compile() - JIT from internal BPF
seccomp and tracing filters will just call bpf_int_jit_compile()
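In C-like pseudocode the classic path above is roughly (a sketch, not the
exact kernel code):

    fp->bpf_func = sk_run_filter;   /* interpreter by default */
    bpf_jit_compile(fp);            /* old JIT; sets bpf_func on success */
    if (!fp->jited) {
        sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
        bpf_int_jit_compile(fp);    /* new JIT from internal BPF */
    }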
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
include/linux/filter.h (resulting file):
/*
 * Linux Socket Filter Data Structures
 */

#ifndef __LINUX_FILTER_H__
#define __LINUX_FILTER_H__

#include <linux/atomic.h>
#include <linux/compat.h>
#include <linux/workqueue.h>
#include <uapi/linux/filter.h>

/* Internally used and optimized filter representation with extended
 * instruction set based on top of classic BPF.
 */

/* instruction classes */
#define BPF_ALU64 0x07 /* alu mode in double word width */

/* ld/ldx fields */
#define BPF_DW 0x18 /* double word */
#define BPF_XADD 0xc0 /* exclusive add */

/* alu/jmp fields */
#define BPF_MOV 0xb0 /* mov reg to reg */
#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */

/* change endianness of a register */
#define BPF_END 0xd0 /* flags for endianness conversion: */
#define BPF_TO_LE 0x00 /* convert to little-endian */
#define BPF_TO_BE 0x08 /* convert to big-endian */
#define BPF_FROM_LE BPF_TO_LE
#define BPF_FROM_BE BPF_TO_BE

#define BPF_JNE 0x50 /* jump != */
#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */
#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */
#define BPF_CALL 0x80 /* function call */
#define BPF_EXIT 0x90 /* function return */

/* Placeholder/dummy for 0 */
#define BPF_0 0

/* Register numbers */
enum {
	BPF_REG_0 = 0,
	BPF_REG_1,
	BPF_REG_2,
	BPF_REG_3,
	BPF_REG_4,
	BPF_REG_5,
	BPF_REG_6,
	BPF_REG_7,
	BPF_REG_8,
	BPF_REG_9,
	BPF_REG_10,
	__MAX_BPF_REG,
};

/* BPF has 10 general purpose 64-bit registers and stack frame. */
#define MAX_BPF_REG __MAX_BPF_REG

/* ArgX, context and stack frame pointer register positions. Note,
 * Arg1, Arg2, Arg3, etc are used as argument mappings of function
 * calls in BPF_CALL instruction.
 */
#define BPF_REG_ARG1 BPF_REG_1
#define BPF_REG_ARG2 BPF_REG_2
#define BPF_REG_ARG3 BPF_REG_3
#define BPF_REG_ARG4 BPF_REG_4
#define BPF_REG_ARG5 BPF_REG_5
#define BPF_REG_CTX BPF_REG_6
#define BPF_REG_FP BPF_REG_10

/* Additional register mappings for converted user programs. */
#define BPF_REG_A BPF_REG_0
#define BPF_REG_X BPF_REG_7
#define BPF_REG_TMP BPF_REG_8

/* BPF program can access up to 512 bytes of stack space. */
#define MAX_BPF_STACK 512
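/* Explanatory note (not in the original header): the 0x228-byte stack frame
 * allocated in the JIT prologue shown in the commit message corresponds to
 * these 512 bytes of BPF stack plus room for the callee-saved registers the
 * new JIT preserves.
 */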
/* bpf_add|sub|...: a += x, bpf_mov: a = x */
#define BPF_ALU64_REG(op, a, x) \
	((struct sock_filter_int) {BPF_ALU64|BPF_OP(op)|BPF_X, a, x, 0, 0})
#define BPF_ALU32_REG(op, a, x) \
	((struct sock_filter_int) {BPF_ALU|BPF_OP(op)|BPF_X, a, x, 0, 0})

/* bpf_add|sub|...: a += imm, bpf_mov: a = imm */
#define BPF_ALU64_IMM(op, a, imm) \
	((struct sock_filter_int) {BPF_ALU64|BPF_OP(op)|BPF_K, a, 0, 0, imm})
#define BPF_ALU32_IMM(op, a, imm) \
	((struct sock_filter_int) {BPF_ALU|BPF_OP(op)|BPF_K, a, 0, 0, imm})

/* R0 = *(uint *) (skb->data + off) */
#define BPF_LD_ABS(size, off) \
	((struct sock_filter_int) {BPF_LD|BPF_SIZE(size)|BPF_ABS, 0, 0, 0, off})

/* R0 = *(uint *) (skb->data + x + off) */
#define BPF_LD_IND(size, x, off) \
	((struct sock_filter_int) {BPF_LD|BPF_SIZE(size)|BPF_IND, 0, x, 0, off})

/* a = *(uint *) (x + off) */
#define BPF_LDX_MEM(sz, a, x, off) \
	((struct sock_filter_int) {BPF_LDX|BPF_SIZE(sz)|BPF_MEM, a, x, off, 0})

/* if (a 'op' x) goto pc+off */
#define BPF_JMP_REG(op, a, x, off) \
	((struct sock_filter_int) {BPF_JMP|BPF_OP(op)|BPF_X, a, x, off, 0})

/* if (a 'op' imm) goto pc+off */
#define BPF_JMP_IMM(op, a, imm, off) \
	((struct sock_filter_int) {BPF_JMP|BPF_OP(op)|BPF_K, a, 0, off, imm})

#define BPF_EXIT_INSN() \
	((struct sock_filter_int) {BPF_JMP|BPF_EXIT, 0, 0, 0, 0})

static inline int size_to_bpf(int size)
{
	switch (size) {
	case 1:
		return BPF_B;
	case 2:
		return BPF_H;
	case 4:
		return BPF_W;
	case 8:
		return BPF_DW;
	default:
		return -EINVAL;
	}
}

/* Macro to invoke filter function. */
#define SK_RUN_FILTER(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi)

struct sock_filter_int {
	__u8 code;    /* opcode */
	__u8 a_reg:4; /* dest register */
	__u8 x_reg:4; /* source register */
	__s16 off;    /* signed offset */
	__s32 imm;    /* signed immediate constant */
};
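/* Illustrative usage sketch (not part of the original header): a minimal
 * internal BPF program built with the helper macros above. It loads the
 * immediate 0xffff into R0 (the return-value register, aliased BPF_REG_A
 * for converted classic filters) and exits; SK_RUN_FILTER() then yields
 * that value.
 */
#if 0	/* example only */
static const struct sock_filter_int example_prog[] = {
	BPF_ALU64_IMM(BPF_MOV, BPF_REG_0, 0xffff),	/* R0 = 0xffff */
	BPF_EXIT_INSN(),				/* return R0 */
};
#endif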
#ifdef CONFIG_COMPAT
/* A struct sock_filter is architecture independent. */
struct compat_sock_fprog {
	u16 len;
	compat_uptr_t filter; /* struct sock_filter * */
};
#endif

struct sock_fprog_kern {
	u16 len;
	struct sock_filter *filter;
};

struct sk_buff;
struct sock;
struct seccomp_data;

struct sk_filter {
	atomic_t refcnt;
	u32 jited:1, /* Is our filter JIT'ed? */
	    len:31;  /* Number of filter blocks */
	struct sock_fprog_kern *orig_prog; /* Original BPF program */
	struct rcu_head rcu;
	unsigned int (*bpf_func)(const struct sk_buff *skb,
				 const struct sock_filter_int *filter);
	union {
		struct sock_filter insns[0];
		struct sock_filter_int insnsi[0];
		struct work_struct work;
	};
};

static inline unsigned int sk_filter_size(unsigned int proglen)
{
	return max(sizeof(struct sk_filter),
		   offsetof(struct sk_filter, insns[proglen]));
}
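/* Explanatory note (not in the original header): the max() in
 * sk_filter_size() matters for very short programs, where
 * offsetof(struct sk_filter, insns[proglen]) can be smaller than
 * sizeof(struct sk_filter); the trailing union must still be large enough
 * to hold the work_struct used when the filter is freed.
 */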
#define sk_filter_proglen(fprog) \
	(fprog->len * sizeof(fprog->filter[0]))

int sk_filter(struct sock *sk, struct sk_buff *skb);

u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx,
			      const struct sock_filter_int *insni);
u32 sk_run_filter_int_skb(const struct sk_buff *ctx,
			  const struct sock_filter_int *insni);

int sk_convert_filter(struct sock_filter *prog, int len,
		      struct sock_filter_int *new_prog, int *new_len);
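/* Usage sketch (not part of the original header, under the assumption that
 * passing new_prog == NULL asks sk_convert_filter() to only compute the
 * required length, as the converter's first pass does):
 */
#if 0	/* example only */
	int new_len, err;

	err = sk_convert_filter(old_prog, old_len, NULL, &new_len);
	if (!err)	/* second pass emits into the allocated insnsi[] */
		err = sk_convert_filter(old_prog, old_len, fp->insnsi, &new_len);
#endif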
int sk_unattached_filter_create(struct sk_filter **pfp,
				struct sock_fprog *fprog);
void sk_unattached_filter_destroy(struct sk_filter *fp);

int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_detach_filter(struct sock *sk);

int sk_chk_filter(struct sock_filter *filter, unsigned int flen);
int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
		  unsigned int len);
void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to);

void sk_filter_charge(struct sock *sk, struct sk_filter *fp);
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);

u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
void bpf_int_jit_compile(struct sk_filter *fp);

#ifdef CONFIG_BPF_JIT
#include <stdarg.h>
#include <linux/linkage.h>
#include <linux/printk.h>

void bpf_jit_compile(struct sk_filter *fp);
void bpf_jit_free(struct sk_filter *fp);

static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
				u32 pass, void *image)
{
	pr_err("flen=%u proglen=%u pass=%u image=%pK\n",
	       flen, proglen, pass, image);
	if (image)
		print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
			       16, 1, image, proglen, false);
}
#else
#include <linux/slab.h>
static inline void bpf_jit_compile(struct sk_filter *fp)
{
}
static inline void bpf_jit_free(struct sk_filter *fp)
{
	kfree(fp);
}
#endif

static inline int bpf_tell_extensions(void)
{
	return SKF_AD_MAX;
}

enum {
	BPF_S_RET_K = 1,
	BPF_S_RET_A,
	BPF_S_ALU_ADD_K,
	BPF_S_ALU_ADD_X,
	BPF_S_ALU_SUB_K,
	BPF_S_ALU_SUB_X,
	BPF_S_ALU_MUL_K,
	BPF_S_ALU_MUL_X,
	BPF_S_ALU_DIV_X,
	BPF_S_ALU_MOD_K,
	BPF_S_ALU_MOD_X,
	BPF_S_ALU_AND_K,
	BPF_S_ALU_AND_X,
	BPF_S_ALU_OR_K,
	BPF_S_ALU_OR_X,
	BPF_S_ALU_XOR_K,
	BPF_S_ALU_XOR_X,
	BPF_S_ALU_LSH_K,
	BPF_S_ALU_LSH_X,
	BPF_S_ALU_RSH_K,
	BPF_S_ALU_RSH_X,
	BPF_S_ALU_NEG,
	BPF_S_LD_W_ABS,
	BPF_S_LD_H_ABS,
	BPF_S_LD_B_ABS,
	BPF_S_LD_W_LEN,
	BPF_S_LD_W_IND,
	BPF_S_LD_H_IND,
	BPF_S_LD_B_IND,
	BPF_S_LD_IMM,
	BPF_S_LDX_W_LEN,
	BPF_S_LDX_B_MSH,
	BPF_S_LDX_IMM,
	BPF_S_MISC_TAX,
	BPF_S_MISC_TXA,
	BPF_S_ALU_DIV_K,
	BPF_S_LD_MEM,
	BPF_S_LDX_MEM,
	BPF_S_ST,
	BPF_S_STX,
	BPF_S_JMP_JA,
	BPF_S_JMP_JEQ_K,
	BPF_S_JMP_JEQ_X,
	BPF_S_JMP_JGE_K,
	BPF_S_JMP_JGE_X,
	BPF_S_JMP_JGT_K,
	BPF_S_JMP_JGT_X,
	BPF_S_JMP_JSET_K,
	BPF_S_JMP_JSET_X,
	/* Ancillary data */
	BPF_S_ANC_PROTOCOL,
	BPF_S_ANC_PKTTYPE,
	BPF_S_ANC_IFINDEX,
	BPF_S_ANC_NLATTR,
	BPF_S_ANC_NLATTR_NEST,
	BPF_S_ANC_MARK,
	BPF_S_ANC_QUEUE,
	BPF_S_ANC_HATYPE,
	BPF_S_ANC_RXHASH,
	BPF_S_ANC_CPU,
	BPF_S_ANC_ALU_XOR_X,
	BPF_S_ANC_VLAN_TAG,
	BPF_S_ANC_VLAN_TAG_PRESENT,
	BPF_S_ANC_PAY_OFFSET,
	BPF_S_ANC_RANDOM,
};

#endif /* __LINUX_FILTER_H__ */