twx-linux/include/linux/filter.h
Alexei Starovoitov 622582786c net: filter: x86: internal BPF JIT
Maps all internal BPF instructions into x86_64 instructions.
This patch replaces original BPF x64 JIT with internal BPF x64 JIT.
sysctl net.core.bpf_jit_enable is reused as on/off switch.

Performance:

1. old BPF JIT and internal BPF JIT generate equivalent x86_64 code.
  No performance difference is observed for filters that were JIT-able before

Example assembler code for BPF filter "tcpdump port 22"

original BPF -> old JIT:            original BPF -> internal BPF -> new JIT:
   0:   push   %rbp                      0:     push   %rbp
   1:   mov    %rsp,%rbp                 1:     mov    %rsp,%rbp
   4:   sub    $0x60,%rsp                4:     sub    $0x228,%rsp
   8:   mov    %rbx,-0x8(%rbp)           b:     mov    %rbx,-0x228(%rbp) // prologue
                                        12:     mov    %r13,-0x220(%rbp)
                                        19:     mov    %r14,-0x218(%rbp)
                                        20:     mov    %r15,-0x210(%rbp)
                                        27:     xor    %eax,%eax         // clear A
   c:   xor    %ebx,%ebx                29:     xor    %r13,%r13         // clear X
   e:   mov    0x68(%rdi),%r9d          2c:     mov    0x68(%rdi),%r9d
  12:   sub    0x6c(%rdi),%r9d          30:     sub    0x6c(%rdi),%r9d
  16:   mov    0xd8(%rdi),%r8           34:     mov    0xd8(%rdi),%r10
                                        3b:     mov    %rdi,%rbx
  1d:   mov    $0xc,%esi                3e:     mov    $0xc,%esi
  22:   callq  0xffffffffe1021e15       43:     callq  0xffffffffe102bd75
  27:   cmp    $0x86dd,%eax             48:     cmp    $0x86dd,%rax
  2c:   jne    0x0000000000000069       4f:     jne    0x000000000000009a
  2e:   mov    $0x14,%esi               51:     mov    $0x14,%esi
  33:   callq  0xffffffffe1021e31       56:     callq  0xffffffffe102bd91
  38:   cmp    $0x84,%eax               5b:     cmp    $0x84,%rax
  3d:   je     0x0000000000000049       62:     je     0x0000000000000074
  3f:   cmp    $0x6,%eax                64:     cmp    $0x6,%rax
  42:   je     0x0000000000000049       68:     je     0x0000000000000074
  44:   cmp    $0x11,%eax               6a:     cmp    $0x11,%rax
  47:   jne    0x00000000000000c6       6e:     jne    0x0000000000000117
  49:   mov    $0x36,%esi               74:     mov    $0x36,%esi
  4e:   callq  0xffffffffe1021e15       79:     callq  0xffffffffe102bd75
  53:   cmp    $0x16,%eax               7e:     cmp    $0x16,%rax
  56:   je     0x00000000000000bf       82:     je     0x0000000000000110
  58:   mov    $0x38,%esi               88:     mov    $0x38,%esi
  5d:   callq  0xffffffffe1021e15       8d:     callq  0xffffffffe102bd75
  62:   cmp    $0x16,%eax               92:     cmp    $0x16,%rax
  65:   je     0x00000000000000bf       96:     je     0x0000000000000110
  67:   jmp    0x00000000000000c6       98:     jmp    0x0000000000000117
  69:   cmp    $0x800,%eax              9a:     cmp    $0x800,%rax
  6e:   jne    0x00000000000000c6       a1:     jne    0x0000000000000117
  70:   mov    $0x17,%esi               a3:     mov    $0x17,%esi
  75:   callq  0xffffffffe1021e31       a8:     callq  0xffffffffe102bd91
  7a:   cmp    $0x84,%eax               ad:     cmp    $0x84,%rax
  7f:   je     0x000000000000008b       b4:     je     0x00000000000000c2
  81:   cmp    $0x6,%eax                b6:     cmp    $0x6,%rax
  84:   je     0x000000000000008b       ba:     je     0x00000000000000c2
  86:   cmp    $0x11,%eax               bc:     cmp    $0x11,%rax
  89:   jne    0x00000000000000c6       c0:     jne    0x0000000000000117
  8b:   mov    $0x14,%esi               c2:     mov    $0x14,%esi
  90:   callq  0xffffffffe1021e15       c7:     callq  0xffffffffe102bd75
  95:   test   $0x1fff,%ax              cc:     test   $0x1fff,%rax
  99:   jne    0x00000000000000c6       d3:     jne    0x0000000000000117
                                        d5:     mov    %rax,%r14
  9b:   mov    $0xe,%esi                d8:     mov    $0xe,%esi
  a0:   callq  0xffffffffe1021e44       dd:     callq  0xffffffffe102bd91 // MSH
                                        e2:     and    $0xf,%eax
                                        e5:     shl    $0x2,%eax
                                        e8:     mov    %rax,%r13
                                        eb:     mov    %r14,%rax
                                        ee:     mov    %r13,%rsi
  a5:   lea    0xe(%rbx),%esi           f1:     add    $0xe,%esi
  a8:   callq  0xffffffffe1021e0d       f4:     callq  0xffffffffe102bd6d
  ad:   cmp    $0x16,%eax               f9:     cmp    $0x16,%rax
  b0:   je     0x00000000000000bf       fd:     je     0x0000000000000110
                                        ff:     mov    %r13,%rsi
  b2:   lea    0x10(%rbx),%esi         102:     add    $0x10,%esi
  b5:   callq  0xffffffffe1021e0d      105:     callq  0xffffffffe102bd6d
  ba:   cmp    $0x16,%eax              10a:     cmp    $0x16,%rax
  bd:   jne    0x00000000000000c6      10e:     jne    0x0000000000000117
  bf:   mov    $0xffff,%eax            110:     mov    $0xffff,%eax
  c4:   jmp    0x00000000000000c8      115:     jmp    0x000000000000011c
  c6:   xor    %eax,%eax               117:     mov    $0x0,%eax
  c8:   mov    -0x8(%rbp),%rbx         11c:     mov    -0x228(%rbp),%rbx // epilogue
  cc:   leaveq                         123:     mov    -0x220(%rbp),%r13
  cd:   retq                           12a:     mov    -0x218(%rbp),%r14
                                       131:     mov    -0x210(%rbp),%r15
                                       138:     leaveq
                                       139:     retq

On fully cached SKBs both JITed functions take 12 nsec to execute.
BPF interpreter executes the program in 30 nsec.

The difference in generated assembler is due to the following:

Old BPF imlements LDX_MSH instruction via sk_load_byte_msh() helper function
inside bpf_jit.S.
New JIT removes the helper and does it explicitly, so ldx_msh cost
is the same for both JITs, but generated code looks longer.

New JIT has 4 registers to save, so prologue/epilogue are larger,
but the cost is within noise on x64.

Old JIT checks whether first insn clears A and if not emits 'xor %eax,%eax'.
New JIT clears %rax unconditionally.

2. old BPF JIT doesn't support ANC_NLATTR, ANC_PAY_OFFSET, ANC_RANDOM
  extensions. New JIT supports all BPF extensions.
  Performance of such filters improves 2-4 times depending on a filter.
  The longer the filter the higher performance gain.
  Synthetic benchmarks with many ancillary loads see 20x speedup
  which seems to be the maximum gain from JIT

Notes:

. net.core.bpf_jit_enable=2 + tools/net/bpf_jit_disasm is still functional
  and can be used to see generated assembler

. there are two jit_compile() functions and code flow for classic filters is:
  sk_attach_filter() - load classic BPF
  bpf_jit_compile() - try to JIT from classic BPF
  sk_convert_filter() - convert classic to internal
  bpf_int_jit_compile() - JIT from internal BPF

  seccomp and tracing filters will just call bpf_int_jit_compile()

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2014-05-15 16:31:30 -04:00

315 lines
7.8 KiB
C

/*
* Linux Socket Filter Data Structures
*/
#ifndef __LINUX_FILTER_H__
#define __LINUX_FILTER_H__
#include <linux/atomic.h>
#include <linux/compat.h>
#include <linux/workqueue.h>
#include <uapi/linux/filter.h>
/* Internally used and optimized filter representation with extended
* instruction set based on top of classic BPF.
*/
/* instruction classes */
#define BPF_ALU64 0x07 /* alu mode in double word width */
/* ld/ldx fields */
#define BPF_DW 0x18 /* double word */
#define BPF_XADD 0xc0 /* exclusive add */
/* alu/jmp fields */
#define BPF_MOV 0xb0 /* mov reg to reg */
#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */
/* change endianness of a register */
#define BPF_END 0xd0 /* flags for endianness conversion: */
#define BPF_TO_LE 0x00 /* convert to little-endian */
#define BPF_TO_BE 0x08 /* convert to big-endian */
#define BPF_FROM_LE BPF_TO_LE
#define BPF_FROM_BE BPF_TO_BE
#define BPF_JNE 0x50 /* jump != */
#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */
#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */
#define BPF_CALL 0x80 /* function call */
#define BPF_EXIT 0x90 /* function return */
/* Placeholder/dummy for 0 */
#define BPF_0 0
/* Register numbers */
enum {
BPF_REG_0 = 0,
BPF_REG_1,
BPF_REG_2,
BPF_REG_3,
BPF_REG_4,
BPF_REG_5,
BPF_REG_6,
BPF_REG_7,
BPF_REG_8,
BPF_REG_9,
BPF_REG_10,
__MAX_BPF_REG,
};
/* BPF has 10 general purpose 64-bit registers and stack frame. */
#define MAX_BPF_REG __MAX_BPF_REG
/* ArgX, context and stack frame pointer register positions. Note,
* Arg1, Arg2, Arg3, etc are used as argument mappings of function
* calls in BPF_CALL instruction.
*/
#define BPF_REG_ARG1 BPF_REG_1
#define BPF_REG_ARG2 BPF_REG_2
#define BPF_REG_ARG3 BPF_REG_3
#define BPF_REG_ARG4 BPF_REG_4
#define BPF_REG_ARG5 BPF_REG_5
#define BPF_REG_CTX BPF_REG_6
#define BPF_REG_FP BPF_REG_10
/* Additional register mappings for converted user programs. */
#define BPF_REG_A BPF_REG_0
#define BPF_REG_X BPF_REG_7
#define BPF_REG_TMP BPF_REG_8
/* BPF program can access up to 512 bytes of stack space. */
#define MAX_BPF_STACK 512
/* bpf_add|sub|...: a += x, bpf_mov: a = x */
#define BPF_ALU64_REG(op, a, x) \
((struct sock_filter_int) {BPF_ALU64|BPF_OP(op)|BPF_X, a, x, 0, 0})
#define BPF_ALU32_REG(op, a, x) \
((struct sock_filter_int) {BPF_ALU|BPF_OP(op)|BPF_X, a, x, 0, 0})
/* bpf_add|sub|...: a += imm, bpf_mov: a = imm */
#define BPF_ALU64_IMM(op, a, imm) \
((struct sock_filter_int) {BPF_ALU64|BPF_OP(op)|BPF_K, a, 0, 0, imm})
#define BPF_ALU32_IMM(op, a, imm) \
((struct sock_filter_int) {BPF_ALU|BPF_OP(op)|BPF_K, a, 0, 0, imm})
/* R0 = *(uint *) (skb->data + off) */
#define BPF_LD_ABS(size, off) \
((struct sock_filter_int) {BPF_LD|BPF_SIZE(size)|BPF_ABS, 0, 0, 0, off})
/* R0 = *(uint *) (skb->data + x + off) */
#define BPF_LD_IND(size, x, off) \
((struct sock_filter_int) {BPF_LD|BPF_SIZE(size)|BPF_IND, 0, x, 0, off})
/* a = *(uint *) (x + off) */
#define BPF_LDX_MEM(sz, a, x, off) \
((struct sock_filter_int) {BPF_LDX|BPF_SIZE(sz)|BPF_MEM, a, x, off, 0})
/* if (a 'op' x) goto pc+off */
#define BPF_JMP_REG(op, a, x, off) \
((struct sock_filter_int) {BPF_JMP|BPF_OP(op)|BPF_X, a, x, off, 0})
/* if (a 'op' imm) goto pc+off */
#define BPF_JMP_IMM(op, a, imm, off) \
((struct sock_filter_int) {BPF_JMP|BPF_OP(op)|BPF_K, a, 0, off, imm})
#define BPF_EXIT_INSN() \
((struct sock_filter_int) {BPF_JMP|BPF_EXIT, 0, 0, 0, 0})
static inline int size_to_bpf(int size)
{
switch (size) {
case 1:
return BPF_B;
case 2:
return BPF_H;
case 4:
return BPF_W;
case 8:
return BPF_DW;
default:
return -EINVAL;
}
}
/* Macro to invoke filter function. */
#define SK_RUN_FILTER(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi)
struct sock_filter_int {
__u8 code; /* opcode */
__u8 a_reg:4; /* dest register */
__u8 x_reg:4; /* source register */
__s16 off; /* signed offset */
__s32 imm; /* signed immediate constant */
};
#ifdef CONFIG_COMPAT
/* A struct sock_filter is architecture independent. */
struct compat_sock_fprog {
u16 len;
compat_uptr_t filter; /* struct sock_filter * */
};
#endif
struct sock_fprog_kern {
u16 len;
struct sock_filter *filter;
};
struct sk_buff;
struct sock;
struct seccomp_data;
struct sk_filter {
atomic_t refcnt;
u32 jited:1, /* Is our filter JIT'ed? */
len:31; /* Number of filter blocks */
struct sock_fprog_kern *orig_prog; /* Original BPF program */
struct rcu_head rcu;
unsigned int (*bpf_func)(const struct sk_buff *skb,
const struct sock_filter_int *filter);
union {
struct sock_filter insns[0];
struct sock_filter_int insnsi[0];
struct work_struct work;
};
};
static inline unsigned int sk_filter_size(unsigned int proglen)
{
return max(sizeof(struct sk_filter),
offsetof(struct sk_filter, insns[proglen]));
}
#define sk_filter_proglen(fprog) \
(fprog->len * sizeof(fprog->filter[0]))
int sk_filter(struct sock *sk, struct sk_buff *skb);
u32 sk_run_filter_int_seccomp(const struct seccomp_data *ctx,
const struct sock_filter_int *insni);
u32 sk_run_filter_int_skb(const struct sk_buff *ctx,
const struct sock_filter_int *insni);
int sk_convert_filter(struct sock_filter *prog, int len,
struct sock_filter_int *new_prog, int *new_len);
int sk_unattached_filter_create(struct sk_filter **pfp,
struct sock_fprog *fprog);
void sk_unattached_filter_destroy(struct sk_filter *fp);
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
int sk_detach_filter(struct sock *sk);
int sk_chk_filter(struct sock_filter *filter, unsigned int flen);
int sk_get_filter(struct sock *sk, struct sock_filter __user *filter,
unsigned int len);
void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to);
void sk_filter_charge(struct sock *sk, struct sk_filter *fp);
void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp);
u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
void bpf_int_jit_compile(struct sk_filter *fp);
#ifdef CONFIG_BPF_JIT
#include <stdarg.h>
#include <linux/linkage.h>
#include <linux/printk.h>
void bpf_jit_compile(struct sk_filter *fp);
void bpf_jit_free(struct sk_filter *fp);
static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen,
u32 pass, void *image)
{
pr_err("flen=%u proglen=%u pass=%u image=%pK\n",
flen, proglen, pass, image);
if (image)
print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET,
16, 1, image, proglen, false);
}
#else
#include <linux/slab.h>
static inline void bpf_jit_compile(struct sk_filter *fp)
{
}
static inline void bpf_jit_free(struct sk_filter *fp)
{
kfree(fp);
}
#endif
static inline int bpf_tell_extensions(void)
{
return SKF_AD_MAX;
}
enum {
BPF_S_RET_K = 1,
BPF_S_RET_A,
BPF_S_ALU_ADD_K,
BPF_S_ALU_ADD_X,
BPF_S_ALU_SUB_K,
BPF_S_ALU_SUB_X,
BPF_S_ALU_MUL_K,
BPF_S_ALU_MUL_X,
BPF_S_ALU_DIV_X,
BPF_S_ALU_MOD_K,
BPF_S_ALU_MOD_X,
BPF_S_ALU_AND_K,
BPF_S_ALU_AND_X,
BPF_S_ALU_OR_K,
BPF_S_ALU_OR_X,
BPF_S_ALU_XOR_K,
BPF_S_ALU_XOR_X,
BPF_S_ALU_LSH_K,
BPF_S_ALU_LSH_X,
BPF_S_ALU_RSH_K,
BPF_S_ALU_RSH_X,
BPF_S_ALU_NEG,
BPF_S_LD_W_ABS,
BPF_S_LD_H_ABS,
BPF_S_LD_B_ABS,
BPF_S_LD_W_LEN,
BPF_S_LD_W_IND,
BPF_S_LD_H_IND,
BPF_S_LD_B_IND,
BPF_S_LD_IMM,
BPF_S_LDX_W_LEN,
BPF_S_LDX_B_MSH,
BPF_S_LDX_IMM,
BPF_S_MISC_TAX,
BPF_S_MISC_TXA,
BPF_S_ALU_DIV_K,
BPF_S_LD_MEM,
BPF_S_LDX_MEM,
BPF_S_ST,
BPF_S_STX,
BPF_S_JMP_JA,
BPF_S_JMP_JEQ_K,
BPF_S_JMP_JEQ_X,
BPF_S_JMP_JGE_K,
BPF_S_JMP_JGE_X,
BPF_S_JMP_JGT_K,
BPF_S_JMP_JGT_X,
BPF_S_JMP_JSET_K,
BPF_S_JMP_JSET_X,
/* Ancillary data */
BPF_S_ANC_PROTOCOL,
BPF_S_ANC_PKTTYPE,
BPF_S_ANC_IFINDEX,
BPF_S_ANC_NLATTR,
BPF_S_ANC_NLATTR_NEST,
BPF_S_ANC_MARK,
BPF_S_ANC_QUEUE,
BPF_S_ANC_HATYPE,
BPF_S_ANC_RXHASH,
BPF_S_ANC_CPU,
BPF_S_ANC_ALU_XOR_X,
BPF_S_ANC_VLAN_TAG,
BPF_S_ANC_VLAN_TAG_PRESENT,
BPF_S_ANC_PAY_OFFSET,
BPF_S_ANC_RANDOM,
};
#endif /* __LINUX_FILTER_H__ */