bpf, arm64: Optimize BPF store/load using arm64 str/ldr(immediate offset)
author Xu Kuohai <xukuohai@huawei.com>
Mon, 21 Mar 2022 15:28:49 +0000 (11:28 -0400)
committer Daniel Borkmann <daniel@iogearbox.net>
Thu, 31 Mar 2022 22:27:34 +0000 (00:27 +0200)
The current BPF store/load instruction is translated by the JIT into two
instructions. The first instruction moves the immediate offset into a
temporary register. The second instruction uses this temporary register
to do the real store/load.

In fact, arm64 supports addressing with immediate offsets. So this patch
introduces an optimization that uses the arm64 str/ldr instructions with an
immediate offset when the offset fits.

Example of generated instructions for r2 = *(u64 *)(r1 + 0):

without optimization:
mov x10, 0
ldr x1, [x0, x10]

with optimization:
ldr x1, [x0, 0]

If the offset is negative, is not aligned correctly, or exceeds the maximum
encodable value, fall back to using a temporary register.

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Link: https://lore.kernel.org/bpf/20220321152852.2334294-3-xukuohai@huawei.com
arch/arm64/net/bpf_jit.h
arch/arm64/net/bpf_jit_comp.c

index dd59b5a..3920213 100644 (file)
 #define A64_STR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, STORE)
 #define A64_LDR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, LOAD)
 
+/*
+ * Load/store register (immediate offset)
+ *
+ * A64_LS_IMM() passes Rt, Rn and imm to aarch64_insn_gen_load_store_imm()
+ * together with the AARCH64_INSN_SIZE_<size> and
+ * AARCH64_INSN_LDST_<type>_IMM_OFFSET constants selected by token pasting.
+ * The wrappers below fix the access size (8, 16, 32 or 64) and the
+ * direction (STORE or LOAD); the trailing "I" distinguishes them from the
+ * register-offset variants such as A64_STR64()/A64_LDR64().
+ */
+#define A64_LS_IMM(Rt, Rn, imm, size, type) \
+       aarch64_insn_gen_load_store_imm(Rt, Rn, imm, \
+               AARCH64_INSN_SIZE_##size, \
+               AARCH64_INSN_LDST_##type##_IMM_OFFSET)
+#define A64_STRBI(Wt, Xn, imm)  A64_LS_IMM(Wt, Xn, imm, 8, STORE)
+#define A64_LDRBI(Wt, Xn, imm)  A64_LS_IMM(Wt, Xn, imm, 8, LOAD)
+#define A64_STRHI(Wt, Xn, imm)  A64_LS_IMM(Wt, Xn, imm, 16, STORE)
+#define A64_LDRHI(Wt, Xn, imm)  A64_LS_IMM(Wt, Xn, imm, 16, LOAD)
+#define A64_STR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, STORE)
+#define A64_LDR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, LOAD)
+#define A64_STR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, STORE)
+#define A64_LDR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, LOAD)
+
 /* Load/store register pair */
 #define A64_LS_PAIR(Rt, Rt2, Rn, offset, ls, type) \
        aarch64_insn_gen_load_store_pair(Rt, Rt2, Rn, offset, \
index fcc675a..4bf5045 100644 (file)
@@ -191,6 +191,47 @@ static bool is_addsub_imm(u32 imm)
        return !(imm & ~0xfff) || !(imm & ~0xfff000);
 }
 
+/*
+ * There are 3 types of AArch64 LDR/STR (immediate) instruction:
+ * Post-index, Pre-index, Unsigned offset.
+ *
+ * For BPF ldr/str, the "unsigned offset" type is sufficient.
+ *
+ * "Unsigned offset" type LDR(immediate) format:
+ *
+ *    3                   2                   1                   0
+ *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |x x|1 1 1 0 0 1 0 1|         imm12         |    Rn   |    Rt   |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * scale
+ *
+ * "Unsigned offset" type STR(immediate) format:
+ *    3                   2                   1                   0
+ *  1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |x x|1 1 1 0 0 1 0 0|         imm12         |    Rn   |    Rt   |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * scale
+ *
+ * The offset is calculated from imm12 and scale in the following way:
+ *
+ * offset = (u64)imm12 << scale
+ *
+ * is_lsi_offset() returns true when @offset can be expressed in this form
+ * for the given @scale, i.e. when all of the following hold:
+ *
+ *  - @offset is not negative (imm12 is unsigned),
+ *  - @offset is no larger than 0xFFF << @scale (the largest imm12 value),
+ *  - @offset is a multiple of 1 << @scale (its low @scale bits are zero).
+ *
+ * Callers pass @scale as log2 of the access size in bytes: 0 for BPF_B,
+ * 1 for BPF_H, 2 for BPF_W and 3 for BPF_DW. When false is returned they
+ * fall back to moving the offset into a temporary register.
+ */
+static bool is_lsi_offset(s16 offset, int scale)
+{
+       if (offset < 0)
+               return false;

+       if (offset > (0xFFF << scale))
+               return false;

+       if (offset & ((1 << scale) - 1))
+               return false;

+       return true;
+}
+
 /* Tail call offset to jump into */
 #if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)
 #define PROLOGUE_OFFSET 8
@@ -971,19 +1012,38 @@ emit_cond_jmp:
        case BPF_LDX | BPF_PROBE_MEM | BPF_W:
        case BPF_LDX | BPF_PROBE_MEM | BPF_H:
        case BPF_LDX | BPF_PROBE_MEM | BPF_B:
-               emit_a64_mov_i(1, tmp, off, ctx);
                switch (BPF_SIZE(code)) {
                case BPF_W:
-                       emit(A64_LDR32(dst, src, tmp), ctx);
+                       if (is_lsi_offset(off, 2)) {
+                               emit(A64_LDR32I(dst, src, off), ctx);
+                       } else {
+                               emit_a64_mov_i(1, tmp, off, ctx);
+                               emit(A64_LDR32(dst, src, tmp), ctx);
+                       }
                        break;
                case BPF_H:
-                       emit(A64_LDRH(dst, src, tmp), ctx);
+                       if (is_lsi_offset(off, 1)) {
+                               emit(A64_LDRHI(dst, src, off), ctx);
+                       } else {
+                               emit_a64_mov_i(1, tmp, off, ctx);
+                               emit(A64_LDRH(dst, src, tmp), ctx);
+                       }
                        break;
                case BPF_B:
-                       emit(A64_LDRB(dst, src, tmp), ctx);
+                       if (is_lsi_offset(off, 0)) {
+                               emit(A64_LDRBI(dst, src, off), ctx);
+                       } else {
+                               emit_a64_mov_i(1, tmp, off, ctx);
+                               emit(A64_LDRB(dst, src, tmp), ctx);
+                       }
                        break;
                case BPF_DW:
-                       emit(A64_LDR64(dst, src, tmp), ctx);
+                       if (is_lsi_offset(off, 3)) {
+                               emit(A64_LDR64I(dst, src, off), ctx);
+                       } else {
+                               emit_a64_mov_i(1, tmp, off, ctx);
+                               emit(A64_LDR64(dst, src, tmp), ctx);
+                       }
                        break;
                }
 
@@ -1011,20 +1071,39 @@ emit_cond_jmp:
        case BPF_ST | BPF_MEM | BPF_B:
        case BPF_ST | BPF_MEM | BPF_DW:
                /* Load imm to a register then store it */
-               emit_a64_mov_i(1, tmp2, off, ctx);
                emit_a64_mov_i(1, tmp, imm, ctx);
                switch (BPF_SIZE(code)) {
                case BPF_W:
-                       emit(A64_STR32(tmp, dst, tmp2), ctx);
+                       if (is_lsi_offset(off, 2)) {
+                               emit(A64_STR32I(tmp, dst, off), ctx);
+                       } else {
+                               emit_a64_mov_i(1, tmp2, off, ctx);
+                               emit(A64_STR32(tmp, dst, tmp2), ctx);
+                       }
                        break;
                case BPF_H:
-                       emit(A64_STRH(tmp, dst, tmp2), ctx);
+                       if (is_lsi_offset(off, 1)) {
+                               emit(A64_STRHI(tmp, dst, off), ctx);
+                       } else {
+                               emit_a64_mov_i(1, tmp2, off, ctx);
+                               emit(A64_STRH(tmp, dst, tmp2), ctx);
+                       }
                        break;
                case BPF_B:
-                       emit(A64_STRB(tmp, dst, tmp2), ctx);
+                       if (is_lsi_offset(off, 0)) {
+                               emit(A64_STRBI(tmp, dst, off), ctx);
+                       } else {
+                               emit_a64_mov_i(1, tmp2, off, ctx);
+                               emit(A64_STRB(tmp, dst, tmp2), ctx);
+                       }
                        break;
                case BPF_DW:
-                       emit(A64_STR64(tmp, dst, tmp2), ctx);
+                       if (is_lsi_offset(off, 3)) {
+                               emit(A64_STR64I(tmp, dst, off), ctx);
+                       } else {
+                               emit_a64_mov_i(1, tmp2, off, ctx);
+                               emit(A64_STR64(tmp, dst, tmp2), ctx);
+                       }
                        break;
                }
                break;
@@ -1034,19 +1113,38 @@ emit_cond_jmp:
        case BPF_STX | BPF_MEM | BPF_H:
        case BPF_STX | BPF_MEM | BPF_B:
        case BPF_STX | BPF_MEM | BPF_DW:
-               emit_a64_mov_i(1, tmp, off, ctx);
                switch (BPF_SIZE(code)) {
                case BPF_W:
-                       emit(A64_STR32(src, dst, tmp), ctx);
+                       if (is_lsi_offset(off, 2)) {
+                               emit(A64_STR32I(src, dst, off), ctx);
+                       } else {
+                               emit_a64_mov_i(1, tmp, off, ctx);
+                               emit(A64_STR32(src, dst, tmp), ctx);
+                       }
                        break;
                case BPF_H:
-                       emit(A64_STRH(src, dst, tmp), ctx);
+                       if (is_lsi_offset(off, 1)) {
+                               emit(A64_STRHI(src, dst, off), ctx);
+                       } else {
+                               emit_a64_mov_i(1, tmp, off, ctx);
+                               emit(A64_STRH(src, dst, tmp), ctx);
+                       }
                        break;
                case BPF_B:
-                       emit(A64_STRB(src, dst, tmp), ctx);
+                       if (is_lsi_offset(off, 0)) {
+                               emit(A64_STRBI(src, dst, off), ctx);
+                       } else {
+                               emit_a64_mov_i(1, tmp, off, ctx);
+                               emit(A64_STRB(src, dst, tmp), ctx);
+                       }
                        break;
                case BPF_DW:
-                       emit(A64_STR64(src, dst, tmp), ctx);
+                       if (is_lsi_offset(off, 3)) {
+                               emit(A64_STR64I(src, dst, off), ctx);
+                       } else {
+                               emit_a64_mov_i(1, tmp, off, ctx);
+                               emit(A64_STR64(src, dst, tmp), ctx);
+                       }
                        break;
                }
                break;