
Commit 2f3bd06
bpf: verifier: prevent userspace memory access
With BPF_PROBE_MEM, BPF allows de-referencing an untrusted pointer. To
thwart invalid memory accesses, the JITs add an exception table entry for
all such accesses. But in case the src_reg + offset overflows and turns
into a userspace address, the BPF program might read that memory if the
user has mapped it.

There are architectural features that prevent the kernel from accessing
userspace memory, like Privileged Access Never (PAN) on ARM64, Supervisor
Mode Access Prevention (SMAP) on x86-64, Supervisor User Memory access
(SUM) on RISC-V, etc., but BPF should not rely on the existence of these
features.

Make the verifier add guard instructions before such memory accesses and
skip the load if the address falls into the userspace region. The JITs
need to implement bpf_arch_uaddress_limit() to define where the userspace
addresses end for that architecture, or TASK_SIZE is taken as default.

The implementation is as follows:

  REG_AX = SRC_REG
  if (offset)
          REG_AX += offset;
  REG_AX >>= 32;
  if (REG_AX <= (uaddress_limit >> 32))
          DST_REG = 0;
  else
          DST_REG = *(size *)(SRC_REG + offset);

As we are comparing the upper 32 bits of the load address with the upper
32 bits of uaddress_limit, we will also be rejecting addresses up to
uaddress_limit + 4GB. In effect, both the load address and uaddress_limit
are aligned down to a 4GB boundary, and the load is rejected if:

  (4GB aligned) load address <= (4GB aligned) uaddress_limit

This means that loads up to uaddress_limit + 4GB are unintentionally
rejected. This is acceptable because there is a large hole (much larger
than 4GB) between userspace and kernel space memory, so correctly working
BPF programs should not be accessing this 4GB of memory above the
userspace anyway.

Let's analyze what this patch does to the following fentry program
dereferencing an untrusted pointer:

  SEC("fentry/tcp_v4_connect")
  int BPF_PROG(fentry_tcp_v4_connect, struct sock *sk)
  {
          *(volatile long *)sk;
          return 0;
  }

  BPF Program before
  ------------------
  0: (79) r1 = *(u64 *)(r1 +0)
  1: (79) r1 = *(u64 *)(r1 +0)   <-- dereference of the untrusted pointer
  2: (b7) r0 = 0
  3: (95) exit

  BPF Program after
  -----------------
  0: (79) r1 = *(u64 *)(r1 +0)
  1: (bf) r11 = r1
  2: (77) r11 >>= 32
  3: (b5) if r11 <= 0x8000 goto pc+2
  4: (79) r1 = *(u64 *)(r1 +0)   <-- the guarded load
  5: (05) goto pc+1
  6: (b7) r1 = 0
  7: (b7) r0 = 0
  8: (95) exit

As you can see from above, in the best case (off=0), 5 extra instructions
are emitted.

Now, we analyze the same program after it has gone through the JITs of
the x86-64, ARM64, and RISC-V architectures. We follow the single load
instruction that has the untrusted pointer and see what instrumentation
has been added around it.
x86-64 JIT
==========

JIT's Instrumentation (upstream):

   0:   nopl   0x0(%rax,%rax,1)
   5:   xchg   %ax,%ax
   7:   push   %rbp
   8:   mov    %rsp,%rbp
   b:   mov    0x0(%rdi),%rdi
  ----------------------------------- instrumentation start
   f:   movabs $0x800000000000,%r11
  19:   cmp    %r11,%rdi
  1c:   jb     0x000000000000002a
  1e:   mov    %rdi,%r11
  21:   add    $0x0,%r11
  28:   jae    0x000000000000002e
  2a:   xor    %edi,%edi
  2c:   jmp    0x0000000000000032
  2e:   mov    0x0(%rdi),%rdi
  ----------------------------------- instrumentation end
  32:   xor    %eax,%eax
  34:   leave
  35:   ret

Verifier's Instrumentation (this patch):

   0:   nopl   0x0(%rax,%rax,1)
   5:   xchg   %ax,%ax
   7:   push   %rbp
   8:   mov    %rsp,%rbp
   b:   mov    0x0(%rdi),%rdi
  ----------------------------------- instrumentation start
   f:   mov    %rdi,%r10
  12:   shr    $0x20,%r10
  16:   cmp    $0x8000,%r10
  1d:   jbe    0x0000000000000025
  1f:   mov    0x0(%rdi),%rdi
  23:   jmp    0x0000000000000027
  25:   xor    %edi,%edi
  ----------------------------------- instrumentation end
  27:   xor    %eax,%eax
  29:   leave
  2a:   ret

The x86-64 JIT already emits some instructions to protect against user
memory access. The implementation in this patch leads to a smaller number
of instructions being emitted. In the worst case the JIT emits 9 extra
instructions and this patch decreases it to 7 (22.2% less).

ARM64 JIT
=========

No Instrumentation (upstream):

   0:   add x9, x30, #0x0
   4:   nop
   8:   paciasp
   c:   stp x29, x30, [sp, #-16]!
  10:   mov x29, sp
  14:   stp x19, x20, [sp, #-16]!
  18:   stp x21, x22, [sp, #-16]!
  1c:   stp x25, x26, [sp, #-16]!
  20:   stp x27, x28, [sp, #-16]!
  24:   mov x25, sp
  28:   mov x26, #0x0
  2c:   sub x27, x25, #0x0
  30:   sub sp, sp, #0x0
  34:   ldr x0, [x0]
  38:   ldr x0, [x0]        <-- unguarded load of the untrusted pointer
  3c:   mov x7, #0x0
  40:   mov sp, sp
  44:   ldp x27, x28, [sp], #16
  48:   ldp x25, x26, [sp], #16
  4c:   ldp x21, x22, [sp], #16
  50:   ldp x19, x20, [sp], #16
  54:   ldp x29, x30, [sp], #16
  58:   add x0, x7, #0x0
  5c:   autiasp
  60:   ret
  64:   nop
  68:   ldr x10, 0x0000000000000070
  6c:   br x10

Verifier's Instrumentation (this patch):

   0:   add x9, x30, #0x0
   4:   nop
   8:   paciasp
   c:   stp x29, x30, [sp, #-16]!
  10:   mov x29, sp
  14:   stp x19, x20, [sp, #-16]!
  18:   stp x21, x22, [sp, #-16]!
  1c:   stp x25, x26, [sp, #-16]!
  20:   stp x27, x28, [sp, #-16]!
  24:   mov x25, sp
  28:   mov x26, #0x0
  2c:   sub x27, x25, #0x0
  30:   sub sp, sp, #0x0
  34:   ldr x0, [x0]
  38:   add x9, x0, #0x0    <-- instrumentation start
  3c:   lsr x9, x9, #32
  40:   cmp x9, #0x10, lsl #12
  44:   b.ls 0x0000000000000050
  48:   ldr x0, [x0]        <-- the guarded load
  4c:   b 0x0000000000000054
  50:   mov x0, #0x0        <-- instrumentation end
  54:   mov x7, #0x0
  58:   mov sp, sp
  5c:   ldp x27, x28, [sp], #16
  60:   ldp x25, x26, [sp], #16
  64:   ldp x21, x22, [sp], #16
  68:   ldp x19, x20, [sp], #16
  6c:   ldp x29, x30, [sp], #16
  70:   add x0, x7, #0x0
  74:   autiasp
  78:   ret
  7c:   nop
  80:   ldr x10, 0x0000000000000088
  84:   br x10

There are 6 extra instructions added on ARM64 in the best case. This
becomes 7 in the worst case (off != 0).
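[Editor's note: a quick sanity check of the ARM64 guard constant above,
not part of the commit. "cmp x9, #0x10, lsl #12" compares against the
immediate 0x10 << 12 = 0x10000, which equals TASK_SIZE >> 32 if one
assumes arm64's common 48-bit user VA space (TASK_SIZE = 1 << 48; an
assumption, not read from any kernel config). A runnable check:]

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Assumed arm64 user VA size (48-bit); illustration only. */
	const uint64_t task_size = 1ULL << 48;
	/* Immediate encoded by "cmp x9, #0x10, lsl #12". */
	const uint64_t cmp_imm = 0x10ULL << 12;

	printf("TASK_SIZE >> 32 = %#llx\n",
	       (unsigned long long)(task_size >> 32));
	printf("cmp immediate   = %#llx\n",
	       (unsigned long long)cmp_imm);
	return 0;
}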
RISC-V JIT (RISCV_ISA_C Disabled)
==========

No Instrumentation (upstream):

   0:   nop
   4:   nop
   8:   li a6, 33
   c:   addi sp, sp, -16
  10:   sd s0, 8(sp)
  14:   addi s0, sp, 16
  18:   ld a0, 0(a0)
  1c:   ld a0, 0(a0)        <-- unguarded load of the untrusted pointer
  20:   li a5, 0
  24:   ld s0, 8(sp)
  28:   addi sp, sp, 16
  2c:   sext.w a0, a5
  30:   ret

Verifier's Instrumentation (this patch):

   0:   nop
   4:   nop
   8:   li a6, 33
   c:   addi sp, sp, -16
  10:   sd s0, 8(sp)
  14:   addi s0, sp, 16
  18:   ld a0, 0(a0)
  1c:   mv t0, a0           <-- instrumentation start
  20:   srli t0, t0, 32
  24:   lui t1, 4096
  28:   sext.w t1, t1
  2c:   bgeu t1, t0, 12
  30:   ld a0, 0(a0)        <-- the guarded load
  34:   j 8
  38:   li a0, 0            <-- instrumentation end
  3c:   li a5, 0
  40:   ld s0, 8(sp)
  44:   addi sp, sp, 16
  48:   sext.w a0, a5
  4c:   ret

There are 7 extra instructions added on RISC-V.

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Puranjay Mohan <puranjay12@gmail.com>
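[Editor's note: the guard arithmetic above is easy to model in userspace.
The following runnable C sketch is an illustration only; the limit value
is a stand-in for x86-64's TASK_SIZE_MAX + PAGE_SIZE and the sample
addresses are made up. It shows which loads the emitted check would skip,
including the unintentionally rejected uaddress_limit + 4GB band:]

#include <stdio.h>
#include <stdint.h>

/* The verifier-emitted check: the load is skipped when the 4GB-aligned
 * address is <= the 4GB-aligned userspace limit. */
static int load_is_skipped(uint64_t addr, uint64_t uaddress_limit)
{
	return (addr >> 32) <= (uaddress_limit >> 32);
}

int main(void)
{
	/* Stand-in for x86-64's TASK_SIZE_MAX + PAGE_SIZE. */
	const uint64_t limit = 0x0000800000000000ULL;
	const uint64_t samples[] = {
		0x00007fffffffe000ULL, /* userspace address: skipped */
		0x0000800000001000ULL, /* within limit + 4GB: also skipped */
		0x0000800100000000ULL, /* first 4GB boundary above: loaded */
		0xffff888000000000ULL, /* kernel address: loaded */
	};

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%#018llx -> %s\n",
		       (unsigned long long)samples[i],
		       load_is_skipped(samples[i], limit) ?
				"skipped" : "loaded");
	return 0;
}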
1 parent 4c8644f commit 2f3bd06

File tree

4 files changed: +60 -64 lines changed

  arch/x86/net/bpf_jit_comp.c
  include/linux/filter.h
  kernel/bpf/core.c
  kernel/bpf/verifier.c

arch/x86/net/bpf_jit_comp.c

Lines changed: 8 additions & 64 deletions
@@ -1327,7 +1327,6 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
 		u8 b2 = 0, b3 = 0;
 		u8 *start_of_ldx;
 		s64 jmp_offset;
-		s16 insn_off;
 		u8 jmp_cond;
 		u8 *func;
 		int nops;
@@ -1802,78 +1801,18 @@ st: if (is_imm8(insn->off))
 		case BPF_LDX | BPF_PROBE_MEMSX | BPF_B:
 		case BPF_LDX | BPF_PROBE_MEMSX | BPF_H:
 		case BPF_LDX | BPF_PROBE_MEMSX | BPF_W:
-			insn_off = insn->off;
-
-			if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
-			    BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
-				/* Conservatively check that src_reg + insn->off is a kernel address:
-				 *   src_reg + insn->off >= TASK_SIZE_MAX + PAGE_SIZE
-				 * src_reg is used as scratch for src_reg += insn->off and restored
-				 * after emit_ldx if necessary
-				 */
-
-				u64 limit = TASK_SIZE_MAX + PAGE_SIZE;
-				u8 *end_of_jmp;
-
-				/* At end of these emitted checks, insn->off will have been added
-				 * to src_reg, so no need to do relative load with insn->off offset
-				 */
-				insn_off = 0;
-
-				/* movabsq r11, limit */
-				EMIT2(add_1mod(0x48, AUX_REG), add_1reg(0xB8, AUX_REG));
-				EMIT((u32)limit, 4);
-				EMIT(limit >> 32, 4);
-
-				if (insn->off) {
-					/* add src_reg, insn->off */
-					maybe_emit_1mod(&prog, src_reg, true);
-					EMIT2_off32(0x81, add_1reg(0xC0, src_reg), insn->off);
-				}
-
-				/* cmp src_reg, r11 */
-				maybe_emit_mod(&prog, src_reg, AUX_REG, true);
-				EMIT2(0x39, add_2reg(0xC0, src_reg, AUX_REG));
-
-				/* if unsigned '>=', goto load */
-				EMIT2(X86_JAE, 0);
-				end_of_jmp = prog;
-
-				/* xor dst_reg, dst_reg */
-				emit_mov_imm32(&prog, false, dst_reg, 0);
-				/* jmp byte_after_ldx */
-				EMIT2(0xEB, 0);
-
-				/* populate jmp_offset for JAE above to jump to start_of_ldx */
-				start_of_ldx = prog;
-				end_of_jmp[-1] = start_of_ldx - end_of_jmp;
-			}
+			start_of_ldx = prog;
 			if (BPF_MODE(insn->code) == BPF_PROBE_MEMSX ||
 			    BPF_MODE(insn->code) == BPF_MEMSX)
-				emit_ldsx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
+				emit_ldsx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
 			else
-				emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn_off);
+				emit_ldx(&prog, BPF_SIZE(insn->code), dst_reg, src_reg, insn->off);
 			if (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
 			    BPF_MODE(insn->code) == BPF_PROBE_MEMSX) {
 				struct exception_table_entry *ex;
 				u8 *_insn = image + proglen + (start_of_ldx - temp);
 				s64 delta;
 
-				/* populate jmp_offset for JMP above */
-				start_of_ldx[-1] = prog - start_of_ldx;
-
-				if (insn->off && src_reg != dst_reg) {
-					/* sub src_reg, insn->off
-					 * Restore src_reg after "add src_reg, insn->off" in prev
-					 * if statement. But if src_reg == dst_reg, emit_ldx
-					 * above already clobbered src_reg, so no need to restore.
-					 * If add src_reg, insn->off was unnecessary, no need to
-					 * restore either.
-					 */
-					maybe_emit_1mod(&prog, src_reg, true);
-					EMIT2_off32(0x81, add_1reg(0xE8, src_reg), insn->off);
-				}
-
 				if (!bpf_prog->aux->extable)
 					break;
@@ -3476,3 +3415,8 @@ bool bpf_jit_supports_ptr_xchg(void)
 {
 	return true;
 }
+
+u64 bpf_arch_uaddress_limit(void)
+{
+	return TASK_SIZE_MAX + PAGE_SIZE;
+}
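[Editor's note: the constant returned here explains the "cmp $0x8000,%r10"
in the x86-64 listing above. Assuming the common 4-level paging
configuration, TASK_SIZE_MAX is (1 << 47) - PAGE_SIZE, so
TASK_SIZE_MAX + PAGE_SIZE is 0x800000000000 and its upper 32 bits are
0x8000. A small runnable check; the values are assumptions for that
configuration, not taken from kernel headers:]

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Assumed x86-64 values (4-level paging); illustration only. */
	const uint64_t task_size_max = (1ULL << 47) - 0x1000ULL;
	const uint64_t page_size = 0x1000ULL;
	uint64_t limit = task_size_max + page_size;

	printf("limit       = %#llx\n", (unsigned long long)limit);
	printf("limit >> 32 = %#llx\n", (unsigned long long)(limit >> 32));
	return 0;
}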

include/linux/filter.h

Lines changed: 1 addition & 0 deletions
@@ -965,6 +965,7 @@ bool bpf_jit_supports_far_kfunc_call(void);
 bool bpf_jit_supports_exceptions(void);
 bool bpf_jit_supports_ptr_xchg(void);
 bool bpf_jit_supports_arena(void);
+u64 bpf_arch_uaddress_limit(void);
 void arch_bpf_stack_walk(bool (*consume_fn)(void *cookie, u64 ip, u64 sp, u64 bp), void *cookie);
 bool bpf_helper_changes_pkt_data(void *func);

kernel/bpf/core.c

Lines changed: 9 additions & 0 deletions
@@ -2944,6 +2944,15 @@ bool __weak bpf_jit_supports_arena(void)
 	return false;
 }
 
+u64 __weak bpf_arch_uaddress_limit(void)
+{
+#ifdef CONFIG_64BIT
+	return TASK_SIZE;
+#else
+	return -ENOTSUPP;
+#endif
+}
+
 /* Return TRUE if the JIT backend satisfies the following two conditions:
  * 1) JIT backend supports atomic_xchg() on pointer-sized words.
  * 2) Under the specific arch, the implementation of xchg() is the same
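[Editor's note: __weak here is the usual kernel weak-symbol pattern. This
generic definition is used only when an arch JIT does not provide its own
strong definition, as the x86-64 hunk above does. A standalone, runnable
sketch of the same mechanism using the GCC/Clang weak attribute, with an
illustrative constant standing in for TASK_SIZE:]

#include <stdio.h>
#include <stdint.h>

/* Generic default, analogous to the __weak definition in core.c above. */
uint64_t __attribute__((weak)) bpf_arch_uaddress_limit(void)
{
	return 0x0000800000000000ULL; /* stand-in for TASK_SIZE */
}

/* An arch JIT would provide a strong definition that replaces the weak
 * one at link time, e.g.:
 *
 *	uint64_t bpf_arch_uaddress_limit(void)
 *	{
 *		return TASK_SIZE_MAX + PAGE_SIZE;
 *	}
 */

int main(void)
{
	printf("limit = %#llx\n",
	       (unsigned long long)bpf_arch_uaddress_limit());
	return 0;
}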

kernel/bpf/verifier.c

Lines changed: 42 additions & 0 deletions
@@ -19657,6 +19657,48 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
 			goto next_insn;
 		}
 
+		/* Make it impossible to de-reference a userspace address */
+		if (BPF_CLASS(insn->code) == BPF_LDX &&
+		    (BPF_MODE(insn->code) == BPF_PROBE_MEM ||
+		     BPF_MODE(insn->code) == BPF_PROBE_MEMSX)) {
+			struct bpf_insn *patchlet;
+			u64 uaddress_limit = bpf_arch_uaddress_limit();
+
+			if (uaddress_limit < 0)
+				goto next_insn;
+
+			struct bpf_insn skip_user_access_with_off[] = {
+				BPF_MOV64_REG(BPF_REG_AX, insn->src_reg),
+				BPF_ALU64_IMM(BPF_ADD, BPF_REG_AX, insn->off),
+				BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32),
+				BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2),
+				*insn,
+				BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+				BPF_MOV64_IMM(insn->dst_reg, 0),
+			};
+			struct bpf_insn skip_user_access[] = {
+				BPF_MOV64_REG(BPF_REG_AX, insn->src_reg),
+				BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32),
+				BPF_JMP_IMM(BPF_JLE, BPF_REG_AX, uaddress_limit >> 32, 2),
+				*insn,
+				BPF_JMP_IMM(BPF_JA, 0, 0, 1),
+				BPF_MOV64_IMM(insn->dst_reg, 0),
+			};
+
+			patchlet = insn->off ? skip_user_access_with_off :
+					       skip_user_access;
+			cnt = insn->off ? ARRAY_SIZE(skip_user_access_with_off) :
+					  ARRAY_SIZE(skip_user_access);
+
+			new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt);
+			if (!new_prog)
+				return -ENOMEM;
+
+			delta += cnt - 1;
+			env->prog = prog = new_prog;
+			insn = new_prog->insnsi + i + delta;
+			goto next_insn;
+		}
+
 		/* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
 		if (BPF_CLASS(insn->code) == BPF_LD &&
 		    (BPF_MODE(insn->code) == BPF_ABS ||
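[Editor's note: to make the patchlet's effect concrete, the following
runnable C model (an illustration with made-up names, not kernel API)
emulates the rewritten sequence for the off != 0 case. A userspace
address yields dst = 0, while a kernel address reaches the real load,
which remains covered by the JIT's exception table:]

#include <stdio.h>
#include <stdint.h>

static uint64_t emulated_probe_load(uint64_t src, int16_t off,
				    uint64_t uaddress_limit)
{
	uint64_t ax = src;	/* BPF_MOV64_REG(BPF_REG_AX, src_reg) */
	ax += off;		/* BPF_ALU64_IMM(BPF_ADD, ...), off != 0 */
	ax >>= 32;		/* BPF_ALU64_IMM(BPF_RSH, BPF_REG_AX, 32) */
	if (ax <= (uaddress_limit >> 32))
		return 0;	/* BPF_MOV64_IMM(insn->dst_reg, 0) */
	/* Here the real program performs *(size *)(src + off), still
	 * guarded by the JIT's exception table entry. */
	return 0xdeadbeef;	/* placeholder for the loaded value */
}

int main(void)
{
	/* Stand-in for the architecture's uaddress_limit. */
	const uint64_t limit = 0x0000800000000000ULL;

	printf("user addr:   %#llx\n", (unsigned long long)
	       emulated_probe_load(0x00007fffdeadb000ULL, 16, limit));
	printf("kernel addr: %#llx\n", (unsigned long long)
	       emulated_probe_load(0xffff888012345000ULL, 16, limit));
	return 0;
}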
