From 2283a64e571f48a6e21af2018e70fb123ef041e6 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Thu, 28 Oct 2021 18:49:58 +0200 Subject: [PATCH] ipq807x: backport eBPF layer from v5.10 kernel Signed-off-by: John Crispin --- ...x-add-the-Qualcomm-AX-target-support.patch | 44765 +++++++++++++++- 1 file changed, 44704 insertions(+), 61 deletions(-) diff --git a/patches/0017-ipq807x-add-the-Qualcomm-AX-target-support.patch b/patches/0017-ipq807x-add-the-Qualcomm-AX-target-support.patch index f05b7fe05..144b088bd 100644 --- a/patches/0017-ipq807x-add-the-Qualcomm-AX-target-support.patch +++ b/patches/0017-ipq807x-add-the-Qualcomm-AX-target-support.patch @@ -1,70 +1,71 @@ -From 9b33e2fdee38684fbc1eb08446f277823ee113e1 Mon Sep 17 00:00:00 2001 +From 164756923b3e89cb2fc825a80d4cc4236fb6dc89 Mon Sep 17 00:00:00 2001 From: John Crispin Date: Sat, 18 Jul 2020 08:53:44 +0200 Subject: [PATCH 01/30] ipq807x: add the Qualcomm AX target support Signed-off-by: John Crispin --- - config/Config-kernel.in | 9 + - include/image.mk | 6 +- - include/kernel-version.mk | 2 +- - package/boot/uboot-envtools/files/ipq807x | 37 + - .../etc/hotplug.d/firmware/11-ath10k-caldata | 5 + - target/linux/ipq807x/109-logspam.patch | 24 + - target/linux/ipq807x/Makefile | 22 + - .../ipq807x/base-files/etc/board.d/01_leds | 38 + - .../ipq807x/base-files/etc/board.d/02_network | 82 + - .../etc/hotplug.d/firmware/10-ath11k-caldata | 95 ++ - .../ipq807x/base-files/etc/init.d/aq_phy | 16 + - .../ipq807x/base-files/etc/init.d/bootcount | 12 + - .../linux/ipq807x/base-files/etc/init.d/wdt | 14 + - ...G4_v5.4.B-AQR_CIG_WIFI_ID44715_VER1673.cld | Bin 0 -> 391170 bytes - .../base-files/lib/upgrade/platform.sh | 72 + - target/linux/ipq807x/config-4.4 | 828 +++++++++ - .../arm/boot/dts/qcom-ipq6018-cig-wf188.dts | 18 + - .../arm/boot/dts/qcom-ipq6018-cig-wf188n.dts | 18 + - .../boot/dts/qcom-ipq6018-edgecore-eap101.dts | 18 + - .../boot/dts/qcom-ipq6018-miwifi-ax1800.dts | 18 + - .../boot/dts/qcom-ipq6018-wallys-dr6018.dts | 18 + - .../arch/arm/boot/dts/qcom-ipq807x-eap102.dts | 26 + - .../arch/arm/boot/dts/qcom-ipq807x-eap106.dts | 26 + - .../arch/arm/boot/dts/qcom-ipq807x-ex227.dts | 26 + - .../arch/arm/boot/dts/qcom-ipq807x-ex447.dts | 26 + - .../boot/dts/qcom-ipq807x-sercomm-wallaby.dts | 26 + - .../arch/arm/boot/dts/qcom-ipq807x-wf194c.dts | 26 + - .../arm/boot/dts/qcom-ipq807x-wf194c4.dts | 26 + - .../dts/qcom/qcom-ipq6018-miwifi-ax1800.dts | 419 +++++ - .../dts/qcom/qcom-ipq6018-wallys-dr6018.dts | 441 +++++ - .../boot/dts/qcom/qcom-ipq807x-eap102.dts | 918 ++++++++++ - .../boot/dts/qcom/qcom-ipq807x-wf194c4.dts | 942 ++++++++++ - target/linux/ipq807x/image/Makefile | 26 + - target/linux/ipq807x/image/ipq50xx.mk | 10 + - target/linux/ipq807x/image/ipq60xx.mk | 56 + - target/linux/ipq807x/image/ipq807x.mk | 90 + - target/linux/ipq807x/ipq50xx/config-default | 84 + - target/linux/ipq807x/ipq50xx/config-lowmem | 73 + - target/linux/ipq807x/ipq50xx/target.mk | 10 + - target/linux/ipq807x/ipq60xx/config-default | 122 ++ - .../linux/ipq807x/ipq60xx/profiles/default.mk | 9 + - target/linux/ipq807x/ipq60xx/target.mk | 8 + - target/linux/ipq807x/ipq807x/config-default | 78 + - .../linux/ipq807x/ipq807x/profiles/default.mk | 9 + - target/linux/ipq807x/ipq807x/target.mk | 7 + - target/linux/ipq807x/modules.mk | 61 + - .../linux/ipq807x/patches/100-qrtr-ns.patch | 976 +++++++++++ - .../linux/ipq807x/patches/101-squashfs.patch | 16 + - .../linux/ipq807x/patches/102-cig-wf188.patch | 869 ++++++++++ - .../ipq807x/patches/103-sercomm-wallaby.patch 
| 816 +++++++++ - target/linux/ipq807x/patches/104-wf194c.patch | 816 +++++++++ - .../patches/105-fix-dtc-gcc10-build.patch | 11 + - target/linux/ipq807x/patches/106-eap101.patch | 993 +++++++++++ - .../linux/ipq807x/patches/108-log-spam.patch | 37 + - target/linux/ipq807x/patches/109-tplink.patch | 1518 +++++++++++++++++ - .../ipq807x/patches/110-add-esmt-nand.patch | 37 + - target/linux/ipq807x/patches/111-eap106.patch | 765 +++++++++ - target/linux/ipq807x/patches/112-pstore.patch | 147 ++ - toolchain/kernel-headers/Makefile | 8 + - 59 files changed, 11904 insertions(+), 2 deletions(-) + config/Config-kernel.in | 9 + + include/image.mk | 6 +- + include/kernel-version.mk | 2 +- + package/boot/uboot-envtools/files/ipq807x | 37 + + .../etc/hotplug.d/firmware/11-ath10k-caldata | 5 + + target/linux/ipq807x/109-logspam.patch | 24 + + target/linux/ipq807x/Makefile | 22 + + .../ipq807x/base-files/etc/board.d/01_leds | 38 + + .../ipq807x/base-files/etc/board.d/02_network | 82 + + .../etc/hotplug.d/firmware/10-ath11k-caldata | 95 + + .../ipq807x/base-files/etc/init.d/aq_phy | 16 + + .../ipq807x/base-files/etc/init.d/bootcount | 12 + + .../linux/ipq807x/base-files/etc/init.d/wdt | 14 + + ...G4_v5.4.B-AQR_CIG_WIFI_ID44715_VER1673.cld | Bin 0 -> 391170 bytes + .../base-files/lib/upgrade/platform.sh | 72 + + target/linux/ipq807x/config-4.4 | 828 + + .../arm/boot/dts/qcom-ipq6018-cig-wf188.dts | 18 + + .../arm/boot/dts/qcom-ipq6018-cig-wf188n.dts | 18 + + .../boot/dts/qcom-ipq6018-edgecore-eap101.dts | 18 + + .../boot/dts/qcom-ipq6018-miwifi-ax1800.dts | 18 + + .../boot/dts/qcom-ipq6018-wallys-dr6018.dts | 18 + + .../arch/arm/boot/dts/qcom-ipq807x-eap102.dts | 26 + + .../arch/arm/boot/dts/qcom-ipq807x-eap106.dts | 26 + + .../arch/arm/boot/dts/qcom-ipq807x-ex227.dts | 26 + + .../arch/arm/boot/dts/qcom-ipq807x-ex447.dts | 26 + + .../boot/dts/qcom-ipq807x-sercomm-wallaby.dts | 26 + + .../arch/arm/boot/dts/qcom-ipq807x-wf194c.dts | 26 + + .../arm/boot/dts/qcom-ipq807x-wf194c4.dts | 26 + + .../dts/qcom/qcom-ipq6018-miwifi-ax1800.dts | 419 + + .../dts/qcom/qcom-ipq6018-wallys-dr6018.dts | 441 + + .../boot/dts/qcom/qcom-ipq807x-eap102.dts | 918 + + .../boot/dts/qcom/qcom-ipq807x-wf194c4.dts | 942 + + target/linux/ipq807x/image/Makefile | 26 + + target/linux/ipq807x/image/ipq50xx.mk | 10 + + target/linux/ipq807x/image/ipq60xx.mk | 56 + + target/linux/ipq807x/image/ipq807x.mk | 90 + + target/linux/ipq807x/ipq50xx/config-default | 84 + + target/linux/ipq807x/ipq50xx/config-lowmem | 73 + + target/linux/ipq807x/ipq50xx/target.mk | 10 + + target/linux/ipq807x/ipq60xx/config-default | 122 + + .../linux/ipq807x/ipq60xx/profiles/default.mk | 9 + + target/linux/ipq807x/ipq60xx/target.mk | 8 + + target/linux/ipq807x/ipq807x/config-default | 78 + + .../linux/ipq807x/ipq807x/profiles/default.mk | 9 + + target/linux/ipq807x/ipq807x/target.mk | 7 + + target/linux/ipq807x/modules.mk | 61 + + .../linux/ipq807x/patches/100-qrtr-ns.patch | 976 + + .../linux/ipq807x/patches/101-squashfs.patch | 16 + + .../linux/ipq807x/patches/102-cig-wf188.patch | 869 + + .../ipq807x/patches/103-sercomm-wallaby.patch | 816 + + target/linux/ipq807x/patches/104-wf194c.patch | 816 + + .../patches/105-fix-dtc-gcc10-build.patch | 11 + + target/linux/ipq807x/patches/106-eap101.patch | 993 + + .../linux/ipq807x/patches/108-log-spam.patch | 37 + + target/linux/ipq807x/patches/109-tplink.patch | 1518 + + .../ipq807x/patches/110-add-esmt-nand.patch | 37 + + target/linux/ipq807x/patches/111-eap106.patch | 765 + + 
target/linux/ipq807x/patches/112-pstore.patch | 147 + + .../ipq807x/patches/200-bpf_backport.patch | 44635 ++++++++++++++++ + toolchain/kernel-headers/Makefile | 8 + + 60 files changed, 56539 insertions(+), 2 deletions(-) create mode 100644 package/boot/uboot-envtools/files/ipq807x create mode 100644 target/linux/ipq807x/109-logspam.patch create mode 100644 target/linux/ipq807x/Makefile @@ -119,6 +120,7 @@ Signed-off-by: John Crispin create mode 100644 target/linux/ipq807x/patches/110-add-esmt-nand.patch create mode 100644 target/linux/ipq807x/patches/111-eap106.patch create mode 100644 target/linux/ipq807x/patches/112-pstore.patch + create mode 100644 target/linux/ipq807x/patches/200-bpf_backport.patch diff --git a/config/Config-kernel.in b/config/Config-kernel.in index f71114b5da..4a85d83118 100644 @@ -16566,6 +16568,44647 @@ index 0000000000..dc3960306d + #ifdef CONFIG_OF_ADDRESS + /* + * The following routines scan a subtree and registers a device for +diff --git a/target/linux/ipq807x/patches/200-bpf_backport.patch b/target/linux/ipq807x/patches/200-bpf_backport.patch +new file mode 100644 +index 0000000000..3e730c313e +--- /dev/null ++++ b/target/linux/ipq807x/patches/200-bpf_backport.patch +@@ -0,0 +1,44635 @@ ++--- a/arch/arm/Kconfig +++++ b/arch/arm/Kconfig ++@@ -38,7 +38,7 @@ config ARM ++ select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 ++ select HAVE_ARCH_SECCOMP_FILTER if (AEABI && !OABI_COMPAT) ++ select HAVE_ARCH_TRACEHOOK ++- select HAVE_BPF_JIT +++ select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32 ++ select HAVE_CC_STACKPROTECTOR ++ select HAVE_CONTEXT_TRACKING ++ select HAVE_C_RECORDMCOUNT ++--- a/arch/arm/net/bpf_jit_32.c +++++ b/arch/arm/net/bpf_jit_32.c ++@@ -1,13 +1,12 @@ +++// SPDX-License-Identifier: GPL-2.0-only ++ /* ++- * Just-In-Time compiler for BPF filters on 32bit ARM +++ * Just-In-Time compiler for eBPF filters on 32bit ARM ++ * +++ * Copyright (c) 2017 Shubham Bansal ++ * Copyright (c) 2011 Mircea Gherzan ++- * ++- * This program is free software; you can redistribute it and/or modify it ++- * under the terms of the GNU General Public License as published by the ++- * Free Software Foundation; version 2 of the License. ++ */ ++ +++#include ++ #include ++ #include ++ #include ++@@ -20,51 +19,182 @@ ++ #include ++ #include ++ #include +++#include ++ ++ #include "bpf_jit_32.h" ++ ++ /* ++- * ABI: +++ * eBPF prog stack layout: +++ * +++ * high +++ * original ARM_SP => +-----+ +++ * | | callee saved registers +++ * +-----+ <= (BPF_FP + SCRATCH_SIZE) +++ * | ... | eBPF JIT scratch space +++ * eBPF fp register => +-----+ +++ * (BPF_FP) | ... | eBPF prog stack +++ * +-----+ +++ * |RSVD | JIT scratchpad +++ * current ARM_SP => +-----+ <= (BPF_FP - STACK_SIZE + SCRATCH_SIZE) +++ * | | +++ * | ... | Function call stack +++ * | | +++ * +-----+ +++ * low +++ * +++ * The callee saved registers depends on whether frame pointers are enabled. 
+++ * With frame pointers (to be compliant with the ABI): +++ * +++ * high +++ * original ARM_SP => +--------------+ \ +++ * | pc | | +++ * current ARM_FP => +--------------+ } callee saved registers +++ * |r4-r9,fp,ip,lr| | +++ * +--------------+ / +++ * low ++ * ++- * r0 scratch register ++- * r4 BPF register A ++- * r5 BPF register X ++- * r6 pointer to the skb ++- * r7 skb->data ++- * r8 skb_headlen(skb) +++ * Without frame pointers: +++ * +++ * high +++ * original ARM_SP => +--------------+ +++ * | r4-r9,fp,lr | callee saved registers +++ * current ARM_FP => +--------------+ +++ * low +++ * +++ * When popping registers off the stack at the end of a BPF function, we +++ * reference them via the current ARM_FP register. ++ */ +++#define CALLEE_MASK (1 << ARM_R4 | 1 << ARM_R5 | 1 << ARM_R6 | \ +++ 1 << ARM_R7 | 1 << ARM_R8 | 1 << ARM_R9 | \ +++ 1 << ARM_FP) +++#define CALLEE_PUSH_MASK (CALLEE_MASK | 1 << ARM_LR) +++#define CALLEE_POP_MASK (CALLEE_MASK | 1 << ARM_PC) +++ +++enum { +++ /* Stack layout - these are offsets from (top of stack - 4) */ +++ BPF_R2_HI, +++ BPF_R2_LO, +++ BPF_R3_HI, +++ BPF_R3_LO, +++ BPF_R4_HI, +++ BPF_R4_LO, +++ BPF_R5_HI, +++ BPF_R5_LO, +++ BPF_R7_HI, +++ BPF_R7_LO, +++ BPF_R8_HI, +++ BPF_R8_LO, +++ BPF_R9_HI, +++ BPF_R9_LO, +++ BPF_FP_HI, +++ BPF_FP_LO, +++ BPF_TC_HI, +++ BPF_TC_LO, +++ BPF_AX_HI, +++ BPF_AX_LO, +++ /* Stack space for BPF_REG_2, BPF_REG_3, BPF_REG_4, +++ * BPF_REG_5, BPF_REG_7, BPF_REG_8, BPF_REG_9, +++ * BPF_REG_FP and Tail call counts. +++ */ +++ BPF_JIT_SCRATCH_REGS, +++}; +++ +++/* +++ * Negative "register" values indicate the register is stored on the stack +++ * and are the offset from the top of the eBPF JIT scratch space. +++ */ +++#define STACK_OFFSET(k) (-4 - (k) * 4) +++#define SCRATCH_SIZE (BPF_JIT_SCRATCH_REGS * 4) +++ +++#ifdef CONFIG_FRAME_POINTER +++#define EBPF_SCRATCH_TO_ARM_FP(x) ((x) - 4 * hweight16(CALLEE_PUSH_MASK) - 4) +++#else +++#define EBPF_SCRATCH_TO_ARM_FP(x) (x) +++#endif ++ ++-#define r_scratch ARM_R0 ++-/* r1-r3 are (also) used for the unaligned loads on the non-ARMv7 slowpath */ ++-#define r_off ARM_R1 ++-#define r_A ARM_R4 ++-#define r_X ARM_R5 ++-#define r_skb ARM_R6 ++-#define r_skb_data ARM_R7 ++-#define r_skb_hl ARM_R8 ++- ++-#define SCRATCH_SP_OFFSET 0 ++-#define SCRATCH_OFF(k) (SCRATCH_SP_OFFSET + 4 * (k)) ++- ++-#define SEEN_MEM ((1 << BPF_MEMWORDS) - 1) ++-#define SEEN_MEM_WORD(k) (1 << (k)) ++-#define SEEN_X (1 << BPF_MEMWORDS) ++-#define SEEN_CALL (1 << (BPF_MEMWORDS + 1)) ++-#define SEEN_SKB (1 << (BPF_MEMWORDS + 2)) ++-#define SEEN_DATA (1 << (BPF_MEMWORDS + 3)) +++#define TMP_REG_1 (MAX_BPF_JIT_REG + 0) /* TEMP Register 1 */ +++#define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* TEMP Register 2 */ +++#define TCALL_CNT (MAX_BPF_JIT_REG + 2) /* Tail Call Count */ ++ ++-#define FLAG_NEED_X_RESET (1 << 0) ++-#define FLAG_IMM_OVERFLOW (1 << 1) +++#define FLAG_IMM_OVERFLOW (1 << 0) +++ +++/* +++ * Map eBPF registers to ARM 32bit registers or stack scratch space. +++ * +++ * 1. First argument is passed using the arm 32bit registers and rest of the +++ * arguments are passed on stack scratch space. +++ * 2. First callee-saved argument is mapped to arm 32 bit registers and rest +++ * arguments are mapped to scratch space on stack. +++ * 3. We need two 64 bit temp registers to do complex operations on eBPF +++ * registers. 
+++ * +++ * As the eBPF registers are all 64 bit registers and arm has only 32 bit +++ * registers, we have to map each eBPF registers with two arm 32 bit regs or +++ * scratch memory space and we have to build eBPF 64 bit register from those. +++ * +++ */ +++static const s8 bpf2a32[][2] = { +++ /* return value from in-kernel function, and exit value from eBPF */ +++ [BPF_REG_0] = {ARM_R1, ARM_R0}, +++ /* arguments from eBPF program to in-kernel function */ +++ [BPF_REG_1] = {ARM_R3, ARM_R2}, +++ /* Stored on stack scratch space */ +++ [BPF_REG_2] = {STACK_OFFSET(BPF_R2_HI), STACK_OFFSET(BPF_R2_LO)}, +++ [BPF_REG_3] = {STACK_OFFSET(BPF_R3_HI), STACK_OFFSET(BPF_R3_LO)}, +++ [BPF_REG_4] = {STACK_OFFSET(BPF_R4_HI), STACK_OFFSET(BPF_R4_LO)}, +++ [BPF_REG_5] = {STACK_OFFSET(BPF_R5_HI), STACK_OFFSET(BPF_R5_LO)}, +++ /* callee saved registers that in-kernel function will preserve */ +++ [BPF_REG_6] = {ARM_R5, ARM_R4}, +++ /* Stored on stack scratch space */ +++ [BPF_REG_7] = {STACK_OFFSET(BPF_R7_HI), STACK_OFFSET(BPF_R7_LO)}, +++ [BPF_REG_8] = {STACK_OFFSET(BPF_R8_HI), STACK_OFFSET(BPF_R8_LO)}, +++ [BPF_REG_9] = {STACK_OFFSET(BPF_R9_HI), STACK_OFFSET(BPF_R9_LO)}, +++ /* Read only Frame Pointer to access Stack */ +++ [BPF_REG_FP] = {STACK_OFFSET(BPF_FP_HI), STACK_OFFSET(BPF_FP_LO)}, +++ /* Temporary Register for internal BPF JIT, can be used +++ * for constant blindings and others. +++ */ +++ [TMP_REG_1] = {ARM_R7, ARM_R6}, +++ [TMP_REG_2] = {ARM_R9, ARM_R8}, +++ /* Tail call count. Stored on stack scratch space. */ +++ [TCALL_CNT] = {STACK_OFFSET(BPF_TC_HI), STACK_OFFSET(BPF_TC_LO)}, +++ /* temporary register for blinding constants. +++ * Stored on stack scratch space. +++ */ +++ [BPF_REG_AX] = {STACK_OFFSET(BPF_AX_HI), STACK_OFFSET(BPF_AX_LO)}, +++}; +++ +++#define dst_lo dst[1] +++#define dst_hi dst[0] +++#define src_lo src[1] +++#define src_hi src[0] +++ +++/* +++ * JIT Context: +++ * +++ * prog : bpf_prog +++ * idx : index of current last JITed instruction. +++ * prologue_bytes : bytes used in prologue. +++ * epilogue_offset : offset of epilogue starting. +++ * offsets : array of eBPF instruction offsets in +++ * JITed code. +++ * target : final JITed code. +++ * epilogue_bytes : no of bytes used in epilogue. +++ * imm_count : no of immediate counts used for global +++ * variables. +++ * imms : array of global variable addresses. 
+++ */ ++ ++ struct jit_ctx { ++- const struct bpf_prog *skf; ++- unsigned idx; ++- unsigned prologue_bytes; ++- int ret0_fp_idx; ++- u32 seen; +++ const struct bpf_prog *prog; +++ unsigned int idx; +++ unsigned int prologue_bytes; +++ unsigned int epilogue_offset; +++ unsigned int cpu_architecture; ++ u32 flags; ++ u32 *offsets; ++ u32 *target; +++ u32 stack_size; ++ #if __LINUX_ARM_ARCH__ < 7 ++ u16 epilogue_bytes; ++ u16 imm_count; ++@@ -72,68 +202,16 @@ struct jit_ctx { ++ #endif ++ }; ++ ++-int bpf_jit_enable __read_mostly; ++- ++-static inline int call_neg_helper(struct sk_buff *skb, int offset, void *ret, ++- unsigned int size) ++-{ ++- void *ptr = bpf_internal_load_pointer_neg_helper(skb, offset, size); ++- ++- if (!ptr) ++- return -EFAULT; ++- memcpy(ret, ptr, size); ++- return 0; ++-} ++- ++-static u64 jit_get_skb_b(struct sk_buff *skb, int offset) ++-{ ++- u8 ret; ++- int err; ++- ++- if (offset < 0) ++- err = call_neg_helper(skb, offset, &ret, 1); ++- else ++- err = skb_copy_bits(skb, offset, &ret, 1); ++- ++- return (u64)err << 32 | ret; ++-} ++- ++-static u64 jit_get_skb_h(struct sk_buff *skb, int offset) ++-{ ++- u16 ret; ++- int err; ++- ++- if (offset < 0) ++- err = call_neg_helper(skb, offset, &ret, 2); ++- else ++- err = skb_copy_bits(skb, offset, &ret, 2); ++- ++- return (u64)err << 32 | ntohs(ret); ++-} ++- ++-static u64 jit_get_skb_w(struct sk_buff *skb, int offset) ++-{ ++- u32 ret; ++- int err; ++- ++- if (offset < 0) ++- err = call_neg_helper(skb, offset, &ret, 4); ++- else ++- err = skb_copy_bits(skb, offset, &ret, 4); ++- ++- return (u64)err << 32 | ntohl(ret); ++-} ++- ++ /* ++ * Wrappers which handle both OABI and EABI and assures Thumb2 interworking ++ * (where the assembly routines like __aeabi_uidiv could cause problems). ++ */ ++-static u32 jit_udiv(u32 dividend, u32 divisor) +++static u32 jit_udiv32(u32 dividend, u32 divisor) ++ { ++ return dividend / divisor; ++ } ++ ++-static u32 jit_mod(u32 dividend, u32 divisor) +++static u32 jit_mod32(u32 dividend, u32 divisor) ++ { ++ return dividend % divisor; ++ } ++@@ -157,36 +235,100 @@ static inline void emit(u32 inst, struct ++ _emit(ARM_COND_AL, inst, ctx); ++ } ++ ++-static u16 saved_regs(struct jit_ctx *ctx) +++/* +++ * This is rather horrid, but necessary to convert an integer constant +++ * to an immediate operand for the opcodes, and be able to detect at +++ * build time whether the constant can't be converted (iow, usable in +++ * BUILD_BUG_ON()). 
+++ */ +++#define imm12val(v, s) (rol32(v, (s)) | (s) << 7) +++#define const_imm8m(x) \ +++ ({ int r; \ +++ u32 v = (x); \ +++ if (!(v & ~0x000000ff)) \ +++ r = imm12val(v, 0); \ +++ else if (!(v & ~0xc000003f)) \ +++ r = imm12val(v, 2); \ +++ else if (!(v & ~0xf000000f)) \ +++ r = imm12val(v, 4); \ +++ else if (!(v & ~0xfc000003)) \ +++ r = imm12val(v, 6); \ +++ else if (!(v & ~0xff000000)) \ +++ r = imm12val(v, 8); \ +++ else if (!(v & ~0x3fc00000)) \ +++ r = imm12val(v, 10); \ +++ else if (!(v & ~0x0ff00000)) \ +++ r = imm12val(v, 12); \ +++ else if (!(v & ~0x03fc0000)) \ +++ r = imm12val(v, 14); \ +++ else if (!(v & ~0x00ff0000)) \ +++ r = imm12val(v, 16); \ +++ else if (!(v & ~0x003fc000)) \ +++ r = imm12val(v, 18); \ +++ else if (!(v & ~0x000ff000)) \ +++ r = imm12val(v, 20); \ +++ else if (!(v & ~0x0003fc00)) \ +++ r = imm12val(v, 22); \ +++ else if (!(v & ~0x0000ff00)) \ +++ r = imm12val(v, 24); \ +++ else if (!(v & ~0x00003fc0)) \ +++ r = imm12val(v, 26); \ +++ else if (!(v & ~0x00000ff0)) \ +++ r = imm12val(v, 28); \ +++ else if (!(v & ~0x000003fc)) \ +++ r = imm12val(v, 30); \ +++ else \ +++ r = -1; \ +++ r; }) +++ +++/* +++ * Checks if immediate value can be converted to imm12(12 bits) value. +++ */ +++static int imm8m(u32 x) ++ { ++- u16 ret = 0; +++ u32 rot; ++ ++- if ((ctx->skf->len > 1) || ++- (ctx->skf->insns[0].code == (BPF_RET | BPF_A))) ++- ret |= 1 << r_A; +++ for (rot = 0; rot < 16; rot++) +++ if ((x & ~ror32(0xff, 2 * rot)) == 0) +++ return rol32(x, 2 * rot) | (rot << 8); +++ return -1; +++} ++ ++-#ifdef CONFIG_FRAME_POINTER ++- ret |= (1 << ARM_FP) | (1 << ARM_IP) | (1 << ARM_LR) | (1 << ARM_PC); ++-#else ++- if (ctx->seen & SEEN_CALL) ++- ret |= 1 << ARM_LR; ++-#endif ++- if (ctx->seen & (SEEN_DATA | SEEN_SKB)) ++- ret |= 1 << r_skb; ++- if (ctx->seen & SEEN_DATA) ++- ret |= (1 << r_skb_data) | (1 << r_skb_hl); ++- if (ctx->seen & SEEN_X) ++- ret |= 1 << r_X; +++#define imm8m(x) (__builtin_constant_p(x) ? const_imm8m(x) : imm8m(x)) ++ ++- return ret; +++static u32 arm_bpf_ldst_imm12(u32 op, u8 rt, u8 rn, s16 imm12) +++{ +++ op |= rt << 12 | rn << 16; +++ if (imm12 >= 0) +++ op |= ARM_INST_LDST__U; +++ else +++ imm12 = -imm12; +++ return op | (imm12 & ARM_INST_LDST__IMM12); ++ } ++ ++-static inline int mem_words_used(struct jit_ctx *ctx) +++static u32 arm_bpf_ldst_imm8(u32 op, u8 rt, u8 rn, s16 imm8) ++ { ++- /* yes, we do waste some stack space IF there are "holes" in the set" */ ++- return fls(ctx->seen & SEEN_MEM); +++ op |= rt << 12 | rn << 16; +++ if (imm8 >= 0) +++ op |= ARM_INST_LDST__U; +++ else +++ imm8 = -imm8; +++ return op | (imm8 & 0xf0) << 4 | (imm8 & 0x0f); ++ } ++ +++#define ARM_LDR_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_LDR_I, rt, rn, off) +++#define ARM_LDRB_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_LDRB_I, rt, rn, off) +++#define ARM_LDRD_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRD_I, rt, rn, off) +++#define ARM_LDRH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_LDRH_I, rt, rn, off) +++ +++#define ARM_STR_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_STR_I, rt, rn, off) +++#define ARM_STRB_I(rt, rn, off) arm_bpf_ldst_imm12(ARM_INST_STRB_I, rt, rn, off) +++#define ARM_STRD_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_STRD_I, rt, rn, off) +++#define ARM_STRH_I(rt, rn, off) arm_bpf_ldst_imm8(ARM_INST_STRH_I, rt, rn, off) +++ +++/* +++ * Initializes the JIT space with undefined instructions. 
+++ */ ++ static void jit_fill_hole(void *area, unsigned int size) ++ { ++ u32 *ptr; ++@@ -195,88 +337,23 @@ static void jit_fill_hole(void *area, un ++ *ptr++ = __opcode_to_mem_arm(ARM_INST_UDF); ++ } ++ ++-static void build_prologue(struct jit_ctx *ctx) ++-{ ++- u16 reg_set = saved_regs(ctx); ++- u16 off; ++- ++-#ifdef CONFIG_FRAME_POINTER ++- emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx); ++- emit(ARM_PUSH(reg_set), ctx); ++- emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx); +++#if defined(CONFIG_AEABI) && (__LINUX_ARM_ARCH__ >= 5) +++/* EABI requires the stack to be aligned to 64-bit boundaries */ +++#define STACK_ALIGNMENT 8 ++ #else ++- if (reg_set) ++- emit(ARM_PUSH(reg_set), ctx); +++/* Stack must be aligned to 32-bit boundaries */ +++#define STACK_ALIGNMENT 4 ++ #endif ++ ++- if (ctx->seen & (SEEN_DATA | SEEN_SKB)) ++- emit(ARM_MOV_R(r_skb, ARM_R0), ctx); ++- ++- if (ctx->seen & SEEN_DATA) { ++- off = offsetof(struct sk_buff, data); ++- emit(ARM_LDR_I(r_skb_data, r_skb, off), ctx); ++- /* headlen = len - data_len */ ++- off = offsetof(struct sk_buff, len); ++- emit(ARM_LDR_I(r_skb_hl, r_skb, off), ctx); ++- off = offsetof(struct sk_buff, data_len); ++- emit(ARM_LDR_I(r_scratch, r_skb, off), ctx); ++- emit(ARM_SUB_R(r_skb_hl, r_skb_hl, r_scratch), ctx); ++- } ++- ++- if (ctx->flags & FLAG_NEED_X_RESET) ++- emit(ARM_MOV_I(r_X, 0), ctx); ++- ++- /* do not leak kernel data to userspace */ ++- if (bpf_needs_clear_a(&ctx->skf->insns[0])) ++- emit(ARM_MOV_I(r_A, 0), ctx); ++- ++- /* stack space for the BPF_MEM words */ ++- if (ctx->seen & SEEN_MEM) ++- emit(ARM_SUB_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx); ++-} ++- ++-static void build_epilogue(struct jit_ctx *ctx) ++-{ ++- u16 reg_set = saved_regs(ctx); ++- ++- if (ctx->seen & SEEN_MEM) ++- emit(ARM_ADD_I(ARM_SP, ARM_SP, mem_words_used(ctx) * 4), ctx); ++- ++- reg_set &= ~(1 << ARM_LR); ++- ++-#ifdef CONFIG_FRAME_POINTER ++- /* the first instruction of the prologue was: mov ip, sp */ ++- reg_set &= ~(1 << ARM_IP); ++- reg_set |= (1 << ARM_SP); ++- emit(ARM_LDM(ARM_SP, reg_set), ctx); ++-#else ++- if (reg_set) { ++- if (ctx->seen & SEEN_CALL) ++- reg_set |= 1 << ARM_PC; ++- emit(ARM_POP(reg_set), ctx); ++- } ++- ++- if (!(ctx->seen & SEEN_CALL)) ++- emit(ARM_BX(ARM_LR), ctx); ++-#endif ++-} ++- ++-static int16_t imm8m(u32 x) ++-{ ++- u32 rot; ++- ++- for (rot = 0; rot < 16; rot++) ++- if ((x & ~ror32(0xff, 2 * rot)) == 0) ++- return rol32(x, 2 * rot) | (rot << 8); ++- ++- return -1; ++-} +++/* total stack size used in JITed code */ +++#define _STACK_SIZE (ctx->prog->aux->stack_depth + SCRATCH_SIZE) +++#define STACK_SIZE ALIGN(_STACK_SIZE, STACK_ALIGNMENT) ++ ++ #if __LINUX_ARM_ARCH__ < 7 ++ ++ static u16 imm_offset(u32 k, struct jit_ctx *ctx) ++ { ++- unsigned i = 0, offset; +++ unsigned int i = 0, offset; ++ u16 imm; ++ ++ /* on the "fake" run we just count them (duplicates included) */ ++@@ -295,7 +372,7 @@ static u16 imm_offset(u32 k, struct jit_ ++ ctx->imms[i] = k; ++ ++ /* constants go just after the epilogue */ ++- offset = ctx->offsets[ctx->skf->len]; +++ offset = ctx->offsets[ctx->prog->len - 1] * 4; ++ offset += ctx->prologue_bytes; ++ offset += ctx->epilogue_bytes; ++ offset += i * 4; ++@@ -319,10 +396,22 @@ static u16 imm_offset(u32 k, struct jit_ ++ ++ #endif /* __LINUX_ARM_ARCH__ */ ++ +++static inline int bpf2a32_offset(int bpf_to, int bpf_from, +++ const struct jit_ctx *ctx) { +++ int to, from; +++ +++ if (ctx->target == NULL) +++ return 0; +++ to = ctx->offsets[bpf_to]; +++ from = ctx->offsets[bpf_from]; +++ +++ return to - from - 
1; +++} +++ ++ /* ++ * Move an immediate that's not an imm8m to a core register. ++ */ ++-static inline void emit_mov_i_no8m(int rd, u32 val, struct jit_ctx *ctx) +++static inline void emit_mov_i_no8m(const u8 rd, u32 val, struct jit_ctx *ctx) ++ { ++ #if __LINUX_ARM_ARCH__ < 7 ++ emit(ARM_LDR_I(rd, ARM_PC, imm_offset(val, ctx)), ctx); ++@@ -333,7 +422,7 @@ static inline void emit_mov_i_no8m(int r ++ #endif ++ } ++ ++-static inline void emit_mov_i(int rd, u32 val, struct jit_ctx *ctx) +++static inline void emit_mov_i(const u8 rd, u32 val, struct jit_ctx *ctx) ++ { ++ int imm12 = imm8m(val); ++ ++@@ -343,676 +432,1508 @@ static inline void emit_mov_i(int rd, u3 ++ emit_mov_i_no8m(rd, val, ctx); ++ } ++ ++-#if __LINUX_ARM_ARCH__ < 6 +++static void emit_bx_r(u8 tgt_reg, struct jit_ctx *ctx) +++{ +++ if (elf_hwcap & HWCAP_THUMB) +++ emit(ARM_BX(tgt_reg), ctx); +++ else +++ emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx); +++} ++ ++-static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx) +++static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx) ++ { ++- _emit(cond, ARM_LDRB_I(ARM_R3, r_addr, 1), ctx); ++- _emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx); ++- _emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 3), ctx); ++- _emit(cond, ARM_LSL_I(ARM_R3, ARM_R3, 16), ctx); ++- _emit(cond, ARM_LDRB_I(ARM_R0, r_addr, 2), ctx); ++- _emit(cond, ARM_ORR_S(ARM_R3, ARM_R3, ARM_R1, SRTYPE_LSL, 24), ctx); ++- _emit(cond, ARM_ORR_R(ARM_R3, ARM_R3, ARM_R2), ctx); ++- _emit(cond, ARM_ORR_S(r_res, ARM_R3, ARM_R0, SRTYPE_LSL, 8), ctx); +++#if __LINUX_ARM_ARCH__ < 5 +++ emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx); +++ emit_bx_r(tgt_reg, ctx); +++#else +++ emit(ARM_BLX_R(tgt_reg), ctx); +++#endif ++ } ++ ++-static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx) +++static inline int epilogue_offset(const struct jit_ctx *ctx) ++ { ++- _emit(cond, ARM_LDRB_I(ARM_R1, r_addr, 0), ctx); ++- _emit(cond, ARM_LDRB_I(ARM_R2, r_addr, 1), ctx); ++- _emit(cond, ARM_ORR_S(r_res, ARM_R2, ARM_R1, SRTYPE_LSL, 8), ctx); +++ int to, from; +++ /* No need for 1st dummy run */ +++ if (ctx->target == NULL) +++ return 0; +++ to = ctx->epilogue_offset; +++ from = ctx->idx; +++ +++ return to - from - 2; ++ } ++ ++-static inline void emit_swap16(u8 r_dst, u8 r_src, struct jit_ctx *ctx) +++static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, u8 op) ++ { ++- /* r_dst = (r_src << 8) | (r_src >> 8) */ ++- emit(ARM_LSL_I(ARM_R1, r_src, 8), ctx); ++- emit(ARM_ORR_S(r_dst, ARM_R1, r_src, SRTYPE_LSR, 8), ctx); +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ +++#if __LINUX_ARM_ARCH__ == 7 +++ if (elf_hwcap & HWCAP_IDIVA) { +++ if (op == BPF_DIV) +++ emit(ARM_UDIV(rd, rm, rn), ctx); +++ else { +++ emit(ARM_UDIV(ARM_IP, rm, rn), ctx); +++ emit(ARM_MLS(rd, rn, ARM_IP, rm), ctx); +++ } +++ return; +++ } +++#endif ++ ++ /* ++- * we need to mask out the bits set in r_dst[23:16] due to ++- * the first shift instruction. ++- * ++- * note that 0x8ff is the encoded immediate 0x00ff0000. +++ * For BPF_ALU | BPF_DIV | BPF_K instructions +++ * As ARM_R1 and ARM_R0 contains 1st argument of bpf +++ * function, we need to save it on caller side to save +++ * it from getting destroyed within callee. +++ * After the return from the callee, we restore ARM_R0 +++ * ARM_R1. 
++ */ ++- emit(ARM_BIC_I(r_dst, r_dst, 0x8ff), ctx); +++ if (rn != ARM_R1) { +++ emit(ARM_MOV_R(tmp[0], ARM_R1), ctx); +++ emit(ARM_MOV_R(ARM_R1, rn), ctx); +++ } +++ if (rm != ARM_R0) { +++ emit(ARM_MOV_R(tmp[1], ARM_R0), ctx); +++ emit(ARM_MOV_R(ARM_R0, rm), ctx); +++ } +++ +++ /* Call appropriate function */ +++ emit_mov_i(ARM_IP, op == BPF_DIV ? +++ (u32)jit_udiv32 : (u32)jit_mod32, ctx); +++ emit_blx_r(ARM_IP, ctx); +++ +++ /* Save return value */ +++ if (rd != ARM_R0) +++ emit(ARM_MOV_R(rd, ARM_R0), ctx); +++ +++ /* Restore ARM_R0 and ARM_R1 */ +++ if (rn != ARM_R1) +++ emit(ARM_MOV_R(ARM_R1, tmp[0]), ctx); +++ if (rm != ARM_R0) +++ emit(ARM_MOV_R(ARM_R0, tmp[1]), ctx); ++ } ++ ++-#else /* ARMv6+ */ +++/* Is the translated BPF register on stack? */ +++static bool is_stacked(s8 reg) +++{ +++ return reg < 0; +++} ++ ++-static void emit_load_be32(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx) +++/* If a BPF register is on the stack (stk is true), load it to the +++ * supplied temporary register and return the temporary register +++ * for subsequent operations, otherwise just use the CPU register. +++ */ +++static s8 arm_bpf_get_reg32(s8 reg, s8 tmp, struct jit_ctx *ctx) ++ { ++- _emit(cond, ARM_LDR_I(r_res, r_addr, 0), ctx); ++-#ifdef __LITTLE_ENDIAN ++- _emit(cond, ARM_REV(r_res, r_res), ctx); ++-#endif +++ if (is_stacked(reg)) { +++ emit(ARM_LDR_I(tmp, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(reg)), ctx); +++ reg = tmp; +++ } +++ return reg; ++ } ++ ++-static void emit_load_be16(u8 cond, u8 r_res, u8 r_addr, struct jit_ctx *ctx) +++static const s8 *arm_bpf_get_reg64(const s8 *reg, const s8 *tmp, +++ struct jit_ctx *ctx) ++ { ++- _emit(cond, ARM_LDRH_I(r_res, r_addr, 0), ctx); ++-#ifdef __LITTLE_ENDIAN ++- _emit(cond, ARM_REV16(r_res, r_res), ctx); ++-#endif +++ if (is_stacked(reg[1])) { +++ if (__LINUX_ARM_ARCH__ >= 6 || +++ ctx->cpu_architecture >= CPU_ARCH_ARMv5TE) { +++ emit(ARM_LDRD_I(tmp[1], ARM_FP, +++ EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx); +++ } else { +++ emit(ARM_LDR_I(tmp[1], ARM_FP, +++ EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx); +++ emit(ARM_LDR_I(tmp[0], ARM_FP, +++ EBPF_SCRATCH_TO_ARM_FP(reg[0])), ctx); +++ } +++ reg = tmp; +++ } +++ return reg; ++ } ++ ++-static inline void emit_swap16(u8 r_dst __maybe_unused, ++- u8 r_src __maybe_unused, ++- struct jit_ctx *ctx __maybe_unused) +++/* If a BPF register is on the stack (stk is true), save the register +++ * back to the stack. If the source register is not the same, then +++ * move it into the correct register. 
+++ */ +++static void arm_bpf_put_reg32(s8 reg, s8 src, struct jit_ctx *ctx) ++ { ++-#ifdef __LITTLE_ENDIAN ++- emit(ARM_REV16(r_dst, r_src), ctx); ++-#endif +++ if (is_stacked(reg)) +++ emit(ARM_STR_I(src, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(reg)), ctx); +++ else if (reg != src) +++ emit(ARM_MOV_R(reg, src), ctx); +++} +++ +++static void arm_bpf_put_reg64(const s8 *reg, const s8 *src, +++ struct jit_ctx *ctx) +++{ +++ if (is_stacked(reg[1])) { +++ if (__LINUX_ARM_ARCH__ >= 6 || +++ ctx->cpu_architecture >= CPU_ARCH_ARMv5TE) { +++ emit(ARM_STRD_I(src[1], ARM_FP, +++ EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx); +++ } else { +++ emit(ARM_STR_I(src[1], ARM_FP, +++ EBPF_SCRATCH_TO_ARM_FP(reg[1])), ctx); +++ emit(ARM_STR_I(src[0], ARM_FP, +++ EBPF_SCRATCH_TO_ARM_FP(reg[0])), ctx); +++ } +++ } else { +++ if (reg[1] != src[1]) +++ emit(ARM_MOV_R(reg[1], src[1]), ctx); +++ if (reg[0] != src[0]) +++ emit(ARM_MOV_R(reg[0], src[0]), ctx); +++ } ++ } ++ ++-#endif /* __LINUX_ARM_ARCH__ < 6 */ +++static inline void emit_a32_mov_i(const s8 dst, const u32 val, +++ struct jit_ctx *ctx) +++{ +++ const s8 *tmp = bpf2a32[TMP_REG_1]; ++ +++ if (is_stacked(dst)) { +++ emit_mov_i(tmp[1], val, ctx); +++ arm_bpf_put_reg32(dst, tmp[1], ctx); +++ } else { +++ emit_mov_i(dst, val, ctx); +++ } +++} ++ ++-/* Compute the immediate value for a PC-relative branch. */ ++-static inline u32 b_imm(unsigned tgt, struct jit_ctx *ctx) +++static void emit_a32_mov_i64(const s8 dst[], u64 val, struct jit_ctx *ctx) ++ { ++- u32 imm; +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *rd = is_stacked(dst_lo) ? tmp : dst; ++ ++- if (ctx->target == NULL) ++- return 0; ++- /* ++- * BPF allows only forward jumps and the offset of the target is ++- * still the one computed during the first pass. +++ emit_mov_i(rd[1], (u32)val, ctx); +++ emit_mov_i(rd[0], val >> 32, ctx); +++ +++ arm_bpf_put_reg64(dst, rd, ctx); +++} +++ +++/* Sign extended move */ +++static inline void emit_a32_mov_se_i64(const bool is64, const s8 dst[], +++ const u32 val, struct jit_ctx *ctx) { +++ u64 val64 = val; +++ +++ if (is64 && (val & (1<<31))) +++ val64 |= 0xffffffff00000000ULL; +++ emit_a32_mov_i64(dst, val64, ctx); +++} +++ +++static inline void emit_a32_add_r(const u8 dst, const u8 src, +++ const bool is64, const bool hi, +++ struct jit_ctx *ctx) { +++ /* 64 bit : +++ * adds dst_lo, dst_lo, src_lo +++ * adc dst_hi, dst_hi, src_hi +++ * 32 bit : +++ * add dst_lo, dst_lo, src_lo ++ */ ++- imm = ctx->offsets[tgt] + ctx->prologue_bytes - (ctx->idx * 4 + 8); +++ if (!hi && is64) +++ emit(ARM_ADDS_R(dst, dst, src), ctx); +++ else if (hi && is64) +++ emit(ARM_ADC_R(dst, dst, src), ctx); +++ else +++ emit(ARM_ADD_R(dst, dst, src), ctx); +++} ++ ++- return imm >> 2; +++static inline void emit_a32_sub_r(const u8 dst, const u8 src, +++ const bool is64, const bool hi, +++ struct jit_ctx *ctx) { +++ /* 64 bit : +++ * subs dst_lo, dst_lo, src_lo +++ * sbc dst_hi, dst_hi, src_hi +++ * 32 bit : +++ * sub dst_lo, dst_lo, src_lo +++ */ +++ if (!hi && is64) +++ emit(ARM_SUBS_R(dst, dst, src), ctx); +++ else if (hi && is64) +++ emit(ARM_SBC_R(dst, dst, src), ctx); +++ else +++ emit(ARM_SUB_R(dst, dst, src), ctx); +++} +++ +++static inline void emit_alu_r(const u8 dst, const u8 src, const bool is64, +++ const bool hi, const u8 op, struct jit_ctx *ctx){ +++ switch (BPF_OP(op)) { +++ /* dst = dst + src */ +++ case BPF_ADD: +++ emit_a32_add_r(dst, src, is64, hi, ctx); +++ break; +++ /* dst = dst - src */ +++ case BPF_SUB: +++ emit_a32_sub_r(dst, src, is64, hi, ctx); +++ break; +++ /* dst = 
dst | src */ +++ case BPF_OR: +++ emit(ARM_ORR_R(dst, dst, src), ctx); +++ break; +++ /* dst = dst & src */ +++ case BPF_AND: +++ emit(ARM_AND_R(dst, dst, src), ctx); +++ break; +++ /* dst = dst ^ src */ +++ case BPF_XOR: +++ emit(ARM_EOR_R(dst, dst, src), ctx); +++ break; +++ /* dst = dst * src */ +++ case BPF_MUL: +++ emit(ARM_MUL(dst, dst, src), ctx); +++ break; +++ /* dst = dst << src */ +++ case BPF_LSH: +++ emit(ARM_LSL_R(dst, dst, src), ctx); +++ break; +++ /* dst = dst >> src */ +++ case BPF_RSH: +++ emit(ARM_LSR_R(dst, dst, src), ctx); +++ break; +++ /* dst = dst >> src (signed)*/ +++ case BPF_ARSH: +++ emit(ARM_MOV_SR(dst, dst, SRTYPE_ASR, src), ctx); +++ break; +++ } ++ } ++ ++-#define OP_IMM3(op, r1, r2, imm_val, ctx) \ ++- do { \ ++- imm12 = imm8m(imm_val); \ ++- if (imm12 < 0) { \ ++- emit_mov_i_no8m(r_scratch, imm_val, ctx); \ ++- emit(op ## _R((r1), (r2), r_scratch), ctx); \ ++- } else { \ ++- emit(op ## _I((r1), (r2), imm12), ctx); \ ++- } \ ++- } while (0) ++- ++-static inline void emit_err_ret(u8 cond, struct jit_ctx *ctx) ++-{ ++- if (ctx->ret0_fp_idx >= 0) { ++- _emit(cond, ARM_B(b_imm(ctx->ret0_fp_idx, ctx)), ctx); ++- /* NOP to keep the size constant between passes */ ++- emit(ARM_MOV_R(ARM_R0, ARM_R0), ctx); +++/* ALU operation (32 bit) +++ * dst = dst (op) src +++ */ +++static inline void emit_a32_alu_r(const s8 dst, const s8 src, +++ struct jit_ctx *ctx, const bool is64, +++ const bool hi, const u8 op) { +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ s8 rn, rd; +++ +++ rn = arm_bpf_get_reg32(src, tmp[1], ctx); +++ rd = arm_bpf_get_reg32(dst, tmp[0], ctx); +++ /* ALU operation */ +++ emit_alu_r(rd, rn, is64, hi, op, ctx); +++ arm_bpf_put_reg32(dst, rd, ctx); +++} +++ +++/* ALU operation (64 bit) */ +++static inline void emit_a32_alu_r64(const bool is64, const s8 dst[], +++ const s8 src[], struct jit_ctx *ctx, +++ const u8 op) { +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; +++ const s8 *rd; +++ +++ rd = arm_bpf_get_reg64(dst, tmp, ctx); +++ if (is64) { +++ const s8 *rs; +++ +++ rs = arm_bpf_get_reg64(src, tmp2, ctx); +++ +++ /* ALU operation */ +++ emit_alu_r(rd[1], rs[1], true, false, op, ctx); +++ emit_alu_r(rd[0], rs[0], true, true, op, ctx); ++ } else { ++- _emit(cond, ARM_MOV_I(ARM_R0, 0), ctx); ++- _emit(cond, ARM_B(b_imm(ctx->skf->len, ctx)), ctx); +++ s8 rs; +++ +++ rs = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); +++ +++ /* ALU operation */ +++ emit_alu_r(rd[1], rs, true, false, op, ctx); +++ if (!ctx->prog->aux->verifier_zext) +++ emit_a32_mov_i(rd[0], 0, ctx); ++ } +++ +++ arm_bpf_put_reg64(dst, rd, ctx); ++ } ++ ++-static inline void emit_blx_r(u8 tgt_reg, struct jit_ctx *ctx) ++-{ ++-#if __LINUX_ARM_ARCH__ < 5 ++- emit(ARM_MOV_R(ARM_LR, ARM_PC), ctx); +++/* dst = src (4 bytes)*/ +++static inline void emit_a32_mov_r(const s8 dst, const s8 src, +++ struct jit_ctx *ctx) { +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ s8 rt; +++ +++ rt = arm_bpf_get_reg32(src, tmp[0], ctx); +++ arm_bpf_put_reg32(dst, rt, ctx); +++} +++ +++/* dst = src */ +++static inline void emit_a32_mov_r64(const bool is64, const s8 dst[], +++ const s8 src[], +++ struct jit_ctx *ctx) { +++ if (!is64) { +++ emit_a32_mov_r(dst_lo, src_lo, ctx); +++ if (!ctx->prog->aux->verifier_zext) +++ /* Zero out high 4 bytes */ +++ emit_a32_mov_i(dst_hi, 0, ctx); +++ } else if (__LINUX_ARM_ARCH__ < 6 && +++ ctx->cpu_architecture < CPU_ARCH_ARMv5TE) { +++ /* complete 8 byte move */ +++ emit_a32_mov_r(dst_lo, src_lo, ctx); +++ emit_a32_mov_r(dst_hi, src_hi, ctx); +++ } else if 
(is_stacked(src_lo) && is_stacked(dst_lo)) { +++ const u8 *tmp = bpf2a32[TMP_REG_1]; +++ +++ emit(ARM_LDRD_I(tmp[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(src_lo)), ctx); +++ emit(ARM_STRD_I(tmp[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(dst_lo)), ctx); +++ } else if (is_stacked(src_lo)) { +++ emit(ARM_LDRD_I(dst[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(src_lo)), ctx); +++ } else if (is_stacked(dst_lo)) { +++ emit(ARM_STRD_I(src[1], ARM_FP, EBPF_SCRATCH_TO_ARM_FP(dst_lo)), ctx); +++ } else { +++ emit(ARM_MOV_R(dst[0], src[0]), ctx); +++ emit(ARM_MOV_R(dst[1], src[1]), ctx); +++ } +++} ++ ++- if (elf_hwcap & HWCAP_THUMB) ++- emit(ARM_BX(tgt_reg), ctx); ++- else ++- emit(ARM_MOV_R(ARM_PC, tgt_reg), ctx); ++-#else ++- emit(ARM_BLX_R(tgt_reg), ctx); ++-#endif +++/* Shift operations */ +++static inline void emit_a32_alu_i(const s8 dst, const u32 val, +++ struct jit_ctx *ctx, const u8 op) { +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ s8 rd; +++ +++ rd = arm_bpf_get_reg32(dst, tmp[0], ctx); +++ +++ /* Do shift operation */ +++ switch (op) { +++ case BPF_LSH: +++ emit(ARM_LSL_I(rd, rd, val), ctx); +++ break; +++ case BPF_RSH: +++ emit(ARM_LSR_I(rd, rd, val), ctx); +++ break; +++ case BPF_NEG: +++ emit(ARM_RSB_I(rd, rd, val), ctx); +++ break; +++ } +++ +++ arm_bpf_put_reg32(dst, rd, ctx); ++ } ++ ++-static inline void emit_udivmod(u8 rd, u8 rm, u8 rn, struct jit_ctx *ctx, ++- int bpf_op) ++-{ ++-#if __LINUX_ARM_ARCH__ == 7 ++- if (elf_hwcap & HWCAP_IDIVA) { ++- if (bpf_op == BPF_DIV) ++- emit(ARM_UDIV(rd, rm, rn), ctx); ++- else { ++- emit(ARM_UDIV(ARM_R3, rm, rn), ctx); ++- emit(ARM_MLS(rd, rn, ARM_R3, rm), ctx); +++/* dst = ~dst (64 bit) */ +++static inline void emit_a32_neg64(const s8 dst[], +++ struct jit_ctx *ctx){ +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *rd; +++ +++ /* Setup Operand */ +++ rd = arm_bpf_get_reg64(dst, tmp, ctx); +++ +++ /* Do Negate Operation */ +++ emit(ARM_RSBS_I(rd[1], rd[1], 0), ctx); +++ emit(ARM_RSC_I(rd[0], rd[0], 0), ctx); +++ +++ arm_bpf_put_reg64(dst, rd, ctx); +++} +++ +++/* dst = dst << src */ +++static inline void emit_a32_lsh_r64(const s8 dst[], const s8 src[], +++ struct jit_ctx *ctx) { +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; +++ const s8 *rd; +++ s8 rt; +++ +++ /* Setup Operands */ +++ rt = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); +++ rd = arm_bpf_get_reg64(dst, tmp, ctx); +++ +++ /* Do LSH operation */ +++ emit(ARM_SUB_I(ARM_IP, rt, 32), ctx); +++ emit(ARM_RSB_I(tmp2[0], rt, 32), ctx); +++ emit(ARM_MOV_SR(ARM_LR, rd[0], SRTYPE_ASL, rt), ctx); +++ emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[1], SRTYPE_ASL, ARM_IP), ctx); +++ emit(ARM_ORR_SR(ARM_IP, ARM_LR, rd[1], SRTYPE_LSR, tmp2[0]), ctx); +++ emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_ASL, rt), ctx); +++ +++ arm_bpf_put_reg32(dst_lo, ARM_LR, ctx); +++ arm_bpf_put_reg32(dst_hi, ARM_IP, ctx); +++} +++ +++/* dst = dst >> src (signed)*/ +++static inline void emit_a32_arsh_r64(const s8 dst[], const s8 src[], +++ struct jit_ctx *ctx) { +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; +++ const s8 *rd; +++ s8 rt; +++ +++ /* Setup Operands */ +++ rt = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); +++ rd = arm_bpf_get_reg64(dst, tmp, ctx); +++ +++ /* Do the ARSH operation */ +++ emit(ARM_RSB_I(ARM_IP, rt, 32), ctx); +++ emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); +++ emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_LSR, rt), ctx); +++ emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASL, ARM_IP), ctx); +++ _emit(ARM_COND_MI, ARM_B(0), ctx); +++ emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], 
SRTYPE_ASR, tmp2[0]), ctx); +++ emit(ARM_MOV_SR(ARM_IP, rd[0], SRTYPE_ASR, rt), ctx); +++ +++ arm_bpf_put_reg32(dst_lo, ARM_LR, ctx); +++ arm_bpf_put_reg32(dst_hi, ARM_IP, ctx); +++} +++ +++/* dst = dst >> src */ +++static inline void emit_a32_rsh_r64(const s8 dst[], const s8 src[], +++ struct jit_ctx *ctx) { +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; +++ const s8 *rd; +++ s8 rt; +++ +++ /* Setup Operands */ +++ rt = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); +++ rd = arm_bpf_get_reg64(dst, tmp, ctx); +++ +++ /* Do RSH operation */ +++ emit(ARM_RSB_I(ARM_IP, rt, 32), ctx); +++ emit(ARM_SUBS_I(tmp2[0], rt, 32), ctx); +++ emit(ARM_MOV_SR(ARM_LR, rd[1], SRTYPE_LSR, rt), ctx); +++ emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_ASL, ARM_IP), ctx); +++ emit(ARM_ORR_SR(ARM_LR, ARM_LR, rd[0], SRTYPE_LSR, tmp2[0]), ctx); +++ emit(ARM_MOV_SR(ARM_IP, rd[0], SRTYPE_LSR, rt), ctx); +++ +++ arm_bpf_put_reg32(dst_lo, ARM_LR, ctx); +++ arm_bpf_put_reg32(dst_hi, ARM_IP, ctx); +++} +++ +++/* dst = dst << val */ +++static inline void emit_a32_lsh_i64(const s8 dst[], +++ const u32 val, struct jit_ctx *ctx){ +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; +++ const s8 *rd; +++ +++ /* Setup operands */ +++ rd = arm_bpf_get_reg64(dst, tmp, ctx); +++ +++ /* Do LSH operation */ +++ if (val < 32) { +++ emit(ARM_MOV_SI(tmp2[0], rd[0], SRTYPE_ASL, val), ctx); +++ emit(ARM_ORR_SI(rd[0], tmp2[0], rd[1], SRTYPE_LSR, 32 - val), ctx); +++ emit(ARM_MOV_SI(rd[1], rd[1], SRTYPE_ASL, val), ctx); +++ } else { +++ if (val == 32) +++ emit(ARM_MOV_R(rd[0], rd[1]), ctx); +++ else +++ emit(ARM_MOV_SI(rd[0], rd[1], SRTYPE_ASL, val - 32), ctx); +++ emit(ARM_EOR_R(rd[1], rd[1], rd[1]), ctx); +++ } +++ +++ arm_bpf_put_reg64(dst, rd, ctx); +++} +++ +++/* dst = dst >> val */ +++static inline void emit_a32_rsh_i64(const s8 dst[], +++ const u32 val, struct jit_ctx *ctx) { +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; +++ const s8 *rd; +++ +++ /* Setup operands */ +++ rd = arm_bpf_get_reg64(dst, tmp, ctx); +++ +++ /* Do LSR operation */ +++ if (val == 0) { +++ /* An immediate value of 0 encodes a shift amount of 32 +++ * for LSR. To shift by 0, don't do anything. +++ */ +++ } else if (val < 32) { +++ emit(ARM_MOV_SI(tmp2[1], rd[1], SRTYPE_LSR, val), ctx); +++ emit(ARM_ORR_SI(rd[1], tmp2[1], rd[0], SRTYPE_ASL, 32 - val), ctx); +++ emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_LSR, val), ctx); +++ } else if (val == 32) { +++ emit(ARM_MOV_R(rd[1], rd[0]), ctx); +++ emit(ARM_MOV_I(rd[0], 0), ctx); +++ } else { +++ emit(ARM_MOV_SI(rd[1], rd[0], SRTYPE_LSR, val - 32), ctx); +++ emit(ARM_MOV_I(rd[0], 0), ctx); +++ } +++ +++ arm_bpf_put_reg64(dst, rd, ctx); +++} +++ +++/* dst = dst >> val (signed) */ +++static inline void emit_a32_arsh_i64(const s8 dst[], +++ const u32 val, struct jit_ctx *ctx){ +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; +++ const s8 *rd; +++ +++ /* Setup operands */ +++ rd = arm_bpf_get_reg64(dst, tmp, ctx); +++ +++ /* Do ARSH operation */ +++ if (val == 0) { +++ /* An immediate value of 0 encodes a shift amount of 32 +++ * for ASR. To shift by 0, don't do anything. 
+++ */ +++ } else if (val < 32) { +++ emit(ARM_MOV_SI(tmp2[1], rd[1], SRTYPE_LSR, val), ctx); +++ emit(ARM_ORR_SI(rd[1], tmp2[1], rd[0], SRTYPE_ASL, 32 - val), ctx); +++ emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_ASR, val), ctx); +++ } else if (val == 32) { +++ emit(ARM_MOV_R(rd[1], rd[0]), ctx); +++ emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_ASR, 31), ctx); +++ } else { +++ emit(ARM_MOV_SI(rd[1], rd[0], SRTYPE_ASR, val - 32), ctx); +++ emit(ARM_MOV_SI(rd[0], rd[0], SRTYPE_ASR, 31), ctx); +++ } +++ +++ arm_bpf_put_reg64(dst, rd, ctx); +++} +++ +++static inline void emit_a32_mul_r64(const s8 dst[], const s8 src[], +++ struct jit_ctx *ctx) { +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; +++ const s8 *rd, *rt; +++ +++ /* Setup operands for multiplication */ +++ rd = arm_bpf_get_reg64(dst, tmp, ctx); +++ rt = arm_bpf_get_reg64(src, tmp2, ctx); +++ +++ /* Do Multiplication */ +++ emit(ARM_MUL(ARM_IP, rd[1], rt[0]), ctx); +++ emit(ARM_MUL(ARM_LR, rd[0], rt[1]), ctx); +++ emit(ARM_ADD_R(ARM_LR, ARM_IP, ARM_LR), ctx); +++ +++ emit(ARM_UMULL(ARM_IP, rd[0], rd[1], rt[1]), ctx); +++ emit(ARM_ADD_R(rd[0], ARM_LR, rd[0]), ctx); +++ +++ arm_bpf_put_reg32(dst_lo, ARM_IP, ctx); +++ arm_bpf_put_reg32(dst_hi, rd[0], ctx); +++} +++ +++static bool is_ldst_imm(s16 off, const u8 size) +++{ +++ s16 off_max = 0; +++ +++ switch (size) { +++ case BPF_B: +++ case BPF_W: +++ off_max = 0xfff; +++ break; +++ case BPF_H: +++ off_max = 0xff; +++ break; +++ case BPF_DW: +++ /* Need to make sure off+4 does not overflow. */ +++ off_max = 0xfff - 4; +++ break; +++ } +++ return -off_max <= off && off <= off_max; +++} +++ +++/* *(size *)(dst + off) = src */ +++static inline void emit_str_r(const s8 dst, const s8 src[], +++ s16 off, struct jit_ctx *ctx, const u8 sz){ +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ s8 rd; +++ +++ rd = arm_bpf_get_reg32(dst, tmp[1], ctx); +++ +++ if (!is_ldst_imm(off, sz)) { +++ emit_a32_mov_i(tmp[0], off, ctx); +++ emit(ARM_ADD_R(tmp[0], tmp[0], rd), ctx); +++ rd = tmp[0]; +++ off = 0; +++ } +++ switch (sz) { +++ case BPF_B: +++ /* Store a Byte */ +++ emit(ARM_STRB_I(src_lo, rd, off), ctx); +++ break; +++ case BPF_H: +++ /* Store a HalfWord */ +++ emit(ARM_STRH_I(src_lo, rd, off), ctx); +++ break; +++ case BPF_W: +++ /* Store a Word */ +++ emit(ARM_STR_I(src_lo, rd, off), ctx); +++ break; +++ case BPF_DW: +++ /* Store a Double Word */ +++ emit(ARM_STR_I(src_lo, rd, off), ctx); +++ emit(ARM_STR_I(src_hi, rd, off + 4), ctx); +++ break; +++ } +++} +++ +++/* dst = *(size*)(src + off) */ +++static inline void emit_ldx_r(const s8 dst[], const s8 src, +++ s16 off, struct jit_ctx *ctx, const u8 sz){ +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *rd = is_stacked(dst_lo) ? 
tmp : dst; +++ s8 rm = src; +++ +++ if (!is_ldst_imm(off, sz)) { +++ emit_a32_mov_i(tmp[0], off, ctx); +++ emit(ARM_ADD_R(tmp[0], tmp[0], src), ctx); +++ rm = tmp[0]; +++ off = 0; +++ } else if (rd[1] == rm) { +++ emit(ARM_MOV_R(tmp[0], rm), ctx); +++ rm = tmp[0]; +++ } +++ switch (sz) { +++ case BPF_B: +++ /* Load a Byte */ +++ emit(ARM_LDRB_I(rd[1], rm, off), ctx); +++ if (!ctx->prog->aux->verifier_zext) +++ emit_a32_mov_i(rd[0], 0, ctx); +++ break; +++ case BPF_H: +++ /* Load a HalfWord */ +++ emit(ARM_LDRH_I(rd[1], rm, off), ctx); +++ if (!ctx->prog->aux->verifier_zext) +++ emit_a32_mov_i(rd[0], 0, ctx); +++ break; +++ case BPF_W: +++ /* Load a Word */ +++ emit(ARM_LDR_I(rd[1], rm, off), ctx); +++ if (!ctx->prog->aux->verifier_zext) +++ emit_a32_mov_i(rd[0], 0, ctx); +++ break; +++ case BPF_DW: +++ /* Load a Double Word */ +++ emit(ARM_LDR_I(rd[1], rm, off), ctx); +++ emit(ARM_LDR_I(rd[0], rm, off + 4), ctx); +++ break; +++ } +++ arm_bpf_put_reg64(dst, rd, ctx); +++} +++ +++/* Arithmatic Operation */ +++static inline void emit_ar_r(const u8 rd, const u8 rt, const u8 rm, +++ const u8 rn, struct jit_ctx *ctx, u8 op, +++ bool is_jmp64) { +++ switch (op) { +++ case BPF_JSET: +++ if (is_jmp64) { +++ emit(ARM_AND_R(ARM_IP, rt, rn), ctx); +++ emit(ARM_AND_R(ARM_LR, rd, rm), ctx); +++ emit(ARM_ORRS_R(ARM_IP, ARM_LR, ARM_IP), ctx); +++ } else { +++ emit(ARM_ANDS_R(ARM_IP, rt, rn), ctx); ++ } ++- return; +++ break; +++ case BPF_JEQ: +++ case BPF_JNE: +++ case BPF_JGT: +++ case BPF_JGE: +++ case BPF_JLE: +++ case BPF_JLT: +++ if (is_jmp64) { +++ emit(ARM_CMP_R(rd, rm), ctx); +++ /* Only compare low halve if high halve are equal. */ +++ _emit(ARM_COND_EQ, ARM_CMP_R(rt, rn), ctx); +++ } else { +++ emit(ARM_CMP_R(rt, rn), ctx); +++ } +++ break; +++ case BPF_JSLE: +++ case BPF_JSGT: +++ emit(ARM_CMP_R(rn, rt), ctx); +++ if (is_jmp64) +++ emit(ARM_SBCS_R(ARM_IP, rm, rd), ctx); +++ break; +++ case BPF_JSLT: +++ case BPF_JSGE: +++ emit(ARM_CMP_R(rt, rn), ctx); +++ if (is_jmp64) +++ emit(ARM_SBCS_R(ARM_IP, rd, rm), ctx); +++ break; ++ } ++-#endif +++} ++ ++- /* ++- * For BPF_ALU | BPF_DIV | BPF_K instructions, rm is ARM_R4 ++- * (r_A) and rn is ARM_R0 (r_scratch) so load rn first into ++- * ARM_R1 to avoid accidentally overwriting ARM_R0 with rm ++- * before using it as a source for ARM_R1. ++- * ++- * For BPF_ALU | BPF_DIV | BPF_X rm is ARM_R4 (r_A) and rn is ++- * ARM_R5 (r_X) so there is no particular register overlap ++- * issues. 
+++static int out_offset = -1; /* initialized on the first pass of build_body() */ +++static int emit_bpf_tail_call(struct jit_ctx *ctx) +++{ +++ +++ /* bpf_tail_call(void *prog_ctx, struct bpf_array *array, u64 index) */ +++ const s8 *r2 = bpf2a32[BPF_REG_2]; +++ const s8 *r3 = bpf2a32[BPF_REG_3]; +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; +++ const s8 *tcc = bpf2a32[TCALL_CNT]; +++ const s8 *tc; +++ const int idx0 = ctx->idx; +++#define cur_offset (ctx->idx - idx0) +++#define jmp_offset (out_offset - (cur_offset) - 2) +++ u32 lo, hi; +++ s8 r_array, r_index; +++ int off; +++ +++ /* if (index >= array->map.max_entries) +++ * goto out; ++ */ ++- if (rn != ARM_R1) ++- emit(ARM_MOV_R(ARM_R1, rn), ctx); ++- if (rm != ARM_R0) ++- emit(ARM_MOV_R(ARM_R0, rm), ctx); +++ BUILD_BUG_ON(offsetof(struct bpf_array, map.max_entries) > +++ ARM_INST_LDST__IMM12); +++ off = offsetof(struct bpf_array, map.max_entries); +++ r_array = arm_bpf_get_reg32(r2[1], tmp2[0], ctx); +++ /* index is 32-bit for arrays */ +++ r_index = arm_bpf_get_reg32(r3[1], tmp2[1], ctx); +++ /* array->map.max_entries */ +++ emit(ARM_LDR_I(tmp[1], r_array, off), ctx); +++ /* index >= array->map.max_entries */ +++ emit(ARM_CMP_R(r_index, tmp[1]), ctx); +++ _emit(ARM_COND_CS, ARM_B(jmp_offset), ctx); +++ +++ /* tmp2[0] = array, tmp2[1] = index */ +++ +++ /* if (tail_call_cnt > MAX_TAIL_CALL_CNT) +++ * goto out; +++ * tail_call_cnt++; +++ */ +++ lo = (u32)MAX_TAIL_CALL_CNT; +++ hi = (u32)((u64)MAX_TAIL_CALL_CNT >> 32); +++ tc = arm_bpf_get_reg64(tcc, tmp, ctx); +++ emit(ARM_CMP_I(tc[0], hi), ctx); +++ _emit(ARM_COND_EQ, ARM_CMP_I(tc[1], lo), ctx); +++ _emit(ARM_COND_HI, ARM_B(jmp_offset), ctx); +++ emit(ARM_ADDS_I(tc[1], tc[1], 1), ctx); +++ emit(ARM_ADC_I(tc[0], tc[0], 0), ctx); +++ arm_bpf_put_reg64(tcc, tmp, ctx); +++ +++ /* prog = array->ptrs[index] +++ * if (prog == NULL) +++ * goto out; +++ */ +++ BUILD_BUG_ON(imm8m(offsetof(struct bpf_array, ptrs)) < 0); +++ off = imm8m(offsetof(struct bpf_array, ptrs)); +++ emit(ARM_ADD_I(tmp[1], r_array, off), ctx); +++ emit(ARM_LDR_R_SI(tmp[1], tmp[1], r_index, SRTYPE_ASL, 2), ctx); +++ emit(ARM_CMP_I(tmp[1], 0), ctx); +++ _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); +++ +++ /* goto *(prog->bpf_func + prologue_size); */ +++ BUILD_BUG_ON(offsetof(struct bpf_prog, bpf_func) > +++ ARM_INST_LDST__IMM12); +++ off = offsetof(struct bpf_prog, bpf_func); +++ emit(ARM_LDR_I(tmp[1], tmp[1], off), ctx); +++ emit(ARM_ADD_I(tmp[1], tmp[1], ctx->prologue_bytes), ctx); +++ emit_bx_r(tmp[1], ctx); +++ +++ /* out: */ +++ if (out_offset == -1) +++ out_offset = cur_offset; +++ if (cur_offset != out_offset) { +++ pr_err_once("tail_call out_offset = %d, expected %d!\n", +++ cur_offset, out_offset); +++ return -1; +++ } +++ return 0; +++#undef cur_offset +++#undef jmp_offset +++} +++ +++/* 0xabcd => 0xcdab */ +++static inline void emit_rev16(const u8 rd, const u8 rn, struct jit_ctx *ctx) +++{ +++#if __LINUX_ARM_ARCH__ < 6 +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; ++ ++- ctx->seen |= SEEN_CALL; ++- emit_mov_i(ARM_R3, bpf_op == BPF_DIV ? 
(u32)jit_udiv : (u32)jit_mod, ++- ctx); ++- emit_blx_r(ARM_R3, ctx); +++ emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx); +++ emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 8), ctx); +++ emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx); +++ emit(ARM_ORR_SI(rd, tmp2[0], tmp2[1], SRTYPE_LSL, 8), ctx); +++#else /* ARMv6+ */ +++ emit(ARM_REV16(rd, rn), ctx); +++#endif +++} ++ ++- if (rd != ARM_R0) ++- emit(ARM_MOV_R(rd, ARM_R0), ctx); +++/* 0xabcdefgh => 0xghefcdab */ +++static inline void emit_rev32(const u8 rd, const u8 rn, struct jit_ctx *ctx) +++{ +++#if __LINUX_ARM_ARCH__ < 6 +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; +++ +++ emit(ARM_AND_I(tmp2[1], rn, 0xff), ctx); +++ emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 24), ctx); +++ emit(ARM_ORR_SI(ARM_IP, tmp2[0], tmp2[1], SRTYPE_LSL, 24), ctx); +++ +++ emit(ARM_MOV_SI(tmp2[1], rn, SRTYPE_LSR, 8), ctx); +++ emit(ARM_AND_I(tmp2[1], tmp2[1], 0xff), ctx); +++ emit(ARM_MOV_SI(tmp2[0], rn, SRTYPE_LSR, 16), ctx); +++ emit(ARM_AND_I(tmp2[0], tmp2[0], 0xff), ctx); +++ emit(ARM_MOV_SI(tmp2[0], tmp2[0], SRTYPE_LSL, 8), ctx); +++ emit(ARM_ORR_SI(tmp2[0], tmp2[0], tmp2[1], SRTYPE_LSL, 16), ctx); +++ emit(ARM_ORR_R(rd, ARM_IP, tmp2[0]), ctx); +++ +++#else /* ARMv6+ */ +++ emit(ARM_REV(rd, rn), ctx); +++#endif ++ } ++ ++-static inline void update_on_xread(struct jit_ctx *ctx) +++// push the scratch stack register on top of the stack +++static inline void emit_push_r64(const s8 src[], struct jit_ctx *ctx) ++ { ++- if (!(ctx->seen & SEEN_X)) ++- ctx->flags |= FLAG_NEED_X_RESET; +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; +++ const s8 *rt; +++ u16 reg_set = 0; +++ +++ rt = arm_bpf_get_reg64(src, tmp2, ctx); ++ ++- ctx->seen |= SEEN_X; +++ reg_set = (1 << rt[1]) | (1 << rt[0]); +++ emit(ARM_PUSH(reg_set), ctx); ++ } ++ ++-static int build_body(struct jit_ctx *ctx) +++static void build_prologue(struct jit_ctx *ctx) ++ { ++- void *load_func[] = {jit_get_skb_b, jit_get_skb_h, jit_get_skb_w}; ++- const struct bpf_prog *prog = ctx->skf; ++- const struct sock_filter *inst; ++- unsigned i, load_order, off, condt; ++- int imm12; ++- u32 k; +++ const s8 r0 = bpf2a32[BPF_REG_0][1]; +++ const s8 r2 = bpf2a32[BPF_REG_1][1]; +++ const s8 r3 = bpf2a32[BPF_REG_1][0]; +++ const s8 r4 = bpf2a32[BPF_REG_6][1]; +++ const s8 fplo = bpf2a32[BPF_REG_FP][1]; +++ const s8 fphi = bpf2a32[BPF_REG_FP][0]; +++ const s8 *tcc = bpf2a32[TCALL_CNT]; ++ ++- for (i = 0; i < prog->len; i++) { ++- u16 code; +++ /* Save callee saved registers. 
*/ +++#ifdef CONFIG_FRAME_POINTER +++ u16 reg_set = CALLEE_PUSH_MASK | 1 << ARM_IP | 1 << ARM_PC; +++ emit(ARM_MOV_R(ARM_IP, ARM_SP), ctx); +++ emit(ARM_PUSH(reg_set), ctx); +++ emit(ARM_SUB_I(ARM_FP, ARM_IP, 4), ctx); +++#else +++ emit(ARM_PUSH(CALLEE_PUSH_MASK), ctx); +++ emit(ARM_MOV_R(ARM_FP, ARM_SP), ctx); +++#endif +++ /* Save frame pointer for later */ +++ emit(ARM_SUB_I(ARM_IP, ARM_SP, SCRATCH_SIZE), ctx); ++ ++- inst = &(prog->insns[i]); ++- /* K as an immediate value operand */ ++- k = inst->k; ++- code = bpf_anc_helper(inst); +++ ctx->stack_size = imm8m(STACK_SIZE); ++ ++- /* compute offsets only in the fake pass */ ++- if (ctx->target == NULL) ++- ctx->offsets[i] = ctx->idx * 4; +++ /* Set up function call stack */ +++ emit(ARM_SUB_I(ARM_SP, ARM_SP, ctx->stack_size), ctx); ++ ++- switch (code) { ++- case BPF_LD | BPF_IMM: ++- emit_mov_i(r_A, k, ctx); ++- break; ++- case BPF_LD | BPF_W | BPF_LEN: ++- ctx->seen |= SEEN_SKB; ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); ++- emit(ARM_LDR_I(r_A, r_skb, ++- offsetof(struct sk_buff, len)), ctx); ++- break; ++- case BPF_LD | BPF_MEM: ++- /* A = scratch[k] */ ++- ctx->seen |= SEEN_MEM_WORD(k); ++- emit(ARM_LDR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx); ++- break; ++- case BPF_LD | BPF_W | BPF_ABS: ++- load_order = 2; ++- goto load; ++- case BPF_LD | BPF_H | BPF_ABS: ++- load_order = 1; ++- goto load; ++- case BPF_LD | BPF_B | BPF_ABS: ++- load_order = 0; ++-load: ++- emit_mov_i(r_off, k, ctx); ++-load_common: ++- ctx->seen |= SEEN_DATA | SEEN_CALL; ++- ++- if (load_order > 0) { ++- emit(ARM_SUB_I(r_scratch, r_skb_hl, ++- 1 << load_order), ctx); ++- emit(ARM_CMP_R(r_scratch, r_off), ctx); ++- condt = ARM_COND_GE; ++- } else { ++- emit(ARM_CMP_R(r_skb_hl, r_off), ctx); ++- condt = ARM_COND_HI; ++- } +++ /* Set up BPF prog stack base register */ +++ emit_a32_mov_r(fplo, ARM_IP, ctx); +++ emit_a32_mov_i(fphi, 0, ctx); ++ ++- /* ++- * test for negative offset, only if we are ++- * currently scheduled to take the fast ++- * path. this will update the flags so that ++- * the slowpath instruction are ignored if the ++- * offset is negative. ++- * ++- * for loard_order == 0 the HI condition will ++- * make loads at offset 0 take the slow path too. 
++- */ ++- _emit(condt, ARM_CMP_I(r_off, 0), ctx); +++ /* mov r4, 0 */ +++ emit(ARM_MOV_I(r4, 0), ctx); ++ ++- _emit(condt, ARM_ADD_R(r_scratch, r_off, r_skb_data), ++- ctx); +++ /* Move BPF_CTX to BPF_R1 */ +++ emit(ARM_MOV_R(r3, r4), ctx); +++ emit(ARM_MOV_R(r2, r0), ctx); +++ /* Initialize Tail Count */ +++ emit(ARM_STR_I(r4, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(tcc[0])), ctx); +++ emit(ARM_STR_I(r4, ARM_FP, EBPF_SCRATCH_TO_ARM_FP(tcc[1])), ctx); +++ /* end of prologue */ +++} ++ ++- if (load_order == 0) ++- _emit(condt, ARM_LDRB_I(r_A, r_scratch, 0), ++- ctx); ++- else if (load_order == 1) ++- emit_load_be16(condt, r_A, r_scratch, ctx); ++- else if (load_order == 2) ++- emit_load_be32(condt, r_A, r_scratch, ctx); ++- ++- _emit(condt, ARM_B(b_imm(i + 1, ctx)), ctx); ++- ++- /* the slowpath */ ++- emit_mov_i(ARM_R3, (u32)load_func[load_order], ctx); ++- emit(ARM_MOV_R(ARM_R0, r_skb), ctx); ++- /* the offset is already in R1 */ ++- emit_blx_r(ARM_R3, ctx); ++- /* check the result of skb_copy_bits */ ++- emit(ARM_CMP_I(ARM_R1, 0), ctx); ++- emit_err_ret(ARM_COND_NE, ctx); ++- emit(ARM_MOV_R(r_A, ARM_R0), ctx); ++- break; ++- case BPF_LD | BPF_W | BPF_IND: ++- load_order = 2; ++- goto load_ind; ++- case BPF_LD | BPF_H | BPF_IND: ++- load_order = 1; ++- goto load_ind; ++- case BPF_LD | BPF_B | BPF_IND: ++- load_order = 0; ++-load_ind: ++- update_on_xread(ctx); ++- OP_IMM3(ARM_ADD, r_off, r_X, k, ctx); ++- goto load_common; ++- case BPF_LDX | BPF_IMM: ++- ctx->seen |= SEEN_X; ++- emit_mov_i(r_X, k, ctx); ++- break; ++- case BPF_LDX | BPF_W | BPF_LEN: ++- ctx->seen |= SEEN_X | SEEN_SKB; ++- emit(ARM_LDR_I(r_X, r_skb, ++- offsetof(struct sk_buff, len)), ctx); ++- break; ++- case BPF_LDX | BPF_MEM: ++- ctx->seen |= SEEN_X | SEEN_MEM_WORD(k); ++- emit(ARM_LDR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx); ++- break; ++- case BPF_LDX | BPF_B | BPF_MSH: ++- /* x = ((*(frame + k)) & 0xf) << 2; */ ++- ctx->seen |= SEEN_X | SEEN_DATA | SEEN_CALL; ++- /* the interpreter should deal with the negative K */ ++- if ((int)k < 0) ++- return -1; ++- /* offset in r1: we might have to take the slow path */ ++- emit_mov_i(r_off, k, ctx); ++- emit(ARM_CMP_R(r_skb_hl, r_off), ctx); ++- ++- /* load in r0: common with the slowpath */ ++- _emit(ARM_COND_HI, ARM_LDRB_R(ARM_R0, r_skb_data, ++- ARM_R1), ctx); ++- /* ++- * emit_mov_i() might generate one or two instructions, ++- * the same holds for emit_blx_r() ++- */ ++- _emit(ARM_COND_HI, ARM_B(b_imm(i + 1, ctx) - 2), ctx); +++/* restore callee saved registers. */ +++static void build_epilogue(struct jit_ctx *ctx) +++{ +++#ifdef CONFIG_FRAME_POINTER +++ /* When using frame pointers, some additional registers need to +++ * be loaded. */ +++ u16 reg_set = CALLEE_POP_MASK | 1 << ARM_SP; +++ emit(ARM_SUB_I(ARM_SP, ARM_FP, hweight16(reg_set) * 4), ctx); +++ emit(ARM_LDM(ARM_SP, reg_set), ctx); +++#else +++ /* Restore callee saved registers. 
*/ +++ emit(ARM_MOV_R(ARM_SP, ARM_FP), ctx); +++ emit(ARM_POP(CALLEE_POP_MASK), ctx); +++#endif +++} ++ ++- emit(ARM_MOV_R(ARM_R0, r_skb), ctx); ++- /* r_off is r1 */ ++- emit_mov_i(ARM_R3, (u32)jit_get_skb_b, ctx); ++- emit_blx_r(ARM_R3, ctx); ++- /* check the return value of skb_copy_bits */ ++- emit(ARM_CMP_I(ARM_R1, 0), ctx); ++- emit_err_ret(ARM_COND_NE, ctx); ++- ++- emit(ARM_AND_I(r_X, ARM_R0, 0x00f), ctx); ++- emit(ARM_LSL_I(r_X, r_X, 2), ctx); ++- break; ++- case BPF_ST: ++- ctx->seen |= SEEN_MEM_WORD(k); ++- emit(ARM_STR_I(r_A, ARM_SP, SCRATCH_OFF(k)), ctx); ++- break; ++- case BPF_STX: ++- update_on_xread(ctx); ++- ctx->seen |= SEEN_MEM_WORD(k); ++- emit(ARM_STR_I(r_X, ARM_SP, SCRATCH_OFF(k)), ctx); ++- break; ++- case BPF_ALU | BPF_ADD | BPF_K: ++- /* A += K */ ++- OP_IMM3(ARM_ADD, r_A, r_A, k, ctx); ++- break; ++- case BPF_ALU | BPF_ADD | BPF_X: ++- update_on_xread(ctx); ++- emit(ARM_ADD_R(r_A, r_A, r_X), ctx); ++- break; ++- case BPF_ALU | BPF_SUB | BPF_K: ++- /* A -= K */ ++- OP_IMM3(ARM_SUB, r_A, r_A, k, ctx); ++- break; ++- case BPF_ALU | BPF_SUB | BPF_X: ++- update_on_xread(ctx); ++- emit(ARM_SUB_R(r_A, r_A, r_X), ctx); ++- break; ++- case BPF_ALU | BPF_MUL | BPF_K: ++- /* A *= K */ ++- emit_mov_i(r_scratch, k, ctx); ++- emit(ARM_MUL(r_A, r_A, r_scratch), ctx); ++- break; ++- case BPF_ALU | BPF_MUL | BPF_X: ++- update_on_xread(ctx); ++- emit(ARM_MUL(r_A, r_A, r_X), ctx); ++- break; ++- case BPF_ALU | BPF_DIV | BPF_K: ++- if (k == 1) ++- break; ++- emit_mov_i(r_scratch, k, ctx); ++- emit_udivmod(r_A, r_A, r_scratch, ctx, BPF_DIV); ++- break; ++- case BPF_ALU | BPF_DIV | BPF_X: ++- update_on_xread(ctx); ++- emit(ARM_CMP_I(r_X, 0), ctx); ++- emit_err_ret(ARM_COND_EQ, ctx); ++- emit_udivmod(r_A, r_A, r_X, ctx, BPF_DIV); ++- break; ++- case BPF_ALU | BPF_MOD | BPF_K: ++- if (k == 1) { ++- emit_mov_i(r_A, 0, ctx); +++/* +++ * Convert an eBPF instruction to native instruction, i.e +++ * JITs an eBPF instruction. +++ * Returns : +++ * 0 - Successfully JITed an 8-byte eBPF instruction +++ * >0 - Successfully JITed a 16-byte eBPF instruction +++ * <0 - Failed to JIT. 
+++ */ +++static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) +++{ +++ const u8 code = insn->code; +++ const s8 *dst = bpf2a32[insn->dst_reg]; +++ const s8 *src = bpf2a32[insn->src_reg]; +++ const s8 *tmp = bpf2a32[TMP_REG_1]; +++ const s8 *tmp2 = bpf2a32[TMP_REG_2]; +++ const s16 off = insn->off; +++ const s32 imm = insn->imm; +++ const int i = insn - ctx->prog->insnsi; +++ const bool is64 = BPF_CLASS(code) == BPF_ALU64; +++ const s8 *rd, *rs; +++ s8 rd_lo, rt, rm, rn; +++ s32 jmp_offset; +++ +++#define check_imm(bits, imm) do { \ +++ if ((imm) >= (1 << ((bits) - 1)) || \ +++ (imm) < -(1 << ((bits) - 1))) { \ +++ pr_info("[%2d] imm=%d(0x%x) out of range\n", \ +++ i, imm, imm); \ +++ return -EINVAL; \ +++ } \ +++} while (0) +++#define check_imm24(imm) check_imm(24, imm) +++ +++ switch (code) { +++ /* ALU operations */ +++ +++ /* dst = src */ +++ case BPF_ALU | BPF_MOV | BPF_K: +++ case BPF_ALU | BPF_MOV | BPF_X: +++ case BPF_ALU64 | BPF_MOV | BPF_K: +++ case BPF_ALU64 | BPF_MOV | BPF_X: +++ switch (BPF_SRC(code)) { +++ case BPF_X: +++ if (imm == 1) { +++ /* Special mov32 for zext */ +++ emit_a32_mov_i(dst_hi, 0, ctx); ++ break; ++ } ++- emit_mov_i(r_scratch, k, ctx); ++- emit_udivmod(r_A, r_A, r_scratch, ctx, BPF_MOD); +++ emit_a32_mov_r64(is64, dst, src, ctx); ++ break; ++- case BPF_ALU | BPF_MOD | BPF_X: ++- update_on_xread(ctx); ++- emit(ARM_CMP_I(r_X, 0), ctx); ++- emit_err_ret(ARM_COND_EQ, ctx); ++- emit_udivmod(r_A, r_A, r_X, ctx, BPF_MOD); ++- break; ++- case BPF_ALU | BPF_OR | BPF_K: ++- /* A |= K */ ++- OP_IMM3(ARM_ORR, r_A, r_A, k, ctx); ++- break; ++- case BPF_ALU | BPF_OR | BPF_X: ++- update_on_xread(ctx); ++- emit(ARM_ORR_R(r_A, r_A, r_X), ctx); ++- break; ++- case BPF_ALU | BPF_XOR | BPF_K: ++- /* A ^= K; */ ++- OP_IMM3(ARM_EOR, r_A, r_A, k, ctx); ++- break; ++- case BPF_ANC | SKF_AD_ALU_XOR_X: ++- case BPF_ALU | BPF_XOR | BPF_X: ++- /* A ^= X */ ++- update_on_xread(ctx); ++- emit(ARM_EOR_R(r_A, r_A, r_X), ctx); ++- break; ++- case BPF_ALU | BPF_AND | BPF_K: ++- /* A &= K */ ++- OP_IMM3(ARM_AND, r_A, r_A, k, ctx); ++- break; ++- case BPF_ALU | BPF_AND | BPF_X: ++- update_on_xread(ctx); ++- emit(ARM_AND_R(r_A, r_A, r_X), ctx); ++- break; ++- case BPF_ALU | BPF_LSH | BPF_K: ++- if (unlikely(k > 31)) ++- return -1; ++- emit(ARM_LSL_I(r_A, r_A, k), ctx); ++- break; ++- case BPF_ALU | BPF_LSH | BPF_X: ++- update_on_xread(ctx); ++- emit(ARM_LSL_R(r_A, r_A, r_X), ctx); ++- break; ++- case BPF_ALU | BPF_RSH | BPF_K: ++- if (unlikely(k > 31)) ++- return -1; ++- if (k) ++- emit(ARM_LSR_I(r_A, r_A, k), ctx); ++- break; ++- case BPF_ALU | BPF_RSH | BPF_X: ++- update_on_xread(ctx); ++- emit(ARM_LSR_R(r_A, r_A, r_X), ctx); ++- break; ++- case BPF_ALU | BPF_NEG: ++- /* A = -A */ ++- emit(ARM_RSB_I(r_A, r_A, 0), ctx); ++- break; ++- case BPF_JMP | BPF_JA: ++- /* pc += K */ ++- emit(ARM_B(b_imm(i + k + 1, ctx)), ctx); ++- break; ++- case BPF_JMP | BPF_JEQ | BPF_K: ++- /* pc += (A == K) ? pc->jt : pc->jf */ ++- condt = ARM_COND_EQ; ++- goto cmp_imm; ++- case BPF_JMP | BPF_JGT | BPF_K: ++- /* pc += (A > K) ? pc->jt : pc->jf */ ++- condt = ARM_COND_HI; ++- goto cmp_imm; ++- case BPF_JMP | BPF_JGE | BPF_K: ++- /* pc += (A >= K) ? 
pc->jt : pc->jf */ ++- condt = ARM_COND_HS; ++-cmp_imm: ++- imm12 = imm8m(k); ++- if (imm12 < 0) { ++- emit_mov_i_no8m(r_scratch, k, ctx); ++- emit(ARM_CMP_R(r_A, r_scratch), ctx); ++- } else { ++- emit(ARM_CMP_I(r_A, imm12), ctx); ++- } ++-cond_jump: ++- if (inst->jt) ++- _emit(condt, ARM_B(b_imm(i + inst->jt + 1, ++- ctx)), ctx); ++- if (inst->jf) ++- _emit(condt ^ 1, ARM_B(b_imm(i + inst->jf + 1, ++- ctx)), ctx); ++- break; ++- case BPF_JMP | BPF_JEQ | BPF_X: ++- /* pc += (A == X) ? pc->jt : pc->jf */ ++- condt = ARM_COND_EQ; ++- goto cmp_x; ++- case BPF_JMP | BPF_JGT | BPF_X: ++- /* pc += (A > X) ? pc->jt : pc->jf */ ++- condt = ARM_COND_HI; ++- goto cmp_x; ++- case BPF_JMP | BPF_JGE | BPF_X: ++- /* pc += (A >= X) ? pc->jt : pc->jf */ ++- condt = ARM_COND_CS; ++-cmp_x: ++- update_on_xread(ctx); ++- emit(ARM_CMP_R(r_A, r_X), ctx); ++- goto cond_jump; ++- case BPF_JMP | BPF_JSET | BPF_K: ++- /* pc += (A & K) ? pc->jt : pc->jf */ ++- condt = ARM_COND_NE; ++- /* not set iff all zeroes iff Z==1 iff EQ */ ++- ++- imm12 = imm8m(k); ++- if (imm12 < 0) { ++- emit_mov_i_no8m(r_scratch, k, ctx); ++- emit(ARM_TST_R(r_A, r_scratch), ctx); ++- } else { ++- emit(ARM_TST_I(r_A, imm12), ctx); ++- } ++- goto cond_jump; ++- case BPF_JMP | BPF_JSET | BPF_X: ++- /* pc += (A & X) ? pc->jt : pc->jf */ ++- update_on_xread(ctx); ++- condt = ARM_COND_NE; ++- emit(ARM_TST_R(r_A, r_X), ctx); ++- goto cond_jump; ++- case BPF_RET | BPF_A: ++- emit(ARM_MOV_R(ARM_R0, r_A), ctx); ++- goto b_epilogue; ++- case BPF_RET | BPF_K: ++- if ((k == 0) && (ctx->ret0_fp_idx < 0)) ++- ctx->ret0_fp_idx = i; ++- emit_mov_i(ARM_R0, k, ctx); ++-b_epilogue: ++- if (i != ctx->skf->len - 1) ++- emit(ARM_B(b_imm(prog->len, ctx)), ctx); ++- break; ++- case BPF_MISC | BPF_TAX: ++- /* X = A */ ++- ctx->seen |= SEEN_X; ++- emit(ARM_MOV_R(r_X, r_A), ctx); ++- break; ++- case BPF_MISC | BPF_TXA: ++- /* A = X */ ++- update_on_xread(ctx); ++- emit(ARM_MOV_R(r_A, r_X), ctx); ++- break; ++- case BPF_ANC | SKF_AD_PROTOCOL: ++- /* A = ntohs(skb->protocol) */ ++- ctx->seen |= SEEN_SKB; ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, ++- protocol) != 2); ++- off = offsetof(struct sk_buff, protocol); ++- emit(ARM_LDRH_I(r_scratch, r_skb, off), ctx); ++- emit_swap16(r_A, r_scratch, ctx); ++- break; ++- case BPF_ANC | SKF_AD_CPU: ++- /* r_scratch = current_thread_info() */ ++- OP_IMM3(ARM_BIC, r_scratch, ARM_SP, THREAD_SIZE - 1, ctx); ++- /* A = current_thread_info()->cpu */ ++- BUILD_BUG_ON(FIELD_SIZEOF(struct thread_info, cpu) != 4); ++- off = offsetof(struct thread_info, cpu); ++- emit(ARM_LDR_I(r_A, r_scratch, off), ctx); ++- break; ++- case BPF_ANC | SKF_AD_IFINDEX: ++- case BPF_ANC | SKF_AD_HATYPE: ++- /* A = skb->dev->ifindex */ ++- /* A = skb->dev->type */ ++- ctx->seen |= SEEN_SKB; ++- off = offsetof(struct sk_buff, dev); ++- emit(ARM_LDR_I(r_scratch, r_skb, off), ctx); ++- ++- emit(ARM_CMP_I(r_scratch, 0), ctx); ++- emit_err_ret(ARM_COND_EQ, ctx); ++- ++- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ++- ifindex) != 4); ++- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ++- type) != 2); ++- ++- if (code == (BPF_ANC | SKF_AD_IFINDEX)) { ++- off = offsetof(struct net_device, ifindex); ++- emit(ARM_LDR_I(r_A, r_scratch, off), ctx); ++- } else { ++- /* ++- * offset of field "type" in "struct ++- * net_device" is above what can be ++- * used in the ldrh rd, [rn, #imm] ++- * instruction, so load the offset in ++- * a register and use ldrh rd, [rn, rm] ++- */ ++- off = offsetof(struct net_device, type); ++- emit_mov_i(ARM_R3, off, ctx); ++- 
emit(ARM_LDRH_R(r_A, r_scratch, ARM_R3), ctx); ++- } +++ case BPF_K: +++ /* Sign-extend immediate value to destination reg */ +++ emit_a32_mov_se_i64(is64, dst, imm, ctx); ++ break; ++- case BPF_ANC | SKF_AD_MARK: ++- ctx->seen |= SEEN_SKB; ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); ++- off = offsetof(struct sk_buff, mark); ++- emit(ARM_LDR_I(r_A, r_skb, off), ctx); ++- break; ++- case BPF_ANC | SKF_AD_RXHASH: ++- ctx->seen |= SEEN_SKB; ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); ++- off = offsetof(struct sk_buff, hash); ++- emit(ARM_LDR_I(r_A, r_skb, off), ctx); ++- break; ++- case BPF_ANC | SKF_AD_VLAN_TAG: ++- case BPF_ANC | SKF_AD_VLAN_TAG_PRESENT: ++- ctx->seen |= SEEN_SKB; ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); ++- off = offsetof(struct sk_buff, vlan_tci); ++- emit(ARM_LDRH_I(r_A, r_skb, off), ctx); ++- if (code == (BPF_ANC | SKF_AD_VLAN_TAG)) ++- OP_IMM3(ARM_AND, r_A, r_A, ~VLAN_TAG_PRESENT, ctx); ++- else { ++- OP_IMM3(ARM_LSR, r_A, r_A, 12, ctx); ++- OP_IMM3(ARM_AND, r_A, r_A, 0x1, ctx); ++- } +++ } +++ break; +++ /* dst = dst + src/imm */ +++ /* dst = dst - src/imm */ +++ /* dst = dst | src/imm */ +++ /* dst = dst & src/imm */ +++ /* dst = dst ^ src/imm */ +++ /* dst = dst * src/imm */ +++ /* dst = dst << src */ +++ /* dst = dst >> src */ +++ case BPF_ALU | BPF_ADD | BPF_K: +++ case BPF_ALU | BPF_ADD | BPF_X: +++ case BPF_ALU | BPF_SUB | BPF_K: +++ case BPF_ALU | BPF_SUB | BPF_X: +++ case BPF_ALU | BPF_OR | BPF_K: +++ case BPF_ALU | BPF_OR | BPF_X: +++ case BPF_ALU | BPF_AND | BPF_K: +++ case BPF_ALU | BPF_AND | BPF_X: +++ case BPF_ALU | BPF_XOR | BPF_K: +++ case BPF_ALU | BPF_XOR | BPF_X: +++ case BPF_ALU | BPF_MUL | BPF_K: +++ case BPF_ALU | BPF_MUL | BPF_X: +++ case BPF_ALU | BPF_LSH | BPF_X: +++ case BPF_ALU | BPF_RSH | BPF_X: +++ case BPF_ALU | BPF_ARSH | BPF_K: +++ case BPF_ALU | BPF_ARSH | BPF_X: +++ case BPF_ALU64 | BPF_ADD | BPF_K: +++ case BPF_ALU64 | BPF_ADD | BPF_X: +++ case BPF_ALU64 | BPF_SUB | BPF_K: +++ case BPF_ALU64 | BPF_SUB | BPF_X: +++ case BPF_ALU64 | BPF_OR | BPF_K: +++ case BPF_ALU64 | BPF_OR | BPF_X: +++ case BPF_ALU64 | BPF_AND | BPF_K: +++ case BPF_ALU64 | BPF_AND | BPF_X: +++ case BPF_ALU64 | BPF_XOR | BPF_K: +++ case BPF_ALU64 | BPF_XOR | BPF_X: +++ switch (BPF_SRC(code)) { +++ case BPF_X: +++ emit_a32_alu_r64(is64, dst, src, ctx, BPF_OP(code)); +++ break; +++ case BPF_K: +++ /* Move immediate value to the temporary register +++ * and then do the ALU operation on the temporary +++ * register as this will sign-extend the immediate +++ * value into temporary reg and then it would be +++ * safe to do the operation on it. 
+++ */ +++ emit_a32_mov_se_i64(is64, tmp2, imm, ctx); +++ emit_a32_alu_r64(is64, dst, tmp2, ctx, BPF_OP(code)); ++ break; ++- case BPF_ANC | SKF_AD_PKTTYPE: ++- ctx->seen |= SEEN_SKB; ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, ++- __pkt_type_offset[0]) != 1); ++- off = PKT_TYPE_OFFSET(); ++- emit(ARM_LDRB_I(r_A, r_skb, off), ctx); ++- emit(ARM_AND_I(r_A, r_A, PKT_TYPE_MAX), ctx); ++-#ifdef __BIG_ENDIAN_BITFIELD ++- emit(ARM_LSR_I(r_A, r_A, 5), ctx); ++-#endif +++ } +++ break; +++ /* dst = dst / src(imm) */ +++ /* dst = dst % src(imm) */ +++ case BPF_ALU | BPF_DIV | BPF_K: +++ case BPF_ALU | BPF_DIV | BPF_X: +++ case BPF_ALU | BPF_MOD | BPF_K: +++ case BPF_ALU | BPF_MOD | BPF_X: +++ rd_lo = arm_bpf_get_reg32(dst_lo, tmp2[1], ctx); +++ switch (BPF_SRC(code)) { +++ case BPF_X: +++ rt = arm_bpf_get_reg32(src_lo, tmp2[0], ctx); +++ break; +++ case BPF_K: +++ rt = tmp2[0]; +++ emit_a32_mov_i(rt, imm, ctx); ++ break; ++- case BPF_ANC | SKF_AD_QUEUE: ++- ctx->seen |= SEEN_SKB; ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, ++- queue_mapping) != 2); ++- BUILD_BUG_ON(offsetof(struct sk_buff, ++- queue_mapping) > 0xff); ++- off = offsetof(struct sk_buff, queue_mapping); ++- emit(ARM_LDRH_I(r_A, r_skb, off), ctx); ++- break; ++- case BPF_ANC | SKF_AD_PAY_OFFSET: ++- ctx->seen |= SEEN_SKB | SEEN_CALL; ++- ++- emit(ARM_MOV_R(ARM_R0, r_skb), ctx); ++- emit_mov_i(ARM_R3, (unsigned int)skb_get_poff, ctx); ++- emit_blx_r(ARM_R3, ctx); ++- emit(ARM_MOV_R(r_A, ARM_R0), ctx); ++- break; ++- case BPF_LDX | BPF_W | BPF_ABS: ++- /* ++- * load a 32bit word from struct seccomp_data. ++- * seccomp_check_filter() will already have checked ++- * that k is 32bit aligned and lies within the ++- * struct seccomp_data. +++ default: +++ rt = src_lo; +++ break; +++ } +++ emit_udivmod(rd_lo, rd_lo, rt, ctx, BPF_OP(code)); +++ arm_bpf_put_reg32(dst_lo, rd_lo, ctx); +++ if (!ctx->prog->aux->verifier_zext) +++ emit_a32_mov_i(dst_hi, 0, ctx); +++ break; +++ case BPF_ALU64 | BPF_DIV | BPF_K: +++ case BPF_ALU64 | BPF_DIV | BPF_X: +++ case BPF_ALU64 | BPF_MOD | BPF_K: +++ case BPF_ALU64 | BPF_MOD | BPF_X: +++ goto notyet; +++ /* dst = dst >> imm */ +++ /* dst = dst << imm */ +++ case BPF_ALU | BPF_RSH | BPF_K: +++ case BPF_ALU | BPF_LSH | BPF_K: +++ if (unlikely(imm > 31)) +++ return -EINVAL; +++ if (imm) +++ emit_a32_alu_i(dst_lo, imm, ctx, BPF_OP(code)); +++ if (!ctx->prog->aux->verifier_zext) +++ emit_a32_mov_i(dst_hi, 0, ctx); +++ break; +++ /* dst = dst << imm */ +++ case BPF_ALU64 | BPF_LSH | BPF_K: +++ if (unlikely(imm > 63)) +++ return -EINVAL; +++ emit_a32_lsh_i64(dst, imm, ctx); +++ break; +++ /* dst = dst >> imm */ +++ case BPF_ALU64 | BPF_RSH | BPF_K: +++ if (unlikely(imm > 63)) +++ return -EINVAL; +++ emit_a32_rsh_i64(dst, imm, ctx); +++ break; +++ /* dst = dst << src */ +++ case BPF_ALU64 | BPF_LSH | BPF_X: +++ emit_a32_lsh_r64(dst, src, ctx); +++ break; +++ /* dst = dst >> src */ +++ case BPF_ALU64 | BPF_RSH | BPF_X: +++ emit_a32_rsh_r64(dst, src, ctx); +++ break; +++ /* dst = dst >> src (signed) */ +++ case BPF_ALU64 | BPF_ARSH | BPF_X: +++ emit_a32_arsh_r64(dst, src, ctx); +++ break; +++ /* dst = dst >> imm (signed) */ +++ case BPF_ALU64 | BPF_ARSH | BPF_K: +++ if (unlikely(imm > 63)) +++ return -EINVAL; +++ emit_a32_arsh_i64(dst, imm, ctx); +++ break; +++ /* dst = ~dst */ +++ case BPF_ALU | BPF_NEG: +++ emit_a32_alu_i(dst_lo, 0, ctx, BPF_OP(code)); +++ if (!ctx->prog->aux->verifier_zext) +++ emit_a32_mov_i(dst_hi, 0, ctx); +++ break; +++ /* dst = ~dst (64 bit) */ +++ case BPF_ALU64 | BPF_NEG: +++ 
emit_a32_neg64(dst, ctx); +++ break; +++ /* dst = dst * src/imm */ +++ case BPF_ALU64 | BPF_MUL | BPF_X: +++ case BPF_ALU64 | BPF_MUL | BPF_K: +++ switch (BPF_SRC(code)) { +++ case BPF_X: +++ emit_a32_mul_r64(dst, src, ctx); +++ break; +++ case BPF_K: +++ /* Move immediate value to the temporary register +++ * and then do the multiplication on it as this +++ * will sign-extend the immediate value into temp +++ * reg then it would be safe to do the operation +++ * on it. ++ */ ++- ctx->seen |= SEEN_SKB; ++- emit(ARM_LDR_I(r_A, r_skb, k), ctx); +++ emit_a32_mov_se_i64(is64, tmp2, imm, ctx); +++ emit_a32_mul_r64(dst, tmp2, ctx); +++ break; +++ } +++ break; +++ /* dst = htole(dst) */ +++ /* dst = htobe(dst) */ +++ case BPF_ALU | BPF_END | BPF_FROM_LE: +++ case BPF_ALU | BPF_END | BPF_FROM_BE: +++ rd = arm_bpf_get_reg64(dst, tmp, ctx); +++ if (BPF_SRC(code) == BPF_FROM_LE) +++ goto emit_bswap_uxt; +++ switch (imm) { +++ case 16: +++ emit_rev16(rd[1], rd[1], ctx); +++ goto emit_bswap_uxt; +++ case 32: +++ emit_rev32(rd[1], rd[1], ctx); +++ goto emit_bswap_uxt; +++ case 64: +++ emit_rev32(ARM_LR, rd[1], ctx); +++ emit_rev32(rd[1], rd[0], ctx); +++ emit(ARM_MOV_R(rd[0], ARM_LR), ctx); ++ break; ++- default: ++- return -1; ++ } +++ goto exit; +++emit_bswap_uxt: +++ switch (imm) { +++ case 16: +++ /* zero-extend 16 bits into 64 bits */ +++#if __LINUX_ARM_ARCH__ < 6 +++ emit_a32_mov_i(tmp2[1], 0xffff, ctx); +++ emit(ARM_AND_R(rd[1], rd[1], tmp2[1]), ctx); +++#else /* ARMv6+ */ +++ emit(ARM_UXTH(rd[1], rd[1]), ctx); +++#endif +++ if (!ctx->prog->aux->verifier_zext) +++ emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx); +++ break; +++ case 32: +++ /* zero-extend 32 bits into 64 bits */ +++ if (!ctx->prog->aux->verifier_zext) +++ emit(ARM_EOR_R(rd[0], rd[0], rd[0]), ctx); +++ break; +++ case 64: +++ /* nop */ +++ break; +++ } +++exit: +++ arm_bpf_put_reg64(dst, rd, ctx); +++ break; +++ /* dst = imm64 */ +++ case BPF_LD | BPF_IMM | BPF_DW: +++ { +++ u64 val = (u32)imm | (u64)insn[1].imm << 32; ++ ++- if (ctx->flags & FLAG_IMM_OVERFLOW) ++- /* ++- * this instruction generated an overflow when ++- * trying to access the literal pool, so ++- * delegate this filter to the kernel interpreter. 
++- */ ++- return -1; +++ emit_a32_mov_i64(dst, val, ctx); +++ +++ return 1; ++ } +++ /* LDX: dst = *(size *)(src + off) */ +++ case BPF_LDX | BPF_MEM | BPF_W: +++ case BPF_LDX | BPF_MEM | BPF_H: +++ case BPF_LDX | BPF_MEM | BPF_B: +++ case BPF_LDX | BPF_MEM | BPF_DW: +++ rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); +++ emit_ldx_r(dst, rn, off, ctx, BPF_SIZE(code)); +++ break; +++ /* ST: *(size *)(dst + off) = imm */ +++ case BPF_ST | BPF_MEM | BPF_W: +++ case BPF_ST | BPF_MEM | BPF_H: +++ case BPF_ST | BPF_MEM | BPF_B: +++ case BPF_ST | BPF_MEM | BPF_DW: +++ switch (BPF_SIZE(code)) { +++ case BPF_DW: +++ /* Sign-extend immediate value into temp reg */ +++ emit_a32_mov_se_i64(true, tmp2, imm, ctx); +++ break; +++ case BPF_W: +++ case BPF_H: +++ case BPF_B: +++ emit_a32_mov_i(tmp2[1], imm, ctx); +++ break; +++ } +++ emit_str_r(dst_lo, tmp2, off, ctx, BPF_SIZE(code)); +++ break; +++ /* STX XADD: lock *(u32 *)(dst + off) += src */ +++ case BPF_STX | BPF_XADD | BPF_W: +++ /* STX XADD: lock *(u64 *)(dst + off) += src */ +++ case BPF_STX | BPF_XADD | BPF_DW: +++ goto notyet; +++ /* STX: *(size *)(dst + off) = src */ +++ case BPF_STX | BPF_MEM | BPF_W: +++ case BPF_STX | BPF_MEM | BPF_H: +++ case BPF_STX | BPF_MEM | BPF_B: +++ case BPF_STX | BPF_MEM | BPF_DW: +++ rs = arm_bpf_get_reg64(src, tmp2, ctx); +++ emit_str_r(dst_lo, rs, off, ctx, BPF_SIZE(code)); +++ break; +++ /* PC += off if dst == src */ +++ /* PC += off if dst > src */ +++ /* PC += off if dst >= src */ +++ /* PC += off if dst < src */ +++ /* PC += off if dst <= src */ +++ /* PC += off if dst != src */ +++ /* PC += off if dst > src (signed) */ +++ /* PC += off if dst >= src (signed) */ +++ /* PC += off if dst < src (signed) */ +++ /* PC += off if dst <= src (signed) */ +++ /* PC += off if dst & src */ +++ case BPF_JMP | BPF_JEQ | BPF_X: +++ case BPF_JMP | BPF_JGT | BPF_X: +++ case BPF_JMP | BPF_JGE | BPF_X: +++ case BPF_JMP | BPF_JNE | BPF_X: +++ case BPF_JMP | BPF_JSGT | BPF_X: +++ case BPF_JMP | BPF_JSGE | BPF_X: +++ case BPF_JMP | BPF_JSET | BPF_X: +++ case BPF_JMP | BPF_JLE | BPF_X: +++ case BPF_JMP | BPF_JLT | BPF_X: +++ case BPF_JMP | BPF_JSLT | BPF_X: +++ case BPF_JMP | BPF_JSLE | BPF_X: +++ case BPF_JMP32 | BPF_JEQ | BPF_X: +++ case BPF_JMP32 | BPF_JGT | BPF_X: +++ case BPF_JMP32 | BPF_JGE | BPF_X: +++ case BPF_JMP32 | BPF_JNE | BPF_X: +++ case BPF_JMP32 | BPF_JSGT | BPF_X: +++ case BPF_JMP32 | BPF_JSGE | BPF_X: +++ case BPF_JMP32 | BPF_JSET | BPF_X: +++ case BPF_JMP32 | BPF_JLE | BPF_X: +++ case BPF_JMP32 | BPF_JLT | BPF_X: +++ case BPF_JMP32 | BPF_JSLT | BPF_X: +++ case BPF_JMP32 | BPF_JSLE | BPF_X: +++ /* Setup source registers */ +++ rm = arm_bpf_get_reg32(src_hi, tmp2[0], ctx); +++ rn = arm_bpf_get_reg32(src_lo, tmp2[1], ctx); +++ goto go_jmp; +++ /* PC += off if dst == imm */ +++ /* PC += off if dst > imm */ +++ /* PC += off if dst >= imm */ +++ /* PC += off if dst < imm */ +++ /* PC += off if dst <= imm */ +++ /* PC += off if dst != imm */ +++ /* PC += off if dst > imm (signed) */ +++ /* PC += off if dst >= imm (signed) */ +++ /* PC += off if dst < imm (signed) */ +++ /* PC += off if dst <= imm (signed) */ +++ /* PC += off if dst & imm */ +++ case BPF_JMP | BPF_JEQ | BPF_K: +++ case BPF_JMP | BPF_JGT | BPF_K: +++ case BPF_JMP | BPF_JGE | BPF_K: +++ case BPF_JMP | BPF_JNE | BPF_K: +++ case BPF_JMP | BPF_JSGT | BPF_K: +++ case BPF_JMP | BPF_JSGE | BPF_K: +++ case BPF_JMP | BPF_JSET | BPF_K: +++ case BPF_JMP | BPF_JLT | BPF_K: +++ case BPF_JMP | BPF_JLE | BPF_K: +++ case BPF_JMP | BPF_JSLT | BPF_K: +++ case BPF_JMP | 
BPF_JSLE | BPF_K: +++ case BPF_JMP32 | BPF_JEQ | BPF_K: +++ case BPF_JMP32 | BPF_JGT | BPF_K: +++ case BPF_JMP32 | BPF_JGE | BPF_K: +++ case BPF_JMP32 | BPF_JNE | BPF_K: +++ case BPF_JMP32 | BPF_JSGT | BPF_K: +++ case BPF_JMP32 | BPF_JSGE | BPF_K: +++ case BPF_JMP32 | BPF_JSET | BPF_K: +++ case BPF_JMP32 | BPF_JLT | BPF_K: +++ case BPF_JMP32 | BPF_JLE | BPF_K: +++ case BPF_JMP32 | BPF_JSLT | BPF_K: +++ case BPF_JMP32 | BPF_JSLE | BPF_K: +++ if (off == 0) +++ break; +++ rm = tmp2[0]; +++ rn = tmp2[1]; +++ /* Sign-extend immediate value */ +++ emit_a32_mov_se_i64(true, tmp2, imm, ctx); +++go_jmp: +++ /* Setup destination register */ +++ rd = arm_bpf_get_reg64(dst, tmp, ctx); +++ +++ /* Check for the condition */ +++ emit_ar_r(rd[0], rd[1], rm, rn, ctx, BPF_OP(code), +++ BPF_CLASS(code) == BPF_JMP); +++ +++ /* Setup JUMP instruction */ +++ jmp_offset = bpf2a32_offset(i+off, i, ctx); +++ switch (BPF_OP(code)) { +++ case BPF_JNE: +++ case BPF_JSET: +++ _emit(ARM_COND_NE, ARM_B(jmp_offset), ctx); +++ break; +++ case BPF_JEQ: +++ _emit(ARM_COND_EQ, ARM_B(jmp_offset), ctx); +++ break; +++ case BPF_JGT: +++ _emit(ARM_COND_HI, ARM_B(jmp_offset), ctx); +++ break; +++ case BPF_JGE: +++ _emit(ARM_COND_CS, ARM_B(jmp_offset), ctx); +++ break; +++ case BPF_JSGT: +++ _emit(ARM_COND_LT, ARM_B(jmp_offset), ctx); +++ break; +++ case BPF_JSGE: +++ _emit(ARM_COND_GE, ARM_B(jmp_offset), ctx); +++ break; +++ case BPF_JLE: +++ _emit(ARM_COND_LS, ARM_B(jmp_offset), ctx); +++ break; +++ case BPF_JLT: +++ _emit(ARM_COND_CC, ARM_B(jmp_offset), ctx); +++ break; +++ case BPF_JSLT: +++ _emit(ARM_COND_LT, ARM_B(jmp_offset), ctx); +++ break; +++ case BPF_JSLE: +++ _emit(ARM_COND_GE, ARM_B(jmp_offset), ctx); +++ break; +++ } +++ break; +++ /* JMP OFF */ +++ case BPF_JMP | BPF_JA: +++ { +++ if (off == 0) +++ break; +++ jmp_offset = bpf2a32_offset(i+off, i, ctx); +++ check_imm24(jmp_offset); +++ emit(ARM_B(jmp_offset), ctx); +++ break; +++ } +++ /* tail call */ +++ case BPF_JMP | BPF_TAIL_CALL: +++ if (emit_bpf_tail_call(ctx)) +++ return -EFAULT; +++ break; +++ /* function call */ +++ case BPF_JMP | BPF_CALL: +++ { +++ const s8 *r0 = bpf2a32[BPF_REG_0]; +++ const s8 *r1 = bpf2a32[BPF_REG_1]; +++ const s8 *r2 = bpf2a32[BPF_REG_2]; +++ const s8 *r3 = bpf2a32[BPF_REG_3]; +++ const s8 *r4 = bpf2a32[BPF_REG_4]; +++ const s8 *r5 = bpf2a32[BPF_REG_5]; +++ const u32 func = (u32)__bpf_call_base + (u32)imm; +++ +++ emit_a32_mov_r64(true, r0, r1, ctx); +++ emit_a32_mov_r64(true, r1, r2, ctx); +++ emit_push_r64(r5, ctx); +++ emit_push_r64(r4, ctx); +++ emit_push_r64(r3, ctx); ++ ++- /* compute offsets only during the first pass */ ++- if (ctx->target == NULL) ++- ctx->offsets[i] = ctx->idx * 4; +++ emit_a32_mov_i(tmp[1], func, ctx); +++ emit_blx_r(tmp[1], ctx); ++ +++ emit(ARM_ADD_I(ARM_SP, ARM_SP, imm8m(24)), ctx); // callee clean +++ break; +++ } +++ /* function return */ +++ case BPF_JMP | BPF_EXIT: +++ /* Optimization: when last instruction is EXIT +++ * simply fallthrough to epilogue. +++ */ +++ if (i == ctx->prog->len - 1) +++ break; +++ jmp_offset = epilogue_offset(ctx); +++ check_imm24(jmp_offset); +++ emit(ARM_B(jmp_offset), ctx); +++ break; +++notyet: +++ pr_info_once("*** NOT YET: opcode %02x ***\n", code); +++ return -EFAULT; +++ default: +++ pr_err_once("unknown opcode %02x\n", code); +++ return -EINVAL; +++ } +++ +++ if (ctx->flags & FLAG_IMM_OVERFLOW) +++ /* +++ * this instruction generated an overflow when +++ * trying to access the literal pool, so +++ * delegate this filter to the kernel interpreter. 
+++ */ +++ return -1; ++ return 0; ++ } ++ +++static int build_body(struct jit_ctx *ctx) +++{ +++ const struct bpf_prog *prog = ctx->prog; +++ unsigned int i; +++ +++ for (i = 0; i < prog->len; i++) { +++ const struct bpf_insn *insn = &(prog->insnsi[i]); +++ int ret; +++ +++ ret = build_insn(insn, ctx); +++ +++ /* It's used with loading the 64 bit immediate value. */ +++ if (ret > 0) { +++ i++; +++ if (ctx->target == NULL) +++ ctx->offsets[i] = ctx->idx; +++ continue; +++ } +++ +++ if (ctx->target == NULL) +++ ctx->offsets[i] = ctx->idx; +++ +++ /* If unsuccesfull, return with error code */ +++ if (ret) +++ return ret; +++ } +++ return 0; +++} +++ +++static int validate_code(struct jit_ctx *ctx) +++{ +++ int i; +++ +++ for (i = 0; i < ctx->idx; i++) { +++ if (ctx->target[i] == __opcode_to_mem_arm(ARM_INST_UDF)) +++ return -1; +++ } +++ +++ return 0; +++} ++ ++-void bpf_jit_compile(struct bpf_prog *fp) +++void bpf_jit_compile(struct bpf_prog *prog) ++ { +++ /* Nothing to do here. We support Internal BPF. */ +++} +++ +++bool bpf_jit_needs_zext(void) +++{ +++ return true; +++} +++ +++struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +++{ +++ struct bpf_prog *tmp, *orig_prog = prog; ++ struct bpf_binary_header *header; +++ bool tmp_blinded = false; ++ struct jit_ctx ctx; ++- unsigned tmp_idx; ++- unsigned alloc_size; ++- u8 *target_ptr; +++ unsigned int tmp_idx; +++ unsigned int image_size; +++ u8 *image_ptr; ++ ++- if (!bpf_jit_enable) ++- return; +++ /* If BPF JIT was not enabled then we must fall back to +++ * the interpreter. +++ */ +++ if (!prog->jit_requested) +++ return orig_prog; ++ ++- memset(&ctx, 0, sizeof(ctx)); ++- ctx.skf = fp; ++- ctx.ret0_fp_idx = -1; +++ /* If constant blinding was enabled and we failed during blinding +++ * then we must fall back to the interpreter. Otherwise, we save +++ * the new JITed code. +++ */ +++ tmp = bpf_jit_blind_constants(prog); ++ ++- ctx.offsets = kzalloc(4 * (ctx.skf->len + 1), GFP_KERNEL); ++- if (ctx.offsets == NULL) ++- return; +++ if (IS_ERR(tmp)) +++ return orig_prog; +++ if (tmp != prog) { +++ tmp_blinded = true; +++ prog = tmp; +++ } ++ ++- /* fake pass to fill in the ctx->seen */ ++- if (unlikely(build_body(&ctx))) +++ memset(&ctx, 0, sizeof(ctx)); +++ ctx.prog = prog; +++ ctx.cpu_architecture = cpu_architecture(); +++ +++ /* Not able to allocate memory for offsets[] , then +++ * we must fall back to the interpreter +++ */ +++ ctx.offsets = kcalloc(prog->len, sizeof(int), GFP_KERNEL); +++ if (ctx.offsets == NULL) { +++ prog = orig_prog; ++ goto out; +++ } +++ +++ /* 1) fake pass to find in the length of the JITed code, +++ * to compute ctx->offsets and other context variables +++ * needed to compute final JITed code. +++ * Also, calculate random starting pointer/start of JITed code +++ * which is prefixed by random number of fault instructions. +++ * +++ * If the first pass fails then there is no chance of it +++ * being successful in the second pass, so just fall back +++ * to the interpreter. 
+++ */ +++ if (build_body(&ctx)) { +++ prog = orig_prog; +++ goto out_off; +++ } ++ ++ tmp_idx = ctx.idx; ++ build_prologue(&ctx); ++ ctx.prologue_bytes = (ctx.idx - tmp_idx) * 4; ++ +++ ctx.epilogue_offset = ctx.idx; +++ ++ #if __LINUX_ARM_ARCH__ < 7 ++ tmp_idx = ctx.idx; ++ build_epilogue(&ctx); ++@@ -1020,64 +1941,83 @@ void bpf_jit_compile(struct bpf_prog *fp ++ ++ ctx.idx += ctx.imm_count; ++ if (ctx.imm_count) { ++- ctx.imms = kzalloc(4 * ctx.imm_count, GFP_KERNEL); ++- if (ctx.imms == NULL) ++- goto out; +++ ctx.imms = kcalloc(ctx.imm_count, sizeof(u32), GFP_KERNEL); +++ if (ctx.imms == NULL) { +++ prog = orig_prog; +++ goto out_off; +++ } ++ } ++ #else ++- /* there's nothing after the epilogue on ARMv7 */ +++ /* there's nothing about the epilogue on ARMv7 */ ++ build_epilogue(&ctx); ++ #endif ++- alloc_size = 4 * ctx.idx; ++- header = bpf_jit_binary_alloc(alloc_size, &target_ptr, ++- 4, jit_fill_hole); ++- if (header == NULL) ++- goto out; +++ /* Now we can get the actual image size of the JITed arm code. +++ * Currently, we are not considering the THUMB-2 instructions +++ * for jit, although it can decrease the size of the image. +++ * +++ * As each arm instruction is of length 32bit, we are translating +++ * number of JITed intructions into the size required to store these +++ * JITed code. +++ */ +++ image_size = sizeof(u32) * ctx.idx; ++ ++- ctx.target = (u32 *) target_ptr; +++ /* Now we know the size of the structure to make */ +++ header = bpf_jit_binary_alloc(image_size, &image_ptr, +++ sizeof(u32), jit_fill_hole); +++ /* Not able to allocate memory for the structure then +++ * we must fall back to the interpretation +++ */ +++ if (header == NULL) { +++ prog = orig_prog; +++ goto out_imms; +++ } +++ +++ /* 2.) Actual pass to generate final JIT code */ +++ ctx.target = (u32 *) image_ptr; ++ ctx.idx = 0; ++ ++ build_prologue(&ctx); +++ +++ /* If building the body of the JITed code fails somehow, +++ * we fall back to the interpretation. +++ */ ++ if (build_body(&ctx) < 0) { ++-#if __LINUX_ARM_ARCH__ < 7 ++- if (ctx.imm_count) ++- kfree(ctx.imms); ++-#endif +++ image_ptr = NULL; ++ bpf_jit_binary_free(header); ++- goto out; +++ prog = orig_prog; +++ goto out_imms; ++ } ++ build_epilogue(&ctx); ++ +++ /* 3.) Extra pass to validate JITed Code */ +++ if (validate_code(&ctx)) { +++ image_ptr = NULL; +++ bpf_jit_binary_free(header); +++ prog = orig_prog; +++ goto out_imms; +++ } ++ flush_icache_range((u32)header, (u32)(ctx.target + ctx.idx)); ++ +++ if (bpf_jit_enable > 1) +++ /* there are 2 passes here */ +++ bpf_jit_dump(prog->len, image_size, 2, ctx.target); +++ +++ bpf_jit_binary_lock_ro(header); +++ prog->bpf_func = (void *)ctx.target; +++ prog->jited = 1; +++ prog->jited_len = image_size; +++ +++out_imms: ++ #if __LINUX_ARM_ARCH__ < 7 ++ if (ctx.imm_count) ++ kfree(ctx.imms); ++ #endif ++- ++- if (bpf_jit_enable > 1) ++- /* there are 2 passes here */ ++- bpf_jit_dump(fp->len, alloc_size, 2, ctx.target); ++- ++- set_memory_ro((unsigned long)header, header->pages); ++- fp->bpf_func = (void *)ctx.target; ++- fp->jited = 1; ++-out: +++out_off: ++ kfree(ctx.offsets); ++- return; +++out: +++ if (tmp_blinded) +++ bpf_jit_prog_release_other(prog, prog == orig_prog ? 
+++ tmp : orig_prog); +++ return prog; ++ } ++ ++-void bpf_jit_free(struct bpf_prog *fp) ++-{ ++- unsigned long addr = (unsigned long)fp->bpf_func & PAGE_MASK; ++- struct bpf_binary_header *header = (void *)addr; ++- ++- if (!fp->jited) ++- goto free_filter; ++- ++- set_memory_rw(addr, header->pages); ++- bpf_jit_binary_free(header); ++- ++-free_filter: ++- bpf_prog_unlock_free(fp); ++-} ++--- a/arch/arm/net/bpf_jit_32.h +++++ b/arch/arm/net/bpf_jit_32.h ++@@ -1,16 +1,14 @@ +++/* SPDX-License-Identifier: GPL-2.0-only */ ++ /* ++ * Just-In-Time compiler for BPF filters on 32bit ARM ++ * ++ * Copyright (c) 2011 Mircea Gherzan ++- * ++- * This program is free software; you can redistribute it and/or modify it ++- * under the terms of the GNU General Public License as published by the ++- * Free Software Foundation; version 2 of the License. ++ */ ++ ++ #ifndef PFILTER_OPCODES_ARM_H ++ #define PFILTER_OPCODES_ARM_H ++ +++/* ARM 32bit Registers */ ++ #define ARM_R0 0 ++ #define ARM_R1 1 ++ #define ARM_R2 2 ++@@ -22,40 +20,46 @@ ++ #define ARM_R8 8 ++ #define ARM_R9 9 ++ #define ARM_R10 10 ++-#define ARM_FP 11 ++-#define ARM_IP 12 ++-#define ARM_SP 13 ++-#define ARM_LR 14 ++-#define ARM_PC 15 ++- ++-#define ARM_COND_EQ 0x0 ++-#define ARM_COND_NE 0x1 ++-#define ARM_COND_CS 0x2 +++#define ARM_FP 11 /* Frame Pointer */ +++#define ARM_IP 12 /* Intra-procedure scratch register */ +++#define ARM_SP 13 /* Stack pointer: as load/store base reg */ +++#define ARM_LR 14 /* Link Register */ +++#define ARM_PC 15 /* Program counter */ +++ +++#define ARM_COND_EQ 0x0 /* == */ +++#define ARM_COND_NE 0x1 /* != */ +++#define ARM_COND_CS 0x2 /* unsigned >= */ ++ #define ARM_COND_HS ARM_COND_CS ++-#define ARM_COND_CC 0x3 +++#define ARM_COND_CC 0x3 /* unsigned < */ ++ #define ARM_COND_LO ARM_COND_CC ++-#define ARM_COND_MI 0x4 ++-#define ARM_COND_PL 0x5 ++-#define ARM_COND_VS 0x6 ++-#define ARM_COND_VC 0x7 ++-#define ARM_COND_HI 0x8 ++-#define ARM_COND_LS 0x9 ++-#define ARM_COND_GE 0xa ++-#define ARM_COND_LT 0xb ++-#define ARM_COND_GT 0xc ++-#define ARM_COND_LE 0xd ++-#define ARM_COND_AL 0xe +++#define ARM_COND_MI 0x4 /* < 0 */ +++#define ARM_COND_PL 0x5 /* >= 0 */ +++#define ARM_COND_VS 0x6 /* Signed Overflow */ +++#define ARM_COND_VC 0x7 /* No Signed Overflow */ +++#define ARM_COND_HI 0x8 /* unsigned > */ +++#define ARM_COND_LS 0x9 /* unsigned <= */ +++#define ARM_COND_GE 0xa /* Signed >= */ +++#define ARM_COND_LT 0xb /* Signed < */ +++#define ARM_COND_GT 0xc /* Signed > */ +++#define ARM_COND_LE 0xd /* Signed <= */ +++#define ARM_COND_AL 0xe /* None */ ++ ++ /* register shift types */ ++ #define SRTYPE_LSL 0 ++ #define SRTYPE_LSR 1 ++ #define SRTYPE_ASR 2 ++ #define SRTYPE_ROR 3 +++#define SRTYPE_ASL (SRTYPE_LSL) ++ ++ #define ARM_INST_ADD_R 0x00800000 +++#define ARM_INST_ADDS_R 0x00900000 +++#define ARM_INST_ADC_R 0x00a00000 +++#define ARM_INST_ADC_I 0x02a00000 ++ #define ARM_INST_ADD_I 0x02800000 +++#define ARM_INST_ADDS_I 0x02900000 ++ ++ #define ARM_INST_AND_R 0x00000000 +++#define ARM_INST_ANDS_R 0x00100000 ++ #define ARM_INST_AND_I 0x02000000 ++ ++ #define ARM_INST_BIC_R 0x01c00000 ++@@ -71,13 +75,18 @@ ++ #define ARM_INST_EOR_R 0x00200000 ++ #define ARM_INST_EOR_I 0x02200000 ++ ++-#define ARM_INST_LDRB_I 0x05d00000 +++#define ARM_INST_LDST__U 0x00800000 +++#define ARM_INST_LDST__IMM12 0x00000fff +++#define ARM_INST_LDRB_I 0x05500000 ++ #define ARM_INST_LDRB_R 0x07d00000 ++-#define ARM_INST_LDRH_I 0x01d000b0 +++#define ARM_INST_LDRD_I 0x014000d0 +++#define ARM_INST_LDRH_I 0x015000b0 ++ #define ARM_INST_LDRH_R 
0x019000b0 ++-#define ARM_INST_LDR_I 0x05900000 +++#define ARM_INST_LDR_I 0x05100000 +++#define ARM_INST_LDR_R 0x07900000 ++ ++ #define ARM_INST_LDM 0x08900000 +++#define ARM_INST_LDM_IA 0x08b00000 ++ ++ #define ARM_INST_LSL_I 0x01a00000 ++ #define ARM_INST_LSL_R 0x01a00010 ++@@ -86,6 +95,7 @@ ++ #define ARM_INST_LSR_R 0x01a00030 ++ ++ #define ARM_INST_MOV_R 0x01a00000 +++#define ARM_INST_MOVS_R 0x01b00000 ++ #define ARM_INST_MOV_I 0x03a00000 ++ #define ARM_INST_MOVW 0x03000000 ++ #define ARM_INST_MOVT 0x03400000 ++@@ -96,17 +106,29 @@ ++ #define ARM_INST_PUSH 0x092d0000 ++ ++ #define ARM_INST_ORR_R 0x01800000 +++#define ARM_INST_ORRS_R 0x01900000 ++ #define ARM_INST_ORR_I 0x03800000 ++ ++ #define ARM_INST_REV 0x06bf0f30 ++ #define ARM_INST_REV16 0x06bf0fb0 ++ ++ #define ARM_INST_RSB_I 0x02600000 +++#define ARM_INST_RSBS_I 0x02700000 +++#define ARM_INST_RSC_I 0x02e00000 ++ ++ #define ARM_INST_SUB_R 0x00400000 +++#define ARM_INST_SUBS_R 0x00500000 +++#define ARM_INST_RSB_R 0x00600000 ++ #define ARM_INST_SUB_I 0x02400000 ++- ++-#define ARM_INST_STR_I 0x05800000 +++#define ARM_INST_SUBS_I 0x02500000 +++#define ARM_INST_SBC_I 0x02c00000 +++#define ARM_INST_SBC_R 0x00c00000 +++#define ARM_INST_SBCS_R 0x00d00000 +++ +++#define ARM_INST_STR_I 0x05000000 +++#define ARM_INST_STRB_I 0x05400000 +++#define ARM_INST_STRD_I 0x014000f0 +++#define ARM_INST_STRH_I 0x014000b0 ++ ++ #define ARM_INST_TST_R 0x01100000 ++ #define ARM_INST_TST_I 0x03100000 ++@@ -117,6 +139,8 @@ ++ ++ #define ARM_INST_MLS 0x00600090 ++ +++#define ARM_INST_UXTH 0x06ff0070 +++ ++ /* ++ * Use a suitable undefined instruction to use for ARM/Thumb2 faulting. ++ * We need to be careful not to conflict with those used by other modules ++@@ -135,11 +159,18 @@ ++ #define _AL3_R(op, rd, rn, rm) ((op ## _R) | (rd) << 12 | (rn) << 16 | (rm)) ++ /* immediate */ ++ #define _AL3_I(op, rd, rn, imm) ((op ## _I) | (rd) << 12 | (rn) << 16 | (imm)) +++/* register with register-shift */ +++#define _AL3_SR(inst) (inst | (1 << 4)) ++ ++ #define ARM_ADD_R(rd, rn, rm) _AL3_R(ARM_INST_ADD, rd, rn, rm) +++#define ARM_ADDS_R(rd, rn, rm) _AL3_R(ARM_INST_ADDS, rd, rn, rm) ++ #define ARM_ADD_I(rd, rn, imm) _AL3_I(ARM_INST_ADD, rd, rn, imm) +++#define ARM_ADDS_I(rd, rn, imm) _AL3_I(ARM_INST_ADDS, rd, rn, imm) +++#define ARM_ADC_R(rd, rn, rm) _AL3_R(ARM_INST_ADC, rd, rn, rm) +++#define ARM_ADC_I(rd, rn, imm) _AL3_I(ARM_INST_ADC, rd, rn, imm) ++ ++ #define ARM_AND_R(rd, rn, rm) _AL3_R(ARM_INST_AND, rd, rn, rm) +++#define ARM_ANDS_R(rd, rn, rm) _AL3_R(ARM_INST_ANDS, rd, rn, rm) ++ #define ARM_AND_I(rd, rn, imm) _AL3_I(ARM_INST_AND, rd, rn, imm) ++ ++ #define ARM_BIC_R(rd, rn, rm) _AL3_R(ARM_INST_BIC, rd, rn, rm) ++@@ -155,27 +186,38 @@ ++ #define ARM_EOR_R(rd, rn, rm) _AL3_R(ARM_INST_EOR, rd, rn, rm) ++ #define ARM_EOR_I(rd, rn, imm) _AL3_I(ARM_INST_EOR, rd, rn, imm) ++ ++-#define ARM_LDR_I(rt, rn, off) (ARM_INST_LDR_I | (rt) << 12 | (rn) << 16 \ ++- | (off)) ++-#define ARM_LDRB_I(rt, rn, off) (ARM_INST_LDRB_I | (rt) << 12 | (rn) << 16 \ ++- | (off)) ++-#define ARM_LDRB_R(rt, rn, rm) (ARM_INST_LDRB_R | (rt) << 12 | (rn) << 16 \ +++#define ARM_LDR_R(rt, rn, rm) (ARM_INST_LDR_R | ARM_INST_LDST__U \ +++ | (rt) << 12 | (rn) << 16 \ ++ | (rm)) ++-#define ARM_LDRH_I(rt, rn, off) (ARM_INST_LDRH_I | (rt) << 12 | (rn) << 16 \ ++- | (((off) & 0xf0) << 4) | ((off) & 0xf)) ++-#define ARM_LDRH_R(rt, rn, rm) (ARM_INST_LDRH_R | (rt) << 12 | (rn) << 16 \ +++#define ARM_LDR_R_SI(rt, rn, rm, type, imm) \ +++ (ARM_INST_LDR_R | ARM_INST_LDST__U \ +++ | (rt) << 12 | (rn) << 16 \ +++ | 
(imm) << 7 | (type) << 5 | (rm)) +++#define ARM_LDRB_R(rt, rn, rm) (ARM_INST_LDRB_R | ARM_INST_LDST__U \ +++ | (rt) << 12 | (rn) << 16 \ +++ | (rm)) +++#define ARM_LDRH_R(rt, rn, rm) (ARM_INST_LDRH_R | ARM_INST_LDST__U \ +++ | (rt) << 12 | (rn) << 16 \ ++ | (rm)) ++ ++ #define ARM_LDM(rn, regs) (ARM_INST_LDM | (rn) << 16 | (regs)) +++#define ARM_LDM_IA(rn, regs) (ARM_INST_LDM_IA | (rn) << 16 | (regs)) ++ ++ #define ARM_LSL_R(rd, rn, rm) (_AL3_R(ARM_INST_LSL, rd, 0, rn) | (rm) << 8) ++ #define ARM_LSL_I(rd, rn, imm) (_AL3_I(ARM_INST_LSL, rd, 0, rn) | (imm) << 7) ++ ++ #define ARM_LSR_R(rd, rn, rm) (_AL3_R(ARM_INST_LSR, rd, 0, rn) | (rm) << 8) ++ #define ARM_LSR_I(rd, rn, imm) (_AL3_I(ARM_INST_LSR, rd, 0, rn) | (imm) << 7) +++#define ARM_ASR_R(rd, rn, rm) (_AL3_R(ARM_INST_ASR, rd, 0, rn) | (rm) << 8) +++#define ARM_ASR_I(rd, rn, imm) (_AL3_I(ARM_INST_ASR, rd, 0, rn) | (imm) << 7) ++ ++ #define ARM_MOV_R(rd, rm) _AL3_R(ARM_INST_MOV, rd, 0, rm) +++#define ARM_MOVS_R(rd, rm) _AL3_R(ARM_INST_MOVS, rd, 0, rm) ++ #define ARM_MOV_I(rd, imm) _AL3_I(ARM_INST_MOV, rd, 0, imm) +++#define ARM_MOV_SR(rd, rm, type, rs) \ +++ (_AL3_SR(ARM_MOV_R(rd, rm)) | (type) << 5 | (rs) << 8) +++#define ARM_MOV_SI(rd, rm, type, imm6) \ +++ (ARM_MOV_R(rd, rm) | (type) << 5 | (imm6) << 7) ++ ++ #define ARM_MOVW(rd, imm) \ ++ (ARM_INST_MOVW | ((imm) >> 12) << 16 | (rd) << 12 | ((imm) & 0x0fff)) ++@@ -190,19 +232,31 @@ ++ ++ #define ARM_ORR_R(rd, rn, rm) _AL3_R(ARM_INST_ORR, rd, rn, rm) ++ #define ARM_ORR_I(rd, rn, imm) _AL3_I(ARM_INST_ORR, rd, rn, imm) ++-#define ARM_ORR_S(rd, rn, rm, type, rs) \ ++- (ARM_ORR_R(rd, rn, rm) | (type) << 5 | (rs) << 7) +++#define ARM_ORR_SR(rd, rn, rm, type, rs) \ +++ (_AL3_SR(ARM_ORR_R(rd, rn, rm)) | (type) << 5 | (rs) << 8) +++#define ARM_ORRS_R(rd, rn, rm) _AL3_R(ARM_INST_ORRS, rd, rn, rm) +++#define ARM_ORRS_SR(rd, rn, rm, type, rs) \ +++ (_AL3_SR(ARM_ORRS_R(rd, rn, rm)) | (type) << 5 | (rs) << 8) +++#define ARM_ORR_SI(rd, rn, rm, type, imm6) \ +++ (ARM_ORR_R(rd, rn, rm) | (type) << 5 | (imm6) << 7) +++#define ARM_ORRS_SI(rd, rn, rm, type, imm6) \ +++ (ARM_ORRS_R(rd, rn, rm) | (type) << 5 | (imm6) << 7) ++ ++ #define ARM_REV(rd, rm) (ARM_INST_REV | (rd) << 12 | (rm)) ++ #define ARM_REV16(rd, rm) (ARM_INST_REV16 | (rd) << 12 | (rm)) ++ ++ #define ARM_RSB_I(rd, rn, imm) _AL3_I(ARM_INST_RSB, rd, rn, imm) +++#define ARM_RSBS_I(rd, rn, imm) _AL3_I(ARM_INST_RSBS, rd, rn, imm) +++#define ARM_RSC_I(rd, rn, imm) _AL3_I(ARM_INST_RSC, rd, rn, imm) ++ ++ #define ARM_SUB_R(rd, rn, rm) _AL3_R(ARM_INST_SUB, rd, rn, rm) +++#define ARM_SUBS_R(rd, rn, rm) _AL3_R(ARM_INST_SUBS, rd, rn, rm) +++#define ARM_RSB_R(rd, rn, rm) _AL3_R(ARM_INST_RSB, rd, rn, rm) +++#define ARM_SBC_R(rd, rn, rm) _AL3_R(ARM_INST_SBC, rd, rn, rm) +++#define ARM_SBCS_R(rd, rn, rm) _AL3_R(ARM_INST_SBCS, rd, rn, rm) ++ #define ARM_SUB_I(rd, rn, imm) _AL3_I(ARM_INST_SUB, rd, rn, imm) ++- ++-#define ARM_STR_I(rt, rn, off) (ARM_INST_STR_I | (rt) << 12 | (rn) << 16 \ ++- | (off)) +++#define ARM_SUBS_I(rd, rn, imm) _AL3_I(ARM_INST_SUBS, rd, rn, imm) +++#define ARM_SBC_I(rd, rn, imm) _AL3_I(ARM_INST_SBC, rd, rn, imm) ++ ++ #define ARM_TST_R(rn, rm) _AL3_R(ARM_INST_TST, 0, rn, rm) ++ #define ARM_TST_I(rn, imm) _AL3_I(ARM_INST_TST, 0, rn, imm) ++@@ -214,5 +268,6 @@ ++ ++ #define ARM_MLS(rd, rn, rm, ra) (ARM_INST_MLS | (rd) << 16 | (rn) | (rm) << 8 \ ++ | (ra) << 12) +++#define ARM_UXTH(rd, rm) (ARM_INST_UXTH | (rd) << 12 | (rm)) ++ ++ #endif /* PFILTER_OPCODES_ARM_H */ ++--- a/arch/arm/net/Makefile +++++ b/arch/arm/net/Makefile ++@@ -1,3 +1,4 
@@ +++# SPDX-License-Identifier: GPL-2.0-only ++ # ARM-specific networking code ++ ++ obj-$(CONFIG_BPF_JIT) += bpf_jit_32.o ++--- /dev/null +++++ b/include/linux/bpf-cgroup.h ++@@ -0,0 +1,410 @@ +++/* SPDX-License-Identifier: GPL-2.0 */ +++#ifndef _BPF_CGROUP_H +++#define _BPF_CGROUP_H +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++ +++struct sock; +++struct sockaddr; +++struct cgroup; +++struct sk_buff; +++struct bpf_map; +++struct bpf_prog; +++struct bpf_sock_ops_kern; +++struct bpf_cgroup_storage; +++struct ctl_table; +++struct ctl_table_header; +++ +++#ifdef CONFIG_CGROUP_BPF +++ +++extern struct static_key_false cgroup_bpf_enabled_key; +++#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +++ +++DECLARE_PER_CPU(struct bpf_cgroup_storage*, +++ bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +++ +++#define for_each_cgroup_storage_type(stype) \ +++ for (stype = 0; stype < MAX_BPF_CGROUP_STORAGE_TYPE; stype++) +++ +++struct bpf_cgroup_storage_map; +++ +++struct bpf_storage_buffer { +++ struct rcu_head rcu; +++ char data[0]; +++}; +++ +++struct bpf_cgroup_storage { +++ union { +++ struct bpf_storage_buffer *buf; +++ void __percpu *percpu_buf; +++ }; +++ struct bpf_cgroup_storage_map *map; +++ struct bpf_cgroup_storage_key key; +++ struct list_head list; +++ struct rb_node node; +++ struct rcu_head rcu; +++}; +++ +++struct bpf_prog_list { +++ struct list_head node; +++ struct bpf_prog *prog; +++ struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]; +++}; +++ +++struct bpf_prog_array; +++ +++struct cgroup_bpf { +++ /* array of effective progs in this cgroup */ +++ struct bpf_prog_array __rcu *effective[MAX_BPF_ATTACH_TYPE]; +++ +++ /* attached progs to this cgroup and attach flags +++ * when flags == 0 or BPF_F_ALLOW_OVERRIDE the progs list will +++ * have either zero or one element +++ * when BPF_F_ALLOW_MULTI the list can have up to BPF_CGROUP_MAX_PROGS +++ */ +++ struct list_head progs[MAX_BPF_ATTACH_TYPE]; +++ u32 flags[MAX_BPF_ATTACH_TYPE]; +++ +++ /* temp storage for effective prog array used by prog_attach/detach */ +++ struct bpf_prog_array *inactive; +++ +++ /* reference counter used to detach bpf programs after cgroup removal */ +++ struct percpu_ref refcnt; +++ +++ /* cgroup_bpf is released using a work queue */ +++ struct work_struct release_work; +++}; +++ +++int cgroup_bpf_inherit(struct cgroup *cgrp); +++void cgroup_bpf_offline(struct cgroup *cgrp); +++ +++int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, +++ enum bpf_attach_type type, u32 flags); +++int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, +++ enum bpf_attach_type type); +++int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, +++ union bpf_attr __user *uattr); +++ +++/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */ +++int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, +++ enum bpf_attach_type type, u32 flags); +++int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, +++ enum bpf_attach_type type, u32 flags); +++int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, +++ union bpf_attr __user *uattr); +++ +++int __cgroup_bpf_run_filter_skb(struct sock *sk, +++ struct sk_buff *skb, +++ enum bpf_attach_type type); +++ +++int __cgroup_bpf_run_filter_sk(struct sock *sk, +++ enum bpf_attach_type type); +++ +++int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, +++ struct sockaddr *uaddr, +++ enum bpf_attach_type type, +++ void 
*t_ctx); +++ +++int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, +++ struct bpf_sock_ops_kern *sock_ops, +++ enum bpf_attach_type type); +++ +++int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, +++ short access, enum bpf_attach_type type); +++ +++int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, +++ struct ctl_table *table, int write, +++ void __user *buf, size_t *pcount, +++ loff_t *ppos, void **new_buf, +++ enum bpf_attach_type type); +++ +++int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, +++ int *optname, char __user *optval, +++ int *optlen, char **kernel_optval); +++int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, +++ int optname, char __user *optval, +++ int __user *optlen, int max_optlen, +++ int retval); +++ +++static inline enum bpf_cgroup_storage_type cgroup_storage_type( +++ struct bpf_map *map) +++{ +++ if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) +++ return BPF_CGROUP_STORAGE_PERCPU; +++ +++ return BPF_CGROUP_STORAGE_SHARED; +++} +++ +++static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage +++ *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) +++{ +++ enum bpf_cgroup_storage_type stype; +++ +++ for_each_cgroup_storage_type(stype) +++ this_cpu_write(bpf_cgroup_storage[stype], storage[stype]); +++} +++ +++struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, +++ enum bpf_cgroup_storage_type stype); +++void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); +++void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, +++ struct cgroup *cgroup, +++ enum bpf_attach_type type); +++void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); +++int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); +++void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); +++ +++int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); +++int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, +++ void *value, u64 flags); +++ +++/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. 
*/ +++#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ +++({ \ +++ int __ret = 0; \ +++ if (cgroup_bpf_enabled) \ +++ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ +++ BPF_CGROUP_INET_INGRESS); \ +++ \ +++ __ret; \ +++}) +++ +++#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ +++({ \ +++ int __ret = 0; \ +++ if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ +++ typeof(sk) __sk = sk_to_full_sk(sk); \ +++ if (sk_fullsock(__sk)) \ +++ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ +++ BPF_CGROUP_INET_EGRESS); \ +++ } \ +++ __ret; \ +++}) +++ +++#define BPF_CGROUP_RUN_SK_PROG(sk, type) \ +++({ \ +++ int __ret = 0; \ +++ if (cgroup_bpf_enabled) { \ +++ __ret = __cgroup_bpf_run_filter_sk(sk, type); \ +++ } \ +++ __ret; \ +++}) +++ +++#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) \ +++ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE) +++ +++#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) \ +++ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND) +++ +++#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ +++ BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND) +++ +++#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ +++({ \ +++ int __ret = 0; \ +++ if (cgroup_bpf_enabled) \ +++ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ +++ NULL); \ +++ __ret; \ +++}) +++ +++#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ +++({ \ +++ int __ret = 0; \ +++ if (cgroup_bpf_enabled) { \ +++ lock_sock(sk); \ +++ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ +++ t_ctx); \ +++ release_sock(sk); \ +++ } \ +++ __ret; \ +++}) +++ +++#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) \ +++ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND) +++ +++#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) \ +++ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND) +++ +++#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ +++ sk->sk_prot->pre_connect) +++ +++#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ +++ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) +++ +++#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ +++ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT) +++ +++#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ +++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT, NULL) +++ +++#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ +++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT, NULL) +++ +++#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ +++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_SENDMSG, t_ctx) +++ +++#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ +++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_SENDMSG, t_ctx) +++ +++#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ +++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP4_RECVMSG, NULL) +++ +++#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ +++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_UDP6_RECVMSG, NULL) +++ +++#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ +++({ \ +++ int __ret = 0; \ +++ if (cgroup_bpf_enabled && (sock_ops)->sk) { \ +++ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ +++ if (__sk && sk_fullsock(__sk)) \ +++ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ +++ sock_ops, \ +++ BPF_CGROUP_SOCK_OPS); \ +++ } \ +++ __ret; \ +++}) +++ +++#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ +++({ \ +++ int __ret = 0; \ +++ if (cgroup_bpf_enabled) \ +++ __ret = 
__cgroup_bpf_check_dev_permission(type, major, minor, \ +++ access, \ +++ BPF_CGROUP_DEVICE); \ +++ \ +++ __ret; \ +++}) +++ +++ +++#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos, nbuf) \ +++({ \ +++ int __ret = 0; \ +++ if (cgroup_bpf_enabled) \ +++ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ +++ buf, count, pos, nbuf, \ +++ BPF_CGROUP_SYSCTL); \ +++ __ret; \ +++}) +++ +++#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ +++ kernel_optval) \ +++({ \ +++ int __ret = 0; \ +++ if (cgroup_bpf_enabled) \ +++ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ +++ optname, optval, \ +++ optlen, \ +++ kernel_optval); \ +++ __ret; \ +++}) +++ +++#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ +++({ \ +++ int __ret = 0; \ +++ if (cgroup_bpf_enabled) \ +++ get_user(__ret, optlen); \ +++ __ret; \ +++}) +++ +++#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, optlen, \ +++ max_optlen, retval) \ +++({ \ +++ int __ret = retval; \ +++ if (cgroup_bpf_enabled) \ +++ __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ +++ optname, optval, \ +++ optlen, max_optlen, \ +++ retval); \ +++ __ret; \ +++}) +++ +++int cgroup_bpf_prog_attach(const union bpf_attr *attr, +++ enum bpf_prog_type ptype, struct bpf_prog *prog); +++int cgroup_bpf_prog_detach(const union bpf_attr *attr, +++ enum bpf_prog_type ptype); +++int cgroup_bpf_prog_query(const union bpf_attr *attr, +++ union bpf_attr __user *uattr); +++#else +++ +++struct bpf_prog; +++struct cgroup_bpf {}; +++static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; } +++static inline void cgroup_bpf_offline(struct cgroup *cgrp) {} +++ +++static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr, +++ enum bpf_prog_type ptype, +++ struct bpf_prog *prog) +++{ +++ return -EINVAL; +++} +++ +++static inline int cgroup_bpf_prog_detach(const union bpf_attr *attr, +++ enum bpf_prog_type ptype) +++{ +++ return -EINVAL; +++} +++ +++static inline int cgroup_bpf_prog_query(const union bpf_attr *attr, +++ union bpf_attr __user *uattr) +++{ +++ return -EINVAL; +++} +++ +++static inline void bpf_cgroup_storage_set( +++ struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {} +++static inline int bpf_cgroup_storage_assign(struct bpf_prog *prog, +++ struct bpf_map *map) { return 0; } +++static inline void bpf_cgroup_storage_release(struct bpf_prog *prog, +++ struct bpf_map *map) {} +++static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( +++ struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; } +++static inline void bpf_cgroup_storage_free( +++ struct bpf_cgroup_storage *storage) {} +++static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, +++ void *value) { +++ return 0; +++} +++static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, +++ void *key, void *value, u64 flags) { +++ return 0; +++} +++ +++#define cgroup_bpf_enabled (0) +++#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) +++#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) +++#define 
BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) +++#define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) +++#define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ +++ optlen, max_optlen, retval) ({ retval; }) +++#define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ +++ kernel_optval) ({ 0; }) +++ +++#define for_each_cgroup_storage_type(stype) for (; false; ) +++ +++#endif /* CONFIG_CGROUP_BPF */ +++ +++#endif /* _BPF_CGROUP_H */ ++--- a/include/linux/bpf.h +++++ b/include/linux/bpf.h ++@@ -1,55 +1,183 @@ +++/* SPDX-License-Identifier: GPL-2.0-only */ ++ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com ++- * ++- * This program is free software; you can redistribute it and/or ++- * modify it under the terms of version 2 of the GNU General Public ++- * License as published by the Free Software Foundation. ++ */ ++ #ifndef _LINUX_BPF_H ++ #define _LINUX_BPF_H 1 ++ ++ #include +++ ++ #include ++ #include +++#include +++#include +++#include +++#include +++#include +++#include ++ +++struct bpf_verifier_env; +++struct perf_event; +++struct bpf_prog; ++ struct bpf_map; +++struct sock; +++struct seq_file; +++struct btf; +++struct btf_type; +++ +++extern struct idr btf_idr; +++extern spinlock_t btf_idr_lock; ++ ++ /* map is generic key/value storage optionally accesible by eBPF programs */ ++ struct bpf_map_ops { ++ /* funcs callable from userspace (via syscall) */ +++ int (*map_alloc_check)(union bpf_attr *attr); ++ struct bpf_map *(*map_alloc)(union bpf_attr *attr); ++- void (*map_free)(struct bpf_map *); +++ void (*map_release)(struct bpf_map *map, struct file *map_file); +++ void (*map_free)(struct bpf_map *map); ++ int (*map_get_next_key)(struct bpf_map *map, void *key, void *next_key); +++ void (*map_release_uref)(struct bpf_map *map); +++ void *(*map_lookup_elem_sys_only)(struct bpf_map *map, void *key); ++ ++ /* funcs callable from userspace and from eBPF programs */ ++ void *(*map_lookup_elem)(struct bpf_map *map, void *key); ++ int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); ++ int (*map_delete_elem)(struct bpf_map *map, void *key); +++ int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); +++ int (*map_pop_elem)(struct bpf_map *map, void *value); +++ int (*map_peek_elem)(struct bpf_map *map, void *value); ++ ++ /* funcs called by prog_array and perf_event_array map */ ++- void *(*map_fd_get_ptr) (struct bpf_map *map, int fd); ++- void (*map_fd_put_ptr) (void *ptr); +++ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, +++ int fd); +++ void (*map_fd_put_ptr)(void *ptr); +++ u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); +++ u32 (*map_fd_sys_lookup_elem)(void *ptr); +++ void (*map_seq_show_elem)(struct bpf_map *map, void *key, +++ struct seq_file *m); +++ int (*map_check_btf)(const struct bpf_map *map, +++ 
const struct btf *btf, +++ const struct btf_type *key_type, +++ const struct btf_type *value_type); +++ +++ /* Direct value access helpers. */ +++ int (*map_direct_value_addr)(const struct bpf_map *map, +++ u64 *imm, u32 off); +++ int (*map_direct_value_meta)(const struct bpf_map *map, +++ u64 imm, u32 *off); +++}; +++ +++struct bpf_map_memory { +++ u32 pages; +++ struct user_struct *user; ++ }; ++ ++ struct bpf_map { ++- atomic_t refcnt; +++ /* The first two cachelines with read-mostly members of which some +++ * are also accessed in fast-path (e.g. ops, max_entries). +++ */ +++ const struct bpf_map_ops *ops ____cacheline_aligned; +++ struct bpf_map *inner_map_meta; +++#ifdef CONFIG_SECURITY +++ void *security; +++#endif ++ enum bpf_map_type map_type; ++ u32 key_size; ++ u32 value_size; ++ u32 max_entries; ++- u32 pages; +++ u32 map_flags; +++ int spin_lock_off; /* >=0 valid offset, <0 error */ +++ u32 id; +++ int numa_node; +++ u32 btf_key_type_id; +++ u32 btf_value_type_id; +++ struct btf *btf; +++ struct bpf_map_memory memory; ++ bool unpriv_array; ++- struct user_struct *user; ++- const struct bpf_map_ops *ops; ++- struct work_struct work; +++ bool frozen; /* write-once */ +++ /* 48 bytes hole */ +++ +++ /* The 3rd and 4th cacheline with misc members to avoid false sharing +++ * particularly with refcounting. +++ */ +++ atomic_t refcnt ____cacheline_aligned; ++ atomic_t usercnt; +++ struct work_struct work; +++ char name[BPF_OBJ_NAME_LEN]; ++ }; ++ ++-struct bpf_map_type_list { ++- struct list_head list_node; ++- const struct bpf_map_ops *ops; ++- enum bpf_map_type type; +++static inline bool map_value_has_spin_lock(const struct bpf_map *map) +++{ +++ return map->spin_lock_off >= 0; +++} +++ +++static inline void check_and_init_map_lock(struct bpf_map *map, void *dst) +++{ +++ if (likely(!map_value_has_spin_lock(map))) +++ return; +++ *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = +++ (struct bpf_spin_lock){}; +++} +++ +++/* copy everything but bpf_spin_lock */ +++static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +++{ +++ if (unlikely(map_value_has_spin_lock(map))) { +++ u32 off = map->spin_lock_off; +++ +++ memcpy(dst, src, off); +++ memcpy(dst + off + sizeof(struct bpf_spin_lock), +++ src + off + sizeof(struct bpf_spin_lock), +++ map->value_size - off - sizeof(struct bpf_spin_lock)); +++ } else { +++ memcpy(dst, src, map->value_size); +++ } +++} +++void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, +++ bool lock_src); +++ +++struct bpf_offload_dev; +++struct bpf_offloaded_map; +++ +++struct bpf_map_dev_ops { +++ int (*map_get_next_key)(struct bpf_offloaded_map *map, +++ void *key, void *next_key); +++ int (*map_lookup_elem)(struct bpf_offloaded_map *map, +++ void *key, void *value); +++ int (*map_update_elem)(struct bpf_offloaded_map *map, +++ void *key, void *value, u64 flags); +++ int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key); ++ }; ++ +++struct bpf_offloaded_map { +++ struct bpf_map map; +++ struct net_device *netdev; +++ const struct bpf_map_dev_ops *dev_ops; +++ void *dev_priv; +++ struct list_head offloads; +++}; +++ +++static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) +++{ +++ return container_of(map, struct bpf_offloaded_map, map); +++} +++ +++static inline bool bpf_map_offload_neutral(const struct bpf_map *map) +++{ +++ return map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; +++} +++ +++static inline bool bpf_map_support_seq_show(const struct bpf_map *map) +++{ +++ return 
map->btf && map->ops->map_seq_show_elem; +++} +++ +++int map_check_no_btf(const struct bpf_map *map, +++ const struct btf *btf, +++ const struct btf_type *key_type, +++ const struct btf_type *value_type); +++ +++extern const struct bpf_map_ops bpf_map_offload_ops; +++ ++ /* function argument constraints */ ++ enum bpf_arg_type { ++ ARG_DONTCARE = 0, /* unused argument in helper function */ ++@@ -60,22 +188,40 @@ enum bpf_arg_type { ++ ARG_CONST_MAP_PTR, /* const argument used as pointer to bpf_map */ ++ ARG_PTR_TO_MAP_KEY, /* pointer to stack used as map key */ ++ ARG_PTR_TO_MAP_VALUE, /* pointer to stack used as map value */ +++ ARG_PTR_TO_UNINIT_MAP_VALUE, /* pointer to valid memory used to store a map value */ +++ ARG_PTR_TO_MAP_VALUE_OR_NULL, /* pointer to stack used as map value or NULL */ ++ ++ /* the following constraints used to prototype bpf_memcmp() and other ++ * functions that access data on eBPF program stack ++ */ ++- ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ ++- ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ +++ ARG_PTR_TO_MEM, /* pointer to valid memory (stack, packet, map value) */ +++ ARG_PTR_TO_MEM_OR_NULL, /* pointer to valid memory or NULL */ +++ ARG_PTR_TO_UNINIT_MEM, /* pointer to memory does not need to be initialized, +++ * helper function must fill all bytes or clear +++ * them in error case. +++ */ +++ +++ ARG_CONST_SIZE, /* number of bytes accessed from memory */ +++ ARG_CONST_SIZE_OR_ZERO, /* number of bytes accessed from memory or 0 */ ++ ++ ARG_PTR_TO_CTX, /* pointer to context */ ++ ARG_ANYTHING, /* any (initialized) argument is ok */ +++ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ +++ ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ +++ ARG_PTR_TO_INT, /* pointer to int */ +++ ARG_PTR_TO_LONG, /* pointer to long */ +++ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ++ }; ++ ++ /* type of values returned from helper functions */ ++ enum bpf_return_type { ++ RET_INTEGER, /* function returns integer */ ++ RET_VOID, /* function doesn't return anything */ +++ RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ ++ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ +++ RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ +++ RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ +++ RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ ++ }; ++ ++ /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs ++@@ -85,6 +231,7 @@ enum bpf_return_type { ++ struct bpf_func_proto { ++ u64 (*func)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); ++ bool gpl_only; +++ bool pkt_access; ++ enum bpf_return_type ret_type; ++ enum bpf_arg_type arg1_type; ++ enum bpf_arg_type arg2_type; ++@@ -104,35 +251,172 @@ enum bpf_access_type { ++ BPF_WRITE = 2 ++ }; ++ ++-struct bpf_prog; +++/* types of values stored in eBPF registers */ +++/* Pointer types represent: +++ * pointer +++ * pointer + imm +++ * pointer + (u16) var +++ * pointer + (u16) var + imm +++ * if (range > 0) then [ptr, ptr + range - off) is safe to access +++ * if (id > 0) means that some 'var' was added +++ * if (off > 0) means that 'imm' was added +++ */ +++enum bpf_reg_type { +++ NOT_INIT = 0, /* nothing was written into register */ +++ SCALAR_VALUE, /* reg doesn't contain a valid pointer */ +++ PTR_TO_CTX, /* reg points to bpf_context */ +++ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ +++ PTR_TO_MAP_VALUE, /* reg points to map 
element value */ +++ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ +++ PTR_TO_STACK, /* reg == frame_pointer + offset */ +++ PTR_TO_PACKET_META, /* skb->data - meta_len */ +++ PTR_TO_PACKET, /* reg points to skb->data */ +++ PTR_TO_PACKET_END, /* skb->data + headlen */ +++ PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ +++ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ +++ PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ +++ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ +++ PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ +++ PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ +++ PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ +++ PTR_TO_TP_BUFFER, /* reg points to a writable raw tp's buffer */ +++ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ +++}; +++ +++/* The information passed from prog-specific *_is_valid_access +++ * back to the verifier. +++ */ +++struct bpf_insn_access_aux { +++ enum bpf_reg_type reg_type; +++ int ctx_field_size; +++}; +++ +++static inline void +++bpf_ctx_record_field_size(struct bpf_insn_access_aux *aux, u32 size) +++{ +++ aux->ctx_field_size = size; +++} +++ +++struct bpf_prog_ops { +++ int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, +++ union bpf_attr __user *uattr); +++}; ++ ++ struct bpf_verifier_ops { ++ /* return eBPF function prototype for verification */ ++- const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id); +++ const struct bpf_func_proto * +++ (*get_func_proto)(enum bpf_func_id func_id, +++ const struct bpf_prog *prog); ++ ++ /* return true if 'size' wide access at offset 'off' within bpf_context ++ * with 'type' (read or write) is allowed ++ */ ++- bool (*is_valid_access)(int off, int size, enum bpf_access_type type); +++ bool (*is_valid_access)(int off, int size, enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info); +++ int (*gen_prologue)(struct bpf_insn *insn, bool direct_write, +++ const struct bpf_prog *prog); +++ int (*gen_ld_abs)(const struct bpf_insn *orig, +++ struct bpf_insn *insn_buf); +++ u32 (*convert_ctx_access)(enum bpf_access_type type, +++ const struct bpf_insn *src, +++ struct bpf_insn *dst, +++ struct bpf_prog *prog, u32 *target_size); +++}; ++ ++- u32 (*convert_ctx_access)(enum bpf_access_type type, int dst_reg, ++- int src_reg, int ctx_off, ++- struct bpf_insn *insn, struct bpf_prog *prog); +++struct bpf_prog_offload_ops { +++ /* verifier basic callbacks */ +++ int (*insn_hook)(struct bpf_verifier_env *env, +++ int insn_idx, int prev_insn_idx); +++ int (*finalize)(struct bpf_verifier_env *env); +++ /* verifier optimization callbacks (called after .finalize) */ +++ int (*replace_insn)(struct bpf_verifier_env *env, u32 off, +++ struct bpf_insn *insn); +++ int (*remove_insns)(struct bpf_verifier_env *env, u32 off, u32 cnt); +++ /* program management callbacks */ +++ int (*prepare)(struct bpf_prog *prog); +++ int (*translate)(struct bpf_prog *prog); +++ void (*destroy)(struct bpf_prog *prog); ++ }; ++ ++-struct bpf_prog_type_list { ++- struct list_head list_node; ++- const struct bpf_verifier_ops *ops; ++- enum bpf_prog_type type; +++struct bpf_prog_offload { +++ struct bpf_prog *prog; +++ struct net_device *netdev; +++ struct bpf_offload_dev *offdev; +++ void *dev_priv; +++ struct list_head offloads; +++ bool dev_state; +++ bool opt_failed; +++ void *jited_image; +++ u32 jited_len; +++}; +++ +++enum bpf_cgroup_storage_type { +++ 
BPF_CGROUP_STORAGE_SHARED, +++ BPF_CGROUP_STORAGE_PERCPU, +++ __BPF_CGROUP_STORAGE_MAX +++}; +++ +++#define MAX_BPF_CGROUP_STORAGE_TYPE __BPF_CGROUP_STORAGE_MAX +++ +++struct bpf_prog_stats { +++ u64 cnt; +++ u64 nsecs; +++ struct u64_stats_sync syncp; ++ }; ++ ++ struct bpf_prog_aux { ++ atomic_t refcnt; ++ u32 used_map_cnt; ++- const struct bpf_verifier_ops *ops; +++ u32 max_ctx_offset; +++ u32 max_pkt_offset; +++ u32 max_tp_access; +++ u32 stack_depth; +++ u32 id; +++ u32 func_cnt; /* used by non-func prog as the number of func progs */ +++ u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ +++ bool verifier_zext; /* Zero extensions has been inserted by verifier. */ +++ bool offload_requested; +++ struct bpf_prog **func; +++ void *jit_data; /* JIT specific data. arch dependent */ +++ struct latch_tree_node ksym_tnode; +++ struct list_head ksym_lnode; +++ const struct bpf_prog_ops *ops; ++ struct bpf_map **used_maps; ++ struct bpf_prog *prog; ++ struct user_struct *user; +++ u64 load_time; /* ns since boottime */ +++ struct bpf_map *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; +++ char name[BPF_OBJ_NAME_LEN]; +++#ifdef CONFIG_SECURITY +++ void *security; +++#endif +++ struct bpf_prog_offload *offload; +++ struct btf *btf; +++ struct bpf_func_info *func_info; +++ /* bpf_line_info loaded from userspace. linfo->insn_off +++ * has the xlated insn offset. +++ * Both the main and sub prog share the same linfo. +++ * The subprog can access its first linfo by +++ * using the linfo_idx. +++ */ +++ struct bpf_line_info *linfo; +++ /* jited_linfo is the jited addr of the linfo. It has a +++ * one to one mapping to linfo: +++ * jited_linfo[i] is the jited addr for the linfo[i]->insn_off. +++ * Both the main and sub prog share the same jited_linfo. +++ * The subprog can access its first jited_linfo by +++ * using the linfo_idx. +++ */ +++ void **jited_linfo; +++ u32 func_info_cnt; +++ u32 nr_linfo; +++ /* subprog can use linfo_idx to access its first linfo and +++ * jited_linfo. +++ * main prog always has linfo_idx == 0 +++ */ +++ u32 linfo_idx; +++ struct bpf_prog_stats __percpu *stats; ++ union { ++ struct work_struct work; ++ struct rcu_head rcu; ++@@ -153,76 +437,688 @@ struct bpf_array { ++ union { ++ char value[0] __aligned(8); ++ void *ptrs[0] __aligned(8); +++ void __percpu *pptrs[0] __aligned(8); ++ }; ++ }; +++ +++#define BPF_COMPLEXITY_LIMIT_INSNS 1000000 /* yes. 1M insns */ ++ #define MAX_TAIL_CALL_CNT 32 ++ ++-u64 bpf_tail_call(u64 ctx, u64 r2, u64 index, u64 r4, u64 r5); ++-void bpf_fd_array_map_clear(struct bpf_map *map); +++#define BPF_F_ACCESS_MASK (BPF_F_RDONLY | \ +++ BPF_F_RDONLY_PROG | \ +++ BPF_F_WRONLY | \ +++ BPF_F_WRONLY_PROG) +++ +++#define BPF_MAP_CAN_READ BIT(0) +++#define BPF_MAP_CAN_WRITE BIT(1) +++ +++static inline u32 bpf_map_flags_to_cap(struct bpf_map *map) +++{ +++ u32 access_flags = map->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); +++ +++ /* Combination of BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG is +++ * not possible. 
+++ */ +++ if (access_flags & BPF_F_RDONLY_PROG) +++ return BPF_MAP_CAN_READ; +++ else if (access_flags & BPF_F_WRONLY_PROG) +++ return BPF_MAP_CAN_WRITE; +++ else +++ return BPF_MAP_CAN_READ | BPF_MAP_CAN_WRITE; +++} +++ +++static inline bool bpf_map_flags_access_ok(u32 access_flags) +++{ +++ return (access_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) != +++ (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG); +++} +++ +++struct bpf_event_entry { +++ struct perf_event *event; +++ struct file *perf_file; +++ struct file *map_file; +++ struct rcu_head rcu; +++}; +++ ++ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); +++int bpf_prog_calc_tag(struct bpf_prog *fp); +++ ++ const struct bpf_func_proto *bpf_get_trace_printk_proto(void); ++ +++typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, +++ unsigned long off, unsigned long len); +++typedef u32 (*bpf_convert_ctx_access_t)(enum bpf_access_type type, +++ const struct bpf_insn *src, +++ struct bpf_insn *dst, +++ struct bpf_prog *prog, +++ u32 *target_size); +++ +++u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, +++ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); +++ +++/* an array of programs to be executed under rcu_lock. +++ * +++ * Typical usage: +++ * ret = BPF_PROG_RUN_ARRAY(&bpf_prog_array, ctx, BPF_PROG_RUN); +++ * +++ * the structure returned by bpf_prog_array_alloc() should be populated +++ * with program pointers and the last pointer must be NULL. +++ * The user has to keep refcnt on the program and make sure the program +++ * is removed from the array before bpf_prog_put(). +++ * The 'struct bpf_prog_array *' should only be replaced with xchg() +++ * since other cpus are walking the array of pointers in parallel. +++ */ +++struct bpf_prog_array_item { +++ struct bpf_prog *prog; +++ struct bpf_cgroup_storage *cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]; +++}; +++ +++struct bpf_prog_array { +++ struct rcu_head rcu; +++ struct bpf_prog_array_item items[0]; +++}; +++ +++struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags); +++void bpf_prog_array_free(struct bpf_prog_array *progs); +++int bpf_prog_array_length(struct bpf_prog_array *progs); +++bool bpf_prog_array_is_empty(struct bpf_prog_array *array); +++int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs, +++ __u32 __user *prog_ids, u32 cnt); +++ +++void bpf_prog_array_delete_safe(struct bpf_prog_array *progs, +++ struct bpf_prog *old_prog); +++int bpf_prog_array_copy_info(struct bpf_prog_array *array, +++ u32 *prog_ids, u32 request_cnt, +++ u32 *prog_cnt); +++int bpf_prog_array_copy(struct bpf_prog_array *old_array, +++ struct bpf_prog *exclude_prog, +++ struct bpf_prog *include_prog, +++ struct bpf_prog_array **new_array); +++ +++#define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage) \ +++ ({ \ +++ struct bpf_prog_array_item *_item; \ +++ struct bpf_prog *_prog; \ +++ struct bpf_prog_array *_array; \ +++ u32 _ret = 1; \ +++ preempt_disable(); \ +++ rcu_read_lock(); \ +++ _array = rcu_dereference(array); \ +++ if (unlikely(check_non_null && !_array))\ +++ goto _out; \ +++ _item = &_array->items[0]; \ +++ while ((_prog = READ_ONCE(_item->prog))) { \ +++ if (set_cg_storage) \ +++ bpf_cgroup_storage_set(_item->cgroup_storage); \ +++ _ret &= func(_prog, ctx); \ +++ _item++; \ +++ } \ +++_out: \ +++ rcu_read_unlock(); \ +++ preempt_enable(); \ +++ _ret; \ +++ }) +++ +++/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs +++ * so BPF programs can 
request cwr for TCP packets. +++ * +++ * Current cgroup skb programs can only return 0 or 1 (0 to drop the +++ * packet. This macro changes the behavior so the low order bit +++ * indicates whether the packet should be dropped (0) or not (1) +++ * and the next bit is a congestion notification bit. This could be +++ * used by TCP to call tcp_enter_cwr() +++ * +++ * Hence, new allowed return values of CGROUP EGRESS BPF programs are: +++ * 0: drop packet +++ * 1: keep packet +++ * 2: drop packet and cn +++ * 3: keep packet and cn +++ * +++ * This macro then converts it to one of the NET_XMIT or an error +++ * code that is then interpreted as drop packet (and no cn): +++ * 0: NET_XMIT_SUCCESS skb should be transmitted +++ * 1: NET_XMIT_DROP skb should be dropped and cn +++ * 2: NET_XMIT_CN skb should be transmitted and cn +++ * 3: -EPERM skb should be dropped +++ */ +++#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ +++ ({ \ +++ struct bpf_prog_array_item *_item; \ +++ struct bpf_prog *_prog; \ +++ struct bpf_prog_array *_array; \ +++ u32 ret; \ +++ u32 _ret = 1; \ +++ u32 _cn = 0; \ +++ preempt_disable(); \ +++ rcu_read_lock(); \ +++ _array = rcu_dereference(array); \ +++ _item = &_array->items[0]; \ +++ while ((_prog = READ_ONCE(_item->prog))) { \ +++ bpf_cgroup_storage_set(_item->cgroup_storage); \ +++ ret = func(_prog, ctx); \ +++ _ret &= (ret & 1); \ +++ _cn |= (ret & 2); \ +++ _item++; \ +++ } \ +++ rcu_read_unlock(); \ +++ preempt_enable(); \ +++ if (_ret) \ +++ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ +++ else \ +++ _ret = (_cn ? NET_XMIT_DROP : -EPERM); \ +++ _ret; \ +++ }) +++ +++#define BPF_PROG_RUN_ARRAY(array, ctx, func) \ +++ __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true) +++ +++#define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func) \ +++ __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false) +++ ++ #ifdef CONFIG_BPF_SYSCALL ++-void bpf_register_prog_type(struct bpf_prog_type_list *tl); ++-void bpf_register_map_type(struct bpf_map_type_list *tl); +++DECLARE_PER_CPU(int, bpf_prog_active); +++ +++extern const struct file_operations bpf_map_fops; +++extern const struct file_operations bpf_prog_fops; +++ +++#define BPF_PROG_TYPE(_id, _name) \ +++ extern const struct bpf_prog_ops _name ## _prog_ops; \ +++ extern const struct bpf_verifier_ops _name ## _verifier_ops; +++#define BPF_MAP_TYPE(_id, _ops) \ +++ extern const struct bpf_map_ops _ops; +++#include +++#undef BPF_PROG_TYPE +++#undef BPF_MAP_TYPE +++ +++extern const struct bpf_prog_ops bpf_offload_prog_ops; +++extern const struct bpf_verifier_ops tc_cls_act_analyzer_ops; +++extern const struct bpf_verifier_ops xdp_analyzer_ops; ++ ++ struct bpf_prog *bpf_prog_get(u32 ufd); ++-struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog); +++struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, +++ bool attach_drv); +++struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, int i); +++void bpf_prog_sub(struct bpf_prog *prog, int i); +++struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog); +++struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); ++ void bpf_prog_put(struct bpf_prog *prog); ++-void bpf_prog_put_rcu(struct bpf_prog *prog); +++int __bpf_prog_charge(struct user_struct *user, u32 pages); +++void __bpf_prog_uncharge(struct user_struct *user, u32 pages); +++ +++void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); +++void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); ++ ++ struct bpf_map *bpf_map_get_with_uref(u32 ufd); ++ 
struct bpf_map *__bpf_map_get(struct fd f); ++-struct bpf_map *bpf_map_inc(struct bpf_map *map, bool uref); +++struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); +++struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map, +++ bool uref); ++ void bpf_map_put_with_uref(struct bpf_map *map); ++ void bpf_map_put(struct bpf_map *map); +++int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); +++void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); +++int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size); +++void bpf_map_charge_finish(struct bpf_map_memory *mem); +++void bpf_map_charge_move(struct bpf_map_memory *dst, +++ struct bpf_map_memory *src); +++void *bpf_map_area_alloc(u64 size, int numa_node); +++void bpf_map_area_free(void *base); +++void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); ++ ++ extern int sysctl_unprivileged_bpf_disabled; ++ ++-int bpf_map_new_fd(struct bpf_map *map); +++int bpf_map_new_fd(struct bpf_map *map, int flags); ++ int bpf_prog_new_fd(struct bpf_prog *prog); ++ ++ int bpf_obj_pin_user(u32 ufd, const char __user *pathname); ++-int bpf_obj_get_user(const char __user *pathname); +++int bpf_obj_get_user(const char __user *pathname, int flags); +++ +++int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value); +++int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value); +++int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, +++ u64 flags); +++int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, +++ u64 flags); +++ +++int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); +++ +++int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, +++ void *key, void *value, u64 map_flags); +++int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); +++int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, +++ void *key, void *value, u64 map_flags); +++int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value); +++ +++int bpf_get_file_flag(int flags); +++int bpf_check_uarg_tail_zero(void __user *uaddr, size_t expected_size, +++ size_t actual_size); +++ +++/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and +++ * forced to use 'long' read/writes to try to atomically copy long counters. +++ * Best-effort only. No barriers here, since it _will_ race with concurrent +++ * updates from BPF programs. Called from bpf syscall and mostly used with +++ * size 8 or 16 bytes, so ask compiler to inline it. 
+++ */ +++static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) +++{ +++ const long *lsrc = src; +++ long *ldst = dst; +++ +++ size /= sizeof(long); +++ while (size--) +++ *ldst++ = *lsrc++; +++} ++ ++ /* verify correctness of eBPF program */ ++-int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); ++-#else ++-static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl) +++int bpf_check(struct bpf_prog **fp, union bpf_attr *attr, +++ union bpf_attr __user *uattr); +++ +++#ifndef CONFIG_BPF_JIT_ALWAYS_ON +++void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); +++#endif +++ +++/* Map specifics */ +++struct xdp_buff; +++struct sk_buff; +++ +++struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +++struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key); +++void __dev_map_flush(struct bpf_map *map); +++int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +++ struct net_device *dev_rx); +++int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, +++ struct bpf_prog *xdp_prog); +++ +++struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); +++void __cpu_map_flush(struct bpf_map *map); +++int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, +++ struct net_device *dev_rx); +++ +++/* Return map's numa specified by userspace */ +++static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) ++ { +++ return (attr->map_flags & BPF_F_NUMA_NODE) ? +++ attr->numa_node : NUMA_NO_NODE; ++ } ++ +++struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type); +++int array_map_alloc_check(union bpf_attr *attr); +++ +++int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, +++ union bpf_attr __user *uattr); +++int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, +++ union bpf_attr __user *uattr); +++int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, +++ const union bpf_attr *kattr, +++ union bpf_attr __user *uattr); +++#else /* !CONFIG_BPF_SYSCALL */ ++ static inline struct bpf_prog *bpf_prog_get(u32 ufd) ++ { ++ return ERR_PTR(-EOPNOTSUPP); ++ } ++ +++static inline struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, +++ enum bpf_prog_type type, +++ bool attach_drv) +++{ +++ return ERR_PTR(-EOPNOTSUPP); +++} +++ +++static inline struct bpf_prog * __must_check bpf_prog_add(struct bpf_prog *prog, +++ int i) +++{ +++ return ERR_PTR(-EOPNOTSUPP); +++} +++ +++static inline void bpf_prog_sub(struct bpf_prog *prog, int i) +++{ +++} +++ ++ static inline void bpf_prog_put(struct bpf_prog *prog) ++ { ++ } ++ ++-static inline void bpf_prog_put_rcu(struct bpf_prog *prog) +++static inline struct bpf_prog * __must_check bpf_prog_inc(struct bpf_prog *prog) +++{ +++ return ERR_PTR(-EOPNOTSUPP); +++} +++ +++static inline struct bpf_prog *__must_check +++bpf_prog_inc_not_zero(struct bpf_prog *prog) +++{ +++ return ERR_PTR(-EOPNOTSUPP); +++} +++ +++static inline int __bpf_prog_charge(struct user_struct *user, u32 pages) +++{ +++ return 0; +++} +++ +++static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) +++{ +++} +++ +++static inline int bpf_obj_get_user(const char __user *pathname, int flags) +++{ +++ return -EOPNOTSUPP; +++} +++ +++static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map, +++ u32 key) +++{ +++ return NULL; +++} +++ +++static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, +++ u32 key) 
+++{ +++ return NULL; +++} +++ +++static inline void __dev_map_flush(struct bpf_map *map) +++{ +++} +++ +++struct xdp_buff; +++struct bpf_dtab_netdev; +++ +++static inline +++int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +++ struct net_device *dev_rx) +++{ +++ return 0; +++} +++ +++struct sk_buff; +++ +++static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, +++ struct sk_buff *skb, +++ struct bpf_prog *xdp_prog) +++{ +++ return 0; +++} +++ +++static inline +++struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key) +++{ +++ return NULL; +++} +++ +++static inline void __cpu_map_flush(struct bpf_map *map) +++{ +++} +++ +++static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, +++ struct xdp_buff *xdp, +++ struct net_device *dev_rx) +++{ +++ return 0; +++} +++ +++static inline struct bpf_prog *bpf_prog_get_type_path(const char *name, +++ enum bpf_prog_type type) +++{ +++ return ERR_PTR(-EOPNOTSUPP); +++} +++ +++static inline int bpf_prog_test_run_xdp(struct bpf_prog *prog, +++ const union bpf_attr *kattr, +++ union bpf_attr __user *uattr) +++{ +++ return -ENOTSUPP; +++} +++ +++static inline int bpf_prog_test_run_skb(struct bpf_prog *prog, +++ const union bpf_attr *kattr, +++ union bpf_attr __user *uattr) +++{ +++ return -ENOTSUPP; +++} +++ +++static inline int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, +++ const union bpf_attr *kattr, +++ union bpf_attr __user *uattr) +++{ +++ return -ENOTSUPP; +++} +++#endif /* CONFIG_BPF_SYSCALL */ +++ +++static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, +++ enum bpf_prog_type type) +++{ +++ return bpf_prog_get_type_dev(ufd, type, false); +++} +++ +++bool bpf_prog_get_ok(struct bpf_prog *, enum bpf_prog_type *, bool); +++ +++#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) +++ +++static inline bool bpf_prog_is_dev_bound(const struct bpf_prog_aux *aux) +++{ +++ return aux->offload_requested; +++} +++ +++static inline bool bpf_map_is_dev_bound(struct bpf_map *map) +++{ +++ return false; +++} +++ +++#else +++static inline int bpf_prog_offload_init(struct bpf_prog *prog, +++ union bpf_attr *attr) +++{ +++ return -EOPNOTSUPP; +++} +++ +++static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) +++{ +++ return false; +++} +++ +++static inline bool bpf_map_is_dev_bound(struct bpf_map *map) +++{ +++ return false; +++} +++ +++#endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ +++ +++#if defined(CONFIG_BPF_STREAM_PARSER) +++int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, +++ struct bpf_prog *old, u32 which); +++int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog); +++int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype); +++#else +++static inline int sock_map_prog_update(struct bpf_map *map, +++ struct bpf_prog *prog, +++ struct bpf_prog *old, u32 which) ++ { +++ return -EOPNOTSUPP; +++} +++ +++static inline int sock_map_get_from_fd(const union bpf_attr *attr, +++ struct bpf_prog *prog) +++{ +++ return -EINVAL; +++} +++ +++static inline int sock_map_prog_detach(const union bpf_attr *attr, +++ enum bpf_prog_type ptype) +++{ +++ return -EOPNOTSUPP; +++} +++#endif +++ +++#if defined(CONFIG_XDP_SOCKETS) +++struct xdp_sock; +++struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, u32 key); +++int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, +++ struct xdp_sock *xs); +++void __xsk_map_flush(struct bpf_map *map); +++#else +++struct xdp_sock; +++static inline struct xdp_sock 
*__xsk_map_lookup_elem(struct bpf_map *map, +++ u32 key) +++{ +++ return NULL; +++} +++ +++static inline int __xsk_map_redirect(struct bpf_map *map, struct xdp_buff *xdp, +++ struct xdp_sock *xs) +++{ +++ return -EOPNOTSUPP; +++} +++ +++static inline void __xsk_map_flush(struct bpf_map *map) +++{ +++} +++#endif +++ +++#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) +++void bpf_sk_reuseport_detach(struct sock *sk); +++int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key, +++ void *value); +++int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key, +++ void *value, u64 map_flags); +++#else +++static inline void bpf_sk_reuseport_detach(struct sock *sk) +++{ +++} +++ +++#ifdef CONFIG_BPF_SYSCALL +++static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, +++ void *key, void *value) +++{ +++ return -EOPNOTSUPP; +++} +++ +++static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, +++ void *key, void *value, +++ u64 map_flags) +++{ +++ return -EOPNOTSUPP; ++ } ++ #endif /* CONFIG_BPF_SYSCALL */ +++#endif /* defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL) */ ++ ++ /* verifier prototypes for helper functions called from eBPF programs */ ++ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; ++ extern const struct bpf_func_proto bpf_map_update_elem_proto; ++ extern const struct bpf_func_proto bpf_map_delete_elem_proto; +++extern const struct bpf_func_proto bpf_map_push_elem_proto; +++extern const struct bpf_func_proto bpf_map_pop_elem_proto; +++extern const struct bpf_func_proto bpf_map_peek_elem_proto; ++ ++ extern const struct bpf_func_proto bpf_get_prandom_u32_proto; ++ extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; +++extern const struct bpf_func_proto bpf_get_numa_node_id_proto; ++ extern const struct bpf_func_proto bpf_tail_call_proto; ++ extern const struct bpf_func_proto bpf_ktime_get_ns_proto; ++ extern const struct bpf_func_proto bpf_get_current_pid_tgid_proto; ++ extern const struct bpf_func_proto bpf_get_current_uid_gid_proto; ++ extern const struct bpf_func_proto bpf_get_current_comm_proto; ++-extern const struct bpf_func_proto bpf_skb_vlan_push_proto; ++-extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; +++extern const struct bpf_func_proto bpf_get_stackid_proto; +++extern const struct bpf_func_proto bpf_get_stack_proto; +++extern const struct bpf_func_proto bpf_sock_map_update_proto; +++extern const struct bpf_func_proto bpf_sock_hash_update_proto; +++extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; +++extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; +++extern const struct bpf_func_proto bpf_msg_redirect_map_proto; +++extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; +++extern const struct bpf_func_proto bpf_sk_redirect_map_proto; +++extern const struct bpf_func_proto bpf_spin_lock_proto; +++extern const struct bpf_func_proto bpf_spin_unlock_proto; +++extern const struct bpf_func_proto bpf_get_local_storage_proto; +++extern const struct bpf_func_proto bpf_strtol_proto; +++extern const struct bpf_func_proto bpf_strtoul_proto; +++extern const struct bpf_func_proto bpf_tcp_sock_proto; ++ ++ /* Shared helpers among cBPF and eBPF. 
*/ ++ void bpf_user_rnd_init_once(void); ++ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); ++ +++#if defined(CONFIG_NET) +++bool bpf_sock_common_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ struct bpf_insn_access_aux *info); +++bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, +++ struct bpf_insn_access_aux *info); +++u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, +++ u32 *target_size); +++#else +++static inline bool bpf_sock_common_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; +++} +++static inline bool bpf_sock_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; +++} +++static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, +++ u32 *target_size) +++{ +++ return 0; +++} +++#endif +++ +++#ifdef CONFIG_INET +++bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, +++ struct bpf_insn_access_aux *info); +++ +++u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, +++ u32 *target_size); +++ +++bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type, +++ struct bpf_insn_access_aux *info); +++ +++u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, +++ u32 *target_size); +++#else +++static inline bool bpf_tcp_sock_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; +++} +++ +++static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, +++ u32 *target_size) +++{ +++ return 0; +++} +++static inline bool bpf_xdp_sock_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; +++} +++ +++static inline u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, +++ u32 *target_size) +++{ +++ return 0; +++} +++#endif /* CONFIG_INET */ +++ ++ #endif /* _LINUX_BPF_H */ ++--- /dev/null +++++ b/include/linux/bpf_trace.h ++@@ -0,0 +1,7 @@ +++/* SPDX-License-Identifier: GPL-2.0 */ +++#ifndef __LINUX_BPF_TRACE_H__ +++#define __LINUX_BPF_TRACE_H__ +++ +++#include +++ +++#endif /* __LINUX_BPF_TRACE_H__ */ ++--- /dev/null +++++ b/include/linux/bpf_types.h ++@@ -0,0 +1,44 @@ +++/* SPDX-License-Identifier: GPL-2.0 */ +++/* internal file - do not include directly */ +++ +++#ifdef CONFIG_NET +++BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter) +++BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act) +++BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act) +++BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_in) +++BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_out) +++BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit) +++BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_SEG6LOCAL, lwt_seg6local) +++BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops) +++BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb) +++BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg) 
+++BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector) +++#endif +++#ifdef CONFIG_BPF_EVENTS +++BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe) +++BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint) +++BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event) +++BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint) +++BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, raw_tracepoint_writable) +++#endif +++ +++BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) +++BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) +++BPF_MAP_TYPE(BPF_MAP_TYPE_PROG_ARRAY, prog_array_map_ops) +++BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) +++BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) +++BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) +++BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops) +++BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_PERCPU_HASH, htab_lru_percpu_map_ops) +++BPF_MAP_TYPE(BPF_MAP_TYPE_LPM_TRIE, trie_map_ops) +++#ifdef CONFIG_PERF_EVENTS +++BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_trace_map_ops) +++#endif +++BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) +++BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) +++#ifdef CONFIG_NET +++BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +++BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops) +++#endif +++BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops) +++BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) ++--- /dev/null +++++ b/include/linux/bpf_verifier.h ++@@ -0,0 +1,425 @@ +++/* SPDX-License-Identifier: GPL-2.0-only */ +++/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com +++ */ +++#ifndef _LINUX_BPF_VERIFIER_H +++#define _LINUX_BPF_VERIFIER_H 1 +++ +++#include /* for enum bpf_reg_type */ +++#include /* for MAX_BPF_STACK */ +++#include +++ +++/* Maximum variable offset umax_value permitted when resolving memory accesses. +++ * In practice this is far bigger than any realistic pointer offset; this limit +++ * ensures that umax_value + (int)off + (int)size cannot overflow a u64. +++ */ +++#define BPF_MAX_VAR_OFF (1 << 29) +++/* Maximum variable size permitted for ARG_CONST_SIZE[_OR_ZERO]. This ensures +++ * that converting umax_value to int cannot overflow. +++ */ +++#define BPF_MAX_VAR_SIZ (1 << 29) +++ +++/* Liveness marks, used for registers and spilled-regs (in stack slots). +++ * Read marks propagate upwards until they find a write mark; they record that +++ * "one of this state's descendants read this reg" (and therefore the reg is +++ * relevant for states_equal() checks). +++ * Write marks collect downwards and do not propagate; they record that "the +++ * straight-line code that reached this state (from its parent) wrote this reg" +++ * (and therefore that reads propagated from this state or its descendants +++ * should not propagate to its parent). +++ * A state with a write mark can receive read marks; it just won't propagate +++ * them to its parent, since the write mark is a property, not of the state, +++ * but of the link between it and its parent. See mark_reg_read() and +++ * mark_stack_slot_read() in kernel/bpf/verifier.c. 
+++ */ +++enum bpf_reg_liveness { +++ REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */ +++ REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */ +++ REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */ +++ REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64, +++ REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */ +++ REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */ +++}; +++ +++struct bpf_reg_state { +++ /* Ordering of fields matters. See states_equal() */ +++ enum bpf_reg_type type; +++ union { +++ /* valid when type == PTR_TO_PACKET */ +++ u16 range; +++ +++ /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | +++ * PTR_TO_MAP_VALUE_OR_NULL +++ */ +++ struct bpf_map *map_ptr; +++ +++ /* Max size from any of the above. */ +++ unsigned long raw; +++ }; +++ /* Fixed part of pointer offset, pointer types only */ +++ s32 off; +++ /* For PTR_TO_PACKET, used to find other pointers with the same variable +++ * offset, so they can share range knowledge. +++ * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we +++ * came from, when one is tested for != NULL. +++ * For PTR_TO_SOCKET this is used to share which pointers retain the +++ * same reference to the socket, to determine proper reference freeing. +++ */ +++ u32 id; +++ /* PTR_TO_SOCKET and PTR_TO_TCP_SOCK could be a ptr returned +++ * from a pointer-cast helper, bpf_sk_fullsock() and +++ * bpf_tcp_sock(). +++ * +++ * Consider the following where "sk" is a reference counted +++ * pointer returned from "sk = bpf_sk_lookup_tcp();": +++ * +++ * 1: sk = bpf_sk_lookup_tcp(); +++ * 2: if (!sk) { return 0; } +++ * 3: fullsock = bpf_sk_fullsock(sk); +++ * 4: if (!fullsock) { bpf_sk_release(sk); return 0; } +++ * 5: tp = bpf_tcp_sock(fullsock); +++ * 6: if (!tp) { bpf_sk_release(sk); return 0; } +++ * 7: bpf_sk_release(sk); +++ * 8: snd_cwnd = tp->snd_cwnd; // verifier will complain +++ * +++ * After bpf_sk_release(sk) at line 7, both "fullsock" ptr and +++ * "tp" ptr should be invalidated also. In order to do that, +++ * the reg holding "fullsock" and "sk" need to remember +++ * the original refcounted ptr id (i.e. sk_reg->id) in ref_obj_id +++ * such that the verifier can reset all regs which have +++ * ref_obj_id matching the sk_reg->id. +++ * +++ * sk_reg->ref_obj_id is set to sk_reg->id at line 1. +++ * sk_reg->id will stay as NULL-marking purpose only. +++ * After NULL-marking is done, sk_reg->id can be reset to 0. +++ * +++ * After "fullsock = bpf_sk_fullsock(sk);" at line 3, +++ * fullsock_reg->ref_obj_id is set to sk_reg->ref_obj_id. +++ * +++ * After "tp = bpf_tcp_sock(fullsock);" at line 5, +++ * tp_reg->ref_obj_id is set to fullsock_reg->ref_obj_id +++ * which is the same as sk_reg->ref_obj_id. +++ * +++ * From the verifier perspective, if sk, fullsock and tp +++ * are not NULL, they are the same ptr with different +++ * reg->type. In particular, bpf_sk_release(tp) is also +++ * allowed and has the same effect as bpf_sk_release(sk). +++ */ +++ u32 ref_obj_id; +++ /* For scalar types (SCALAR_VALUE), this represents our knowledge of +++ * the actual value. +++ * For pointer types, this represents the variable part of the offset +++ * from the pointed-to object, and is shared with all bpf_reg_states +++ * with the same id as us. +++ */ +++ struct tnum var_off; +++ /* Used to determine if any memory access using this register will +++ * result in a bad access. 
+++ * These refer to the same value as var_off, not necessarily the actual +++ * contents of the register. +++ */ +++ s64 smin_value; /* minimum possible (s64)value */ +++ s64 smax_value; /* maximum possible (s64)value */ +++ u64 umin_value; /* minimum possible (u64)value */ +++ u64 umax_value; /* maximum possible (u64)value */ +++ /* parentage chain for liveness checking */ +++ struct bpf_reg_state *parent; +++ /* Inside the callee two registers can be both PTR_TO_STACK like +++ * R1=fp-8 and R2=fp-8, but one of them points to this function stack +++ * while another to the caller's stack. To differentiate them 'frameno' +++ * is used which is an index in bpf_verifier_state->frame[] array +++ * pointing to bpf_func_state. +++ */ +++ u32 frameno; +++ /* Tracks subreg definition. The stored value is the insn_idx of the +++ * writing insn. This is safe because subreg_def is used before any insn +++ * patching which only happens after main verification finished. +++ */ +++ s32 subreg_def; +++ enum bpf_reg_liveness live; +++ /* if (!precise && SCALAR_VALUE) min/max/tnum don't affect safety */ +++ bool precise; +++}; +++ +++enum bpf_stack_slot_type { +++ STACK_INVALID, /* nothing was stored in this stack slot */ +++ STACK_SPILL, /* register spilled into stack */ +++ STACK_MISC, /* BPF program wrote some data into this slot */ +++ STACK_ZERO, /* BPF program wrote constant zero */ +++}; +++ +++#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ +++ +++struct bpf_stack_state { +++ struct bpf_reg_state spilled_ptr; +++ u8 slot_type[BPF_REG_SIZE]; +++}; +++ +++struct bpf_reference_state { +++ /* Track each reference created with a unique id, even if the same +++ * instruction creates the reference multiple times (eg, via CALL). +++ */ +++ int id; +++ /* Instruction where the allocation of this reference occurred. This +++ * is used purely to inform the user of a reference leak. +++ */ +++ int insn_idx; +++}; +++ +++/* state of the program: +++ * type of all registers and stack info +++ */ +++struct bpf_func_state { +++ struct bpf_reg_state regs[MAX_BPF_REG]; +++ /* index of call instruction that called into this func */ +++ int callsite; +++ /* stack frame number of this function state from pov of +++ * enclosing bpf_verifier_state. +++ * 0 = main function, 1 = first callee. +++ */ +++ u32 frameno; +++ /* subprog number == index within subprog_stack_depth +++ * zero == main subprog +++ */ +++ u32 subprogno; +++ +++ /* The following fields should be last. See copy_func_state() */ +++ int acquired_refs; +++ struct bpf_reference_state *refs; +++ int allocated_stack; +++ struct bpf_stack_state *stack; +++}; +++ +++struct bpf_idx_pair { +++ u32 prev_idx; +++ u32 idx; +++}; +++ +++#define MAX_CALL_FRAMES 8 +++struct bpf_verifier_state { +++ /* call stack tracking */ +++ struct bpf_func_state *frame[MAX_CALL_FRAMES]; +++ struct bpf_verifier_state *parent; +++ /* +++ * 'branches' field is the number of branches left to explore: +++ * 0 - all possible paths from this state reached bpf_exit or +++ * were safely pruned +++ * 1 - at least one path is being explored. +++ * This state hasn't reached bpf_exit +++ * 2 - at least two paths are being explored. +++ * This state is an immediate parent of two children. +++ * One is fallthrough branch with branches==1 and another +++ * state is pushed into stack (to be explored later) also with +++ * branches==1. The parent of this state has branches==1. 
+++ * The verifier state tree connected via 'parent' pointer looks like: +++ * 1 +++ * 1 +++ * 2 -> 1 (first 'if' pushed into stack) +++ * 1 +++ * 2 -> 1 (second 'if' pushed into stack) +++ * 1 +++ * 1 +++ * 1 bpf_exit. +++ * +++ * Once do_check() reaches bpf_exit, it calls update_branch_counts() +++ * and the verifier state tree will look: +++ * 1 +++ * 1 +++ * 2 -> 1 (first 'if' pushed into stack) +++ * 1 +++ * 1 -> 1 (second 'if' pushed into stack) +++ * 0 +++ * 0 +++ * 0 bpf_exit. +++ * After pop_stack() the do_check() will resume at second 'if'. +++ * +++ * If is_state_visited() sees a state with branches > 0 it means +++ * there is a loop. If such state is exactly equal to the current state +++ * it's an infinite loop. Note states_equal() checks for states +++ * equvalency, so two states being 'states_equal' does not mean +++ * infinite loop. The exact comparison is provided by +++ * states_maybe_looping() function. It's a stronger pre-check and +++ * much faster than states_equal(). +++ * +++ * This algorithm may not find all possible infinite loops or +++ * loop iteration count may be too high. +++ * In such cases BPF_COMPLEXITY_LIMIT_INSNS limit kicks in. +++ */ +++ u32 branches; +++ u32 insn_idx; +++ u32 curframe; +++ u32 active_spin_lock; +++ bool speculative; +++ +++ /* first and last insn idx of this verifier state */ +++ u32 first_insn_idx; +++ u32 last_insn_idx; +++ /* jmp history recorded from first to last. +++ * backtracking is using it to go from last to first. +++ * For most states jmp_history_cnt is [0-3]. +++ * For loops can go up to ~40. +++ */ +++ struct bpf_idx_pair *jmp_history; +++ u32 jmp_history_cnt; +++}; +++ +++#define bpf_get_spilled_reg(slot, frame) \ +++ (((slot < frame->allocated_stack / BPF_REG_SIZE) && \ +++ (frame->stack[slot].slot_type[0] == STACK_SPILL)) \ +++ ? &frame->stack[slot].spilled_ptr : NULL) +++ +++/* Iterate over 'frame', setting 'reg' to either NULL or a spilled register. */ +++#define bpf_for_each_spilled_reg(iter, frame, reg) \ +++ for (iter = 0, reg = bpf_get_spilled_reg(iter, frame); \ +++ iter < frame->allocated_stack / BPF_REG_SIZE; \ +++ iter++, reg = bpf_get_spilled_reg(iter, frame)) +++ +++/* linked list of verifier states used to prune search */ +++struct bpf_verifier_state_list { +++ struct bpf_verifier_state state; +++ struct bpf_verifier_state_list *next; +++ int miss_cnt, hit_cnt; +++}; +++ +++/* Possible states for alu_state member. 
*/ +++#define BPF_ALU_SANITIZE_SRC (1U << 0) +++#define BPF_ALU_SANITIZE_DST (1U << 1) +++#define BPF_ALU_NEG_VALUE (1U << 2) +++#define BPF_ALU_NON_POINTER (1U << 3) +++#define BPF_ALU_IMMEDIATE (1U << 4) +++#define BPF_ALU_SANITIZE (BPF_ALU_SANITIZE_SRC | \ +++ BPF_ALU_SANITIZE_DST) +++ +++struct bpf_insn_aux_data { +++ union { +++ enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ +++ unsigned long map_state; /* pointer/poison value for maps */ +++ s32 call_imm; /* saved imm field of call insn */ +++ u32 alu_limit; /* limit for add/sub register with pointer */ +++ struct { +++ u32 map_index; /* index into used_maps[] */ +++ u32 map_off; /* offset from value base address */ +++ }; +++ }; +++ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ +++ int sanitize_stack_off; /* stack slot to be cleared */ +++ bool seen; /* this insn was processed by the verifier */ +++ bool zext_dst; /* this insn zero extends dst reg */ +++ u8 alu_state; /* used in combination with alu_limit */ +++ bool prune_point; +++ unsigned int orig_idx; /* original instruction index */ +++}; +++ +++#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +++ +++#define BPF_VERIFIER_TMP_LOG_SIZE 1024 +++ +++struct bpf_verifier_log { +++ u32 level; +++ char kbuf[BPF_VERIFIER_TMP_LOG_SIZE]; +++ char __user *ubuf; +++ u32 len_used; +++ u32 len_total; +++}; +++ +++static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log) +++{ +++ return log->len_used >= log->len_total - 1; +++} +++ +++#define BPF_LOG_LEVEL1 1 +++#define BPF_LOG_LEVEL2 2 +++#define BPF_LOG_STATS 4 +++#define BPF_LOG_LEVEL (BPF_LOG_LEVEL1 | BPF_LOG_LEVEL2) +++#define BPF_LOG_MASK (BPF_LOG_LEVEL | BPF_LOG_STATS) +++ +++static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log) +++{ +++ return log->level && log->ubuf && !bpf_verifier_log_full(log); +++} +++ +++#define BPF_MAX_SUBPROGS 256 +++ +++struct bpf_subprog_info { +++ u32 start; /* insn idx of function entry point */ +++ u32 linfo_idx; /* The idx to the main_prog->aux->linfo */ +++ u16 stack_depth; /* max. 
stack depth used by this function */ +++ bool has_tail_call; +++}; +++ +++/* single container for all structs +++ * one verifier_env per bpf_check() call +++ */ +++struct bpf_verifier_env { +++ u32 insn_idx; +++ u32 prev_insn_idx; +++ struct bpf_prog *prog; /* eBPF program being verified */ +++ const struct bpf_verifier_ops *ops; +++ struct bpf_verifier_stack_elem *head; /* stack of verifier states to be processed */ +++ int stack_size; /* number of states to be processed */ +++ bool strict_alignment; /* perform strict pointer alignment checks */ +++ bool test_state_freq; /* test verifier with different pruning frequency */ +++ struct bpf_verifier_state *cur_state; /* current verifier state */ +++ struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ +++ struct bpf_verifier_state_list *free_list; +++ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ +++ u32 used_map_cnt; /* number of used maps */ +++ u32 id_gen; /* used to generate unique reg IDs */ +++ bool allow_ptr_leaks; +++ bool seen_direct_write; +++ struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ +++ const struct bpf_line_info *prev_linfo; +++ struct bpf_verifier_log log; +++ struct bpf_subprog_info subprog_info[BPF_MAX_SUBPROGS + 1]; +++ struct { +++ int *insn_state; +++ int *insn_stack; +++ int cur_stack; +++ } cfg; +++ u32 subprog_cnt; +++ /* number of instructions analyzed by the verifier */ +++ u32 prev_insn_processed, insn_processed; +++ /* number of jmps, calls, exits analyzed so far */ +++ u32 prev_jmps_processed, jmps_processed; +++ /* total verification time */ +++ u64 verification_time; +++ /* maximum number of verifier states kept in 'branching' instructions */ +++ u32 max_states_per_insn; +++ /* total number of allocated verifier states */ +++ u32 total_states; +++ /* some states are freed during program analysis. +++ * this is peak number of states. 
this number dominates kernel +++ * memory consumption during verification +++ */ +++ u32 peak_states; +++ /* longest register parentage chain walked for liveness marking */ +++ u32 longest_mark_read_walk; +++}; +++ +++__printf(2, 0) void bpf_verifier_vlog(struct bpf_verifier_log *log, +++ const char *fmt, va_list args); +++__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, +++ const char *fmt, ...); +++ +++static inline struct bpf_func_state *cur_func(struct bpf_verifier_env *env) +++{ +++ struct bpf_verifier_state *cur = env->cur_state; +++ +++ return cur->frame[cur->curframe]; +++} +++ +++static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) +++{ +++ return cur_func(env)->regs; +++} +++ +++int bpf_prog_offload_verifier_prep(struct bpf_prog *prog); +++int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, +++ int insn_idx, int prev_insn_idx); +++int bpf_prog_offload_finalize(struct bpf_verifier_env *env); +++void +++bpf_prog_offload_replace_insn(struct bpf_verifier_env *env, u32 off, +++ struct bpf_insn *insn); +++void +++bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); +++ +++#endif /* _LINUX_BPF_VERIFIER_H */ ++--- /dev/null +++++ b/include/linux/btf.h ++@@ -0,0 +1,72 @@ +++/* SPDX-License-Identifier: GPL-2.0 */ +++/* Copyright (c) 2018 Facebook */ +++ +++#ifndef _LINUX_BTF_H +++#define _LINUX_BTF_H 1 +++ +++#include +++ +++struct btf; +++struct btf_member; +++struct btf_type; +++union bpf_attr; +++ +++extern const struct file_operations btf_fops; +++ +++void btf_put(struct btf *btf); +++int btf_new_fd(const union bpf_attr *attr); +++struct btf *btf_get_by_fd(int fd); +++int btf_get_info_by_fd(const struct btf *btf, +++ const union bpf_attr *attr, +++ union bpf_attr __user *uattr); +++/* Figure out the size of a type_id. If type_id is a modifier +++ * (e.g. const), it will be resolved to find out the type with size. +++ * +++ * For example: +++ * In describing "const void *", type_id is "const" and "const" +++ * refers to "void *". The return type will be "void *". +++ * +++ * If type_id is a simple "int", then return type will be "int". +++ * +++ * @btf: struct btf object +++ * @type_id: Find out the size of type_id. The type_id of the return +++ * type is set to *type_id. +++ * @ret_size: It can be NULL. If not NULL, the size of the return +++ * type is set to *ret_size. +++ * Return: The btf_type (resolved to another type with size info if needed). +++ * NULL is returned if type_id itself does not have size info +++ * (e.g. void) or it cannot be resolved to another type that +++ * has size info. +++ * *type_id and *ret_size will not be changed in the +++ * NULL return case. 
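cur_func() and cur_regs() above are the accessors verifier code uses to reach the register file of the frame currently being analysed; a small illustrative wrapper (reg_state() here is just an example name):

/* illustrative only: state of one register in the frame under analysis */
static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno)
{
    return &cur_regs(env)[regno];
}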
+++ */ +++const struct btf_type *btf_type_id_size(const struct btf *btf, +++ u32 *type_id, +++ u32 *ret_size); +++void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, +++ struct seq_file *m); +++int btf_get_fd_by_id(u32 id); +++u32 btf_id(const struct btf *btf); +++bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, +++ const struct btf_member *m, +++ u32 expected_offset, u32 expected_size); +++int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); +++bool btf_type_is_void(const struct btf_type *t); +++ +++#ifdef CONFIG_BPF_SYSCALL +++const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); +++const char *btf_name_by_offset(const struct btf *btf, u32 offset); +++#else +++static inline const struct btf_type *btf_type_by_id(const struct btf *btf, +++ u32 type_id) +++{ +++ return NULL; +++} +++static inline const char *btf_name_by_offset(const struct btf *btf, +++ u32 offset) +++{ +++ return NULL; +++} +++#endif +++ +++#endif ++--- a/include/uapi/linux/bpf_common.h +++++ b/include/uapi/linux/bpf_common.h ++@@ -1,3 +1,4 @@ +++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++ #ifndef _UAPI__LINUX_BPF_COMMON_H__ ++ #define _UAPI__LINUX_BPF_COMMON_H__ ++ ++@@ -14,9 +15,10 @@ ++ ++ /* ld/ldx fields */ ++ #define BPF_SIZE(code) ((code) & 0x18) ++-#define BPF_W 0x00 ++-#define BPF_H 0x08 ++-#define BPF_B 0x10 +++#define BPF_W 0x00 /* 32-bit */ +++#define BPF_H 0x08 /* 16-bit */ +++#define BPF_B 0x10 /* 8-bit */ +++/* eBPF BPF_DW 0x18 64-bit */ ++ #define BPF_MODE(code) ((code) & 0xe0) ++ #define BPF_IMM 0x00 ++ #define BPF_ABS 0x20 ++--- a/include/uapi/linux/bpf.h +++++ b/include/uapi/linux/bpf.h ++@@ -1,3 +1,4 @@ +++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ ++ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com ++ * ++ * This program is free software; you can redistribute it and/or ++@@ -13,10 +14,11 @@ ++ /* Extended instruction set based on top of classic BPF */ ++ ++ /* instruction classes */ +++#define BPF_JMP32 0x06 /* jmp mode in word width */ ++ #define BPF_ALU64 0x07 /* alu mode in double word width */ ++ ++ /* ld/ldx fields */ ++-#define BPF_DW 0x18 /* double word */ +++#define BPF_DW 0x18 /* double word (64-bit) */ ++ #define BPF_XADD 0xc0 /* exclusive add */ ++ ++ /* alu/jmp fields */ ++@@ -30,9 +32,14 @@ ++ #define BPF_FROM_LE BPF_TO_LE ++ #define BPF_FROM_BE BPF_TO_BE ++ +++/* jmp encodings */ ++ #define BPF_JNE 0x50 /* jump != */ +++#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +++#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ ++ #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ ++ #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +++#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +++#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ ++ #define BPF_CALL 0x80 /* function call */ ++ #define BPF_EXIT 0x90 /* function return */ ++ ++@@ -63,6 +70,17 @@ struct bpf_insn { ++ __s32 imm; /* signed immediate constant */ ++ }; ++ +++/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry */ +++struct bpf_lpm_trie_key { +++ __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ +++ __u8 data[0]; /* Arbitrary size */ +++}; +++ +++struct bpf_cgroup_storage_key { +++ __u64 cgroup_inode_id; /* cgroup inode id */ +++ __u32 attach_type; /* program attach type */ +++}; +++ ++ /* BPF syscall commands, see bpf(2) man-page for details. 
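A hedged sketch of using btf_type_id_size() as documented above; the surrounding function and variable names are invented, and error handling is reduced to returning -EINVAL:

/* resolve a possibly-modified BTF type to one that carries size information */
static int resolve_btf_size(const struct btf *btf, u32 type_id, u32 *size)
{
    const struct btf_type *t;
    u32 id = type_id;

    t = btf_type_id_size(btf, &id, size);
    if (!t)
        return -EINVAL;    /* e.g. void: no size info and no resolvable target */
    return 0;
}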
*/ ++ enum bpf_cmd { ++ BPF_MAP_CREATE, ++@@ -73,6 +91,22 @@ enum bpf_cmd { ++ BPF_PROG_LOAD, ++ BPF_OBJ_PIN, ++ BPF_OBJ_GET, +++ BPF_PROG_ATTACH, +++ BPF_PROG_DETACH, +++ BPF_PROG_TEST_RUN, +++ BPF_PROG_GET_NEXT_ID, +++ BPF_MAP_GET_NEXT_ID, +++ BPF_PROG_GET_FD_BY_ID, +++ BPF_MAP_GET_FD_BY_ID, +++ BPF_OBJ_GET_INFO_BY_FD, +++ BPF_PROG_QUERY, +++ BPF_RAW_TRACEPOINT_OPEN, +++ BPF_BTF_LOAD, +++ BPF_BTF_GET_FD_BY_ID, +++ BPF_TASK_FD_QUERY, +++ BPF_MAP_LOOKUP_AND_DELETE_ELEM, +++ BPF_MAP_FREEZE, +++ BPF_BTF_GET_NEXT_ID, ++ }; ++ ++ enum bpf_map_type { ++@@ -81,22 +115,256 @@ enum bpf_map_type { ++ BPF_MAP_TYPE_ARRAY, ++ BPF_MAP_TYPE_PROG_ARRAY, ++ BPF_MAP_TYPE_PERF_EVENT_ARRAY, +++ BPF_MAP_TYPE_PERCPU_HASH, +++ BPF_MAP_TYPE_PERCPU_ARRAY, +++ BPF_MAP_TYPE_STACK_TRACE, +++ BPF_MAP_TYPE_CGROUP_ARRAY, +++ BPF_MAP_TYPE_LRU_HASH, +++ BPF_MAP_TYPE_LRU_PERCPU_HASH, +++ BPF_MAP_TYPE_LPM_TRIE, +++ BPF_MAP_TYPE_ARRAY_OF_MAPS, +++ BPF_MAP_TYPE_HASH_OF_MAPS, +++ BPF_MAP_TYPE_DEVMAP, +++ BPF_MAP_TYPE_SOCKMAP, +++ BPF_MAP_TYPE_CPUMAP, +++ BPF_MAP_TYPE_XSKMAP, +++ BPF_MAP_TYPE_SOCKHASH, +++ BPF_MAP_TYPE_CGROUP_STORAGE, +++ BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, +++ BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, +++ BPF_MAP_TYPE_QUEUE, +++ BPF_MAP_TYPE_STACK, +++ BPF_MAP_TYPE_SK_STORAGE, +++ BPF_MAP_TYPE_DEVMAP_HASH, ++ }; ++ +++/* Note that tracing related programs such as +++ * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} +++ * are not subject to a stable API since kernel internal data +++ * structures can change from release to release and may +++ * therefore break existing tracing BPF programs. Tracing BPF +++ * programs correspond to /a/ specific kernel which is to be +++ * analyzed, and not /a/ specific kernel /and/ all future ones. +++ */ ++ enum bpf_prog_type { ++ BPF_PROG_TYPE_UNSPEC, ++ BPF_PROG_TYPE_SOCKET_FILTER, ++ BPF_PROG_TYPE_KPROBE, ++ BPF_PROG_TYPE_SCHED_CLS, ++ BPF_PROG_TYPE_SCHED_ACT, +++ BPF_PROG_TYPE_TRACEPOINT, +++ BPF_PROG_TYPE_XDP, +++ BPF_PROG_TYPE_PERF_EVENT, +++ BPF_PROG_TYPE_CGROUP_SKB, +++ BPF_PROG_TYPE_CGROUP_SOCK, +++ BPF_PROG_TYPE_LWT_IN, +++ BPF_PROG_TYPE_LWT_OUT, +++ BPF_PROG_TYPE_LWT_XMIT, +++ BPF_PROG_TYPE_SOCK_OPS, +++ BPF_PROG_TYPE_SK_SKB, +++ BPF_PROG_TYPE_CGROUP_DEVICE, +++ BPF_PROG_TYPE_SK_MSG, +++ BPF_PROG_TYPE_RAW_TRACEPOINT, +++ BPF_PROG_TYPE_CGROUP_SOCK_ADDR, +++ BPF_PROG_TYPE_LWT_SEG6LOCAL, +++ BPF_PROG_TYPE_LIRC_MODE2, +++ BPF_PROG_TYPE_SK_REUSEPORT, +++ BPF_PROG_TYPE_FLOW_DISSECTOR, +++ BPF_PROG_TYPE_CGROUP_SYSCTL, +++ BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, +++ BPF_PROG_TYPE_CGROUP_SOCKOPT, ++ }; ++ +++enum bpf_attach_type { +++ BPF_CGROUP_INET_INGRESS, +++ BPF_CGROUP_INET_EGRESS, +++ BPF_CGROUP_INET_SOCK_CREATE, +++ BPF_CGROUP_SOCK_OPS, +++ BPF_SK_SKB_STREAM_PARSER, +++ BPF_SK_SKB_STREAM_VERDICT, +++ BPF_CGROUP_DEVICE, +++ BPF_SK_MSG_VERDICT, +++ BPF_CGROUP_INET4_BIND, +++ BPF_CGROUP_INET6_BIND, +++ BPF_CGROUP_INET4_CONNECT, +++ BPF_CGROUP_INET6_CONNECT, +++ BPF_CGROUP_INET4_POST_BIND, +++ BPF_CGROUP_INET6_POST_BIND, +++ BPF_CGROUP_UDP4_SENDMSG, +++ BPF_CGROUP_UDP6_SENDMSG, +++ BPF_LIRC_MODE2, +++ BPF_FLOW_DISSECTOR, +++ BPF_CGROUP_SYSCTL, +++ BPF_CGROUP_UDP4_RECVMSG, +++ BPF_CGROUP_UDP6_RECVMSG, +++ BPF_CGROUP_GETSOCKOPT, +++ BPF_CGROUP_SETSOCKOPT, +++ __MAX_BPF_ATTACH_TYPE +++}; +++ +++#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +++ +++/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command +++ * +++ * NONE(default): No further bpf programs allowed in the subtree. 
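Every command in enum bpf_cmd is multiplexed through the single bpf(2) syscall. A minimal user-space wrapper (not part of this patch) is typically all that is needed to exercise them:

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* thin wrapper around the bpf(2) multiplexer syscall */
static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
    return syscall(__NR_bpf, cmd, attr, size);
}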
+++ * +++ * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, +++ * the program in this cgroup yields to sub-cgroup program. +++ * +++ * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, +++ * that cgroup program gets run in addition to the program in this cgroup. +++ * +++ * Only one program is allowed to be attached to a cgroup with +++ * NONE or BPF_F_ALLOW_OVERRIDE flag. +++ * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will +++ * release old program and attach the new one. Attach flags has to match. +++ * +++ * Multiple programs are allowed to be attached to a cgroup with +++ * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order +++ * (those that were attached first, run first) +++ * The programs of sub-cgroup are executed first, then programs of +++ * this cgroup and then programs of parent cgroup. +++ * When children program makes decision (like picking TCP CA or sock bind) +++ * parent program has a chance to override it. +++ * +++ * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. +++ * A cgroup with NONE doesn't allow any programs in sub-cgroups. +++ * Ex1: +++ * cgrp1 (MULTI progs A, B) -> +++ * cgrp2 (OVERRIDE prog C) -> +++ * cgrp3 (MULTI prog D) -> +++ * cgrp4 (OVERRIDE prog E) -> +++ * cgrp5 (NONE prog F) +++ * the event in cgrp5 triggers execution of F,D,A,B in that order. +++ * if prog F is detached, the execution is E,D,A,B +++ * if prog F and D are detached, the execution is E,A,B +++ * if prog F, E and D are detached, the execution is C,A,B +++ * +++ * All eligible programs are executed regardless of return code from +++ * earlier programs. +++ */ +++#define BPF_F_ALLOW_OVERRIDE (1U << 0) +++#define BPF_F_ALLOW_MULTI (1U << 1) +++ +++/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the +++ * verifier will perform strict alignment checking as if the kernel +++ * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, +++ * and NET_IP_ALIGN defined to 2. +++ */ +++#define BPF_F_STRICT_ALIGNMENT (1U << 0) +++ +++/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROF_LOAD command, the +++ * verifier will allow any alignment whatsoever. On platforms +++ * with strict alignment requirements for loads ands stores (such +++ * as sparc and mips) the verifier validates that all loads and +++ * stores provably follow this requirement. This flag turns that +++ * checking and enforcement off. +++ * +++ * It is mostly used for testing when we want to validate the +++ * context and memory access aspects of the verifier, but because +++ * of an unaligned access the alignment check would trigger before +++ * the one we are interested in. +++ */ +++#define BPF_F_ANY_ALIGNMENT (1U << 1) +++ +++/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose. +++ * Verifier does sub-register def/use analysis and identifies instructions whose +++ * def only matters for low 32-bit, high 32-bit is never referenced later +++ * through implicit zero extension. Therefore verifier notifies JIT back-ends +++ * that it is safe to ignore clearing high 32-bit for these instructions. This +++ * saves some back-ends a lot of code-gen. However such optimization is not +++ * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends +++ * hence hasn't used verifier's analysis result. But, we really want to have a +++ * way to be able to verify the correctness of the described optimization on +++ * x86_64 on which testsuites are frequently exercised. 
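For illustration, attaching an already-loaded program to a cgroup with BPF_F_ALLOW_MULTI goes through BPF_PROG_ATTACH and the anonymous attach struct added to union bpf_attr further down; sys_bpf() is the wrapper sketched earlier and both fds are assumed to be valid:

/* attach an egress program to a cgroup, allowing additional programs below it */
static int attach_cgroup_egress(int cgroup_fd, int prog_fd)
{
    union bpf_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.target_fd     = cgroup_fd;
    attr.attach_bpf_fd = prog_fd;
    attr.attach_type   = BPF_CGROUP_INET_EGRESS;
    attr.attach_flags  = BPF_F_ALLOW_MULTI;

    return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
}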
+++ * +++ * So, this flag is introduced. Once it is set, verifier will randomize high +++ * 32-bit for those instructions who has been identified as safe to ignore them. +++ * Then, if verifier is not doing correct analysis, such randomization will +++ * regress tests to expose bugs. +++ */ +++#define BPF_F_TEST_RND_HI32 (1U << 2) +++ +++/* The verifier internal test flag. Behavior is undefined */ +++#define BPF_F_TEST_STATE_FREQ (1U << 3) +++ +++/* When BPF ldimm64's insn[0].src_reg != 0 then this can have +++ * two extensions: +++ * +++ * insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE +++ * insn[0].imm: map fd map fd +++ * insn[1].imm: 0 offset into value +++ * insn[0].off: 0 0 +++ * insn[1].off: 0 0 +++ * ldimm64 rewrite: address of map address of map[0]+offset +++ * verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE +++ */ ++ #define BPF_PSEUDO_MAP_FD 1 +++#define BPF_PSEUDO_MAP_VALUE 2 +++ +++/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative +++ * offset to another bpf function +++ */ +++#define BPF_PSEUDO_CALL 1 ++ ++ /* flags for BPF_MAP_UPDATE_ELEM command */ ++ #define BPF_ANY 0 /* create new element or update existing */ ++ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ ++ #define BPF_EXIST 2 /* update existing element */ +++#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ +++ +++/* flags for BPF_MAP_CREATE command */ +++#define BPF_F_NO_PREALLOC (1U << 0) +++/* Instead of having one common LRU list in the +++ * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list +++ * which can scale and perform better. +++ * Note, the LRU nodes (including free nodes) cannot be moved +++ * across different LRU lists. +++ */ +++#define BPF_F_NO_COMMON_LRU (1U << 1) +++/* Specify numa node during map creation */ +++#define BPF_F_NUMA_NODE (1U << 2) +++ +++#define BPF_OBJ_NAME_LEN 16U +++ +++/* Flags for accessing BPF object from syscall side. */ +++#define BPF_F_RDONLY (1U << 3) +++#define BPF_F_WRONLY (1U << 4) +++ +++/* Flag for stack_map, store build_id+offset instead of pointer */ +++#define BPF_F_STACK_BUILD_ID (1U << 5) +++ +++/* Zero-initialize hash function seed. This should only be used for testing. */ +++#define BPF_F_ZERO_SEED (1U << 6) +++ +++/* Flags for accessing BPF object from program side. */ +++#define BPF_F_RDONLY_PROG (1U << 7) +++#define BPF_F_WRONLY_PROG (1U << 8) +++ +++/* Clone map from listener for newly accepted socket */ +++#define BPF_F_CLONE (1U << 9) +++ +++/* flags for BPF_PROG_QUERY */ +++#define BPF_F_QUERY_EFFECTIVE (1U << 0) +++ +++enum bpf_stack_build_id_status { +++ /* user space need an empty entry to identify end of a trace */ +++ BPF_STACK_BUILD_ID_EMPTY = 0, +++ /* with valid build_id and offset */ +++ BPF_STACK_BUILD_ID_VALID = 1, +++ /* couldn't get build_id, fallback to ip */ +++ BPF_STACK_BUILD_ID_IP = 2, +++}; +++ +++#define BPF_BUILD_ID_SIZE 20 +++struct bpf_stack_build_id { +++ __s32 status; +++ unsigned char build_id[BPF_BUILD_ID_SIZE]; +++ union { +++ __u64 offset; +++ __u64 ip; +++ }; +++}; ++ ++ union bpf_attr { ++ struct { /* anonymous struct used by BPF_MAP_CREATE command */ ++@@ -104,6 +372,18 @@ union bpf_attr { ++ __u32 key_size; /* size of key in bytes */ ++ __u32 value_size; /* size of value in bytes */ ++ __u32 max_entries; /* max number of entries in a map */ +++ __u32 map_flags; /* BPF_MAP_CREATE related +++ * flags defined above. 
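The ldimm64 table above can be emitted by hand when building programs without a loader library; a hedged sketch (emit_ld_map_fd() is an invented helper, map_fd is assumed valid):

/* load the address of a map into R1 using the two-instruction ldimm64 form */
static void emit_ld_map_fd(struct bpf_insn *insn, int map_fd)
{
    insn[0] = (struct bpf_insn) {
        .code    = BPF_LD | BPF_DW | BPF_IMM,
        .dst_reg = BPF_REG_1,
        .src_reg = BPF_PSEUDO_MAP_FD,
        .imm     = map_fd,
    };
    /* second half of the 64-bit immediate is zero for BPF_PSEUDO_MAP_FD */
    insn[1] = (struct bpf_insn) { .imm = 0 };
}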
+++ */ +++ __u32 inner_map_fd; /* fd pointing to the inner map */ +++ __u32 numa_node; /* numa node (effective only if +++ * BPF_F_NUMA_NODE is set). +++ */ +++ char map_name[BPF_OBJ_NAME_LEN]; +++ __u32 map_ifindex; /* ifindex of netdev to create on */ +++ __u32 btf_fd; /* fd pointing to a BTF type data */ +++ __u32 btf_key_type_id; /* BTF type_id of the key */ +++ __u32 btf_value_type_id; /* BTF type_id of the value */ ++ }; ++ ++ struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ ++@@ -124,154 +404,2568 @@ union bpf_attr { ++ __u32 log_level; /* verbosity level of verifier */ ++ __u32 log_size; /* size of user buffer */ ++ __aligned_u64 log_buf; /* user supplied buffer */ ++- __u32 kern_version; /* checked when prog_type=kprobe */ +++ __u32 kern_version; /* not used */ +++ __u32 prog_flags; +++ char prog_name[BPF_OBJ_NAME_LEN]; +++ __u32 prog_ifindex; /* ifindex of netdev to prep for */ +++ /* For some prog types expected attach type must be known at +++ * load time to verify attach type specific parts of prog +++ * (context accesses, allowed helpers, etc). +++ */ +++ __u32 expected_attach_type; +++ __u32 prog_btf_fd; /* fd pointing to BTF type data */ +++ __u32 func_info_rec_size; /* userspace bpf_func_info size */ +++ __aligned_u64 func_info; /* func info */ +++ __u32 func_info_cnt; /* number of bpf_func_info records */ +++ __u32 line_info_rec_size; /* userspace bpf_line_info size */ +++ __aligned_u64 line_info; /* line info */ +++ __u32 line_info_cnt; /* number of bpf_line_info records */ ++ }; ++ ++ struct { /* anonymous struct used by BPF_OBJ_* commands */ ++ __aligned_u64 pathname; ++ __u32 bpf_fd; +++ __u32 file_flags; +++ }; +++ +++ struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ +++ __u32 target_fd; /* container object to attach to */ +++ __u32 attach_bpf_fd; /* eBPF program to attach */ +++ __u32 attach_type; +++ __u32 attach_flags; +++ }; +++ +++ struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ +++ __u32 prog_fd; +++ __u32 retval; +++ __u32 data_size_in; /* input: len of data_in */ +++ __u32 data_size_out; /* input/output: len of data_out +++ * returns ENOSPC if data_out +++ * is too small. +++ */ +++ __aligned_u64 data_in; +++ __aligned_u64 data_out; +++ __u32 repeat; +++ __u32 duration; +++ __u32 ctx_size_in; /* input: len of ctx_in */ +++ __u32 ctx_size_out; /* input/output: len of ctx_out +++ * returns ENOSPC if ctx_out +++ * is too small. 
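Putting the BPF_MAP_CREATE attributes above together, a user-space sketch that creates a named hash map with preallocation disabled (sys_bpf() as sketched earlier; sizes and the map name are examples):

/* create a small hash map: __u32 key -> __u64 value, no preallocation */
static int create_example_map(void)
{
    union bpf_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.map_type    = BPF_MAP_TYPE_HASH;
    attr.key_size    = sizeof(__u32);
    attr.value_size  = sizeof(__u64);
    attr.max_entries = 1024;
    attr.map_flags   = BPF_F_NO_PREALLOC;
    strncpy(attr.map_name, "example_map", sizeof(attr.map_name) - 1);

    return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
}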
+++ */ +++ __aligned_u64 ctx_in; +++ __aligned_u64 ctx_out; +++ } test; +++ +++ struct { /* anonymous struct used by BPF_*_GET_*_ID */ +++ union { +++ __u32 start_id; +++ __u32 prog_id; +++ __u32 map_id; +++ __u32 btf_id; +++ }; +++ __u32 next_id; +++ __u32 open_flags; ++ }; +++ +++ struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ +++ __u32 bpf_fd; +++ __u32 info_len; +++ __aligned_u64 info; +++ } info; +++ +++ struct { /* anonymous struct used by BPF_PROG_QUERY command */ +++ __u32 target_fd; /* container object to query */ +++ __u32 attach_type; +++ __u32 query_flags; +++ __u32 attach_flags; +++ __aligned_u64 prog_ids; +++ __u32 prog_cnt; +++ } query; +++ +++ struct { +++ __u64 name; +++ __u32 prog_fd; +++ } raw_tracepoint; +++ +++ struct { /* anonymous struct for BPF_BTF_LOAD */ +++ __aligned_u64 btf; +++ __aligned_u64 btf_log_buf; +++ __u32 btf_size; +++ __u32 btf_log_size; +++ __u32 btf_log_level; +++ }; +++ +++ struct { +++ __u32 pid; /* input: pid */ +++ __u32 fd; /* input: fd */ +++ __u32 flags; /* input: flags */ +++ __u32 buf_len; /* input/output: buf len */ +++ __aligned_u64 buf; /* input/output: +++ * tp_name for tracepoint +++ * symbol for kprobe +++ * filename for uprobe +++ */ +++ __u32 prog_id; /* output: prod_id */ +++ __u32 fd_type; /* output: BPF_FD_TYPE_* */ +++ __u64 probe_offset; /* output: probe_offset */ +++ __u64 probe_addr; /* output: probe_addr */ +++ } task_fd_query; ++ } __attribute__((aligned(8))); ++ +++/* The description below is an attempt at providing documentation to eBPF +++ * developers about the multiple available eBPF helper functions. It can be +++ * parsed and used to produce a manual page. The workflow is the following, +++ * and requires the rst2man utility: +++ * +++ * $ ./scripts/bpf_helpers_doc.py \ +++ * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst +++ * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 +++ * $ man /tmp/bpf-helpers.7 +++ * +++ * Note that in order to produce this external documentation, some RST +++ * formatting is used in the descriptions to get "bold" and "italics" in +++ * manual pages. Also note that the few trailing white spaces are +++ * intentional, removing them would break paragraphs for rst2man. +++ * +++ * Start of BPF helper function descriptions: +++ * +++ * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) +++ * Description +++ * Perform a lookup in *map* for an entry associated to *key*. +++ * Return +++ * Map value associated to *key*, or **NULL** if no entry was +++ * found. +++ * +++ * int bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) +++ * Description +++ * Add or update the value of the entry associated to *key* in +++ * *map* with *value*. *flags* is one of: +++ * +++ * **BPF_NOEXIST** +++ * The entry for *key* must not exist in the map. +++ * **BPF_EXIST** +++ * The entry for *key* must already exist in the map. +++ * **BPF_ANY** +++ * No condition on the existence of the entry for *key*. +++ * +++ * Flag value **BPF_NOEXIST** cannot be used for maps of types +++ * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all +++ * elements always exist), the helper would return an error. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_map_delete_elem(struct bpf_map *map, const void *key) +++ * Description +++ * Delete entry with *key* from *map*. +++ * Return +++ * 0 on success, or a negative error in case of failure. 
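The *_GET_NEXT_ID and *_GET_FD_BY_ID commands above are normally used together to enumerate loaded objects; a hedged user-space sketch using the sys_bpf() wrapper from earlier:

/* walk the IDs of all loaded programs and open a file descriptor for each */
static void walk_prog_ids(void)
{
    union bpf_attr attr;
    __u32 id = 0;
    int fd;

    for (;;) {
        memset(&attr, 0, sizeof(attr));
        attr.start_id = id;
        if (sys_bpf(BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)))
            break;                 /* no more programs */
        id = attr.next_id;

        memset(&attr, 0, sizeof(attr));
        attr.prog_id = id;
        fd = sys_bpf(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
        if (fd < 0)
            continue;
        /* e.g. query details with BPF_OBJ_GET_INFO_BY_FD here */
        close(fd);
    }
}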
+++ * +++ * int bpf_probe_read(void *dst, u32 size, const void *src) +++ * Description +++ * For tracing programs, safely attempt to read *size* bytes from +++ * address *src* and store the data in *dst*. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * u64 bpf_ktime_get_ns(void) +++ * Description +++ * Return the time elapsed since system boot, in nanoseconds. +++ * Return +++ * Current *ktime*. +++ * +++ * int bpf_trace_printk(const char *fmt, u32 fmt_size, ...) +++ * Description +++ * This helper is a "printk()-like" facility for debugging. It +++ * prints a message defined by format *fmt* (of size *fmt_size*) +++ * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if +++ * available. It can take up to three additional **u64** +++ * arguments (as an eBPF helpers, the total number of arguments is +++ * limited to five). +++ * +++ * Each time the helper is called, it appends a line to the trace. +++ * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is +++ * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this. +++ * The format of the trace is customizable, and the exact output +++ * one will get depends on the options set in +++ * *\/sys/kernel/debug/tracing/trace_options* (see also the +++ * *README* file under the same directory). However, it usually +++ * defaults to something like: +++ * +++ * :: +++ * +++ * telnet-470 [001] .N.. 419421.045894: 0x00000001: +++ * +++ * In the above: +++ * +++ * * ``telnet`` is the name of the current task. +++ * * ``470`` is the PID of the current task. +++ * * ``001`` is the CPU number on which the task is +++ * running. +++ * * In ``.N..``, each character refers to a set of +++ * options (whether irqs are enabled, scheduling +++ * options, whether hard/softirqs are running, level of +++ * preempt_disabled respectively). **N** means that +++ * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** +++ * are set. +++ * * ``419421.045894`` is a timestamp. +++ * * ``0x00000001`` is a fake value used by BPF for the +++ * instruction pointer register. +++ * * ```` is the message formatted with +++ * *fmt*. +++ * +++ * The conversion specifiers supported by *fmt* are similar, but +++ * more limited than for printk(). They are **%d**, **%i**, +++ * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, +++ * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size +++ * of field, padding with zeroes, etc.) is available, and the +++ * helper will return **-EINVAL** (but print nothing) if it +++ * encounters an unknown specifier. +++ * +++ * Also, note that **bpf_trace_printk**\ () is slow, and should +++ * only be used for debugging purposes. For this reason, a notice +++ * bloc (spanning several lines) is printed to kernel logs and +++ * states that the helper should not be used "for production use" +++ * the first time this helper is used (or more precisely, when +++ * **trace_printk**\ () buffers are allocated). For passing values +++ * to user space, perf events should be preferred. +++ * Return +++ * The number of bytes written to the buffer, or a negative error +++ * in case of failure. +++ * +++ * u32 bpf_get_prandom_u32(void) +++ * Description +++ * Get a pseudo-random number. +++ * +++ * From a security point of view, this helper uses its own +++ * pseudo-random internal state, and cannot be used to infer the +++ * seed of other random functions in the kernel. However, it is +++ * essential to note that the generator used by the helper is not +++ * cryptographically secure. 
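From program context, the map helpers documented above are reached through function pointers fixed to their helper IDs, as in the classic kernel samples; the BPF_FUNC_* constants come from enum bpf_func_id later in this header, and the per-key counter is only an example:

static void *(*bpf_map_lookup_elem)(void *map, const void *key) =
    (void *) BPF_FUNC_map_lookup_elem;
static int (*bpf_map_update_elem)(void *map, const void *key, const void *value,
                                  unsigned long long flags) =
    (void *) BPF_FUNC_map_update_elem;

/* bump a per-key counter, creating the entry on first use */
static void count_event(void *counters, __u32 key)
{
    __u64 one = 1, *val;

    val = bpf_map_lookup_elem(counters, &key);
    if (val)
        __sync_fetch_and_add(val, 1);
    else
        bpf_map_update_elem(counters, &key, &one, BPF_NOEXIST);
}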
+++ * Return +++ * A random 32-bit unsigned value. +++ * +++ * u32 bpf_get_smp_processor_id(void) +++ * Description +++ * Get the SMP (symmetric multiprocessing) processor id. Note that +++ * all programs run with preemption disabled, which means that the +++ * SMP processor id is stable during all the execution of the +++ * program. +++ * Return +++ * The SMP id of the processor running the program. +++ * +++ * int bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) +++ * Description +++ * Store *len* bytes from address *from* into the packet +++ * associated to *skb*, at *offset*. *flags* are a combination of +++ * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the +++ * checksum for the packet after storing the bytes) and +++ * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ +++ * **->swhash** and *skb*\ **->l4hash** to 0). +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) +++ * Description +++ * Recompute the layer 3 (e.g. IP) checksum for the packet +++ * associated to *skb*. Computation is incremental, so the helper +++ * must know the former value of the header field that was +++ * modified (*from*), the new value of this field (*to*), and the +++ * number of bytes (2 or 4) for this field, stored in *size*. +++ * Alternatively, it is possible to store the difference between +++ * the previous and the new values of the header field in *to*, by +++ * setting *from* and *size* to 0. For both methods, *offset* +++ * indicates the location of the IP checksum within the packet. +++ * +++ * This helper works in combination with **bpf_csum_diff**\ (), +++ * which does not update the checksum in-place, but offers more +++ * flexibility and can handle sizes larger than 2 or 4 for the +++ * checksum to update. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) +++ * Description +++ * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the +++ * packet associated to *skb*. Computation is incremental, so the +++ * helper must know the former value of the header field that was +++ * modified (*from*), the new value of this field (*to*), and the +++ * number of bytes (2 or 4) for this field, stored on the lowest +++ * four bits of *flags*. Alternatively, it is possible to store +++ * the difference between the previous and the new values of the +++ * header field in *to*, by setting *from* and the four lowest +++ * bits of *flags* to 0. For both methods, *offset* indicates the +++ * location of the IP checksum within the packet. In addition to +++ * the size of the field, *flags* can be added (bitwise OR) actual +++ * flags. 
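To make the bpf_trace_printk() helper documented above concrete, the usual debugging pattern keeps the format string on the program stack and passes its size explicitly (helper stub as in the kernel samples; BPF_FUNC_trace_printk comes from enum bpf_func_id later in this header):

static int (*bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) =
    (void *) BPF_FUNC_trace_printk;

/* debug print, readable via /sys/kernel/debug/tracing/trace_pipe */
static void debug_value(long v)
{
    char fmt[] = "value: %ld\n";

    bpf_trace_printk(fmt, sizeof(fmt), v);
}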
With **BPF_F_MARK_MANGLED_0**, a null checksum is left +++ * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and +++ * for updates resulting in a null checksum the value is set to +++ * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates +++ * the checksum is to be computed against a pseudo-header. +++ * +++ * This helper works in combination with **bpf_csum_diff**\ (), +++ * which does not update the checksum in-place, but offers more +++ * flexibility and can handle sizes larger than 2 or 4 for the +++ * checksum to update. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) +++ * Description +++ * This special helper is used to trigger a "tail call", or in +++ * other words, to jump into another eBPF program. The same stack +++ * frame is used (but values on stack and in registers for the +++ * caller are not accessible to the callee). This mechanism allows +++ * for program chaining, either for raising the maximum number of +++ * available eBPF instructions, or to execute given programs in +++ * conditional blocks. For security reasons, there is an upper +++ * limit to the number of successive tail calls that can be +++ * performed. +++ * +++ * Upon call of this helper, the program attempts to jump into a +++ * program referenced at index *index* in *prog_array_map*, a +++ * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes +++ * *ctx*, a pointer to the context. +++ * +++ * If the call succeeds, the kernel immediately runs the first +++ * instruction of the new program. This is not a function call, +++ * and it never returns to the previous program. If the call +++ * fails, then the helper has no effect, and the caller continues +++ * to run its subsequent instructions. A call can fail if the +++ * destination program for the jump does not exist (i.e. *index* +++ * is superior to the number of entries in *prog_array_map*), or +++ * if the maximum number of tail calls has been reached for this +++ * chain of programs. This limit is defined in the kernel by the +++ * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), +++ * which is currently set to 32. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) +++ * Description +++ * Clone and redirect the packet associated to *skb* to another +++ * net device of index *ifindex*. Both ingress and egress +++ * interfaces can be used for redirection. The **BPF_F_INGRESS** +++ * value in *flags* is used to make the distinction (ingress path +++ * is selected if the flag is present, egress path otherwise). +++ * This is the only flag supported for now. +++ * +++ * In comparison with **bpf_redirect**\ () helper, +++ * **bpf_clone_redirect**\ () has the associated cost of +++ * duplicating the packet buffer, but this can be executed out of +++ * the eBPF program. Conversely, **bpf_redirect**\ () is more +++ * efficient, but it is handled through an action code where the +++ * redirection happens only after the eBPF program has returned. 
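The fall-through behaviour of bpf_tail_call() described above leads to the usual dispatch pattern below (a sketch; jmp_table is assumed to be a BPF_MAP_TYPE_PROG_ARRAY map defined elsewhere, and the helper stub follows the kernel-samples idiom):

static void (*bpf_tail_call)(void *ctx, void *prog_array_map, __u32 index) =
    (void *) BPF_FUNC_tail_call;

/* jump to the program stored at 'index'; execution continues here only on failure */
static int dispatch(void *ctx, void *jmp_table, __u32 index)
{
    bpf_tail_call(ctx, jmp_table, index);

    /* reached only if the tail call failed: bad index, empty slot, or limit hit */
    return 0;
}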
+++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * u64 bpf_get_current_pid_tgid(void) +++ * Return +++ * A 64-bit integer containing the current tgid and pid, and +++ * created as such: +++ * *current_task*\ **->tgid << 32 \|** +++ * *current_task*\ **->pid**. +++ * +++ * u64 bpf_get_current_uid_gid(void) +++ * Return +++ * A 64-bit integer containing the current GID and UID, and +++ * created as such: *current_gid* **<< 32 \|** *current_uid*. +++ * +++ * int bpf_get_current_comm(char *buf, u32 size_of_buf) +++ * Description +++ * Copy the **comm** attribute of the current task into *buf* of +++ * *size_of_buf*. The **comm** attribute contains the name of +++ * the executable (excluding the path) for the current task. The +++ * *size_of_buf* must be strictly positive. On success, the +++ * helper makes sure that the *buf* is NUL-terminated. On failure, +++ * it is filled with zeroes. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * u32 bpf_get_cgroup_classid(struct sk_buff *skb) +++ * Description +++ * Retrieve the classid for the current task, i.e. for the net_cls +++ * cgroup to which *skb* belongs. +++ * +++ * This helper can be used on TC egress path, but not on ingress. +++ * +++ * The net_cls cgroup provides an interface to tag network packets +++ * based on a user-provided identifier for all traffic coming from +++ * the tasks belonging to the related cgroup. See also the related +++ * kernel documentation, available from the Linux sources in file +++ * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. +++ * +++ * The Linux kernel has two versions for cgroups: there are +++ * cgroups v1 and cgroups v2. Both are available to users, who can +++ * use a mixture of them, but note that the net_cls cgroup is for +++ * cgroup v1 only. This makes it incompatible with BPF programs +++ * run on cgroups, which is a cgroup-v2-only feature (a socket can +++ * only hold data for one version of cgroups at a time). +++ * +++ * This helper is only available is the kernel was compiled with +++ * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to +++ * "**y**" or to "**m**". +++ * Return +++ * The classid, or 0 for the default unconfigured classid. +++ * +++ * int bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) +++ * Description +++ * Push a *vlan_tci* (VLAN tag control information) of protocol +++ * *vlan_proto* to the packet associated to *skb*, then update +++ * the checksum. Note that if *vlan_proto* is different from +++ * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to +++ * be **ETH_P_8021Q**. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_skb_vlan_pop(struct sk_buff *skb) +++ * Description +++ * Pop a VLAN header from the packet associated to *skb*. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. 
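The packing described above for bpf_get_current_pid_tgid() is undone with two shifts; a small sketch (helper stub as in the kernel samples):

static __u64 (*bpf_get_current_pid_tgid)(void) =
    (void *) BPF_FUNC_get_current_pid_tgid;

/* split the combined value: tgid in the upper 32 bits, pid in the lower */
static void current_ids(__u32 *tgid, __u32 *pid)
{
    __u64 v = bpf_get_current_pid_tgid();

    *tgid = v >> 32;
    *pid  = (__u32)v;
}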
Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) +++ * Description +++ * Get tunnel metadata. This helper takes a pointer *key* to an +++ * empty **struct bpf_tunnel_key** of **size**, that will be +++ * filled with tunnel metadata for the packet associated to *skb*. +++ * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which +++ * indicates that the tunnel is based on IPv6 protocol instead of +++ * IPv4. +++ * +++ * The **struct bpf_tunnel_key** is an object that generalizes the +++ * principal parameters used by various tunneling protocols into a +++ * single struct. This way, it can be used to easily make a +++ * decision based on the contents of the encapsulation header, +++ * "summarized" in this struct. In particular, it holds the IP +++ * address of the remote end (IPv4 or IPv6, depending on the case) +++ * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, +++ * this struct exposes the *key*\ **->tunnel_id**, which is +++ * generally mapped to a VNI (Virtual Network Identifier), making +++ * it programmable together with the **bpf_skb_set_tunnel_key**\ +++ * () helper. +++ * +++ * Let's imagine that the following code is part of a program +++ * attached to the TC ingress interface, on one end of a GRE +++ * tunnel, and is supposed to filter out all messages coming from +++ * remote ends with IPv4 address other than 10.0.0.1: +++ * +++ * :: +++ * +++ * int ret; +++ * struct bpf_tunnel_key key = {}; +++ * +++ * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); +++ * if (ret < 0) +++ * return TC_ACT_SHOT; // drop packet +++ * +++ * if (key.remote_ipv4 != 0x0a000001) +++ * return TC_ACT_SHOT; // drop packet +++ * +++ * return TC_ACT_OK; // accept packet +++ * +++ * This interface can also be used with all encapsulation devices +++ * that can operate in "collect metadata" mode: instead of having +++ * one network device per specific configuration, the "collect +++ * metadata" mode only requires a single device where the +++ * configuration can be extracted from this helper. +++ * +++ * This can be used together with various tunnels such as VXLan, +++ * Geneve, GRE or IP in IP (IPIP). +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) +++ * Description +++ * Populate tunnel metadata for packet associated to *skb.* The +++ * tunnel metadata is set to the contents of *key*, of *size*. The +++ * *flags* can be set to a combination of the following values: +++ * +++ * **BPF_F_TUNINFO_IPV6** +++ * Indicate that the tunnel is based on IPv6 protocol +++ * instead of IPv4. +++ * **BPF_F_ZERO_CSUM_TX** +++ * For IPv4 packets, add a flag to tunnel metadata +++ * indicating that checksum computation should be skipped +++ * and checksum set to zeroes. +++ * **BPF_F_DONT_FRAGMENT** +++ * Add a flag to tunnel metadata indicating that the +++ * packet should not be fragmented. +++ * **BPF_F_SEQ_NUMBER** +++ * Add a flag to tunnel metadata indicating that a +++ * sequence number should be added to tunnel header before +++ * sending the packet. 
This flag was added for GRE +++ * encapsulation, but might be used with other protocols +++ * as well in the future. +++ * +++ * Here is a typical usage on the transmit path: +++ * +++ * :: +++ * +++ * struct bpf_tunnel_key key; +++ * populate key ... +++ * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); +++ * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); +++ * +++ * See also the description of the **bpf_skb_get_tunnel_key**\ () +++ * helper for additional information. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags) +++ * Description +++ * Read the value of a perf event counter. This helper relies on a +++ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of +++ * the perf event counter is selected when *map* is updated with +++ * perf event file descriptors. The *map* is an array whose size +++ * is the number of available CPUs, and each cell contains a value +++ * relative to one CPU. The value to retrieve is indicated by +++ * *flags*, that contains the index of the CPU to look up, masked +++ * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to +++ * **BPF_F_CURRENT_CPU** to indicate that the value for the +++ * current CPU should be retrieved. +++ * +++ * Note that before Linux 4.13, only hardware perf event can be +++ * retrieved. +++ * +++ * Also, be aware that the newer helper +++ * **bpf_perf_event_read_value**\ () is recommended over +++ * **bpf_perf_event_read**\ () in general. The latter has some ABI +++ * quirks where error and counter value are used as a return code +++ * (which is wrong to do since ranges may overlap). This issue is +++ * fixed with **bpf_perf_event_read_value**\ (), which at the same +++ * time provides more features over the **bpf_perf_event_read**\ +++ * () interface. Please refer to the description of +++ * **bpf_perf_event_read_value**\ () for details. +++ * Return +++ * The value of the perf event counter read from the map, or a +++ * negative error code in case of failure. +++ * +++ * int bpf_redirect(u32 ifindex, u64 flags) +++ * Description +++ * Redirect the packet to another net device of index *ifindex*. +++ * This helper is somewhat similar to **bpf_clone_redirect**\ +++ * (), except that the packet is not cloned, which provides +++ * increased performance. +++ * +++ * Except for XDP, both ingress and egress interfaces can be used +++ * for redirection. The **BPF_F_INGRESS** value in *flags* is used +++ * to make the distinction (ingress path is selected if the flag +++ * is present, egress path otherwise). Currently, XDP only +++ * supports redirection to the egress interface, and accepts no +++ * flag at all. +++ * +++ * The same effect can be attained with the more generic +++ * **bpf_redirect_map**\ (), which requires specific maps to be +++ * used but offers better performance. +++ * Return +++ * For XDP, the helper returns **XDP_REDIRECT** on success or +++ * **XDP_ABORTED** on error. For other program types, the values +++ * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on +++ * error. +++ * +++ * u32 bpf_get_route_realm(struct sk_buff *skb) +++ * Description +++ * Retrieve the realm or the route, that is to say the +++ * **tclassid** field of the destination for the *skb*. 
The +++ * indentifier retrieved is a user-provided tag, similar to the +++ * one used with the net_cls cgroup (see description for +++ * **bpf_get_cgroup_classid**\ () helper), but here this tag is +++ * held by a route (a destination entry), not by a task. +++ * +++ * Retrieving this identifier works with the clsact TC egress hook +++ * (see also **tc-bpf(8)**), or alternatively on conventional +++ * classful egress qdiscs, but not on TC ingress path. In case of +++ * clsact TC egress hook, this has the advantage that, internally, +++ * the destination entry has not been dropped yet in the transmit +++ * path. Therefore, the destination entry does not need to be +++ * artificially held via **netif_keep_dst**\ () for a classful +++ * qdisc until the *skb* is freed. +++ * +++ * This helper is available only if the kernel was compiled with +++ * **CONFIG_IP_ROUTE_CLASSID** configuration option. +++ * Return +++ * The realm of the route for the packet associated to *skb*, or 0 +++ * if none was found. +++ * +++ * int bpf_perf_event_output(struct pt_regs *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) +++ * Description +++ * Write raw *data* blob into a special BPF perf event held by +++ * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf +++ * event must have the following attributes: **PERF_SAMPLE_RAW** +++ * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and +++ * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. +++ * +++ * The *flags* are used to indicate the index in *map* for which +++ * the value must be put, masked with **BPF_F_INDEX_MASK**. +++ * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** +++ * to indicate that the index of the current CPU core should be +++ * used. +++ * +++ * The value to write, of *size*, is passed through eBPF stack and +++ * pointed by *data*. +++ * +++ * The context of the program *ctx* needs also be passed to the +++ * helper. +++ * +++ * On user space, a program willing to read the values needs to +++ * call **perf_event_open**\ () on the perf event (either for +++ * one or for all CPUs) and to store the file descriptor into the +++ * *map*. This must be done before the eBPF program can send data +++ * into it. An example is available in file +++ * *samples/bpf/trace_output_user.c* in the Linux kernel source +++ * tree (the eBPF program counterpart is in +++ * *samples/bpf/trace_output_kern.c*). +++ * +++ * **bpf_perf_event_output**\ () achieves better performance +++ * than **bpf_trace_printk**\ () for sharing data with user +++ * space, and is much better suitable for streaming data from eBPF +++ * programs. +++ * +++ * Note that this helper is not restricted to tracing use cases +++ * and can be used with programs attached to TC or XDP as well, +++ * where it allows for passing data to user space listeners. Data +++ * can be: +++ * +++ * * Only custom structs, +++ * * Only the packet payload, or +++ * * A combination of both. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_skb_load_bytes(const struct sk_buff *skb, u32 offset, void *to, u32 len) +++ * Description +++ * This helper was provided as an easy way to load data from a +++ * packet. It can be used to load *len* bytes from *offset* from +++ * the packet associated to *skb*, into the buffer pointed by +++ * *to*. 
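On the user-space side of bpf_perf_event_output(), documented above, one perf event with the required attributes has to be opened per CPU and its fd stored into the BPF_MAP_TYPE_PERF_EVENT_ARRAY map before the program can emit; a hedged sketch using the sys_bpf() wrapper from earlier:

#include <linux/perf_event.h>

/* open a BPF_OUTPUT software event for one CPU and store its fd at map[cpu] */
static int setup_perf_output(int map_fd, int cpu)
{
    struct perf_event_attr pattr = {
        .type        = PERF_TYPE_SOFTWARE,
        .config      = PERF_COUNT_SW_BPF_OUTPUT,
        .sample_type = PERF_SAMPLE_RAW,
        .size        = sizeof(pattr),
    };
    union bpf_attr attr;
    int pfd;

    pfd = syscall(__NR_perf_event_open, &pattr, -1 /* pid */, cpu, -1, 0);
    if (pfd < 0)
        return pfd;

    memset(&attr, 0, sizeof(attr));
    attr.map_fd = map_fd;
    attr.key    = (__u64)(unsigned long)&cpu;
    attr.value  = (__u64)(unsigned long)&pfd;
    attr.flags  = BPF_ANY;

    return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
}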
+++ * +++ * Since Linux 4.7, usage of this helper has mostly been replaced +++ * by "direct packet access", enabling packet data to be +++ * manipulated with *skb*\ **->data** and *skb*\ **->data_end** +++ * pointing respectively to the first byte of packet data and to +++ * the byte after the last byte of packet data. However, it +++ * remains useful if one wishes to read large quantities of data +++ * at once from a packet into the eBPF stack. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_get_stackid(struct pt_regs *ctx, struct bpf_map *map, u64 flags) +++ * Description +++ * Walk a user or a kernel stack and return its id. To achieve +++ * this, the helper needs *ctx*, which is a pointer to the context +++ * on which the tracing program is executed, and a pointer to a +++ * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. +++ * +++ * The last argument, *flags*, holds the number of stack frames to +++ * skip (from 0 to 255), masked with +++ * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set +++ * a combination of the following flags: +++ * +++ * **BPF_F_USER_STACK** +++ * Collect a user space stack instead of a kernel stack. +++ * **BPF_F_FAST_STACK_CMP** +++ * Compare stacks by hash only. +++ * **BPF_F_REUSE_STACKID** +++ * If two different stacks hash into the same *stackid*, +++ * discard the old one. +++ * +++ * The stack id retrieved is a 32 bit long integer handle which +++ * can be further combined with other data (including other stack +++ * ids) and used as a key into maps. This can be useful for +++ * generating a variety of graphs (such as flame graphs or off-cpu +++ * graphs). +++ * +++ * For walking a stack, this helper is an improvement over +++ * **bpf_probe_read**\ (), which can be used with unrolled loops +++ * but is not efficient and consumes a lot of eBPF instructions. +++ * Instead, **bpf_get_stackid**\ () can collect up to +++ * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that +++ * this limit can be controlled with the **sysctl** program, and +++ * that it should be manually increased in order to profile long +++ * user stacks (such as stacks for Java programs). To do so, use: +++ * +++ * :: +++ * +++ * # sysctl kernel.perf_event_max_stack= +++ * Return +++ * The positive or null stack id on success, or a negative error +++ * in case of failure. +++ * +++ * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed) +++ * Description +++ * Compute a checksum difference, from the raw buffer pointed by +++ * *from*, of length *from_size* (that must be a multiple of 4), +++ * towards the raw buffer pointed by *to*, of size *to_size* +++ * (same remark). An optional *seed* can be added to the value +++ * (this can be cascaded, the seed may come from a previous call +++ * to the helper). +++ * +++ * This is flexible enough to be used in several ways: +++ * +++ * * With *from_size* == 0, *to_size* > 0 and *seed* set to +++ * checksum, it can be used when pushing new data. +++ * * With *from_size* > 0, *to_size* == 0 and *seed* set to +++ * checksum, it can be used when removing data from a packet. +++ * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it +++ * can be used to compute a diff. Note that *from_size* and +++ * *to_size* do not need to be equal. +++ * +++ * This helper can be used in combination with +++ * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to +++ * which one can feed in the difference computed with +++ * **bpf_csum_diff**\ (). 
+++ * Return +++ * The checksum result, or a negative error code in case of +++ * failure. +++ * +++ * int bpf_skb_get_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) +++ * Description +++ * Retrieve tunnel options metadata for the packet associated to +++ * *skb*, and store the raw tunnel option data to the buffer *opt* +++ * of *size*. +++ * +++ * This helper can be used with encapsulation devices that can +++ * operate in "collect metadata" mode (please refer to the related +++ * note in the description of **bpf_skb_get_tunnel_key**\ () for +++ * more details). A particular example where this can be used is +++ * in combination with the Geneve encapsulation protocol, where it +++ * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) +++ * and retrieving arbitrary TLVs (Type-Length-Value headers) from +++ * the eBPF program. This allows for full customization of these +++ * headers. +++ * Return +++ * The size of the option data retrieved. +++ * +++ * int bpf_skb_set_tunnel_opt(struct sk_buff *skb, u8 *opt, u32 size) +++ * Description +++ * Set tunnel options metadata for the packet associated to *skb* +++ * to the option data contained in the raw buffer *opt* of *size*. +++ * +++ * See also the description of the **bpf_skb_get_tunnel_opt**\ () +++ * helper for additional information. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) +++ * Description +++ * Change the protocol of the *skb* to *proto*. Currently +++ * supported are transition from IPv4 to IPv6, and from IPv6 to +++ * IPv4. The helper takes care of the groundwork for the +++ * transition, including resizing the socket buffer. The eBPF +++ * program is expected to fill the new headers, if any, via +++ * **skb_store_bytes**\ () and to recompute the checksums with +++ * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ +++ * (). The main case for this helper is to perform NAT64 +++ * operations out of an eBPF program. +++ * +++ * Internally, the GSO type is marked as dodgy so that headers are +++ * checked and segments are recalculated by the GSO/GRO engine. +++ * The size for GSO target is adapted as well. +++ * +++ * All values for *flags* are reserved for future usage, and must +++ * be left at zero. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_skb_change_type(struct sk_buff *skb, u32 type) +++ * Description +++ * Change the packet type for the packet associated to *skb*. This +++ * comes down to setting *skb*\ **->pkt_type** to *type*, except +++ * the eBPF program does not have a write access to *skb*\ +++ * **->pkt_type** beside this helper. Using a helper here allows +++ * for graceful handling of errors. +++ * +++ * The major use case is to change incoming *skb*s to +++ * **PACKET_HOST** in a programmatic way instead of having to +++ * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for +++ * example. +++ * +++ * Note that *type* only allows certain values. At this time, they +++ * are: +++ * +++ * **PACKET_HOST** +++ * Packet is for us. +++ * **PACKET_BROADCAST** +++ * Send packet to all. 
+++ * **PACKET_MULTICAST** +++ * Send packet to group. +++ * **PACKET_OTHERHOST** +++ * Send packet to someone else. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) +++ * Description +++ * Check whether *skb* is a descendant of the cgroup2 held by +++ * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. +++ * Return +++ * The return value depends on the result of the test, and can be: +++ * +++ * * 0, if the *skb* failed the cgroup2 descendant test. +++ * * 1, if the *skb* succeeded the cgroup2 descendant test. +++ * * A negative error code, if an error occurred. +++ * +++ * u32 bpf_get_hash_recalc(struct sk_buff *skb) +++ * Description +++ * Retrieve the hash of the packet, *skb*\ **->hash**. If it is +++ * not set, in particular if the hash was cleared due to mangling, +++ * recompute this hash. Later accesses to the hash can be done +++ * directly with *skb*\ **->hash**. +++ * +++ * Calling **bpf_set_hash_invalid**\ (), changing a packet +++ * prototype with **bpf_skb_change_proto**\ (), or calling +++ * **bpf_skb_store_bytes**\ () with the +++ * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear +++ * the hash and to trigger a new computation for the next call to +++ * **bpf_get_hash_recalc**\ (). +++ * Return +++ * The 32-bit hash. +++ * +++ * u64 bpf_get_current_task(void) +++ * Return +++ * A pointer to the current task struct. +++ * +++ * int bpf_probe_write_user(void *dst, const void *src, u32 len) +++ * Description +++ * Attempt in a safe way to write *len* bytes from the buffer +++ * *src* to *dst* in memory. It only works for threads that are in +++ * user context, and *dst* must be a valid user space address. +++ * +++ * This helper should not be used to implement any kind of +++ * security mechanism because of TOC-TOU attacks, but rather to +++ * debug, divert, and manipulate execution of semi-cooperative +++ * processes. +++ * +++ * Keep in mind that this feature is meant for experiments, and it +++ * has a risk of crashing the system and running programs. +++ * Therefore, when an eBPF program using this helper is attached, +++ * a warning including PID and process name is printed to kernel +++ * logs. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) +++ * Description +++ * Check whether the probe is being run is the context of a given +++ * subset of the cgroup2 hierarchy. The cgroup2 to test is held by +++ * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. +++ * Return +++ * The return value depends on the result of the test, and can be: +++ * +++ * * 0, if current task belongs to the cgroup2. +++ * * 1, if current task does not belong to the cgroup2. +++ * * A negative error code, if an error occurred. +++ * +++ * int bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) +++ * Description +++ * Resize (trim or grow) the packet associated to *skb* to the +++ * new *len*. The *flags* are reserved for future usage, and must +++ * be left at zero. +++ * +++ * The basic idea is that the helper performs the needed work to +++ * change the size of the packet, then the eBPF program rewrites +++ * the rest via helpers like **bpf_skb_store_bytes**\ (), +++ * **bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ () +++ * and others. This helper is a slow path utility intended for +++ * replies with control messages. 
And because it is targeted for +++ * slow path, the helper itself can afford to be slow: it +++ * implicitly linearizes, unclones and drops offloads from the +++ * *skb*. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_skb_pull_data(struct sk_buff *skb, u32 len) +++ * Description +++ * Pull in non-linear data in case the *skb* is non-linear and not +++ * all of *len* are part of the linear section. Make *len* bytes +++ * from *skb* readable and writable. If a zero value is passed for +++ * *len*, then the whole length of the *skb* is pulled. +++ * +++ * This helper is only needed for reading and writing with direct +++ * packet access. +++ * +++ * For direct packet access, testing that offsets to access +++ * are within packet boundaries (test on *skb*\ **->data_end**) is +++ * susceptible to fail if offsets are invalid, or if the requested +++ * data is in non-linear parts of the *skb*. On failure the +++ * program can just bail out, or in the case of a non-linear +++ * buffer, use a helper to make the data available. The +++ * **bpf_skb_load_bytes**\ () helper is a first solution to access +++ * the data. Another one consists in using **bpf_skb_pull_data** +++ * to pull in once the non-linear parts, then retesting and +++ * eventually access the data. +++ * +++ * At the same time, this also makes sure the *skb* is uncloned, +++ * which is a necessary condition for direct write. As this needs +++ * to be an invariant for the write part only, the verifier +++ * detects writes and adds a prologue that is calling +++ * **bpf_skb_pull_data()** to effectively unclone the *skb* from +++ * the very beginning in case it is indeed cloned. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum) +++ * Description +++ * Add the checksum *csum* into *skb*\ **->csum** in case the +++ * driver has supplied a checksum for the entire packet into that +++ * field. Return an error otherwise. This helper is intended to be +++ * used in combination with **bpf_csum_diff**\ (), in particular +++ * when the checksum needs to be updated after data has been +++ * written into the packet through direct packet access. +++ * Return +++ * The checksum on success, or a negative error code in case of +++ * failure. +++ * +++ * void bpf_set_hash_invalid(struct sk_buff *skb) +++ * Description +++ * Invalidate the current *skb*\ **->hash**. It can be used after +++ * mangling on headers through direct packet access, in order to +++ * indicate that the hash is outdated and to trigger a +++ * recalculation the next time the kernel tries to access this +++ * hash or when the **bpf_get_hash_recalc**\ () helper is called. +++ * +++ * int bpf_get_numa_node_id(void) +++ * Description +++ * Return the id of the current NUMA node. 
The primary use case +++ * for this helper is the selection of sockets for the local NUMA +++ * node, when the program is attached to sockets using the +++ * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), +++ * but the helper is also available to other eBPF program types, +++ * similarly to **bpf_get_smp_processor_id**\ (). +++ * Return +++ * The id of current NUMA node. +++ * +++ * int bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) +++ * Description +++ * Grows headroom of packet associated to *skb* and adjusts the +++ * offset of the MAC header accordingly, adding *len* bytes of +++ * space. It automatically extends and reallocates memory as +++ * required. +++ * +++ * This helper can be used on a layer 3 *skb* to push a MAC header +++ * for redirection into a layer 2 device. +++ * +++ * All values for *flags* are reserved for future usage, and must +++ * be left at zero. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) +++ * Description +++ * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that +++ * it is possible to use a negative value for *delta*. This helper +++ * can be used to prepare the packet for pushing or popping +++ * headers. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_probe_read_str(void *dst, int size, const void *unsafe_ptr) +++ * Description +++ * Copy a NUL terminated string from an unsafe address +++ * *unsafe_ptr* to *dst*. The *size* should include the +++ * terminating NUL byte. In case the string length is smaller than +++ * *size*, the target is not padded with further NUL bytes. If the +++ * string length is larger than *size*, just *size*-1 bytes are +++ * copied and the last byte is set to NUL. +++ * +++ * On success, the length of the copied string is returned. This +++ * makes this helper useful in tracing programs for reading +++ * strings, and more importantly to get its length at runtime. See +++ * the following snippet: +++ * +++ * :: +++ * +++ * SEC("kprobe/sys_open") +++ * void bpf_sys_open(struct pt_regs *ctx) +++ * { +++ * char buf[PATHLEN]; // PATHLEN is defined to 256 +++ * int res = bpf_probe_read_str(buf, sizeof(buf), +++ * ctx->di); +++ * +++ * // Consume buf, for example push it to +++ * // userspace via bpf_perf_event_output(); we +++ * // can use res (the string length) as event +++ * // size, after checking its boundaries. +++ * } +++ * +++ * In comparison, using **bpf_probe_read()** helper here instead +++ * to read the string would require to estimate the length at +++ * compile time, and would often result in copying more memory +++ * than necessary. 
+++ * +++ * Another useful use case is when parsing individual process +++ * arguments or individual environment variables navigating +++ * *current*\ **->mm->arg_start** and *current*\ +++ * **->mm->env_start**: using this helper and the return value, +++ * one can quickly iterate at the right offset of the memory area. +++ * Return +++ * On success, the strictly positive length of the string, +++ * including the trailing NUL character. On error, a negative +++ * value. +++ * +++ * u64 bpf_get_socket_cookie(struct sk_buff *skb) +++ * Description +++ * If the **struct sk_buff** pointed by *skb* has a known socket, +++ * retrieve the cookie (generated by the kernel) of this socket. +++ * If no cookie has been set yet, generate a new cookie. Once +++ * generated, the socket cookie remains stable for the life of the +++ * socket. This helper can be useful for monitoring per socket +++ * networking traffic statistics as it provides a global socket +++ * identifier that can be assumed unique. +++ * Return +++ * A 8-byte long non-decreasing number on success, or 0 if the +++ * socket field is missing inside *skb*. +++ * +++ * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) +++ * Description +++ * Equivalent to bpf_get_socket_cookie() helper that accepts +++ * *skb*, but gets socket from **struct bpf_sock_addr** context. +++ * Return +++ * A 8-byte long non-decreasing number. +++ * +++ * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) +++ * Description +++ * Equivalent to bpf_get_socket_cookie() helper that accepts +++ * *skb*, but gets socket from **struct bpf_sock_ops** context. +++ * Return +++ * A 8-byte long non-decreasing number. +++ * +++ * u32 bpf_get_socket_uid(struct sk_buff *skb) +++ * Return +++ * The owner UID of the socket associated to *skb*. If the socket +++ * is **NULL**, or if it is not a full socket (i.e. if it is a +++ * time-wait or a request socket instead), **overflowuid** value +++ * is returned (note that **overflowuid** might also be the actual +++ * UID value for the socket). +++ * +++ * u32 bpf_set_hash(struct sk_buff *skb, u32 hash) +++ * Description +++ * Set the full hash for *skb* (set the field *skb*\ **->hash**) +++ * to value *hash*. +++ * Return +++ * 0 +++ * +++ * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen) +++ * Description +++ * Emulate a call to **setsockopt()** on the socket associated to +++ * *bpf_socket*, which must be a full socket. The *level* at +++ * which the option resides and the name *optname* of the option +++ * must be specified, see **setsockopt(2)** for more information. +++ * The option value of length *optlen* is pointed by *optval*. +++ * +++ * This helper actually implements a subset of **setsockopt()**. +++ * It supports the following *level*\ s: +++ * +++ * * **SOL_SOCKET**, which supports the following *optname*\ s: +++ * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, +++ * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**. +++ * * **IPPROTO_TCP**, which supports the following *optname*\ s: +++ * **TCP_CONGESTION**, **TCP_BPF_IW**, +++ * **TCP_BPF_SNDCWND_CLAMP**. +++ * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. +++ * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. +++ * Return +++ * 0 on success, or a negative error in case of failure. 
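As a usage illustration for the **bpf_setsockopt**\ () helper documented above (not part of the backported patch itself), the following is a minimal sketch of a **BPF_PROG_TYPE_SOCK_OPS** program that switches newly established TCP connections to a different congestion control algorithm. It assumes libbpf's bpf_helpers.h for SEC() and the helper declarations; the program name and the fallback TCP_CONGESTION definition are illustrative only.

#include <linux/bpf.h>
#include <linux/in.h>
#include <bpf/bpf_helpers.h>

#ifndef TCP_CONGESTION
#define TCP_CONGESTION 13 /* TCP socket option number, normally from linux/tcp.h */
#endif

SEC("sockops")
int set_cong(struct bpf_sock_ops *skops)
{
    /* "reno" is always built into the kernel, so the call cannot fail
     * because the algorithm is missing. */
    char cc[] = "reno";

    /* Act only once the connection is fully established. */
    if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
        skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
        bpf_setsockopt(skops, IPPROTO_TCP, TCP_CONGESTION, cc, sizeof(cc));

    return 1;
}

char _license[] SEC("license") = "GPL";

The program would then be attached to a cgroup as a sock_ops program (e.g. via bpftool), which is outside the scope of this sketch.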
+++ * +++ * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) +++ * Description +++ * Grow or shrink the room for data in the packet associated to +++ * *skb* by *len_diff*, and according to the selected *mode*. +++ * +++ * There are two supported modes at this time: +++ * +++ * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer +++ * (room space is added or removed below the layer 2 header). +++ * +++ * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer +++ * (room space is added or removed below the layer 3 header). +++ * +++ * The following flags are supported at this time: +++ * +++ * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. +++ * Adjusting mss in this way is not allowed for datagrams. +++ * +++ * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, +++ * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: +++ * Any new space is reserved to hold a tunnel header. +++ * Configure skb offsets and other fields accordingly. +++ * +++ * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, +++ * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: +++ * Use with ENCAP_L3 flags to further specify the tunnel type. +++ * +++ * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): +++ * Use with ENCAP_L3/L4 flags to further specify the tunnel +++ * type; *len* is the length of the inner MAC header. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_redirect_map(struct bpf_map *map, u32 key, u64 flags) +++ * Description +++ * Redirect the packet to the endpoint referenced by *map* at +++ * index *key*. Depending on its type, this *map* can contain +++ * references to net devices (for forwarding packets through other +++ * ports), or to CPUs (for redirecting XDP frames to another CPU; +++ * but this is only implemented for native XDP (with driver +++ * support) as of this writing). +++ * +++ * The lower two bits of *flags* are used as the return code if +++ * the map lookup fails. This is so that the return value can be +++ * one of the XDP program return codes up to XDP_TX, as chosen by +++ * the caller. Any higher bits in the *flags* argument must be +++ * unset. +++ * +++ * When used to redirect packets to net devices, this helper +++ * provides a high performance increase over **bpf_redirect**\ (). +++ * This is due to various implementation details of the underlying +++ * mechanisms, one of which is the fact that **bpf_redirect_map**\ +++ * () tries to send packet as a "bulk" to the device. +++ * Return +++ * **XDP_REDIRECT** on success, or **XDP_ABORTED** on error. +++ * +++ * int bpf_sk_redirect_map(struct bpf_map *map, u32 key, u64 flags) +++ * Description +++ * Redirect the packet to the socket referenced by *map* (of type +++ * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and +++ * egress interfaces can be used for redirection. The +++ * **BPF_F_INGRESS** value in *flags* is used to make the +++ * distinction (ingress path is selected if the flag is present, +++ * egress path otherwise). This is the only flag supported for now. +++ * Return +++ * **SK_PASS** on success, or **SK_DROP** on error. 
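To illustrate the **bpf_redirect_map**\ () helper documented above (again, not part of the backported patch), here is a minimal XDP sketch that forwards every frame to the interface stored in a **BPF_MAP_TYPE_DEVMAP**. It assumes libbpf's BTF-style map definitions and bpf_helpers.h; the map and function names are illustrative.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Device map holding the egress ifindex at key 0. */
struct {
    __uint(type, BPF_MAP_TYPE_DEVMAP);
    __uint(max_entries, 1);
    __type(key, __u32);
    __type(value, __u32);
} tx_port SEC(".maps");

SEC("xdp")
int xdp_fwd(struct xdp_md *ctx)
{
    /* Redirect every frame to the device stored at key 0. The lower two
     * bits of the flags argument (XDP_PASS here) are returned when the
     * map lookup fails, as described above. */
    return bpf_redirect_map(&tx_port, 0, XDP_PASS);
}

char _license[] SEC("license") = "GPL";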
+++ * +++ * int bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) +++ * Description +++ * Add an entry to, or update a *map* referencing sockets. The +++ * *skops* is used as a new value for the entry associated to +++ * *key*. *flags* is one of: +++ * +++ * **BPF_NOEXIST** +++ * The entry for *key* must not exist in the map. +++ * **BPF_EXIST** +++ * The entry for *key* must already exist in the map. +++ * **BPF_ANY** +++ * No condition on the existence of the entry for *key*. +++ * +++ * If the *map* has eBPF programs (parser and verdict), those will +++ * be inherited by the socket being added. If the socket is +++ * already attached to eBPF programs, this results in an error. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) +++ * Description +++ * Adjust the address pointed by *xdp_md*\ **->data_meta** by +++ * *delta* (which can be positive or negative). Note that this +++ * operation modifies the address stored in *xdp_md*\ **->data**, +++ * so the latter must be loaded only after the helper has been +++ * called. +++ * +++ * The use of *xdp_md*\ **->data_meta** is optional and programs +++ * are not required to use it. The rationale is that when the +++ * packet is processed with XDP (e.g. as DoS filter), it is +++ * possible to push further meta data along with it before passing +++ * to the stack, and to give the guarantee that an ingress eBPF +++ * program attached as a TC classifier on the same device can pick +++ * this up for further post-processing. Since TC works with socket +++ * buffers, it remains possible to set from XDP the **mark** or +++ * **priority** pointers, or other pointers for the socket buffer. +++ * Having this scratch space generic and programmable allows for +++ * more flexibility as the user is free to store whatever meta +++ * data they need. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) +++ * Description +++ * Read the value of a perf event counter, and store it into *buf* +++ * of size *buf_size*. This helper relies on a *map* of type +++ * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event +++ * counter is selected when *map* is updated with perf event file +++ * descriptors. The *map* is an array whose size is the number of +++ * available CPUs, and each cell contains a value relative to one +++ * CPU. The value to retrieve is indicated by *flags*, that +++ * contains the index of the CPU to look up, masked with +++ * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to +++ * **BPF_F_CURRENT_CPU** to indicate that the value for the +++ * current CPU should be retrieved. +++ * +++ * This helper behaves in a way close to +++ * **bpf_perf_event_read**\ () helper, save that instead of +++ * just returning the value observed, it fills the *buf* +++ * structure. This allows for additional data to be retrieved: in +++ * particular, the enabled and running times (in *buf*\ +++ * **->enabled** and *buf*\ **->running**, respectively) are +++ * copied. 
In general, **bpf_perf_event_read_value**\ () is
+++ * recommended over **bpf_perf_event_read**\ (), which has some
+++ * ABI issues and provides fewer functionalities.
+++ *
+++ * These values are interesting, because hardware PMU (Performance
+++ * Monitoring Unit) counters are limited resources. When there are
+++ * more PMU based perf events opened than available counters,
+++ * kernel will multiplex these events so each event gets certain
+++ * percentage (but not all) of the PMU time. In case that
+++ * multiplexing happens, the number of samples or counter value
+++ * will not reflect the case compared to when no multiplexing
+++ * occurs. This makes comparison between different runs difficult.
+++ * Typically, the counter value should be normalized before
+++ * comparing to other experiments. The usual normalization is done
+++ * as follows.
+++ *
+++ * ::
+++ *
+++ * normalized_counter = counter * t_enabled / t_running
+++ *
+++ * Where t_enabled is the time enabled for event and t_running is
+++ * the time running for event since last normalization. The
+++ * enabled and running times are accumulated since the perf event
+++ * open. To achieve scaling factor between two invocations of an
+++ * eBPF program, users can use CPU id as the key (which is
+++ * typical for perf array usage model) to remember the previous
+++ * value and do the calculation inside the eBPF program.
+++ * Return
+++ * 0 on success, or a negative error in case of failure.
+++ *
+++ * int bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size)
+++ * Description
+++ * For an eBPF program attached to a perf event, retrieve the
+++ * value of the event counter associated to *ctx* and store it in
+++ * the structure pointed by *buf* and of size *buf_size*. Enabled
+++ * and running times are also stored in the structure (see
+++ * description of helper **bpf_perf_event_read_value**\ () for
+++ * more details).
+++ * Return
+++ * 0 on success, or a negative error in case of failure.
+++ *
+++ * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, char *optval, int optlen)
+++ * Description
+++ * Emulate a call to **getsockopt()** on the socket associated to
+++ * *bpf_socket*, which must be a full socket. The *level* at
+++ * which the option resides and the name *optname* of the option
+++ * must be specified, see **getsockopt(2)** for more information.
+++ * The retrieved value is stored in the structure pointed by
+++ * *optval* and of length *optlen*.
+++ *
+++ * This helper actually implements a subset of **getsockopt()**.
+++ * It supports the following *level*\ s:
+++ *
+++ * * **IPPROTO_TCP**, which supports *optname*
+++ * **TCP_CONGESTION**.
+++ * * **IPPROTO_IP**, which supports *optname* **IP_TOS**.
+++ * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**.
+++ * Return
+++ * 0 on success, or a negative error in case of failure.
+++ *
+++ * int bpf_override_return(struct pt_regs *regs, u64 rc)
+++ * Description
+++ * Used for error injection, this helper uses kprobes to override
+++ * the return value of the probed function, and to set it to *rc*.
+++ * The first argument is the context *regs* on which the kprobe
+++ * works.
+++ *
+++ * This helper works by setting the PC (program counter)
+++ * to an override function which is run in place of the original
+++ * probed function. This means the probed function is not run at
+++ * all. The replacement function just returns with the required
+++ * value.
+++ * +++ * This helper has security implications, and thus is subject to +++ * restrictions. It is only available if the kernel was compiled +++ * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration +++ * option, and in this case it only works on functions tagged with +++ * **ALLOW_ERROR_INJECTION** in the kernel code. +++ * +++ * Also, the helper is only available for the architectures having +++ * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, +++ * x86 architecture is the only one to support this feature. +++ * Return +++ * 0 +++ * +++ * int bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) +++ * Description +++ * Attempt to set the value of the **bpf_sock_ops_cb_flags** field +++ * for the full TCP socket associated to *bpf_sock_ops* to +++ * *argval*. +++ * +++ * The primary use of this field is to determine if there should +++ * be calls to eBPF programs of type +++ * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP +++ * code. A program of the same type can change its value, per +++ * connection and as necessary, when the connection is +++ * established. This field is directly accessible for reading, but +++ * this helper must be used for updates in order to return an +++ * error if an eBPF program tries to set a callback that is not +++ * supported in the current kernel. +++ * +++ * *argval* is a flag array which can combine these flags: +++ * +++ * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) +++ * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) +++ * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) +++ * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) +++ * +++ * Therefore, this function can be used to clear a callback flag by +++ * setting the appropriate bit to zero. e.g. to disable the RTO +++ * callback: +++ * +++ * **bpf_sock_ops_cb_flags_set(bpf_sock,** +++ * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** +++ * +++ * Here are some examples of where one could call such eBPF +++ * program: +++ * +++ * * When RTO fires. +++ * * When a packet is retransmitted. +++ * * When the connection terminates. +++ * * When a packet is sent. +++ * * When a packet is received. +++ * Return +++ * Code **-EINVAL** if the socket is not a full TCP socket; +++ * otherwise, a positive number containing the bits that could not +++ * be set is returned (which comes down to 0 if all bits were set +++ * as required). +++ * +++ * int bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) +++ * Description +++ * This helper is used in programs implementing policies at the +++ * socket level. If the message *msg* is allowed to pass (i.e. if +++ * the verdict eBPF program returns **SK_PASS**), redirect it to +++ * the socket referenced by *map* (of type +++ * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and +++ * egress interfaces can be used for redirection. The +++ * **BPF_F_INGRESS** value in *flags* is used to make the +++ * distinction (ingress path is selected if the flag is present, +++ * egress path otherwise). This is the only flag supported for now. +++ * Return +++ * **SK_PASS** on success, or **SK_DROP** on error. +++ * +++ * int bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) +++ * Description +++ * For socket policies, apply the verdict of the eBPF program to +++ * the next *bytes* (number of bytes) of message *msg*. 
+++ * +++ * For example, this helper can be used in the following cases: +++ * +++ * * A single **sendmsg**\ () or **sendfile**\ () system call +++ * contains multiple logical messages that the eBPF program is +++ * supposed to read and for which it should apply a verdict. +++ * * An eBPF program only cares to read the first *bytes* of a +++ * *msg*. If the message has a large payload, then setting up +++ * and calling the eBPF program repeatedly for all bytes, even +++ * though the verdict is already known, would create unnecessary +++ * overhead. +++ * +++ * When called from within an eBPF program, the helper sets a +++ * counter internal to the BPF infrastructure, that is used to +++ * apply the last verdict to the next *bytes*. If *bytes* is +++ * smaller than the current data being processed from a +++ * **sendmsg**\ () or **sendfile**\ () system call, the first +++ * *bytes* will be sent and the eBPF program will be re-run with +++ * the pointer for start of data pointing to byte number *bytes* +++ * **+ 1**. If *bytes* is larger than the current data being +++ * processed, then the eBPF verdict will be applied to multiple +++ * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are +++ * consumed. +++ * +++ * Note that if a socket closes with the internal counter holding +++ * a non-zero value, this is not a problem because data is not +++ * being buffered for *bytes* and is sent as it is received. +++ * Return +++ * 0 +++ * +++ * int bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) +++ * Description +++ * For socket policies, prevent the execution of the verdict eBPF +++ * program for message *msg* until *bytes* (byte number) have been +++ * accumulated. +++ * +++ * This can be used when one needs a specific number of bytes +++ * before a verdict can be assigned, even if the data spans +++ * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme +++ * case would be a user calling **sendmsg**\ () repeatedly with +++ * 1-byte long message segments. Obviously, this is bad for +++ * performance, but it is still valid. If the eBPF program needs +++ * *bytes* bytes to validate a header, this helper can be used to +++ * prevent the eBPF program to be called again until *bytes* have +++ * been accumulated. +++ * Return +++ * 0 +++ * +++ * int bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) +++ * Description +++ * For socket policies, pull in non-linear data from user space +++ * for *msg* and set pointers *msg*\ **->data** and *msg*\ +++ * **->data_end** to *start* and *end* bytes offsets into *msg*, +++ * respectively. +++ * +++ * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a +++ * *msg* it can only parse data that the (**data**, **data_end**) +++ * pointers have already consumed. For **sendmsg**\ () hooks this +++ * is likely the first scatterlist element. But for calls relying +++ * on the **sendpage** handler (e.g. **sendfile**\ ()) this will +++ * be the range (**0**, **0**) because the data is shared with +++ * user space and by default the objective is to avoid allowing +++ * user space to modify data while (or after) eBPF verdict is +++ * being decided. This helper can be used to pull in data and to +++ * set the start and end pointer to given values. Data will be +++ * copied if necessary (i.e. if data was not linear and if start +++ * and end pointers do not point to the same chunk). +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. 
Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * +++ * All values for *flags* are reserved for future usage, and must +++ * be left at zero. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) +++ * Description +++ * Bind the socket associated to *ctx* to the address pointed by +++ * *addr*, of length *addr_len*. This allows for making outgoing +++ * connection from the desired IP address, which can be useful for +++ * example when all processes inside a cgroup should use one +++ * single IP address on a host that has multiple IP configured. +++ * +++ * This helper works for IPv4 and IPv6, TCP and UDP sockets. The +++ * domain (*addr*\ **->sa_family**) must be **AF_INET** (or +++ * **AF_INET6**). Looking for a free port to bind to can be +++ * expensive, therefore binding to port is not permitted by the +++ * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) +++ * must be set to zero. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) +++ * Description +++ * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is +++ * only possible to shrink the packet as of this writing, +++ * therefore *delta* must be a negative integer. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) +++ * Description +++ * Retrieve the XFRM state (IP transform framework, see also +++ * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. +++ * +++ * The retrieved value is stored in the **struct bpf_xfrm_state** +++ * pointed by *xfrm_state* and of length *size*. +++ * +++ * All values for *flags* are reserved for future usage, and must +++ * be left at zero. +++ * +++ * This helper is available only if the kernel was compiled with +++ * **CONFIG_XFRM** configuration option. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_get_stack(struct pt_regs *regs, void *buf, u32 size, u64 flags) +++ * Description +++ * Return a user or a kernel stack in bpf program provided buffer. +++ * To achieve this, the helper needs *ctx*, which is a pointer +++ * to the context on which the tracing program is executed. +++ * To store the stacktrace, the bpf program provides *buf* with +++ * a nonnegative *size*. +++ * +++ * The last argument, *flags*, holds the number of stack frames to +++ * skip (from 0 to 255), masked with +++ * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set +++ * the following flags: +++ * +++ * **BPF_F_USER_STACK** +++ * Collect a user space stack instead of a kernel stack. +++ * **BPF_F_USER_BUILD_ID** +++ * Collect buildid+offset instead of ips for user stack, +++ * only valid if **BPF_F_USER_STACK** is also specified. 
+++ * +++ * **bpf_get_stack**\ () can collect up to +++ * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject +++ * to sufficient large buffer size. Note that +++ * this limit can be controlled with the **sysctl** program, and +++ * that it should be manually increased in order to profile long +++ * user stacks (such as stacks for Java programs). To do so, use: +++ * +++ * :: +++ * +++ * # sysctl kernel.perf_event_max_stack= +++ * Return +++ * A non-negative value equal to or less than *size* on success, +++ * or a negative error in case of failure. +++ * +++ * int bpf_skb_load_bytes_relative(const struct sk_buff *skb, u32 offset, void *to, u32 len, u32 start_header) +++ * Description +++ * This helper is similar to **bpf_skb_load_bytes**\ () in that +++ * it provides an easy way to load *len* bytes from *offset* +++ * from the packet associated to *skb*, into the buffer pointed +++ * by *to*. The difference to **bpf_skb_load_bytes**\ () is that +++ * a fifth argument *start_header* exists in order to select a +++ * base offset to start from. *start_header* can be one of: +++ * +++ * **BPF_HDR_START_MAC** +++ * Base offset to load data from is *skb*'s mac header. +++ * **BPF_HDR_START_NET** +++ * Base offset to load data from is *skb*'s network header. +++ * +++ * In general, "direct packet access" is the preferred method to +++ * access packet data, however, this helper is in particular useful +++ * in socket filters where *skb*\ **->data** does not always point +++ * to the start of the mac header and where "direct packet access" +++ * is not available. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) +++ * Description +++ * Do FIB lookup in kernel tables using parameters in *params*. +++ * If lookup is successful and result shows packet is to be +++ * forwarded, the neighbor tables are searched for the nexthop. +++ * If successful (ie., FIB lookup shows forwarding and nexthop +++ * is resolved), the nexthop address is returned in ipv4_dst +++ * or ipv6_dst based on family, smac is set to mac address of +++ * egress device, dmac is set to nexthop mac address, rt_metric +++ * is set to metric from route (IPv4/IPv6 only), and ifindex +++ * is set to the device index of the nexthop from the FIB lookup. +++ * +++ * *plen* argument is the size of the passed in struct. +++ * *flags* argument can be a combination of one or more of the +++ * following values: +++ * +++ * **BPF_FIB_LOOKUP_DIRECT** +++ * Do a direct table lookup vs full lookup using FIB +++ * rules. +++ * **BPF_FIB_LOOKUP_OUTPUT** +++ * Perform lookup from an egress perspective (default is +++ * ingress). +++ * +++ * *ctx* is either **struct xdp_md** for XDP programs or +++ * **struct sk_buff** tc cls_act programs. +++ * Return +++ * * < 0 if any input argument is invalid +++ * * 0 on success (packet is forwarded, nexthop neighbor exists) +++ * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the +++ * packet is not forwarded or needs assist from full stack +++ * +++ * int bpf_sock_hash_update(struct bpf_sock_ops_kern *skops, struct bpf_map *map, void *key, u64 flags) +++ * Description +++ * Add an entry to, or update a sockhash *map* referencing sockets. +++ * The *skops* is used as a new value for the entry associated to +++ * *key*. *flags* is one of: +++ * +++ * **BPF_NOEXIST** +++ * The entry for *key* must not exist in the map. 
+++ * **BPF_EXIST** +++ * The entry for *key* must already exist in the map. +++ * **BPF_ANY** +++ * No condition on the existence of the entry for *key*. +++ * +++ * If the *map* has eBPF programs (parser and verdict), those will +++ * be inherited by the socket being added. If the socket is +++ * already attached to eBPF programs, this results in an error. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) +++ * Description +++ * This helper is used in programs implementing policies at the +++ * socket level. If the message *msg* is allowed to pass (i.e. if +++ * the verdict eBPF program returns **SK_PASS**), redirect it to +++ * the socket referenced by *map* (of type +++ * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and +++ * egress interfaces can be used for redirection. The +++ * **BPF_F_INGRESS** value in *flags* is used to make the +++ * distinction (ingress path is selected if the flag is present, +++ * egress path otherwise). This is the only flag supported for now. +++ * Return +++ * **SK_PASS** on success, or **SK_DROP** on error. +++ * +++ * int bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) +++ * Description +++ * This helper is used in programs implementing policies at the +++ * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. +++ * if the verdeict eBPF program returns **SK_PASS**), redirect it +++ * to the socket referenced by *map* (of type +++ * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and +++ * egress interfaces can be used for redirection. The +++ * **BPF_F_INGRESS** value in *flags* is used to make the +++ * distinction (ingress path is selected if the flag is present, +++ * egress otherwise). This is the only flag supported for now. +++ * Return +++ * **SK_PASS** on success, or **SK_DROP** on error. +++ * +++ * int bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) +++ * Description +++ * Encapsulate the packet associated to *skb* within a Layer 3 +++ * protocol header. This header is provided in the buffer at +++ * address *hdr*, with *len* its size in bytes. *type* indicates +++ * the protocol of the header and can be one of: +++ * +++ * **BPF_LWT_ENCAP_SEG6** +++ * IPv6 encapsulation with Segment Routing Header +++ * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, +++ * the IPv6 header is computed by the kernel. +++ * **BPF_LWT_ENCAP_SEG6_INLINE** +++ * Only works if *skb* contains an IPv6 packet. Insert a +++ * Segment Routing Header (**struct ipv6_sr_hdr**) inside +++ * the IPv6 header. +++ * **BPF_LWT_ENCAP_IP** +++ * IP encapsulation (GRE/GUE/IPIP/etc). The outer header +++ * must be IPv4 or IPv6, followed by zero or more +++ * additional headers, up to **LWT_BPF_MAX_HEADROOM** +++ * total bytes in all prepended headers. Please note that +++ * if **skb_is_gso**\ (*skb*) is true, no more than two +++ * headers can be prepended, and the inner header, if +++ * present, should be either GRE or UDP/GUE. +++ * +++ * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs +++ * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can +++ * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and +++ * **BPF_PROG_TYPE_LWT_XMIT**. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. 
Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) +++ * Description +++ * Store *len* bytes from address *from* into the packet +++ * associated to *skb*, at *offset*. Only the flags, tag and TLVs +++ * inside the outermost IPv6 Segment Routing Header can be +++ * modified through this helper. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) +++ * Description +++ * Adjust the size allocated to TLVs in the outermost IPv6 +++ * Segment Routing Header contained in the packet associated to +++ * *skb*, at position *offset* by *delta* bytes. Only offsets +++ * after the segments are accepted. *delta* can be as well +++ * positive (growing) as negative (shrinking). +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) +++ * Description +++ * Apply an IPv6 Segment Routing action of type *action* to the +++ * packet associated to *skb*. Each action takes a parameter +++ * contained at address *param*, and of length *param_len* bytes. +++ * *action* can be one of: +++ * +++ * **SEG6_LOCAL_ACTION_END_X** +++ * End.X action: Endpoint with Layer-3 cross-connect. +++ * Type of *param*: **struct in6_addr**. +++ * **SEG6_LOCAL_ACTION_END_T** +++ * End.T action: Endpoint with specific IPv6 table lookup. +++ * Type of *param*: **int**. +++ * **SEG6_LOCAL_ACTION_END_B6** +++ * End.B6 action: Endpoint bound to an SRv6 policy. +++ * Type of *param*: **struct ipv6_sr_hdr**. +++ * **SEG6_LOCAL_ACTION_END_B6_ENCAP** +++ * End.B6.Encap action: Endpoint bound to an SRv6 +++ * encapsulation policy. +++ * Type of *param*: **struct ipv6_sr_hdr**. +++ * +++ * A call to this helper is susceptible to change the underlying +++ * packet buffer. Therefore, at load time, all checks on pointers +++ * previously done by the verifier are invalidated and must be +++ * performed again, if the helper is used in combination with +++ * direct packet access. +++ * Return +++ * 0 on success, or a negative error in case of failure. +++ * +++ * int bpf_rc_repeat(void *ctx) +++ * Description +++ * This helper is used in programs implementing IR decoding, to +++ * report a successfully decoded repeat key message. This delays +++ * the generation of a key up event for previously generated +++ * key down event. +++ * +++ * Some IR protocols like NEC have a special IR message for +++ * repeating last button, for when a button is held down. 
+++ * +++ * The *ctx* should point to the lirc sample as passed into +++ * the program. +++ * +++ * This helper is only available is the kernel was compiled with +++ * the **CONFIG_BPF_LIRC_MODE2** configuration option set to +++ * "**y**". +++ * Return +++ * 0 +++ * +++ * int bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) +++ * Description +++ * This helper is used in programs implementing IR decoding, to +++ * report a successfully decoded key press with *scancode*, +++ * *toggle* value in the given *protocol*. The scancode will be +++ * translated to a keycode using the rc keymap, and reported as +++ * an input key down event. After a period a key up event is +++ * generated. This period can be extended by calling either +++ * **bpf_rc_keydown**\ () again with the same values, or calling +++ * **bpf_rc_repeat**\ (). +++ * +++ * Some protocols include a toggle bit, in case the button was +++ * released and pressed again between consecutive scancodes. +++ * +++ * The *ctx* should point to the lirc sample as passed into +++ * the program. +++ * +++ * The *protocol* is the decoded protocol number (see +++ * **enum rc_proto** for some predefined values). +++ * +++ * This helper is only available is the kernel was compiled with +++ * the **CONFIG_BPF_LIRC_MODE2** configuration option set to +++ * "**y**". +++ * Return +++ * 0 +++ * +++ * u64 bpf_skb_cgroup_id(struct sk_buff *skb) +++ * Description +++ * Return the cgroup v2 id of the socket associated with the *skb*. +++ * This is roughly similar to the **bpf_get_cgroup_classid**\ () +++ * helper for cgroup v1 by providing a tag resp. identifier that +++ * can be matched on or used for map lookups e.g. to implement +++ * policy. The cgroup v2 id of a given path in the hierarchy is +++ * exposed in user space through the f_handle API in order to get +++ * to the same 64-bit id. +++ * +++ * This helper can be used on TC egress path, but not on ingress, +++ * and is available only if the kernel was compiled with the +++ * **CONFIG_SOCK_CGROUP_DATA** configuration option. +++ * Return +++ * The id is returned or 0 in case the id could not be retrieved. +++ * +++ * u64 bpf_get_current_cgroup_id(void) +++ * Return +++ * A 64-bit integer containing the current cgroup id based +++ * on the cgroup within which the current task is running. +++ * +++ * void *bpf_get_local_storage(void *map, u64 flags) +++ * Description +++ * Get the pointer to the local storage area. +++ * The type and the size of the local storage is defined +++ * by the *map* argument. +++ * The *flags* meaning is specific for each map type, +++ * and has to be 0 for cgroup local storage. +++ * +++ * Depending on the BPF program type, a local storage area +++ * can be shared between multiple instances of the BPF program, +++ * running simultaneously. +++ * +++ * A user should care about the synchronization by himself. +++ * For example, by using the **BPF_STX_XADD** instruction to alter +++ * the shared data. +++ * Return +++ * A pointer to the local storage area. +++ * +++ * int bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) +++ * Description +++ * Select a **SO_REUSEPORT** socket from a +++ * **BPF_MAP_TYPE_REUSEPORT_ARRAY** *map*. +++ * It checks the selected socket is matching the incoming +++ * request in the socket buffer. +++ * Return +++ * 0 on success, or a negative error in case of failure. 
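As an illustration of the **bpf_get_local_storage**\ () helper documented above (not part of the backported patch), the sketch below keeps a per-cgroup packet counter in cgroup local storage from a cgroup skb program. It assumes libbpf's BTF-style map definitions and bpf_helpers.h; the map and function names are illustrative.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Per-cgroup packet counter kept in cgroup local storage. */
struct {
    __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
    __type(key, struct bpf_cgroup_storage_key);
    __type(value, __u64);
} pkt_count SEC(".maps");

SEC("cgroup_skb/egress")
int count_egress(struct __sk_buff *skb)
{
    __u64 *counter;

    /* flags must be 0 for cgroup local storage. */
    counter = bpf_get_local_storage(&pkt_count, 0);

    /* Atomic add (BPF_STX_XADD), since several program instances may
     * run concurrently, as the text above points out. */
    __sync_fetch_and_add(counter, 1);

    return 1; /* allow the packet */
}

char _license[] SEC("license") = "GPL";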
+++ * +++ * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) +++ * Description +++ * Return id of cgroup v2 that is ancestor of cgroup associated +++ * with the *skb* at the *ancestor_level*. The root cgroup is at +++ * *ancestor_level* zero and each step down the hierarchy +++ * increments the level. If *ancestor_level* == level of cgroup +++ * associated with *skb*, then return value will be same as that +++ * of **bpf_skb_cgroup_id**\ (). +++ * +++ * The helper is useful to implement policies based on cgroups +++ * that are upper in hierarchy than immediate cgroup associated +++ * with *skb*. +++ * +++ * The format of returned id and helper limitations are same as in +++ * **bpf_skb_cgroup_id**\ (). +++ * Return +++ * The id is returned or 0 in case the id could not be retrieved. +++ * +++ * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) +++ * Description +++ * Look for TCP socket matching *tuple*, optionally in a child +++ * network namespace *netns*. The return value must be checked, +++ * and if non-**NULL**, released via **bpf_sk_release**\ (). +++ * +++ * The *ctx* should point to the context of the program, such as +++ * the skb or socket (depending on the hook in use). This is used +++ * to determine the base network namespace for the lookup. +++ * +++ * *tuple_size* must be one of: +++ * +++ * **sizeof**\ (*tuple*\ **->ipv4**) +++ * Look for an IPv4 socket. +++ * **sizeof**\ (*tuple*\ **->ipv6**) +++ * Look for an IPv6 socket. +++ * +++ * If the *netns* is a negative signed 32-bit integer, then the +++ * socket lookup table in the netns associated with the *ctx* will +++ * will be used. For the TC hooks, this is the netns of the device +++ * in the skb. For socket hooks, this is the netns of the socket. +++ * If *netns* is any other signed 32-bit value greater than or +++ * equal to zero then it specifies the ID of the netns relative to +++ * the netns associated with the *ctx*. *netns* values beyond the +++ * range of 32-bit integers are reserved for future use. +++ * +++ * All values for *flags* are reserved for future usage, and must +++ * be left at zero. +++ * +++ * This helper is available only if the kernel was compiled with +++ * **CONFIG_NET** configuration option. +++ * Return +++ * Pointer to **struct bpf_sock**, or **NULL** in case of failure. +++ * For sockets with reuseport option, the **struct bpf_sock** +++ * result is from *reuse*\ **->socks**\ [] using the hash of the +++ * tuple. +++ * +++ * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) +++ * Description +++ * Look for UDP socket matching *tuple*, optionally in a child +++ * network namespace *netns*. The return value must be checked, +++ * and if non-**NULL**, released via **bpf_sk_release**\ (). +++ * +++ * The *ctx* should point to the context of the program, such as +++ * the skb or socket (depending on the hook in use). This is used +++ * to determine the base network namespace for the lookup. +++ * +++ * *tuple_size* must be one of: +++ * +++ * **sizeof**\ (*tuple*\ **->ipv4**) +++ * Look for an IPv4 socket. +++ * **sizeof**\ (*tuple*\ **->ipv6**) +++ * Look for an IPv6 socket. +++ * +++ * If the *netns* is a negative signed 32-bit integer, then the +++ * socket lookup table in the netns associated with the *ctx* will +++ * will be used. For the TC hooks, this is the netns of the device +++ * in the skb. 
For socket hooks, this is the netns of the socket.
+++ * If *netns* is any other signed 32-bit value greater than or
+++ * equal to zero then it specifies the ID of the netns relative to
+++ * the netns associated with the *ctx*. *netns* values beyond the
+++ * range of 32-bit integers are reserved for future use.
+++ *
+++ * All values for *flags* are reserved for future usage, and must
+++ * be left at zero.
+++ *
+++ * This helper is available only if the kernel was compiled with
+++ * **CONFIG_NET** configuration option.
+++ * Return
+++ * Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+++ * For sockets with reuseport option, the **struct bpf_sock**
+++ * result is from *reuse*\ **->socks**\ [] using the hash of the
+++ * tuple.
+++ *
+++ * int bpf_sk_release(struct bpf_sock *sock)
+++ * Description
+++ * Release the reference held by *sock*. *sock* must be a
+++ * non-**NULL** pointer that was returned from
+++ * **bpf_sk_lookup_xxx**\ ().
+++ * Return
+++ * 0 on success, or a negative error in case of failure.
+++ *
+++ * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags)
+++ * Description
+++ * Push an element *value* in *map*. *flags* is one of:
+++ *
+++ * **BPF_EXIST**
+++ * If the queue/stack is full, the oldest element is
+++ * removed to make room for this.
+++ * Return
+++ * 0 on success, or a negative error in case of failure.
+++ *
+++ * int bpf_map_pop_elem(struct bpf_map *map, void *value)
+++ * Description
+++ * Pop an element from *map*.
+++ * Return
+++ * 0 on success, or a negative error in case of failure.
+++ *
+++ * int bpf_map_peek_elem(struct bpf_map *map, void *value)
+++ * Description
+++ * Get an element from *map* without removing it.
+++ * Return
+++ * 0 on success, or a negative error in case of failure.
+++ *
+++ * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags)
+++ * Description
+++ * For socket policies, insert *len* bytes into *msg* at offset
+++ * *start*.
+++ *
+++ * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a
+++ * *msg* it may want to insert metadata or options into the *msg*.
+++ * This can later be read and used by any of the lower layer BPF
+++ * hooks.
+++ *
+++ * This helper may fail if under memory pressure (a malloc
+++ * fails); in these cases BPF programs will get an appropriate
+++ * error and will need to handle it.
+++ * Return
+++ * 0 on success, or a negative error in case of failure.
+++ *
+++ * int bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 pop, u64 flags)
+++ * Description
+++ * Will remove *pop* bytes from a *msg* starting at byte *start*.
+++ * This may result in **ENOMEM** errors under certain situations if
+++ * an allocation and copy are required due to a full ring buffer.
+++ * However, the helper will try to avoid doing the allocation
+++ * if possible. Other errors can occur if input parameters are
+++ * invalid, either due to the *start* byte not being a valid part of *msg*
+++ * payload and/or the *pop* value being too large.
+++ * Return
+++ * 0 on success, or a negative error in case of failure.
+++ *
+++ * int bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y)
+++ * Description
+++ * This helper is used in programs implementing IR decoding, to
+++ * report a successfully decoded pointer movement.
+++ *
+++ * The *ctx* should point to the lirc sample as passed into
+++ * the program.
+++ * +++ * This helper is only available is the kernel was compiled with +++ * the **CONFIG_BPF_LIRC_MODE2** configuration option set to +++ * "**y**". +++ * Return +++ * 0 +++ * +++ * int bpf_spin_lock(struct bpf_spin_lock *lock) +++ * Description +++ * Acquire a spinlock represented by the pointer *lock*, which is +++ * stored as part of a value of a map. Taking the lock allows to +++ * safely update the rest of the fields in that value. The +++ * spinlock can (and must) later be released with a call to +++ * **bpf_spin_unlock**\ (\ *lock*\ ). +++ * +++ * Spinlocks in BPF programs come with a number of restrictions +++ * and constraints: +++ * +++ * * **bpf_spin_lock** objects are only allowed inside maps of +++ * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this +++ * list could be extended in the future). +++ * * BTF description of the map is mandatory. +++ * * The BPF program can take ONE lock at a time, since taking two +++ * or more could cause dead locks. +++ * * Only one **struct bpf_spin_lock** is allowed per map element. +++ * * When the lock is taken, calls (either BPF to BPF or helpers) +++ * are not allowed. +++ * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not +++ * allowed inside a spinlock-ed region. +++ * * The BPF program MUST call **bpf_spin_unlock**\ () to release +++ * the lock, on all execution paths, before it returns. +++ * * The BPF program can access **struct bpf_spin_lock** only via +++ * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () +++ * helpers. Loading or storing data into the **struct +++ * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. +++ * * To use the **bpf_spin_lock**\ () helper, the BTF description +++ * of the map value must be a struct and have **struct +++ * bpf_spin_lock** *anyname*\ **;** field at the top level. +++ * Nested lock inside another struct is not allowed. +++ * * The **struct bpf_spin_lock** *lock* field in a map value must +++ * be aligned on a multiple of 4 bytes in that value. +++ * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy +++ * the **bpf_spin_lock** field to user space. +++ * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from +++ * a BPF program, do not update the **bpf_spin_lock** field. +++ * * **bpf_spin_lock** cannot be on the stack or inside a +++ * networking packet (it can only be inside of a map values). +++ * * **bpf_spin_lock** is available to root only. +++ * * Tracing programs and socket filter programs cannot use +++ * **bpf_spin_lock**\ () due to insufficient preemption checks +++ * (but this may change in the future). +++ * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. +++ * Return +++ * 0 +++ * +++ * int bpf_spin_unlock(struct bpf_spin_lock *lock) +++ * Description +++ * Release the *lock* previously locked by a call to +++ * **bpf_spin_lock**\ (\ *lock*\ ). +++ * Return +++ * 0 +++ * +++ * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) +++ * Description +++ * This helper gets a **struct bpf_sock** pointer such +++ * that all the fields in this **bpf_sock** can be accessed. +++ * Return +++ * A **struct bpf_sock** pointer on success, or **NULL** in +++ * case of failure. +++ * +++ * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) +++ * Description +++ * This helper gets a **struct bpf_tcp_sock** pointer from a +++ * **struct bpf_sock** pointer. +++ * Return +++ * A **struct bpf_tcp_sock** pointer on success, or **NULL** in +++ * case of failure. 
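To make the **bpf_spin_lock**\ ()/**bpf_spin_unlock**\ () constraints listed above concrete (not part of the backported patch), here is a minimal sketch that protects a counter inside an array map value. It assumes a networking program type for which the spinlock is permitted (a TC classifier here), BTF-described maps via libbpf, and illustrative map/struct names.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Map value with the struct bpf_spin_lock at the top level, as required. */
struct counter_val {
    struct bpf_spin_lock lock;
    __u64 packets;
};

struct {
    __uint(type, BPF_MAP_TYPE_ARRAY);
    __uint(max_entries, 1);
    __type(key, __u32);
    __type(value, struct counter_val);
} counters SEC(".maps");

SEC("classifier")
int count_locked(struct __sk_buff *skb)
{
    __u32 key = 0;
    struct counter_val *val;

    val = bpf_map_lookup_elem(&counters, &key);
    if (!val)
        return 0;

    /* Take the lock, update the protected field, and release the lock on
     * every path before returning, as the constraints above require. */
    bpf_spin_lock(&val->lock);
    val->packets++;
    bpf_spin_unlock(&val->lock);

    return 0; /* TC_ACT_OK */
}

char _license[] SEC("license") = "GPL";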
+++ *
+++ * int bpf_skb_ecn_set_ce(struct sk_buff *skb)
+++ * Description
+++ * Set ECN (Explicit Congestion Notification) field of IP header
+++ * to **CE** (Congestion Encountered) if current value is **ECT**
+++ * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6
+++ * and IPv4.
+++ * Return
+++ * 1 if the **CE** flag is set (either by the current helper call
+++ * or because it was already present), 0 if it is not set.
+++ *
+++ * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk)
+++ * Description
+++ * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state.
+++ * **bpf_sk_release**\ () is unnecessary and not allowed.
+++ * Return
+++ * A **struct bpf_sock** pointer on success, or **NULL** in
+++ * case of failure.
+++ *
+++ * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags)
+++ * Description
+++ * Look for TCP socket matching *tuple*, optionally in a child
+++ * network namespace *netns*. The return value must be checked,
+++ * and if non-**NULL**, released via **bpf_sk_release**\ ().
+++ *
+++ * This function is identical to **bpf_sk_lookup_tcp**\ (), except
+++ * that it also returns timewait or request sockets. Use
+++ * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the
+++ * full structure.
+++ *
+++ * This helper is available only if the kernel was compiled with
+++ * **CONFIG_NET** configuration option.
+++ * Return
+++ * Pointer to **struct bpf_sock**, or **NULL** in case of failure.
+++ * For sockets with reuseport option, the **struct bpf_sock**
+++ * result is from *reuse*\ **->socks**\ [] using the hash of the
+++ * tuple.
+++ *
+++ * int bpf_tcp_check_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len)
+++ * Description
+++ * Check whether *iph* and *th* contain a valid SYN cookie ACK for
+++ * the listening socket in *sk*.
+++ *
+++ * *iph* points to the start of the IPv4 or IPv6 header, while
+++ * *iph_len* contains **sizeof**\ (**struct iphdr**) or
+++ * **sizeof**\ (**struct ip6hdr**).
+++ *
+++ * *th* points to the start of the TCP header, while *th_len*
+++ * contains **sizeof**\ (**struct tcphdr**).
+++ *
+++ * Return
+++ * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
+++ * error otherwise.
+++ *
+++ * int bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags)
+++ * Description
+++ * Get name of sysctl in /proc/sys/ and copy it into provided by
+++ * program buffer *buf* of size *buf_len*.
+++ *
+++ * The buffer is always NUL terminated, unless it's zero-sized.
+++ *
+++ * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is
+++ * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name
+++ * only (e.g. "tcp_mem").
+++ * Return
+++ * Number of characters copied (not including the trailing NUL).
+++ *
+++ * **-E2BIG** if the buffer wasn't big enough (*buf* will contain
+++ * truncated name in this case).
+++ *
+++ * int bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len)
+++ * Description
+++ * Get current value of sysctl as it is presented in /proc/sys
+++ * (incl. newline, etc), and copy it as a string into provided
+++ * by program buffer *buf* of size *buf_len*.
+++ *
+++ * The whole value is copied, no matter what file position user
+++ * space issued e.g. sys_read at.
+++ *
+++ * The buffer is always NUL terminated, unless it's zero-sized.
+++ * Return
+++ * Number of characters copied (not including the trailing NUL).
+++ * +++ * **-E2BIG** if the buffer wasn't big enough (*buf* will contain +++ * truncated name in this case). +++ * +++ * **-EINVAL** if current value was unavailable, e.g. because +++ * sysctl is uninitialized and read returns -EIO for it. +++ * +++ * int bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) +++ * Description +++ * Get new value being written by user space to sysctl (before +++ * the actual write happens) and copy it as a string into +++ * provided by program buffer *buf* of size *buf_len*. +++ * +++ * User space may write new value at file position > 0. +++ * +++ * The buffer is always NUL terminated, unless it's zero-sized. +++ * Return +++ * Number of character copied (not including the trailing NUL). +++ * +++ * **-E2BIG** if the buffer wasn't big enough (*buf* will contain +++ * truncated name in this case). +++ * +++ * **-EINVAL** if sysctl is being read. +++ * +++ * int bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) +++ * Description +++ * Override new value being written by user space to sysctl with +++ * value provided by program in buffer *buf* of size *buf_len*. +++ * +++ * *buf* should contain a string in same form as provided by user +++ * space on sysctl write. +++ * +++ * User space may write new value at file position > 0. To override +++ * the whole sysctl value file position should be set to zero. +++ * Return +++ * 0 on success. +++ * +++ * **-E2BIG** if the *buf_len* is too big. +++ * +++ * **-EINVAL** if sysctl is being read. +++ * +++ * int bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) +++ * Description +++ * Convert the initial part of the string from buffer *buf* of +++ * size *buf_len* to a long integer according to the given base +++ * and save the result in *res*. +++ * +++ * The string may begin with an arbitrary amount of white space +++ * (as determined by **isspace**\ (3)) followed by a single +++ * optional '**-**' sign. +++ * +++ * Five least significant bits of *flags* encode base, other bits +++ * are currently unused. +++ * +++ * Base must be either 8, 10, 16 or 0 to detect it automatically +++ * similar to user space **strtol**\ (3). +++ * Return +++ * Number of characters consumed on success. Must be positive but +++ * no more than *buf_len*. +++ * +++ * **-EINVAL** if no valid digits were found or unsupported base +++ * was provided. +++ * +++ * **-ERANGE** if resulting value was out of range. +++ * +++ * int bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) +++ * Description +++ * Convert the initial part of the string from buffer *buf* of +++ * size *buf_len* to an unsigned long integer according to the +++ * given base and save the result in *res*. +++ * +++ * The string may begin with an arbitrary amount of white space +++ * (as determined by **isspace**\ (3)). +++ * +++ * Five least significant bits of *flags* encode base, other bits +++ * are currently unused. +++ * +++ * Base must be either 8, 10, 16 or 0 to detect it automatically +++ * similar to user space **strtoul**\ (3). +++ * Return +++ * Number of characters consumed on success. Must be positive but +++ * no more than *buf_len*. +++ * +++ * **-EINVAL** if no valid digits were found or unsupported base +++ * was provided. +++ * +++ * **-ERANGE** if resulting value was out of range. +++ * +++ * void *bpf_sk_storage_get(struct bpf_map *map, struct bpf_sock *sk, void *value, u64 flags) +++ * Description +++ * Get a bpf-local-storage from a *sk*. 
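[Editor's note, not part of the patch] As a hypothetical illustration of the sysctl and string-conversion helpers documented above, a BPF_PROG_TYPE_CGROUP_SYSCTL program can parse the value being written with bpf_strtol() and veto it. This assumes libbpf's bpf_helpers.h; the program name and the limit of 4096 are invented for the example.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup/sysctl")
int sysctl_limit(struct bpf_sysctl *ctx)
{
	char buf[16] = {};
	long val = 0;

	if (!ctx->write)		/* reads are always allowed here */
		return 1;
	/* new value exactly as user space wrote it, NUL terminated */
	if (bpf_sysctl_get_new_value(ctx, buf, sizeof(buf)) < 0)
		return 0;
	/* base 0: auto-detect 8/10/16, as described above */
	if (bpf_strtol(buf, sizeof(buf), 0, &val) < 0)
		return 0;
	return val <= 4096 ? 1 : 0;	/* 1 = allow the write, 0 = reject it */
}

char _license[] SEC("license") = "GPL";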
+++ * +++ * Logically, it could be thought of getting the value from +++ * a *map* with *sk* as the **key**. From this +++ * perspective, the usage is not much different from +++ * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this +++ * helper enforces the key must be a full socket and the map must +++ * be a **BPF_MAP_TYPE_SK_STORAGE** also. +++ * +++ * Underneath, the value is stored locally at *sk* instead of +++ * the *map*. The *map* is used as the bpf-local-storage +++ * "type". The bpf-local-storage "type" (i.e. the *map*) is +++ * searched against all bpf-local-storages residing at *sk*. +++ * +++ * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be +++ * used such that a new bpf-local-storage will be +++ * created if one does not exist. *value* can be used +++ * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify +++ * the initial value of a bpf-local-storage. If *value* is +++ * **NULL**, the new bpf-local-storage will be zero initialized. +++ * Return +++ * A bpf-local-storage pointer is returned on success. +++ * +++ * **NULL** if not found or there was an error in adding +++ * a new bpf-local-storage. +++ * +++ * int bpf_sk_storage_delete(struct bpf_map *map, struct bpf_sock *sk) +++ * Description +++ * Delete a bpf-local-storage from a *sk*. +++ * Return +++ * 0 on success. +++ * +++ * **-ENOENT** if the bpf-local-storage cannot be found. +++ * +++ * int bpf_send_signal(u32 sig) +++ * Description +++ * Send signal *sig* to the current task. +++ * Return +++ * 0 on success or successfully queued. +++ * +++ * **-EBUSY** if work queue under nmi is full. +++ * +++ * **-EINVAL** if *sig* is invalid. +++ * +++ * **-EPERM** if no permission to send the *sig*. +++ * +++ * **-EAGAIN** if bpf program can try again. +++ * +++ * s64 bpf_tcp_gen_syncookie(struct bpf_sock *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) +++ * Description +++ * Try to issue a SYN cookie for the packet with corresponding +++ * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. +++ * +++ * *iph* points to the start of the IPv4 or IPv6 header, while +++ * *iph_len* contains **sizeof**\ (**struct iphdr**) or +++ * **sizeof**\ (**struct ip6hdr**). +++ * +++ * *th* points to the start of the TCP header, while *th_len* +++ * contains the length of the TCP header. +++ * +++ * Return +++ * On success, lower 32 bits hold the generated SYN cookie in +++ * followed by 16 bits which hold the MSS value for that cookie, +++ * and the top 16 bits are unused. 
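[Editor's note, not part of the patch] A hypothetical sketch of the socket-local storage described above: a cgroup_skb program creates a per-socket counter on demand with BPF_SK_STORAGE_GET_F_CREATE. It assumes libbpf's BTF-defined map syntax; struct pkt_stats, sk_stats and count_rx are illustrative names only.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct pkt_stats {
	__u64 rx;
};

struct {
	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);	/* required for SK_STORAGE maps */
	__type(key, int);
	__type(value, struct pkt_stats);
} sk_stats SEC(".maps");

SEC("cgroup_skb/egress")
int count_rx(struct __sk_buff *skb)
{
	struct bpf_sock *sk = skb->sk;
	struct pkt_stats *st;

	if (!sk)
		return 1;
	sk = bpf_sk_fullsock(sk);	/* the storage helpers need a full socket */
	if (!sk)
		return 1;
	/* zero-initialized storage is created if none exists for this socket */
	st = bpf_sk_storage_get(&sk_stats, sk, NULL, BPF_SK_STORAGE_GET_F_CREATE);
	if (st)
		st->rx++;
	return 1;			/* 1 = let the packet through */
}

char _license[] SEC("license") = "GPL";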
+++ * +++ * On failure, the returned value is one of the following: +++ * +++ * **-EINVAL** SYN cookie cannot be issued due to error +++ * +++ * **-ENOENT** SYN cookie should not be issued (no SYN flood) +++ * +++ * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies +++ * +++ * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 +++ */ +++#define __BPF_FUNC_MAPPER(FN) \ +++ FN(unspec), \ +++ FN(map_lookup_elem), \ +++ FN(map_update_elem), \ +++ FN(map_delete_elem), \ +++ FN(probe_read), \ +++ FN(ktime_get_ns), \ +++ FN(trace_printk), \ +++ FN(get_prandom_u32), \ +++ FN(get_smp_processor_id), \ +++ FN(skb_store_bytes), \ +++ FN(l3_csum_replace), \ +++ FN(l4_csum_replace), \ +++ FN(tail_call), \ +++ FN(clone_redirect), \ +++ FN(get_current_pid_tgid), \ +++ FN(get_current_uid_gid), \ +++ FN(get_current_comm), \ +++ FN(get_cgroup_classid), \ +++ FN(skb_vlan_push), \ +++ FN(skb_vlan_pop), \ +++ FN(skb_get_tunnel_key), \ +++ FN(skb_set_tunnel_key), \ +++ FN(perf_event_read), \ +++ FN(redirect), \ +++ FN(get_route_realm), \ +++ FN(perf_event_output), \ +++ FN(skb_load_bytes), \ +++ FN(get_stackid), \ +++ FN(csum_diff), \ +++ FN(skb_get_tunnel_opt), \ +++ FN(skb_set_tunnel_opt), \ +++ FN(skb_change_proto), \ +++ FN(skb_change_type), \ +++ FN(skb_under_cgroup), \ +++ FN(get_hash_recalc), \ +++ FN(get_current_task), \ +++ FN(probe_write_user), \ +++ FN(current_task_under_cgroup), \ +++ FN(skb_change_tail), \ +++ FN(skb_pull_data), \ +++ FN(csum_update), \ +++ FN(set_hash_invalid), \ +++ FN(get_numa_node_id), \ +++ FN(skb_change_head), \ +++ FN(xdp_adjust_head), \ +++ FN(probe_read_str), \ +++ FN(get_socket_cookie), \ +++ FN(get_socket_uid), \ +++ FN(set_hash), \ +++ FN(setsockopt), \ +++ FN(skb_adjust_room), \ +++ FN(redirect_map), \ +++ FN(sk_redirect_map), \ +++ FN(sock_map_update), \ +++ FN(xdp_adjust_meta), \ +++ FN(perf_event_read_value), \ +++ FN(perf_prog_read_value), \ +++ FN(getsockopt), \ +++ FN(override_return), \ +++ FN(sock_ops_cb_flags_set), \ +++ FN(msg_redirect_map), \ +++ FN(msg_apply_bytes), \ +++ FN(msg_cork_bytes), \ +++ FN(msg_pull_data), \ +++ FN(bind), \ +++ FN(xdp_adjust_tail), \ +++ FN(skb_get_xfrm_state), \ +++ FN(get_stack), \ +++ FN(skb_load_bytes_relative), \ +++ FN(fib_lookup), \ +++ FN(sock_hash_update), \ +++ FN(msg_redirect_hash), \ +++ FN(sk_redirect_hash), \ +++ FN(lwt_push_encap), \ +++ FN(lwt_seg6_store_bytes), \ +++ FN(lwt_seg6_adjust_srh), \ +++ FN(lwt_seg6_action), \ +++ FN(rc_repeat), \ +++ FN(rc_keydown), \ +++ FN(skb_cgroup_id), \ +++ FN(get_current_cgroup_id), \ +++ FN(get_local_storage), \ +++ FN(sk_select_reuseport), \ +++ FN(skb_ancestor_cgroup_id), \ +++ FN(sk_lookup_tcp), \ +++ FN(sk_lookup_udp), \ +++ FN(sk_release), \ +++ FN(map_push_elem), \ +++ FN(map_pop_elem), \ +++ FN(map_peek_elem), \ +++ FN(msg_push_data), \ +++ FN(msg_pop_data), \ +++ FN(rc_pointer_rel), \ +++ FN(spin_lock), \ +++ FN(spin_unlock), \ +++ FN(sk_fullsock), \ +++ FN(tcp_sock), \ +++ FN(skb_ecn_set_ce), \ +++ FN(get_listener_sock), \ +++ FN(skc_lookup_tcp), \ +++ FN(tcp_check_syncookie), \ +++ FN(sysctl_get_name), \ +++ FN(sysctl_get_current_value), \ +++ FN(sysctl_get_new_value), \ +++ FN(sysctl_set_new_value), \ +++ FN(strtol), \ +++ FN(strtoul), \ +++ FN(sk_storage_get), \ +++ FN(sk_storage_delete), \ +++ FN(send_signal), \ +++ FN(tcp_gen_syncookie), +++ ++ /* integer value in 'imm' field of BPF_CALL instruction selects which helper ++ * function eBPF program intends to call ++ */ +++#define __BPF_ENUM_FN(x) BPF_FUNC_ ## x ++ enum bpf_func_id { ++- 
BPF_FUNC_unspec, ++- BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ ++- BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ ++- BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ ++- BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ ++- BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ ++- BPF_FUNC_trace_printk, /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */ ++- BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ ++- BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ ++- ++- /** ++- * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet ++- * @skb: pointer to skb ++- * @offset: offset within packet from skb->mac_header ++- * @from: pointer where to copy bytes from ++- * @len: number of bytes to store into packet ++- * @flags: bit 0 - if true, recompute skb->csum ++- * other bits - reserved ++- * Return: 0 on success ++- */ ++- BPF_FUNC_skb_store_bytes, ++- ++- /** ++- * l3_csum_replace(skb, offset, from, to, flags) - recompute IP checksum ++- * @skb: pointer to skb ++- * @offset: offset within packet where IP checksum is located ++- * @from: old value of header field ++- * @to: new value of header field ++- * @flags: bits 0-3 - size of header field ++- * other bits - reserved ++- * Return: 0 on success ++- */ ++- BPF_FUNC_l3_csum_replace, ++- ++- /** ++- * l4_csum_replace(skb, offset, from, to, flags) - recompute TCP/UDP checksum ++- * @skb: pointer to skb ++- * @offset: offset within packet where TCP/UDP checksum is located ++- * @from: old value of header field ++- * @to: new value of header field ++- * @flags: bits 0-3 - size of header field ++- * bit 4 - is pseudo header ++- * other bits - reserved ++- * Return: 0 on success ++- */ ++- BPF_FUNC_l4_csum_replace, +++ __BPF_FUNC_MAPPER(__BPF_ENUM_FN) +++ __BPF_FUNC_MAX_ID, +++}; +++#undef __BPF_ENUM_FN ++ ++- /** ++- * bpf_tail_call(ctx, prog_array_map, index) - jump into another BPF program ++- * @ctx: context pointer passed to next program ++- * @prog_array_map: pointer to map which type is BPF_MAP_TYPE_PROG_ARRAY ++- * @index: index inside array that selects specific program to run ++- * Return: 0 on success ++- */ ++- BPF_FUNC_tail_call, +++/* All flags used by eBPF helper functions, placed here. */ ++ ++- /** ++- * bpf_clone_redirect(skb, ifindex, flags) - redirect to another netdev ++- * @skb: pointer to skb ++- * @ifindex: ifindex of the net device ++- * @flags: bit 0 - if set, redirect to ingress instead of egress ++- * other bits - reserved ++- * Return: 0 on success ++- */ ++- BPF_FUNC_clone_redirect, +++/* BPF_FUNC_skb_store_bytes flags. */ +++#define BPF_F_RECOMPUTE_CSUM (1ULL << 0) +++#define BPF_F_INVALIDATE_HASH (1ULL << 1) ++ ++- /** ++- * u64 bpf_get_current_pid_tgid(void) ++- * Return: current->tgid << 32 | current->pid ++- */ ++- BPF_FUNC_get_current_pid_tgid, +++/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. +++ * First 4 bits are for passing the header field size. +++ */ +++#define BPF_F_HDR_FIELD_MASK 0xfULL ++ ++- /** ++- * u64 bpf_get_current_uid_gid(void) ++- * Return: current_gid << 32 | current_uid ++- */ ++- BPF_FUNC_get_current_uid_gid, +++/* BPF_FUNC_l4_csum_replace flags. */ +++#define BPF_F_PSEUDO_HDR (1ULL << 4) +++#define BPF_F_MARK_MANGLED_0 (1ULL << 5) +++#define BPF_F_MARK_ENFORCE (1ULL << 6) +++ +++/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. 
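[Editor's note, not part of the patch] One payoff of the X-macro form above is that other tables can be generated from the same helper list. A hypothetical user-space sketch, assuming the installed uapi linux/bpf.h exports __BPF_FUNC_MAPPER as added here; func_id_str is an invented name (the kernel's own disassembler uses the same pattern internally).

#include <stdio.h>
#include <linux/bpf.h>

#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = "bpf_" #x
static const char * const func_id_str[] = {
	__BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN)
};
#undef __BPF_FUNC_STR_FN

int main(void)
{
	/* prints "bpf_map_lookup_elem"; adding one FN() line to the mapper
	 * extends both enum bpf_func_id and this table in lockstep. */
	printf("%s\n", func_id_str[BPF_FUNC_map_lookup_elem]);
	return 0;
}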
*/ +++#define BPF_F_INGRESS (1ULL << 0) +++ +++/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ +++#define BPF_F_TUNINFO_IPV6 (1ULL << 0) +++ +++/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ +++#define BPF_F_SKIP_FIELD_MASK 0xffULL +++#define BPF_F_USER_STACK (1ULL << 8) +++/* flags used by BPF_FUNC_get_stackid only. */ +++#define BPF_F_FAST_STACK_CMP (1ULL << 9) +++#define BPF_F_REUSE_STACKID (1ULL << 10) +++/* flags used by BPF_FUNC_get_stack only. */ +++#define BPF_F_USER_BUILD_ID (1ULL << 11) +++ +++/* BPF_FUNC_skb_set_tunnel_key flags. */ +++#define BPF_F_ZERO_CSUM_TX (1ULL << 1) +++#define BPF_F_DONT_FRAGMENT (1ULL << 2) +++#define BPF_F_SEQ_NUMBER (1ULL << 3) ++ ++- /** ++- * bpf_get_current_comm(char *buf, int size_of_buf) ++- * stores current->comm into buf ++- * Return: 0 on success ++- */ ++- BPF_FUNC_get_current_comm, ++- ++- /** ++- * bpf_get_cgroup_classid(skb) - retrieve a proc's classid ++- * @skb: pointer to skb ++- * Return: classid if != 0 ++- */ ++- BPF_FUNC_get_cgroup_classid, ++- BPF_FUNC_skb_vlan_push, /* bpf_skb_vlan_push(skb, vlan_proto, vlan_tci) */ ++- BPF_FUNC_skb_vlan_pop, /* bpf_skb_vlan_pop(skb) */ ++- ++- /** ++- * bpf_skb_[gs]et_tunnel_key(skb, key, size, flags) ++- * retrieve or populate tunnel metadata ++- * @skb: pointer to skb ++- * @key: pointer to 'struct bpf_tunnel_key' ++- * @size: size of 'struct bpf_tunnel_key' ++- * @flags: room for future extensions ++- * Retrun: 0 on success ++- */ ++- BPF_FUNC_skb_get_tunnel_key, ++- BPF_FUNC_skb_set_tunnel_key, ++- BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */ ++- /** ++- * bpf_redirect(ifindex, flags) - redirect to another netdev ++- * @ifindex: ifindex of the net device ++- * @flags: bit 0 - if set, redirect to ingress instead of egress ++- * other bits - reserved ++- * Return: TC_ACT_REDIRECT ++- */ ++- BPF_FUNC_redirect, +++/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and +++ * BPF_FUNC_perf_event_read_value flags. +++ */ +++#define BPF_F_INDEX_MASK 0xffffffffULL +++#define BPF_F_CURRENT_CPU BPF_F_INDEX_MASK +++/* BPF_FUNC_perf_event_output for sk_buff input context. */ +++#define BPF_F_CTXLEN_MASK (0xfffffULL << 32) +++ +++/* Current network namespace */ +++#define BPF_F_CURRENT_NETNS (-1L) +++ +++/* BPF_FUNC_skb_adjust_room flags. */ +++#define BPF_F_ADJ_ROOM_FIXED_GSO (1ULL << 0) +++ +++#define BPF_ADJ_ROOM_ENCAP_L2_MASK 0xff +++#define BPF_ADJ_ROOM_ENCAP_L2_SHIFT 56 +++ +++#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 (1ULL << 1) +++#define BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 (1ULL << 2) +++#define BPF_F_ADJ_ROOM_ENCAP_L4_GRE (1ULL << 3) +++#define BPF_F_ADJ_ROOM_ENCAP_L4_UDP (1ULL << 4) +++#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ +++ BPF_ADJ_ROOM_ENCAP_L2_MASK) \ +++ << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) +++ +++/* BPF_FUNC_sysctl_get_name flags. */ +++#define BPF_F_SYSCTL_BASE_NAME (1ULL << 0) +++ +++/* BPF_FUNC_sk_storage_get flags */ +++#define BPF_SK_STORAGE_GET_F_CREATE (1ULL << 0) +++ +++/* Mode for BPF_FUNC_skb_adjust_room helper. */ +++enum bpf_adj_room_mode { +++ BPF_ADJ_ROOM_NET, +++ BPF_ADJ_ROOM_MAC, +++}; ++ ++- /** ++- * bpf_get_route_realm(skb) - retrieve a dst's tclassid ++- * @skb: pointer to skb ++- * Return: realm if != 0 ++- */ ++- BPF_FUNC_get_route_realm, +++/* Mode for BPF_FUNC_skb_load_bytes_relative helper. 
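[Editor's note, not part of the patch] BPF_F_CURRENT_CPU above is the usual way to index a BPF_MAP_TYPE_PERF_EVENT_ARRAY by the CPU the program runs on. A hypothetical tracing sketch, assuming libbpf's bpf_helpers.h; the event layout and tracepoint are chosen only for illustration.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct event {
	__u32 pid;
	char comm[16];
};

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
} events SEC(".maps");

SEC("tracepoint/syscalls/sys_enter_execve")
int trace_execve(void *ctx)
{
	struct event e = {};

	e.pid = bpf_get_current_pid_tgid() >> 32;
	bpf_get_current_comm(e.comm, sizeof(e.comm));
	/* BPF_F_CURRENT_CPU selects the perf ring buffer of the current CPU */
	bpf_perf_event_output(ctx, &events, BPF_F_CURRENT_CPU, &e, sizeof(e));
	return 0;
}

char _license[] SEC("license") = "GPL";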
*/ +++enum bpf_hdr_start_off { +++ BPF_HDR_START_MAC, +++ BPF_HDR_START_NET, +++}; ++ ++- /** ++- * bpf_perf_event_output(ctx, map, index, data, size) - output perf raw sample ++- * @ctx: struct pt_regs* ++- * @map: pointer to perf_event_array map ++- * @index: index of event in the map ++- * @data: data on stack to be output as raw data ++- * @size: size of data ++- * Return: 0 on success ++- */ ++- BPF_FUNC_perf_event_output, ++- __BPF_FUNC_MAX_ID, +++/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ +++enum bpf_lwt_encap_mode { +++ BPF_LWT_ENCAP_SEG6, +++ BPF_LWT_ENCAP_SEG6_INLINE, +++ BPF_LWT_ENCAP_IP, ++ }; ++ +++#define __bpf_md_ptr(type, name) \ +++union { \ +++ type name; \ +++ __u64 :64; \ +++} __attribute__((aligned(8))) +++ ++ /* user accessible mirror of in-kernel sk_buff. ++ * new fields can only be added to the end of this structure ++ */ ++@@ -291,11 +2985,632 @@ struct __sk_buff { ++ __u32 cb[5]; ++ __u32 hash; ++ __u32 tc_classid; +++ __u32 data; +++ __u32 data_end; +++ __u32 napi_id; +++ +++ /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ +++ __u32 family; +++ __u32 remote_ip4; /* Stored in network byte order */ +++ __u32 local_ip4; /* Stored in network byte order */ +++ __u32 remote_ip6[4]; /* Stored in network byte order */ +++ __u32 local_ip6[4]; /* Stored in network byte order */ +++ __u32 remote_port; /* Stored in network byte order */ +++ __u32 local_port; /* stored in host byte order */ +++ /* ... here. */ +++ +++ __u32 data_meta; +++ __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); +++ __u64 tstamp; +++ __u32 wire_len; +++ __u32 gso_segs; +++ __bpf_md_ptr(struct bpf_sock *, sk); ++ }; ++ ++ struct bpf_tunnel_key { ++ __u32 tunnel_id; ++- __u32 remote_ipv4; +++ union { +++ __u32 remote_ipv4; +++ __u32 remote_ipv6[4]; +++ }; +++ __u8 tunnel_tos; +++ __u8 tunnel_ttl; +++ __u16 tunnel_ext; /* Padding, future use. */ +++ __u32 tunnel_label; +++}; +++ +++/* user accessible mirror of in-kernel xfrm_state. +++ * new fields can only be added to the end of this structure +++ */ +++struct bpf_xfrm_state { +++ __u32 reqid; +++ __u32 spi; /* Stored in network byte order */ +++ __u16 family; +++ __u16 ext; /* Padding, future use. */ +++ union { +++ __u32 remote_ipv4; /* Stored in network byte order */ +++ __u32 remote_ipv6[4]; /* Stored in network byte order */ +++ }; +++}; +++ +++/* Generic BPF return codes which all BPF program types may support. +++ * The values are binary compatible with their TC_ACT_* counter-part to +++ * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT +++ * programs. +++ * +++ * XDP is handled seprately, see XDP_*. +++ */ +++enum bpf_ret_code { +++ BPF_OK = 0, +++ /* 1 reserved */ +++ BPF_DROP = 2, +++ /* 3-6 reserved */ +++ BPF_REDIRECT = 7, +++ /* >127 are reserved for prog type specific return codes. +++ * +++ * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and +++ * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been +++ * changed and should be routed based on its new L3 header. +++ * (This is an L3 redirect, as opposed to L2 redirect +++ * represented by BPF_REDIRECT above). 
+++ */ +++ BPF_LWT_REROUTE = 128, +++}; +++ +++struct bpf_sock { +++ __u32 bound_dev_if; +++ __u32 family; +++ __u32 type; +++ __u32 protocol; +++ __u32 mark; +++ __u32 priority; +++ /* IP address also allows 1 and 2 bytes access */ +++ __u32 src_ip4; +++ __u32 src_ip6[4]; +++ __u32 src_port; /* host byte order */ +++ __u32 dst_port; /* network byte order */ +++ __u32 dst_ip4; +++ __u32 dst_ip6[4]; +++ __u32 state; +++}; +++ +++struct bpf_tcp_sock { +++ __u32 snd_cwnd; /* Sending congestion window */ +++ __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ +++ __u32 rtt_min; +++ __u32 snd_ssthresh; /* Slow start size threshold */ +++ __u32 rcv_nxt; /* What we want to receive next */ +++ __u32 snd_nxt; /* Next sequence we send */ +++ __u32 snd_una; /* First byte we want an ack for */ +++ __u32 mss_cache; /* Cached effective mss, not including SACKS */ +++ __u32 ecn_flags; /* ECN status bits. */ +++ __u32 rate_delivered; /* saved rate sample: packets delivered */ +++ __u32 rate_interval_us; /* saved rate sample: time elapsed */ +++ __u32 packets_out; /* Packets which are "in flight" */ +++ __u32 retrans_out; /* Retransmitted packets out */ +++ __u32 total_retrans; /* Total retransmits for entire connection */ +++ __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn +++ * total number of segments in. +++ */ +++ __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn +++ * total number of data segments in. +++ */ +++ __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut +++ * The total number of segments sent. +++ */ +++ __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut +++ * total number of data segments sent. +++ */ +++ __u32 lost_out; /* Lost packets */ +++ __u32 sacked_out; /* SACK'd packets */ +++ __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived +++ * sum(delta(rcv_nxt)), or how many bytes +++ * were acked. +++ */ +++ __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked +++ * sum(delta(snd_una)), or how many bytes +++ * were acked. +++ */ +++ __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups +++ * total number of DSACK blocks received +++ */ +++ __u32 delivered; /* Total data packets delivered incl. rexmits */ +++ __u32 delivered_ce; /* Like the above but only ECE marked packets */ +++ __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */ +++}; +++ +++struct bpf_sock_tuple { +++ union { +++ struct { +++ __be32 saddr; +++ __be32 daddr; +++ __be16 sport; +++ __be16 dport; +++ } ipv4; +++ struct { +++ __be32 saddr[4]; +++ __be32 daddr[4]; +++ __be16 sport; +++ __be16 dport; +++ } ipv6; +++ }; +++}; +++ +++struct bpf_xdp_sock { +++ __u32 queue_id; +++}; +++ +++#define XDP_PACKET_HEADROOM 256 +++ +++/* User return codes for XDP prog type. +++ * A valid XDP program must return one of these defined values. All other +++ * return codes are reserved for future use. Unknown return codes will +++ * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). 
+++ */ +++enum xdp_action { +++ XDP_ABORTED = 0, +++ XDP_DROP, +++ XDP_PASS, +++ XDP_TX, +++ XDP_REDIRECT, +++}; +++ +++/* user accessible metadata for XDP packet hook +++ * new fields must be added to the end of this structure +++ */ +++struct xdp_md { +++ __u32 data; +++ __u32 data_end; +++ __u32 data_meta; +++ /* Below access go through struct xdp_rxq_info */ +++ __u32 ingress_ifindex; /* rxq->dev->ifindex */ +++ __u32 rx_queue_index; /* rxq->queue_index */ +++}; +++ +++enum sk_action { +++ SK_DROP = 0, +++ SK_PASS, +++}; +++ +++/* user accessible metadata for SK_MSG packet hook, new fields must +++ * be added to the end of this structure +++ */ +++struct sk_msg_md { +++ __bpf_md_ptr(void *, data); +++ __bpf_md_ptr(void *, data_end); +++ +++ __u32 family; +++ __u32 remote_ip4; /* Stored in network byte order */ +++ __u32 local_ip4; /* Stored in network byte order */ +++ __u32 remote_ip6[4]; /* Stored in network byte order */ +++ __u32 local_ip6[4]; /* Stored in network byte order */ +++ __u32 remote_port; /* Stored in network byte order */ +++ __u32 local_port; /* stored in host byte order */ +++ __u32 size; /* Total size of sk_msg */ +++}; +++ +++struct sk_reuseport_md { +++ /* +++ * Start of directly accessible data. It begins from +++ * the tcp/udp header. +++ */ +++ __bpf_md_ptr(void *, data); +++ /* End of directly accessible data */ +++ __bpf_md_ptr(void *, data_end); +++ /* +++ * Total length of packet (starting from the tcp/udp header). +++ * Note that the directly accessible bytes (data_end - data) +++ * could be less than this "len". Those bytes could be +++ * indirectly read by a helper "bpf_skb_load_bytes()". +++ */ +++ __u32 len; +++ /* +++ * Eth protocol in the mac header (network byte order). e.g. +++ * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) +++ */ +++ __u32 eth_protocol; +++ __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ +++ __u32 bind_inany; /* Is sock bound to an INANY address? 
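[Editor's note, not part of the patch] A hypothetical minimal XDP program showing the xdp_md context and the return codes above: it drops frames too short to carry an Ethernet header and passes everything else. Assumes libbpf's bpf_helpers.h.

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_min_len(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;

	/* the verifier requires this bounds check before touching the packet */
	if (data + sizeof(struct ethhdr) > data_end)
		return XDP_DROP;
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";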
*/ +++ __u32 hash; /* A hash of the packet 4 tuples */ +++}; +++ +++#define BPF_TAG_SIZE 8 +++ +++struct bpf_prog_info { +++ __u32 type; +++ __u32 id; +++ __u8 tag[BPF_TAG_SIZE]; +++ __u32 jited_prog_len; +++ __u32 xlated_prog_len; +++ __aligned_u64 jited_prog_insns; +++ __aligned_u64 xlated_prog_insns; +++ __u64 load_time; /* ns since boottime */ +++ __u32 created_by_uid; +++ __u32 nr_map_ids; +++ __aligned_u64 map_ids; +++ char name[BPF_OBJ_NAME_LEN]; +++ __u32 ifindex; +++ __u32 gpl_compatible:1; +++ __u32 :31; /* alignment pad */ +++ __u64 netns_dev; +++ __u64 netns_ino; +++ __u32 nr_jited_ksyms; +++ __u32 nr_jited_func_lens; +++ __aligned_u64 jited_ksyms; +++ __aligned_u64 jited_func_lens; +++ __u32 btf_id; +++ __u32 func_info_rec_size; +++ __aligned_u64 func_info; +++ __u32 nr_func_info; +++ __u32 nr_line_info; +++ __aligned_u64 line_info; +++ __aligned_u64 jited_line_info; +++ __u32 nr_jited_line_info; +++ __u32 line_info_rec_size; +++ __u32 jited_line_info_rec_size; +++ __u32 nr_prog_tags; +++ __aligned_u64 prog_tags; +++ __u64 run_time_ns; +++ __u64 run_cnt; +++} __attribute__((aligned(8))); +++ +++struct bpf_map_info { +++ __u32 type; +++ __u32 id; +++ __u32 key_size; +++ __u32 value_size; +++ __u32 max_entries; +++ __u32 map_flags; +++ char name[BPF_OBJ_NAME_LEN]; +++ __u32 ifindex; +++ __u32 :32; +++ __u64 netns_dev; +++ __u64 netns_ino; +++ __u32 btf_id; +++ __u32 btf_key_type_id; +++ __u32 btf_value_type_id; +++} __attribute__((aligned(8))); +++ +++struct bpf_btf_info { +++ __aligned_u64 btf; +++ __u32 btf_size; +++ __u32 id; +++} __attribute__((aligned(8))); +++ +++/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed +++ * by user and intended to be used by socket (e.g. to bind to, depends on +++ * attach attach type). +++ */ +++struct bpf_sock_addr { +++ __u32 user_family; /* Allows 4-byte read, but no write. */ +++ __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. +++ * Stored in network byte order. +++ */ +++ __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. +++ * Stored in network byte order. +++ */ +++ __u32 user_port; /* Allows 4-byte read and write. +++ * Stored in network byte order +++ */ +++ __u32 family; /* Allows 4-byte read, but no write */ +++ __u32 type; /* Allows 4-byte read, but no write */ +++ __u32 protocol; /* Allows 4-byte read, but no write */ +++ __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. +++ * Stored in network byte order. +++ */ +++ __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. +++ * Stored in network byte order. +++ */ +++ __bpf_md_ptr(struct bpf_sock *, sk); +++}; +++ +++/* User bpf_sock_ops struct to access socket values and specify request ops +++ * and their replies. +++ * Some of this fields are in network (bigendian) byte order and may need +++ * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). 
+++ * New fields can only be added at the end of this structure +++ */ +++struct bpf_sock_ops { +++ __u32 op; +++ union { +++ __u32 args[4]; /* Optionally passed to bpf program */ +++ __u32 reply; /* Returned by bpf program */ +++ __u32 replylong[4]; /* Optionally returned by bpf prog */ +++ }; +++ __u32 family; +++ __u32 remote_ip4; /* Stored in network byte order */ +++ __u32 local_ip4; /* Stored in network byte order */ +++ __u32 remote_ip6[4]; /* Stored in network byte order */ +++ __u32 local_ip6[4]; /* Stored in network byte order */ +++ __u32 remote_port; /* Stored in network byte order */ +++ __u32 local_port; /* stored in host byte order */ +++ __u32 is_fullsock; /* Some TCP fields are only valid if +++ * there is a full socket. If not, the +++ * fields read as zero. +++ */ +++ __u32 snd_cwnd; +++ __u32 srtt_us; /* Averaged RTT << 3 in usecs */ +++ __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ +++ __u32 state; +++ __u32 rtt_min; +++ __u32 snd_ssthresh; +++ __u32 rcv_nxt; +++ __u32 snd_nxt; +++ __u32 snd_una; +++ __u32 mss_cache; +++ __u32 ecn_flags; +++ __u32 rate_delivered; +++ __u32 rate_interval_us; +++ __u32 packets_out; +++ __u32 retrans_out; +++ __u32 total_retrans; +++ __u32 segs_in; +++ __u32 data_segs_in; +++ __u32 segs_out; +++ __u32 data_segs_out; +++ __u32 lost_out; +++ __u32 sacked_out; +++ __u32 sk_txhash; +++ __u64 bytes_received; +++ __u64 bytes_acked; +++ __bpf_md_ptr(struct bpf_sock *, sk); +++}; +++ +++/* Definitions for bpf_sock_ops_cb_flags */ +++#define BPF_SOCK_OPS_RTO_CB_FLAG (1<<0) +++#define BPF_SOCK_OPS_RETRANS_CB_FLAG (1<<1) +++#define BPF_SOCK_OPS_STATE_CB_FLAG (1<<2) +++#define BPF_SOCK_OPS_RTT_CB_FLAG (1<<3) +++#define BPF_SOCK_OPS_ALL_CB_FLAGS 0xF /* Mask of all currently +++ * supported cb flags +++ */ +++ +++/* List of known BPF sock_ops operators. +++ * New entries can only be added at the end +++ */ +++enum { +++ BPF_SOCK_OPS_VOID, +++ BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or +++ * -1 if default value should be used +++ */ +++ BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized +++ * window (in packets) or -1 if default +++ * value should be used +++ */ +++ BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an +++ * active connection is initialized +++ */ +++ BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an +++ * active connection is +++ * established +++ */ +++ BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a +++ * passive connection is +++ * established +++ */ +++ BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control +++ * needs ECN +++ */ +++ BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is +++ * based on the path and may be +++ * dependent on the congestion control +++ * algorithm. In general it indicates +++ * a congestion threshold. RTTs above +++ * this indicate congestion +++ */ +++ BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. +++ * Arg1: value of icsk_retransmits +++ * Arg2: value of icsk_rto +++ * Arg3: whether RTO has expired +++ */ +++ BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. +++ * Arg1: sequence number of 1st byte +++ * Arg2: # segments +++ * Arg3: return value of +++ * tcp_transmit_skb (0 => success) +++ */ +++ BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. +++ * Arg1: old_state +++ * Arg2: new_state +++ */ +++ BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after +++ * socket transition to LISTEN state. +++ */ +++ BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. 
+++ */ +++}; +++ +++/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect +++ * changes between the TCP and BPF versions. Ideally this should never happen. +++ * If it does, we need to add code to convert them before calling +++ * the BPF sock_ops function. +++ */ +++enum { +++ BPF_TCP_ESTABLISHED = 1, +++ BPF_TCP_SYN_SENT, +++ BPF_TCP_SYN_RECV, +++ BPF_TCP_FIN_WAIT1, +++ BPF_TCP_FIN_WAIT2, +++ BPF_TCP_TIME_WAIT, +++ BPF_TCP_CLOSE, +++ BPF_TCP_CLOSE_WAIT, +++ BPF_TCP_LAST_ACK, +++ BPF_TCP_LISTEN, +++ BPF_TCP_CLOSING, /* Now a valid state */ +++ BPF_TCP_NEW_SYN_RECV, +++ +++ BPF_TCP_MAX_STATES /* Leave at the end! */ +++}; +++ +++#define TCP_BPF_IW 1001 /* Set TCP initial congestion window */ +++#define TCP_BPF_SNDCWND_CLAMP 1002 /* Set sndcwnd_clamp */ +++ +++struct bpf_perf_event_value { +++ __u64 counter; +++ __u64 enabled; +++ __u64 running; +++}; +++ +++#define BPF_DEVCG_ACC_MKNOD (1ULL << 0) +++#define BPF_DEVCG_ACC_READ (1ULL << 1) +++#define BPF_DEVCG_ACC_WRITE (1ULL << 2) +++ +++#define BPF_DEVCG_DEV_BLOCK (1ULL << 0) +++#define BPF_DEVCG_DEV_CHAR (1ULL << 1) +++ +++struct bpf_cgroup_dev_ctx { +++ /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ +++ __u32 access_type; +++ __u32 major; +++ __u32 minor; +++}; +++ +++struct bpf_raw_tracepoint_args { +++ __u64 args[0]; +++}; +++ +++/* DIRECT: Skip the FIB rules and go to FIB table associated with device +++ * OUTPUT: Do lookup from egress perspective; default is ingress +++ */ +++#define BPF_FIB_LOOKUP_DIRECT (1U << 0) +++#define BPF_FIB_LOOKUP_OUTPUT (1U << 1) +++ +++enum { +++ BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ +++ BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ +++ BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ +++ BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ +++ BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ +++ BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ +++ BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ +++ BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ +++ BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ +++}; +++ +++struct bpf_fib_lookup { +++ /* input: network family for lookup (AF_INET, AF_INET6) +++ * output: network family of egress nexthop +++ */ +++ __u8 family; +++ +++ /* set if lookup is to consider L4 data - e.g., FIB rules */ +++ __u8 l4_protocol; +++ __be16 sport; +++ __be16 dport; +++ +++ /* total length of packet from network header - used for MTU check */ +++ __u16 tot_len; +++ +++ /* input: L3 device index for lookup +++ * output: device index from FIB lookup +++ */ +++ __u32 ifindex; +++ +++ union { +++ /* inputs to lookup */ +++ __u8 tos; /* AF_INET */ +++ __be32 flowinfo; /* AF_INET6, flow_label + priority */ +++ +++ /* output: metric of fib result (IPv4/IPv6 only) */ +++ __u32 rt_metric; +++ }; +++ +++ union { +++ __be32 ipv4_src; +++ __u32 ipv6_src[4]; /* in6_addr; network order */ +++ }; +++ +++ /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in +++ * network header. 
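[Editor's note, not part of the patch] TCP_BPF_IW and the sock_ops callbacks above are typically combined in a BPF_PROG_TYPE_SOCK_OPS program attached to a cgroup. A hypothetical sketch, assuming libbpf's bpf_helpers.h, that raises the initial congestion window for new active connections; the value 20 and the SOL_TCP fallback define are illustrative.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_TCP
#define SOL_TCP 6	/* same value as IPPROTO_TCP */
#endif

SEC("sockops")
int set_initial_cwnd(struct bpf_sock_ops *skops)
{
	int iw = 20;

	switch (skops->op) {
	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
		/* TCP_BPF_IW: set the initial congestion window, in packets */
		bpf_setsockopt(skops, SOL_TCP, TCP_BPF_IW, &iw, sizeof(iw));
		break;
	default:
		break;
	}
	skops->reply = 0;	/* no error to report back */
	return 1;
}

char _license[] SEC("license") = "GPL";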
output: bpf_fib_lookup sets to gateway address +++ * if FIB lookup returns gateway route +++ */ +++ union { +++ __be32 ipv4_dst; +++ __u32 ipv6_dst[4]; /* in6_addr; network order */ +++ }; +++ +++ /* output */ +++ __be16 h_vlan_proto; +++ __be16 h_vlan_TCI; +++ __u8 smac[6]; /* ETH_ALEN */ +++ __u8 dmac[6]; /* ETH_ALEN */ +++}; +++ +++enum bpf_task_fd_type { +++ BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ +++ BPF_FD_TYPE_TRACEPOINT, /* tp name */ +++ BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ +++ BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ +++ BPF_FD_TYPE_UPROBE, /* filename + offset */ +++ BPF_FD_TYPE_URETPROBE, /* filename + offset */ +++}; +++ +++#define BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG (1U << 0) +++#define BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL (1U << 1) +++#define BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP (1U << 2) +++ +++struct bpf_flow_keys { +++ __u16 nhoff; +++ __u16 thoff; +++ __u16 addr_proto; /* ETH_P_* of valid addrs */ +++ __u8 is_frag; +++ __u8 is_first_frag; +++ __u8 is_encap; +++ __u8 ip_proto; +++ __be16 n_proto; +++ __be16 sport; +++ __be16 dport; +++ union { +++ struct { +++ __be32 ipv4_src; +++ __be32 ipv4_dst; +++ }; +++ struct { +++ __u32 ipv6_src[4]; /* in6_addr; network order */ +++ __u32 ipv6_dst[4]; /* in6_addr; network order */ +++ }; +++ }; +++ __u32 flags; +++ __be32 flow_label; +++}; +++ +++struct bpf_func_info { +++ __u32 insn_off; +++ __u32 type_id; +++}; +++ +++#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) +++#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) +++ +++struct bpf_line_info { +++ __u32 insn_off; +++ __u32 file_name_off; +++ __u32 line_off; +++ __u32 line_col; +++}; +++ +++struct bpf_spin_lock { +++ __u32 val; +++}; +++ +++struct bpf_sysctl { +++ __u32 write; /* Sysctl is being read (= 0) or written (= 1). +++ * Allows 1,2,4-byte read, but no write. +++ */ +++ __u32 file_pos; /* Sysctl file position to read from, write to. +++ * Allows 1,2,4-byte read an 4-byte write. +++ */ +++}; +++ +++struct bpf_sockopt { +++ __bpf_md_ptr(struct bpf_sock *, sk); +++ __bpf_md_ptr(void *, optval); +++ __bpf_md_ptr(void *, optval_end); +++ +++ __s32 level; +++ __s32 optname; +++ __s32 optlen; +++ __s32 retval; ++ }; ++ ++ #endif /* _UAPI__LINUX_BPF_H__ */ ++--- /dev/null +++++ b/include/uapi/linux/bpfilter.h ++@@ -0,0 +1,21 @@ +++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +++#ifndef _UAPI_LINUX_BPFILTER_H +++#define _UAPI_LINUX_BPFILTER_H +++ +++#include +++ +++enum { +++ BPFILTER_IPT_SO_SET_REPLACE = 64, +++ BPFILTER_IPT_SO_SET_ADD_COUNTERS = 65, +++ BPFILTER_IPT_SET_MAX, +++}; +++ +++enum { +++ BPFILTER_IPT_SO_GET_INFO = 64, +++ BPFILTER_IPT_SO_GET_ENTRIES = 65, +++ BPFILTER_IPT_SO_GET_REVISION_MATCH = 66, +++ BPFILTER_IPT_SO_GET_REVISION_TARGET = 67, +++ BPFILTER_IPT_GET_MAX, +++}; +++ +++#endif /* _UAPI_LINUX_BPFILTER_H */ ++--- /dev/null +++++ b/include/uapi/linux/bpf_perf_event.h ++@@ -0,0 +1,19 @@ +++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +++/* Copyright (c) 2016 Facebook +++ * +++ * This program is free software; you can redistribute it and/or +++ * modify it under the terms of version 2 of the GNU General Public +++ * License as published by the Free Software Foundation. 
+++ */ +++#ifndef _UAPI__LINUX_BPF_PERF_EVENT_H__ +++#define _UAPI__LINUX_BPF_PERF_EVENT_H__ +++ +++#include +++ +++struct bpf_perf_event_data { +++ bpf_user_pt_regs_t regs; +++ __u64 sample_period; +++ __u64 addr; +++}; +++ +++#endif /* _UAPI__LINUX_BPF_PERF_EVENT_H__ */ ++--- /dev/null +++++ b/include/uapi/linux/btf.h ++@@ -0,0 +1,165 @@ +++/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +++/* Copyright (c) 2018 Facebook */ +++#ifndef _UAPI__LINUX_BTF_H__ +++#define _UAPI__LINUX_BTF_H__ +++ +++#include +++ +++#define BTF_MAGIC 0xeB9F +++#define BTF_VERSION 1 +++ +++struct btf_header { +++ __u16 magic; +++ __u8 version; +++ __u8 flags; +++ __u32 hdr_len; +++ +++ /* All offsets are in bytes relative to the end of this header */ +++ __u32 type_off; /* offset of type section */ +++ __u32 type_len; /* length of type section */ +++ __u32 str_off; /* offset of string section */ +++ __u32 str_len; /* length of string section */ +++}; +++ +++/* Max # of type identifier */ +++#define BTF_MAX_TYPE 0x000fffff +++/* Max offset into the string section */ +++#define BTF_MAX_NAME_OFFSET 0x00ffffff +++/* Max # of struct/union/enum members or func args */ +++#define BTF_MAX_VLEN 0xffff +++ +++struct btf_type { +++ __u32 name_off; +++ /* "info" bits arrangement +++ * bits 0-15: vlen (e.g. # of struct's members) +++ * bits 16-23: unused +++ * bits 24-27: kind (e.g. int, ptr, array...etc) +++ * bits 28-30: unused +++ * bit 31: kind_flag, currently used by +++ * struct, union and fwd +++ */ +++ __u32 info; +++ /* "size" is used by INT, ENUM, STRUCT, UNION and DATASEC. +++ * "size" tells the size of the type it is describing. +++ * +++ * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, +++ * FUNC, FUNC_PROTO and VAR. +++ * "type" is a type_id referring to another type. +++ */ +++ union { +++ __u32 size; +++ __u32 type; +++ }; +++}; +++ +++#define BTF_INFO_KIND(info) (((info) >> 24) & 0x0f) +++#define BTF_INFO_VLEN(info) ((info) & 0xffff) +++#define BTF_INFO_KFLAG(info) ((info) >> 31) +++ +++#define BTF_KIND_UNKN 0 /* Unknown */ +++#define BTF_KIND_INT 1 /* Integer */ +++#define BTF_KIND_PTR 2 /* Pointer */ +++#define BTF_KIND_ARRAY 3 /* Array */ +++#define BTF_KIND_STRUCT 4 /* Struct */ +++#define BTF_KIND_UNION 5 /* Union */ +++#define BTF_KIND_ENUM 6 /* Enumeration */ +++#define BTF_KIND_FWD 7 /* Forward */ +++#define BTF_KIND_TYPEDEF 8 /* Typedef */ +++#define BTF_KIND_VOLATILE 9 /* Volatile */ +++#define BTF_KIND_CONST 10 /* Const */ +++#define BTF_KIND_RESTRICT 11 /* Restrict */ +++#define BTF_KIND_FUNC 12 /* Function */ +++#define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ +++#define BTF_KIND_VAR 14 /* Variable */ +++#define BTF_KIND_DATASEC 15 /* Section */ +++#define BTF_KIND_MAX BTF_KIND_DATASEC +++#define NR_BTF_KINDS (BTF_KIND_MAX + 1) +++ +++/* For some specific BTF_KIND, "struct btf_type" is immediately +++ * followed by extra data. +++ */ +++ +++/* BTF_KIND_INT is followed by a u32 and the following +++ * is the 32 bits arrangement: +++ */ +++#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) +++#define BTF_INT_OFFSET(VAL) (((VAL) & 0x00ff0000) >> 16) +++#define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) +++ +++/* Attributes stored in the BTF_INT_ENCODING */ +++#define BTF_INT_SIGNED (1 << 0) +++#define BTF_INT_CHAR (1 << 1) +++#define BTF_INT_BOOL (1 << 2) +++ +++/* BTF_KIND_ENUM is followed by multiple "struct btf_enum". +++ * The exact number of btf_enum is stored in the vlen (of the +++ * info in "struct btf_type"). 
+++ */ +++struct btf_enum { +++ __u32 name_off; +++ __s32 val; +++}; +++ +++/* BTF_KIND_ARRAY is followed by one "struct btf_array" */ +++struct btf_array { +++ __u32 type; +++ __u32 index_type; +++ __u32 nelems; +++}; +++ +++/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed +++ * by multiple "struct btf_member". The exact number +++ * of btf_member is stored in the vlen (of the info in +++ * "struct btf_type"). +++ */ +++struct btf_member { +++ __u32 name_off; +++ __u32 type; +++ /* If the type info kind_flag is set, the btf_member offset +++ * contains both member bitfield size and bit offset. The +++ * bitfield size is set for bitfield members. If the type +++ * info kind_flag is not set, the offset contains only bit +++ * offset. +++ */ +++ __u32 offset; +++}; +++ +++/* If the struct/union type info kind_flag is set, the +++ * following two macros are used to access bitfield_size +++ * and bit_offset from btf_member.offset. +++ */ +++#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) +++#define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) +++ +++/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". +++ * The exact number of btf_param is stored in the vlen (of the +++ * info in "struct btf_type"). +++ */ +++struct btf_param { +++ __u32 name_off; +++ __u32 type; +++}; +++ +++enum { +++ BTF_VAR_STATIC = 0, +++ BTF_VAR_GLOBAL_ALLOCATED, +++}; +++ +++/* BTF_KIND_VAR is followed by a single "struct btf_var" to describe +++ * additional information related to the variable such as its linkage. +++ */ +++struct btf_var { +++ __u32 linkage; +++}; +++ +++/* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo" +++ * to describe all BTF_KIND_VAR types it contains along with it's +++ * in-section offset as well as size. +++ */ +++struct btf_var_secinfo { +++ __u32 type; +++ __u32 offset; +++ __u32 size; +++}; +++ +++#endif /* _UAPI__LINUX_BTF_H__ */ ++--- a/kernel/bpf/arraymap.c +++++ b/kernel/bpf/arraymap.c ++@@ -1,78 +1,141 @@ +++// SPDX-License-Identifier: GPL-2.0-only ++ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com ++- * ++- * This program is free software; you can redistribute it and/or ++- * modify it under the terms of version 2 of the GNU General Public ++- * License as published by the Free Software Foundation. ++- * ++- * This program is distributed in the hope that it will be useful, but ++- * WITHOUT ANY WARRANTY; without even the implied warranty of ++- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++- * General Public License for more details. 
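[Editor's note, not part of the patch] To make the info bit layout above concrete, here is a hypothetical user-space snippet that decodes a struct btf_type with the accessor macros this header defines; dump_btf_type and the sample values are invented for illustration.

#include <stdio.h>
#include <linux/btf.h>

/* Print the kind, vlen and kind_flag packed into btf_type.info. */
static void dump_btf_type(const struct btf_type *t)
{
	printf("kind=%u vlen=%u kind_flag=%u size_or_type=%u\n",
	       BTF_INFO_KIND(t->info),		/* bits 24-27 */
	       BTF_INFO_VLEN(t->info),		/* bits 0-15  */
	       BTF_INFO_KFLAG(t->info),		/* bit 31     */
	       t->size);			/* union with t->type */
}

int main(void)
{
	struct btf_type t = {
		.info = (BTF_KIND_STRUCT << 24) | 2,	/* a struct with 2 members */
		.size = 16,
	};

	dump_btf_type(&t);
	return 0;
}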
+++ * Copyright (c) 2016,2017 Facebook ++ */ ++ #include +++#include ++ #include ++-#include ++ #include ++ #include ++ #include ++ #include +++#include +++ +++#include "map_in_map.h" +++ +++#define ARRAY_CREATE_FLAG_MASK \ +++ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) +++ +++static void bpf_array_free_percpu(struct bpf_array *array) +++{ +++ int i; +++ +++ for (i = 0; i < array->map.max_entries; i++) { +++ free_percpu(array->pptrs[i]); +++ cond_resched(); +++ } +++} +++ +++static int bpf_array_alloc_percpu(struct bpf_array *array) +++{ +++ void __percpu *ptr; +++ int i; +++ +++ for (i = 0; i < array->map.max_entries; i++) { +++ ptr = __alloc_percpu_gfp(array->elem_size, 8, +++ GFP_USER | __GFP_NOWARN); +++ if (!ptr) { +++ bpf_array_free_percpu(array); +++ return -ENOMEM; +++ } +++ array->pptrs[i] = ptr; +++ cond_resched(); +++ } +++ +++ return 0; +++} ++ ++ /* Called from syscall */ ++-static struct bpf_map *array_map_alloc(union bpf_attr *attr) +++int array_map_alloc_check(union bpf_attr *attr) ++ { ++- struct bpf_array *array; ++- u32 elem_size, array_size; ++- u32 index_mask, max_entries; ++- bool unpriv = !capable(CAP_SYS_ADMIN); +++ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; +++ int numa_node = bpf_map_attr_numa_node(attr); ++ ++ /* check sanity of attributes */ ++ if (attr->max_entries == 0 || attr->key_size != 4 || ++- attr->value_size == 0) ++- return ERR_PTR(-EINVAL); +++ attr->value_size == 0 || +++ attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || +++ !bpf_map_flags_access_ok(attr->map_flags) || +++ (percpu && numa_node != NUMA_NO_NODE)) +++ return -EINVAL; ++ ++- if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) +++ if (attr->value_size > KMALLOC_MAX_SIZE) ++ /* if value_size is bigger, the user space won't be able to ++ * access the elements. ++ */ ++- return ERR_PTR(-E2BIG); +++ return -E2BIG; +++ +++ return 0; +++} +++ +++static struct bpf_map *array_map_alloc(union bpf_attr *attr) +++{ +++ bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; +++ int ret, numa_node = bpf_map_attr_numa_node(attr); +++ u32 elem_size, index_mask, max_entries; +++ bool unpriv = !capable(CAP_SYS_ADMIN); +++ u64 cost, array_size, mask64; +++ struct bpf_map_memory mem; +++ struct bpf_array *array; ++ ++ elem_size = round_up(attr->value_size, 8); ++ ++ max_entries = attr->max_entries; ++- index_mask = roundup_pow_of_two(max_entries) - 1; ++ ++- if (unpriv) +++ /* On 32 bit archs roundup_pow_of_two() with max_entries that has +++ * upper most bit set in u32 space is undefined behavior due to +++ * resulting 1U << 32, so do it manually here in u64 space. +++ */ +++ mask64 = fls_long(max_entries - 1); +++ mask64 = 1ULL << mask64; +++ mask64 -= 1; +++ +++ index_mask = mask64; +++ if (unpriv) { ++ /* round up array size to nearest power of 2, ++ * since cpu will speculate within index_mask limits ++ */ ++ max_entries = index_mask + 1; +++ /* Check for overflows. 
*/ +++ if (max_entries < attr->max_entries) +++ return ERR_PTR(-E2BIG); +++ } ++ ++- ++- /* check round_up into zero and u32 overflow */ ++- if (elem_size == 0 || ++- attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) ++- return ERR_PTR(-ENOMEM); ++- ++- array_size = sizeof(*array) + max_entries * elem_size; +++ array_size = sizeof(*array); +++ if (percpu) +++ array_size += (u64) max_entries * sizeof(void *); +++ else +++ array_size += (u64) max_entries * elem_size; +++ +++ /* make sure there is no u32 overflow later in round_up() */ +++ cost = array_size; +++ if (percpu) +++ cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); +++ +++ ret = bpf_map_charge_init(&mem, cost); +++ if (ret < 0) +++ return ERR_PTR(ret); ++ ++ /* allocate all map elements and zero-initialize them */ ++- array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); +++ array = bpf_map_area_alloc(array_size, numa_node); ++ if (!array) { ++- array = vzalloc(array_size); ++- if (!array) ++- return ERR_PTR(-ENOMEM); +++ bpf_map_charge_finish(&mem); +++ return ERR_PTR(-ENOMEM); ++ } ++- ++ array->index_mask = index_mask; ++ array->map.unpriv_array = unpriv; ++ ++ /* copy mandatory map attributes */ ++- array->map.key_size = attr->key_size; ++- array->map.value_size = attr->value_size; ++- array->map.max_entries = attr->max_entries; ++- array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; +++ bpf_map_init_from_attr(&array->map, attr); +++ bpf_map_charge_move(&array->map.memory, &mem); ++ array->elem_size = elem_size; ++ +++ if (percpu && bpf_array_alloc_percpu(array)) { +++ bpf_map_charge_finish(&array->map.memory); +++ bpf_map_area_free(array); +++ return ERR_PTR(-ENOMEM); +++ } +++ ++ return &array->map; ++ } ++ ++@@ -82,17 +145,115 @@ static void *array_map_lookup_elem(struc ++ struct bpf_array *array = container_of(map, struct bpf_array, map); ++ u32 index = *(u32 *)key; ++ ++- if (index >= array->map.max_entries) +++ if (unlikely(index >= array->map.max_entries)) ++ return NULL; ++ ++ return array->value + array->elem_size * (index & array->index_mask); ++ } ++ +++static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm, +++ u32 off) +++{ +++ struct bpf_array *array = container_of(map, struct bpf_array, map); +++ +++ if (map->max_entries != 1) +++ return -ENOTSUPP; +++ if (off >= map->value_size) +++ return -EINVAL; +++ +++ *imm = (unsigned long)array->value; +++ return 0; +++} +++ +++static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm, +++ u32 *off) +++{ +++ struct bpf_array *array = container_of(map, struct bpf_array, map); +++ u64 base = (unsigned long)array->value; +++ u64 range = array->elem_size; +++ +++ if (map->max_entries != 1) +++ return -ENOTSUPP; +++ if (imm < base || imm >= base + range) +++ return -ENOENT; +++ +++ *off = imm - base; +++ return 0; +++} +++ +++/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ +++static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +++{ +++ struct bpf_array *array = container_of(map, struct bpf_array, map); +++ struct bpf_insn *insn = insn_buf; +++ u32 elem_size = round_up(map->value_size, 8); +++ const int ret = BPF_REG_0; +++ const int map_ptr = BPF_REG_1; +++ const int index = BPF_REG_2; +++ +++ *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); +++ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); +++ if (map->unpriv_array) { +++ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); +++ *insn++ = 
BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); +++ } else { +++ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); +++ } +++ +++ if (is_power_of_2(elem_size)) { +++ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); +++ } else { +++ *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); +++ } +++ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); +++ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); +++ *insn++ = BPF_MOV64_IMM(ret, 0); +++ return insn - insn_buf; +++} +++ +++/* Called from eBPF program */ +++static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) +++{ +++ struct bpf_array *array = container_of(map, struct bpf_array, map); +++ u32 index = *(u32 *)key; +++ +++ if (unlikely(index >= array->map.max_entries)) +++ return NULL; +++ +++ return this_cpu_ptr(array->pptrs[index & array->index_mask]); +++} +++ +++int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +++{ +++ struct bpf_array *array = container_of(map, struct bpf_array, map); +++ u32 index = *(u32 *)key; +++ void __percpu *pptr; +++ int cpu, off = 0; +++ u32 size; +++ +++ if (unlikely(index >= array->map.max_entries)) +++ return -ENOENT; +++ +++ /* per_cpu areas are zero-filled and bpf programs can only +++ * access 'value_size' of them, so copying rounded areas +++ * will not leak any kernel data +++ */ +++ size = round_up(map->value_size, 8); +++ rcu_read_lock(); +++ pptr = array->pptrs[index & array->index_mask]; +++ for_each_possible_cpu(cpu) { +++ bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); +++ off += size; +++ } +++ rcu_read_unlock(); +++ return 0; +++} +++ ++ /* Called from syscall */ ++ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) ++ { ++ struct bpf_array *array = container_of(map, struct bpf_array, map); ++- u32 index = *(u32 *)key; +++ u32 index = key ? 
*(u32 *)key : U32_MAX; ++ u32 *next = (u32 *)next_key; ++ ++ if (index >= array->map.max_entries) { ++@@ -113,22 +274,73 @@ static int array_map_update_elem(struct ++ { ++ struct bpf_array *array = container_of(map, struct bpf_array, map); ++ u32 index = *(u32 *)key; +++ char *val; ++ ++- if (map_flags > BPF_EXIST) +++ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) ++ /* unknown flags */ ++ return -EINVAL; ++ ++- if (index >= array->map.max_entries) +++ if (unlikely(index >= array->map.max_entries)) +++ /* all elements were pre-allocated, cannot insert a new one */ +++ return -E2BIG; +++ +++ if (unlikely(map_flags & BPF_NOEXIST)) +++ /* all elements already exist */ +++ return -EEXIST; +++ +++ if (unlikely((map_flags & BPF_F_LOCK) && +++ !map_value_has_spin_lock(map))) +++ return -EINVAL; +++ +++ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { +++ memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), +++ value, map->value_size); +++ } else { +++ val = array->value + +++ array->elem_size * (index & array->index_mask); +++ if (map_flags & BPF_F_LOCK) +++ copy_map_value_locked(map, val, value, false); +++ else +++ copy_map_value(map, val, value); +++ } +++ return 0; +++} +++ +++int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, +++ u64 map_flags) +++{ +++ struct bpf_array *array = container_of(map, struct bpf_array, map); +++ u32 index = *(u32 *)key; +++ void __percpu *pptr; +++ int cpu, off = 0; +++ u32 size; +++ +++ if (unlikely(map_flags > BPF_EXIST)) +++ /* unknown flags */ +++ return -EINVAL; +++ +++ if (unlikely(index >= array->map.max_entries)) ++ /* all elements were pre-allocated, cannot insert a new one */ ++ return -E2BIG; ++ ++- if (map_flags == BPF_NOEXIST) +++ if (unlikely(map_flags == BPF_NOEXIST)) ++ /* all elements already exist */ ++ return -EEXIST; ++ ++- memcpy(array->value + ++- array->elem_size * (index & array->index_mask), ++- value, map->value_size); +++ /* the user space will provide round_up(value_size, 8) bytes that +++ * will be copied into per-cpu area. bpf programs can only access +++ * value_size of it. 
During lookup the same extra bytes will be +++ * returned or zeros which were zero-filled by percpu_alloc, +++ * so no kernel data leaks possible +++ */ +++ size = round_up(map->value_size, 8); +++ rcu_read_lock(); +++ pptr = array->pptrs[index & array->index_mask]; +++ for_each_possible_cpu(cpu) { +++ bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); +++ off += size; +++ } +++ rcu_read_unlock(); ++ return 0; ++ } ++ ++@@ -150,36 +362,124 @@ static void array_map_free(struct bpf_ma ++ */ ++ synchronize_rcu(); ++ ++- kvfree(array); +++ if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) +++ bpf_array_free_percpu(array); +++ +++ bpf_map_area_free(array); ++ } ++ ++-static const struct bpf_map_ops array_ops = { +++static void array_map_seq_show_elem(struct bpf_map *map, void *key, +++ struct seq_file *m) +++{ +++ void *value; +++ +++ rcu_read_lock(); +++ +++ value = array_map_lookup_elem(map, key); +++ if (!value) { +++ rcu_read_unlock(); +++ return; +++ } +++ +++ if (map->btf_key_type_id) +++ seq_printf(m, "%u: ", *(u32 *)key); +++ btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); +++ seq_puts(m, "\n"); +++ +++ rcu_read_unlock(); +++} +++ +++static void percpu_array_map_seq_show_elem(struct bpf_map *map, void *key, +++ struct seq_file *m) +++{ +++ struct bpf_array *array = container_of(map, struct bpf_array, map); +++ u32 index = *(u32 *)key; +++ void __percpu *pptr; +++ int cpu; +++ +++ rcu_read_lock(); +++ +++ seq_printf(m, "%u: {\n", *(u32 *)key); +++ pptr = array->pptrs[index & array->index_mask]; +++ for_each_possible_cpu(cpu) { +++ seq_printf(m, "\tcpu%d: ", cpu); +++ btf_type_seq_show(map->btf, map->btf_value_type_id, +++ per_cpu_ptr(pptr, cpu), m); +++ seq_puts(m, "\n"); +++ } +++ seq_puts(m, "}\n"); +++ +++ rcu_read_unlock(); +++} +++ +++static int array_map_check_btf(const struct bpf_map *map, +++ const struct btf *btf, +++ const struct btf_type *key_type, +++ const struct btf_type *value_type) +++{ +++ u32 int_data; +++ +++ /* One exception for keyless BTF: .bss/.data/.rodata map */ +++ if (btf_type_is_void(key_type)) { +++ if (map->map_type != BPF_MAP_TYPE_ARRAY || +++ map->max_entries != 1) +++ return -EINVAL; +++ +++ if (BTF_INFO_KIND(value_type->info) != BTF_KIND_DATASEC) +++ return -EINVAL; +++ +++ return 0; +++ } +++ +++ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) +++ return -EINVAL; +++ +++ int_data = *(u32 *)(key_type + 1); +++ /* bpf array can only take a u32 key. This check makes sure +++ * that the btf matches the attr used during map_create. 
+++ */ +++ if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data)) +++ return -EINVAL; +++ +++ return 0; +++} +++ +++const struct bpf_map_ops array_map_ops = { +++ .map_alloc_check = array_map_alloc_check, ++ .map_alloc = array_map_alloc, ++ .map_free = array_map_free, ++ .map_get_next_key = array_map_get_next_key, ++ .map_lookup_elem = array_map_lookup_elem, ++ .map_update_elem = array_map_update_elem, ++ .map_delete_elem = array_map_delete_elem, +++ .map_gen_lookup = array_map_gen_lookup, +++ .map_direct_value_addr = array_map_direct_value_addr, +++ .map_direct_value_meta = array_map_direct_value_meta, +++ .map_seq_show_elem = array_map_seq_show_elem, +++ .map_check_btf = array_map_check_btf, ++ }; ++ ++-static struct bpf_map_type_list array_type __read_mostly = { ++- .ops = &array_ops, ++- .type = BPF_MAP_TYPE_ARRAY, +++const struct bpf_map_ops percpu_array_map_ops = { +++ .map_alloc_check = array_map_alloc_check, +++ .map_alloc = array_map_alloc, +++ .map_free = array_map_free, +++ .map_get_next_key = array_map_get_next_key, +++ .map_lookup_elem = percpu_array_map_lookup_elem, +++ .map_update_elem = array_map_update_elem, +++ .map_delete_elem = array_map_delete_elem, +++ .map_seq_show_elem = percpu_array_map_seq_show_elem, +++ .map_check_btf = array_map_check_btf, ++ }; ++ ++-static int __init register_array_map(void) ++-{ ++- bpf_register_map_type(&array_type); ++- return 0; ++-} ++-late_initcall(register_array_map); ++- ++-static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) +++static int fd_array_map_alloc_check(union bpf_attr *attr) ++ { ++ /* only file descriptors can be stored in this type of map */ ++ if (attr->value_size != sizeof(u32)) ++- return ERR_PTR(-EINVAL); ++- return array_map_alloc(attr); +++ return -EINVAL; +++ /* Program read-only/write-only not supported for special maps yet. 
*/ +++ if (attr->map_flags & (BPF_F_RDONLY_PROG | BPF_F_WRONLY_PROG)) +++ return -EINVAL; +++ return array_map_alloc_check(attr); ++ } ++ ++ static void fd_array_map_free(struct bpf_map *map) ++@@ -192,17 +492,38 @@ static void fd_array_map_free(struct bpf ++ /* make sure it's empty */ ++ for (i = 0; i < array->map.max_entries; i++) ++ BUG_ON(array->ptrs[i] != NULL); ++- kvfree(array); +++ +++ bpf_map_area_free(array); ++ } ++ ++ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key) ++ { ++- return NULL; +++ return ERR_PTR(-EOPNOTSUPP); +++} +++ +++/* only called from syscall */ +++int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) +++{ +++ void **elem, *ptr; +++ int ret = 0; +++ +++ if (!map->ops->map_fd_sys_lookup_elem) +++ return -ENOTSUPP; +++ +++ rcu_read_lock(); +++ elem = array_map_lookup_elem(map, key); +++ if (elem && (ptr = READ_ONCE(*elem))) +++ *value = map->ops->map_fd_sys_lookup_elem(ptr); +++ else +++ ret = -ENOENT; +++ rcu_read_unlock(); +++ +++ return ret; ++ } ++ ++ /* only called from syscall */ ++-static int fd_array_map_update_elem(struct bpf_map *map, void *key, ++- void *value, u64 map_flags) +++int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, +++ void *key, void *value, u64 map_flags) ++ { ++ struct bpf_array *array = container_of(map, struct bpf_array, map); ++ void *new_ptr, *old_ptr; ++@@ -215,7 +536,7 @@ static int fd_array_map_update_elem(stru ++ return -E2BIG; ++ ++ ufd = *(u32 *)value; ++- new_ptr = map->ops->map_fd_get_ptr(map, ufd); +++ new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); ++ if (IS_ERR(new_ptr)) ++ return PTR_ERR(new_ptr); ++ ++@@ -244,10 +565,12 @@ static int fd_array_map_delete_elem(stru ++ } ++ } ++ ++-static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd) +++static void *prog_fd_array_get_ptr(struct bpf_map *map, +++ struct file *map_file, int fd) ++ { ++ struct bpf_array *array = container_of(map, struct bpf_array, map); ++ struct bpf_prog *prog = bpf_prog_get(fd); +++ ++ if (IS_ERR(prog)) ++ return prog; ++ ++@@ -255,18 +578,22 @@ static void *prog_fd_array_get_ptr(struc ++ bpf_prog_put(prog); ++ return ERR_PTR(-EINVAL); ++ } +++ ++ return prog; ++ } ++ ++ static void prog_fd_array_put_ptr(void *ptr) ++ { ++- struct bpf_prog *prog = ptr; +++ bpf_prog_put(ptr); +++} ++ ++- bpf_prog_put_rcu(prog); +++static u32 prog_fd_array_sys_lookup_elem(void *ptr) +++{ +++ return ((struct bpf_prog *)ptr)->aux->id; ++ } ++ ++ /* decrement refcnt of all bpf_progs that are stored in this map */ ++-void bpf_fd_array_map_clear(struct bpf_map *map) +++static void bpf_fd_array_map_clear(struct bpf_map *map) ++ { ++ struct bpf_array *array = container_of(map, struct bpf_array, map); ++ int i; ++@@ -275,91 +602,208 @@ void bpf_fd_array_map_clear(struct bpf_m ++ fd_array_map_delete_elem(map, &i); ++ } ++ ++-static const struct bpf_map_ops prog_array_ops = { ++- .map_alloc = fd_array_map_alloc, +++static void prog_array_map_seq_show_elem(struct bpf_map *map, void *key, +++ struct seq_file *m) +++{ +++ void **elem, *ptr; +++ u32 prog_id; +++ +++ rcu_read_lock(); +++ +++ elem = array_map_lookup_elem(map, key); +++ if (elem) { +++ ptr = READ_ONCE(*elem); +++ if (ptr) { +++ seq_printf(m, "%u: ", *(u32 *)key); +++ prog_id = prog_fd_array_sys_lookup_elem(ptr); +++ btf_type_seq_show(map->btf, map->btf_value_type_id, +++ &prog_id, m); +++ seq_puts(m, "\n"); +++ } +++ } +++ +++ rcu_read_unlock(); +++} +++ +++const struct bpf_map_ops prog_array_map_ops = { +++ .map_alloc_check = 
fd_array_map_alloc_check, +++ .map_alloc = array_map_alloc, ++ .map_free = fd_array_map_free, ++ .map_get_next_key = array_map_get_next_key, ++ .map_lookup_elem = fd_array_map_lookup_elem, ++- .map_update_elem = fd_array_map_update_elem, ++ .map_delete_elem = fd_array_map_delete_elem, ++ .map_fd_get_ptr = prog_fd_array_get_ptr, ++ .map_fd_put_ptr = prog_fd_array_put_ptr, +++ .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem, +++ .map_release_uref = bpf_fd_array_map_clear, +++ .map_seq_show_elem = prog_array_map_seq_show_elem, ++ }; ++ ++-static struct bpf_map_type_list prog_array_type __read_mostly = { ++- .ops = &prog_array_ops, ++- .type = BPF_MAP_TYPE_PROG_ARRAY, ++-}; +++static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, +++ struct file *map_file) +++{ +++ struct bpf_event_entry *ee; +++ +++ ee = kzalloc(sizeof(*ee), GFP_ATOMIC); +++ if (ee) { +++ ee->event = perf_file->private_data; +++ ee->perf_file = perf_file; +++ ee->map_file = map_file; +++ } ++ ++-static int __init register_prog_array_map(void) +++ return ee; +++} +++ +++static void __bpf_event_entry_free(struct rcu_head *rcu) ++ { ++- bpf_register_map_type(&prog_array_type); ++- return 0; +++ struct bpf_event_entry *ee; +++ +++ ee = container_of(rcu, struct bpf_event_entry, rcu); +++ fput(ee->perf_file); +++ kfree(ee); ++ } ++-late_initcall(register_prog_array_map); ++ ++-static void perf_event_array_map_free(struct bpf_map *map) +++static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee) ++ { ++- bpf_fd_array_map_clear(map); ++- fd_array_map_free(map); +++ call_rcu(&ee->rcu, __bpf_event_entry_free); ++ } ++ ++-static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) +++static void *perf_event_fd_array_get_ptr(struct bpf_map *map, +++ struct file *map_file, int fd) ++ { +++ struct bpf_event_entry *ee; ++ struct perf_event *event; ++- const struct perf_event_attr *attr; +++ struct file *perf_file; +++ u64 value; ++ ++ event = perf_event_get(fd); ++ if (IS_ERR(event)) ++ return event; ++ ++- attr = perf_event_attrs(event); ++- if (IS_ERR(attr)) ++- goto err; ++- ++- if (attr->inherit) ++- goto err; +++ value = perf_event_read_local(event); ++ ++- if (attr->type == PERF_TYPE_RAW) ++- return event; ++- ++- if (attr->type == PERF_TYPE_HARDWARE) ++- return event; +++ ee = bpf_event_entry_gen(perf_file, map_file); +++ if (ee) +++ return ee; ++ ++- if (attr->type == PERF_TYPE_SOFTWARE && ++- attr->config == PERF_COUNT_SW_BPF_OUTPUT) ++- return event; ++-err: ++- perf_event_release_kernel(event); ++- return ERR_PTR(-EINVAL); +++ ee = ERR_PTR(-ENOMEM); +++ return ee; ++ } ++ ++ static void perf_event_fd_array_put_ptr(void *ptr) ++ { ++- struct perf_event *event = ptr; +++ bpf_event_entry_free_rcu(ptr); +++} ++ ++- perf_event_release_kernel(event); +++static void perf_event_fd_array_release(struct bpf_map *map, +++ struct file *map_file) +++{ +++ struct bpf_array *array = container_of(map, struct bpf_array, map); +++ struct bpf_event_entry *ee; +++ int i; +++ +++ rcu_read_lock(); +++ for (i = 0; i < array->map.max_entries; i++) { +++ ee = READ_ONCE(array->ptrs[i]); +++ if (ee && ee->map_file == map_file) +++ fd_array_map_delete_elem(map, &i); +++ } +++ rcu_read_unlock(); ++ } ++ ++-static const struct bpf_map_ops perf_event_array_ops = { ++- .map_alloc = fd_array_map_alloc, ++- .map_free = perf_event_array_map_free, +++const struct bpf_map_ops perf_event_array_map_ops = { +++ .map_alloc_check = fd_array_map_alloc_check, +++ .map_alloc = array_map_alloc, +++ .map_free = fd_array_map_free, 
++ .map_get_next_key = array_map_get_next_key, ++ .map_lookup_elem = fd_array_map_lookup_elem, ++- .map_update_elem = fd_array_map_update_elem, ++ .map_delete_elem = fd_array_map_delete_elem, ++ .map_fd_get_ptr = perf_event_fd_array_get_ptr, ++ .map_fd_put_ptr = perf_event_fd_array_put_ptr, +++ .map_release = perf_event_fd_array_release, +++ .map_check_btf = map_check_no_btf, ++ }; ++ ++-static struct bpf_map_type_list perf_event_array_type __read_mostly = { ++- .ops = &perf_event_array_ops, ++- .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, ++-}; +++static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) +++{ +++ struct bpf_map *map, *inner_map_meta; +++ +++ inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); +++ if (IS_ERR(inner_map_meta)) +++ return inner_map_meta; +++ +++ map = array_map_alloc(attr); +++ if (IS_ERR(map)) { +++ bpf_map_meta_free(inner_map_meta); +++ return map; +++ } +++ +++ map->inner_map_meta = inner_map_meta; ++ ++-static int __init register_perf_event_array_map(void) +++ return map; +++} +++ +++static void array_of_map_free(struct bpf_map *map) ++ { ++- bpf_register_map_type(&perf_event_array_type); ++- return 0; +++ /* map->inner_map_meta is only accessed by syscall which +++ * is protected by fdget/fdput. +++ */ +++ bpf_map_meta_free(map->inner_map_meta); +++ bpf_fd_array_map_clear(map); +++ fd_array_map_free(map); ++ } ++-late_initcall(register_perf_event_array_map); +++ +++static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) +++{ +++ struct bpf_map **inner_map = array_map_lookup_elem(map, key); +++ +++ if (!inner_map) +++ return NULL; +++ +++ return READ_ONCE(*inner_map); +++} +++ +++static u32 array_of_map_gen_lookup(struct bpf_map *map, +++ struct bpf_insn *insn_buf) +++{ +++ struct bpf_array *array = container_of(map, struct bpf_array, map); +++ u32 elem_size = round_up(map->value_size, 8); +++ struct bpf_insn *insn = insn_buf; +++ const int ret = BPF_REG_0; +++ const int map_ptr = BPF_REG_1; +++ const int index = BPF_REG_2; +++ +++ *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); +++ *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); +++ if (map->unpriv_array) { +++ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); +++ *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); +++ } else { +++ *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); +++ } +++ if (is_power_of_2(elem_size)) +++ *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); +++ else +++ *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); +++ *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); +++ *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); +++ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); +++ *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); +++ *insn++ = BPF_MOV64_IMM(ret, 0); +++ +++ return insn - insn_buf; +++} +++ +++const struct bpf_map_ops array_of_maps_map_ops = { +++ .map_alloc_check = fd_array_map_alloc_check, +++ .map_alloc = array_of_map_alloc, +++ .map_free = array_of_map_free, +++ .map_get_next_key = array_map_get_next_key, +++ .map_lookup_elem = array_of_map_lookup_elem, +++ .map_delete_elem = fd_array_map_delete_elem, +++ .map_fd_get_ptr = bpf_map_fd_get_ptr, +++ .map_fd_put_ptr = bpf_map_fd_put_ptr, +++ .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, +++ .map_gen_lookup = array_of_map_gen_lookup, +++ .map_check_btf = map_check_no_btf, +++}; ++--- /dev/null +++++ b/kernel/bpf/bpf_lru_list.c ++@@ -0,0 +1,695 @@ +++// SPDX-License-Identifier: GPL-2.0-only +++/* Copyright (c) 2016 Facebook +++ */ +++#include +++#include +++#include 
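/*
 * A minimal user-space sketch (editor-added, not part of the backported
 * sources) of how the per-CPU array paths earlier in this hunk are
 * exercised through the bpf(2) syscall: bpf_percpu_array_update_elem()
 * consumes one round_up(value_size, 8) slot per possible CPU from the
 * user buffer, and lookup fills the same layout back in, which is why
 * the kernel code above copies with bpf_long_memcpy() over 8-byte slots.
 * The sys_bpf() wrapper, the fixed 256-slot buffer and the sysconf()
 * CPU-count approximation below are illustrative assumptions; libbpf
 * derives the possible-CPU count from /sys/devices/system/cpu/possible
 * instead.  Needs a kernel with BPF_MAP_TYPE_PERCPU_ARRAY and sufficient
 * privileges (CAP_SYS_ADMIN, or CAP_BPF on newer kernels).
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int main(void)
{
	long ncpus = sysconf(_SC_NPROCESSORS_CONF);	/* rough possible-CPU count */
	uint64_t vals[256];				/* one 8-byte slot per CPU */
	union bpf_attr attr;
	uint32_t key = 0;
	int map_fd, i;

	if (ncpus < 1 || ncpus > 256)
		return 1;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_PERCPU_ARRAY;
	attr.key_size = sizeof(uint32_t);
	attr.value_size = sizeof(uint64_t);	/* round_up(8, 8) == 8 bytes per CPU */
	attr.max_entries = 1;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return 1;

	for (i = 0; i < ncpus; i++)
		vals[i] = 42 + i;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (uintptr_t)&key;
	attr.value = (uintptr_t)vals;	/* kernel walks for_each_possible_cpu() */
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) ||
	    sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)))
		return 1;

	for (i = 0; i < ncpus; i++)
		printf("cpu%d: %llu\n", i, (unsigned long long)vals[i]);

	close(map_fd);
	return 0;
}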
+++ +++#include "bpf_lru_list.h" +++ +++#define LOCAL_FREE_TARGET (128) +++#define LOCAL_NR_SCANS LOCAL_FREE_TARGET +++ +++#define PERCPU_FREE_TARGET (4) +++#define PERCPU_NR_SCANS PERCPU_FREE_TARGET +++ +++/* Helpers to get the local list index */ +++#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET) +++#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE) +++#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING) +++#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET) +++ +++static int get_next_cpu(int cpu) +++{ +++ cpu = cpumask_next(cpu, cpu_possible_mask); +++ if (cpu >= nr_cpu_ids) +++ cpu = cpumask_first(cpu_possible_mask); +++ return cpu; +++} +++ +++/* Local list helpers */ +++static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l) +++{ +++ return &loc_l->lists[LOCAL_FREE_LIST_IDX]; +++} +++ +++static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l) +++{ +++ return &loc_l->lists[LOCAL_PENDING_LIST_IDX]; +++} +++ +++/* bpf_lru_node helpers */ +++static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node) +++{ +++ return node->ref; +++} +++ +++static void bpf_lru_list_count_inc(struct bpf_lru_list *l, +++ enum bpf_lru_list_type type) +++{ +++ if (type < NR_BPF_LRU_LIST_COUNT) +++ l->counts[type]++; +++} +++ +++static void bpf_lru_list_count_dec(struct bpf_lru_list *l, +++ enum bpf_lru_list_type type) +++{ +++ if (type < NR_BPF_LRU_LIST_COUNT) +++ l->counts[type]--; +++} +++ +++static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l, +++ struct bpf_lru_node *node, +++ struct list_head *free_list, +++ enum bpf_lru_list_type tgt_free_type) +++{ +++ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) +++ return; +++ +++ /* If the removing node is the next_inactive_rotation candidate, +++ * move the next_inactive_rotation pointer also. +++ */ +++ if (&node->list == l->next_inactive_rotation) +++ l->next_inactive_rotation = l->next_inactive_rotation->prev; +++ +++ bpf_lru_list_count_dec(l, node->type); +++ +++ node->type = tgt_free_type; +++ list_move(&node->list, free_list); +++} +++ +++/* Move nodes from local list to the LRU list */ +++static void __bpf_lru_node_move_in(struct bpf_lru_list *l, +++ struct bpf_lru_node *node, +++ enum bpf_lru_list_type tgt_type) +++{ +++ if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)) || +++ WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) +++ return; +++ +++ bpf_lru_list_count_inc(l, tgt_type); +++ node->type = tgt_type; +++ node->ref = 0; +++ list_move(&node->list, &l->lists[tgt_type]); +++} +++ +++/* Move nodes between or within active and inactive list (like +++ * active to inactive, inactive to active or tail of active back to +++ * the head of active). +++ */ +++static void __bpf_lru_node_move(struct bpf_lru_list *l, +++ struct bpf_lru_node *node, +++ enum bpf_lru_list_type tgt_type) +++{ +++ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) || +++ WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type))) +++ return; +++ +++ if (node->type != tgt_type) { +++ bpf_lru_list_count_dec(l, node->type); +++ bpf_lru_list_count_inc(l, tgt_type); +++ node->type = tgt_type; +++ } +++ node->ref = 0; +++ +++ /* If the moving node is the next_inactive_rotation candidate, +++ * move the next_inactive_rotation pointer also. 
+++ */ +++ if (&node->list == l->next_inactive_rotation) +++ l->next_inactive_rotation = l->next_inactive_rotation->prev; +++ +++ list_move(&node->list, &l->lists[tgt_type]); +++} +++ +++static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l) +++{ +++ return l->counts[BPF_LRU_LIST_T_INACTIVE] < +++ l->counts[BPF_LRU_LIST_T_ACTIVE]; +++} +++ +++/* Rotate the active list: +++ * 1. Start from tail +++ * 2. If the node has the ref bit set, it will be rotated +++ * back to the head of active list with the ref bit cleared. +++ * Give this node one more chance to survive in the active list. +++ * 3. If the ref bit is not set, move it to the head of the +++ * inactive list. +++ * 4. It will at most scan nr_scans nodes +++ */ +++static void __bpf_lru_list_rotate_active(struct bpf_lru *lru, +++ struct bpf_lru_list *l) +++{ +++ struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE]; +++ struct bpf_lru_node *node, *tmp_node, *first_node; +++ unsigned int i = 0; +++ +++ first_node = list_first_entry(active, struct bpf_lru_node, list); +++ list_for_each_entry_safe_reverse(node, tmp_node, active, list) { +++ if (bpf_lru_node_is_ref(node)) +++ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); +++ else +++ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); +++ +++ if (++i == lru->nr_scans || node == first_node) +++ break; +++ } +++} +++ +++/* Rotate the inactive list. It starts from the next_inactive_rotation +++ * 1. If the node has ref bit set, it will be moved to the head +++ * of active list with the ref bit cleared. +++ * 2. If the node does not have ref bit set, it will leave it +++ * at its current location (i.e. do nothing) so that it can +++ * be considered during the next inactive_shrink. +++ * 3. It will at most scan nr_scans nodes +++ */ +++static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru, +++ struct bpf_lru_list *l) +++{ +++ struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; +++ struct list_head *cur, *last, *next = inactive; +++ struct bpf_lru_node *node; +++ unsigned int i = 0; +++ +++ if (list_empty(inactive)) +++ return; +++ +++ last = l->next_inactive_rotation->next; +++ if (last == inactive) +++ last = last->next; +++ +++ cur = l->next_inactive_rotation; +++ while (i < lru->nr_scans) { +++ if (cur == inactive) { +++ cur = cur->prev; +++ continue; +++ } +++ +++ node = list_entry(cur, struct bpf_lru_node, list); +++ next = cur->prev; +++ if (bpf_lru_node_is_ref(node)) +++ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); +++ if (cur == last) +++ break; +++ cur = next; +++ i++; +++ } +++ +++ l->next_inactive_rotation = next; +++} +++ +++/* Shrink the inactive list. It starts from the tail of the +++ * inactive list and only move the nodes without the ref bit +++ * set to the designated free list. 
+++ */ +++static unsigned int +++__bpf_lru_list_shrink_inactive(struct bpf_lru *lru, +++ struct bpf_lru_list *l, +++ unsigned int tgt_nshrink, +++ struct list_head *free_list, +++ enum bpf_lru_list_type tgt_free_type) +++{ +++ struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE]; +++ struct bpf_lru_node *node, *tmp_node; +++ unsigned int nshrinked = 0; +++ unsigned int i = 0; +++ +++ list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) { +++ if (bpf_lru_node_is_ref(node)) { +++ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE); +++ } else if (lru->del_from_htab(lru->del_arg, node)) { +++ __bpf_lru_node_move_to_free(l, node, free_list, +++ tgt_free_type); +++ if (++nshrinked == tgt_nshrink) +++ break; +++ } +++ +++ if (++i == lru->nr_scans) +++ break; +++ } +++ +++ return nshrinked; +++} +++ +++/* 1. Rotate the active list (if needed) +++ * 2. Always rotate the inactive list +++ */ +++static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l) +++{ +++ if (bpf_lru_list_inactive_low(l)) +++ __bpf_lru_list_rotate_active(lru, l); +++ +++ __bpf_lru_list_rotate_inactive(lru, l); +++} +++ +++/* Calls __bpf_lru_list_shrink_inactive() to shrink some +++ * ref-bit-cleared nodes and move them to the designated +++ * free list. +++ * +++ * If it cannot get a free node after calling +++ * __bpf_lru_list_shrink_inactive(). It will just remove +++ * one node from either inactive or active list without +++ * honoring the ref-bit. It prefers inactive list to active +++ * list in this situation. +++ */ +++static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru, +++ struct bpf_lru_list *l, +++ unsigned int tgt_nshrink, +++ struct list_head *free_list, +++ enum bpf_lru_list_type tgt_free_type) +++ +++{ +++ struct bpf_lru_node *node, *tmp_node; +++ struct list_head *force_shrink_list; +++ unsigned int nshrinked; +++ +++ nshrinked = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink, +++ free_list, tgt_free_type); +++ if (nshrinked) +++ return nshrinked; +++ +++ /* Do a force shrink by ignoring the reference bit */ +++ if (!list_empty(&l->lists[BPF_LRU_LIST_T_INACTIVE])) +++ force_shrink_list = &l->lists[BPF_LRU_LIST_T_INACTIVE]; +++ else +++ force_shrink_list = &l->lists[BPF_LRU_LIST_T_ACTIVE]; +++ +++ list_for_each_entry_safe_reverse(node, tmp_node, force_shrink_list, +++ list) { +++ if (lru->del_from_htab(lru->del_arg, node)) { +++ __bpf_lru_node_move_to_free(l, node, free_list, +++ tgt_free_type); +++ return 1; +++ } +++ } +++ +++ return 0; +++} +++ +++/* Flush the nodes from the local pending list to the LRU list */ +++static void __local_list_flush(struct bpf_lru_list *l, +++ struct bpf_lru_locallist *loc_l) +++{ +++ struct bpf_lru_node *node, *tmp_node; +++ +++ list_for_each_entry_safe_reverse(node, tmp_node, +++ local_pending_list(loc_l), list) { +++ if (bpf_lru_node_is_ref(node)) +++ __bpf_lru_node_move_in(l, node, BPF_LRU_LIST_T_ACTIVE); +++ else +++ __bpf_lru_node_move_in(l, node, +++ BPF_LRU_LIST_T_INACTIVE); +++ } +++} +++ +++static void bpf_lru_list_push_free(struct bpf_lru_list *l, +++ struct bpf_lru_node *node) +++{ +++ unsigned long flags; +++ +++ if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type))) +++ return; +++ +++ raw_spin_lock_irqsave(&l->lock, flags); +++ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); +++ raw_spin_unlock_irqrestore(&l->lock, flags); +++} +++ +++static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru, +++ struct bpf_lru_locallist *loc_l) +++{ +++ struct bpf_lru_list *l = &lru->common_lru.lru_list; +++ struct 
bpf_lru_node *node, *tmp_node; +++ unsigned int nfree = 0; +++ +++ raw_spin_lock(&l->lock); +++ +++ __local_list_flush(l, loc_l); +++ +++ __bpf_lru_list_rotate(lru, l); +++ +++ list_for_each_entry_safe(node, tmp_node, &l->lists[BPF_LRU_LIST_T_FREE], +++ list) { +++ __bpf_lru_node_move_to_free(l, node, local_free_list(loc_l), +++ BPF_LRU_LOCAL_LIST_T_FREE); +++ if (++nfree == LOCAL_FREE_TARGET) +++ break; +++ } +++ +++ if (nfree < LOCAL_FREE_TARGET) +++ __bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - nfree, +++ local_free_list(loc_l), +++ BPF_LRU_LOCAL_LIST_T_FREE); +++ +++ raw_spin_unlock(&l->lock); +++} +++ +++static void __local_list_add_pending(struct bpf_lru *lru, +++ struct bpf_lru_locallist *loc_l, +++ int cpu, +++ struct bpf_lru_node *node, +++ u32 hash) +++{ +++ *(u32 *)((void *)node + lru->hash_offset) = hash; +++ node->cpu = cpu; +++ node->type = BPF_LRU_LOCAL_LIST_T_PENDING; +++ node->ref = 0; +++ list_add(&node->list, local_pending_list(loc_l)); +++} +++ +++static struct bpf_lru_node * +++__local_list_pop_free(struct bpf_lru_locallist *loc_l) +++{ +++ struct bpf_lru_node *node; +++ +++ node = list_first_entry_or_null(local_free_list(loc_l), +++ struct bpf_lru_node, +++ list); +++ if (node) +++ list_del(&node->list); +++ +++ return node; +++} +++ +++static struct bpf_lru_node * +++__local_list_pop_pending(struct bpf_lru *lru, struct bpf_lru_locallist *loc_l) +++{ +++ struct bpf_lru_node *node; +++ bool force = false; +++ +++ignore_ref: +++ /* Get from the tail (i.e. older element) of the pending list. */ +++ list_for_each_entry_reverse(node, local_pending_list(loc_l), +++ list) { +++ if ((!bpf_lru_node_is_ref(node) || force) && +++ lru->del_from_htab(lru->del_arg, node)) { +++ list_del(&node->list); +++ return node; +++ } +++ } +++ +++ if (!force) { +++ force = true; +++ goto ignore_ref; +++ } +++ +++ return NULL; +++} +++ +++static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru, +++ u32 hash) +++{ +++ struct list_head *free_list; +++ struct bpf_lru_node *node = NULL; +++ struct bpf_lru_list *l; +++ unsigned long flags; +++ int cpu = raw_smp_processor_id(); +++ +++ l = per_cpu_ptr(lru->percpu_lru, cpu); +++ +++ raw_spin_lock_irqsave(&l->lock, flags); +++ +++ __bpf_lru_list_rotate(lru, l); +++ +++ free_list = &l->lists[BPF_LRU_LIST_T_FREE]; +++ if (list_empty(free_list)) +++ __bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list, +++ BPF_LRU_LIST_T_FREE); +++ +++ if (!list_empty(free_list)) { +++ node = list_first_entry(free_list, struct bpf_lru_node, list); +++ *(u32 *)((void *)node + lru->hash_offset) = hash; +++ node->ref = 0; +++ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE); +++ } +++ +++ raw_spin_unlock_irqrestore(&l->lock, flags); +++ +++ return node; +++} +++ +++static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru, +++ u32 hash) +++{ +++ struct bpf_lru_locallist *loc_l, *steal_loc_l; +++ struct bpf_common_lru *clru = &lru->common_lru; +++ struct bpf_lru_node *node; +++ int steal, first_steal; +++ unsigned long flags; +++ int cpu = raw_smp_processor_id(); +++ +++ loc_l = per_cpu_ptr(clru->local_list, cpu); +++ +++ raw_spin_lock_irqsave(&loc_l->lock, flags); +++ +++ node = __local_list_pop_free(loc_l); +++ if (!node) { +++ bpf_lru_list_pop_free_to_local(lru, loc_l); +++ node = __local_list_pop_free(loc_l); +++ } +++ +++ if (node) +++ __local_list_add_pending(lru, loc_l, cpu, node, hash); +++ +++ raw_spin_unlock_irqrestore(&loc_l->lock, flags); +++ +++ if (node) +++ return node; +++ +++ /* No free nodes found from the local 
free list and +++ * the global LRU list. +++ * +++ * Steal from the local free/pending list of the +++ * current CPU and remote CPU in RR. It starts +++ * with the loc_l->next_steal CPU. +++ */ +++ +++ first_steal = loc_l->next_steal; +++ steal = first_steal; +++ do { +++ steal_loc_l = per_cpu_ptr(clru->local_list, steal); +++ +++ raw_spin_lock_irqsave(&steal_loc_l->lock, flags); +++ +++ node = __local_list_pop_free(steal_loc_l); +++ if (!node) +++ node = __local_list_pop_pending(lru, steal_loc_l); +++ +++ raw_spin_unlock_irqrestore(&steal_loc_l->lock, flags); +++ +++ steal = get_next_cpu(steal); +++ } while (!node && steal != first_steal); +++ +++ loc_l->next_steal = steal; +++ +++ if (node) { +++ raw_spin_lock_irqsave(&loc_l->lock, flags); +++ __local_list_add_pending(lru, loc_l, cpu, node, hash); +++ raw_spin_unlock_irqrestore(&loc_l->lock, flags); +++ } +++ +++ return node; +++} +++ +++struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) +++{ +++ if (lru->percpu) +++ return bpf_percpu_lru_pop_free(lru, hash); +++ else +++ return bpf_common_lru_pop_free(lru, hash); +++} +++ +++static void bpf_common_lru_push_free(struct bpf_lru *lru, +++ struct bpf_lru_node *node) +++{ +++ u8 node_type = READ_ONCE(node->type); +++ unsigned long flags; +++ +++ if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) || +++ WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE)) +++ return; +++ +++ if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) { +++ struct bpf_lru_locallist *loc_l; +++ +++ loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); +++ +++ raw_spin_lock_irqsave(&loc_l->lock, flags); +++ +++ if (unlikely(node->type != BPF_LRU_LOCAL_LIST_T_PENDING)) { +++ raw_spin_unlock_irqrestore(&loc_l->lock, flags); +++ goto check_lru_list; +++ } +++ +++ node->type = BPF_LRU_LOCAL_LIST_T_FREE; +++ node->ref = 0; +++ list_move(&node->list, local_free_list(loc_l)); +++ +++ raw_spin_unlock_irqrestore(&loc_l->lock, flags); +++ return; +++ } +++ +++check_lru_list: +++ bpf_lru_list_push_free(&lru->common_lru.lru_list, node); +++} +++ +++static void bpf_percpu_lru_push_free(struct bpf_lru *lru, +++ struct bpf_lru_node *node) +++{ +++ struct bpf_lru_list *l; +++ unsigned long flags; +++ +++ l = per_cpu_ptr(lru->percpu_lru, node->cpu); +++ +++ raw_spin_lock_irqsave(&l->lock, flags); +++ +++ __bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE); +++ +++ raw_spin_unlock_irqrestore(&l->lock, flags); +++} +++ +++void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) +++{ +++ if (lru->percpu) +++ bpf_percpu_lru_push_free(lru, node); +++ else +++ bpf_common_lru_push_free(lru, node); +++} +++ +++static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf, +++ u32 node_offset, u32 elem_size, +++ u32 nr_elems) +++{ +++ struct bpf_lru_list *l = &lru->common_lru.lru_list; +++ u32 i; +++ +++ for (i = 0; i < nr_elems; i++) { +++ struct bpf_lru_node *node; +++ +++ node = (struct bpf_lru_node *)(buf + node_offset); +++ node->type = BPF_LRU_LIST_T_FREE; +++ node->ref = 0; +++ list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); +++ buf += elem_size; +++ } +++} +++ +++static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf, +++ u32 node_offset, u32 elem_size, +++ u32 nr_elems) +++{ +++ u32 i, pcpu_entries; +++ int cpu; +++ struct bpf_lru_list *l; +++ +++ pcpu_entries = nr_elems / num_possible_cpus(); +++ +++ i = 0; +++ +++ for_each_possible_cpu(cpu) { +++ struct bpf_lru_node *node; +++ +++ l = per_cpu_ptr(lru->percpu_lru, cpu); +++again: +++ node = (struct bpf_lru_node *)(buf + 
node_offset); +++ node->cpu = cpu; +++ node->type = BPF_LRU_LIST_T_FREE; +++ node->ref = 0; +++ list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]); +++ i++; +++ buf += elem_size; +++ if (i == nr_elems) +++ break; +++ if (i % pcpu_entries) +++ goto again; +++ } +++} +++ +++void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, +++ u32 elem_size, u32 nr_elems) +++{ +++ if (lru->percpu) +++ bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, +++ nr_elems); +++ else +++ bpf_common_lru_populate(lru, buf, node_offset, elem_size, +++ nr_elems); +++} +++ +++static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu) +++{ +++ int i; +++ +++ for (i = 0; i < NR_BPF_LRU_LOCAL_LIST_T; i++) +++ INIT_LIST_HEAD(&loc_l->lists[i]); +++ +++ loc_l->next_steal = cpu; +++ +++ raw_spin_lock_init(&loc_l->lock); +++} +++ +++static void bpf_lru_list_init(struct bpf_lru_list *l) +++{ +++ int i; +++ +++ for (i = 0; i < NR_BPF_LRU_LIST_T; i++) +++ INIT_LIST_HEAD(&l->lists[i]); +++ +++ for (i = 0; i < NR_BPF_LRU_LIST_COUNT; i++) +++ l->counts[i] = 0; +++ +++ l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE]; +++ +++ raw_spin_lock_init(&l->lock); +++} +++ +++int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, +++ del_from_htab_func del_from_htab, void *del_arg) +++{ +++ int cpu; +++ +++ if (percpu) { +++ lru->percpu_lru = alloc_percpu(struct bpf_lru_list); +++ if (!lru->percpu_lru) +++ return -ENOMEM; +++ +++ for_each_possible_cpu(cpu) { +++ struct bpf_lru_list *l; +++ +++ l = per_cpu_ptr(lru->percpu_lru, cpu); +++ bpf_lru_list_init(l); +++ } +++ lru->nr_scans = PERCPU_NR_SCANS; +++ } else { +++ struct bpf_common_lru *clru = &lru->common_lru; +++ +++ clru->local_list = alloc_percpu(struct bpf_lru_locallist); +++ if (!clru->local_list) +++ return -ENOMEM; +++ +++ for_each_possible_cpu(cpu) { +++ struct bpf_lru_locallist *loc_l; +++ +++ loc_l = per_cpu_ptr(clru->local_list, cpu); +++ bpf_lru_locallist_init(loc_l, cpu); +++ } +++ +++ bpf_lru_list_init(&clru->lru_list); +++ lru->nr_scans = LOCAL_NR_SCANS; +++ } +++ +++ lru->percpu = percpu; +++ lru->del_from_htab = del_from_htab; +++ lru->del_arg = del_arg; +++ lru->hash_offset = hash_offset; +++ +++ return 0; +++} +++ +++void bpf_lru_destroy(struct bpf_lru *lru) +++{ +++ if (lru->percpu) +++ free_percpu(lru->percpu_lru); +++ else +++ free_percpu(lru->common_lru.local_list); +++} ++--- /dev/null +++++ b/kernel/bpf/bpf_lru_list.h ++@@ -0,0 +1,82 @@ +++/* SPDX-License-Identifier: GPL-2.0-only */ +++/* Copyright (c) 2016 Facebook +++ */ +++#ifndef __BPF_LRU_LIST_H_ +++#define __BPF_LRU_LIST_H_ +++ +++#include +++#include +++ +++#define NR_BPF_LRU_LIST_T (3) +++#define NR_BPF_LRU_LIST_COUNT (2) +++#define NR_BPF_LRU_LOCAL_LIST_T (2) +++#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T +++ +++enum bpf_lru_list_type { +++ BPF_LRU_LIST_T_ACTIVE, +++ BPF_LRU_LIST_T_INACTIVE, +++ BPF_LRU_LIST_T_FREE, +++ BPF_LRU_LOCAL_LIST_T_FREE, +++ BPF_LRU_LOCAL_LIST_T_PENDING, +++}; +++ +++struct bpf_lru_node { +++ struct list_head list; +++ u16 cpu; +++ u8 type; +++ u8 ref; +++}; +++ +++struct bpf_lru_list { +++ struct list_head lists[NR_BPF_LRU_LIST_T]; +++ unsigned int counts[NR_BPF_LRU_LIST_COUNT]; +++ /* The next inacitve list rotation starts from here */ +++ struct list_head *next_inactive_rotation; +++ +++ raw_spinlock_t lock ____cacheline_aligned_in_smp; +++}; +++ +++struct bpf_lru_locallist { +++ struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T]; +++ u16 next_steal; +++ raw_spinlock_t lock; +++}; +++ +++struct 
bpf_common_lru { +++ struct bpf_lru_list lru_list; +++ struct bpf_lru_locallist __percpu *local_list; +++}; +++ +++typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node); +++ +++struct bpf_lru { +++ union { +++ struct bpf_common_lru common_lru; +++ struct bpf_lru_list __percpu *percpu_lru; +++ }; +++ del_from_htab_func del_from_htab; +++ void *del_arg; +++ unsigned int hash_offset; +++ unsigned int nr_scans; +++ bool percpu; +++}; +++ +++static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node) +++{ +++ /* ref is an approximation on access frequency. It does not +++ * have to be very accurate. Hence, no protection is used. +++ */ +++ if (!node->ref) +++ node->ref = 1; +++} +++ +++int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset, +++ del_from_htab_func del_from_htab, void *delete_arg); +++void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset, +++ u32 elem_size, u32 nr_elems); +++void bpf_lru_destroy(struct bpf_lru *lru); +++struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash); +++void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node); +++void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node); +++ +++#endif ++--- /dev/null +++++ b/kernel/bpf/btf.c ++@@ -0,0 +1,3514 @@ +++/* SPDX-License-Identifier: GPL-2.0 */ +++/* Copyright (c) 2018 Facebook */ +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++ +++/* BTF (BPF Type Format) is the meta data format which describes +++ * the data types of BPF program/map. Hence, it basically focus +++ * on the C programming language which the modern BPF is primary +++ * using. +++ * +++ * ELF Section: +++ * ~~~~~~~~~~~ +++ * The BTF data is stored under the ".BTF" ELF section +++ * +++ * struct btf_type: +++ * ~~~~~~~~~~~~~~~ +++ * Each 'struct btf_type' object describes a C data type. +++ * Depending on the type it is describing, a 'struct btf_type' +++ * object may be followed by more data. F.e. +++ * To describe an array, 'struct btf_type' is followed by +++ * 'struct btf_array'. +++ * +++ * 'struct btf_type' and any extra data following it are +++ * 4 bytes aligned. +++ * +++ * Type section: +++ * ~~~~~~~~~~~~~ +++ * The BTF type section contains a list of 'struct btf_type' objects. +++ * Each one describes a C type. Recall from the above section +++ * that a 'struct btf_type' object could be immediately followed by extra +++ * data in order to desribe some particular C types. +++ * +++ * type_id: +++ * ~~~~~~~ +++ * Each btf_type object is identified by a type_id. The type_id +++ * is implicitly implied by the location of the btf_type object in +++ * the BTF type section. The first one has type_id 1. The second +++ * one has type_id 2...etc. Hence, an earlier btf_type has +++ * a smaller type_id. +++ * +++ * A btf_type object may refer to another btf_type object by using +++ * type_id (i.e. the "type" in the "struct btf_type"). +++ * +++ * NOTE that we cannot assume any reference-order. +++ * A btf_type object can refer to an earlier btf_type object +++ * but it can also refer to a later btf_type object. +++ * +++ * For example, to describe "const void *". A btf_type +++ * object describing "const" may refer to another btf_type +++ * object describing "void *". 
This type-reference is done +++ * by specifying type_id: +++ * +++ * [1] CONST (anon) type_id=2 +++ * [2] PTR (anon) type_id=0 +++ * +++ * The above is the btf_verifier debug log: +++ * - Each line started with "[?]" is a btf_type object +++ * - [?] is the type_id of the btf_type object. +++ * - CONST/PTR is the BTF_KIND_XXX +++ * - "(anon)" is the name of the type. It just +++ * happens that CONST and PTR has no name. +++ * - type_id=XXX is the 'u32 type' in btf_type +++ * +++ * NOTE: "void" has type_id 0 +++ * +++ * String section: +++ * ~~~~~~~~~~~~~~ +++ * The BTF string section contains the names used by the type section. +++ * Each string is referred by an "offset" from the beginning of the +++ * string section. +++ * +++ * Each string is '\0' terminated. +++ * +++ * The first character in the string section must be '\0' +++ * which is used to mean 'anonymous'. Some btf_type may not +++ * have a name. +++ */ +++ +++/* BTF verification: +++ * +++ * To verify BTF data, two passes are needed. +++ * +++ * Pass #1 +++ * ~~~~~~~ +++ * The first pass is to collect all btf_type objects to +++ * an array: "btf->types". +++ * +++ * Depending on the C type that a btf_type is describing, +++ * a btf_type may be followed by extra data. We don't know +++ * how many btf_type is there, and more importantly we don't +++ * know where each btf_type is located in the type section. +++ * +++ * Without knowing the location of each type_id, most verifications +++ * cannot be done. e.g. an earlier btf_type may refer to a later +++ * btf_type (recall the "const void *" above), so we cannot +++ * check this type-reference in the first pass. +++ * +++ * In the first pass, it still does some verifications (e.g. +++ * checking the name is a valid offset to the string section). +++ * +++ * Pass #2 +++ * ~~~~~~~ +++ * The main focus is to resolve a btf_type that is referring +++ * to another type. +++ * +++ * We have to ensure the referring type: +++ * 1) does exist in the BTF (i.e. in btf->types[]) +++ * 2) does not cause a loop: +++ * struct A { +++ * struct B b; +++ * }; +++ * +++ * struct B { +++ * struct A a; +++ * }; +++ * +++ * btf_type_needs_resolve() decides if a btf_type needs +++ * to be resolved. +++ * +++ * The needs_resolve type implements the "resolve()" ops which +++ * essentially does a DFS and detects backedge. +++ * +++ * During resolve (or DFS), different C types have different +++ * "RESOLVED" conditions. +++ * +++ * When resolving a BTF_KIND_STRUCT, we need to resolve all its +++ * members because a member is always referring to another +++ * type. A struct's member can be treated as "RESOLVED" if +++ * it is referring to a BTF_KIND_PTR. Otherwise, the +++ * following valid C struct would be rejected: +++ * +++ * struct A { +++ * int m; +++ * struct A *a; +++ * }; +++ * +++ * When resolving a BTF_KIND_PTR, it needs to keep resolving if +++ * it is referring to another BTF_KIND_PTR. 
Otherwise, we cannot +++ * detect a pointer loop, e.g.: +++ * BTF_KIND_CONST -> BTF_KIND_PTR -> BTF_KIND_CONST -> BTF_KIND_PTR + +++ * ^ | +++ * +-----------------------------------------+ +++ * +++ */ +++ +++#define BITS_PER_U128 (sizeof(u64) * BITS_PER_BYTE * 2) +++#define BITS_PER_BYTE_MASK (BITS_PER_BYTE - 1) +++#define BITS_PER_BYTE_MASKED(bits) ((bits) & BITS_PER_BYTE_MASK) +++#define BITS_ROUNDDOWN_BYTES(bits) ((bits) >> 3) +++#define BITS_ROUNDUP_BYTES(bits) \ +++ (BITS_ROUNDDOWN_BYTES(bits) + !!BITS_PER_BYTE_MASKED(bits)) +++ +++#define BTF_INFO_MASK 0x8f00ffff +++#define BTF_INT_MASK 0x0fffffff +++#define BTF_TYPE_ID_VALID(type_id) ((type_id) <= BTF_MAX_TYPE) +++#define BTF_STR_OFFSET_VALID(name_off) ((name_off) <= BTF_MAX_NAME_OFFSET) +++ +++/* 16MB for 64k structs and each has 16 members and +++ * a few MB spaces for the string section. +++ * The hard limit is S32_MAX. +++ */ +++#define BTF_MAX_SIZE (16 * 1024 * 1024) +++ +++#define for_each_member(i, struct_type, member) \ +++ for (i = 0, member = btf_type_member(struct_type); \ +++ i < btf_type_vlen(struct_type); \ +++ i++, member++) +++ +++#define for_each_member_from(i, from, struct_type, member) \ +++ for (i = from, member = btf_type_member(struct_type) + from; \ +++ i < btf_type_vlen(struct_type); \ +++ i++, member++) +++ +++#define for_each_vsi(i, struct_type, member) \ +++ for (i = 0, member = btf_type_var_secinfo(struct_type); \ +++ i < btf_type_vlen(struct_type); \ +++ i++, member++) +++ +++#define for_each_vsi_from(i, from, struct_type, member) \ +++ for (i = from, member = btf_type_var_secinfo(struct_type) + from; \ +++ i < btf_type_vlen(struct_type); \ +++ i++, member++) +++ +++DEFINE_IDR(btf_idr); +++DEFINE_SPINLOCK(btf_idr_lock); +++ +++struct btf { +++ void *data; +++ struct btf_type **types; +++ u32 *resolved_ids; +++ u32 *resolved_sizes; +++ const char *strings; +++ void *nohdr_data; +++ struct btf_header hdr; +++ u32 nr_types; +++ u32 types_size; +++ u32 data_size; +++ refcount_t refcnt; +++ u32 id; +++ struct rcu_head rcu; +++}; +++ +++enum verifier_phase { +++ CHECK_META, +++ CHECK_TYPE, +++}; +++ +++struct resolve_vertex { +++ const struct btf_type *t; +++ u32 type_id; +++ u16 next_member; +++}; +++ +++enum visit_state { +++ NOT_VISITED, +++ VISITED, +++ RESOLVED, +++}; +++ +++enum resolve_mode { +++ RESOLVE_TBD, /* To Be Determined */ +++ RESOLVE_PTR, /* Resolving for Pointer */ +++ RESOLVE_STRUCT_OR_ARRAY, /* Resolving for struct/union +++ * or array +++ */ +++}; +++ +++#define MAX_RESOLVE_DEPTH 32 +++ +++struct btf_sec_info { +++ u32 off; +++ u32 len; +++}; +++ +++struct btf_verifier_env { +++ struct btf *btf; +++ u8 *visit_states; +++ struct resolve_vertex stack[MAX_RESOLVE_DEPTH]; +++ struct bpf_verifier_log log; +++ u32 log_type_id; +++ u32 top_stack; +++ enum verifier_phase phase; +++ enum resolve_mode resolve_mode; +++}; +++ +++static const char * const btf_kind_str[NR_BTF_KINDS] = { +++ [BTF_KIND_UNKN] = "UNKNOWN", +++ [BTF_KIND_INT] = "INT", +++ [BTF_KIND_PTR] = "PTR", +++ [BTF_KIND_ARRAY] = "ARRAY", +++ [BTF_KIND_STRUCT] = "STRUCT", +++ [BTF_KIND_UNION] = "UNION", +++ [BTF_KIND_ENUM] = "ENUM", +++ [BTF_KIND_FWD] = "FWD", +++ [BTF_KIND_TYPEDEF] = "TYPEDEF", +++ [BTF_KIND_VOLATILE] = "VOLATILE", +++ [BTF_KIND_CONST] = "CONST", +++ [BTF_KIND_RESTRICT] = "RESTRICT", +++ [BTF_KIND_FUNC] = "FUNC", +++ [BTF_KIND_FUNC_PROTO] = "FUNC_PROTO", +++ [BTF_KIND_VAR] = "VAR", +++ [BTF_KIND_DATASEC] = "DATASEC", +++}; +++ +++struct btf_kind_operations { +++ s32 (*check_meta)(struct btf_verifier_env *env, +++ 
const struct btf_type *t, +++ u32 meta_left); +++ int (*resolve)(struct btf_verifier_env *env, +++ const struct resolve_vertex *v); +++ int (*check_member)(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type); +++ int (*check_kflag_member)(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type); +++ void (*log_details)(struct btf_verifier_env *env, +++ const struct btf_type *t); +++ void (*seq_show)(const struct btf *btf, const struct btf_type *t, +++ u32 type_id, void *data, u8 bits_offsets, +++ struct seq_file *m); +++}; +++ +++static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS]; +++static struct btf_type btf_void; +++ +++static int btf_resolve(struct btf_verifier_env *env, +++ const struct btf_type *t, u32 type_id); +++ +++static bool btf_type_is_modifier(const struct btf_type *t) +++{ +++ /* Some of them is not strictly a C modifier +++ * but they are grouped into the same bucket +++ * for BTF concern: +++ * A type (t) that refers to another +++ * type through t->type AND its size cannot +++ * be determined without following the t->type. +++ * +++ * ptr does not fall into this bucket +++ * because its size is always sizeof(void *). +++ */ +++ switch (BTF_INFO_KIND(t->info)) { +++ case BTF_KIND_TYPEDEF: +++ case BTF_KIND_VOLATILE: +++ case BTF_KIND_CONST: +++ case BTF_KIND_RESTRICT: +++ return true; +++ } +++ +++ return false; +++} +++ +++bool btf_type_is_void(const struct btf_type *t) +++{ +++ return t == &btf_void; +++} +++ +++static bool btf_type_is_fwd(const struct btf_type *t) +++{ +++ return BTF_INFO_KIND(t->info) == BTF_KIND_FWD; +++} +++ +++static bool btf_type_is_func(const struct btf_type *t) +++{ +++ return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC; +++} +++ +++static bool btf_type_is_func_proto(const struct btf_type *t) +++{ +++ return BTF_INFO_KIND(t->info) == BTF_KIND_FUNC_PROTO; +++} +++ +++static bool btf_type_nosize(const struct btf_type *t) +++{ +++ return btf_type_is_void(t) || btf_type_is_fwd(t) || +++ btf_type_is_func(t) || btf_type_is_func_proto(t); +++} +++ +++static bool btf_type_nosize_or_null(const struct btf_type *t) +++{ +++ return !t || btf_type_nosize(t); +++} +++ +++/* union is only a special case of struct: +++ * all its offsetof(member) == 0 +++ */ +++static bool btf_type_is_struct(const struct btf_type *t) +++{ +++ u8 kind = BTF_INFO_KIND(t->info); +++ +++ return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; +++} +++ +++static bool __btf_type_is_struct(const struct btf_type *t) +++{ +++ return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; +++} +++ +++static bool btf_type_is_array(const struct btf_type *t) +++{ +++ return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; +++} +++ +++static bool btf_type_is_ptr(const struct btf_type *t) +++{ +++ return BTF_INFO_KIND(t->info) == BTF_KIND_PTR; +++} +++ +++static bool btf_type_is_int(const struct btf_type *t) +++{ +++ return BTF_INFO_KIND(t->info) == BTF_KIND_INT; +++} +++ +++static bool btf_type_is_var(const struct btf_type *t) +++{ +++ return BTF_INFO_KIND(t->info) == BTF_KIND_VAR; +++} +++ +++static bool btf_type_is_datasec(const struct btf_type *t) +++{ +++ return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; +++} +++ +++/* Types that act only as a source, not sink or intermediate +++ * type when resolving. 
+++ */ +++static bool btf_type_is_resolve_source_only(const struct btf_type *t) +++{ +++ return btf_type_is_var(t) || +++ btf_type_is_datasec(t); +++} +++ +++/* What types need to be resolved? +++ * +++ * btf_type_is_modifier() is an obvious one. +++ * +++ * btf_type_is_struct() because its member refers to +++ * another type (through member->type). +++ * +++ * btf_type_is_var() because the variable refers to +++ * another type. btf_type_is_datasec() holds multiple +++ * btf_type_is_var() types that need resolving. +++ * +++ * btf_type_is_array() because its element (array->type) +++ * refers to another type. Array can be thought of a +++ * special case of struct while array just has the same +++ * member-type repeated by array->nelems of times. +++ */ +++static bool btf_type_needs_resolve(const struct btf_type *t) +++{ +++ return btf_type_is_modifier(t) || +++ btf_type_is_ptr(t) || +++ btf_type_is_struct(t) || +++ btf_type_is_array(t) || +++ btf_type_is_var(t) || +++ btf_type_is_datasec(t); +++} +++ +++/* t->size can be used */ +++static bool btf_type_has_size(const struct btf_type *t) +++{ +++ switch (BTF_INFO_KIND(t->info)) { +++ case BTF_KIND_INT: +++ case BTF_KIND_STRUCT: +++ case BTF_KIND_UNION: +++ case BTF_KIND_ENUM: +++ case BTF_KIND_DATASEC: +++ return true; +++ } +++ +++ return false; +++} +++ +++static const char *btf_int_encoding_str(u8 encoding) +++{ +++ if (encoding == 0) +++ return "(none)"; +++ else if (encoding == BTF_INT_SIGNED) +++ return "SIGNED"; +++ else if (encoding == BTF_INT_CHAR) +++ return "CHAR"; +++ else if (encoding == BTF_INT_BOOL) +++ return "BOOL"; +++ else +++ return "UNKN"; +++} +++ +++static u16 btf_type_vlen(const struct btf_type *t) +++{ +++ return BTF_INFO_VLEN(t->info); +++} +++ +++static bool btf_type_kflag(const struct btf_type *t) +++{ +++ return BTF_INFO_KFLAG(t->info); +++} +++ +++static u32 btf_member_bit_offset(const struct btf_type *struct_type, +++ const struct btf_member *member) +++{ +++ return btf_type_kflag(struct_type) ? BTF_MEMBER_BIT_OFFSET(member->offset) +++ : member->offset; +++} +++ +++static u32 btf_member_bitfield_size(const struct btf_type *struct_type, +++ const struct btf_member *member) +++{ +++ return btf_type_kflag(struct_type) ? BTF_MEMBER_BITFIELD_SIZE(member->offset) +++ : 0; +++} +++ +++static u32 btf_type_int(const struct btf_type *t) +++{ +++ return *(u32 *)(t + 1); +++} +++ +++static const struct btf_array *btf_type_array(const struct btf_type *t) +++{ +++ return (const struct btf_array *)(t + 1); +++} +++ +++static const struct btf_member *btf_type_member(const struct btf_type *t) +++{ +++ return (const struct btf_member *)(t + 1); +++} +++ +++static const struct btf_enum *btf_type_enum(const struct btf_type *t) +++{ +++ return (const struct btf_enum *)(t + 1); +++} +++ +++static const struct btf_var *btf_type_var(const struct btf_type *t) +++{ +++ return (const struct btf_var *)(t + 1); +++} +++ +++static const struct btf_var_secinfo *btf_type_var_secinfo(const struct btf_type *t) +++{ +++ return (const struct btf_var_secinfo *)(t + 1); +++} +++ +++static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) +++{ +++ return kind_ops[BTF_INFO_KIND(t->info)]; +++} +++ +++static bool btf_name_offset_valid(const struct btf *btf, u32 offset) +++{ +++ return BTF_STR_OFFSET_VALID(offset) && +++ offset < btf->hdr.str_len; +++} +++ +++static bool __btf_name_char_ok(char c, bool first, bool dot_ok) +++{ +++ if ((first ? !isalpha(c) : +++ !isalnum(c)) && +++ c != '_' && +++ ((c == '.' 
&& !dot_ok) || +++ c != '.')) +++ return false; +++ return true; +++} +++ +++static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) +++{ +++ /* offset must be valid */ +++ const char *src = &btf->strings[offset]; +++ const char *src_limit; +++ +++ if (!__btf_name_char_ok(*src, true, dot_ok)) +++ return false; +++ +++ /* set a limit on identifier length */ +++ src_limit = src + KSYM_NAME_LEN; +++ src++; +++ while (*src && src < src_limit) { +++ if (!__btf_name_char_ok(*src, false, dot_ok)) +++ return false; +++ src++; +++ } +++ +++ return !*src; +++} +++ +++/* Only C-style identifier is permitted. This can be relaxed if +++ * necessary. +++ */ +++static bool btf_name_valid_identifier(const struct btf *btf, u32 offset) +++{ +++ return __btf_name_valid(btf, offset, false); +++} +++ +++static bool btf_name_valid_section(const struct btf *btf, u32 offset) +++{ +++ return __btf_name_valid(btf, offset, true); +++} +++ +++static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) +++{ +++ if (!offset) +++ return "(anon)"; +++ else if (offset < btf->hdr.str_len) +++ return &btf->strings[offset]; +++ else +++ return "(invalid-name-offset)"; +++} +++ +++const char *btf_name_by_offset(const struct btf *btf, u32 offset) +++{ +++ if (offset < btf->hdr.str_len) +++ return &btf->strings[offset]; +++ +++ return NULL; +++} +++ +++const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) +++{ +++ if (type_id > btf->nr_types) +++ return NULL; +++ +++ return btf->types[type_id]; +++} +++ +++/* +++ * Regular int is not a bit field and it must be either +++ * u8/u16/u32/u64 or __int128. +++ */ +++static bool btf_type_int_is_regular(const struct btf_type *t) +++{ +++ u8 nr_bits, nr_bytes; +++ u32 int_data; +++ +++ int_data = btf_type_int(t); +++ nr_bits = BTF_INT_BITS(int_data); +++ nr_bytes = BITS_ROUNDUP_BYTES(nr_bits); +++ if (BITS_PER_BYTE_MASKED(nr_bits) || +++ BTF_INT_OFFSET(int_data) || +++ (nr_bytes != sizeof(u8) && nr_bytes != sizeof(u16) && +++ nr_bytes != sizeof(u32) && nr_bytes != sizeof(u64) && +++ nr_bytes != (2 * sizeof(u64)))) { +++ return false; +++ } +++ +++ return true; +++} +++ +++/* +++ * Check that given struct member is a regular int with expected +++ * offset and size. +++ */ +++bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, +++ const struct btf_member *m, +++ u32 expected_offset, u32 expected_size) +++{ +++ const struct btf_type *t; +++ u32 id, int_data; +++ u8 nr_bits; +++ +++ id = m->type; +++ t = btf_type_id_size(btf, &id, NULL); +++ if (!t || !btf_type_is_int(t)) +++ return false; +++ +++ int_data = btf_type_int(t); +++ nr_bits = BTF_INT_BITS(int_data); +++ if (btf_type_kflag(s)) { +++ u32 bitfield_size = BTF_MEMBER_BITFIELD_SIZE(m->offset); +++ u32 bit_offset = BTF_MEMBER_BIT_OFFSET(m->offset); +++ +++ /* if kflag set, int should be a regular int and +++ * bit offset should be at byte boundary. +++ */ +++ return !bitfield_size && +++ BITS_ROUNDUP_BYTES(bit_offset) == expected_offset && +++ BITS_ROUNDUP_BYTES(nr_bits) == expected_size; +++ } +++ +++ if (BTF_INT_OFFSET(int_data) || +++ BITS_PER_BYTE_MASKED(m->offset) || +++ BITS_ROUNDUP_BYTES(m->offset) != expected_offset || +++ BITS_PER_BYTE_MASKED(nr_bits) || +++ BITS_ROUNDUP_BYTES(nr_bits) != expected_size) +++ return false; +++ +++ return true; +++} +++ +++__printf(2, 3) static void __btf_verifier_log(struct bpf_verifier_log *log, +++ const char *fmt, ...) 
+++{ +++ va_list args; +++ +++ va_start(args, fmt); +++ bpf_verifier_vlog(log, fmt, args); +++ va_end(args); +++} +++ +++__printf(2, 3) static void btf_verifier_log(struct btf_verifier_env *env, +++ const char *fmt, ...) +++{ +++ struct bpf_verifier_log *log = &env->log; +++ va_list args; +++ +++ if (!bpf_verifier_log_needed(log)) +++ return; +++ +++ va_start(args, fmt); +++ bpf_verifier_vlog(log, fmt, args); +++ va_end(args); +++} +++ +++__printf(4, 5) static void __btf_verifier_log_type(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ bool log_details, +++ const char *fmt, ...) +++{ +++ struct bpf_verifier_log *log = &env->log; +++ u8 kind = BTF_INFO_KIND(t->info); +++ struct btf *btf = env->btf; +++ va_list args; +++ +++ if (!bpf_verifier_log_needed(log)) +++ return; +++ +++ __btf_verifier_log(log, "[%u] %s %s%s", +++ env->log_type_id, +++ btf_kind_str[kind], +++ __btf_name_by_offset(btf, t->name_off), +++ log_details ? " " : ""); +++ +++ if (log_details) +++ btf_type_ops(t)->log_details(env, t); +++ +++ if (fmt && *fmt) { +++ __btf_verifier_log(log, " "); +++ va_start(args, fmt); +++ bpf_verifier_vlog(log, fmt, args); +++ va_end(args); +++ } +++ +++ __btf_verifier_log(log, "\n"); +++} +++ +++#define btf_verifier_log_type(env, t, ...) \ +++ __btf_verifier_log_type((env), (t), true, __VA_ARGS__) +++#define btf_verifier_log_basic(env, t, ...) \ +++ __btf_verifier_log_type((env), (t), false, __VA_ARGS__) +++ +++__printf(4, 5) +++static void btf_verifier_log_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const char *fmt, ...) +++{ +++ struct bpf_verifier_log *log = &env->log; +++ struct btf *btf = env->btf; +++ va_list args; +++ +++ if (!bpf_verifier_log_needed(log)) +++ return; +++ +++ /* The CHECK_META phase already did a btf dump. +++ * +++ * If member is logged again, it must hit an error in +++ * parsing this member. It is useful to print out which +++ * struct this member belongs to. +++ */ +++ if (env->phase != CHECK_META) +++ btf_verifier_log_type(env, struct_type, NULL); +++ +++ if (btf_type_kflag(struct_type)) +++ __btf_verifier_log(log, +++ "\t%s type_id=%u bitfield_size=%u bits_offset=%u", +++ __btf_name_by_offset(btf, member->name_off), +++ member->type, +++ BTF_MEMBER_BITFIELD_SIZE(member->offset), +++ BTF_MEMBER_BIT_OFFSET(member->offset)); +++ else +++ __btf_verifier_log(log, "\t%s type_id=%u bits_offset=%u", +++ __btf_name_by_offset(btf, member->name_off), +++ member->type, member->offset); +++ +++ if (fmt && *fmt) { +++ __btf_verifier_log(log, " "); +++ va_start(args, fmt); +++ bpf_verifier_vlog(log, fmt, args); +++ va_end(args); +++ } +++ +++ __btf_verifier_log(log, "\n"); +++} +++ +++__printf(4, 5) +++static void btf_verifier_log_vsi(struct btf_verifier_env *env, +++ const struct btf_type *datasec_type, +++ const struct btf_var_secinfo *vsi, +++ const char *fmt, ...) 
+++{ +++ struct bpf_verifier_log *log = &env->log; +++ va_list args; +++ +++ if (!bpf_verifier_log_needed(log)) +++ return; +++ if (env->phase != CHECK_META) +++ btf_verifier_log_type(env, datasec_type, NULL); +++ +++ __btf_verifier_log(log, "\t type_id=%u offset=%u size=%u", +++ vsi->type, vsi->offset, vsi->size); +++ if (fmt && *fmt) { +++ __btf_verifier_log(log, " "); +++ va_start(args, fmt); +++ bpf_verifier_vlog(log, fmt, args); +++ va_end(args); +++ } +++ +++ __btf_verifier_log(log, "\n"); +++} +++ +++static void btf_verifier_log_hdr(struct btf_verifier_env *env, +++ u32 btf_data_size) +++{ +++ struct bpf_verifier_log *log = &env->log; +++ const struct btf *btf = env->btf; +++ const struct btf_header *hdr; +++ +++ if (!bpf_verifier_log_needed(log)) +++ return; +++ +++ hdr = &btf->hdr; +++ __btf_verifier_log(log, "magic: 0x%x\n", hdr->magic); +++ __btf_verifier_log(log, "version: %u\n", hdr->version); +++ __btf_verifier_log(log, "flags: 0x%x\n", hdr->flags); +++ __btf_verifier_log(log, "hdr_len: %u\n", hdr->hdr_len); +++ __btf_verifier_log(log, "type_off: %u\n", hdr->type_off); +++ __btf_verifier_log(log, "type_len: %u\n", hdr->type_len); +++ __btf_verifier_log(log, "str_off: %u\n", hdr->str_off); +++ __btf_verifier_log(log, "str_len: %u\n", hdr->str_len); +++ __btf_verifier_log(log, "btf_total_size: %u\n", btf_data_size); +++} +++ +++static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) +++{ +++ struct btf *btf = env->btf; +++ +++ /* < 2 because +1 for btf_void which is always in btf->types[0]. +++ * btf_void is not accounted in btf->nr_types because btf_void +++ * does not come from the BTF file. +++ */ +++ if (btf->types_size - btf->nr_types < 2) { +++ /* Expand 'types' array */ +++ +++ struct btf_type **new_types; +++ u32 expand_by, new_size; +++ +++ if (btf->types_size == BTF_MAX_TYPE) { +++ btf_verifier_log(env, "Exceeded max num of types"); +++ return -E2BIG; +++ } +++ +++ expand_by = max_t(u32, btf->types_size >> 2, 16); +++ new_size = min_t(u32, BTF_MAX_TYPE, +++ btf->types_size + expand_by); +++ +++ new_types = kcalloc(new_size, sizeof(*new_types), +++ GFP_KERNEL | __GFP_NOWARN); +++ if (!new_types) +++ return -ENOMEM; +++ +++ if (btf->nr_types == 0) +++ new_types[0] = &btf_void; +++ else +++ memcpy(new_types, btf->types, +++ sizeof(*btf->types) * (btf->nr_types + 1)); +++ +++ kvfree(btf->types); +++ btf->types = new_types; +++ btf->types_size = new_size; +++ } +++ +++ btf->types[++(btf->nr_types)] = t; +++ +++ return 0; +++} +++ +++static int btf_alloc_id(struct btf *btf) +++{ +++ int id; +++ +++ idr_preload(GFP_KERNEL); +++ spin_lock_bh(&btf_idr_lock); +++ id = idr_alloc_cyclic(&btf_idr, btf, 1, INT_MAX, GFP_ATOMIC); +++ if (id > 0) +++ btf->id = id; +++ spin_unlock_bh(&btf_idr_lock); +++ idr_preload_end(); +++ +++ if (WARN_ON_ONCE(!id)) +++ return -ENOSPC; +++ +++ return id > 0 ? 0 : id; +++} +++ +++static void btf_free_id(struct btf *btf) +++{ +++ unsigned long flags; +++ +++ /* +++ * In map-in-map, calling map_delete_elem() on outer +++ * map will call bpf_map_put on the inner map. +++ * It will then eventually call btf_free_id() +++ * on the inner map. Some of the map_delete_elem() +++ * implementation may have irq disabled, so +++ * we need to use the _irqsave() version instead +++ * of the _bh() version. 
+++ */ +++ spin_lock_irqsave(&btf_idr_lock, flags); +++ idr_remove(&btf_idr, btf->id); +++ spin_unlock_irqrestore(&btf_idr_lock, flags); +++} +++ +++static void btf_free(struct btf *btf) +++{ +++ kvfree(btf->types); +++ kvfree(btf->resolved_sizes); +++ kvfree(btf->resolved_ids); +++ kvfree(btf->data); +++ kfree(btf); +++} +++ +++static void btf_free_rcu(struct rcu_head *rcu) +++{ +++ struct btf *btf = container_of(rcu, struct btf, rcu); +++ +++ btf_free(btf); +++} +++ +++void btf_put(struct btf *btf) +++{ +++ if (btf && refcount_dec_and_test(&btf->refcnt)) { +++ btf_free_id(btf); +++ call_rcu(&btf->rcu, btf_free_rcu); +++ } +++} +++ +++static int env_resolve_init(struct btf_verifier_env *env) +++{ +++ struct btf *btf = env->btf; +++ u32 nr_types = btf->nr_types; +++ u32 *resolved_sizes = NULL; +++ u32 *resolved_ids = NULL; +++ u8 *visit_states = NULL; +++ +++ /* +1 for btf_void */ +++ resolved_sizes = kcalloc(nr_types + 1, sizeof(*resolved_sizes), +++ GFP_KERNEL | __GFP_NOWARN); +++ if (!resolved_sizes) +++ goto nomem; +++ +++ resolved_ids = kcalloc(nr_types + 1, sizeof(*resolved_ids), +++ GFP_KERNEL | __GFP_NOWARN); +++ if (!resolved_ids) +++ goto nomem; +++ +++ visit_states = kcalloc(nr_types + 1, sizeof(*visit_states), +++ GFP_KERNEL | __GFP_NOWARN); +++ if (!visit_states) +++ goto nomem; +++ +++ btf->resolved_sizes = resolved_sizes; +++ btf->resolved_ids = resolved_ids; +++ env->visit_states = visit_states; +++ +++ return 0; +++ +++nomem: +++ kvfree(resolved_sizes); +++ kvfree(resolved_ids); +++ kvfree(visit_states); +++ return -ENOMEM; +++} +++ +++static void btf_verifier_env_free(struct btf_verifier_env *env) +++{ +++ kvfree(env->visit_states); +++ kfree(env); +++} +++ +++static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, +++ const struct btf_type *next_type) +++{ +++ switch (env->resolve_mode) { +++ case RESOLVE_TBD: +++ /* int, enum or void is a sink */ +++ return !btf_type_needs_resolve(next_type); +++ case RESOLVE_PTR: +++ /* int, enum, void, struct, array, func or func_proto is a sink +++ * for ptr +++ */ +++ return !btf_type_is_modifier(next_type) && +++ !btf_type_is_ptr(next_type); +++ case RESOLVE_STRUCT_OR_ARRAY: +++ /* int, enum, void, ptr, func or func_proto is a sink +++ * for struct and array +++ */ +++ return !btf_type_is_modifier(next_type) && +++ !btf_type_is_array(next_type) && +++ !btf_type_is_struct(next_type); +++ default: +++ BUG(); +++ } +++} +++ +++static bool env_type_is_resolved(const struct btf_verifier_env *env, +++ u32 type_id) +++{ +++ return env->visit_states[type_id] == RESOLVED; +++} +++ +++static int env_stack_push(struct btf_verifier_env *env, +++ const struct btf_type *t, u32 type_id) +++{ +++ struct resolve_vertex *v; +++ +++ if (env->top_stack == MAX_RESOLVE_DEPTH) +++ return -E2BIG; +++ +++ if (env->visit_states[type_id] != NOT_VISITED) +++ return -EEXIST; +++ +++ env->visit_states[type_id] = VISITED; +++ +++ v = &env->stack[env->top_stack++]; +++ v->t = t; +++ v->type_id = type_id; +++ v->next_member = 0; +++ +++ if (env->resolve_mode == RESOLVE_TBD) { +++ if (btf_type_is_ptr(t)) +++ env->resolve_mode = RESOLVE_PTR; +++ else if (btf_type_is_struct(t) || btf_type_is_array(t)) +++ env->resolve_mode = RESOLVE_STRUCT_OR_ARRAY; +++ } +++ +++ return 0; +++} +++ +++static void env_stack_set_next_member(struct btf_verifier_env *env, +++ u16 next_member) +++{ +++ env->stack[env->top_stack - 1].next_member = next_member; +++} +++ +++static void env_stack_pop_resolved(struct btf_verifier_env *env, +++ u32 resolved_type_id, +++ u32 
resolved_size) +++{ +++ u32 type_id = env->stack[--(env->top_stack)].type_id; +++ struct btf *btf = env->btf; +++ +++ btf->resolved_sizes[type_id] = resolved_size; +++ btf->resolved_ids[type_id] = resolved_type_id; +++ env->visit_states[type_id] = RESOLVED; +++} +++ +++static const struct resolve_vertex *env_stack_peak(struct btf_verifier_env *env) +++{ +++ return env->top_stack ? &env->stack[env->top_stack - 1] : NULL; +++} +++ +++/* The input param "type_id" must point to a needs_resolve type */ +++static const struct btf_type *btf_type_id_resolve(const struct btf *btf, +++ u32 *type_id) +++{ +++ *type_id = btf->resolved_ids[*type_id]; +++ return btf_type_by_id(btf, *type_id); +++} +++ +++const struct btf_type *btf_type_id_size(const struct btf *btf, +++ u32 *type_id, u32 *ret_size) +++{ +++ const struct btf_type *size_type; +++ u32 size_type_id = *type_id; +++ u32 size = 0; +++ +++ size_type = btf_type_by_id(btf, size_type_id); +++ if (btf_type_nosize_or_null(size_type)) +++ return NULL; +++ +++ if (btf_type_has_size(size_type)) { +++ size = size_type->size; +++ } else if (btf_type_is_array(size_type)) { +++ size = btf->resolved_sizes[size_type_id]; +++ } else if (btf_type_is_ptr(size_type)) { +++ size = sizeof(void *); +++ } else { +++ if (WARN_ON_ONCE(!btf_type_is_modifier(size_type) && +++ !btf_type_is_var(size_type))) +++ return NULL; +++ +++ size_type_id = btf->resolved_ids[size_type_id]; +++ size_type = btf_type_by_id(btf, size_type_id); +++ if (btf_type_nosize_or_null(size_type)) +++ return NULL; +++ else if (btf_type_has_size(size_type)) +++ size = size_type->size; +++ else if (btf_type_is_array(size_type)) +++ size = btf->resolved_sizes[size_type_id]; +++ else if (btf_type_is_ptr(size_type)) +++ size = sizeof(void *); +++ else +++ return NULL; +++ } +++ +++ *type_id = size_type_id; +++ if (ret_size) +++ *ret_size = size; +++ +++ return size_type; +++} +++ +++static int btf_df_check_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type) +++{ +++ btf_verifier_log_basic(env, struct_type, +++ "Unsupported check_member"); +++ return -EINVAL; +++} +++ +++static int btf_df_check_kflag_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type) +++{ +++ btf_verifier_log_basic(env, struct_type, +++ "Unsupported check_kflag_member"); +++ return -EINVAL; +++} +++ +++/* Used for ptr, array and struct/union type members. +++ * int, enum and modifier types have their specific callback functions. +++ */ +++static int btf_generic_check_kflag_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type) +++{ +++ if (BTF_MEMBER_BITFIELD_SIZE(member->offset)) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Invalid member bitfield_size"); +++ return -EINVAL; +++ } +++ +++ /* bitfield size is 0, so member->offset represents bit offset only. +++ * It is safe to call non kflag check_member variants. 
+++ */ +++ return btf_type_ops(member_type)->check_member(env, struct_type, +++ member, +++ member_type); +++} +++ +++static int btf_df_resolve(struct btf_verifier_env *env, +++ const struct resolve_vertex *v) +++{ +++ btf_verifier_log_basic(env, v->t, "Unsupported resolve"); +++ return -EINVAL; +++} +++ +++static void btf_df_seq_show(const struct btf *btf, const struct btf_type *t, +++ u32 type_id, void *data, u8 bits_offsets, +++ struct seq_file *m) +++{ +++ seq_printf(m, "<unsupported kind:%u>", BTF_INFO_KIND(t->info)); +++} +++ +++static int btf_int_check_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type) +++{ +++ u32 int_data = btf_type_int(member_type); +++ u32 struct_bits_off = member->offset; +++ u32 struct_size = struct_type->size; +++ u32 nr_copy_bits; +++ u32 bytes_offset; +++ +++ if (U32_MAX - struct_bits_off < BTF_INT_OFFSET(int_data)) { +++ btf_verifier_log_member(env, struct_type, member, +++ "bits_offset exceeds U32_MAX"); +++ return -EINVAL; +++ } +++ +++ struct_bits_off += BTF_INT_OFFSET(int_data); +++ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); +++ nr_copy_bits = BTF_INT_BITS(int_data) + +++ BITS_PER_BYTE_MASKED(struct_bits_off); +++ +++ if (nr_copy_bits > BITS_PER_U128) { +++ btf_verifier_log_member(env, struct_type, member, +++ "nr_copy_bits exceeds 128"); +++ return -EINVAL; +++ } +++ +++ if (struct_size < bytes_offset || +++ struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Member exceeds struct_size"); +++ return -EINVAL; +++ } +++ +++ return 0; +++} +++ +++static int btf_int_check_kflag_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type) +++{ +++ u32 struct_bits_off, nr_bits, nr_int_data_bits, bytes_offset; +++ u32 int_data = btf_type_int(member_type); +++ u32 struct_size = struct_type->size; +++ u32 nr_copy_bits; +++ +++ /* a regular int type is required for the kflag int member */ +++ if (!btf_type_int_is_regular(member_type)) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Invalid member base type"); +++ return -EINVAL; +++ } +++ +++ /* check sanity of bitfield size */ +++ nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); +++ struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); +++ nr_int_data_bits = BTF_INT_BITS(int_data); +++ if (!nr_bits) { +++ /* Not a bitfield member, member offset must be at byte +++ * boundary.
+++ */ +++ if (BITS_PER_BYTE_MASKED(struct_bits_off)) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Invalid member offset"); +++ return -EINVAL; +++ } +++ +++ nr_bits = nr_int_data_bits; +++ } else if (nr_bits > nr_int_data_bits) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Invalid member bitfield_size"); +++ return -EINVAL; +++ } +++ +++ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); +++ nr_copy_bits = nr_bits + BITS_PER_BYTE_MASKED(struct_bits_off); +++ if (nr_copy_bits > BITS_PER_U128) { +++ btf_verifier_log_member(env, struct_type, member, +++ "nr_copy_bits exceeds 128"); +++ return -EINVAL; +++ } +++ +++ if (struct_size < bytes_offset || +++ struct_size - bytes_offset < BITS_ROUNDUP_BYTES(nr_copy_bits)) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Member exceeds struct_size"); +++ return -EINVAL; +++ } +++ +++ return 0; +++} +++ +++static s32 btf_int_check_meta(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ u32 meta_left) +++{ +++ u32 int_data, nr_bits, meta_needed = sizeof(int_data); +++ u16 encoding; +++ +++ if (meta_left < meta_needed) { +++ btf_verifier_log_basic(env, t, +++ "meta_left:%u meta_needed:%u", +++ meta_left, meta_needed); +++ return -EINVAL; +++ } +++ +++ if (btf_type_vlen(t)) { +++ btf_verifier_log_type(env, t, "vlen != 0"); +++ return -EINVAL; +++ } +++ +++ if (btf_type_kflag(t)) { +++ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); +++ return -EINVAL; +++ } +++ +++ int_data = btf_type_int(t); +++ if (int_data & ~BTF_INT_MASK) { +++ btf_verifier_log_basic(env, t, "Invalid int_data:%x", +++ int_data); +++ return -EINVAL; +++ } +++ +++ nr_bits = BTF_INT_BITS(int_data) + BTF_INT_OFFSET(int_data); +++ +++ if (nr_bits > BITS_PER_U128) { +++ btf_verifier_log_type(env, t, "nr_bits exceeds %zu", +++ BITS_PER_U128); +++ return -EINVAL; +++ } +++ +++ if (BITS_ROUNDUP_BYTES(nr_bits) > t->size) { +++ btf_verifier_log_type(env, t, "nr_bits exceeds type_size"); +++ return -EINVAL; +++ } +++ +++ /* +++ * Only one of the encoding bits is allowed and it +++ * should be sufficient for the pretty print purpose (i.e. decoding). +++ * Multiple bits can be allowed later if it is found +++ * to be insufficient. +++ */ +++ encoding = BTF_INT_ENCODING(int_data); +++ if (encoding && +++ encoding != BTF_INT_SIGNED && +++ encoding != BTF_INT_CHAR && +++ encoding != BTF_INT_BOOL) { +++ btf_verifier_log_type(env, t, "Unsupported encoding"); +++ return -ENOTSUPP; +++ } +++ +++ btf_verifier_log_type(env, t, NULL); +++ +++ return meta_needed; +++} +++ +++static void btf_int_log(struct btf_verifier_env *env, +++ const struct btf_type *t) +++{ +++ int int_data = btf_type_int(t); +++ +++ btf_verifier_log(env, +++ "size=%u bits_offset=%u nr_bits=%u encoding=%s", +++ t->size, BTF_INT_OFFSET(int_data), +++ BTF_INT_BITS(int_data), +++ btf_int_encoding_str(BTF_INT_ENCODING(int_data))); +++} +++ +++static void btf_int128_print(struct seq_file *m, void *data) +++{ +++ /* data points to a __int128 number. 
+++ * Suppose +++ * int128_num = *(__int128 *)data; +++ * The below formulas shows what upper_num and lower_num represents: +++ * upper_num = int128_num >> 64; +++ * lower_num = int128_num & 0xffffffffFFFFFFFFULL; +++ */ +++ u64 upper_num, lower_num; +++ +++#ifdef __BIG_ENDIAN_BITFIELD +++ upper_num = *(u64 *)data; +++ lower_num = *(u64 *)(data + 8); +++#else +++ upper_num = *(u64 *)(data + 8); +++ lower_num = *(u64 *)data; +++#endif +++ if (upper_num == 0) +++ seq_printf(m, "0x%llx", lower_num); +++ else +++ seq_printf(m, "0x%llx%016llx", upper_num, lower_num); +++} +++ +++static void btf_int128_shift(u64 *print_num, u16 left_shift_bits, +++ u16 right_shift_bits) +++{ +++ u64 upper_num, lower_num; +++ +++#ifdef __BIG_ENDIAN_BITFIELD +++ upper_num = print_num[0]; +++ lower_num = print_num[1]; +++#else +++ upper_num = print_num[1]; +++ lower_num = print_num[0]; +++#endif +++ +++ /* shake out un-needed bits by shift/or operations */ +++ if (left_shift_bits >= 64) { +++ upper_num = lower_num << (left_shift_bits - 64); +++ lower_num = 0; +++ } else { +++ upper_num = (upper_num << left_shift_bits) | +++ (lower_num >> (64 - left_shift_bits)); +++ lower_num = lower_num << left_shift_bits; +++ } +++ +++ if (right_shift_bits >= 64) { +++ lower_num = upper_num >> (right_shift_bits - 64); +++ upper_num = 0; +++ } else { +++ lower_num = (lower_num >> right_shift_bits) | +++ (upper_num << (64 - right_shift_bits)); +++ upper_num = upper_num >> right_shift_bits; +++ } +++ +++#ifdef __BIG_ENDIAN_BITFIELD +++ print_num[0] = upper_num; +++ print_num[1] = lower_num; +++#else +++ print_num[0] = lower_num; +++ print_num[1] = upper_num; +++#endif +++} +++ +++static void btf_bitfield_seq_show(void *data, u8 bits_offset, +++ u8 nr_bits, struct seq_file *m) +++{ +++ u16 left_shift_bits, right_shift_bits; +++ u8 nr_copy_bytes; +++ u8 nr_copy_bits; +++ u64 print_num[2] = {}; +++ +++ nr_copy_bits = nr_bits + bits_offset; +++ nr_copy_bytes = BITS_ROUNDUP_BYTES(nr_copy_bits); +++ +++ memcpy(print_num, data, nr_copy_bytes); +++ +++#ifdef __BIG_ENDIAN_BITFIELD +++ left_shift_bits = bits_offset; +++#else +++ left_shift_bits = BITS_PER_U128 - nr_copy_bits; +++#endif +++ right_shift_bits = BITS_PER_U128 - nr_bits; +++ +++ btf_int128_shift(print_num, left_shift_bits, right_shift_bits); +++ btf_int128_print(m, print_num); +++} +++ +++ +++static void btf_int_bits_seq_show(const struct btf *btf, +++ const struct btf_type *t, +++ void *data, u8 bits_offset, +++ struct seq_file *m) +++{ +++ u32 int_data = btf_type_int(t); +++ u8 nr_bits = BTF_INT_BITS(int_data); +++ u8 total_bits_offset; +++ +++ /* +++ * bits_offset is at most 7. +++ * BTF_INT_OFFSET() cannot exceed 128 bits. 
+++ */ +++ total_bits_offset = bits_offset + BTF_INT_OFFSET(int_data); +++ data += BITS_ROUNDDOWN_BYTES(total_bits_offset); +++ bits_offset = BITS_PER_BYTE_MASKED(total_bits_offset); +++ btf_bitfield_seq_show(data, bits_offset, nr_bits, m); +++} +++ +++static void btf_int_seq_show(const struct btf *btf, const struct btf_type *t, +++ u32 type_id, void *data, u8 bits_offset, +++ struct seq_file *m) +++{ +++ u32 int_data = btf_type_int(t); +++ u8 encoding = BTF_INT_ENCODING(int_data); +++ bool sign = encoding & BTF_INT_SIGNED; +++ u8 nr_bits = BTF_INT_BITS(int_data); +++ +++ if (bits_offset || BTF_INT_OFFSET(int_data) || +++ BITS_PER_BYTE_MASKED(nr_bits)) { +++ btf_int_bits_seq_show(btf, t, data, bits_offset, m); +++ return; +++ } +++ +++ switch (nr_bits) { +++ case 128: +++ btf_int128_print(m, data); +++ break; +++ case 64: +++ if (sign) +++ seq_printf(m, "%lld", *(s64 *)data); +++ else +++ seq_printf(m, "%llu", *(u64 *)data); +++ break; +++ case 32: +++ if (sign) +++ seq_printf(m, "%d", *(s32 *)data); +++ else +++ seq_printf(m, "%u", *(u32 *)data); +++ break; +++ case 16: +++ if (sign) +++ seq_printf(m, "%d", *(s16 *)data); +++ else +++ seq_printf(m, "%u", *(u16 *)data); +++ break; +++ case 8: +++ if (sign) +++ seq_printf(m, "%d", *(s8 *)data); +++ else +++ seq_printf(m, "%u", *(u8 *)data); +++ break; +++ default: +++ btf_int_bits_seq_show(btf, t, data, bits_offset, m); +++ } +++} +++ +++static const struct btf_kind_operations int_ops = { +++ .check_meta = btf_int_check_meta, +++ .resolve = btf_df_resolve, +++ .check_member = btf_int_check_member, +++ .check_kflag_member = btf_int_check_kflag_member, +++ .log_details = btf_int_log, +++ .seq_show = btf_int_seq_show, +++}; +++ +++static int btf_modifier_check_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type) +++{ +++ const struct btf_type *resolved_type; +++ u32 resolved_type_id = member->type; +++ struct btf_member resolved_member; +++ struct btf *btf = env->btf; +++ +++ resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); +++ if (!resolved_type) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Invalid member"); +++ return -EINVAL; +++ } +++ +++ resolved_member = *member; +++ resolved_member.type = resolved_type_id; +++ +++ return btf_type_ops(resolved_type)->check_member(env, struct_type, +++ &resolved_member, +++ resolved_type); +++} +++ +++static int btf_modifier_check_kflag_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type) +++{ +++ const struct btf_type *resolved_type; +++ u32 resolved_type_id = member->type; +++ struct btf_member resolved_member; +++ struct btf *btf = env->btf; +++ +++ resolved_type = btf_type_id_size(btf, &resolved_type_id, NULL); +++ if (!resolved_type) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Invalid member"); +++ return -EINVAL; +++ } +++ +++ resolved_member = *member; +++ resolved_member.type = resolved_type_id; +++ +++ return btf_type_ops(resolved_type)->check_kflag_member(env, struct_type, +++ &resolved_member, +++ resolved_type); +++} +++ +++static int btf_ptr_check_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type) +++{ +++ u32 struct_size, struct_bits_off, bytes_offset; +++ +++ struct_size = struct_type->size; +++ struct_bits_off = member->offset; +++ 
bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); +++ +++ if (BITS_PER_BYTE_MASKED(struct_bits_off)) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Member is not byte aligned"); +++ return -EINVAL; +++ } +++ +++ if (struct_size - bytes_offset < sizeof(void *)) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Member exceeds struct_size"); +++ return -EINVAL; +++ } +++ +++ return 0; +++} +++ +++static int btf_ref_type_check_meta(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ u32 meta_left) +++{ +++ if (btf_type_vlen(t)) { +++ btf_verifier_log_type(env, t, "vlen != 0"); +++ return -EINVAL; +++ } +++ +++ if (btf_type_kflag(t)) { +++ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); +++ return -EINVAL; +++ } +++ +++ if (!BTF_TYPE_ID_VALID(t->type)) { +++ btf_verifier_log_type(env, t, "Invalid type_id"); +++ return -EINVAL; +++ } +++ +++ /* typedef type must have a valid name, and other ref types, +++ * volatile, const, restrict, should have a null name. +++ */ +++ if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) { +++ if (!t->name_off || +++ !btf_name_valid_identifier(env->btf, t->name_off)) { +++ btf_verifier_log_type(env, t, "Invalid name"); +++ return -EINVAL; +++ } +++ } else { +++ if (t->name_off) { +++ btf_verifier_log_type(env, t, "Invalid name"); +++ return -EINVAL; +++ } +++ } +++ +++ btf_verifier_log_type(env, t, NULL); +++ +++ return 0; +++} +++ +++static int btf_modifier_resolve(struct btf_verifier_env *env, +++ const struct resolve_vertex *v) +++{ +++ const struct btf_type *t = v->t; +++ const struct btf_type *next_type; +++ u32 next_type_id = t->type; +++ struct btf *btf = env->btf; +++ +++ next_type = btf_type_by_id(btf, next_type_id); +++ if (!next_type || btf_type_is_resolve_source_only(next_type)) { +++ btf_verifier_log_type(env, v->t, "Invalid type_id"); +++ return -EINVAL; +++ } +++ +++ if (!env_type_is_resolve_sink(env, next_type) && +++ !env_type_is_resolved(env, next_type_id)) +++ return env_stack_push(env, next_type, next_type_id); +++ +++ /* Figure out the resolved next_type_id with size. +++ * They will be stored in the current modifier's +++ * resolved_ids and resolved_sizes such that it can +++ * save us a few type-following when we use it later (e.g. in +++ * pretty print). 
+++ */ +++ if (!btf_type_id_size(btf, &next_type_id, NULL)) { +++ if (env_type_is_resolved(env, next_type_id)) +++ next_type = btf_type_id_resolve(btf, &next_type_id); +++ +++ /* "typedef void new_void", "const void"...etc */ +++ if (!btf_type_is_void(next_type) && +++ !btf_type_is_fwd(next_type) && +++ !btf_type_is_func_proto(next_type)) { +++ btf_verifier_log_type(env, v->t, "Invalid type_id"); +++ return -EINVAL; +++ } +++ } +++ +++ env_stack_pop_resolved(env, next_type_id, 0); +++ +++ return 0; +++} +++ +++static int btf_var_resolve(struct btf_verifier_env *env, +++ const struct resolve_vertex *v) +++{ +++ const struct btf_type *next_type; +++ const struct btf_type *t = v->t; +++ u32 next_type_id = t->type; +++ struct btf *btf = env->btf; +++ +++ next_type = btf_type_by_id(btf, next_type_id); +++ if (!next_type || btf_type_is_resolve_source_only(next_type)) { +++ btf_verifier_log_type(env, v->t, "Invalid type_id"); +++ return -EINVAL; +++ } +++ +++ if (!env_type_is_resolve_sink(env, next_type) && +++ !env_type_is_resolved(env, next_type_id)) +++ return env_stack_push(env, next_type, next_type_id); +++ +++ if (btf_type_is_modifier(next_type)) { +++ const struct btf_type *resolved_type; +++ u32 resolved_type_id; +++ +++ resolved_type_id = next_type_id; +++ resolved_type = btf_type_id_resolve(btf, &resolved_type_id); +++ +++ if (btf_type_is_ptr(resolved_type) && +++ !env_type_is_resolve_sink(env, resolved_type) && +++ !env_type_is_resolved(env, resolved_type_id)) +++ return env_stack_push(env, resolved_type, +++ resolved_type_id); +++ } +++ +++ /* We must resolve to something concrete at this point, no +++ * forward types or similar that would resolve to size of +++ * zero is allowed. +++ */ +++ if (!btf_type_id_size(btf, &next_type_id, NULL)) { +++ btf_verifier_log_type(env, v->t, "Invalid type_id"); +++ return -EINVAL; +++ } +++ +++ env_stack_pop_resolved(env, next_type_id, 0); +++ +++ return 0; +++} +++ +++static int btf_ptr_resolve(struct btf_verifier_env *env, +++ const struct resolve_vertex *v) +++{ +++ const struct btf_type *next_type; +++ const struct btf_type *t = v->t; +++ u32 next_type_id = t->type; +++ struct btf *btf = env->btf; +++ +++ next_type = btf_type_by_id(btf, next_type_id); +++ if (!next_type || btf_type_is_resolve_source_only(next_type)) { +++ btf_verifier_log_type(env, v->t, "Invalid type_id"); +++ return -EINVAL; +++ } +++ +++ if (!env_type_is_resolve_sink(env, next_type) && +++ !env_type_is_resolved(env, next_type_id)) +++ return env_stack_push(env, next_type, next_type_id); +++ +++ /* If the modifier was RESOLVED during RESOLVE_STRUCT_OR_ARRAY, +++ * the modifier may have stopped resolving when it was resolved +++ * to a ptr (last-resolved-ptr). +++ * +++ * We now need to continue from the last-resolved-ptr to +++ * ensure the last-resolved-ptr will not referring back to +++ * the currenct ptr (t). 
+++ */ +++ if (btf_type_is_modifier(next_type)) { +++ const struct btf_type *resolved_type; +++ u32 resolved_type_id; +++ +++ resolved_type_id = next_type_id; +++ resolved_type = btf_type_id_resolve(btf, &resolved_type_id); +++ +++ if (btf_type_is_ptr(resolved_type) && +++ !env_type_is_resolve_sink(env, resolved_type) && +++ !env_type_is_resolved(env, resolved_type_id)) +++ return env_stack_push(env, resolved_type, +++ resolved_type_id); +++ } +++ +++ if (!btf_type_id_size(btf, &next_type_id, NULL)) { +++ if (env_type_is_resolved(env, next_type_id)) +++ next_type = btf_type_id_resolve(btf, &next_type_id); +++ +++ if (!btf_type_is_void(next_type) && +++ !btf_type_is_fwd(next_type) && +++ !btf_type_is_func_proto(next_type)) { +++ btf_verifier_log_type(env, v->t, "Invalid type_id"); +++ return -EINVAL; +++ } +++ } +++ +++ env_stack_pop_resolved(env, next_type_id, 0); +++ +++ return 0; +++} +++ +++static void btf_modifier_seq_show(const struct btf *btf, +++ const struct btf_type *t, +++ u32 type_id, void *data, +++ u8 bits_offset, struct seq_file *m) +++{ +++ t = btf_type_id_resolve(btf, &type_id); +++ +++ btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +++} +++ +++static void btf_var_seq_show(const struct btf *btf, const struct btf_type *t, +++ u32 type_id, void *data, u8 bits_offset, +++ struct seq_file *m) +++{ +++ t = btf_type_id_resolve(btf, &type_id); +++ +++ btf_type_ops(t)->seq_show(btf, t, type_id, data, bits_offset, m); +++} +++ +++static void btf_ptr_seq_show(const struct btf *btf, const struct btf_type *t, +++ u32 type_id, void *data, u8 bits_offset, +++ struct seq_file *m) +++{ +++ /* It is a hashed value */ +++ seq_printf(m, "%p", *(void **)data); +++} +++ +++static void btf_ref_type_log(struct btf_verifier_env *env, +++ const struct btf_type *t) +++{ +++ btf_verifier_log(env, "type_id=%u", t->type); +++} +++ +++static struct btf_kind_operations modifier_ops = { +++ .check_meta = btf_ref_type_check_meta, +++ .resolve = btf_modifier_resolve, +++ .check_member = btf_modifier_check_member, +++ .check_kflag_member = btf_modifier_check_kflag_member, +++ .log_details = btf_ref_type_log, +++ .seq_show = btf_modifier_seq_show, +++}; +++ +++static struct btf_kind_operations ptr_ops = { +++ .check_meta = btf_ref_type_check_meta, +++ .resolve = btf_ptr_resolve, +++ .check_member = btf_ptr_check_member, +++ .check_kflag_member = btf_generic_check_kflag_member, +++ .log_details = btf_ref_type_log, +++ .seq_show = btf_ptr_seq_show, +++}; +++ +++static s32 btf_fwd_check_meta(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ u32 meta_left) +++{ +++ if (btf_type_vlen(t)) { +++ btf_verifier_log_type(env, t, "vlen != 0"); +++ return -EINVAL; +++ } +++ +++ if (t->type) { +++ btf_verifier_log_type(env, t, "type != 0"); +++ return -EINVAL; +++ } +++ +++ /* fwd type must have a valid name */ +++ if (!t->name_off || +++ !btf_name_valid_identifier(env->btf, t->name_off)) { +++ btf_verifier_log_type(env, t, "Invalid name"); +++ return -EINVAL; +++ } +++ +++ btf_verifier_log_type(env, t, NULL); +++ +++ return 0; +++} +++ +++static void btf_fwd_type_log(struct btf_verifier_env *env, +++ const struct btf_type *t) +++{ +++ btf_verifier_log(env, "%s", btf_type_kflag(t) ? 
"union" : "struct"); +++} +++ +++static struct btf_kind_operations fwd_ops = { +++ .check_meta = btf_fwd_check_meta, +++ .resolve = btf_df_resolve, +++ .check_member = btf_df_check_member, +++ .check_kflag_member = btf_df_check_kflag_member, +++ .log_details = btf_fwd_type_log, +++ .seq_show = btf_df_seq_show, +++}; +++ +++static int btf_array_check_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type) +++{ +++ u32 struct_bits_off = member->offset; +++ u32 struct_size, bytes_offset; +++ u32 array_type_id, array_size; +++ struct btf *btf = env->btf; +++ +++ if (BITS_PER_BYTE_MASKED(struct_bits_off)) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Member is not byte aligned"); +++ return -EINVAL; +++ } +++ +++ array_type_id = member->type; +++ btf_type_id_size(btf, &array_type_id, &array_size); +++ struct_size = struct_type->size; +++ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); +++ if (struct_size - bytes_offset < array_size) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Member exceeds struct_size"); +++ return -EINVAL; +++ } +++ +++ return 0; +++} +++ +++static s32 btf_array_check_meta(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ u32 meta_left) +++{ +++ const struct btf_array *array = btf_type_array(t); +++ u32 meta_needed = sizeof(*array); +++ +++ if (meta_left < meta_needed) { +++ btf_verifier_log_basic(env, t, +++ "meta_left:%u meta_needed:%u", +++ meta_left, meta_needed); +++ return -EINVAL; +++ } +++ +++ /* array type should not have a name */ +++ if (t->name_off) { +++ btf_verifier_log_type(env, t, "Invalid name"); +++ return -EINVAL; +++ } +++ +++ if (btf_type_vlen(t)) { +++ btf_verifier_log_type(env, t, "vlen != 0"); +++ return -EINVAL; +++ } +++ +++ if (btf_type_kflag(t)) { +++ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); +++ return -EINVAL; +++ } +++ +++ if (t->size) { +++ btf_verifier_log_type(env, t, "size != 0"); +++ return -EINVAL; +++ } +++ +++ /* Array elem type and index type cannot be in type void, +++ * so !array->type and !array->index_type are not allowed. 
+++ */ +++ if (!array->type || !BTF_TYPE_ID_VALID(array->type)) { +++ btf_verifier_log_type(env, t, "Invalid elem"); +++ return -EINVAL; +++ } +++ +++ if (!array->index_type || !BTF_TYPE_ID_VALID(array->index_type)) { +++ btf_verifier_log_type(env, t, "Invalid index"); +++ return -EINVAL; +++ } +++ +++ btf_verifier_log_type(env, t, NULL); +++ +++ return meta_needed; +++} +++ +++static int btf_array_resolve(struct btf_verifier_env *env, +++ const struct resolve_vertex *v) +++{ +++ const struct btf_array *array = btf_type_array(v->t); +++ const struct btf_type *elem_type, *index_type; +++ u32 elem_type_id, index_type_id; +++ struct btf *btf = env->btf; +++ u32 elem_size; +++ +++ /* Check array->index_type */ +++ index_type_id = array->index_type; +++ index_type = btf_type_by_id(btf, index_type_id); +++ if (btf_type_nosize_or_null(index_type) || +++ btf_type_is_resolve_source_only(index_type)) { +++ btf_verifier_log_type(env, v->t, "Invalid index"); +++ return -EINVAL; +++ } +++ +++ if (!env_type_is_resolve_sink(env, index_type) && +++ !env_type_is_resolved(env, index_type_id)) +++ return env_stack_push(env, index_type, index_type_id); +++ +++ index_type = btf_type_id_size(btf, &index_type_id, NULL); +++ if (!index_type || !btf_type_is_int(index_type) || +++ !btf_type_int_is_regular(index_type)) { +++ btf_verifier_log_type(env, v->t, "Invalid index"); +++ return -EINVAL; +++ } +++ +++ /* Check array->type */ +++ elem_type_id = array->type; +++ elem_type = btf_type_by_id(btf, elem_type_id); +++ if (btf_type_nosize_or_null(elem_type) || +++ btf_type_is_resolve_source_only(elem_type)) { +++ btf_verifier_log_type(env, v->t, +++ "Invalid elem"); +++ return -EINVAL; +++ } +++ +++ if (!env_type_is_resolve_sink(env, elem_type) && +++ !env_type_is_resolved(env, elem_type_id)) +++ return env_stack_push(env, elem_type, elem_type_id); +++ +++ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); +++ if (!elem_type) { +++ btf_verifier_log_type(env, v->t, "Invalid elem"); +++ return -EINVAL; +++ } +++ +++ if (btf_type_is_int(elem_type) && !btf_type_int_is_regular(elem_type)) { +++ btf_verifier_log_type(env, v->t, "Invalid array of int"); +++ return -EINVAL; +++ } +++ +++ if (array->nelems && elem_size > U32_MAX / array->nelems) { +++ btf_verifier_log_type(env, v->t, +++ "Array size overflows U32_MAX"); +++ return -EINVAL; +++ } +++ +++ env_stack_pop_resolved(env, elem_type_id, elem_size * array->nelems); +++ +++ return 0; +++} +++ +++static void btf_array_log(struct btf_verifier_env *env, +++ const struct btf_type *t) +++{ +++ const struct btf_array *array = btf_type_array(t); +++ +++ btf_verifier_log(env, "type_id=%u index_type_id=%u nr_elems=%u", +++ array->type, array->index_type, array->nelems); +++} +++ +++static void btf_array_seq_show(const struct btf *btf, const struct btf_type *t, +++ u32 type_id, void *data, u8 bits_offset, +++ struct seq_file *m) +++{ +++ const struct btf_array *array = btf_type_array(t); +++ const struct btf_kind_operations *elem_ops; +++ const struct btf_type *elem_type; +++ u32 i, elem_size, elem_type_id; +++ +++ elem_type_id = array->type; +++ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); +++ elem_ops = btf_type_ops(elem_type); +++ seq_puts(m, "["); +++ for (i = 0; i < array->nelems; i++) { +++ if (i) +++ seq_puts(m, ","); +++ +++ elem_ops->seq_show(btf, elem_type, elem_type_id, data, +++ bits_offset, m); +++ data += elem_size; +++ } +++ seq_puts(m, "]"); +++} +++ +++static struct btf_kind_operations array_ops = { +++ .check_meta = 
btf_array_check_meta, +++ .resolve = btf_array_resolve, +++ .check_member = btf_array_check_member, +++ .check_kflag_member = btf_generic_check_kflag_member, +++ .log_details = btf_array_log, +++ .seq_show = btf_array_seq_show, +++}; +++ +++static int btf_struct_check_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type) +++{ +++ u32 struct_bits_off = member->offset; +++ u32 struct_size, bytes_offset; +++ +++ if (BITS_PER_BYTE_MASKED(struct_bits_off)) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Member is not byte aligned"); +++ return -EINVAL; +++ } +++ +++ struct_size = struct_type->size; +++ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); +++ if (struct_size - bytes_offset < member_type->size) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Member exceeds struct_size"); +++ return -EINVAL; +++ } +++ +++ return 0; +++} +++ +++static s32 btf_struct_check_meta(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ u32 meta_left) +++{ +++ bool is_union = BTF_INFO_KIND(t->info) == BTF_KIND_UNION; +++ const struct btf_member *member; +++ u32 meta_needed, last_offset; +++ struct btf *btf = env->btf; +++ u32 struct_size = t->size; +++ u32 offset; +++ u16 i; +++ +++ meta_needed = btf_type_vlen(t) * sizeof(*member); +++ if (meta_left < meta_needed) { +++ btf_verifier_log_basic(env, t, +++ "meta_left:%u meta_needed:%u", +++ meta_left, meta_needed); +++ return -EINVAL; +++ } +++ +++ /* struct type either no name or a valid one */ +++ if (t->name_off && +++ !btf_name_valid_identifier(env->btf, t->name_off)) { +++ btf_verifier_log_type(env, t, "Invalid name"); +++ return -EINVAL; +++ } +++ +++ btf_verifier_log_type(env, t, NULL); +++ +++ last_offset = 0; +++ for_each_member(i, t, member) { +++ if (!btf_name_offset_valid(btf, member->name_off)) { +++ btf_verifier_log_member(env, t, member, +++ "Invalid member name_offset:%u", +++ member->name_off); +++ return -EINVAL; +++ } +++ +++ /* struct member either no name or a valid one */ +++ if (member->name_off && +++ !btf_name_valid_identifier(btf, member->name_off)) { +++ btf_verifier_log_member(env, t, member, "Invalid name"); +++ return -EINVAL; +++ } +++ /* A member cannot be in type void */ +++ if (!member->type || !BTF_TYPE_ID_VALID(member->type)) { +++ btf_verifier_log_member(env, t, member, +++ "Invalid type_id"); +++ return -EINVAL; +++ } +++ +++ offset = btf_member_bit_offset(t, member); +++ if (is_union && offset) { +++ btf_verifier_log_member(env, t, member, +++ "Invalid member bits_offset"); +++ return -EINVAL; +++ } +++ +++ /* +++ * ">" instead of ">=" because the last member could be +++ * "char a[0];" +++ */ +++ if (last_offset > offset) { +++ btf_verifier_log_member(env, t, member, +++ "Invalid member bits_offset"); +++ return -EINVAL; +++ } +++ +++ if (BITS_ROUNDUP_BYTES(offset) > struct_size) { +++ btf_verifier_log_member(env, t, member, +++ "Member bits_offset exceeds its struct size"); +++ return -EINVAL; +++ } +++ +++ btf_verifier_log_member(env, t, member, NULL); +++ last_offset = offset; +++ } +++ +++ return meta_needed; +++} +++ +++static int btf_struct_resolve(struct btf_verifier_env *env, +++ const struct resolve_vertex *v) +++{ +++ const struct btf_member *member; +++ int err; +++ u16 i; +++ +++ /* Before continue resolving the next_member, +++ * ensure the last member is indeed resolved to a +++ * type with size info. 
+++ */ +++ if (v->next_member) { +++ const struct btf_type *last_member_type; +++ const struct btf_member *last_member; +++ u16 last_member_type_id; +++ +++ last_member = btf_type_member(v->t) + v->next_member - 1; +++ last_member_type_id = last_member->type; +++ if (WARN_ON_ONCE(!env_type_is_resolved(env, +++ last_member_type_id))) +++ return -EINVAL; +++ +++ last_member_type = btf_type_by_id(env->btf, +++ last_member_type_id); +++ if (btf_type_kflag(v->t)) +++ err = btf_type_ops(last_member_type)->check_kflag_member(env, v->t, +++ last_member, +++ last_member_type); +++ else +++ err = btf_type_ops(last_member_type)->check_member(env, v->t, +++ last_member, +++ last_member_type); +++ if (err) +++ return err; +++ } +++ +++ for_each_member_from(i, v->next_member, v->t, member) { +++ u32 member_type_id = member->type; +++ const struct btf_type *member_type = btf_type_by_id(env->btf, +++ member_type_id); +++ +++ if (btf_type_nosize_or_null(member_type) || +++ btf_type_is_resolve_source_only(member_type)) { +++ btf_verifier_log_member(env, v->t, member, +++ "Invalid member"); +++ return -EINVAL; +++ } +++ +++ if (!env_type_is_resolve_sink(env, member_type) && +++ !env_type_is_resolved(env, member_type_id)) { +++ env_stack_set_next_member(env, i + 1); +++ return env_stack_push(env, member_type, member_type_id); +++ } +++ +++ if (btf_type_kflag(v->t)) +++ err = btf_type_ops(member_type)->check_kflag_member(env, v->t, +++ member, +++ member_type); +++ else +++ err = btf_type_ops(member_type)->check_member(env, v->t, +++ member, +++ member_type); +++ if (err) +++ return err; +++ } +++ +++ env_stack_pop_resolved(env, 0, 0); +++ +++ return 0; +++} +++ +++static void btf_struct_log(struct btf_verifier_env *env, +++ const struct btf_type *t) +++{ +++ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +++} +++ +++/* find 'struct bpf_spin_lock' in map value. +++ * return >= 0 offset if found +++ * and < 0 in case of error +++ */ +++int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +++{ +++ const struct btf_member *member; +++ u32 i, off = -ENOENT; +++ +++ if (!__btf_type_is_struct(t)) +++ return -EINVAL; +++ +++ for_each_member(i, t, member) { +++ const struct btf_type *member_type = btf_type_by_id(btf, +++ member->type); +++ if (!__btf_type_is_struct(member_type)) +++ continue; +++ if (member_type->size != sizeof(struct bpf_spin_lock)) +++ continue; +++ if (strcmp(__btf_name_by_offset(btf, member_type->name_off), +++ "bpf_spin_lock")) +++ continue; +++ if (off != -ENOENT) +++ /* only one 'struct bpf_spin_lock' is allowed */ +++ return -E2BIG; +++ off = btf_member_bit_offset(t, member); +++ if (off % 8) +++ /* valid C code cannot generate such BTF */ +++ return -EINVAL; +++ off /= 8; +++ if (off % __alignof__(struct bpf_spin_lock)) +++ /* valid struct bpf_spin_lock will be 4 byte aligned */ +++ return -EINVAL; +++ } +++ return off; +++} +++ +++static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, +++ u32 type_id, void *data, u8 bits_offset, +++ struct seq_file *m) +++{ +++ const char *seq = BTF_INFO_KIND(t->info) == BTF_KIND_UNION ? 
"|" : ","; +++ const struct btf_member *member; +++ u32 i; +++ +++ seq_puts(m, "{"); +++ for_each_member(i, t, member) { +++ const struct btf_type *member_type = btf_type_by_id(btf, +++ member->type); +++ const struct btf_kind_operations *ops; +++ u32 member_offset, bitfield_size; +++ u32 bytes_offset; +++ u8 bits8_offset; +++ +++ if (i) +++ seq_puts(m, seq); +++ +++ member_offset = btf_member_bit_offset(t, member); +++ bitfield_size = btf_member_bitfield_size(t, member); +++ bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset); +++ bits8_offset = BITS_PER_BYTE_MASKED(member_offset); +++ if (bitfield_size) { +++ btf_bitfield_seq_show(data + bytes_offset, bits8_offset, +++ bitfield_size, m); +++ } else { +++ ops = btf_type_ops(member_type); +++ ops->seq_show(btf, member_type, member->type, +++ data + bytes_offset, bits8_offset, m); +++ } +++ } +++ seq_puts(m, "}"); +++} +++ +++static struct btf_kind_operations struct_ops = { +++ .check_meta = btf_struct_check_meta, +++ .resolve = btf_struct_resolve, +++ .check_member = btf_struct_check_member, +++ .check_kflag_member = btf_generic_check_kflag_member, +++ .log_details = btf_struct_log, +++ .seq_show = btf_struct_seq_show, +++}; +++ +++static int btf_enum_check_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type) +++{ +++ u32 struct_bits_off = member->offset; +++ u32 struct_size, bytes_offset; +++ +++ if (BITS_PER_BYTE_MASKED(struct_bits_off)) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Member is not byte aligned"); +++ return -EINVAL; +++ } +++ +++ struct_size = struct_type->size; +++ bytes_offset = BITS_ROUNDDOWN_BYTES(struct_bits_off); +++ if (struct_size - bytes_offset < member_type->size) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Member exceeds struct_size"); +++ return -EINVAL; +++ } +++ +++ return 0; +++} +++ +++static int btf_enum_check_kflag_member(struct btf_verifier_env *env, +++ const struct btf_type *struct_type, +++ const struct btf_member *member, +++ const struct btf_type *member_type) +++{ +++ u32 struct_bits_off, nr_bits, bytes_end, struct_size; +++ u32 int_bitsize = sizeof(int) * BITS_PER_BYTE; +++ +++ struct_bits_off = BTF_MEMBER_BIT_OFFSET(member->offset); +++ nr_bits = BTF_MEMBER_BITFIELD_SIZE(member->offset); +++ if (!nr_bits) { +++ if (BITS_PER_BYTE_MASKED(struct_bits_off)) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Member is not byte aligned"); +++ return -EINVAL; +++ } +++ +++ nr_bits = int_bitsize; +++ } else if (nr_bits > int_bitsize) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Invalid member bitfield_size"); +++ return -EINVAL; +++ } +++ +++ struct_size = struct_type->size; +++ bytes_end = BITS_ROUNDUP_BYTES(struct_bits_off + nr_bits); +++ if (struct_size < bytes_end) { +++ btf_verifier_log_member(env, struct_type, member, +++ "Member exceeds struct_size"); +++ return -EINVAL; +++ } +++ +++ return 0; +++} +++ +++static s32 btf_enum_check_meta(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ u32 meta_left) +++{ +++ const struct btf_enum *enums = btf_type_enum(t); +++ struct btf *btf = env->btf; +++ u16 i, nr_enums; +++ u32 meta_needed; +++ +++ nr_enums = btf_type_vlen(t); +++ meta_needed = nr_enums * sizeof(*enums); +++ +++ if (meta_left < meta_needed) { +++ btf_verifier_log_basic(env, t, +++ "meta_left:%u meta_needed:%u", +++ meta_left, meta_needed); +++ return -EINVAL; +++ } +++ +++ if (btf_type_kflag(t)) { +++ 
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); +++ return -EINVAL; +++ } +++ +++ if (t->size > 8 || !is_power_of_2(t->size)) { +++ btf_verifier_log_type(env, t, "Unexpected size"); +++ return -EINVAL; +++ } +++ +++ /* enum type either no name or a valid one */ +++ if (t->name_off && +++ !btf_name_valid_identifier(env->btf, t->name_off)) { +++ btf_verifier_log_type(env, t, "Invalid name"); +++ return -EINVAL; +++ } +++ +++ btf_verifier_log_type(env, t, NULL); +++ +++ for (i = 0; i < nr_enums; i++) { +++ if (!btf_name_offset_valid(btf, enums[i].name_off)) { +++ btf_verifier_log(env, "\tInvalid name_offset:%u", +++ enums[i].name_off); +++ return -EINVAL; +++ } +++ +++ /* enum member must have a valid name */ +++ if (!enums[i].name_off || +++ !btf_name_valid_identifier(btf, enums[i].name_off)) { +++ btf_verifier_log_type(env, t, "Invalid name"); +++ return -EINVAL; +++ } +++ +++ +++ btf_verifier_log(env, "\t%s val=%d\n", +++ __btf_name_by_offset(btf, enums[i].name_off), +++ enums[i].val); +++ } +++ +++ return meta_needed; +++} +++ +++static void btf_enum_log(struct btf_verifier_env *env, +++ const struct btf_type *t) +++{ +++ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +++} +++ +++static void btf_enum_seq_show(const struct btf *btf, const struct btf_type *t, +++ u32 type_id, void *data, u8 bits_offset, +++ struct seq_file *m) +++{ +++ const struct btf_enum *enums = btf_type_enum(t); +++ u32 i, nr_enums = btf_type_vlen(t); +++ int v = *(int *)data; +++ +++ for (i = 0; i < nr_enums; i++) { +++ if (v == enums[i].val) { +++ seq_printf(m, "%s", +++ __btf_name_by_offset(btf, +++ enums[i].name_off)); +++ return; +++ } +++ } +++ +++ seq_printf(m, "%d", v); +++} +++ +++static struct btf_kind_operations enum_ops = { +++ .check_meta = btf_enum_check_meta, +++ .resolve = btf_df_resolve, +++ .check_member = btf_enum_check_member, +++ .check_kflag_member = btf_enum_check_kflag_member, +++ .log_details = btf_enum_log, +++ .seq_show = btf_enum_seq_show, +++}; +++ +++static s32 btf_func_proto_check_meta(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ u32 meta_left) +++{ +++ u32 meta_needed = btf_type_vlen(t) * sizeof(struct btf_param); +++ +++ if (meta_left < meta_needed) { +++ btf_verifier_log_basic(env, t, +++ "meta_left:%u meta_needed:%u", +++ meta_left, meta_needed); +++ return -EINVAL; +++ } +++ +++ if (t->name_off) { +++ btf_verifier_log_type(env, t, "Invalid name"); +++ return -EINVAL; +++ } +++ +++ if (btf_type_kflag(t)) { +++ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); +++ return -EINVAL; +++ } +++ +++ btf_verifier_log_type(env, t, NULL); +++ +++ return meta_needed; +++} +++ +++static void btf_func_proto_log(struct btf_verifier_env *env, +++ const struct btf_type *t) +++{ +++ const struct btf_param *args = (const struct btf_param *)(t + 1); +++ u16 nr_args = btf_type_vlen(t), i; +++ +++ btf_verifier_log(env, "return=%u args=(", t->type); +++ if (!nr_args) { +++ btf_verifier_log(env, "void"); +++ goto done; +++ } +++ +++ if (nr_args == 1 && !args[0].type) { +++ /* Only one vararg */ +++ btf_verifier_log(env, "vararg"); +++ goto done; +++ } +++ +++ btf_verifier_log(env, "%u %s", args[0].type, +++ __btf_name_by_offset(env->btf, +++ args[0].name_off)); +++ for (i = 1; i < nr_args - 1; i++) +++ btf_verifier_log(env, ", %u %s", args[i].type, +++ __btf_name_by_offset(env->btf, +++ args[i].name_off)); +++ +++ if (nr_args > 1) { +++ const struct btf_param *last_arg = &args[nr_args - 1]; +++ +++ if (last_arg->type) +++ btf_verifier_log(env, 
", %u %s", last_arg->type, +++ __btf_name_by_offset(env->btf, +++ last_arg->name_off)); +++ else +++ btf_verifier_log(env, ", vararg"); +++ } +++ +++done: +++ btf_verifier_log(env, ")"); +++} +++ +++static struct btf_kind_operations func_proto_ops = { +++ .check_meta = btf_func_proto_check_meta, +++ .resolve = btf_df_resolve, +++ /* +++ * BTF_KIND_FUNC_PROTO cannot be directly referred by +++ * a struct's member. +++ * +++ * It should be a funciton pointer instead. +++ * (i.e. struct's member -> BTF_KIND_PTR -> BTF_KIND_FUNC_PROTO) +++ * +++ * Hence, there is no btf_func_check_member(). +++ */ +++ .check_member = btf_df_check_member, +++ .check_kflag_member = btf_df_check_kflag_member, +++ .log_details = btf_func_proto_log, +++ .seq_show = btf_df_seq_show, +++}; +++ +++static s32 btf_func_check_meta(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ u32 meta_left) +++{ +++ if (!t->name_off || +++ !btf_name_valid_identifier(env->btf, t->name_off)) { +++ btf_verifier_log_type(env, t, "Invalid name"); +++ return -EINVAL; +++ } +++ +++ if (btf_type_vlen(t)) { +++ btf_verifier_log_type(env, t, "vlen != 0"); +++ return -EINVAL; +++ } +++ +++ if (btf_type_kflag(t)) { +++ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); +++ return -EINVAL; +++ } +++ +++ btf_verifier_log_type(env, t, NULL); +++ +++ return 0; +++} +++ +++static struct btf_kind_operations func_ops = { +++ .check_meta = btf_func_check_meta, +++ .resolve = btf_df_resolve, +++ .check_member = btf_df_check_member, +++ .check_kflag_member = btf_df_check_kflag_member, +++ .log_details = btf_ref_type_log, +++ .seq_show = btf_df_seq_show, +++}; +++ +++static s32 btf_var_check_meta(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ u32 meta_left) +++{ +++ const struct btf_var *var; +++ u32 meta_needed = sizeof(*var); +++ +++ if (meta_left < meta_needed) { +++ btf_verifier_log_basic(env, t, +++ "meta_left:%u meta_needed:%u", +++ meta_left, meta_needed); +++ return -EINVAL; +++ } +++ +++ if (btf_type_vlen(t)) { +++ btf_verifier_log_type(env, t, "vlen != 0"); +++ return -EINVAL; +++ } +++ +++ if (btf_type_kflag(t)) { +++ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); +++ return -EINVAL; +++ } +++ +++ if (!t->name_off || +++ !__btf_name_valid(env->btf, t->name_off, true)) { +++ btf_verifier_log_type(env, t, "Invalid name"); +++ return -EINVAL; +++ } +++ +++ /* A var cannot be in type void */ +++ if (!t->type || !BTF_TYPE_ID_VALID(t->type)) { +++ btf_verifier_log_type(env, t, "Invalid type_id"); +++ return -EINVAL; +++ } +++ +++ var = btf_type_var(t); +++ if (var->linkage != BTF_VAR_STATIC && +++ var->linkage != BTF_VAR_GLOBAL_ALLOCATED) { +++ btf_verifier_log_type(env, t, "Linkage not supported"); +++ return -EINVAL; +++ } +++ +++ btf_verifier_log_type(env, t, NULL); +++ +++ return meta_needed; +++} +++ +++static void btf_var_log(struct btf_verifier_env *env, const struct btf_type *t) +++{ +++ const struct btf_var *var = btf_type_var(t); +++ +++ btf_verifier_log(env, "type_id=%u linkage=%u", t->type, var->linkage); +++} +++ +++static const struct btf_kind_operations var_ops = { +++ .check_meta = btf_var_check_meta, +++ .resolve = btf_var_resolve, +++ .check_member = btf_df_check_member, +++ .check_kflag_member = btf_df_check_kflag_member, +++ .log_details = btf_var_log, +++ .seq_show = btf_var_seq_show, +++}; +++ +++static s32 btf_datasec_check_meta(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ u32 meta_left) +++{ +++ const struct btf_var_secinfo *vsi; +++ u64 last_vsi_end_off 
= 0, sum = 0; +++ u32 i, meta_needed; +++ +++ meta_needed = btf_type_vlen(t) * sizeof(*vsi); +++ if (meta_left < meta_needed) { +++ btf_verifier_log_basic(env, t, +++ "meta_left:%u meta_needed:%u", +++ meta_left, meta_needed); +++ return -EINVAL; +++ } +++ +++ if (!btf_type_vlen(t)) { +++ btf_verifier_log_type(env, t, "vlen == 0"); +++ return -EINVAL; +++ } +++ +++ if (!t->size) { +++ btf_verifier_log_type(env, t, "size == 0"); +++ return -EINVAL; +++ } +++ +++ if (btf_type_kflag(t)) { +++ btf_verifier_log_type(env, t, "Invalid btf_info kind_flag"); +++ return -EINVAL; +++ } +++ +++ if (!t->name_off || +++ !btf_name_valid_section(env->btf, t->name_off)) { +++ btf_verifier_log_type(env, t, "Invalid name"); +++ return -EINVAL; +++ } +++ +++ btf_verifier_log_type(env, t, NULL); +++ +++ for_each_vsi(i, t, vsi) { +++ /* A var cannot be in type void */ +++ if (!vsi->type || !BTF_TYPE_ID_VALID(vsi->type)) { +++ btf_verifier_log_vsi(env, t, vsi, +++ "Invalid type_id"); +++ return -EINVAL; +++ } +++ +++ if (vsi->offset < last_vsi_end_off || vsi->offset >= t->size) { +++ btf_verifier_log_vsi(env, t, vsi, +++ "Invalid offset"); +++ return -EINVAL; +++ } +++ +++ if (!vsi->size || vsi->size > t->size) { +++ btf_verifier_log_vsi(env, t, vsi, +++ "Invalid size"); +++ return -EINVAL; +++ } +++ +++ last_vsi_end_off = vsi->offset + vsi->size; +++ if (last_vsi_end_off > t->size) { +++ btf_verifier_log_vsi(env, t, vsi, +++ "Invalid offset+size"); +++ return -EINVAL; +++ } +++ +++ btf_verifier_log_vsi(env, t, vsi, NULL); +++ sum += vsi->size; +++ } +++ +++ if (t->size < sum) { +++ btf_verifier_log_type(env, t, "Invalid btf_info size"); +++ return -EINVAL; +++ } +++ +++ return meta_needed; +++} +++ +++static int btf_datasec_resolve(struct btf_verifier_env *env, +++ const struct resolve_vertex *v) +++{ +++ const struct btf_var_secinfo *vsi; +++ struct btf *btf = env->btf; +++ u16 i; +++ +++ for_each_vsi_from(i, v->next_member, v->t, vsi) { +++ u32 var_type_id = vsi->type, type_id, type_size = 0; +++ const struct btf_type *var_type = btf_type_by_id(env->btf, +++ var_type_id); +++ if (!var_type || !btf_type_is_var(var_type)) { +++ btf_verifier_log_vsi(env, v->t, vsi, +++ "Not a VAR kind member"); +++ return -EINVAL; +++ } +++ +++ if (!env_type_is_resolve_sink(env, var_type) && +++ !env_type_is_resolved(env, var_type_id)) { +++ env_stack_set_next_member(env, i + 1); +++ return env_stack_push(env, var_type, var_type_id); +++ } +++ +++ type_id = var_type->type; +++ if (!btf_type_id_size(btf, &type_id, &type_size)) { +++ btf_verifier_log_vsi(env, v->t, vsi, "Invalid type"); +++ return -EINVAL; +++ } +++ +++ if (vsi->size < type_size) { +++ btf_verifier_log_vsi(env, v->t, vsi, "Invalid size"); +++ return -EINVAL; +++ } +++ } +++ +++ env_stack_pop_resolved(env, 0, 0); +++ return 0; +++} +++ +++static void btf_datasec_log(struct btf_verifier_env *env, +++ const struct btf_type *t) +++{ +++ btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); +++} +++ +++static void btf_datasec_seq_show(const struct btf *btf, +++ const struct btf_type *t, u32 type_id, +++ void *data, u8 bits_offset, +++ struct seq_file *m) +++{ +++ const struct btf_var_secinfo *vsi; +++ const struct btf_type *var; +++ u32 i; +++ +++ seq_printf(m, "section (\"%s\") = {", __btf_name_by_offset(btf, t->name_off)); +++ for_each_vsi(i, t, vsi) { +++ var = btf_type_by_id(btf, vsi->type); +++ if (i) +++ seq_puts(m, ","); +++ btf_type_ops(var)->seq_show(btf, var, vsi->type, +++ data + vsi->offset, bits_offset, m); +++ } +++ seq_puts(m, "}"); +++} 
+++ +++static const struct btf_kind_operations datasec_ops = { +++ .check_meta = btf_datasec_check_meta, +++ .resolve = btf_datasec_resolve, +++ .check_member = btf_df_check_member, +++ .check_kflag_member = btf_df_check_kflag_member, +++ .log_details = btf_datasec_log, +++ .seq_show = btf_datasec_seq_show, +++}; +++ +++static int btf_func_proto_check(struct btf_verifier_env *env, +++ const struct btf_type *t) +++{ +++ const struct btf_type *ret_type; +++ const struct btf_param *args; +++ const struct btf *btf; +++ u16 nr_args, i; +++ int err; +++ +++ btf = env->btf; +++ args = (const struct btf_param *)(t + 1); +++ nr_args = btf_type_vlen(t); +++ +++ /* Check func return type which could be "void" (t->type == 0) */ +++ if (t->type) { +++ u32 ret_type_id = t->type; +++ +++ ret_type = btf_type_by_id(btf, ret_type_id); +++ if (!ret_type) { +++ btf_verifier_log_type(env, t, "Invalid return type"); +++ return -EINVAL; +++ } +++ +++ if (btf_type_needs_resolve(ret_type) && +++ !env_type_is_resolved(env, ret_type_id)) { +++ err = btf_resolve(env, ret_type, ret_type_id); +++ if (err) +++ return err; +++ } +++ +++ /* Ensure the return type is a type that has a size */ +++ if (!btf_type_id_size(btf, &ret_type_id, NULL)) { +++ btf_verifier_log_type(env, t, "Invalid return type"); +++ return -EINVAL; +++ } +++ } +++ +++ if (!nr_args) +++ return 0; +++ +++ /* Last func arg type_id could be 0 if it is a vararg */ +++ if (!args[nr_args - 1].type) { +++ if (args[nr_args - 1].name_off) { +++ btf_verifier_log_type(env, t, "Invalid arg#%u", +++ nr_args); +++ return -EINVAL; +++ } +++ nr_args--; +++ } +++ +++ err = 0; +++ for (i = 0; i < nr_args; i++) { +++ const struct btf_type *arg_type; +++ u32 arg_type_id; +++ +++ arg_type_id = args[i].type; +++ arg_type = btf_type_by_id(btf, arg_type_id); +++ if (!arg_type) { +++ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); +++ err = -EINVAL; +++ break; +++ } +++ +++ if (args[i].name_off && +++ (!btf_name_offset_valid(btf, args[i].name_off) || +++ !btf_name_valid_identifier(btf, args[i].name_off))) { +++ btf_verifier_log_type(env, t, +++ "Invalid arg#%u", i + 1); +++ err = -EINVAL; +++ break; +++ } +++ +++ if (btf_type_needs_resolve(arg_type) && +++ !env_type_is_resolved(env, arg_type_id)) { +++ err = btf_resolve(env, arg_type, arg_type_id); +++ if (err) +++ break; +++ } +++ +++ if (!btf_type_id_size(btf, &arg_type_id, NULL)) { +++ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); +++ err = -EINVAL; +++ break; +++ } +++ } +++ +++ return err; +++} +++ +++static int btf_func_check(struct btf_verifier_env *env, +++ const struct btf_type *t) +++{ +++ const struct btf_type *proto_type; +++ const struct btf_param *args; +++ const struct btf *btf; +++ u16 nr_args, i; +++ +++ btf = env->btf; +++ proto_type = btf_type_by_id(btf, t->type); +++ +++ if (!proto_type || !btf_type_is_func_proto(proto_type)) { +++ btf_verifier_log_type(env, t, "Invalid type_id"); +++ return -EINVAL; +++ } +++ +++ args = (const struct btf_param *)(proto_type + 1); +++ nr_args = btf_type_vlen(proto_type); +++ for (i = 0; i < nr_args; i++) { +++ if (!args[i].name_off && args[i].type) { +++ btf_verifier_log_type(env, t, "Invalid arg#%u", i + 1); +++ return -EINVAL; +++ } +++ } +++ +++ return 0; +++} +++ +++static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = { +++ [BTF_KIND_INT] = &int_ops, +++ [BTF_KIND_PTR] = &ptr_ops, +++ [BTF_KIND_ARRAY] = &array_ops, +++ [BTF_KIND_STRUCT] = &struct_ops, +++ [BTF_KIND_UNION] = &struct_ops, +++ [BTF_KIND_ENUM] = &enum_ops, +++ 
[BTF_KIND_FWD] = &fwd_ops, +++ [BTF_KIND_TYPEDEF] = &modifier_ops, +++ [BTF_KIND_VOLATILE] = &modifier_ops, +++ [BTF_KIND_CONST] = &modifier_ops, +++ [BTF_KIND_RESTRICT] = &modifier_ops, +++ [BTF_KIND_FUNC] = &func_ops, +++ [BTF_KIND_FUNC_PROTO] = &func_proto_ops, +++ [BTF_KIND_VAR] = &var_ops, +++ [BTF_KIND_DATASEC] = &datasec_ops, +++}; +++ +++static s32 btf_check_meta(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ u32 meta_left) +++{ +++ u32 saved_meta_left = meta_left; +++ s32 var_meta_size; +++ +++ if (meta_left < sizeof(*t)) { +++ btf_verifier_log(env, "[%u] meta_left:%u meta_needed:%zu", +++ env->log_type_id, meta_left, sizeof(*t)); +++ return -EINVAL; +++ } +++ meta_left -= sizeof(*t); +++ +++ if (t->info & ~BTF_INFO_MASK) { +++ btf_verifier_log(env, "[%u] Invalid btf_info:%x", +++ env->log_type_id, t->info); +++ return -EINVAL; +++ } +++ +++ if (BTF_INFO_KIND(t->info) > BTF_KIND_MAX || +++ BTF_INFO_KIND(t->info) == BTF_KIND_UNKN) { +++ btf_verifier_log(env, "[%u] Invalid kind:%u", +++ env->log_type_id, BTF_INFO_KIND(t->info)); +++ return -EINVAL; +++ } +++ +++ if (!btf_name_offset_valid(env->btf, t->name_off)) { +++ btf_verifier_log(env, "[%u] Invalid name_offset:%u", +++ env->log_type_id, t->name_off); +++ return -EINVAL; +++ } +++ +++ var_meta_size = btf_type_ops(t)->check_meta(env, t, meta_left); +++ if (var_meta_size < 0) +++ return var_meta_size; +++ +++ meta_left -= var_meta_size; +++ +++ return saved_meta_left - meta_left; +++} +++ +++static int btf_check_all_metas(struct btf_verifier_env *env) +++{ +++ struct btf *btf = env->btf; +++ struct btf_header *hdr; +++ void *cur, *end; +++ +++ hdr = &btf->hdr; +++ cur = btf->nohdr_data + hdr->type_off; +++ end = cur + hdr->type_len; +++ +++ env->log_type_id = 1; +++ while (cur < end) { +++ struct btf_type *t = cur; +++ s32 meta_size; +++ +++ meta_size = btf_check_meta(env, t, end - cur); +++ if (meta_size < 0) +++ return meta_size; +++ +++ btf_add_type(env, t); +++ cur += meta_size; +++ env->log_type_id++; +++ } +++ +++ return 0; +++} +++ +++static bool btf_resolve_valid(struct btf_verifier_env *env, +++ const struct btf_type *t, +++ u32 type_id) +++{ +++ struct btf *btf = env->btf; +++ +++ if (!env_type_is_resolved(env, type_id)) +++ return false; +++ +++ if (btf_type_is_struct(t) || btf_type_is_datasec(t)) +++ return !btf->resolved_ids[type_id] && +++ !btf->resolved_sizes[type_id]; +++ +++ if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || +++ btf_type_is_var(t)) { +++ t = btf_type_id_resolve(btf, &type_id); +++ return t && +++ !btf_type_is_modifier(t) && +++ !btf_type_is_var(t) && +++ !btf_type_is_datasec(t); +++ } +++ +++ if (btf_type_is_array(t)) { +++ const struct btf_array *array = btf_type_array(t); +++ const struct btf_type *elem_type; +++ u32 elem_type_id = array->type; +++ u32 elem_size; +++ +++ elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); +++ return elem_type && !btf_type_is_modifier(elem_type) && +++ (array->nelems * elem_size == +++ btf->resolved_sizes[type_id]); +++ } +++ +++ return false; +++} +++ +++static int btf_resolve(struct btf_verifier_env *env, +++ const struct btf_type *t, u32 type_id) +++{ +++ u32 save_log_type_id = env->log_type_id; +++ const struct resolve_vertex *v; +++ int err = 0; +++ +++ env->resolve_mode = RESOLVE_TBD; +++ env_stack_push(env, t, type_id); +++ while (!err && (v = env_stack_peak(env))) { +++ env->log_type_id = v->type_id; +++ err = btf_type_ops(v->t)->resolve(env, v); +++ } +++ +++ env->log_type_id = type_id; +++ if (err == -E2BIG) { +++ 
btf_verifier_log_type(env, t, +++ "Exceeded max resolving depth:%u", +++ MAX_RESOLVE_DEPTH); +++ } else if (err == -EEXIST) { +++ btf_verifier_log_type(env, t, "Loop detected"); +++ } +++ +++ /* Final sanity check */ +++ if (!err && !btf_resolve_valid(env, t, type_id)) { +++ btf_verifier_log_type(env, t, "Invalid resolve state"); +++ err = -EINVAL; +++ } +++ +++ env->log_type_id = save_log_type_id; +++ return err; +++} +++ +++static int btf_check_all_types(struct btf_verifier_env *env) +++{ +++ struct btf *btf = env->btf; +++ u32 type_id; +++ int err; +++ +++ err = env_resolve_init(env); +++ if (err) +++ return err; +++ +++ env->phase++; +++ for (type_id = 1; type_id <= btf->nr_types; type_id++) { +++ const struct btf_type *t = btf_type_by_id(btf, type_id); +++ +++ env->log_type_id = type_id; +++ if (btf_type_needs_resolve(t) && +++ !env_type_is_resolved(env, type_id)) { +++ err = btf_resolve(env, t, type_id); +++ if (err) +++ return err; +++ } +++ +++ if (btf_type_is_func_proto(t)) { +++ err = btf_func_proto_check(env, t); +++ if (err) +++ return err; +++ } +++ +++ if (btf_type_is_func(t)) { +++ err = btf_func_check(env, t); +++ if (err) +++ return err; +++ } +++ } +++ +++ return 0; +++} +++ +++static int btf_parse_type_sec(struct btf_verifier_env *env) +++{ +++ const struct btf_header *hdr = &env->btf->hdr; +++ int err; +++ +++ /* Type section must align to 4 bytes */ +++ if (hdr->type_off & (sizeof(u32) - 1)) { +++ btf_verifier_log(env, "Unaligned type_off"); +++ return -EINVAL; +++ } +++ +++ if (!hdr->type_len) { +++ btf_verifier_log(env, "No type found"); +++ return -EINVAL; +++ } +++ +++ err = btf_check_all_metas(env); +++ if (err) +++ return err; +++ +++ return btf_check_all_types(env); +++} +++ +++static int btf_parse_str_sec(struct btf_verifier_env *env) +++{ +++ const struct btf_header *hdr; +++ struct btf *btf = env->btf; +++ const char *start, *end; +++ +++ hdr = &btf->hdr; +++ start = btf->nohdr_data + hdr->str_off; +++ end = start + hdr->str_len; +++ +++ if (end != btf->data + btf->data_size) { +++ btf_verifier_log(env, "String section is not at the end"); +++ return -EINVAL; +++ } +++ +++ if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || +++ start[0] || end[-1]) { +++ btf_verifier_log(env, "Invalid string section"); +++ return -EINVAL; +++ } +++ +++ btf->strings = start; +++ +++ return 0; +++} +++ +++static const size_t btf_sec_info_offset[] = { +++ offsetof(struct btf_header, type_off), +++ offsetof(struct btf_header, str_off), +++}; +++ +++static int btf_sec_info_cmp(const void *a, const void *b) +++{ +++ const struct btf_sec_info *x = a; +++ const struct btf_sec_info *y = b; +++ +++ return (int)(x->off - y->off) ? 
: (int)(x->len - y->len); +++} +++ +++static int btf_check_sec_info(struct btf_verifier_env *env, +++ u32 btf_data_size) +++{ +++ struct btf_sec_info secs[ARRAY_SIZE(btf_sec_info_offset)]; +++ u32 total, expected_total, i; +++ const struct btf_header *hdr; +++ const struct btf *btf; +++ +++ btf = env->btf; +++ hdr = &btf->hdr; +++ +++ /* Populate the secs from hdr */ +++ for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) +++ secs[i] = *(struct btf_sec_info *)((void *)hdr + +++ btf_sec_info_offset[i]); +++ +++ sort(secs, ARRAY_SIZE(btf_sec_info_offset), +++ sizeof(struct btf_sec_info), btf_sec_info_cmp, NULL); +++ +++ /* Check for gaps and overlap among sections */ +++ total = 0; +++ expected_total = btf_data_size - hdr->hdr_len; +++ for (i = 0; i < ARRAY_SIZE(btf_sec_info_offset); i++) { +++ if (expected_total < secs[i].off) { +++ btf_verifier_log(env, "Invalid section offset"); +++ return -EINVAL; +++ } +++ if (total < secs[i].off) { +++ /* gap */ +++ btf_verifier_log(env, "Unsupported section found"); +++ return -EINVAL; +++ } +++ if (total > secs[i].off) { +++ btf_verifier_log(env, "Section overlap found"); +++ return -EINVAL; +++ } +++ if (expected_total - total < secs[i].len) { +++ btf_verifier_log(env, +++ "Total section length too long"); +++ return -EINVAL; +++ } +++ total += secs[i].len; +++ } +++ +++ /* There is data other than hdr and known sections */ +++ if (expected_total != total) { +++ btf_verifier_log(env, "Unsupported section found"); +++ return -EINVAL; +++ } +++ +++ return 0; +++} +++ +++static int btf_parse_hdr(struct btf_verifier_env *env) +++{ +++ u32 hdr_len, hdr_copy, btf_data_size; +++ const struct btf_header *hdr; +++ struct btf *btf; +++ int err; +++ +++ btf = env->btf; +++ btf_data_size = btf->data_size; +++ +++ if (btf_data_size < +++ offsetof(struct btf_header, hdr_len) + sizeof(hdr->hdr_len)) { +++ btf_verifier_log(env, "hdr_len not found"); +++ return -EINVAL; +++ } +++ +++ hdr = btf->data; +++ hdr_len = hdr->hdr_len; +++ if (btf_data_size < hdr_len) { +++ btf_verifier_log(env, "btf_header not found"); +++ return -EINVAL; +++ } +++ +++ /* Ensure the unsupported header fields are zero */ +++ if (hdr_len > sizeof(btf->hdr)) { +++ u8 *expected_zero = btf->data + sizeof(btf->hdr); +++ u8 *end = btf->data + hdr_len; +++ +++ for (; expected_zero < end; expected_zero++) { +++ if (*expected_zero) { +++ btf_verifier_log(env, "Unsupported btf_header"); +++ return -E2BIG; +++ } +++ } +++ } +++ +++ hdr_copy = min_t(u32, hdr_len, sizeof(btf->hdr)); +++ memcpy(&btf->hdr, btf->data, hdr_copy); +++ +++ hdr = &btf->hdr; +++ +++ btf_verifier_log_hdr(env, btf_data_size); +++ +++ if (hdr->magic != BTF_MAGIC) { +++ btf_verifier_log(env, "Invalid magic"); +++ return -EINVAL; +++ } +++ +++ if (hdr->version != BTF_VERSION) { +++ btf_verifier_log(env, "Unsupported version"); +++ return -ENOTSUPP; +++ } +++ +++ if (hdr->flags) { +++ btf_verifier_log(env, "Unsupported flags"); +++ return -ENOTSUPP; +++ } +++ +++ if (btf_data_size == hdr->hdr_len) { +++ btf_verifier_log(env, "No data"); +++ return -EINVAL; +++ } +++ +++ err = btf_check_sec_info(env, btf_data_size); +++ if (err) +++ return err; +++ +++ return 0; +++} +++ +++static struct btf *btf_parse(void __user *btf_data, u32 btf_data_size, +++ u32 log_level, char __user *log_ubuf, u32 log_size) +++{ +++ struct btf_verifier_env *env = NULL; +++ struct bpf_verifier_log *log; +++ struct btf *btf = NULL; +++ u8 *data; +++ int err; +++ +++ if (btf_data_size > BTF_MAX_SIZE) +++ return ERR_PTR(-E2BIG); +++ +++ env = 
kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); +++ if (!env) +++ return ERR_PTR(-ENOMEM); +++ +++ log = &env->log; +++ if (log_level || log_ubuf || log_size) { +++ /* user requested verbose verifier output +++ * and supplied buffer to store the verification trace +++ */ +++ log->level = log_level; +++ log->ubuf = log_ubuf; +++ log->len_total = log_size; +++ +++ /* log attributes have to be sane */ +++ if (log->len_total < 128 || log->len_total > UINT_MAX >> 8 || +++ !log->level || !log->ubuf) { +++ err = -EINVAL; +++ goto errout; +++ } +++ } +++ +++ btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); +++ if (!btf) { +++ err = -ENOMEM; +++ goto errout; +++ } +++ env->btf = btf; +++ +++ data = kmalloc(btf_data_size, GFP_KERNEL | __GFP_NOWARN); +++ if (!data) { +++ err = -ENOMEM; +++ goto errout; +++ } +++ +++ btf->data = data; +++ btf->data_size = btf_data_size; +++ +++ if (copy_from_user(data, btf_data, btf_data_size)) { +++ err = -EFAULT; +++ goto errout; +++ } +++ +++ err = btf_parse_hdr(env); +++ if (err) +++ goto errout; +++ +++ btf->nohdr_data = btf->data + btf->hdr.hdr_len; +++ +++ err = btf_parse_str_sec(env); +++ if (err) +++ goto errout; +++ +++ err = btf_parse_type_sec(env); +++ if (err) +++ goto errout; +++ +++ if (log->level && bpf_verifier_log_full(log)) { +++ err = -ENOSPC; +++ goto errout; +++ } +++ +++ btf_verifier_env_free(env); +++ refcount_set(&btf->refcnt, 1); +++ return btf; +++ +++errout: +++ btf_verifier_env_free(env); +++ if (btf) +++ btf_free(btf); +++ return ERR_PTR(err); +++} +++ +++void btf_type_seq_show(const struct btf *btf, u32 type_id, void *obj, +++ struct seq_file *m) +++{ +++ const struct btf_type *t = btf_type_by_id(btf, type_id); +++ +++ btf_type_ops(t)->seq_show(btf, t, type_id, obj, 0, m); +++} +++ +++#ifdef CONFIG_PROC_FS +++static void bpf_btf_show_fdinfo(struct seq_file *m, struct file *filp) +++{ +++ const struct btf *btf = filp->private_data; +++ +++ seq_printf(m, "btf_id:\t%u\n", btf->id); +++} +++#endif +++ +++static int btf_release(struct inode *inode, struct file *filp) +++{ +++ btf_put(filp->private_data); +++ return 0; +++} +++ +++const struct file_operations btf_fops = { +++#ifdef CONFIG_PROC_FS +++ .show_fdinfo = bpf_btf_show_fdinfo, +++#endif +++ .release = btf_release, +++}; +++ +++static int __btf_new_fd(struct btf *btf) +++{ +++ return anon_inode_getfd("btf", &btf_fops, btf, O_RDONLY | O_CLOEXEC); +++} +++ +++int btf_new_fd(const union bpf_attr *attr) +++{ +++ struct btf *btf; +++ int ret; +++ +++ btf = btf_parse(u64_to_user_ptr(attr->btf), +++ attr->btf_size, attr->btf_log_level, +++ u64_to_user_ptr(attr->btf_log_buf), +++ attr->btf_log_size); +++ if (IS_ERR(btf)) +++ return PTR_ERR(btf); +++ +++ ret = btf_alloc_id(btf); +++ if (ret) { +++ btf_free(btf); +++ return ret; +++ } +++ +++ /* +++ * The BTF ID is published to the userspace. +++ * All BTF free must go through call_rcu() from +++ * now on (i.e. free by calling btf_put()). 
+++ */ +++ +++ ret = __btf_new_fd(btf); +++ if (ret < 0) +++ btf_put(btf); +++ +++ return ret; +++} +++ +++struct btf *btf_get_by_fd(int fd) +++{ +++ struct btf *btf; +++ struct fd f; +++ +++ f = fdget(fd); +++ +++ if (!f.file) +++ return ERR_PTR(-EBADF); +++ +++ if (f.file->f_op != &btf_fops) { +++ fdput(f); +++ return ERR_PTR(-EINVAL); +++ } +++ +++ btf = f.file->private_data; +++ refcount_inc(&btf->refcnt); +++ fdput(f); +++ +++ return btf; +++} +++ +++int btf_get_info_by_fd(const struct btf *btf, +++ const union bpf_attr *attr, +++ union bpf_attr __user *uattr) +++{ +++ struct bpf_btf_info __user *uinfo; +++ struct bpf_btf_info info; +++ u32 info_copy, btf_copy; +++ void __user *ubtf; +++ u32 uinfo_len; +++ +++ uinfo = u64_to_user_ptr(attr->info.info); +++ uinfo_len = attr->info.info_len; +++ +++ info_copy = min_t(u32, uinfo_len, sizeof(info)); +++ memset(&info, 0, sizeof(info)); +++ if (copy_from_user(&info, uinfo, info_copy)) +++ return -EFAULT; +++ +++ info.id = btf->id; +++ ubtf = u64_to_user_ptr(info.btf); +++ btf_copy = min_t(u32, btf->data_size, info.btf_size); +++ if (copy_to_user(ubtf, btf->data, btf_copy)) +++ return -EFAULT; +++ info.btf_size = btf->data_size; +++ +++ if (copy_to_user(uinfo, &info, info_copy) || +++ put_user(info_copy, &uattr->info.info_len)) +++ return -EFAULT; +++ +++ return 0; +++} +++ +++int btf_get_fd_by_id(u32 id) +++{ +++ struct btf *btf; +++ int fd; +++ +++ rcu_read_lock(); +++ btf = idr_find(&btf_idr, id); +++ if (!btf || !refcount_inc_not_zero(&btf->refcnt)) +++ btf = ERR_PTR(-ENOENT); +++ rcu_read_unlock(); +++ +++ if (IS_ERR(btf)) +++ return PTR_ERR(btf); +++ +++ fd = __btf_new_fd(btf); +++ if (fd < 0) +++ btf_put(btf); +++ +++ return fd; +++} +++ +++u32 btf_id(const struct btf *btf) +++{ +++ return btf->id; +++} ++--- /dev/null +++++ b/kernel/bpf/cgroup.c ++@@ -0,0 +1,1581 @@ +++// SPDX-License-Identifier: GPL-2.0-only +++/* +++ * Functions to manage eBPF programs attached to cgroups +++ * +++ * Copyright (c) 2016 Daniel Mack +++ */ +++ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++ +++#include "../cgroup/cgroup-internal.h" +++ +++DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +++EXPORT_SYMBOL(cgroup_bpf_enabled_key); +++ +++void cgroup_bpf_offline(struct cgroup *cgrp) +++{ +++ cgroup_get(cgrp); +++ percpu_ref_kill(&cgrp->bpf.refcnt); +++} +++ +++/** +++ * cgroup_bpf_release() - put references of all bpf programs and +++ * release all cgroup bpf data +++ * @work: work structure embedded into the cgroup to modify +++ */ +++static void cgroup_bpf_release(struct work_struct *work) +++{ +++ struct cgroup *p, *cgrp = container_of(work, struct cgroup, +++ bpf.release_work); +++ enum bpf_cgroup_storage_type stype; +++ struct bpf_prog_array *old_array; +++ unsigned int type; +++ +++ mutex_lock(&cgroup_mutex); +++ +++ for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { +++ struct list_head *progs = &cgrp->bpf.progs[type]; +++ struct bpf_prog_list *pl, *tmp; +++ +++ list_for_each_entry_safe(pl, tmp, progs, node) { +++ list_del(&pl->node); +++ bpf_prog_put(pl->prog); +++ for_each_cgroup_storage_type(stype) { +++ bpf_cgroup_storage_unlink(pl->storage[stype]); +++ bpf_cgroup_storage_free(pl->storage[stype]); +++ } +++ kfree(pl); +++ static_branch_dec(&cgroup_bpf_enabled_key); +++ } +++ old_array = rcu_dereference_protected( +++ cgrp->bpf.effective[type], +++ lockdep_is_held(&cgroup_mutex)); +++ bpf_prog_array_free(old_array); +++ } +++ +++ 
mutex_unlock(&cgroup_mutex); +++ +++ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) +++ cgroup_bpf_put(p); +++ +++ percpu_ref_exit(&cgrp->bpf.refcnt); +++ cgroup_put(cgrp); +++} +++ +++/** +++ * cgroup_bpf_release_fn() - callback used to schedule releasing +++ * of bpf cgroup data +++ * @ref: percpu ref counter structure +++ */ +++static void cgroup_bpf_release_fn(struct percpu_ref *ref) +++{ +++ struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); +++ +++ INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); +++ queue_work(system_wq, &cgrp->bpf.release_work); +++} +++ +++/* count number of elements in the list. +++ * it's slow but the list cannot be long +++ */ +++static u32 prog_list_length(struct list_head *head) +++{ +++ struct bpf_prog_list *pl; +++ u32 cnt = 0; +++ +++ list_for_each_entry(pl, head, node) { +++ if (!pl->prog) +++ continue; +++ cnt++; +++ } +++ return cnt; +++} +++ +++/* if parent has non-overridable prog attached, +++ * disallow attaching new programs to the descendent cgroup. +++ * if parent has overridable or multi-prog, allow attaching +++ */ +++static bool hierarchy_allows_attach(struct cgroup *cgrp, +++ enum bpf_attach_type type, +++ u32 new_flags) +++{ +++ struct cgroup *p; +++ +++ p = cgroup_parent(cgrp); +++ if (!p) +++ return true; +++ do { +++ u32 flags = p->bpf.flags[type]; +++ u32 cnt; +++ +++ if (flags & BPF_F_ALLOW_MULTI) +++ return true; +++ cnt = prog_list_length(&p->bpf.progs[type]); +++ WARN_ON_ONCE(cnt > 1); +++ if (cnt == 1) +++ return !!(flags & BPF_F_ALLOW_OVERRIDE); +++ p = cgroup_parent(p); +++ } while (p); +++ return true; +++} +++ +++/* compute a chain of effective programs for a given cgroup: +++ * start from the list of programs in this cgroup and add +++ * all parent programs. 
+++ * Note that parent's F_ALLOW_OVERRIDE-type program is yielding +++ * to programs in this cgroup +++ */ +++static int compute_effective_progs(struct cgroup *cgrp, +++ enum bpf_attach_type type, +++ struct bpf_prog_array **array) +++{ +++ enum bpf_cgroup_storage_type stype; +++ struct bpf_prog_array *progs; +++ struct bpf_prog_list *pl; +++ struct cgroup *p = cgrp; +++ int cnt = 0; +++ +++ /* count number of effective programs by walking parents */ +++ do { +++ if (cnt == 0 || (p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) +++ cnt += prog_list_length(&p->bpf.progs[type]); +++ p = cgroup_parent(p); +++ } while (p); +++ +++ progs = bpf_prog_array_alloc(cnt, GFP_KERNEL); +++ if (!progs) +++ return -ENOMEM; +++ +++ /* populate the array with effective progs */ +++ cnt = 0; +++ p = cgrp; +++ do { +++ if (cnt > 0 && !(p->bpf.flags[type] & BPF_F_ALLOW_MULTI)) +++ continue; +++ +++ list_for_each_entry(pl, &p->bpf.progs[type], node) { +++ if (!pl->prog) +++ continue; +++ +++ progs->items[cnt].prog = pl->prog; +++ for_each_cgroup_storage_type(stype) +++ progs->items[cnt].cgroup_storage[stype] = +++ pl->storage[stype]; +++ cnt++; +++ } +++ } while ((p = cgroup_parent(p))); +++ +++ *array = progs; +++ return 0; +++} +++ +++static void activate_effective_progs(struct cgroup *cgrp, +++ enum bpf_attach_type type, +++ struct bpf_prog_array *old_array) +++{ +++ rcu_swap_protected(cgrp->bpf.effective[type], old_array, +++ lockdep_is_held(&cgroup_mutex)); +++ /* free prog array after grace period, since __cgroup_bpf_run_*() +++ * might be still walking the array +++ */ +++ bpf_prog_array_free(old_array); +++} +++ +++/** +++ * cgroup_bpf_inherit() - inherit effective programs from parent +++ * @cgrp: the cgroup to modify +++ */ +++int cgroup_bpf_inherit(struct cgroup *cgrp) +++{ +++/* has to use marco instead of const int, since compiler thinks +++ * that array below is variable length +++ */ +++#define NR ARRAY_SIZE(cgrp->bpf.effective) +++ struct bpf_prog_array *arrays[NR] = {}; +++ struct cgroup *p; +++ int ret, i; +++ +++ ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0, +++ GFP_KERNEL); +++ if (ret) +++ return ret; +++ +++ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) +++ cgroup_bpf_get(p); +++ +++ for (i = 0; i < NR; i++) +++ INIT_LIST_HEAD(&cgrp->bpf.progs[i]); +++ +++ for (i = 0; i < NR; i++) +++ if (compute_effective_progs(cgrp, i, &arrays[i])) +++ goto cleanup; +++ +++ for (i = 0; i < NR; i++) +++ activate_effective_progs(cgrp, i, arrays[i]); +++ +++ return 0; +++cleanup: +++ for (i = 0; i < NR; i++) +++ bpf_prog_array_free(arrays[i]); +++ +++ for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) +++ cgroup_bpf_put(p); +++ +++ percpu_ref_exit(&cgrp->bpf.refcnt); +++ +++ return -ENOMEM; +++} +++ +++static int update_effective_progs(struct cgroup *cgrp, +++ enum bpf_attach_type type) +++{ +++ struct cgroup_subsys_state *css; +++ int err; +++ +++ /* allocate and recompute effective prog arrays */ +++ css_for_each_descendant_pre(css, &cgrp->self) { +++ struct cgroup *desc = container_of(css, struct cgroup, self); +++ +++ if (percpu_ref_is_zero(&desc->bpf.refcnt)) +++ continue; +++ +++ err = compute_effective_progs(desc, type, &desc->bpf.inactive); +++ if (err) +++ goto cleanup; +++ } +++ +++ /* all allocations were successful. 
Activate all prog arrays */ +++ css_for_each_descendant_pre(css, &cgrp->self) { +++ struct cgroup *desc = container_of(css, struct cgroup, self); +++ +++ if (percpu_ref_is_zero(&desc->bpf.refcnt)) { +++ if (unlikely(desc->bpf.inactive)) { +++ bpf_prog_array_free(desc->bpf.inactive); +++ desc->bpf.inactive = NULL; +++ } +++ continue; +++ } +++ +++ activate_effective_progs(desc, type, desc->bpf.inactive); +++ desc->bpf.inactive = NULL; +++ } +++ +++ return 0; +++ +++cleanup: +++ /* oom while computing effective. Free all computed effective arrays +++ * since they were not activated +++ */ +++ css_for_each_descendant_pre(css, &cgrp->self) { +++ struct cgroup *desc = container_of(css, struct cgroup, self); +++ +++ bpf_prog_array_free(desc->bpf.inactive); +++ desc->bpf.inactive = NULL; +++ } +++ +++ return err; +++} +++ +++#define BPF_CGROUP_MAX_PROGS 64 +++ +++/** +++ * __cgroup_bpf_attach() - Attach the program to a cgroup, and +++ * propagate the change to descendants +++ * @cgrp: The cgroup which descendants to traverse +++ * @prog: A program to attach +++ * @type: Type of attach operation +++ * @flags: Option flags +++ * +++ * Must be called with cgroup_mutex held. +++ */ +++int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog, +++ enum bpf_attach_type type, u32 flags) +++{ +++ struct list_head *progs = &cgrp->bpf.progs[type]; +++ struct bpf_prog *old_prog = NULL; +++ struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; +++ struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; +++ enum bpf_cgroup_storage_type stype; +++ struct bpf_prog_list *pl; +++ bool pl_was_allocated; +++ int err; +++ +++ if ((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) +++ /* invalid combination */ +++ return -EINVAL; +++ +++ if (!hierarchy_allows_attach(cgrp, type, flags)) +++ return -EPERM; +++ +++ if (!list_empty(progs) && cgrp->bpf.flags[type] != flags) +++ /* Disallow attaching non-overridable on top +++ * of existing overridable in this cgroup. 
+++ * Disallow attaching multi-prog if overridable or none +++ */ +++ return -EPERM; +++ +++ if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS) +++ return -E2BIG; +++ +++ for_each_cgroup_storage_type(stype) { +++ storage[stype] = bpf_cgroup_storage_alloc(prog, stype); +++ if (IS_ERR(storage[stype])) { +++ storage[stype] = NULL; +++ for_each_cgroup_storage_type(stype) +++ bpf_cgroup_storage_free(storage[stype]); +++ return -ENOMEM; +++ } +++ } +++ +++ if (flags & BPF_F_ALLOW_MULTI) { +++ list_for_each_entry(pl, progs, node) { +++ if (pl->prog == prog) { +++ /* disallow attaching the same prog twice */ +++ for_each_cgroup_storage_type(stype) +++ bpf_cgroup_storage_free(storage[stype]); +++ return -EINVAL; +++ } +++ } +++ +++ pl = kmalloc(sizeof(*pl), GFP_KERNEL); +++ if (!pl) { +++ for_each_cgroup_storage_type(stype) +++ bpf_cgroup_storage_free(storage[stype]); +++ return -ENOMEM; +++ } +++ +++ pl_was_allocated = true; +++ pl->prog = prog; +++ for_each_cgroup_storage_type(stype) +++ pl->storage[stype] = storage[stype]; +++ list_add_tail(&pl->node, progs); +++ } else { +++ if (list_empty(progs)) { +++ pl = kmalloc(sizeof(*pl), GFP_KERNEL); +++ if (!pl) { +++ for_each_cgroup_storage_type(stype) +++ bpf_cgroup_storage_free(storage[stype]); +++ return -ENOMEM; +++ } +++ pl_was_allocated = true; +++ list_add_tail(&pl->node, progs); +++ } else { +++ pl = list_first_entry(progs, typeof(*pl), node); +++ old_prog = pl->prog; +++ for_each_cgroup_storage_type(stype) { +++ old_storage[stype] = pl->storage[stype]; +++ bpf_cgroup_storage_unlink(old_storage[stype]); +++ } +++ pl_was_allocated = false; +++ } +++ pl->prog = prog; +++ for_each_cgroup_storage_type(stype) +++ pl->storage[stype] = storage[stype]; +++ } +++ +++ cgrp->bpf.flags[type] = flags; +++ +++ err = update_effective_progs(cgrp, type); +++ if (err) +++ goto cleanup; +++ +++ static_branch_inc(&cgroup_bpf_enabled_key); +++ for_each_cgroup_storage_type(stype) { +++ if (!old_storage[stype]) +++ continue; +++ bpf_cgroup_storage_free(old_storage[stype]); +++ } +++ if (old_prog) { +++ bpf_prog_put(old_prog); +++ static_branch_dec(&cgroup_bpf_enabled_key); +++ } +++ for_each_cgroup_storage_type(stype) +++ bpf_cgroup_storage_link(storage[stype], cgrp, type); +++ return 0; +++ +++cleanup: +++ /* and cleanup the prog list */ +++ pl->prog = old_prog; +++ for_each_cgroup_storage_type(stype) { +++ bpf_cgroup_storage_free(pl->storage[stype]); +++ pl->storage[stype] = old_storage[stype]; +++ bpf_cgroup_storage_link(old_storage[stype], cgrp, type); +++ } +++ if (pl_was_allocated) { +++ list_del(&pl->node); +++ kfree(pl); +++ } +++ return err; +++} +++ +++/** +++ * __cgroup_bpf_detach() - Detach the program from a cgroup, and +++ * propagate the change to descendants +++ * @cgrp: The cgroup which descendants to traverse +++ * @prog: A program to detach or NULL +++ * @type: Type of detach operation +++ * +++ * Must be called with cgroup_mutex held. 
+++ */ +++int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, +++ enum bpf_attach_type type) +++{ +++ struct list_head *progs = &cgrp->bpf.progs[type]; +++ enum bpf_cgroup_storage_type stype; +++ u32 flags = cgrp->bpf.flags[type]; +++ struct bpf_prog *old_prog = NULL; +++ struct bpf_prog_list *pl; +++ int err; +++ +++ if (flags & BPF_F_ALLOW_MULTI) { +++ if (!prog) +++ /* to detach MULTI prog the user has to specify valid FD +++ * of the program to be detached +++ */ +++ return -EINVAL; +++ } else { +++ if (list_empty(progs)) +++ /* report error when trying to detach and nothing is attached */ +++ return -ENOENT; +++ } +++ +++ if (flags & BPF_F_ALLOW_MULTI) { +++ /* find the prog and detach it */ +++ list_for_each_entry(pl, progs, node) { +++ if (pl->prog != prog) +++ continue; +++ old_prog = prog; +++ /* mark it deleted, so it's ignored while +++ * recomputing effective +++ */ +++ pl->prog = NULL; +++ break; +++ } +++ if (!old_prog) +++ return -ENOENT; +++ } else { +++ /* to maintain backward compatibility NONE and OVERRIDE cgroups +++ * allow detaching with invalid FD (prog==NULL) +++ */ +++ pl = list_first_entry(progs, typeof(*pl), node); +++ old_prog = pl->prog; +++ pl->prog = NULL; +++ } +++ +++ err = update_effective_progs(cgrp, type); +++ if (err) +++ goto cleanup; +++ +++ /* now can actually delete it from this cgroup list */ +++ list_del(&pl->node); +++ for_each_cgroup_storage_type(stype) { +++ bpf_cgroup_storage_unlink(pl->storage[stype]); +++ bpf_cgroup_storage_free(pl->storage[stype]); +++ } +++ kfree(pl); +++ if (list_empty(progs)) +++ /* last program was detached, reset flags to zero */ +++ cgrp->bpf.flags[type] = 0; +++ +++ bpf_prog_put(old_prog); +++ static_branch_dec(&cgroup_bpf_enabled_key); +++ return 0; +++ +++cleanup: +++ /* and restore back old_prog */ +++ pl->prog = old_prog; +++ return err; +++} +++ +++/* Must be called with cgroup_mutex held to avoid races. 
*/ +++int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr, +++ union bpf_attr __user *uattr) +++{ +++ __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); +++ enum bpf_attach_type type = attr->query.attach_type; +++ struct list_head *progs = &cgrp->bpf.progs[type]; +++ u32 flags = cgrp->bpf.flags[type]; +++ struct bpf_prog_array *effective; +++ int cnt, ret = 0, i; +++ +++ effective = rcu_dereference_protected(cgrp->bpf.effective[type], +++ lockdep_is_held(&cgroup_mutex)); +++ +++ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) +++ cnt = bpf_prog_array_length(effective); +++ else +++ cnt = prog_list_length(progs); +++ +++ if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) +++ return -EFAULT; +++ if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt))) +++ return -EFAULT; +++ if (attr->query.prog_cnt == 0 || !prog_ids || !cnt) +++ /* return early if user requested only program count + flags */ +++ return 0; +++ if (attr->query.prog_cnt < cnt) { +++ cnt = attr->query.prog_cnt; +++ ret = -ENOSPC; +++ } +++ +++ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) { +++ return bpf_prog_array_copy_to_user(effective, prog_ids, cnt); +++ } else { +++ struct bpf_prog_list *pl; +++ u32 id; +++ +++ i = 0; +++ list_for_each_entry(pl, progs, node) { +++ id = pl->prog->aux->id; +++ if (copy_to_user(prog_ids + i, &id, sizeof(id))) +++ return -EFAULT; +++ if (++i == cnt) +++ break; +++ } +++ } +++ return ret; +++} +++ +++int cgroup_bpf_prog_attach(const union bpf_attr *attr, +++ enum bpf_prog_type ptype, struct bpf_prog *prog) +++{ +++ struct cgroup *cgrp; +++ int ret; +++ +++ cgrp = cgroup_get_from_fd(attr->target_fd); +++ if (IS_ERR(cgrp)) +++ return PTR_ERR(cgrp); +++ +++ ret = cgroup_bpf_attach(cgrp, prog, attr->attach_type, +++ attr->attach_flags); +++ cgroup_put(cgrp); +++ return ret; +++} +++ +++int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype) +++{ +++ struct bpf_prog *prog; +++ struct cgroup *cgrp; +++ int ret; +++ +++ cgrp = cgroup_get_from_fd(attr->target_fd); +++ if (IS_ERR(cgrp)) +++ return PTR_ERR(cgrp); +++ +++ prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); +++ if (IS_ERR(prog)) +++ prog = NULL; +++ +++ ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, 0); +++ if (prog) +++ bpf_prog_put(prog); +++ +++ cgroup_put(cgrp); +++ return ret; +++} +++ +++int cgroup_bpf_prog_query(const union bpf_attr *attr, +++ union bpf_attr __user *uattr) +++{ +++ struct cgroup *cgrp; +++ int ret; +++ +++ cgrp = cgroup_get_from_fd(attr->query.target_fd); +++ if (IS_ERR(cgrp)) +++ return PTR_ERR(cgrp); +++ +++ ret = cgroup_bpf_query(cgrp, attr, uattr); +++ +++ cgroup_put(cgrp); +++ return ret; +++} +++ +++/** +++ * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering +++ * @sk: The socket sending or receiving traffic +++ * @skb: The skb that is being sent or received +++ * @type: The type of program to be exectuted +++ * +++ * If no socket is passed, or the socket is not of type INET or INET6, +++ * this function does nothing and returns 0. +++ * +++ * The program type passed in via @type must be suitable for network +++ * filtering. No further check is performed to assert that. 
+++ * +++ * For egress packets, this function can return: +++ * NET_XMIT_SUCCESS (0) - continue with packet output +++ * NET_XMIT_DROP (1) - drop packet and notify TCP to call cwr +++ * NET_XMIT_CN (2) - continue with packet output and notify TCP +++ * to call cwr +++ * -EPERM - drop packet +++ * +++ * For ingress packets, this function will return -EPERM if any +++ * attached program was found and if it returned != 1 during execution. +++ * Otherwise 0 is returned. +++ */ +++int __cgroup_bpf_run_filter_skb(struct sock *sk, +++ struct sk_buff *skb, +++ enum bpf_attach_type type) +++{ +++ unsigned int offset = skb->data - skb_network_header(skb); +++ struct sock *save_sk; +++ void *saved_data_end; +++ struct cgroup *cgrp; +++ int ret; +++ +++ if (!sk || !sk_fullsock(sk)) +++ return 0; +++ +++ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) +++ return 0; +++ +++ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); +++ save_sk = skb->sk; +++ skb->sk = sk; +++ __skb_push(skb, offset); +++ +++ /* compute pointers for the bpf prog */ +++ bpf_compute_and_save_data_end(skb, &saved_data_end); +++ +++ if (type == BPF_CGROUP_INET_EGRESS) { +++ ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY( +++ cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb); +++ } else { +++ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb, +++ __bpf_prog_run_save_cb); +++ ret = (ret == 1 ? 0 : -EPERM); +++ } +++ bpf_restore_data_end(skb, saved_data_end); +++ __skb_pull(skb, offset); +++ skb->sk = save_sk; +++ +++ return ret; +++} +++EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb); +++ +++/** +++ * __cgroup_bpf_run_filter_sk() - Run a program on a sock +++ * @sk: sock structure to manipulate +++ * @type: The type of program to be exectuted +++ * +++ * socket is passed is expected to be of type INET or INET6. +++ * +++ * The program type passed in via @type must be suitable for sock +++ * filtering. No further check is performed to assert that. +++ * +++ * This function will return %-EPERM if any if an attached program was found +++ * and if it returned != 1 during execution. In all other cases, 0 is returned. +++ */ +++int __cgroup_bpf_run_filter_sk(struct sock *sk, +++ enum bpf_attach_type type) +++{ +++ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); +++ int ret; +++ +++ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sk, BPF_PROG_RUN); +++ return ret == 1 ? 0 : -EPERM; +++} +++EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); +++ +++/** +++ * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and +++ * provided by user sockaddr +++ * @sk: sock struct that will use sockaddr +++ * @uaddr: sockaddr struct provided by user +++ * @type: The type of program to be exectuted +++ * @t_ctx: Pointer to attach type specific context +++ * +++ * socket is expected to be of type INET or INET6. +++ * +++ * This function will return %-EPERM if an attached program is found and +++ * returned value != 1 during execution. In all other cases, 0 is returned. +++ */ +++int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, +++ struct sockaddr *uaddr, +++ enum bpf_attach_type type, +++ void *t_ctx) +++{ +++ struct bpf_sock_addr_kern ctx = { +++ .sk = sk, +++ .uaddr = uaddr, +++ .t_ctx = t_ctx, +++ }; +++ struct sockaddr_storage unspec; +++ struct cgroup *cgrp; +++ int ret; +++ +++ /* Check socket family since not all sockets represent network +++ * endpoint (e.g. AF_UNIX). 
+++ */ +++ if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) +++ return 0; +++ +++ if (!ctx.uaddr) { +++ memset(&unspec, 0, sizeof(unspec)); +++ ctx.uaddr = (struct sockaddr *)&unspec; +++ } +++ +++ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); +++ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); +++ +++ return ret == 1 ? 0 : -EPERM; +++} +++EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); +++ +++/** +++ * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock +++ * @sk: socket to get cgroup from +++ * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains +++ * sk with connection information (IP addresses, etc.) May not contain +++ * cgroup info if it is a req sock. +++ * @type: The type of program to be exectuted +++ * +++ * socket passed is expected to be of type INET or INET6. +++ * +++ * The program type passed in via @type must be suitable for sock_ops +++ * filtering. No further check is performed to assert that. +++ * +++ * This function will return %-EPERM if any if an attached program was found +++ * and if it returned != 1 during execution. In all other cases, 0 is returned. +++ */ +++int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, +++ struct bpf_sock_ops_kern *sock_ops, +++ enum bpf_attach_type type) +++{ +++ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); +++ int ret; +++ +++ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], sock_ops, +++ BPF_PROG_RUN); +++ return ret == 1 ? 0 : -EPERM; +++} +++EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops); +++ +++int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, +++ short access, enum bpf_attach_type type) +++{ +++ struct cgroup *cgrp; +++ struct bpf_cgroup_dev_ctx ctx = { +++ .access_type = (access << 16) | dev_type, +++ .major = major, +++ .minor = minor, +++ }; +++ int allow = 1; +++ +++ rcu_read_lock(); +++ cgrp = task_dfl_cgroup(current); +++ allow = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, +++ BPF_PROG_RUN); +++ rcu_read_unlock(); +++ +++ return !allow; +++} +++EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); +++ +++static const struct bpf_func_proto * +++cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ switch (func_id) { +++ case BPF_FUNC_map_lookup_elem: +++ return &bpf_map_lookup_elem_proto; +++ case BPF_FUNC_map_update_elem: +++ return &bpf_map_update_elem_proto; +++ case BPF_FUNC_map_delete_elem: +++ return &bpf_map_delete_elem_proto; +++ case BPF_FUNC_map_push_elem: +++ return &bpf_map_push_elem_proto; +++ case BPF_FUNC_map_pop_elem: +++ return &bpf_map_pop_elem_proto; +++ case BPF_FUNC_map_peek_elem: +++ return &bpf_map_peek_elem_proto; +++ case BPF_FUNC_get_current_uid_gid: +++ return &bpf_get_current_uid_gid_proto; +++ case BPF_FUNC_get_local_storage: +++ return &bpf_get_local_storage_proto; +++ case BPF_FUNC_get_current_cgroup_id: +++ return &bpf_get_current_cgroup_id_proto; +++ case BPF_FUNC_trace_printk: +++ if (capable(CAP_SYS_ADMIN)) +++ return bpf_get_trace_printk_proto(); +++ /* fall through */ +++ default: +++ return NULL; +++ } +++} +++ +++static const struct bpf_func_proto * +++cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ return cgroup_base_func_proto(func_id, prog); +++} +++ +++static bool cgroup_dev_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ +++ const int size_default = sizeof(__u32); +++ +++ if (type == BPF_WRITE) +++ return false; +++ 
+++ if (off < 0 || off + size > sizeof(struct bpf_cgroup_dev_ctx)) +++ return false; +++ /* The verifier guarantees that size > 0. */ +++ if (off % size != 0) +++ return false; +++ +++ switch (off) { +++ case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): +++ bpf_ctx_record_field_size(info, size_default); +++ if (!bpf_ctx_narrow_access_ok(off, size, size_default)) +++ return false; +++ break; +++ default: +++ if (size != size_default) +++ return false; +++ } +++ +++ return true; +++} +++ +++const struct bpf_prog_ops cg_dev_prog_ops = { +++}; +++ +++const struct bpf_verifier_ops cg_dev_verifier_ops = { +++ .get_func_proto = cgroup_dev_func_proto, +++ .is_valid_access = cgroup_dev_is_valid_access, +++}; +++ +++/** +++ * __cgroup_bpf_run_filter_sysctl - Run a program on sysctl +++ * +++ * @head: sysctl table header +++ * @table: sysctl table +++ * @write: sysctl is being read (= 0) or written (= 1) +++ * @buf: pointer to buffer passed by user space +++ * @pcount: value-result argument: value is size of buffer pointed to by @buf, +++ * result is size of @new_buf if program set new value, initial value +++ * otherwise +++ * @ppos: value-result argument: value is position at which read from or write +++ * to sysctl is happening, result is new position if program overrode it, +++ * initial value otherwise +++ * @new_buf: pointer to pointer to new buffer that will be allocated if program +++ * overrides new value provided by user space on sysctl write +++ * NOTE: it's caller responsibility to free *new_buf if it was set +++ * @type: type of program to be executed +++ * +++ * Program is run when sysctl is being accessed, either read or written, and +++ * can allow or deny such access. +++ * +++ * This function will return %-EPERM if an attached program is found and +++ * returned value != 1 during execution. In all other cases 0 is returned. +++ */ +++int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, +++ struct ctl_table *table, int write, +++ void __user *buf, size_t *pcount, +++ loff_t *ppos, void **new_buf, +++ enum bpf_attach_type type) +++{ +++ struct bpf_sysctl_kern ctx = { +++ .head = head, +++ .table = table, +++ .write = write, +++ .ppos = ppos, +++ .cur_val = NULL, +++ .cur_len = PAGE_SIZE, +++ .new_val = NULL, +++ .new_len = 0, +++ .new_updated = 0, +++ }; +++ struct cgroup *cgrp; +++ int ret; +++ +++ ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); +++ if (ctx.cur_val) { +++ mm_segment_t old_fs; +++ loff_t pos = 0; +++ +++ old_fs = get_fs(); +++ set_fs(KERNEL_DS); +++ if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, +++ &ctx.cur_len, &pos)) { +++ /* Let BPF program decide how to proceed. */ +++ ctx.cur_len = 0; +++ } +++ set_fs(old_fs); +++ } else { +++ /* Let BPF program decide how to proceed. */ +++ ctx.cur_len = 0; +++ } +++ +++ if (write && buf && *pcount) { +++ /* BPF program should be able to override new value with a +++ * buffer bigger than provided by user. +++ */ +++ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); +++ ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); +++ if (!ctx.new_val || +++ copy_from_user(ctx.new_val, buf, ctx.new_len)) +++ /* Let BPF program decide how to proceed. 
*/ +++ ctx.new_len = 0; +++ } +++ +++ rcu_read_lock(); +++ cgrp = task_dfl_cgroup(current); +++ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); +++ rcu_read_unlock(); +++ +++ kfree(ctx.cur_val); +++ +++ if (ret == 1 && ctx.new_updated) { +++ *new_buf = ctx.new_val; +++ *pcount = ctx.new_len; +++ } else { +++ kfree(ctx.new_val); +++ } +++ +++ return ret == 1 ? 0 : -EPERM; +++} +++EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl); +++ +++#ifdef CONFIG_NET +++static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, +++ enum bpf_attach_type attach_type) +++{ +++ struct bpf_prog_array *prog_array; +++ bool empty; +++ +++ rcu_read_lock(); +++ prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]); +++ empty = bpf_prog_array_is_empty(prog_array); +++ rcu_read_unlock(); +++ +++ return empty; +++} +++ +++static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +++{ +++ if (unlikely(max_optlen < 0)) +++ return -EINVAL; +++ +++ if (unlikely(max_optlen > PAGE_SIZE)) { +++ /* We don't expose optvals that are greater than PAGE_SIZE +++ * to the BPF program. +++ */ +++ max_optlen = PAGE_SIZE; +++ } +++ +++ ctx->optval = kzalloc(max_optlen, GFP_USER); +++ if (!ctx->optval) +++ return -ENOMEM; +++ +++ ctx->optval_end = ctx->optval + max_optlen; +++ +++ return max_optlen; +++} +++ +++static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +++{ +++ kfree(ctx->optval); +++} +++ +++int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, +++ int *optname, char __user *optval, +++ int *optlen, char **kernel_optval) +++{ +++ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); +++ struct bpf_sockopt_kern ctx = { +++ .sk = sk, +++ .level = *level, +++ .optname = *optname, +++ }; +++ int ret, max_optlen; +++ +++ /* Opportunistic check to see whether we have any BPF program +++ * attached to the hook so we don't waste time allocating +++ * memory and locking the socket. +++ */ +++ if (!cgroup_bpf_enabled || +++ __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) +++ return 0; +++ +++ /* Allocate a bit more than the initial user buffer for +++ * BPF program. The canonical use case is overriding +++ * TCP_CONGESTION(nv) to TCP_CONGESTION(cubic). +++ */ +++ max_optlen = max_t(int, 16, *optlen); +++ +++ max_optlen = sockopt_alloc_buf(&ctx, max_optlen); +++ if (max_optlen < 0) +++ return max_optlen; +++ +++ ctx.optlen = *optlen; +++ +++ if (copy_from_user(ctx.optval, optval, min(*optlen, max_optlen)) != 0) { +++ ret = -EFAULT; +++ goto out; +++ } +++ +++ lock_sock(sk); +++ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT], +++ &ctx, BPF_PROG_RUN); +++ release_sock(sk); +++ +++ if (!ret) { +++ ret = -EPERM; +++ goto out; +++ } +++ +++ if (ctx.optlen == -1) { +++ /* optlen set to -1, bypass kernel */ +++ ret = 1; +++ } else if (ctx.optlen > max_optlen || ctx.optlen < -1) { +++ /* optlen is out of bounds */ +++ ret = -EFAULT; +++ } else { +++ /* optlen within bounds, run kernel handler */ +++ ret = 0; +++ +++ /* export any potential modifications */ +++ *level = ctx.level; +++ *optname = ctx.optname; +++ +++ /* optlen == 0 from BPF indicates that we should +++ * use original userspace data. 
+++ */ +++ if (ctx.optlen != 0) { +++ *optlen = ctx.optlen; +++ *kernel_optval = ctx.optval; +++ /* export and don't free sockopt buf */ +++ return 0; +++ } +++ } +++ +++out: +++ sockopt_free_buf(&ctx); +++ return ret; +++} +++EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt); +++ +++int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, +++ int optname, char __user *optval, +++ int __user *optlen, int max_optlen, +++ int retval) +++{ +++ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); +++ struct bpf_sockopt_kern ctx = { +++ .sk = sk, +++ .level = level, +++ .optname = optname, +++ .retval = retval, +++ }; +++ int ret; +++ +++ /* Opportunistic check to see whether we have any BPF program +++ * attached to the hook so we don't waste time allocating +++ * memory and locking the socket. +++ */ +++ if (!cgroup_bpf_enabled || +++ __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) +++ return retval; +++ +++ ctx.optlen = max_optlen; +++ +++ max_optlen = sockopt_alloc_buf(&ctx, max_optlen); +++ if (max_optlen < 0) +++ return max_optlen; +++ +++ if (!retval) { +++ /* If kernel getsockopt finished successfully, +++ * copy whatever was returned to the user back +++ * into our temporary buffer. Set optlen to the +++ * one that kernel returned as well to let +++ * BPF programs inspect the value. +++ */ +++ +++ if (get_user(ctx.optlen, optlen)) { +++ ret = -EFAULT; +++ goto out; +++ } +++ +++ if (ctx.optlen < 0) { +++ ret = -EFAULT; +++ goto out; +++ } +++ +++ if (copy_from_user(ctx.optval, optval, +++ min(ctx.optlen, max_optlen)) != 0) { +++ ret = -EFAULT; +++ goto out; +++ } +++ } +++ +++ lock_sock(sk); +++ ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], +++ &ctx, BPF_PROG_RUN); +++ release_sock(sk); +++ +++ if (!ret) { +++ ret = -EPERM; +++ goto out; +++ } +++ +++ if (ctx.optlen > max_optlen || ctx.optlen < 0) { +++ ret = -EFAULT; +++ goto out; +++ } +++ +++ /* BPF programs only allowed to set retval to 0, not some +++ * arbitrary value. +++ */ +++ if (ctx.retval != 0 && ctx.retval != retval) { +++ ret = -EFAULT; +++ goto out; +++ } +++ +++ if (ctx.optlen != 0) { +++ if (copy_to_user(optval, ctx.optval, ctx.optlen) || +++ put_user(ctx.optlen, optlen)) { +++ ret = -EFAULT; +++ goto out; +++ } +++ } +++ +++ ret = ctx.retval; +++ +++out: +++ sockopt_free_buf(&ctx); +++ return ret; +++} +++EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt); +++#endif +++ +++static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, +++ size_t *lenp) +++{ +++ ssize_t tmp_ret = 0, ret; +++ +++ if (dir->header.parent) { +++ tmp_ret = sysctl_cpy_dir(dir->header.parent, bufp, lenp); +++ if (tmp_ret < 0) +++ return tmp_ret; +++ } +++ +++ ret = strscpy(*bufp, dir->header.ctl_table[0].procname, *lenp); +++ if (ret < 0) +++ return ret; +++ *bufp += ret; +++ *lenp -= ret; +++ ret += tmp_ret; +++ +++ /* Avoid leading slash. 
*/ +++ if (!ret) +++ return ret; +++ +++ tmp_ret = strscpy(*bufp, "/", *lenp); +++ if (tmp_ret < 0) +++ return tmp_ret; +++ *bufp += tmp_ret; +++ *lenp -= tmp_ret; +++ +++ return ret + tmp_ret; +++} +++ +++BPF_CALL_4(bpf_sysctl_get_name, struct bpf_sysctl_kern *, ctx, char *, buf, +++ size_t, buf_len, u64, flags) +++{ +++ ssize_t tmp_ret = 0, ret; +++ +++ if (!buf) +++ return -EINVAL; +++ +++ if (!(flags & BPF_F_SYSCTL_BASE_NAME)) { +++ if (!ctx->head) +++ return -EINVAL; +++ tmp_ret = sysctl_cpy_dir(ctx->head->parent, &buf, &buf_len); +++ if (tmp_ret < 0) +++ return tmp_ret; +++ } +++ +++ ret = strscpy(buf, ctx->table->procname, buf_len); +++ +++ return ret < 0 ? ret : tmp_ret + ret; +++} +++ +++static const struct bpf_func_proto bpf_sysctl_get_name_proto = { +++ .func = bpf_sysctl_get_name, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_PTR_TO_MEM, +++ .arg3_type = ARG_CONST_SIZE, +++ .arg4_type = ARG_ANYTHING, +++}; +++ +++static int copy_sysctl_value(char *dst, size_t dst_len, char *src, +++ size_t src_len) +++{ +++ if (!dst) +++ return -EINVAL; +++ +++ if (!dst_len) +++ return -E2BIG; +++ +++ if (!src || !src_len) { +++ memset(dst, 0, dst_len); +++ return -EINVAL; +++ } +++ +++ memcpy(dst, src, min(dst_len, src_len)); +++ +++ if (dst_len > src_len) { +++ memset(dst + src_len, '\0', dst_len - src_len); +++ return src_len; +++ } +++ +++ dst[dst_len - 1] = '\0'; +++ +++ return -E2BIG; +++} +++ +++BPF_CALL_3(bpf_sysctl_get_current_value, struct bpf_sysctl_kern *, ctx, +++ char *, buf, size_t, buf_len) +++{ +++ return copy_sysctl_value(buf, buf_len, ctx->cur_val, ctx->cur_len); +++} +++ +++static const struct bpf_func_proto bpf_sysctl_get_current_value_proto = { +++ .func = bpf_sysctl_get_current_value, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_PTR_TO_UNINIT_MEM, +++ .arg3_type = ARG_CONST_SIZE, +++}; +++ +++BPF_CALL_3(bpf_sysctl_get_new_value, struct bpf_sysctl_kern *, ctx, char *, buf, +++ size_t, buf_len) +++{ +++ if (!ctx->write) { +++ if (buf && buf_len) +++ memset(buf, '\0', buf_len); +++ return -EINVAL; +++ } +++ return copy_sysctl_value(buf, buf_len, ctx->new_val, ctx->new_len); +++} +++ +++static const struct bpf_func_proto bpf_sysctl_get_new_value_proto = { +++ .func = bpf_sysctl_get_new_value, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_PTR_TO_UNINIT_MEM, +++ .arg3_type = ARG_CONST_SIZE, +++}; +++ +++BPF_CALL_3(bpf_sysctl_set_new_value, struct bpf_sysctl_kern *, ctx, +++ const char *, buf, size_t, buf_len) +++{ +++ if (!ctx->write || !ctx->new_val || !ctx->new_len || !buf || !buf_len) +++ return -EINVAL; +++ +++ if (buf_len > PAGE_SIZE - 1) +++ return -E2BIG; +++ +++ memcpy(ctx->new_val, buf, buf_len); +++ ctx->new_len = buf_len; +++ ctx->new_updated = 1; +++ +++ return 0; +++} +++ +++static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = { +++ .func = bpf_sysctl_set_new_value, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_PTR_TO_MEM, +++ .arg3_type = ARG_CONST_SIZE, +++}; +++ +++static const struct bpf_func_proto * +++sysctl_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ switch (func_id) { +++ case BPF_FUNC_strtol: +++ return &bpf_strtol_proto; +++ case BPF_FUNC_strtoul: +++ return &bpf_strtoul_proto; +++ case BPF_FUNC_sysctl_get_name: +++ return &bpf_sysctl_get_name_proto; +++ case 
BPF_FUNC_sysctl_get_current_value: +++ return &bpf_sysctl_get_current_value_proto; +++ case BPF_FUNC_sysctl_get_new_value: +++ return &bpf_sysctl_get_new_value_proto; +++ case BPF_FUNC_sysctl_set_new_value: +++ return &bpf_sysctl_set_new_value_proto; +++ default: +++ return cgroup_base_func_proto(func_id, prog); +++ } +++} +++ +++static bool sysctl_is_valid_access(int off, int size, enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ +++ const int size_default = sizeof(__u32); +++ +++ if (off < 0 || off + size > sizeof(struct bpf_sysctl) || off % size) +++ return false; +++ +++ switch (off) { +++ case bpf_ctx_range(struct bpf_sysctl, write): +++ if (type != BPF_READ) +++ return false; +++ bpf_ctx_record_field_size(info, size_default); +++ return bpf_ctx_narrow_access_ok(off, size, size_default); +++ case bpf_ctx_range(struct bpf_sysctl, file_pos): +++ if (type == BPF_READ) { +++ bpf_ctx_record_field_size(info, size_default); +++ return bpf_ctx_narrow_access_ok(off, size, size_default); +++ } else { +++ return size == size_default; +++ } +++ default: +++ return false; +++ } +++} +++ +++static u32 sysctl_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, u32 *target_size) +++{ +++ struct bpf_insn *insn = insn_buf; +++ u32 read_size; +++ +++ switch (si->off) { +++ case offsetof(struct bpf_sysctl, write): +++ *insn++ = BPF_LDX_MEM( +++ BPF_SIZE(si->code), si->dst_reg, si->src_reg, +++ bpf_target_off(struct bpf_sysctl_kern, write, +++ FIELD_SIZEOF(struct bpf_sysctl_kern, +++ write), +++ target_size)); +++ break; +++ case offsetof(struct bpf_sysctl, file_pos): +++ /* ppos is a pointer so it should be accessed via indirect +++ * loads and stores. Also for stores additional temporary +++ * register is used since neither src_reg nor dst_reg can be +++ * overridden. 
+++ */ +++ if (type == BPF_WRITE) { +++ int treg = BPF_REG_9; +++ +++ if (si->src_reg == treg || si->dst_reg == treg) +++ --treg; +++ if (si->src_reg == treg || si->dst_reg == treg) +++ --treg; +++ *insn++ = BPF_STX_MEM( +++ BPF_DW, si->dst_reg, treg, +++ offsetof(struct bpf_sysctl_kern, tmp_reg)); +++ *insn++ = BPF_LDX_MEM( +++ BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), +++ treg, si->dst_reg, +++ offsetof(struct bpf_sysctl_kern, ppos)); +++ *insn++ = BPF_STX_MEM( +++ BPF_SIZEOF(u32), treg, si->src_reg, +++ bpf_ctx_narrow_access_offset( +++ 0, sizeof(u32), sizeof(loff_t))); +++ *insn++ = BPF_LDX_MEM( +++ BPF_DW, treg, si->dst_reg, +++ offsetof(struct bpf_sysctl_kern, tmp_reg)); +++ } else { +++ *insn++ = BPF_LDX_MEM( +++ BPF_FIELD_SIZEOF(struct bpf_sysctl_kern, ppos), +++ si->dst_reg, si->src_reg, +++ offsetof(struct bpf_sysctl_kern, ppos)); +++ read_size = bpf_size_to_bytes(BPF_SIZE(si->code)); +++ *insn++ = BPF_LDX_MEM( +++ BPF_SIZE(si->code), si->dst_reg, si->dst_reg, +++ bpf_ctx_narrow_access_offset( +++ 0, read_size, sizeof(loff_t))); +++ } +++ *target_size = sizeof(u32); +++ break; +++ } +++ +++ return insn - insn_buf; +++} +++ +++const struct bpf_verifier_ops cg_sysctl_verifier_ops = { +++ .get_func_proto = sysctl_func_proto, +++ .is_valid_access = sysctl_is_valid_access, +++ .convert_ctx_access = sysctl_convert_ctx_access, +++}; +++ +++const struct bpf_prog_ops cg_sysctl_prog_ops = { +++}; +++ +++static const struct bpf_func_proto * +++cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ switch (func_id) { +++#ifdef CONFIG_NET +++ case BPF_FUNC_sk_storage_get: +++ return &bpf_sk_storage_get_proto; +++ case BPF_FUNC_sk_storage_delete: +++ return &bpf_sk_storage_delete_proto; +++#endif +++#ifdef CONFIG_INET +++ case BPF_FUNC_tcp_sock: +++ return &bpf_tcp_sock_proto; +++#endif +++ default: +++ return cgroup_base_func_proto(func_id, prog); +++ } +++} +++ +++static bool cg_sockopt_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ +++ const int size_default = sizeof(__u32); +++ +++ if (off < 0 || off >= sizeof(struct bpf_sockopt)) +++ return false; +++ +++ if (off % size != 0) +++ return false; +++ +++ if (type == BPF_WRITE) { +++ switch (off) { +++ case offsetof(struct bpf_sockopt, retval): +++ if (size != size_default) +++ return false; +++ return prog->expected_attach_type == +++ BPF_CGROUP_GETSOCKOPT; +++ case offsetof(struct bpf_sockopt, optname): +++ /* fallthrough */ +++ case offsetof(struct bpf_sockopt, level): +++ if (size != size_default) +++ return false; +++ return prog->expected_attach_type == +++ BPF_CGROUP_SETSOCKOPT; +++ case offsetof(struct bpf_sockopt, optlen): +++ return size == size_default; +++ default: +++ return false; +++ } +++ } +++ +++ switch (off) { +++ case offsetof(struct bpf_sockopt, sk): +++ if (size != sizeof(__u64)) +++ return false; +++ info->reg_type = PTR_TO_SOCKET; +++ break; +++ case offsetof(struct bpf_sockopt, optval): +++ if (size != sizeof(__u64)) +++ return false; +++ info->reg_type = PTR_TO_PACKET; +++ break; +++ case offsetof(struct bpf_sockopt, optval_end): +++ if (size != sizeof(__u64)) +++ return false; +++ info->reg_type = PTR_TO_PACKET_END; +++ break; +++ case offsetof(struct bpf_sockopt, retval): +++ if (size != size_default) +++ return false; +++ return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT; +++ default: +++ if (size != size_default) +++ return false; +++ break; +++ } +++ return true; +++} +++ 
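[editor's note — reading aid, not part of the backported patch] cg_sockopt_is_valid_access() above is what shapes the programming model of BPF_PROG_TYPE_CGROUP_SOCKOPT programs: optval/optval_end are exposed with packet-pointer semantics (so every dereference must be bounds-checked), retval is writable only at the getsockopt attach point, and level/optname are writable only at setsockopt. A hypothetical restricted-C program for the setsockopt hook is sketched below; it assumes a libbpf-style build environment (SEC() from <bpf/bpf_helpers.h>), the SOL_IP/IP_TOS constants are spelled out to keep it self-contained, and the TOS policy it enforces is purely illustrative.

/* Hypothetical cgroup/setsockopt program: inspect level/optname, bounds-check
 * optval against optval_end before reading it, and allow or reject the call.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define SOL_IP  0
#define IP_TOS  1

SEC("cgroup/setsockopt")
int restrict_ip_tos(struct bpf_sockopt *ctx)
{
        __u8 *optval = ctx->optval;
        __u8 *optval_end = ctx->optval_end;

        if (ctx->level != SOL_IP || ctx->optname != IP_TOS)
                return 1;       /* not ours: let the kernel handle it */

        if (optval + 1 > optval_end)
                return 0;       /* value not visible to BPF: reject */

        if (*optval & 0x3)      /* illustrative policy: refuse ECN bits in TOS */
                return 0;

        return 1;               /* accepted: kernel setsockopt still runs */
}

char _license[] SEC("license") = "GPL";

Returning 0 is turned into -EPERM by __cgroup_bpf_run_filter_setsockopt() above, while returning 1 (with ctx->optlen left untouched) lets the kernel's own setsockopt handler run on the buffer that was copied in for the program.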
+++#define CG_SOCKOPT_ACCESS_FIELD(T, F) \ +++ T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F), \ +++ si->dst_reg, si->src_reg, \ +++ offsetof(struct bpf_sockopt_kern, F)) +++ +++static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, +++ u32 *target_size) +++{ +++ struct bpf_insn *insn = insn_buf; +++ +++ switch (si->off) { +++ case offsetof(struct bpf_sockopt, sk): +++ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk); +++ break; +++ case offsetof(struct bpf_sockopt, level): +++ if (type == BPF_WRITE) +++ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level); +++ else +++ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level); +++ break; +++ case offsetof(struct bpf_sockopt, optname): +++ if (type == BPF_WRITE) +++ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname); +++ else +++ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname); +++ break; +++ case offsetof(struct bpf_sockopt, optlen): +++ if (type == BPF_WRITE) +++ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen); +++ else +++ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen); +++ break; +++ case offsetof(struct bpf_sockopt, retval): +++ if (type == BPF_WRITE) +++ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval); +++ else +++ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval); +++ break; +++ case offsetof(struct bpf_sockopt, optval): +++ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval); +++ break; +++ case offsetof(struct bpf_sockopt, optval_end): +++ *insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end); +++ break; +++ } +++ +++ return insn - insn_buf; +++} +++ +++static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf, +++ bool direct_write, +++ const struct bpf_prog *prog) +++{ +++ /* Nothing to do for sockopt argument. The data is kzalloc'ated. +++ */ +++ return 0; +++} +++ +++const struct bpf_verifier_ops cg_sockopt_verifier_ops = { +++ .get_func_proto = cg_sockopt_func_proto, +++ .is_valid_access = cg_sockopt_is_valid_access, +++ .convert_ctx_access = cg_sockopt_convert_ctx_access, +++ .gen_prologue = cg_sockopt_get_prologue, +++}; +++ +++const struct bpf_prog_ops cg_sockopt_prog_ops = { +++}; ++--- a/kernel/bpf/core.c +++++ b/kernel/bpf/core.c ++@@ -1,3 +1,4 @@ +++// SPDX-License-Identifier: GPL-2.0-or-later ++ /* ++ * Linux Socket Filter - Kernel level socket filtering ++ * ++@@ -12,21 +13,22 @@ ++ * Alexei Starovoitov ++ * Daniel Borkmann ++ * ++- * This program is free software; you can redistribute it and/or ++- * modify it under the terms of the GNU General Public License ++- * as published by the Free Software Foundation; either version ++- * 2 of the License, or (at your option) any later version. ++- * ++ * Andi Kleen - Fix a few bad bugs and races. 
++ * Kris Katterjohn - Added many additional checks in bpf_check_classic() ++ */ ++ +++#include ++ #include ++ #include ++ #include ++ #include ++ #include ++ #include +++#include +++#include +++#include +++#include +++#include ++ ++ #include ++ ++@@ -47,6 +49,7 @@ ++ #define DST regs[insn->dst_reg] ++ #define SRC regs[insn->src_reg] ++ #define FP regs[BPF_REG_FP] +++#define AX regs[BPF_REG_AX] ++ #define ARG1 regs[BPF_REG_ARG1] ++ #define CTX regs[BPF_REG_CTX] ++ #define IMM insn->imm ++@@ -70,10 +73,9 @@ void *bpf_internal_load_pointer_neg_help ++ return NULL; ++ } ++ ++-struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +++struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) ++ { ++- gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | ++- gfp_extra_flags; +++ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; ++ struct bpf_prog_aux *aux; ++ struct bpf_prog *fp; ++ ++@@ -82,8 +84,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned ++ if (fp == NULL) ++ return NULL; ++ ++- kmemcheck_annotate_bitfield(fp, meta); ++- ++ aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); ++ if (aux == NULL) { ++ vfree(fp); ++@@ -93,30 +93,151 @@ struct bpf_prog *bpf_prog_alloc(unsigned ++ fp->pages = size / PAGE_SIZE; ++ fp->aux = aux; ++ fp->aux->prog = fp; +++ fp->jit_requested = ebpf_jit_enabled(); +++ +++ INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); ++ ++ return fp; ++ } +++ +++struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) +++{ +++ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; +++ struct bpf_prog *prog; +++ int cpu; +++ +++ prog = bpf_prog_alloc_no_stats(size, gfp_extra_flags); +++ if (!prog) +++ return NULL; +++ +++ prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); +++ if (!prog->aux->stats) { +++ kfree(prog->aux); +++ vfree(prog); +++ return NULL; +++ } +++ +++ for_each_possible_cpu(cpu) { +++ struct bpf_prog_stats *pstats; +++ +++ pstats = per_cpu_ptr(prog->aux->stats, cpu); +++ u64_stats_init(&pstats->syncp); +++ } +++ return prog; +++} ++ EXPORT_SYMBOL_GPL(bpf_prog_alloc); ++ +++int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) +++{ +++ if (!prog->aux->nr_linfo || !prog->jit_requested) +++ return 0; +++ +++ prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo, +++ sizeof(*prog->aux->jited_linfo), +++ GFP_KERNEL | __GFP_NOWARN); +++ if (!prog->aux->jited_linfo) +++ return -ENOMEM; +++ +++ return 0; +++} +++ +++void bpf_prog_free_jited_linfo(struct bpf_prog *prog) +++{ +++ kfree(prog->aux->jited_linfo); +++ prog->aux->jited_linfo = NULL; +++} +++ +++void bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog) +++{ +++ if (prog->aux->jited_linfo && !prog->aux->jited_linfo[0]) +++ bpf_prog_free_jited_linfo(prog); +++} +++ +++/* The jit engine is responsible to provide an array +++ * for insn_off to the jited_off mapping (insn_to_jit_off). +++ * +++ * The idx to this array is the insn_off. Hence, the insn_off +++ * here is relative to the prog itself instead of the main prog. +++ * This array has one entry for each xlated bpf insn. +++ * +++ * jited_off is the byte off to the last byte of the jited insn. +++ * +++ * Hence, with +++ * insn_start: +++ * The first bpf insn off of the prog. The insn off +++ * here is relative to the main prog. +++ * e.g. 
if prog is a subprog, insn_start > 0 +++ * linfo_idx: +++ * The prog's idx to prog->aux->linfo and jited_linfo +++ * +++ * jited_linfo[linfo_idx] = prog->bpf_func +++ * +++ * For i > linfo_idx, +++ * +++ * jited_linfo[i] = prog->bpf_func + +++ * insn_to_jit_off[linfo[i].insn_off - insn_start - 1] +++ */ +++void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, +++ const u32 *insn_to_jit_off) +++{ +++ u32 linfo_idx, insn_start, insn_end, nr_linfo, i; +++ const struct bpf_line_info *linfo; +++ void **jited_linfo; +++ +++ if (!prog->aux->jited_linfo) +++ /* Userspace did not provide linfo */ +++ return; +++ +++ linfo_idx = prog->aux->linfo_idx; +++ linfo = &prog->aux->linfo[linfo_idx]; +++ insn_start = linfo[0].insn_off; +++ insn_end = insn_start + prog->len; +++ +++ jited_linfo = &prog->aux->jited_linfo[linfo_idx]; +++ jited_linfo[0] = prog->bpf_func; +++ +++ nr_linfo = prog->aux->nr_linfo - linfo_idx; +++ +++ for (i = 1; i < nr_linfo && linfo[i].insn_off < insn_end; i++) +++ /* The verifier ensures that linfo[i].insn_off is +++ * strictly increasing +++ */ +++ jited_linfo[i] = prog->bpf_func + +++ insn_to_jit_off[linfo[i].insn_off - insn_start - 1]; +++} +++ +++void bpf_prog_free_linfo(struct bpf_prog *prog) +++{ +++ bpf_prog_free_jited_linfo(prog); +++ kvfree(prog->aux->linfo); +++} +++ ++ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, ++ gfp_t gfp_extra_flags) ++ { ++- gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | ++- gfp_extra_flags; +++ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; ++ struct bpf_prog *fp; +++ u32 pages, delta; +++ int ret; ++ ++ BUG_ON(fp_old == NULL); ++ ++ size = round_up(size, PAGE_SIZE); ++- if (size <= fp_old->pages * PAGE_SIZE) +++ pages = size / PAGE_SIZE; +++ if (pages <= fp_old->pages) ++ return fp_old; ++ ++- fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); ++- if (fp != NULL) { ++- kmemcheck_annotate_bitfield(fp, meta); +++ delta = pages - fp_old->pages; +++ ret = __bpf_prog_charge(fp_old->aux->user, delta); +++ if (ret) +++ return NULL; ++ +++ fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); +++ if (fp == NULL) { +++ __bpf_prog_uncharge(fp_old->aux->user, delta); +++ } else { ++ memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); ++- fp->pages = size / PAGE_SIZE; +++ fp->pages = pages; ++ fp->aux->prog = fp; ++ ++ /* We keep fp->aux from fp_old around in the new ++@@ -128,40 +249,578 @@ struct bpf_prog *bpf_prog_realloc(struct ++ ++ return fp; ++ } ++-EXPORT_SYMBOL_GPL(bpf_prog_realloc); ++ ++ void __bpf_prog_free(struct bpf_prog *fp) ++ { ++- kfree(fp->aux); +++ if (fp->aux) { +++ free_percpu(fp->aux->stats); +++ kfree(fp->aux); +++ } ++ vfree(fp); ++ } ++-EXPORT_SYMBOL_GPL(__bpf_prog_free); +++ +++int bpf_prog_calc_tag(struct bpf_prog *fp) +++{ +++ const u32 bits_offset = SHA_MESSAGE_BYTES - sizeof(__be64); +++ u32 raw_size = bpf_prog_tag_scratch_size(fp); +++ u32 digest[SHA_DIGEST_WORDS]; +++ u32 ws[SHA_WORKSPACE_WORDS]; +++ u32 i, bsize, psize, blocks; +++ struct bpf_insn *dst; +++ bool was_ld_map; +++ u8 *raw, *todo; +++ __be32 *result; +++ __be64 *bits; +++ +++ raw = vmalloc(raw_size); +++ if (!raw) +++ return -ENOMEM; +++ +++ sha_init(digest); +++ memset(ws, 0, sizeof(ws)); +++ +++ /* We need to take out the map fd for the digest calculation +++ * since they are unstable from user space side. 
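bpf_prog_fill_jited_linfo() above turns the verifier's line info into jited addresses: entry 0 maps to the start of the (sub)prog, and every later entry adds the end offset of the jited insn just before it. A stand-in with plain arrays (all offsets below are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t prog_image[64];				/* stands in for prog->bpf_func     */
	uint32_t linfo_insn_off[] = { 0, 2, 5 };	/* line-info insn offsets (made up) */
	uint32_t insn_to_jit_off[] = { 4, 8, 16, 20, 28 }; /* end offset of each jited insn */
	void *jited_linfo[3];
	uint32_t insn_start = linfo_insn_off[0];

	jited_linfo[0] = prog_image;			/* first entry: start of the prog   */
	for (int i = 1; i < 3; i++)
		jited_linfo[i] = prog_image +
			insn_to_jit_off[linfo_insn_off[i] - insn_start - 1];

	for (int i = 0; i < 3; i++)
		printf("linfo[%d] -> prog + %ld bytes\n", i,
		       (long)((uint8_t *)jited_linfo[i] - prog_image));
	return 0;
}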
+++ */ +++ dst = (void *)raw; +++ for (i = 0, was_ld_map = false; i < fp->len; i++) { +++ dst[i] = fp->insnsi[i]; +++ if (!was_ld_map && +++ dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && +++ (dst[i].src_reg == BPF_PSEUDO_MAP_FD || +++ dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) { +++ was_ld_map = true; +++ dst[i].imm = 0; +++ } else if (was_ld_map && +++ dst[i].code == 0 && +++ dst[i].dst_reg == 0 && +++ dst[i].src_reg == 0 && +++ dst[i].off == 0) { +++ was_ld_map = false; +++ dst[i].imm = 0; +++ } else { +++ was_ld_map = false; +++ } +++ } +++ +++ psize = bpf_prog_insn_size(fp); +++ memset(&raw[psize], 0, raw_size - psize); +++ raw[psize++] = 0x80; +++ +++ bsize = round_up(psize, SHA_MESSAGE_BYTES); +++ blocks = bsize / SHA_MESSAGE_BYTES; +++ todo = raw; +++ if (bsize - psize >= sizeof(__be64)) { +++ bits = (__be64 *)(todo + bsize - sizeof(__be64)); +++ } else { +++ bits = (__be64 *)(todo + bsize + bits_offset); +++ blocks++; +++ } +++ *bits = cpu_to_be64((psize - 1) << 3); +++ +++ while (blocks--) { +++ sha_transform(digest, todo, ws); +++ todo += SHA_MESSAGE_BYTES; +++ } +++ +++ result = (__force __be32 *)digest; +++ for (i = 0; i < SHA_DIGEST_WORDS; i++) +++ result[i] = cpu_to_be32(digest[i]); +++ memcpy(fp->tag, result, sizeof(fp->tag)); +++ +++ vfree(raw); +++ return 0; +++} +++ +++static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, s32 end_old, +++ s32 end_new, s32 curr, const bool probe_pass) +++{ +++ const s64 imm_min = S32_MIN, imm_max = S32_MAX; +++ s32 delta = end_new - end_old; +++ s64 imm = insn->imm; +++ +++ if (curr < pos && curr + imm + 1 >= end_old) +++ imm += delta; +++ else if (curr >= end_new && curr + imm + 1 < end_new) +++ imm -= delta; +++ if (imm < imm_min || imm > imm_max) +++ return -ERANGE; +++ if (!probe_pass) +++ insn->imm = imm; +++ return 0; +++} +++ +++static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, s32 end_old, +++ s32 end_new, s32 curr, const bool probe_pass) +++{ +++ const s32 off_min = S16_MIN, off_max = S16_MAX; +++ s32 delta = end_new - end_old; +++ s32 off = insn->off; +++ +++ if (curr < pos && curr + off + 1 >= end_old) +++ off += delta; +++ else if (curr >= end_new && curr + off + 1 < end_new) +++ off -= delta; +++ if (off < off_min || off > off_max) +++ return -ERANGE; +++ if (!probe_pass) +++ insn->off = off; +++ return 0; +++} +++ +++static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old, +++ s32 end_new, const bool probe_pass) +++{ +++ u32 i, insn_cnt = prog->len + (probe_pass ? end_new - end_old : 0); +++ struct bpf_insn *insn = prog->insnsi; +++ int ret = 0; +++ +++ for (i = 0; i < insn_cnt; i++, insn++) { +++ u8 code; +++ +++ /* In the probing pass we still operate on the original, +++ * unpatched image in order to check overflows before we +++ * do any other adjustments. Therefore skip the patchlet. +++ */ +++ if (probe_pass && i == pos) { +++ i = end_new; +++ insn = prog->insnsi + end_old; +++ } +++ code = insn->code; +++ if ((BPF_CLASS(code) != BPF_JMP && +++ BPF_CLASS(code) != BPF_JMP32) || +++ BPF_OP(code) == BPF_EXIT) +++ continue; +++ /* Adjust offset of jmps if we cross patch boundaries. 
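bpf_adj_delta_to_imm()/bpf_adj_delta_to_off() above only touch jumps whose source and target lie on opposite sides of the patched region; everything else keeps its relative offset. A toy model of the off fixup with example positions:

#include <stdio.h>

/* Mirrors the off fixup above: a jump at insn `curr` with relative offset
 * `off` targets insn curr + off + 1.  When the insn at `pos` grows into
 * (end_new - pos) insns, jumps that span the patch absorb the delta.
 */
static int adj_off(int off, int pos, int end_old, int end_new, int curr)
{
	int delta = end_new - end_old;

	if (curr < pos && curr + off + 1 >= end_old)
		off += delta;
	else if (curr >= end_new && curr + off + 1 < end_new)
		off -= delta;
	return off;
}

int main(void)
{
	/* Insn 3 replaced by four insns (end_old = 4, end_new = 7).  A jump
	 * at insn 1 that used to land on insn 6 (off = 4) now needs off = 7.
	 */
	printf("adjusted off = %d\n", adj_off(4, 3, 4, 7, 1));
	return 0;
}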
*/ +++ if (BPF_OP(code) == BPF_CALL) { +++ if (insn->src_reg != BPF_PSEUDO_CALL) +++ continue; +++ ret = bpf_adj_delta_to_imm(insn, pos, end_old, +++ end_new, i, probe_pass); +++ } else { +++ ret = bpf_adj_delta_to_off(insn, pos, end_old, +++ end_new, i, probe_pass); +++ } +++ if (ret) +++ break; +++ } +++ +++ return ret; +++} +++ +++static void bpf_adj_linfo(struct bpf_prog *prog, u32 off, u32 delta) +++{ +++ struct bpf_line_info *linfo; +++ u32 i, nr_linfo; +++ +++ nr_linfo = prog->aux->nr_linfo; +++ if (!nr_linfo || !delta) +++ return; +++ +++ linfo = prog->aux->linfo; +++ +++ for (i = 0; i < nr_linfo; i++) +++ if (off < linfo[i].insn_off) +++ break; +++ +++ /* Push all off < linfo[i].insn_off by delta */ +++ for (; i < nr_linfo; i++) +++ linfo[i].insn_off += delta; +++} +++ +++struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, +++ const struct bpf_insn *patch, u32 len) +++{ +++ u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; +++ const u32 cnt_max = S16_MAX; +++ struct bpf_prog *prog_adj; +++ int err; +++ +++ /* Since our patchlet doesn't expand the image, we're done. */ +++ if (insn_delta == 0) { +++ memcpy(prog->insnsi + off, patch, sizeof(*patch)); +++ return prog; +++ } +++ +++ insn_adj_cnt = prog->len + insn_delta; +++ +++ /* Reject anything that would potentially let the insn->off +++ * target overflow when we have excessive program expansions. +++ * We need to probe here before we do any reallocation where +++ * we afterwards may not fail anymore. +++ */ +++ if (insn_adj_cnt > cnt_max && +++ (err = bpf_adj_branches(prog, off, off + 1, off + len, true))) +++ return ERR_PTR(err); +++ +++ /* Several new instructions need to be inserted. Make room +++ * for them. Likely, there's no need for a new allocation as +++ * last page could have large enough tailroom. +++ */ +++ prog_adj = bpf_prog_realloc(prog, bpf_prog_size(insn_adj_cnt), +++ GFP_USER); +++ if (!prog_adj) +++ return ERR_PTR(-ENOMEM); +++ +++ prog_adj->len = insn_adj_cnt; +++ +++ /* Patching happens in 3 steps: +++ * +++ * 1) Move over tail of insnsi from next instruction onwards, +++ * so we can patch the single target insn with one or more +++ * new ones (patching is always from 1 to n insns, n > 0). +++ * 2) Inject new instructions at the target location. +++ * 3) Adjust branch offsets if necessary. +++ */ +++ insn_rest = insn_adj_cnt - off - len; +++ +++ memmove(prog_adj->insnsi + off + len, prog_adj->insnsi + off + 1, +++ sizeof(*patch) * insn_rest); +++ memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); +++ +++ /* We are guaranteed to not fail at this point, otherwise +++ * the ship has sailed to reverse to the original state. An +++ * overflow cannot happen at this point. 
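The first two of the three patching steps described in bpf_patch_insn_single() above reduce to one memmove() plus one memcpy() on the instruction array; step 3 is what bpf_adj_branches() takes care of. The same dance on a plain int array (sizes and values are illustrative):

#include <stdio.h>
#include <string.h>

int main(void)
{
	int insns[8] = { 10, 11, 12, 13, 14 };	/* 5 "insns", with spare room  */
	int patch[3] = { 120, 121, 122 };	/* replaces the insn at off 2  */
	int off = 2, len = 3, old_len = 5;
	int adj_len = old_len + len - 1;	/* 7 insns after patching      */
	int rest = adj_len - off - len;		/* insns that follow the patch */

	/* Step 1: move the tail out of the way.  Step 2: drop in the patch. */
	memmove(&insns[off + len], &insns[off + 1], sizeof(int) * rest);
	memcpy(&insns[off], patch, sizeof(int) * len);

	for (int i = 0; i < adj_len; i++)	/* 10 11 120 121 122 13 14 */
		printf("%d ", insns[i]);
	printf("\n");
	return 0;
}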
+++ */ +++ BUG_ON(bpf_adj_branches(prog_adj, off, off + 1, off + len, false)); +++ +++ bpf_adj_linfo(prog_adj, off, insn_delta); +++ +++ return prog_adj; +++} +++ +++int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) +++{ +++ /* Branch offsets can't overflow when program is shrinking, no need +++ * to call bpf_adj_branches(..., true) here +++ */ +++ memmove(prog->insnsi + off, prog->insnsi + off + cnt, +++ sizeof(struct bpf_insn) * (prog->len - off - cnt)); +++ prog->len -= cnt; +++ +++ return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); +++} +++ +++static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) +++{ +++ int i; +++ +++ for (i = 0; i < fp->aux->func_cnt; i++) +++ bpf_prog_kallsyms_del(fp->aux->func[i]); +++} +++ +++void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) +++{ +++ bpf_prog_kallsyms_del_subprogs(fp); +++ bpf_prog_kallsyms_del(fp); +++} ++ ++ #ifdef CONFIG_BPF_JIT +++/* All BPF JIT sysctl knobs here. */ +++int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +++int bpf_jit_harden __read_mostly; +++int bpf_jit_kallsyms __read_mostly; +++long bpf_jit_limit __read_mostly; +++ +++static __always_inline void +++bpf_get_prog_addr_region(const struct bpf_prog *prog, +++ unsigned long *symbol_start, +++ unsigned long *symbol_end) +++{ +++ const struct bpf_binary_header *hdr = bpf_jit_binary_hdr(prog); +++ unsigned long addr = (unsigned long)hdr; +++ +++ WARN_ON_ONCE(!bpf_prog_ebpf_jited(prog)); +++ +++ *symbol_start = addr; +++ *symbol_end = addr + hdr->pages * PAGE_SIZE; +++} +++ +++void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +++{ +++ const char *end = sym + KSYM_NAME_LEN; +++ const struct btf_type *type; +++ const char *func_name; +++ +++ BUILD_BUG_ON(sizeof("bpf_prog_") + +++ sizeof(prog->tag) * 2 + +++ /* name has been null terminated. +++ * We should need +1 for the '_' preceding +++ * the name. However, the null character +++ * is double counted between the name and the +++ * sizeof("bpf_prog_") above, so we omit +++ * the +1 here. 
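bpf_get_prog_name() above builds the kallsyms entry as "bpf_prog_" plus the 8-byte tag in hex, optionally followed by "_" and the program (or BTF function) name. A user-space sketch of that formatting, with a made-up tag and name and assuming the usual KSYM_NAME_LEN of 128:

#include <stdio.h>

int main(void)
{
	const unsigned char tag[8] = { 0xde, 0xad, 0xbe, 0xef, 0x00, 0x11, 0x22, 0x33 };
	const char *name = "xdp_drop";			/* prog->aux->name stand-in */
	char sym[128];					/* KSYM_NAME_LEN            */
	int n = snprintf(sym, sizeof(sym), "bpf_prog_");

	for (int i = 0; i < 8; i++)			/* tag rendered as lowercase hex */
		n += snprintf(sym + n, sizeof(sym) - n, "%02x", tag[i]);
	snprintf(sym + n, sizeof(sym) - n, "_%s", name);

	printf("%s\n", sym);	/* bpf_prog_deadbeef00112233_xdp_drop */
	return 0;
}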
+++ */ +++ sizeof(prog->aux->name) > KSYM_NAME_LEN); +++ +++ sym += snprintf(sym, KSYM_NAME_LEN, "bpf_prog_"); +++ sym = bin2hex(sym, prog->tag, sizeof(prog->tag)); +++ +++ /* prog->aux->name will be ignored if full btf name is available */ +++ if (prog->aux->func_info_cnt) { +++ type = btf_type_by_id(prog->aux->btf, +++ prog->aux->func_info[prog->aux->func_idx].type_id); +++ func_name = btf_name_by_offset(prog->aux->btf, type->name_off); +++ snprintf(sym, (size_t)(end - sym), "_%s", func_name); +++ return; +++ } +++ +++ if (prog->aux->name[0]) +++ snprintf(sym, (size_t)(end - sym), "_%s", prog->aux->name); +++ else +++ *sym = 0; +++} +++ +++static __always_inline unsigned long +++bpf_get_prog_addr_start(struct latch_tree_node *n) +++{ +++ unsigned long symbol_start, symbol_end; +++ const struct bpf_prog_aux *aux; +++ +++ aux = container_of(n, struct bpf_prog_aux, ksym_tnode); +++ bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); +++ +++ return symbol_start; +++} +++ +++static __always_inline bool bpf_tree_less(struct latch_tree_node *a, +++ struct latch_tree_node *b) +++{ +++ return bpf_get_prog_addr_start(a) < bpf_get_prog_addr_start(b); +++} +++ +++static __always_inline int bpf_tree_comp(void *key, struct latch_tree_node *n) +++{ +++ unsigned long val = (unsigned long)key; +++ unsigned long symbol_start, symbol_end; +++ const struct bpf_prog_aux *aux; +++ +++ aux = container_of(n, struct bpf_prog_aux, ksym_tnode); +++ bpf_get_prog_addr_region(aux->prog, &symbol_start, &symbol_end); +++ +++ if (val < symbol_start) +++ return -1; +++ if (val >= symbol_end) +++ return 1; +++ +++ return 0; +++} +++ +++static const struct latch_tree_ops bpf_tree_ops = { +++ .less = bpf_tree_less, +++ .comp = bpf_tree_comp, +++}; +++ +++static DEFINE_SPINLOCK(bpf_lock); +++static LIST_HEAD(bpf_kallsyms); +++static struct latch_tree_root bpf_tree __cacheline_aligned; +++ +++static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) +++{ +++ WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); +++ list_add_tail_rcu(&aux->ksym_lnode, &bpf_kallsyms); +++ latch_tree_insert(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); +++} +++ +++static void bpf_prog_ksym_node_del(struct bpf_prog_aux *aux) +++{ +++ if (list_empty(&aux->ksym_lnode)) +++ return; +++ +++ latch_tree_erase(&aux->ksym_tnode, &bpf_tree, &bpf_tree_ops); +++ list_del_rcu(&aux->ksym_lnode); +++} +++ +++static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp) +++{ +++ return fp->jited && !bpf_prog_was_classic(fp); +++} +++ +++static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) +++{ +++ return list_empty(&fp->aux->ksym_lnode) || +++ fp->aux->ksym_lnode.prev == LIST_POISON2; +++} +++ +++void bpf_prog_kallsyms_add(struct bpf_prog *fp) +++{ +++ if (!bpf_prog_kallsyms_candidate(fp) || +++ !capable(CAP_SYS_ADMIN)) +++ return; +++ +++ spin_lock_bh(&bpf_lock); +++ bpf_prog_ksym_node_add(fp->aux); +++ spin_unlock_bh(&bpf_lock); +++} +++ +++void bpf_prog_kallsyms_del(struct bpf_prog *fp) +++{ +++ if (!bpf_prog_kallsyms_candidate(fp)) +++ return; +++ +++ spin_lock_bh(&bpf_lock); +++ bpf_prog_ksym_node_del(fp->aux); +++ spin_unlock_bh(&bpf_lock); +++} +++ +++static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) +++{ +++ struct latch_tree_node *n; +++ +++ if (!bpf_jit_kallsyms_enabled()) +++ return NULL; +++ +++ n = latch_tree_find((void *)addr, &bpf_tree, &bpf_tree_ops); +++ return n ? 
+++ container_of(n, struct bpf_prog_aux, ksym_tnode)->prog : +++ NULL; +++} +++ +++const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, +++ unsigned long *off, char *sym) +++{ +++ unsigned long symbol_start, symbol_end; +++ struct bpf_prog *prog; +++ char *ret = NULL; +++ +++ rcu_read_lock(); +++ prog = bpf_prog_kallsyms_find(addr); +++ if (prog) { +++ bpf_get_prog_addr_region(prog, &symbol_start, &symbol_end); +++ bpf_get_prog_name(prog, sym); +++ +++ ret = sym; +++ if (size) +++ *size = symbol_end - symbol_start; +++ if (off) +++ *off = addr - symbol_start; +++ } +++ rcu_read_unlock(); +++ +++ return ret; +++} +++ +++bool is_bpf_text_address(unsigned long addr) +++{ +++ bool ret; +++ +++ rcu_read_lock(); +++ ret = bpf_prog_kallsyms_find(addr) != NULL; +++ rcu_read_unlock(); +++ +++ return ret; +++} +++ +++int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, +++ char *sym) +++{ +++ struct bpf_prog_aux *aux; +++ unsigned int it = 0; +++ int ret = -ERANGE; +++ +++ if (!bpf_jit_kallsyms_enabled()) +++ return ret; +++ +++ rcu_read_lock(); +++ list_for_each_entry_rcu(aux, &bpf_kallsyms, ksym_lnode) { +++ if (it++ != symnum) +++ continue; +++ +++ bpf_get_prog_name(aux->prog, sym); +++ +++ *value = (unsigned long)aux->prog->bpf_func; +++ *type = BPF_SYM_ELF_TYPE; +++ +++ ret = 0; +++ break; +++ } +++ rcu_read_unlock(); +++ +++ return ret; +++} +++ +++static atomic_long_t bpf_jit_current; +++ +++/* Can be overridden by an arch's JIT compiler if it has a custom, +++ * dedicated BPF backend memory area, or if neither of the two +++ * below apply. +++ */ +++u64 __weak bpf_jit_alloc_exec_limit(void) +++{ +++#if defined(MODULES_VADDR) +++ return MODULES_END - MODULES_VADDR; +++#else +++ return VMALLOC_END - VMALLOC_START; +++#endif +++} +++ +++static int __init bpf_jit_charge_init(void) +++{ +++ /* Only used as heuristic here to derive limit. */ +++ bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2, +++ PAGE_SIZE), LONG_MAX); +++ return 0; +++} +++pure_initcall(bpf_jit_charge_init); +++ +++static int bpf_jit_charge_modmem(u32 pages) +++{ +++ if (atomic_long_add_return(pages, &bpf_jit_current) > +++ (bpf_jit_limit >> PAGE_SHIFT)) { +++ if (!capable(CAP_SYS_ADMIN)) { +++ atomic_long_sub(pages, &bpf_jit_current); +++ return -EPERM; +++ } +++ } +++ +++ return 0; +++} +++ +++static void bpf_jit_uncharge_modmem(u32 pages) +++{ +++ atomic_long_sub(pages, &bpf_jit_current); +++} +++ +++void *__weak bpf_jit_alloc_exec(unsigned long size) +++{ +++ return module_alloc(size); +++} +++ +++void __weak bpf_jit_free_exec(void *addr) +++{ +++ module_memfree(addr); +++} +++ ++ struct bpf_binary_header * ++ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, ++ unsigned int alignment, ++ bpf_jit_fill_hole_t bpf_fill_ill_insns) ++ { ++ struct bpf_binary_header *hdr; ++- unsigned int size, hole, start; +++ u32 size, hole, start, pages; ++ ++ /* Most of BPF filters are really small, but if some of them ++ * fill a page, allow at least 128 extra bytes to insert a ++ * random section of illegal instructions. ++ */ ++ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE); ++- hdr = module_alloc(size); ++- if (hdr == NULL) +++ pages = size / PAGE_SIZE; +++ +++ if (bpf_jit_charge_modmem(pages)) +++ return NULL; +++ hdr = bpf_jit_alloc_exec(size); +++ if (!hdr) { +++ bpf_jit_uncharge_modmem(pages); ++ return NULL; +++ } ++ ++ /* Fill space with illegal/arch-dep instructions. 
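The modmem accounting above defaults bpf_jit_limit to a quarter of the JIT address range and charges whole pages; only CAP_SYS_ADMIN may overshoot the cap. A miniature model of that bookkeeping, with a pretend 128 MiB region and 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12

static long jit_limit;
static long jit_current;

static int charge_modmem(unsigned int pages, int capable_sys_admin)
{
	jit_current += pages;
	if (jit_current > (jit_limit >> PAGE_SHIFT) && !capable_sys_admin) {
		jit_current -= pages;		/* over the cap and unprivileged */
		return -1;			/* stands in for -EPERM          */
	}
	return 0;
}

int main(void)
{
	unsigned long exec_space = 128UL << 20;		/* pretend 128 MiB JIT region */

	jit_limit = exec_space >> 2;			/* default cap: one quarter   */
	printf("page cap : %ld\n", jit_limit >> PAGE_SHIFT);	/* 8192            */
	printf("1st 8000 : %d\n", charge_modmem(8000, 0));	/* 0, fits         */
	printf("2nd 8000 : %d\n", charge_modmem(8000, 0));	/* -1, refused     */
	printf("as admin : %d\n", charge_modmem(8000, 1));	/* 0, may overshoot */
	return 0;
}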
*/ ++ bpf_fill_ill_insns(hdr, size); ++ ++- hdr->pages = size / PAGE_SIZE; +++ hdr->pages = pages; ++ hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)), ++ PAGE_SIZE - sizeof(*hdr)); ++- start = (prandom_u32() % hole) & ~(alignment - 1); +++ start = (get_random_int() % hole) & ~(alignment - 1); ++ ++ /* Leave a random number of instructions before BPF code. */ ++ *image_ptr = &hdr->image[start]; ++@@ -171,13 +830,301 @@ bpf_jit_binary_alloc(unsigned int progle ++ ++ void bpf_jit_binary_free(struct bpf_binary_header *hdr) ++ { ++- module_memfree(hdr); +++ u32 pages = hdr->pages; +++ +++ bpf_jit_free_exec(hdr); +++ bpf_jit_uncharge_modmem(pages); +++} +++ +++/* This symbol is only overridden by archs that have different +++ * requirements than the usual eBPF JITs, f.e. when they only +++ * implement cBPF JIT, do not set images read-only, etc. +++ */ +++void __weak bpf_jit_free(struct bpf_prog *fp) +++{ +++ if (fp->jited) { +++ struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); +++ +++ bpf_jit_binary_unlock_ro(hdr); +++ bpf_jit_binary_free(hdr); +++ +++ WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); +++ } +++ +++ bpf_prog_unlock_free(fp); +++} +++ +++int bpf_jit_get_func_addr(const struct bpf_prog *prog, +++ const struct bpf_insn *insn, bool extra_pass, +++ u64 *func_addr, bool *func_addr_fixed) +++{ +++ s16 off = insn->off; +++ s32 imm = insn->imm; +++ u8 *addr; +++ +++ *func_addr_fixed = insn->src_reg != BPF_PSEUDO_CALL; +++ if (!*func_addr_fixed) { +++ /* Place-holder address till the last pass has collected +++ * all addresses for JITed subprograms in which case we +++ * can pick them up from prog->aux. +++ */ +++ if (!extra_pass) +++ addr = NULL; +++ else if (prog->aux->func && +++ off >= 0 && off < prog->aux->func_cnt) +++ addr = (u8 *)prog->aux->func[off]->bpf_func; +++ else +++ return -EINVAL; +++ } else { +++ /* Address of a BPF helper call. Since part of the core +++ * kernel, it's always at a fixed location. __bpf_call_base +++ * and the helper with imm relative to it are both in core +++ * kernel. +++ */ +++ addr = (u8 *)__bpf_call_base + imm; +++ } +++ +++ *func_addr = (unsigned long)addr; +++ return 0; +++} +++ +++static int bpf_jit_blind_insn(const struct bpf_insn *from, +++ const struct bpf_insn *aux, +++ struct bpf_insn *to_buff, +++ bool emit_zext) +++{ +++ struct bpf_insn *to = to_buff; +++ u32 imm_rnd = get_random_int(); +++ s16 off; +++ +++ BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); +++ BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); +++ +++ /* Constraints on AX register: +++ * +++ * AX register is inaccessible from user space. It is mapped in +++ * all JITs, and used here for constant blinding rewrites. It is +++ * typically "stateless" meaning its contents are only valid within +++ * the executed instruction, but not across several instructions. +++ * There are a few exceptions however which are further detailed +++ * below. +++ * +++ * Constant blinding is only used by JITs, not in the interpreter. +++ * The interpreter uses AX in some occasions as a local temporary +++ * register e.g. in DIV or MOD instructions. +++ * +++ * In restricted circumstances, the verifier can also use the AX +++ * register for rewrites as long as they do not interfere with +++ * the above cases! 
+++ */ +++ if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) +++ goto out; +++ +++ if (from->imm == 0 && +++ (from->code == (BPF_ALU | BPF_MOV | BPF_K) || +++ from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { +++ *to++ = BPF_ALU64_REG(BPF_XOR, from->dst_reg, from->dst_reg); +++ goto out; +++ } +++ +++ switch (from->code) { +++ case BPF_ALU | BPF_ADD | BPF_K: +++ case BPF_ALU | BPF_SUB | BPF_K: +++ case BPF_ALU | BPF_AND | BPF_K: +++ case BPF_ALU | BPF_OR | BPF_K: +++ case BPF_ALU | BPF_XOR | BPF_K: +++ case BPF_ALU | BPF_MUL | BPF_K: +++ case BPF_ALU | BPF_MOV | BPF_K: +++ case BPF_ALU | BPF_DIV | BPF_K: +++ case BPF_ALU | BPF_MOD | BPF_K: +++ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); +++ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); +++ *to++ = BPF_ALU32_REG(from->code, from->dst_reg, BPF_REG_AX); +++ break; +++ +++ case BPF_ALU64 | BPF_ADD | BPF_K: +++ case BPF_ALU64 | BPF_SUB | BPF_K: +++ case BPF_ALU64 | BPF_AND | BPF_K: +++ case BPF_ALU64 | BPF_OR | BPF_K: +++ case BPF_ALU64 | BPF_XOR | BPF_K: +++ case BPF_ALU64 | BPF_MUL | BPF_K: +++ case BPF_ALU64 | BPF_MOV | BPF_K: +++ case BPF_ALU64 | BPF_DIV | BPF_K: +++ case BPF_ALU64 | BPF_MOD | BPF_K: +++ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); +++ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); +++ *to++ = BPF_ALU64_REG(from->code, from->dst_reg, BPF_REG_AX); +++ break; +++ +++ case BPF_JMP | BPF_JEQ | BPF_K: +++ case BPF_JMP | BPF_JNE | BPF_K: +++ case BPF_JMP | BPF_JGT | BPF_K: +++ case BPF_JMP | BPF_JLT | BPF_K: +++ case BPF_JMP | BPF_JGE | BPF_K: +++ case BPF_JMP | BPF_JLE | BPF_K: +++ case BPF_JMP | BPF_JSGT | BPF_K: +++ case BPF_JMP | BPF_JSLT | BPF_K: +++ case BPF_JMP | BPF_JSGE | BPF_K: +++ case BPF_JMP | BPF_JSLE | BPF_K: +++ case BPF_JMP | BPF_JSET | BPF_K: +++ /* Accommodate for extra offset in case of a backjump. */ +++ off = from->off; +++ if (off < 0) +++ off -= 2; +++ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); +++ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); +++ *to++ = BPF_JMP_REG(from->code, from->dst_reg, BPF_REG_AX, off); +++ break; +++ +++ case BPF_JMP32 | BPF_JEQ | BPF_K: +++ case BPF_JMP32 | BPF_JNE | BPF_K: +++ case BPF_JMP32 | BPF_JGT | BPF_K: +++ case BPF_JMP32 | BPF_JLT | BPF_K: +++ case BPF_JMP32 | BPF_JGE | BPF_K: +++ case BPF_JMP32 | BPF_JLE | BPF_K: +++ case BPF_JMP32 | BPF_JSGT | BPF_K: +++ case BPF_JMP32 | BPF_JSLT | BPF_K: +++ case BPF_JMP32 | BPF_JSGE | BPF_K: +++ case BPF_JMP32 | BPF_JSLE | BPF_K: +++ case BPF_JMP32 | BPF_JSET | BPF_K: +++ /* Accommodate for extra offset in case of a backjump. */ +++ off = from->off; +++ if (off < 0) +++ off -= 2; +++ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); +++ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); +++ *to++ = BPF_JMP32_REG(from->code, from->dst_reg, BPF_REG_AX, +++ off); +++ break; +++ +++ case BPF_LD | BPF_IMM | BPF_DW: +++ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[1].imm); +++ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); +++ *to++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); +++ *to++ = BPF_ALU64_REG(BPF_MOV, aux[0].dst_reg, BPF_REG_AX); +++ break; +++ case 0: /* Part 2 of BPF_LD | BPF_IMM | BPF_DW. 
*/ +++ *to++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ aux[0].imm); +++ *to++ = BPF_ALU32_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); +++ if (emit_zext) +++ *to++ = BPF_ZEXT_REG(BPF_REG_AX); +++ *to++ = BPF_ALU64_REG(BPF_OR, aux[0].dst_reg, BPF_REG_AX); +++ break; +++ +++ case BPF_ST | BPF_MEM | BPF_DW: +++ case BPF_ST | BPF_MEM | BPF_W: +++ case BPF_ST | BPF_MEM | BPF_H: +++ case BPF_ST | BPF_MEM | BPF_B: +++ *to++ = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, imm_rnd ^ from->imm); +++ *to++ = BPF_ALU64_IMM(BPF_XOR, BPF_REG_AX, imm_rnd); +++ *to++ = BPF_STX_MEM(from->code, from->dst_reg, BPF_REG_AX, from->off); +++ break; +++ } +++out: +++ return to - to_buff; +++} +++ +++static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, +++ gfp_t gfp_extra_flags) +++{ +++ gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; +++ struct bpf_prog *fp; +++ +++ fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); +++ if (fp != NULL) { +++ /* aux->prog still points to the fp_other one, so +++ * when promoting the clone to the real program, +++ * this still needs to be adapted. +++ */ +++ memcpy(fp, fp_other, fp_other->pages * PAGE_SIZE); +++ } +++ +++ return fp; +++} +++ +++static void bpf_prog_clone_free(struct bpf_prog *fp) +++{ +++ /* aux was stolen by the other clone, so we cannot free +++ * it from this path! It will be freed eventually by the +++ * other program on release. +++ * +++ * At this point, we don't need a deferred release since +++ * clone is guaranteed to not be locked. +++ */ +++ fp->aux = NULL; +++ __bpf_prog_free(fp); +++} +++ +++void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other) +++{ +++ /* We have to repoint aux->prog to self, as we don't +++ * know whether fp here is the clone or the original. +++ */ +++ fp->aux->prog = fp; +++ bpf_prog_clone_free(fp_other); +++} +++ +++struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) +++{ +++ struct bpf_insn insn_buff[16], aux[2]; +++ struct bpf_prog *clone, *tmp; +++ int insn_delta, insn_cnt; +++ struct bpf_insn *insn; +++ int i, rewritten; +++ +++ if (!bpf_jit_blinding_enabled(prog) || prog->blinded) +++ return prog; +++ +++ clone = bpf_prog_clone_create(prog, GFP_USER); +++ if (!clone) +++ return ERR_PTR(-ENOMEM); +++ +++ insn_cnt = clone->len; +++ insn = clone->insnsi; +++ +++ for (i = 0; i < insn_cnt; i++, insn++) { +++ /* We temporarily need to hold the original ld64 insn +++ * so that we can still access the first part in the +++ * second blinding run. +++ */ +++ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW) && +++ insn[1].code == 0) +++ memcpy(aux, insn, sizeof(aux)); +++ +++ rewritten = bpf_jit_blind_insn(insn, aux, insn_buff, +++ clone->aux->verifier_zext); +++ if (!rewritten) +++ continue; +++ +++ tmp = bpf_patch_insn_single(clone, i, insn_buff, rewritten); +++ if (IS_ERR(tmp)) { +++ /* Patching may have repointed aux->prog during +++ * realloc from the original one, so we need to +++ * fix it up here on error. +++ */ +++ bpf_jit_prog_release_other(prog, clone); +++ return tmp; +++ } +++ +++ clone = tmp; +++ insn_delta = rewritten - 1; +++ +++ /* Walk new program and skip insns we just inserted. */ +++ insn = clone->insnsi + i + insn_delta; +++ insn_cnt += insn_delta; +++ i += insn_delta; +++ } +++ +++ clone->blinded = 1; +++ return clone; ++ } ++ #endif /* CONFIG_BPF_JIT */ ++ ++ /* Base function for offset calculation. 
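Constant blinding, as wired up in bpf_jit_blind_insn()/bpf_jit_blind_constants() above, replaces every immediate with imm_rnd ^ imm followed by an XOR with imm_rnd, so an attacker-chosen constant never appears verbatim in the JIT image while AX still ends up holding it. A plain C stand-in for the two-step recovery:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	uint32_t imm = 0x12345678;		/* the constant a program supplied */
	uint32_t imm_rnd = (uint32_t)rand();	/* fresh random value per rewrite  */
	uint32_t ax;

	ax = imm_rnd ^ imm;			/* like BPF_MOV AX, imm_rnd ^ imm  */
	ax ^= imm_rnd;				/* like BPF_XOR AX, imm_rnd        */

	/* The follow-up ALU/JMP insn then uses AX instead of the literal imm. */
	printf("recovered imm = 0x%08x\n", ax);	/* 0x12345678 */
	return 0;
}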
Needs to go into .text section, ++ * therefore keeping it non-static as well; will also be used by JITs ++- * anyway later on, so do not let the compiler omit it. +++ * anyway later on, so do not let the compiler omit it. This also needs +++ * to go into kallsyms for correlation from e.g. bpftool, so naming +++ * must not change. ++ */ ++ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) ++ { ++@@ -185,157 +1132,243 @@ noinline u64 __bpf_call_base(u64 r1, u64 ++ } ++ EXPORT_SYMBOL_GPL(__bpf_call_base); ++ +++/* All UAPI available opcodes. */ +++#define BPF_INSN_MAP(INSN_2, INSN_3) \ +++ /* 32 bit ALU operations. */ \ +++ /* Register based. */ \ +++ INSN_3(ALU, ADD, X), \ +++ INSN_3(ALU, SUB, X), \ +++ INSN_3(ALU, AND, X), \ +++ INSN_3(ALU, OR, X), \ +++ INSN_3(ALU, LSH, X), \ +++ INSN_3(ALU, RSH, X), \ +++ INSN_3(ALU, XOR, X), \ +++ INSN_3(ALU, MUL, X), \ +++ INSN_3(ALU, MOV, X), \ +++ INSN_3(ALU, ARSH, X), \ +++ INSN_3(ALU, DIV, X), \ +++ INSN_3(ALU, MOD, X), \ +++ INSN_2(ALU, NEG), \ +++ INSN_3(ALU, END, TO_BE), \ +++ INSN_3(ALU, END, TO_LE), \ +++ /* Immediate based. */ \ +++ INSN_3(ALU, ADD, K), \ +++ INSN_3(ALU, SUB, K), \ +++ INSN_3(ALU, AND, K), \ +++ INSN_3(ALU, OR, K), \ +++ INSN_3(ALU, LSH, K), \ +++ INSN_3(ALU, RSH, K), \ +++ INSN_3(ALU, XOR, K), \ +++ INSN_3(ALU, MUL, K), \ +++ INSN_3(ALU, MOV, K), \ +++ INSN_3(ALU, ARSH, K), \ +++ INSN_3(ALU, DIV, K), \ +++ INSN_3(ALU, MOD, K), \ +++ /* 64 bit ALU operations. */ \ +++ /* Register based. */ \ +++ INSN_3(ALU64, ADD, X), \ +++ INSN_3(ALU64, SUB, X), \ +++ INSN_3(ALU64, AND, X), \ +++ INSN_3(ALU64, OR, X), \ +++ INSN_3(ALU64, LSH, X), \ +++ INSN_3(ALU64, RSH, X), \ +++ INSN_3(ALU64, XOR, X), \ +++ INSN_3(ALU64, MUL, X), \ +++ INSN_3(ALU64, MOV, X), \ +++ INSN_3(ALU64, ARSH, X), \ +++ INSN_3(ALU64, DIV, X), \ +++ INSN_3(ALU64, MOD, X), \ +++ INSN_2(ALU64, NEG), \ +++ /* Immediate based. */ \ +++ INSN_3(ALU64, ADD, K), \ +++ INSN_3(ALU64, SUB, K), \ +++ INSN_3(ALU64, AND, K), \ +++ INSN_3(ALU64, OR, K), \ +++ INSN_3(ALU64, LSH, K), \ +++ INSN_3(ALU64, RSH, K), \ +++ INSN_3(ALU64, XOR, K), \ +++ INSN_3(ALU64, MUL, K), \ +++ INSN_3(ALU64, MOV, K), \ +++ INSN_3(ALU64, ARSH, K), \ +++ INSN_3(ALU64, DIV, K), \ +++ INSN_3(ALU64, MOD, K), \ +++ /* Call instruction. */ \ +++ INSN_2(JMP, CALL), \ +++ /* Exit instruction. */ \ +++ INSN_2(JMP, EXIT), \ +++ /* 32-bit Jump instructions. */ \ +++ /* Register based. */ \ +++ INSN_3(JMP32, JEQ, X), \ +++ INSN_3(JMP32, JNE, X), \ +++ INSN_3(JMP32, JGT, X), \ +++ INSN_3(JMP32, JLT, X), \ +++ INSN_3(JMP32, JGE, X), \ +++ INSN_3(JMP32, JLE, X), \ +++ INSN_3(JMP32, JSGT, X), \ +++ INSN_3(JMP32, JSLT, X), \ +++ INSN_3(JMP32, JSGE, X), \ +++ INSN_3(JMP32, JSLE, X), \ +++ INSN_3(JMP32, JSET, X), \ +++ /* Immediate based. */ \ +++ INSN_3(JMP32, JEQ, K), \ +++ INSN_3(JMP32, JNE, K), \ +++ INSN_3(JMP32, JGT, K), \ +++ INSN_3(JMP32, JLT, K), \ +++ INSN_3(JMP32, JGE, K), \ +++ INSN_3(JMP32, JLE, K), \ +++ INSN_3(JMP32, JSGT, K), \ +++ INSN_3(JMP32, JSLT, K), \ +++ INSN_3(JMP32, JSGE, K), \ +++ INSN_3(JMP32, JSLE, K), \ +++ INSN_3(JMP32, JSET, K), \ +++ /* Jump instructions. */ \ +++ /* Register based. */ \ +++ INSN_3(JMP, JEQ, X), \ +++ INSN_3(JMP, JNE, X), \ +++ INSN_3(JMP, JGT, X), \ +++ INSN_3(JMP, JLT, X), \ +++ INSN_3(JMP, JGE, X), \ +++ INSN_3(JMP, JLE, X), \ +++ INSN_3(JMP, JSGT, X), \ +++ INSN_3(JMP, JSLT, X), \ +++ INSN_3(JMP, JSGE, X), \ +++ INSN_3(JMP, JSLE, X), \ +++ INSN_3(JMP, JSET, X), \ +++ /* Immediate based. 
*/ \ +++ INSN_3(JMP, JEQ, K), \ +++ INSN_3(JMP, JNE, K), \ +++ INSN_3(JMP, JGT, K), \ +++ INSN_3(JMP, JLT, K), \ +++ INSN_3(JMP, JGE, K), \ +++ INSN_3(JMP, JLE, K), \ +++ INSN_3(JMP, JSGT, K), \ +++ INSN_3(JMP, JSLT, K), \ +++ INSN_3(JMP, JSGE, K), \ +++ INSN_3(JMP, JSLE, K), \ +++ INSN_3(JMP, JSET, K), \ +++ INSN_2(JMP, JA), \ +++ /* Store instructions. */ \ +++ /* Register based. */ \ +++ INSN_3(STX, MEM, B), \ +++ INSN_3(STX, MEM, H), \ +++ INSN_3(STX, MEM, W), \ +++ INSN_3(STX, MEM, DW), \ +++ INSN_3(STX, XADD, W), \ +++ INSN_3(STX, XADD, DW), \ +++ /* Immediate based. */ \ +++ INSN_3(ST, MEM, B), \ +++ INSN_3(ST, MEM, H), \ +++ INSN_3(ST, MEM, W), \ +++ INSN_3(ST, MEM, DW), \ +++ /* Load instructions. */ \ +++ /* Register based. */ \ +++ INSN_3(LDX, MEM, B), \ +++ INSN_3(LDX, MEM, H), \ +++ INSN_3(LDX, MEM, W), \ +++ INSN_3(LDX, MEM, DW), \ +++ /* Immediate based. */ \ +++ INSN_3(LD, IMM, DW) +++ +++bool bpf_opcode_in_insntable(u8 code) +++{ +++#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true +++#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true +++ static const bool public_insntable[256] = { +++ [0 ... 255] = false, +++ /* Now overwrite non-defaults ... */ +++ BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), +++ /* UAPI exposed, but rewritten opcodes. cBPF carry-over. */ +++ [BPF_LD | BPF_ABS | BPF_B] = true, +++ [BPF_LD | BPF_ABS | BPF_H] = true, +++ [BPF_LD | BPF_ABS | BPF_W] = true, +++ [BPF_LD | BPF_IND | BPF_B] = true, +++ [BPF_LD | BPF_IND | BPF_H] = true, +++ [BPF_LD | BPF_IND | BPF_W] = true, +++ }; +++#undef BPF_INSN_3_TBL +++#undef BPF_INSN_2_TBL +++ return public_insntable[code]; +++} +++ +++#ifndef CONFIG_BPF_JIT_ALWAYS_ON ++ /** ++ * __bpf_prog_run - run eBPF program on a given context ++- * @ctx: is the data we are operating on +++ * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers ++ * @insn: is the array of eBPF instructions +++ * @stack: is the eBPF storage stack ++ * ++ * Decode and execute eBPF instructions. ++ */ ++-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) +++static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) ++ { ++- u64 stack[MAX_BPF_STACK / sizeof(u64)]; ++- u64 regs[MAX_BPF_REG], tmp; ++- static const void *jumptable[256] = { +++#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y +++#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z +++ static const void * const jumptable[256] = { ++ [0 ... 255] = &&default_label, ++ /* Now overwrite non-defaults ... 
*/ ++- /* 32 bit ALU operations */ ++- [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, ++- [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, ++- [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, ++- [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, ++- [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, ++- [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, ++- [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, ++- [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, ++- [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, ++- [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, ++- [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, ++- [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, ++- [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, ++- [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, ++- [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, ++- [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, ++- [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, ++- [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, ++- [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, ++- [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, ++- [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, ++- [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, ++- [BPF_ALU | BPF_NEG] = &&ALU_NEG, ++- [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, ++- [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, ++- /* 64 bit ALU operations */ ++- [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, ++- [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, ++- [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, ++- [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, ++- [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, ++- [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, ++- [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, ++- [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, ++- [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, ++- [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, ++- [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, ++- [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, ++- [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, ++- [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, ++- [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, ++- [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, ++- [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, ++- [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, ++- [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, ++- [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, ++- [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, ++- [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, ++- [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, ++- [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, ++- [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, ++- /* Call instruction */ ++- [BPF_JMP | BPF_CALL] = &&JMP_CALL, ++- [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL, ++- /* Jumps */ ++- [BPF_JMP | BPF_JA] = &&JMP_JA, ++- [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, ++- [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, ++- [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, ++- [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, ++- [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, ++- [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, ++- [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, ++- [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, ++- [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, ++- [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, ++- [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, ++- [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, ++- [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, ++- [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, ++- /* Program return */ ++- [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, ++- /* Store instructions */ ++- [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, ++- [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, ++- [BPF_STX | 
BPF_MEM | BPF_W] = &&STX_MEM_W, ++- [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, ++- [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, ++- [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, ++- [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, ++- [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, ++- [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, ++- [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, ++- /* Load instructions */ ++- [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, ++- [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, ++- [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, ++- [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, ++- [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, ++- [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, ++- [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, ++- [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, ++- [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, ++- [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, ++- [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, +++ BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), +++ /* Non-UAPI available opcodes. */ +++ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, +++ [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, ++ }; +++#undef BPF_INSN_3_LBL +++#undef BPF_INSN_2_LBL ++ u32 tail_call_cnt = 0; ++- void *ptr; ++- int off; ++ ++ #define CONT ({ insn++; goto select_insn; }) ++ #define CONT_JMP ({ insn++; goto select_insn; }) ++ ++- FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; ++- ARG1 = (u64) (unsigned long) ctx; ++- ++- /* Registers used in classic BPF programs need to be reset first. */ ++- regs[BPF_REG_A] = 0; ++- regs[BPF_REG_X] = 0; ++- ++ select_insn: ++ goto *jumptable[insn->code]; ++ ++- /* ALU */ ++-#define ALU(OPCODE, OP) \ ++- ALU64_##OPCODE##_X: \ ++- DST = DST OP SRC; \ ++- CONT; \ ++- ALU_##OPCODE##_X: \ ++- DST = (u32) DST OP (u32) SRC; \ ++- CONT; \ ++- ALU64_##OPCODE##_K: \ ++- DST = DST OP IMM; \ ++- CONT; \ ++- ALU_##OPCODE##_K: \ ++- DST = (u32) DST OP (u32) IMM; \ +++ /* Explicitly mask the register-based shift amounts with 63 or 31 +++ * to avoid undefined behavior. Normally this won't affect the +++ * generated code, for example, in case of native 64 bit archs such +++ * as x86-64 or arm64, the compiler is optimizing the AND away for +++ * the interpreter. In case of JITs, each of the JIT backends compiles +++ * the BPF shift operations to machine instructions which produce +++ * implementation-defined results in such a case; the resulting +++ * contents of the register may be arbitrary, but program behaviour +++ * as a whole remains defined. In other words, in case of JIT backends, +++ * the AND must /not/ be added to the emitted LSH/RSH/ARSH translation. 
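The masking argued for in the comment above is what keeps the interpreter's C well defined: a 64-bit shift by 64 or more is undefined behaviour, whereas masking with 63 (or 31 for 32-bit ALU ops) pins down one result. A small demonstration with an out-of-range shift amount:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t dst = 0xff;
	uint64_t src = 68;				/* out-of-range shift amount */

	/* dst << src would be undefined; the interpreter instead computes: */
	uint64_t r64 = dst << (src & 63);		/* 68 & 63 == 4 -> 0xff0 */
	uint32_t r32 = (uint32_t)dst << ((uint32_t)src & 31);	/* 68 & 31 == 4  */

	printf("64-bit: 0x%llx  32-bit: 0x%x\n",
	       (unsigned long long)r64, r32);
	return 0;
}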
+++ */ +++ /* ALU (shifts) */ +++#define SHT(OPCODE, OP) \ +++ ALU64_##OPCODE##_X: \ +++ DST = DST OP (SRC & 63); \ +++ CONT; \ +++ ALU_##OPCODE##_X: \ +++ DST = (u32) DST OP ((u32) SRC & 31); \ +++ CONT; \ +++ ALU64_##OPCODE##_K: \ +++ DST = DST OP IMM; \ +++ CONT; \ +++ ALU_##OPCODE##_K: \ +++ DST = (u32) DST OP (u32) IMM; \ +++ CONT; +++ /* ALU (rest) */ +++#define ALU(OPCODE, OP) \ +++ ALU64_##OPCODE##_X: \ +++ DST = DST OP SRC; \ +++ CONT; \ +++ ALU_##OPCODE##_X: \ +++ DST = (u32) DST OP (u32) SRC; \ +++ CONT; \ +++ ALU64_##OPCODE##_K: \ +++ DST = DST OP IMM; \ +++ CONT; \ +++ ALU_##OPCODE##_K: \ +++ DST = (u32) DST OP (u32) IMM; \ ++ CONT; ++- ++ ALU(ADD, +) ++ ALU(SUB, -) ++ ALU(AND, &) ++ ALU(OR, |) ++- ALU(LSH, <<) ++- ALU(RSH, >>) ++ ALU(XOR, ^) ++ ALU(MUL, *) +++ SHT(LSH, <<) +++ SHT(RSH, >>) +++#undef SHT ++ #undef ALU ++ ALU_NEG: ++ DST = (u32) -DST; ++@@ -359,51 +1392,49 @@ select_insn: ++ DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32; ++ insn++; ++ CONT; +++ ALU_ARSH_X: +++ DST = (u64) (u32) (((s32) DST) >> (SRC & 31)); +++ CONT; +++ ALU_ARSH_K: +++ DST = (u64) (u32) (((s32) DST) >> IMM); +++ CONT; ++ ALU64_ARSH_X: ++- (*(s64 *) &DST) >>= SRC; +++ (*(s64 *) &DST) >>= (SRC & 63); ++ CONT; ++ ALU64_ARSH_K: ++ (*(s64 *) &DST) >>= IMM; ++ CONT; ++ ALU64_MOD_X: ++- if (unlikely(SRC == 0)) ++- return 0; ++- div64_u64_rem(DST, SRC, &tmp); ++- DST = tmp; +++ div64_u64_rem(DST, SRC, &AX); +++ DST = AX; ++ CONT; ++ ALU_MOD_X: ++- if (unlikely(SRC == 0)) ++- return 0; ++- tmp = (u32) DST; ++- DST = do_div(tmp, (u32) SRC); +++ AX = (u32) DST; +++ DST = do_div(AX, (u32) SRC); ++ CONT; ++ ALU64_MOD_K: ++- div64_u64_rem(DST, IMM, &tmp); ++- DST = tmp; +++ div64_u64_rem(DST, IMM, &AX); +++ DST = AX; ++ CONT; ++ ALU_MOD_K: ++- tmp = (u32) DST; ++- DST = do_div(tmp, (u32) IMM); +++ AX = (u32) DST; +++ DST = do_div(AX, (u32) IMM); ++ CONT; ++ ALU64_DIV_X: ++- if (unlikely(SRC == 0)) ++- return 0; ++ DST = div64_u64(DST, SRC); ++ CONT; ++ ALU_DIV_X: ++- if (unlikely(SRC == 0)) ++- return 0; ++- tmp = (u32) DST; ++- do_div(tmp, (u32) SRC); ++- DST = (u32) tmp; +++ AX = (u32) DST; +++ do_div(AX, (u32) SRC); +++ DST = (u32) AX; ++ CONT; ++ ALU64_DIV_K: ++ DST = div64_u64(DST, IMM); ++ CONT; ++ ALU_DIV_K: ++- tmp = (u32) DST; ++- do_div(tmp, (u32) IMM); ++- DST = (u32) tmp; +++ AX = (u32) DST; +++ do_div(AX, (u32) IMM); +++ DST = (u32) AX; ++ CONT; ++ ALU_END_TO_BE: ++ switch (IMM) { ++@@ -442,22 +1473,28 @@ select_insn: ++ BPF_R4, BPF_R5); ++ CONT; ++ +++ JMP_CALL_ARGS: +++ BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, +++ BPF_R3, BPF_R4, +++ BPF_R5, +++ insn + insn->off + 1); +++ CONT; +++ ++ JMP_TAIL_CALL: { ++ struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; ++ struct bpf_array *array = container_of(map, struct bpf_array, map); ++ struct bpf_prog *prog; ++- u64 index = BPF_R3; +++ u32 index = BPF_R3; ++ ++ if (unlikely(index >= array->map.max_entries)) ++ goto out; ++- ++ if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT)) ++ goto out; ++ ++ tail_call_cnt++; ++ ++ prog = READ_ONCE(array->ptrs[index]); ++- if (unlikely(!prog)) +++ if (!prog) ++ goto out; ++ ++ /* ARG1 at this point is guaranteed to point to CTX from ++@@ -470,97 +1507,49 @@ select_insn: ++ out: ++ CONT; ++ } ++- /* JMP */ ++ JMP_JA: ++ insn += insn->off; ++ CONT; ++- JMP_JEQ_X: ++- if (DST == SRC) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JEQ_K: ++- if (DST == IMM) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JNE_X: ++- if (DST != SRC) { ++- insn 
+= insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JNE_K: ++- if (DST != IMM) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JGT_X: ++- if (DST > SRC) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JGT_K: ++- if (DST > IMM) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JGE_X: ++- if (DST >= SRC) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JGE_K: ++- if (DST >= IMM) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JSGT_X: ++- if (((s64) DST) > ((s64) SRC)) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JSGT_K: ++- if (((s64) DST) > ((s64) IMM)) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JSGE_X: ++- if (((s64) DST) >= ((s64) SRC)) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JSGE_K: ++- if (((s64) DST) >= ((s64) IMM)) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JSET_X: ++- if (DST & SRC) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++- JMP_JSET_K: ++- if (DST & IMM) { ++- insn += insn->off; ++- CONT_JMP; ++- } ++- CONT; ++ JMP_EXIT: ++ return BPF_R0; ++- +++ /* JMP */ +++#define COND_JMP(SIGN, OPCODE, CMP_OP) \ +++ JMP_##OPCODE##_X: \ +++ if ((SIGN##64) DST CMP_OP (SIGN##64) SRC) { \ +++ insn += insn->off; \ +++ CONT_JMP; \ +++ } \ +++ CONT; \ +++ JMP32_##OPCODE##_X: \ +++ if ((SIGN##32) DST CMP_OP (SIGN##32) SRC) { \ +++ insn += insn->off; \ +++ CONT_JMP; \ +++ } \ +++ CONT; \ +++ JMP_##OPCODE##_K: \ +++ if ((SIGN##64) DST CMP_OP (SIGN##64) IMM) { \ +++ insn += insn->off; \ +++ CONT_JMP; \ +++ } \ +++ CONT; \ +++ JMP32_##OPCODE##_K: \ +++ if ((SIGN##32) DST CMP_OP (SIGN##32) IMM) { \ +++ insn += insn->off; \ +++ CONT_JMP; \ +++ } \ +++ CONT; +++ COND_JMP(u, JEQ, ==) +++ COND_JMP(u, JNE, !=) +++ COND_JMP(u, JGT, >) +++ COND_JMP(u, JLT, <) +++ COND_JMP(u, JGE, >=) +++ COND_JMP(u, JLE, <=) +++ COND_JMP(u, JSET, &) +++ COND_JMP(s, JSGT, >) +++ COND_JMP(s, JSLT, <) +++ COND_JMP(s, JSGE, >=) +++ COND_JMP(s, JSLE, <=) +++#undef COND_JMP ++ /* STX and ST and LDX*/ ++ #define LDST(SIZEOP, SIZE) \ ++ STX_MEM_##SIZEOP: \ ++@@ -586,77 +1575,108 @@ out: ++ atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) ++ (DST + insn->off)); ++ CONT; ++- LD_ABS_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + imm32)) */ ++- off = IMM; ++-load_word: ++- /* BPF_LD + BPD_ABS and BPF_LD + BPF_IND insns are ++- * only appearing in the programs where ctx == ++- * skb. All programs keep 'ctx' in regs[BPF_REG_CTX] ++- * == BPF_R6, bpf_convert_filter() saves it in BPF_R6, ++- * internal BPF verifier will check that BPF_R6 == ++- * ctx. ++- * ++- * BPF_ABS and BPF_IND are wrappers of function calls, ++- * so they scratch BPF_R1-BPF_R5 registers, preserve ++- * BPF_R6-BPF_R9, and store return value into BPF_R0. ++- * ++- * Implicit input: ++- * ctx == skb == BPF_R6 == CTX ++- * ++- * Explicit input: ++- * SRC == any register ++- * IMM == 32-bit immediate +++ +++ default_label: +++ /* If we ever reach this, we have a bug somewhere. Die hard here +++ * instead of just returning 0; we could be somewhere in a subprog, +++ * so execution could continue otherwise which we do /not/ want. ++ * ++- * Output: ++- * BPF_R0 - 8/16/32-bit skb data converted to cpu endianness +++ * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). 
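COND_JMP() above stamps out the _X/_K and JMP/JMP32 labels for each conditional jump; the only difference between the JMP and JMP32 variants is the width (and signedness cast) of the compare. Example values where the widths disagree:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t dst = 0xffffffff00000001ULL;	/* low word is 1, upper bits set */
	uint64_t src = 2;

	/* BPF_JMP | BPF_JLT compares the full 64-bit registers ... */
	printf("JLT    taken: %d\n", dst < src);			/* 0 */
	/* ... while BPF_JMP32 | BPF_JLT only compares the low 32 bits. */
	printf("JLT32  taken: %d\n", (uint32_t)dst < (uint32_t)src);	/* 1 */
	/* The signed variants cast to s64 / s32 in the same way. */
	printf("JSLT   taken: %d\n", (int64_t)dst < (int64_t)src);	/* 1, dst is negative as s64 */
	printf("JSLT32 taken: %d\n", (int32_t)(uint32_t)dst < (int32_t)src); /* 1, 1 < 2 */
	return 0;
}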
++ */ +++ pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); +++ BUG_ON(1); +++ return 0; +++} ++ ++- ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 4, &tmp); ++- if (likely(ptr != NULL)) { ++- BPF_R0 = get_unaligned_be32(ptr); ++- CONT; ++- } +++#define PROG_NAME(stack_size) __bpf_prog_run##stack_size +++#define DEFINE_BPF_PROG_RUN(stack_size) \ +++static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ +++{ \ +++ u64 stack[stack_size / sizeof(u64)]; \ +++ u64 regs[MAX_BPF_EXT_REG]; \ +++\ +++ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ +++ ARG1 = (u64) (unsigned long) ctx; \ +++ return ___bpf_prog_run(regs, insn, stack); \ +++} ++ ++- return 0; ++- LD_ABS_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + imm32)) */ ++- off = IMM; ++-load_half: ++- ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 2, &tmp); ++- if (likely(ptr != NULL)) { ++- BPF_R0 = get_unaligned_be16(ptr); ++- CONT; ++- } +++#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size +++#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ +++static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ +++ const struct bpf_insn *insn) \ +++{ \ +++ u64 stack[stack_size / sizeof(u64)]; \ +++ u64 regs[MAX_BPF_EXT_REG]; \ +++\ +++ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ +++ BPF_R1 = r1; \ +++ BPF_R2 = r2; \ +++ BPF_R3 = r3; \ +++ BPF_R4 = r4; \ +++ BPF_R5 = r5; \ +++ return ___bpf_prog_run(regs, insn, stack); \ +++} ++ ++- return 0; ++- LD_ABS_B: /* BPF_R0 = *(u8 *) (skb->data + imm32) */ ++- off = IMM; ++-load_byte: ++- ptr = bpf_load_pointer((struct sk_buff *) (unsigned long) CTX, off, 1, &tmp); ++- if (likely(ptr != NULL)) { ++- BPF_R0 = *(u8 *)ptr; ++- CONT; ++- } +++#define EVAL1(FN, X) FN(X) +++#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) +++#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) +++#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y) +++#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y) +++#define EVAL6(FN, X, Y...) 
FN(X) EVAL5(FN, Y) +++ +++EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); +++EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); +++EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); +++ +++EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); +++EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); +++EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); +++ +++#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), +++ +++static unsigned int (*interpreters[])(const void *ctx, +++ const struct bpf_insn *insn) = { +++EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) +++EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) +++EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) +++}; +++#undef PROG_NAME_LIST +++#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), +++static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, +++ const struct bpf_insn *insn) = { +++EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) +++EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) +++EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) +++}; +++#undef PROG_NAME_LIST ++ ++- return 0; ++- LD_IND_W: /* BPF_R0 = ntohl(*(u32 *) (skb->data + src_reg + imm32)) */ ++- off = IMM + SRC; ++- goto load_word; ++- LD_IND_H: /* BPF_R0 = ntohs(*(u16 *) (skb->data + src_reg + imm32)) */ ++- off = IMM + SRC; ++- goto load_half; ++- LD_IND_B: /* BPF_R0 = *(u8 *) (skb->data + src_reg + imm32) */ ++- off = IMM + SRC; ++- goto load_byte; +++void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) +++{ +++ stack_depth = max_t(u32, stack_depth, 1); +++ insn->off = (s16) insn->imm; +++ insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - +++ __bpf_call_base_args; +++ insn->code = BPF_JMP | BPF_CALL_ARGS; +++} ++ ++- default_label: ++- /* If we ever reach this, we have a bug somewhere. */ ++- WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); ++- return 0; +++#else +++static unsigned int __bpf_prog_ret0_warn(const void *ctx, +++ const struct bpf_insn *insn) +++{ +++ /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON +++ * is not working properly, so warn about it! +++ */ +++ WARN_ON_ONCE(1); +++ return 0; ++ } +++#endif ++ ++ bool bpf_prog_array_compatible(struct bpf_array *array, ++ const struct bpf_prog *fp) ++ { +++ if (fp->kprobe_override) +++ return false; +++ ++ if (!array->owner_prog_type) { ++ /* There's no owner yet where we could check for ++ * compatibility. ++@@ -691,18 +1711,62 @@ static int bpf_check_tail_call(const str ++ return 0; ++ } ++ +++static void bpf_prog_select_func(struct bpf_prog *fp) +++{ +++#ifndef CONFIG_BPF_JIT_ALWAYS_ON +++ u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); +++ +++ fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +++#else +++ fp->bpf_func = __bpf_prog_ret0_warn; +++#endif +++} +++ ++ /** ++ * bpf_prog_select_runtime - select exec runtime for BPF program ++ * @fp: bpf_prog populated with internal BPF program +++ * @err: pointer to error variable ++ * ++ * Try to JIT eBPF program, if JIT is not available, use interpreter. ++ * The BPF program will be executed via BPF_PROG_RUN() macro. ++ */ ++-int bpf_prog_select_runtime(struct bpf_prog *fp) +++struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) ++ { ++- fp->bpf_func = (void *) __bpf_prog_run; +++ /* In case of BPF to BPF calls, verifier did all the prep +++ * work with regards to JITing, etc. 
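DEFINE_BPF_PROG_RUN() above specialises the interpreter for stack sizes 32..512 in steps of 32, and bpf_prog_select_func() picks one by rounding the verifier's stack_depth up to the next multiple of 32. The index arithmetic, run over a few sample depths:

#include <stdio.h>

static unsigned int round_up_32(unsigned int x)
{
	return (x + 31) & ~31u;
}

int main(void)
{
	unsigned int depths[] = { 0, 1, 32, 33, 511, 512 };

	for (unsigned int i = 0; i < sizeof(depths) / sizeof(depths[0]); i++) {
		unsigned int d = depths[i] ? depths[i] : 1;	/* max_t(u32, depth, 1) */
		unsigned int idx = round_up_32(d) / 32 - 1;
		printf("stack_depth %3u -> interpreters[%2u] (%3u byte stack)\n",
		       depths[i], idx, (idx + 1) * 32);
	}
	return 0;
}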
+++ */ +++ if (fp->bpf_func) +++ goto finalize; +++ +++ bpf_prog_select_func(fp); +++ +++ /* eBPF JITs can rewrite the program in case constant +++ * blinding is active. However, in case of error during +++ * blinding, bpf_int_jit_compile() must always return a +++ * valid program, which in this case would simply not +++ * be JITed, but falls back to the interpreter. +++ */ +++ if (!bpf_prog_is_dev_bound(fp->aux)) { +++ *err = bpf_prog_alloc_jited_linfo(fp); +++ if (*err) +++ return fp; +++ +++ fp = bpf_int_jit_compile(fp); +++ if (!fp->jited) { +++ bpf_prog_free_jited_linfo(fp); +++#ifdef CONFIG_BPF_JIT_ALWAYS_ON +++ *err = -ENOTSUPP; +++ return fp; +++#endif +++ } else { +++ bpf_prog_free_unused_jited_linfo(fp); +++ } +++ } else { +++ *err = -EINVAL; +++ return fp; +++ } ++ ++- bpf_int_jit_compile(fp); +++finalize: ++ bpf_prog_lock_ro(fp); ++ ++ /* The tail call compatibility check can only be done at ++@@ -710,16 +1774,238 @@ int bpf_prog_select_runtime(struct bpf_p ++ * with JITed or non JITed program concatenations and not ++ * all eBPF JITs might immediately support all features. ++ */ ++- return bpf_check_tail_call(fp); +++ *err = bpf_check_tail_call(fp); +++ +++ return fp; ++ } ++ EXPORT_SYMBOL_GPL(bpf_prog_select_runtime); ++ +++static unsigned int __bpf_prog_ret1(const void *ctx, +++ const struct bpf_insn *insn) +++{ +++ return 1; +++} +++ +++static struct bpf_prog_dummy { +++ struct bpf_prog prog; +++} dummy_bpf_prog = { +++ .prog = { +++ .bpf_func = __bpf_prog_ret1, +++ }, +++}; +++ +++/* to avoid allocating empty bpf_prog_array for cgroups that +++ * don't have bpf program attached use one global 'empty_prog_array' +++ * It will not be modified the caller of bpf_prog_array_alloc() +++ * (since caller requested prog_cnt == 0) +++ * that pointer should be 'freed' by bpf_prog_array_free() +++ */ +++static struct { +++ struct bpf_prog_array hdr; +++ struct bpf_prog *null_prog; +++} empty_prog_array = { +++ .null_prog = NULL, +++}; +++ +++struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags) +++{ +++ if (prog_cnt) +++ return kzalloc(sizeof(struct bpf_prog_array) + +++ sizeof(struct bpf_prog_array_item) * +++ (prog_cnt + 1), +++ flags); +++ +++ return &empty_prog_array.hdr; +++} +++ +++void bpf_prog_array_free(struct bpf_prog_array *progs) +++{ +++ if (!progs || progs == &empty_prog_array.hdr) +++ return; +++ kfree_rcu(progs, rcu); +++} +++ +++int bpf_prog_array_length(struct bpf_prog_array *array) +++{ +++ struct bpf_prog_array_item *item; +++ u32 cnt = 0; +++ +++ for (item = array->items; item->prog; item++) +++ if (item->prog != &dummy_bpf_prog.prog) +++ cnt++; +++ return cnt; +++} +++ +++bool bpf_prog_array_is_empty(struct bpf_prog_array *array) +++{ +++ struct bpf_prog_array_item *item; +++ +++ for (item = array->items; item->prog; item++) +++ if (item->prog != &dummy_bpf_prog.prog) +++ return false; +++ return true; +++} +++ +++static bool bpf_prog_array_copy_core(struct bpf_prog_array *array, +++ u32 *prog_ids, +++ u32 request_cnt) +++{ +++ struct bpf_prog_array_item *item; +++ int i = 0; +++ +++ for (item = array->items; item->prog; item++) { +++ if (item->prog == &dummy_bpf_prog.prog) +++ continue; +++ prog_ids[i] = item->prog->aux->id; +++ if (++i == request_cnt) { +++ item++; +++ break; +++ } +++ } +++ +++ return !!(item->prog); +++} +++ +++int bpf_prog_array_copy_to_user(struct bpf_prog_array *array, +++ __u32 __user *prog_ids, u32 cnt) +++{ +++ unsigned long err = 0; +++ bool nospc; +++ u32 *ids; +++ +++ /* users of this function are doing: +++ * cnt = 
bpf_prog_array_length(); +++ * if (cnt > 0) +++ * bpf_prog_array_copy_to_user(..., cnt); +++ * so below kcalloc doesn't need extra cnt > 0 check. +++ */ +++ ids = kcalloc(cnt, sizeof(u32), GFP_USER | __GFP_NOWARN); +++ if (!ids) +++ return -ENOMEM; +++ nospc = bpf_prog_array_copy_core(array, ids, cnt); +++ err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); +++ kfree(ids); +++ if (err) +++ return -EFAULT; +++ if (nospc) +++ return -ENOSPC; +++ return 0; +++} +++ +++void bpf_prog_array_delete_safe(struct bpf_prog_array *array, +++ struct bpf_prog *old_prog) +++{ +++ struct bpf_prog_array_item *item; +++ +++ for (item = array->items; item->prog; item++) +++ if (item->prog == old_prog) { +++ WRITE_ONCE(item->prog, &dummy_bpf_prog.prog); +++ break; +++ } +++} +++ +++int bpf_prog_array_copy(struct bpf_prog_array *old_array, +++ struct bpf_prog *exclude_prog, +++ struct bpf_prog *include_prog, +++ struct bpf_prog_array **new_array) +++{ +++ int new_prog_cnt, carry_prog_cnt = 0; +++ struct bpf_prog_array_item *existing; +++ struct bpf_prog_array *array; +++ bool found_exclude = false; +++ int new_prog_idx = 0; +++ +++ /* Figure out how many existing progs we need to carry over to +++ * the new array. +++ */ +++ if (old_array) { +++ existing = old_array->items; +++ for (; existing->prog; existing++) { +++ if (existing->prog == exclude_prog) { +++ found_exclude = true; +++ continue; +++ } +++ if (existing->prog != &dummy_bpf_prog.prog) +++ carry_prog_cnt++; +++ if (existing->prog == include_prog) +++ return -EEXIST; +++ } +++ } +++ +++ if (exclude_prog && !found_exclude) +++ return -ENOENT; +++ +++ /* How many progs (not NULL) will be in the new array? */ +++ new_prog_cnt = carry_prog_cnt; +++ if (include_prog) +++ new_prog_cnt += 1; +++ +++ /* Do we have any prog (not NULL) in the new array? */ +++ if (!new_prog_cnt) { +++ *new_array = NULL; +++ return 0; +++ } +++ +++ /* +1 as the end of prog_array is marked with NULL */ +++ array = bpf_prog_array_alloc(new_prog_cnt + 1, GFP_KERNEL); +++ if (!array) +++ return -ENOMEM; +++ +++ /* Fill in the new prog array */ +++ if (carry_prog_cnt) { +++ existing = old_array->items; +++ for (; existing->prog; existing++) +++ if (existing->prog != exclude_prog && +++ existing->prog != &dummy_bpf_prog.prog) { +++ array->items[new_prog_idx++].prog = +++ existing->prog; +++ } +++ } +++ if (include_prog) +++ array->items[new_prog_idx++].prog = include_prog; +++ array->items[new_prog_idx].prog = NULL; +++ *new_array = array; +++ return 0; +++} +++ +++int bpf_prog_array_copy_info(struct bpf_prog_array *array, +++ u32 *prog_ids, u32 request_cnt, +++ u32 *prog_cnt) +++{ +++ u32 cnt = 0; +++ +++ if (array) +++ cnt = bpf_prog_array_length(array); +++ +++ *prog_cnt = cnt; +++ +++ /* return early if user requested only program count or nothing to copy */ +++ if (!request_cnt || !cnt) +++ return 0; +++ +++ /* this function is called under trace/bpf_trace.c: bpf_event_mutex */ +++ return bpf_prog_array_copy_core(array, prog_ids, request_cnt) ? 
-ENOSPC +++ : 0; +++} +++ ++ static void bpf_prog_free_deferred(struct work_struct *work) ++ { ++ struct bpf_prog_aux *aux; +++ int i; ++ ++ aux = container_of(work, struct bpf_prog_aux, work); ++- bpf_jit_free(aux->prog); +++#ifdef CONFIG_PERF_EVENTS +++ if (aux->prog->has_callchain_buf) +++ put_callchain_buffers(); +++#endif +++ for (i = 0; i < aux->func_cnt; i++) +++ bpf_jit_free(aux->func[i]); +++ if (aux->func_cnt) { +++ kfree(aux->func); +++ bpf_prog_unlock_free(aux->prog); +++ } else { +++ bpf_jit_free(aux->prog); +++ } ++ } ++ ++ /* Free internal BPF program */ ++@@ -740,7 +2026,7 @@ void bpf_user_rnd_init_once(void) ++ prandom_init_once(&bpf_user_rnd_state); ++ } ++ ++-u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +++BPF_CALL_0(bpf_user_rnd_u32) ++ { ++ /* Should someone ever have the rather unwise idea to use some ++ * of the registers passed into this function, then note that ++@@ -753,7 +2039,7 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 ++ ++ state = &get_cpu_var(bpf_user_rnd_state); ++ res = prandom_u32_state(state); ++- put_cpu_var(state); +++ put_cpu_var(bpf_user_rnd_state); ++ ++ return res; ++ } ++@@ -762,18 +2048,36 @@ u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 ++ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; ++ const struct bpf_func_proto bpf_map_update_elem_proto __weak; ++ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; +++const struct bpf_func_proto bpf_map_push_elem_proto __weak; +++const struct bpf_func_proto bpf_map_pop_elem_proto __weak; +++const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +++const struct bpf_func_proto bpf_spin_lock_proto __weak; +++const struct bpf_func_proto bpf_spin_unlock_proto __weak; ++ ++ const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; ++ const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; +++const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; ++ const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; +++ ++ const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; ++ const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; ++ const struct bpf_func_proto bpf_get_current_comm_proto __weak; +++const struct bpf_func_proto bpf_get_current_cgroup_id_proto __weak; +++const struct bpf_func_proto bpf_get_local_storage_proto __weak; +++ ++ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void) ++ { ++ return NULL; ++ } ++ +++u64 __weak +++bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, +++ void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) +++{ +++ return -ENOTSUPP; +++} +++EXPORT_SYMBOL_GPL(bpf_event_output); +++ ++ /* Always built-in helper functions. */ ++ const struct bpf_func_proto bpf_tail_call_proto = { ++ .func = NULL, ++@@ -784,9 +2088,34 @@ const struct bpf_func_proto bpf_tail_cal ++ .arg3_type = ARG_ANYTHING, ++ }; ++ ++-/* For classic BPF JITs that don't implement bpf_int_jit_compile(). */ ++-void __weak bpf_int_jit_compile(struct bpf_prog *prog) +++/* Stub for JITs that only support cBPF. eBPF programs are interpreted. +++ * It is encouraged to implement bpf_int_jit_compile() instead, so that +++ * eBPF and implicitly also cBPF can get JITed! +++ */ +++struct bpf_prog * __weak bpf_int_jit_compile(struct bpf_prog *prog) ++ { +++ return prog; +++} +++ +++/* Stub for JITs that support eBPF. All cBPF code gets transformed into +++ * eBPF by the kernel and is later compiled by bpf_int_jit_compile(). 
+++ */ +++void __weak bpf_jit_compile(struct bpf_prog *prog) +++{ +++} +++ +++bool __weak bpf_helper_changes_pkt_data(void *func) +++{ +++ return false; +++} +++ +++/* Return TRUE if the JIT backend wants verifier to enable sub-register usage +++ * analysis code and wants explicit zero extension inserted by verifier. +++ * Otherwise, return FALSE. +++ */ +++bool __weak bpf_jit_needs_zext(void) +++{ +++ return false; ++ } ++ ++ /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call ++@@ -797,3 +2126,13 @@ int __weak skb_copy_bits(const struct sk ++ { ++ return -EFAULT; ++ } +++ +++DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key); +++EXPORT_SYMBOL(bpf_stats_enabled_key); +++ +++/* All definitions of tracepoints related to BPF. */ +++#define CREATE_TRACE_POINTS +++#include +++ +++EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_exception); +++EXPORT_TRACEPOINT_SYMBOL_GPL(xdp_bulk_tx); ++--- /dev/null +++++ b/kernel/bpf/devmap.c ++@@ -0,0 +1,698 @@ +++// SPDX-License-Identifier: GPL-2.0-only +++/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io +++ */ +++ +++/* Devmaps primary use is as a backend map for XDP BPF helper call +++ * bpf_redirect_map(). Because XDP is mostly concerned with performance we +++ * spent some effort to ensure the datapath with redirect maps does not use +++ * any locking. This is a quick note on the details. +++ * +++ * We have three possible paths to get into the devmap control plane bpf +++ * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall +++ * will invoke an update, delete, or lookup operation. To ensure updates and +++ * deletes appear atomic from the datapath side xchg() is used to modify the +++ * netdev_map array. Then because the datapath does a lookup into the netdev_map +++ * array (read-only) from an RCU critical section we use call_rcu() to wait for +++ * an rcu grace period before free'ing the old data structures. This ensures the +++ * datapath always has a valid copy. However, the datapath does a "flush" +++ * operation that pushes any pending packets in the driver outside the RCU +++ * critical section. Each bpf_dtab_netdev tracks these pending operations using +++ * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until +++ * this list is empty, indicating outstanding flush operations have completed. +++ * +++ * BPF syscalls may race with BPF program calls on any of the update, delete +++ * or lookup operations. As noted above the xchg() operation also keep the +++ * netdev_map consistent in this case. From the devmap side BPF programs +++ * calling into these operations are the same as multiple user space threads +++ * making system calls. +++ * +++ * Finally, any of the above may race with a netdev_unregister notifier. The +++ * unregister notifier must search for net devices in the map structure that +++ * contain a reference to the net device and remove them. This is a two step +++ * process (a) dereference the bpf_dtab_netdev object in netdev_map and (b) +++ * check to see if the ifindex is the same as the net_device being removed. +++ * When removing the dev a cmpxchg() is used to ensure the correct dev is +++ * removed, in the case of a concurrent update or delete operation it is +++ * possible that the initially referenced dev is no longer in the map. As the +++ * notifier hook walks the map we know that new dev references can not be +++ * added by the user because core infrastructure ensures dev_get_by_index() +++ * calls will fail at this point. 
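To make the data path described above concrete, a DEVMAP is normally driven from an XDP program through the bpf_redirect_map() helper named at the top of this comment. The sketch below is illustrative only and not part of this patch; the map and function names are made up, the BTF-style map definition assumes libbpf's bpf_helpers.h, and slot 0 is expected to have been filled with an ifindex by user space.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* One slot per egress device; user space stores ifindexes in the values. */
struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(max_entries, 8);
	__type(key, __u32);   /* array index */
	__type(value, __u32); /* ifindex */
} tx_ports SEC(".maps");

SEC("xdp")
int xdp_redirect_slot0(struct xdp_md *ctx)
{
	/* On success the helper returns XDP_REDIRECT and the frame is handed
	 * to the map's redirect path (note that dev_map_enqueue() below is
	 * stubbed to -EOPNOTSUPP in this backport).
	 */
	return bpf_redirect_map(&tx_ports, 0, 0);
}

char _license[] SEC("license") = "GPL";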
+++ * +++ * The devmap_hash type is a map type which interprets keys as ifindexes and +++ * indexes these using a hashmap. This allows maps that use ifindex as key to be +++ * densely packed instead of having holes in the lookup array for unused +++ * ifindexes. The setup and packet enqueue/send code is shared between the two +++ * types of devmap; only the lookup and insertion is different. +++ */ +++#include +++#include +++#include +++#include +++ +++#define DEV_CREATE_FLAG_MASK \ +++ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +++ +++#define DEV_MAP_BULK_SIZE 16 +++struct bpf_dtab_netdev; +++ +++struct xdp_bulk_queue { +++ struct xdp_frame *q[DEV_MAP_BULK_SIZE]; +++ struct list_head flush_node; +++ struct net_device *dev_rx; +++ struct bpf_dtab_netdev *obj; +++ unsigned int count; +++}; +++ +++struct bpf_dtab_netdev { +++ struct net_device *dev; /* must be first member, due to tracepoint */ +++ struct hlist_node index_hlist; +++ struct bpf_dtab *dtab; +++ struct xdp_bulk_queue __percpu *bulkq; +++ struct rcu_head rcu; +++ unsigned int idx; /* keep track of map index for tracepoint */ +++}; +++ +++struct bpf_dtab { +++ struct bpf_map map; +++ struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */ +++ struct list_head __percpu *flush_list; +++ struct list_head list; +++ +++ /* these are only used for DEVMAP_HASH type maps */ +++ struct hlist_head *dev_index_head; +++ spinlock_t index_lock; +++ unsigned int items; +++ u32 n_buckets; +++}; +++ +++static DEFINE_SPINLOCK(dev_map_lock); +++static LIST_HEAD(dev_map_list); +++ +++static struct hlist_head *dev_map_create_hash(unsigned int entries, +++ int numa_node) +++{ +++ int i; +++ struct hlist_head *hash; +++ +++ hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node); +++ if (hash != NULL) +++ for (i = 0; i < entries; i++) +++ INIT_HLIST_HEAD(&hash[i]); +++ +++ return hash; +++} +++ +++static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, +++ int idx) +++{ +++ return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)]; +++} +++ +++static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) +++{ +++ int err, cpu; +++ u64 cost; +++ +++ /* check sanity of attributes */ +++ if (attr->max_entries == 0 || attr->key_size != 4 || +++ attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) +++ return -EINVAL; +++ +++ /* Lookup returns a pointer straight to dev->ifindex, so make sure the +++ * verifier prevents writes from the BPF side +++ */ +++ attr->map_flags |= BPF_F_RDONLY_PROG; +++ +++ +++ bpf_map_init_from_attr(&dtab->map, attr); +++ +++ /* make sure page count doesn't overflow */ +++ cost = (u64) sizeof(struct list_head) * num_possible_cpus(); +++ +++ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { +++ dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries); +++ +++ if (!dtab->n_buckets) /* Overflow check */ +++ return -EINVAL; +++ cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; +++ } else { +++ cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); +++ } +++ +++ /* if map size is larger than memlock limit, reject it */ +++ err = bpf_map_charge_init(&dtab->map.memory, cost); +++ if (err) +++ return -EINVAL; +++ +++ dtab->flush_list = alloc_percpu(struct list_head); +++ if (!dtab->flush_list) +++ goto free_charge; +++ +++ for_each_possible_cpu(cpu) +++ INIT_LIST_HEAD(per_cpu_ptr(dtab->flush_list, cpu)); +++ +++ if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { +++ dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, +++ dtab->map.numa_node); +++ 
if (!dtab->dev_index_head) +++ goto free_percpu; +++ +++ spin_lock_init(&dtab->index_lock); +++ } else { +++ dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * +++ sizeof(struct bpf_dtab_netdev *), +++ dtab->map.numa_node); +++ if (!dtab->netdev_map) +++ goto free_percpu; +++ } +++ +++ return 0; +++ +++free_percpu: +++ free_percpu(dtab->flush_list); +++free_charge: +++ bpf_map_charge_finish(&dtab->map.memory); +++ return -ENOMEM; +++} +++ +++static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +++{ +++ struct bpf_dtab *dtab; +++ int err; +++ +++ if (!capable(CAP_NET_ADMIN)) +++ return ERR_PTR(-EPERM); +++ +++ dtab = kzalloc(sizeof(*dtab), GFP_USER); +++ if (!dtab) +++ return ERR_PTR(-ENOMEM); +++ +++ err = dev_map_init_map(dtab, attr); +++ if (err) { +++ kfree(dtab); +++ return ERR_PTR(err); +++ } +++ +++ spin_lock(&dev_map_lock); +++ list_add_tail_rcu(&dtab->list, &dev_map_list); +++ spin_unlock(&dev_map_lock); +++ +++ return &dtab->map; +++} +++ +++static void dev_map_free(struct bpf_map *map) +++{ +++ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +++ int i, cpu; +++ +++ /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, +++ * so the programs (can be more than one that used this map) were +++ * disconnected from events. Wait for outstanding critical sections in +++ * these programs to complete. The rcu critical section only guarantees +++ * no further reads against netdev_map. It does __not__ ensure pending +++ * flush operations (if any) are complete. +++ */ +++ +++ spin_lock(&dev_map_lock); +++ list_del_rcu(&dtab->list); +++ spin_unlock(&dev_map_lock); +++ +++ bpf_clear_redirect_map(map); +++ synchronize_rcu(); +++ +++ /* Make sure prior __dev_map_entry_free() have completed. */ +++ rcu_barrier(); +++ +++ /* To ensure all pending flush operations have completed wait for flush +++ * list to empty on _all_ cpus. +++ * Because the above synchronize_rcu() ensures the map is disconnected +++ * from the program we can assume no new items will be added. +++ */ +++ for_each_online_cpu(cpu) { +++ struct list_head *flush_list = per_cpu_ptr(dtab->flush_list, cpu); +++ +++ while (!list_empty(flush_list)) +++ cond_resched(); +++ } +++ +++ if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { +++ for (i = 0; i < dtab->n_buckets; i++) { +++ struct bpf_dtab_netdev *dev; +++ struct hlist_head *head; +++ struct hlist_node *next; +++ +++ head = dev_map_index_hash(dtab, i); +++ +++ hlist_for_each_entry_safe(dev, next, head, index_hlist) { +++ hlist_del_rcu(&dev->index_hlist); +++ free_percpu(dev->bulkq); +++ dev_put(dev->dev); +++ kfree(dev); +++ } +++ } +++ +++ bpf_map_area_free(dtab->dev_index_head); +++ } else { +++ for (i = 0; i < dtab->map.max_entries; i++) { +++ struct bpf_dtab_netdev *dev; +++ +++ dev = dtab->netdev_map[i]; +++ if (!dev) +++ continue; +++ +++ free_percpu(dev->bulkq); +++ dev_put(dev->dev); +++ kfree(dev); +++ } +++ +++ bpf_map_area_free(dtab->netdev_map); +++ } +++ +++ free_percpu(dtab->flush_list); +++ kfree(dtab); +++} +++ +++static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +++{ +++ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +++ u32 index = key ? 
*(u32 *)key : U32_MAX; +++ u32 *next = next_key; +++ +++ if (index >= dtab->map.max_entries) { +++ *next = 0; +++ return 0; +++ } +++ +++ if (index == dtab->map.max_entries - 1) +++ return -ENOENT; +++ *next = index + 1; +++ return 0; +++} +++ +++struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key) +++{ +++ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +++ struct hlist_head *head = dev_map_index_hash(dtab, key); +++ struct bpf_dtab_netdev *dev; +++ +++ hlist_for_each_entry_rcu(dev, head, index_hlist) +++ if (dev->idx == key) +++ return dev; +++ +++ return NULL; +++} +++ +++static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, +++ void *next_key) +++{ +++ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +++ u32 idx, *next = next_key; +++ struct bpf_dtab_netdev *dev, *next_dev; +++ struct hlist_head *head; +++ int i = 0; +++ +++ if (!key) +++ goto find_first; +++ +++ idx = *(u32 *)key; +++ +++ dev = __dev_map_hash_lookup_elem(map, idx); +++ if (!dev) +++ goto find_first; +++ +++ next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)), +++ struct bpf_dtab_netdev, index_hlist); +++ +++ if (next_dev) { +++ *next = next_dev->idx; +++ return 0; +++ } +++ +++ i = idx & (dtab->n_buckets - 1); +++ i++; +++ +++ find_first: +++ for (; i < dtab->n_buckets; i++) { +++ head = dev_map_index_hash(dtab, i); +++ +++ next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), +++ struct bpf_dtab_netdev, +++ index_hlist); +++ if (next_dev) { +++ *next = next_dev->idx; +++ return 0; +++ } +++ } +++ +++ return -ENOENT; +++} +++ +++/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled +++ * from the driver before returning from its napi->poll() routine. The poll() +++ * routine is called either from busy_poll context or net_rx_action signaled +++ * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the +++ * net device can be torn down. On devmap tear down we ensure the flush list +++ * is empty before completing to ensure all flush operations have completed. +++ */ +++void __dev_map_flush(struct bpf_map *map) +++{ +++} +++ +++/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or +++ * update happens in parallel here a dev_put wont happen until after reading the +++ * ifindex. +++ */ +++struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +++{ +++ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +++ struct bpf_dtab_netdev *obj; +++ +++ if (key >= map->max_entries) +++ return NULL; +++ +++ obj = READ_ONCE(dtab->netdev_map[key]); +++ return obj; +++} +++ +++int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, +++ struct net_device *dev_rx) +++{ +++ return -EOPNOTSUPP; +++} +++ +++int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, +++ struct bpf_prog *xdp_prog) +++{ +++ return -EOPNOTSUPP; +++} +++ +++static void *dev_map_lookup_elem(struct bpf_map *map, void *key) +++{ +++ struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key); +++ struct net_device *dev = obj ? obj->dev : NULL; +++ +++ return dev ? &dev->ifindex : NULL; +++} +++ +++static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key) +++{ +++ struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map, +++ *(u32 *)key); +++ struct net_device *dev = obj ? obj->dev : NULL; +++ +++ return dev ? 
&dev->ifindex : NULL; +++} +++ +++static void __dev_map_entry_free(struct rcu_head *rcu) +++{ +++ struct bpf_dtab_netdev *dev; +++ +++ dev = container_of(rcu, struct bpf_dtab_netdev, rcu); +++ free_percpu(dev->bulkq); +++ dev_put(dev->dev); +++ kfree(dev); +++} +++ +++static int dev_map_delete_elem(struct bpf_map *map, void *key) +++{ +++ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +++ struct bpf_dtab_netdev *old_dev; +++ int k = *(u32 *)key; +++ +++ if (k >= map->max_entries) +++ return -EINVAL; +++ +++ /* Use call_rcu() here to ensure any rcu critical sections have +++ * completed, but this does not guarantee a flush has happened +++ * yet. Because driver side rcu_read_lock/unlock only protects the +++ * running XDP program. However, for pending flush operations the +++ * dev and ctx are stored in another per cpu map. And additionally, +++ * the driver tear down ensures all soft irqs are complete before +++ * removing the net device in the case of dev_put equals zero. +++ */ +++ old_dev = xchg(&dtab->netdev_map[k], NULL); +++ if (old_dev) +++ call_rcu(&old_dev->rcu, __dev_map_entry_free); +++ return 0; +++} +++ +++static int dev_map_hash_delete_elem(struct bpf_map *map, void *key) +++{ +++ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +++ struct bpf_dtab_netdev *old_dev; +++ int k = *(u32 *)key; +++ unsigned long flags; +++ int ret = -ENOENT; +++ +++ spin_lock_irqsave(&dtab->index_lock, flags); +++ +++ old_dev = __dev_map_hash_lookup_elem(map, k); +++ if (old_dev) { +++ dtab->items--; +++ hlist_del_init_rcu(&old_dev->index_hlist); +++ call_rcu(&old_dev->rcu, __dev_map_entry_free); +++ ret = 0; +++ } +++ spin_unlock_irqrestore(&dtab->index_lock, flags); +++ +++ return ret; +++} +++ +++static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, +++ struct bpf_dtab *dtab, +++ u32 ifindex, +++ unsigned int idx) +++{ +++ gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; +++ struct bpf_dtab_netdev *dev; +++ struct xdp_bulk_queue *bq; +++ int cpu; +++ +++ dev = kmalloc_node(sizeof(*dev), gfp, dtab->map.numa_node); +++ if (!dev) +++ return ERR_PTR(-ENOMEM); +++ +++ dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq), +++ sizeof(void *), gfp); +++ if (!dev->bulkq) { +++ kfree(dev); +++ return ERR_PTR(-ENOMEM); +++ } +++ +++ for_each_possible_cpu(cpu) { +++ bq = per_cpu_ptr(dev->bulkq, cpu); +++ bq->obj = dev; +++ } +++ +++ dev->dev = dev_get_by_index(net, ifindex); +++ if (!dev->dev) { +++ free_percpu(dev->bulkq); +++ kfree(dev); +++ return ERR_PTR(-EINVAL); +++ } +++ +++ dev->idx = idx; +++ dev->dtab = dtab; +++ +++ return dev; +++} +++ +++static int __dev_map_update_elem(struct net *net, struct bpf_map *map, +++ void *key, void *value, u64 map_flags) +++{ +++ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +++ struct bpf_dtab_netdev *dev, *old_dev; +++ u32 ifindex = *(u32 *)value; +++ u32 i = *(u32 *)key; +++ +++ if (unlikely(map_flags > BPF_EXIST)) +++ return -EINVAL; +++ if (unlikely(i >= dtab->map.max_entries)) +++ return -E2BIG; +++ if (unlikely(map_flags == BPF_NOEXIST)) +++ return -EEXIST; +++ +++ if (!ifindex) { +++ dev = NULL; +++ } else { +++ dev = __dev_map_alloc_node(net, dtab, ifindex, i); +++ if (IS_ERR(dev)) +++ return PTR_ERR(dev); +++ } +++ +++ /* Use call_rcu() here to ensure rcu critical sections have completed +++ * Remembering the driver side flush operation will happen before the +++ * net device is removed. 
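From the control-plane side, a matching user-space sequence might look like the sketch below. It is illustrative only and built on the raw bpf(2) syscall so that no particular libbpf version is assumed. It follows the constraints visible above: dev_map_init_map() insists on 4-byte keys and values, dev_map_alloc() requires CAP_NET_ADMIN, a BPF_NOEXIST update is rejected with -EEXIST, and writing an ifindex of 0 clears a slot.

#include <linux/bpf.h>
#include <net/if.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr)
{
	return syscall(__NR_bpf, cmd, attr, sizeof(*attr));
}

/* Create a small DEVMAP and point slot 0 at the given interface. */
static int devmap_set_slot0(const char *ifname)
{
	__u32 key = 0, ifindex = if_nametoindex(ifname);
	union bpf_attr attr;
	int map_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_DEVMAP;
	attr.key_size    = 4;		/* array index */
	attr.value_size  = 4;		/* ifindex */
	attr.max_entries = 8;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr);
	if (map_fd < 0)
		return map_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key    = (__u64)(unsigned long)&key;
	attr.value  = (__u64)(unsigned long)&ifindex;
	attr.flags  = BPF_ANY;		/* BPF_NOEXIST would return -EEXIST */
	return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr);
}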
+++ */ +++ old_dev = xchg(&dtab->netdev_map[i], dev); +++ if (old_dev) +++ call_rcu(&old_dev->rcu, __dev_map_entry_free); +++ +++ return 0; +++} +++ +++static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, +++ u64 map_flags) +++{ +++ return __dev_map_update_elem(current->nsproxy->net_ns, +++ map, key, value, map_flags); +++} +++ +++static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, +++ void *key, void *value, u64 map_flags) +++{ +++ struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); +++ struct bpf_dtab_netdev *dev, *old_dev; +++ u32 ifindex = *(u32 *)value; +++ u32 idx = *(u32 *)key; +++ unsigned long flags; +++ int err = -EEXIST; +++ +++ if (unlikely(map_flags > BPF_EXIST || !ifindex)) +++ return -EINVAL; +++ +++ spin_lock_irqsave(&dtab->index_lock, flags); +++ +++ old_dev = __dev_map_hash_lookup_elem(map, idx); +++ if (old_dev && (map_flags & BPF_NOEXIST)) +++ goto out_err; +++ +++ dev = __dev_map_alloc_node(net, dtab, ifindex, idx); +++ if (IS_ERR(dev)) { +++ err = PTR_ERR(dev); +++ goto out_err; +++ } +++ +++ if (old_dev) { +++ hlist_del_rcu(&old_dev->index_hlist); +++ } else { +++ if (dtab->items >= dtab->map.max_entries) { +++ spin_unlock_irqrestore(&dtab->index_lock, flags); +++ call_rcu(&dev->rcu, __dev_map_entry_free); +++ return -E2BIG; +++ } +++ dtab->items++; +++ } +++ +++ hlist_add_head_rcu(&dev->index_hlist, +++ dev_map_index_hash(dtab, idx)); +++ spin_unlock_irqrestore(&dtab->index_lock, flags); +++ +++ if (old_dev) +++ call_rcu(&old_dev->rcu, __dev_map_entry_free); +++ +++ return 0; +++ +++out_err: +++ spin_unlock_irqrestore(&dtab->index_lock, flags); +++ return err; +++} +++ +++static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value, +++ u64 map_flags) +++{ +++ return __dev_map_hash_update_elem(current->nsproxy->net_ns, +++ map, key, value, map_flags); +++} +++ +++const struct bpf_map_ops dev_map_ops = { +++ .map_alloc = dev_map_alloc, +++ .map_free = dev_map_free, +++ .map_get_next_key = dev_map_get_next_key, +++ .map_lookup_elem = dev_map_lookup_elem, +++ .map_update_elem = dev_map_update_elem, +++ .map_delete_elem = dev_map_delete_elem, +++ .map_check_btf = map_check_no_btf, +++}; +++ +++const struct bpf_map_ops dev_map_hash_ops = { +++ .map_alloc = dev_map_alloc, +++ .map_free = dev_map_free, +++ .map_get_next_key = dev_map_hash_get_next_key, +++ .map_lookup_elem = dev_map_hash_lookup_elem, +++ .map_update_elem = dev_map_hash_update_elem, +++ .map_delete_elem = dev_map_hash_delete_elem, +++ .map_check_btf = map_check_no_btf, +++}; +++ +++static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab, +++ struct net_device *netdev) +++{ +++ unsigned long flags; +++ u32 i; +++ +++ spin_lock_irqsave(&dtab->index_lock, flags); +++ for (i = 0; i < dtab->n_buckets; i++) { +++ struct bpf_dtab_netdev *dev; +++ struct hlist_head *head; +++ struct hlist_node *next; +++ +++ head = dev_map_index_hash(dtab, i); +++ +++ hlist_for_each_entry_safe(dev, next, head, index_hlist) { +++ if (netdev != dev->dev) +++ continue; +++ +++ dtab->items--; +++ hlist_del_rcu(&dev->index_hlist); +++ call_rcu(&dev->rcu, __dev_map_entry_free); +++ } +++ } +++ spin_unlock_irqrestore(&dtab->index_lock, flags); +++} +++ +++static int dev_map_notification(struct notifier_block *notifier, +++ ulong event, void *ptr) +++{ +++ struct net_device *netdev = netdev_notifier_info_to_dev(ptr); +++ struct bpf_dtab *dtab; +++ int i; +++ +++ switch (event) { +++ case NETDEV_UNREGISTER: +++ /* This rcu_read_lock/unlock pair is needed 
because +++ * dev_map_list is an RCU list AND to ensure a delete +++ * operation does not free a netdev_map entry while we +++ * are comparing it against the netdev being unregistered. +++ */ +++ rcu_read_lock(); +++ list_for_each_entry_rcu(dtab, &dev_map_list, list) { +++ if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) { +++ dev_map_hash_remove_netdev(dtab, netdev); +++ continue; +++ } +++ +++ for (i = 0; i < dtab->map.max_entries; i++) { +++ struct bpf_dtab_netdev *dev, *odev; +++ +++ dev = READ_ONCE(dtab->netdev_map[i]); +++ if (!dev || netdev != dev->dev) +++ continue; +++ odev = cmpxchg(&dtab->netdev_map[i], dev, NULL); +++ if (dev == odev) +++ call_rcu(&dev->rcu, +++ __dev_map_entry_free); +++ } +++ } +++ rcu_read_unlock(); +++ break; +++ default: +++ break; +++ } +++ return NOTIFY_OK; +++} +++ +++static struct notifier_block dev_map_notifier = { +++ .notifier_call = dev_map_notification, +++}; +++ +++static int __init dev_map_init(void) +++{ +++ /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */ +++ BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) != +++ offsetof(struct _bpf_dtab_netdev, dev)); +++ register_netdevice_notifier(&dev_map_notifier); +++ return 0; +++} +++ +++subsys_initcall(dev_map_init); ++--- /dev/null +++++ b/kernel/bpf/disasm.c ++@@ -0,0 +1,258 @@ +++// SPDX-License-Identifier: GPL-2.0-only +++/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com +++ * Copyright (c) 2016 Facebook +++ */ +++ +++#include +++ +++#include "disasm.h" +++ +++#define __BPF_FUNC_STR_FN(x) [BPF_FUNC_ ## x] = __stringify(bpf_ ## x) +++static const char * const func_id_str[] = { +++ __BPF_FUNC_MAPPER(__BPF_FUNC_STR_FN) +++}; +++#undef __BPF_FUNC_STR_FN +++ +++static const char *__func_get_name(const struct bpf_insn_cbs *cbs, +++ const struct bpf_insn *insn, +++ char *buff, size_t len) +++{ +++ BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); +++ +++ if (insn->src_reg != BPF_PSEUDO_CALL && +++ insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && +++ func_id_str[insn->imm]) +++ return func_id_str[insn->imm]; +++ +++ if (cbs && cbs->cb_call) +++ return cbs->cb_call(cbs->private_data, insn); +++ +++ if (insn->src_reg == BPF_PSEUDO_CALL) +++ snprintf(buff, len, "%+d", insn->imm); +++ +++ return buff; +++} +++ +++static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, +++ const struct bpf_insn *insn, +++ u64 full_imm, char *buff, size_t len) +++{ +++ if (cbs && cbs->cb_imm) +++ return cbs->cb_imm(cbs->private_data, insn, full_imm); +++ +++ snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); +++ return buff; +++} +++ +++const char *func_id_name(int id) +++{ +++ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) +++ return func_id_str[id]; +++ else +++ return "unknown"; +++} +++ +++const char *const bpf_class_string[8] = { +++ [BPF_LD] = "ld", +++ [BPF_LDX] = "ldx", +++ [BPF_ST] = "st", +++ [BPF_STX] = "stx", +++ [BPF_ALU] = "alu", +++ [BPF_JMP] = "jmp", +++ [BPF_JMP32] = "jmp32", +++ [BPF_ALU64] = "alu64", +++}; +++ +++const char *const bpf_alu_string[16] = { +++ [BPF_ADD >> 4] = "+=", +++ [BPF_SUB >> 4] = "-=", +++ [BPF_MUL >> 4] = "*=", +++ [BPF_DIV >> 4] = "/=", +++ [BPF_OR >> 4] = "|=", +++ [BPF_AND >> 4] = "&=", +++ [BPF_LSH >> 4] = "<<=", +++ [BPF_RSH >> 4] = ">>=", +++ [BPF_NEG >> 4] = "neg", +++ [BPF_MOD >> 4] = "%=", +++ [BPF_XOR >> 4] = "^=", +++ [BPF_MOV >> 4] = "=", +++ [BPF_ARSH >> 4] = "s>>=", +++ [BPF_END >> 4] = "endian", +++}; +++ +++static const char *const bpf_ldst_string[] = { +++ [BPF_W >> 3] = "u32", +++ [BPF_H >> 3] = 
"u16", +++ [BPF_B >> 3] = "u8", +++ [BPF_DW >> 3] = "u64", +++}; +++ +++static const char *const bpf_jmp_string[16] = { +++ [BPF_JA >> 4] = "jmp", +++ [BPF_JEQ >> 4] = "==", +++ [BPF_JGT >> 4] = ">", +++ [BPF_JLT >> 4] = "<", +++ [BPF_JGE >> 4] = ">=", +++ [BPF_JLE >> 4] = "<=", +++ [BPF_JSET >> 4] = "&", +++ [BPF_JNE >> 4] = "!=", +++ [BPF_JSGT >> 4] = "s>", +++ [BPF_JSLT >> 4] = "s<", +++ [BPF_JSGE >> 4] = "s>=", +++ [BPF_JSLE >> 4] = "s<=", +++ [BPF_CALL >> 4] = "call", +++ [BPF_EXIT >> 4] = "exit", +++}; +++ +++static void print_bpf_end_insn(bpf_insn_print_t verbose, +++ void *private_data, +++ const struct bpf_insn *insn) +++{ +++ verbose(private_data, "(%02x) r%d = %s%d r%d\n", +++ insn->code, insn->dst_reg, +++ BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", +++ insn->imm, insn->dst_reg); +++} +++ +++void print_bpf_insn(const struct bpf_insn_cbs *cbs, +++ const struct bpf_insn *insn, +++ bool allow_ptr_leaks) +++{ +++ const bpf_insn_print_t verbose = cbs->cb_print; +++ u8 class = BPF_CLASS(insn->code); +++ +++ if (class == BPF_ALU || class == BPF_ALU64) { +++ if (BPF_OP(insn->code) == BPF_END) { +++ if (class == BPF_ALU64) +++ verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code); +++ else +++ print_bpf_end_insn(verbose, cbs->private_data, insn); +++ } else if (BPF_OP(insn->code) == BPF_NEG) { +++ verbose(cbs->private_data, "(%02x) %c%d = -%c%d\n", +++ insn->code, class == BPF_ALU ? 'w' : 'r', +++ insn->dst_reg, class == BPF_ALU ? 'w' : 'r', +++ insn->dst_reg); +++ } else if (BPF_SRC(insn->code) == BPF_X) { +++ verbose(cbs->private_data, "(%02x) %c%d %s %c%d\n", +++ insn->code, class == BPF_ALU ? 'w' : 'r', +++ insn->dst_reg, +++ bpf_alu_string[BPF_OP(insn->code) >> 4], +++ class == BPF_ALU ? 'w' : 'r', +++ insn->src_reg); +++ } else { +++ verbose(cbs->private_data, "(%02x) %c%d %s %d\n", +++ insn->code, class == BPF_ALU ? 
'w' : 'r', +++ insn->dst_reg, +++ bpf_alu_string[BPF_OP(insn->code) >> 4], +++ insn->imm); +++ } +++ } else if (class == BPF_STX) { +++ if (BPF_MODE(insn->code) == BPF_MEM) +++ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n", +++ insn->code, +++ bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +++ insn->dst_reg, +++ insn->off, insn->src_reg); +++ else if (BPF_MODE(insn->code) == BPF_XADD) +++ verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", +++ insn->code, +++ bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +++ insn->dst_reg, insn->off, +++ insn->src_reg); +++ else +++ verbose(cbs->private_data, "BUG_%02x\n", insn->code); +++ } else if (class == BPF_ST) { +++ if (BPF_MODE(insn->code) != BPF_MEM) { +++ verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); +++ return; +++ } +++ verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", +++ insn->code, +++ bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +++ insn->dst_reg, +++ insn->off, insn->imm); +++ } else if (class == BPF_LDX) { +++ if (BPF_MODE(insn->code) != BPF_MEM) { +++ verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); +++ return; +++ } +++ verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", +++ insn->code, insn->dst_reg, +++ bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +++ insn->src_reg, insn->off); +++ } else if (class == BPF_LD) { +++ if (BPF_MODE(insn->code) == BPF_ABS) { +++ verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n", +++ insn->code, +++ bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +++ insn->imm); +++ } else if (BPF_MODE(insn->code) == BPF_IND) { +++ verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", +++ insn->code, +++ bpf_ldst_string[BPF_SIZE(insn->code) >> 3], +++ insn->src_reg, insn->imm); +++ } else if (BPF_MODE(insn->code) == BPF_IMM && +++ BPF_SIZE(insn->code) == BPF_DW) { +++ /* At this point, we already made sure that the second +++ * part of the ldimm64 insn is accessible. +++ */ +++ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; +++ bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD || +++ insn->src_reg == BPF_PSEUDO_MAP_VALUE; +++ char tmp[64]; +++ +++ if (is_ptr && !allow_ptr_leaks) +++ imm = 0; +++ +++ verbose(cbs->private_data, "(%02x) r%d = %s\n", +++ insn->code, insn->dst_reg, +++ __func_imm_name(cbs, insn, imm, +++ tmp, sizeof(tmp))); +++ } else { +++ verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); +++ return; +++ } +++ } else if (class == BPF_JMP32 || class == BPF_JMP) { +++ u8 opcode = BPF_OP(insn->code); +++ +++ if (opcode == BPF_CALL) { +++ char tmp[64]; +++ +++ if (insn->src_reg == BPF_PSEUDO_CALL) { +++ verbose(cbs->private_data, "(%02x) call pc%s\n", +++ insn->code, +++ __func_get_name(cbs, insn, +++ tmp, sizeof(tmp))); +++ } else { +++ strcpy(tmp, "unknown"); +++ verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code, +++ __func_get_name(cbs, insn, +++ tmp, sizeof(tmp)), +++ insn->imm); +++ } +++ } else if (insn->code == (BPF_JMP | BPF_JA)) { +++ verbose(cbs->private_data, "(%02x) goto pc%+d\n", +++ insn->code, insn->off); +++ } else if (insn->code == (BPF_JMP | BPF_EXIT)) { +++ verbose(cbs->private_data, "(%02x) exit\n", insn->code); +++ } else if (BPF_SRC(insn->code) == BPF_X) { +++ verbose(cbs->private_data, +++ "(%02x) if %c%d %s %c%d goto pc%+d\n", +++ insn->code, class == BPF_JMP32 ? 'w' : 'r', +++ insn->dst_reg, +++ bpf_jmp_string[BPF_OP(insn->code) >> 4], +++ class == BPF_JMP32 ? 
'w' : 'r', +++ insn->src_reg, insn->off); +++ } else { +++ verbose(cbs->private_data, +++ "(%02x) if %c%d %s 0x%x goto pc%+d\n", +++ insn->code, class == BPF_JMP32 ? 'w' : 'r', +++ insn->dst_reg, +++ bpf_jmp_string[BPF_OP(insn->code) >> 4], +++ insn->imm, insn->off); +++ } +++ } else { +++ verbose(cbs->private_data, "(%02x) %s\n", +++ insn->code, bpf_class_string[class]); +++ } +++} ++--- /dev/null +++++ b/kernel/bpf/disasm.h ++@@ -0,0 +1,40 @@ +++/* SPDX-License-Identifier: GPL-2.0-only */ +++/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com +++ * Copyright (c) 2016 Facebook +++ */ +++ +++#ifndef __BPF_DISASM_H__ +++#define __BPF_DISASM_H__ +++ +++#include +++#include +++#include +++#ifndef __KERNEL__ +++#include +++#include +++#endif +++ +++extern const char *const bpf_alu_string[16]; +++extern const char *const bpf_class_string[8]; +++ +++const char *func_id_name(int id); +++ +++typedef __printf(2, 3) void (*bpf_insn_print_t)(void *private_data, +++ const char *, ...); +++typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, +++ const struct bpf_insn *insn); +++typedef const char *(*bpf_insn_print_imm_t)(void *private_data, +++ const struct bpf_insn *insn, +++ __u64 full_imm); +++ +++struct bpf_insn_cbs { +++ bpf_insn_print_t cb_print; +++ bpf_insn_revmap_call_t cb_call; +++ bpf_insn_print_imm_t cb_imm; +++ void *private_data; +++}; +++ +++void print_bpf_insn(const struct bpf_insn_cbs *cbs, +++ const struct bpf_insn *insn, +++ bool allow_ptr_leaks); +++#endif ++--- a/kernel/bpf/hashtab.c +++++ b/kernel/bpf/hashtab.c ++@@ -1,147 +1,467 @@ +++// SPDX-License-Identifier: GPL-2.0-only ++ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com ++- * ++- * This program is free software; you can redistribute it and/or ++- * modify it under the terms of version 2 of the GNU General Public ++- * License as published by the Free Software Foundation. ++- * ++- * This program is distributed in the hope that it will be useful, but ++- * WITHOUT ANY WARRANTY; without even the implied warranty of ++- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++- * General Public License for more details. 
+++ * Copyright (c) 2016 Facebook ++ */ ++ #include +++#include ++ #include ++ #include ++-#include +++#include +++#include +++#include +++#include "percpu_freelist.h" +++#include "bpf_lru_list.h" +++#include "map_in_map.h" +++ +++#define HTAB_CREATE_FLAG_MASK \ +++ (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \ +++ BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED) +++ +++struct bucket { +++ struct hlist_nulls_head head; +++ raw_spinlock_t lock; +++}; ++ ++ struct bpf_htab { ++ struct bpf_map map; ++- struct hlist_head *buckets; ++- raw_spinlock_t lock; ++- u32 count; /* number of elements in this hashtable */ +++ struct bucket *buckets; +++ void *elems; +++ union { +++ struct pcpu_freelist freelist; +++ struct bpf_lru lru; +++ }; +++ struct htab_elem *__percpu *extra_elems; +++ atomic_t count; /* number of elements in this hashtable */ ++ u32 n_buckets; /* number of hash buckets */ ++ u32 elem_size; /* size of each element in bytes */ +++ u32 hashrnd; ++ }; ++ ++ /* each htab element is struct htab_elem + key + value */ ++ struct htab_elem { ++- struct hlist_node hash_node; ++- struct rcu_head rcu; +++ union { +++ struct hlist_nulls_node hash_node; +++ struct { +++ void *padding; +++ union { +++ struct bpf_htab *htab; +++ struct pcpu_freelist_node fnode; +++ }; +++ }; +++ }; +++ union { +++ struct rcu_head rcu; +++ struct bpf_lru_node lru_node; +++ }; ++ u32 hash; ++ char key[0] __aligned(8); ++ }; ++ +++static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); +++ +++static bool htab_is_lru(const struct bpf_htab *htab) +++{ +++ return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH || +++ htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +++} +++ +++static bool htab_is_percpu(const struct bpf_htab *htab) +++{ +++ return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH || +++ htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH; +++} +++ +++static bool htab_is_prealloc(const struct bpf_htab *htab) +++{ +++ return !(htab->map.map_flags & BPF_F_NO_PREALLOC); +++} +++ +++static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, +++ void __percpu *pptr) +++{ +++ *(void __percpu **)(l->key + key_size) = pptr; +++} +++ +++static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +++{ +++ return *(void __percpu **)(l->key + key_size); +++} +++ +++static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) +++{ +++ return *(void **)(l->key + roundup(map->key_size, 8)); +++} +++ +++static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) +++{ +++ return (struct htab_elem *) (htab->elems + i * htab->elem_size); +++} +++ +++static void htab_free_elems(struct bpf_htab *htab) +++{ +++ int i; +++ +++ if (!htab_is_percpu(htab)) +++ goto free_elems; +++ +++ for (i = 0; i < htab->map.max_entries; i++) { +++ void __percpu *pptr; +++ +++ pptr = htab_elem_get_ptr(get_htab_elem(htab, i), +++ htab->map.key_size); +++ free_percpu(pptr); +++ cond_resched(); +++ } +++free_elems: +++ bpf_map_area_free(htab->elems); +++} +++ +++static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key, +++ u32 hash) +++{ +++ struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash); +++ struct htab_elem *l; +++ +++ if (node) { +++ l = container_of(node, struct htab_elem, lru_node); +++ memcpy(l->key, key, htab->map.key_size); +++ return l; +++ } +++ +++ return NULL; +++} +++ +++static int prealloc_init(struct bpf_htab *htab) +++{ +++ u32 num_entries = htab->map.max_entries; +++ int err = -ENOMEM, i; +++ +++ if 
(!htab_is_percpu(htab) && !htab_is_lru(htab)) +++ num_entries += num_possible_cpus(); +++ +++ htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries, +++ htab->map.numa_node); +++ if (!htab->elems) +++ return -ENOMEM; +++ +++ if (!htab_is_percpu(htab)) +++ goto skip_percpu_elems; +++ +++ for (i = 0; i < num_entries; i++) { +++ u32 size = round_up(htab->map.value_size, 8); +++ void __percpu *pptr; +++ +++ pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); +++ if (!pptr) +++ goto free_elems; +++ htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, +++ pptr); +++ cond_resched(); +++ } +++ +++skip_percpu_elems: +++ if (htab_is_lru(htab)) +++ err = bpf_lru_init(&htab->lru, +++ htab->map.map_flags & BPF_F_NO_COMMON_LRU, +++ offsetof(struct htab_elem, hash) - +++ offsetof(struct htab_elem, lru_node), +++ htab_lru_map_delete_node, +++ htab); +++ else +++ err = pcpu_freelist_init(&htab->freelist); +++ +++ if (err) +++ goto free_elems; +++ +++ if (htab_is_lru(htab)) +++ bpf_lru_populate(&htab->lru, htab->elems, +++ offsetof(struct htab_elem, lru_node), +++ htab->elem_size, num_entries); +++ else +++ pcpu_freelist_populate(&htab->freelist, +++ htab->elems + offsetof(struct htab_elem, fnode), +++ htab->elem_size, num_entries); +++ +++ return 0; +++ +++free_elems: +++ htab_free_elems(htab); +++ return err; +++} +++ +++static void prealloc_destroy(struct bpf_htab *htab) +++{ +++ htab_free_elems(htab); +++ +++ if (htab_is_lru(htab)) +++ bpf_lru_destroy(&htab->lru); +++ else +++ pcpu_freelist_destroy(&htab->freelist); +++} +++ +++static int alloc_extra_elems(struct bpf_htab *htab) +++{ +++ struct htab_elem *__percpu *pptr, *l_new; +++ struct pcpu_freelist_node *l; +++ int cpu; +++ +++ pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8, +++ GFP_USER | __GFP_NOWARN); +++ if (!pptr) +++ return -ENOMEM; +++ +++ for_each_possible_cpu(cpu) { +++ l = pcpu_freelist_pop(&htab->freelist); +++ /* pop will succeed, since prealloc_init() +++ * preallocated extra num_possible_cpus elements +++ */ +++ l_new = container_of(l, struct htab_elem, fnode); +++ *per_cpu_ptr(pptr, cpu) = l_new; +++ } +++ htab->extra_elems = pptr; +++ return 0; +++} +++ ++ /* Called from syscall */ ++-static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +++static int htab_map_alloc_check(union bpf_attr *attr) ++ { ++- struct bpf_htab *htab; ++- int err, i; +++ bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || +++ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); +++ bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || +++ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); +++ /* percpu_lru means each cpu has its own LRU list. +++ * it is different from BPF_MAP_TYPE_PERCPU_HASH where +++ * the map's value itself is percpu. percpu_lru has +++ * nothing to do with the map's value. +++ */ +++ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); +++ bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); +++ bool zero_seed = (attr->map_flags & BPF_F_ZERO_SEED); +++ int numa_node = bpf_map_attr_numa_node(attr); +++ +++ BUILD_BUG_ON(offsetof(struct htab_elem, htab) != +++ offsetof(struct htab_elem, hash_node.pprev)); +++ BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != +++ offsetof(struct htab_elem, hash_node.pprev)); +++ +++ if (lru && !capable(CAP_SYS_ADMIN)) +++ /* LRU implementation is much complicated than other +++ * maps. Hence, limit to CAP_SYS_ADMIN for now. 
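Seen from user space, the constraints enforced here and in the checks that follow determine which attribute combinations BPF_MAP_CREATE accepts for the hash map family. The helper below is only a sketch built on the raw bpf(2) syscall with illustrative names; the LRU and zero-seed variants additionally assume CAP_SYS_ADMIN, as required above.

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int create_hash(__u32 map_type, __u32 map_flags)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = map_type;
	attr.key_size    = sizeof(__u32);
	attr.value_size  = sizeof(__u64);
	attr.max_entries = 1024;
	attr.map_flags   = map_flags;
	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

/* create_hash(BPF_MAP_TYPE_HASH, 0)                       - preallocated hash
 * create_hash(BPF_MAP_TYPE_HASH, BPF_F_NO_PREALLOC)       - elements allocated on update
 * create_hash(BPF_MAP_TYPE_LRU_HASH, 0)                   - ok; LRU maps must stay preallocated
 * create_hash(BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_PREALLOC)   - rejected with -ENOTSUPP
 * create_hash(BPF_MAP_TYPE_LRU_HASH, BPF_F_NO_COMMON_LRU) - one LRU list per CPU
 */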
+++ */ +++ return -EPERM; ++ ++- htab = kzalloc(sizeof(*htab), GFP_USER); ++- if (!htab) ++- return ERR_PTR(-ENOMEM); +++ if (zero_seed && !capable(CAP_SYS_ADMIN)) +++ /* Guard against local DoS, and discourage production use. */ +++ return -EPERM; ++ ++- /* mandatory map attributes */ ++- htab->map.key_size = attr->key_size; ++- htab->map.value_size = attr->value_size; ++- htab->map.max_entries = attr->max_entries; +++ if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK || +++ !bpf_map_flags_access_ok(attr->map_flags)) +++ return -EINVAL; +++ +++ if (!lru && percpu_lru) +++ return -EINVAL; +++ +++ if (lru && !prealloc) +++ return -ENOTSUPP; +++ +++ if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) +++ return -EINVAL; ++ ++ /* check sanity of attributes. ++ * value_size == 0 may be allowed in the future to use map as a set ++ */ ++- err = -EINVAL; ++- if (htab->map.max_entries == 0 || htab->map.key_size == 0 || ++- htab->map.value_size == 0) ++- goto free_htab; ++- ++- /* hash table size must be power of 2 */ ++- htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); +++ if (attr->max_entries == 0 || attr->key_size == 0 || +++ attr->value_size == 0) +++ return -EINVAL; ++ ++- err = -E2BIG; ++- if (htab->map.key_size > MAX_BPF_STACK) +++ if (attr->key_size > MAX_BPF_STACK) ++ /* eBPF programs initialize keys on stack, so they cannot be ++ * larger than max stack size ++ */ ++- goto free_htab; +++ return -E2BIG; ++ ++- if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) - +++ if (attr->value_size >= KMALLOC_MAX_SIZE - ++ MAX_BPF_STACK - sizeof(struct htab_elem)) ++ /* if value_size is bigger, the user space won't be able to ++ * access the elements via bpf syscall. This check also makes ++ * sure that the elem_size doesn't overflow and it's ++ * kmalloc-able later in htab_map_update_elem() ++ */ ++- goto free_htab; +++ return -E2BIG; +++ +++ return 0; +++} +++ +++static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +++{ +++ bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || +++ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); +++ bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || +++ attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); +++ /* percpu_lru means each cpu has its own LRU list. +++ * it is different from BPF_MAP_TYPE_PERCPU_HASH where +++ * the map's value itself is percpu. percpu_lru has +++ * nothing to do with the map's value. +++ */ +++ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); +++ bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); +++ struct bpf_htab *htab; +++ int err, i; +++ u64 cost; +++ +++ htab = kzalloc(sizeof(*htab), GFP_USER); +++ if (!htab) +++ return ERR_PTR(-ENOMEM); +++ +++ bpf_map_init_from_attr(&htab->map, attr); +++ +++ if (percpu_lru) { +++ /* ensure each CPU's lru list has >=1 elements. +++ * since we are at it, make each lru list has the same +++ * number of elements. 
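As a worked instance of the adjustment that follows this comment, assume for the sake of the example a machine with 6 possible CPUs and a requested max_entries of 1000. roundup_to()/rounddown_to() below are local stand-ins for the kernel's roundup()/rounddown() macros; the rounddown() fallback only matters when the roundup overflows the u32.

#include <stdio.h>

static unsigned int roundup_to(unsigned int x, unsigned int y)
{
	return ((x + y - 1) / y) * y;
}

static unsigned int rounddown_to(unsigned int x, unsigned int y)
{
	return (x / y) * y;
}

int main(void)
{
	/* 6 possible CPUs, requested max_entries = 1000 */
	printf("roundup:   %u\n", roundup_to(1000, 6));   /* 1002 -> 167 entries per CPU LRU list */
	printf("rounddown: %u\n", rounddown_to(1000, 6)); /* 996, used only on u32 overflow */
	return 0;
}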
+++ */ +++ htab->map.max_entries = roundup(attr->max_entries, +++ num_possible_cpus()); +++ if (htab->map.max_entries < attr->max_entries) +++ htab->map.max_entries = rounddown(attr->max_entries, +++ num_possible_cpus()); +++ } +++ +++ /* hash table size must be power of 2 */ +++ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); ++ ++ htab->elem_size = sizeof(struct htab_elem) + ++- round_up(htab->map.key_size, 8) + ++- htab->map.value_size; +++ round_up(htab->map.key_size, 8); +++ if (percpu) +++ htab->elem_size += sizeof(void *); +++ else +++ htab->elem_size += round_up(htab->map.value_size, 8); ++ +++ err = -E2BIG; ++ /* prevent zero size kmalloc and check for u32 overflow */ ++ if (htab->n_buckets == 0 || ++- htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) +++ htab->n_buckets > U32_MAX / sizeof(struct bucket)) ++ goto free_htab; ++ ++- if ((u64) htab->n_buckets * sizeof(struct hlist_head) + ++- (u64) htab->elem_size * htab->map.max_entries >= ++- U32_MAX - PAGE_SIZE) ++- /* make sure page count doesn't overflow */ ++- goto free_htab; +++ cost = (u64) htab->n_buckets * sizeof(struct bucket) + +++ (u64) htab->elem_size * htab->map.max_entries; ++ ++- htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + ++- htab->elem_size * htab->map.max_entries, ++- PAGE_SIZE) >> PAGE_SHIFT; +++ if (percpu) +++ cost += (u64) round_up(htab->map.value_size, 8) * +++ num_possible_cpus() * htab->map.max_entries; +++ else +++ cost += (u64) htab->elem_size * num_possible_cpus(); +++ +++ /* if map size is larger than memlock limit, reject it */ +++ err = bpf_map_charge_init(&htab->map.memory, cost); +++ if (err) +++ goto free_htab; ++ ++ err = -ENOMEM; ++- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), ++- GFP_USER | __GFP_NOWARN); +++ htab->buckets = bpf_map_area_alloc(htab->n_buckets * +++ sizeof(struct bucket), +++ htab->map.numa_node); +++ if (!htab->buckets) +++ goto free_charge; +++ +++ if (htab->map.map_flags & BPF_F_ZERO_SEED) +++ htab->hashrnd = 0; +++ else +++ htab->hashrnd = get_random_int(); ++ ++- if (!htab->buckets) { ++- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); ++- if (!htab->buckets) ++- goto free_htab; +++ for (i = 0; i < htab->n_buckets; i++) { +++ INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); +++ raw_spin_lock_init(&htab->buckets[i].lock); ++ } ++ ++- for (i = 0; i < htab->n_buckets; i++) ++- INIT_HLIST_HEAD(&htab->buckets[i]); ++- ++- raw_spin_lock_init(&htab->lock); ++- htab->count = 0; +++ if (prealloc) { +++ err = prealloc_init(htab); +++ if (err) +++ goto free_buckets; +++ +++ if (!percpu && !lru) { +++ /* lru itself can remove the least used element, so +++ * there is no need for an extra elem during map_update. 
+++ */ +++ err = alloc_extra_elems(htab); +++ if (err) +++ goto free_prealloc; +++ } +++ } ++ ++ return &htab->map; ++ +++free_prealloc: +++ prealloc_destroy(htab); +++free_buckets: +++ bpf_map_area_free(htab->buckets); +++free_charge: +++ bpf_map_charge_finish(&htab->map.memory); ++ free_htab: ++ kfree(htab); ++ return ERR_PTR(err); ++ } ++ ++-static inline u32 htab_map_hash(const void *key, u32 key_len) +++static inline u32 htab_map_hash(const void *key, u32 key_len, u32 hashrnd) ++ { ++- return jhash(key, key_len, 0); +++ return jhash(key, key_len, hashrnd); ++ } ++ ++-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +++static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) ++ { ++ return &htab->buckets[hash & (htab->n_buckets - 1)]; ++ } ++ ++-static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, +++static inline struct hlist_nulls_head *select_bucket(struct bpf_htab *htab, u32 hash) +++{ +++ return &__select_bucket(htab, hash)->head; +++} +++ +++/* this lookup function can only be called with bucket lock taken */ +++static struct htab_elem *lookup_elem_raw(struct hlist_nulls_head *head, u32 hash, ++ void *key, u32 key_size) ++ { +++ struct hlist_nulls_node *n; ++ struct htab_elem *l; ++ ++- hlist_for_each_entry_rcu(l, head, hash_node) +++ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) ++ if (l->hash == hash && !memcmp(&l->key, key, key_size)) ++ return l; ++ ++ return NULL; ++ } ++ ++-/* Called from syscall or from eBPF program */ ++-static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +++/* can be called without bucket lock. it will repeat the loop in +++ * the unlikely event when elements moved from one bucket into another +++ * while link list is being walked +++ */ +++static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, +++ u32 hash, void *key, +++ u32 key_size, u32 n_buckets) +++{ +++ struct hlist_nulls_node *n; +++ struct htab_elem *l; +++ +++again: +++ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) +++ if (l->hash == hash && !memcmp(&l->key, key, key_size)) +++ return l; +++ +++ if (unlikely(get_nulls_value(n) != (hash & (n_buckets - 1)))) +++ goto again; +++ +++ return NULL; +++} +++ +++/* Called from syscall or from eBPF program directly, so +++ * arguments have to match bpf_map_lookup_elem() exactly. +++ * The return value is adjusted by BPF instructions +++ * in htab_map_gen_lookup(). +++ */ +++static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) ++ { ++ struct bpf_htab *htab = container_of(map, struct bpf_htab, map); ++- struct hlist_head *head; +++ struct hlist_nulls_head *head; ++ struct htab_elem *l; ++ u32 hash, key_size; ++ ++@@ -150,11 +470,18 @@ static void *htab_map_lookup_elem(struct ++ ++ key_size = map->key_size; ++ ++- hash = htab_map_hash(key, key_size); +++ hash = htab_map_hash(key, key_size, htab->hashrnd); ++ ++ head = select_bucket(htab, hash); ++ ++- l = lookup_elem_raw(head, hash, key, key_size); +++ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); +++ +++ return l; +++} +++ +++static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +++{ +++ struct htab_elem *l = __htab_map_lookup_elem(map, key); ++ ++ if (l) ++ return l->key + round_up(map->key_size, 8); ++@@ -162,33 +489,138 @@ static void *htab_map_lookup_elem(struct ++ return NULL; ++ } ++ +++/* inline bpf_map_lookup_elem() call. 
+++ * Instead of: +++ * bpf_prog +++ * bpf_map_lookup_elem +++ * map->ops->map_lookup_elem +++ * htab_map_lookup_elem +++ * __htab_map_lookup_elem +++ * do: +++ * bpf_prog +++ * __htab_map_lookup_elem +++ */ +++static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +++{ +++ struct bpf_insn *insn = insn_buf; +++ const int ret = BPF_REG_0; +++ +++ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, +++ (void *(*)(struct bpf_map *map, void *key))NULL)); +++ *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); +++ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); +++ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, +++ offsetof(struct htab_elem, key) + +++ round_up(map->key_size, 8)); +++ return insn - insn_buf; +++} +++ +++static __always_inline void *__htab_lru_map_lookup_elem(struct bpf_map *map, +++ void *key, const bool mark) +++{ +++ struct htab_elem *l = __htab_map_lookup_elem(map, key); +++ +++ if (l) { +++ if (mark) +++ bpf_lru_node_set_ref(&l->lru_node); +++ return l->key + round_up(map->key_size, 8); +++ } +++ +++ return NULL; +++} +++ +++static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) +++{ +++ return __htab_lru_map_lookup_elem(map, key, true); +++} +++ +++static void *htab_lru_map_lookup_elem_sys(struct bpf_map *map, void *key) +++{ +++ return __htab_lru_map_lookup_elem(map, key, false); +++} +++ +++static u32 htab_lru_map_gen_lookup(struct bpf_map *map, +++ struct bpf_insn *insn_buf) +++{ +++ struct bpf_insn *insn = insn_buf; +++ const int ret = BPF_REG_0; +++ const int ref_reg = BPF_REG_1; +++ +++ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, +++ (void *(*)(struct bpf_map *map, void *key))NULL)); +++ *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); +++ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4); +++ *insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret, +++ offsetof(struct htab_elem, lru_node) + +++ offsetof(struct bpf_lru_node, ref)); +++ *insn++ = BPF_JMP_IMM(BPF_JNE, ref_reg, 0, 1); +++ *insn++ = BPF_ST_MEM(BPF_B, ret, +++ offsetof(struct htab_elem, lru_node) + +++ offsetof(struct bpf_lru_node, ref), +++ 1); +++ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, +++ offsetof(struct htab_elem, key) + +++ round_up(map->key_size, 8)); +++ return insn - insn_buf; +++} +++ +++/* It is called from the bpf_lru_list when the LRU needs to delete +++ * older elements from the htab. 
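For orientation, the three instructions built by htab_map_gen_lookup() above are what a plain helper call in a program like the following may be rewritten into when the verifier can inline the lookup. The sketch is illustrative only; the map and program names are invented and the definitions assume libbpf's bpf_helpers.h.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1024);
	__type(key, __u32);
	__type(value, __u64);
} counters SEC(".maps");

SEC("xdp")
int count_packets(struct xdp_md *ctx)
{
	__u32 key = 0;
	__u64 *val;

	/* Conceptually: call __htab_map_lookup_elem(), test the result for
	 * NULL, then advance past the element header and key to reach the
	 * value, i.e. the sequence emitted by htab_map_gen_lookup().
	 */
	val = bpf_map_lookup_elem(&counters, &key);
	if (val)
		__sync_fetch_and_add(val, 1);

	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";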
+++ */ +++static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) +++{ +++ struct bpf_htab *htab = (struct bpf_htab *)arg; +++ struct htab_elem *l = NULL, *tgt_l; +++ struct hlist_nulls_head *head; +++ struct hlist_nulls_node *n; +++ unsigned long flags; +++ struct bucket *b; +++ +++ tgt_l = container_of(node, struct htab_elem, lru_node); +++ b = __select_bucket(htab, tgt_l->hash); +++ head = &b->head; +++ +++ raw_spin_lock_irqsave(&b->lock, flags); +++ +++ hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) +++ if (l == tgt_l) { +++ hlist_nulls_del_rcu(&l->hash_node); +++ break; +++ } +++ +++ raw_spin_unlock_irqrestore(&b->lock, flags); +++ +++ return l == tgt_l; +++} +++ ++ /* Called from syscall */ ++ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) ++ { ++ struct bpf_htab *htab = container_of(map, struct bpf_htab, map); ++- struct hlist_head *head; +++ struct hlist_nulls_head *head; ++ struct htab_elem *l, *next_l; ++ u32 hash, key_size; ++- int i; +++ int i = 0; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held()); ++ ++ key_size = map->key_size; ++ ++- hash = htab_map_hash(key, key_size); +++ if (!key) +++ goto find_first_elem; +++ +++ hash = htab_map_hash(key, key_size, htab->hashrnd); ++ ++ head = select_bucket(htab, hash); ++ ++ /* lookup the key */ ++- l = lookup_elem_raw(head, hash, key, key_size); +++ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); ++ ++- if (!l) { ++- i = 0; +++ if (!l) ++ goto find_first_elem; ++- } ++ ++ /* key was found, get next key in the same bucket */ ++- next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)), +++ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), ++ struct htab_elem, hash_node); ++ ++ if (next_l) { ++@@ -207,7 +639,7 @@ find_first_elem: ++ head = select_bucket(htab, i); ++ ++ /* pick first element in the bucket */ ++- next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)), +++ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_first_rcu(head)), ++ struct htab_elem, hash_node); ++ if (next_l) { ++ /* if it's not empty, just return it */ ++@@ -216,90 +648,491 @@ find_first_elem: ++ } ++ } ++ ++- /* itereated over all buckets and all elements */ +++ /* iterated over all buckets and all elements */ ++ return -ENOENT; ++ } ++ +++static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) +++{ +++ if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) +++ free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); +++ kfree(l); +++} +++ +++static void htab_elem_free_rcu(struct rcu_head *head) +++{ +++ struct htab_elem *l = container_of(head, struct htab_elem, rcu); +++ struct bpf_htab *htab = l->htab; +++ +++ htab_elem_free(htab, l); +++} +++ +++static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l) +++{ +++ struct bpf_map *map = &htab->map; +++ void *ptr; +++ +++ if (map->ops->map_fd_put_ptr) { +++ ptr = fd_htab_map_get_ptr(map, l); +++ map->ops->map_fd_put_ptr(ptr); +++ } +++} +++ +++static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +++{ +++ htab_put_fd_value(htab, l); +++ +++ if (htab_is_prealloc(htab)) { +++ __pcpu_freelist_push(&htab->freelist, &l->fnode); +++ } else { +++ atomic_dec(&htab->count); +++ l->htab = htab; +++ call_rcu(&l->rcu, htab_elem_free_rcu); +++ } +++} +++ +++static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, +++ void *value, bool onallcpus) +++{ +++ if (!onallcpus) { +++ /* copy true value_size bytes */ 
+++ memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); +++ } else { +++ u32 size = round_up(htab->map.value_size, 8); +++ int off = 0, cpu; +++ +++ for_each_possible_cpu(cpu) { +++ bpf_long_memcpy(per_cpu_ptr(pptr, cpu), +++ value + off, size); +++ off += size; +++ } +++ } +++} +++ +++static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, +++ void *value, bool onallcpus) +++{ +++ /* When using prealloc and not setting the initial value on all cpus, +++ * zero-fill element values for other cpus (just as what happens when +++ * not using prealloc). Otherwise, bpf program has no way to ensure +++ * known initial values for cpus other than current one +++ * (onallcpus=false always when coming from bpf prog). +++ */ +++ if (htab_is_prealloc(htab) && !onallcpus) { +++ u32 size = round_up(htab->map.value_size, 8); +++ int current_cpu = raw_smp_processor_id(); +++ int cpu; +++ +++ for_each_possible_cpu(cpu) { +++ if (cpu == current_cpu) +++ bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, +++ size); +++ else +++ memset(per_cpu_ptr(pptr, cpu), 0, size); +++ } +++ } else { +++ pcpu_copy_value(htab, pptr, value, onallcpus); +++ } +++} +++ +++static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) +++{ +++ return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && +++ BITS_PER_LONG == 64; +++} +++ +++static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, +++ void *value, u32 key_size, u32 hash, +++ bool percpu, bool onallcpus, +++ struct htab_elem *old_elem) +++{ +++ u32 size = htab->map.value_size; +++ bool prealloc = htab_is_prealloc(htab); +++ struct htab_elem *l_new, **pl_new; +++ void __percpu *pptr; +++ +++ if (prealloc) { +++ if (old_elem) { +++ /* if we're updating the existing element, +++ * use per-cpu extra elems to avoid freelist_pop/push +++ */ +++ pl_new = this_cpu_ptr(htab->extra_elems); +++ l_new = *pl_new; +++ htab_put_fd_value(htab, old_elem); +++ *pl_new = old_elem; +++ } else { +++ struct pcpu_freelist_node *l; +++ +++ l = __pcpu_freelist_pop(&htab->freelist); +++ if (!l) +++ return ERR_PTR(-E2BIG); +++ l_new = container_of(l, struct htab_elem, fnode); +++ } +++ } else { +++ if (atomic_inc_return(&htab->count) > htab->map.max_entries) +++ if (!old_elem) { +++ /* when map is full and update() is replacing +++ * old element, it's ok to allocate, since +++ * old element will be freed immediately. 
+++ * Otherwise return an error +++ */ +++ l_new = ERR_PTR(-E2BIG); +++ goto dec_count; +++ } +++ l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, +++ htab->map.numa_node); +++ if (!l_new) { +++ l_new = ERR_PTR(-ENOMEM); +++ goto dec_count; +++ } +++ check_and_init_map_lock(&htab->map, +++ l_new->key + round_up(key_size, 8)); +++ } +++ +++ memcpy(l_new->key, key, key_size); +++ if (percpu) { +++ size = round_up(size, 8); +++ if (prealloc) { +++ pptr = htab_elem_get_ptr(l_new, key_size); +++ } else { +++ /* alloc_percpu zero-fills */ +++ pptr = __alloc_percpu_gfp(size, 8, +++ GFP_ATOMIC | __GFP_NOWARN); +++ if (!pptr) { +++ kfree(l_new); +++ l_new = ERR_PTR(-ENOMEM); +++ goto dec_count; +++ } +++ } +++ +++ pcpu_init_value(htab, pptr, value, onallcpus); +++ +++ if (!prealloc) +++ htab_elem_set_ptr(l_new, key_size, pptr); +++ } else if (fd_htab_map_needs_adjust(htab)) { +++ size = round_up(size, 8); +++ memcpy(l_new->key + round_up(key_size, 8), value, size); +++ } else { +++ copy_map_value(&htab->map, +++ l_new->key + round_up(key_size, 8), +++ value); +++ } +++ +++ l_new->hash = hash; +++ return l_new; +++dec_count: +++ atomic_dec(&htab->count); +++ return l_new; +++} +++ +++static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, +++ u64 map_flags) +++{ +++ if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) +++ /* elem already exists */ +++ return -EEXIST; +++ +++ if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) +++ /* elem doesn't exist, cannot update it */ +++ return -ENOENT; +++ +++ return 0; +++} +++ ++ /* Called from syscall or from eBPF program */ ++ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, ++ u64 map_flags) ++ { ++ struct bpf_htab *htab = container_of(map, struct bpf_htab, map); ++- struct htab_elem *l_new, *l_old; ++- struct hlist_head *head; +++ struct htab_elem *l_new = NULL, *l_old; +++ struct hlist_nulls_head *head; ++ unsigned long flags; ++- u32 key_size; +++ struct bucket *b; +++ u32 key_size, hash; ++ int ret; ++ ++- if (map_flags > BPF_EXIST) +++ if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) ++ /* unknown flags */ ++ return -EINVAL; ++ ++ WARN_ON_ONCE(!rcu_read_lock_held()); ++ ++- /* allocate new element outside of lock */ ++- l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); ++- if (!l_new) ++- return -ENOMEM; ++- ++ key_size = map->key_size; ++ ++- memcpy(l_new->key, key, key_size); ++- memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); +++ hash = htab_map_hash(key, key_size, htab->hashrnd); +++ +++ b = __select_bucket(htab, hash); +++ head = &b->head; ++ ++- l_new->hash = htab_map_hash(l_new->key, key_size); +++ if (unlikely(map_flags & BPF_F_LOCK)) { +++ if (unlikely(!map_value_has_spin_lock(map))) +++ return -EINVAL; +++ /* find an element without taking the bucket lock */ +++ l_old = lookup_nulls_elem_raw(head, hash, key, key_size, +++ htab->n_buckets); +++ ret = check_flags(htab, l_old, map_flags); +++ if (ret) +++ return ret; +++ if (l_old) { +++ /* grab the element lock and update value in place */ +++ copy_map_value_locked(map, +++ l_old->key + round_up(key_size, 8), +++ value, false); +++ return 0; +++ } +++ /* fall through, grab the bucket lock and lookup again. +++ * 99.9% chance that the element won't be found, +++ * but second lookup under lock has to be done. 
+++ */ +++ } ++ ++ /* bpf_map_update_elem() can be called in_irq() */ ++- raw_spin_lock_irqsave(&htab->lock, flags); +++ raw_spin_lock_irqsave(&b->lock, flags); ++ ++- head = select_bucket(htab, l_new->hash); +++ l_old = lookup_elem_raw(head, hash, key, key_size); ++ ++- l_old = lookup_elem_raw(head, l_new->hash, key, key_size); +++ ret = check_flags(htab, l_old, map_flags); +++ if (ret) +++ goto err; ++ ++- if (!l_old && unlikely(htab->count >= map->max_entries)) { ++- /* if elem with this 'key' doesn't exist and we've reached ++- * max_entries limit, fail insertion of new elem +++ if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { +++ /* first lookup without the bucket lock didn't find the element, +++ * but second lookup with the bucket lock found it. +++ * This case is highly unlikely, but has to be dealt with: +++ * grab the element lock in addition to the bucket lock +++ * and update element in place ++ */ ++- ret = -E2BIG; +++ copy_map_value_locked(map, +++ l_old->key + round_up(key_size, 8), +++ value, false); +++ ret = 0; ++ goto err; ++ } ++ ++- if (l_old && map_flags == BPF_NOEXIST) { ++- /* elem already exists */ ++- ret = -EEXIST; +++ l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, +++ l_old); +++ if (IS_ERR(l_new)) { +++ /* all pre-allocated elements are in use or memory exhausted */ +++ ret = PTR_ERR(l_new); ++ goto err; ++ } ++ ++- if (!l_old && map_flags == BPF_EXIST) { ++- /* elem doesn't exist, cannot update it */ ++- ret = -ENOENT; ++- goto err; +++ /* add new element to the head of the list, so that +++ * concurrent search will find it before old elem +++ */ +++ hlist_nulls_add_head_rcu(&l_new->hash_node, head); +++ if (l_old) { +++ hlist_nulls_del_rcu(&l_old->hash_node); +++ if (!htab_is_prealloc(htab)) +++ free_htab_elem(htab, l_old); ++ } +++ ret = 0; +++err: +++ raw_spin_unlock_irqrestore(&b->lock, flags); +++ return ret; +++} ++ ++- /* add new element to the head of the list, so that concurrent ++- * search will find it before old elem +++static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, +++ u64 map_flags) +++{ +++ struct bpf_htab *htab = container_of(map, struct bpf_htab, map); +++ struct htab_elem *l_new, *l_old = NULL; +++ struct hlist_nulls_head *head; +++ unsigned long flags; +++ struct bucket *b; +++ u32 key_size, hash; +++ int ret; +++ +++ if (unlikely(map_flags > BPF_EXIST)) +++ /* unknown flags */ +++ return -EINVAL; +++ +++ WARN_ON_ONCE(!rcu_read_lock_held()); +++ +++ key_size = map->key_size; +++ +++ hash = htab_map_hash(key, key_size, htab->hashrnd); +++ +++ b = __select_bucket(htab, hash); +++ head = &b->head; +++ +++ /* For LRU, we need to alloc before taking bucket's +++ * spinlock because getting free nodes from LRU may need +++ * to remove older elements from htab and this removal +++ * operation will need a bucket lock. 
++ */ ++- hlist_add_head_rcu(&l_new->hash_node, head); +++ l_new = prealloc_lru_pop(htab, key, hash); +++ if (!l_new) +++ return -ENOMEM; +++ memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); +++ +++ /* bpf_map_update_elem() can be called in_irq() */ +++ raw_spin_lock_irqsave(&b->lock, flags); +++ +++ l_old = lookup_elem_raw(head, hash, key, key_size); +++ +++ ret = check_flags(htab, l_old, map_flags); +++ if (ret) +++ goto err; +++ +++ /* add new element to the head of the list, so that +++ * concurrent search will find it before old elem +++ */ +++ hlist_nulls_add_head_rcu(&l_new->hash_node, head); +++ if (l_old) { +++ bpf_lru_node_set_ref(&l_new->lru_node); +++ hlist_nulls_del_rcu(&l_old->hash_node); +++ } +++ ret = 0; +++ +++err: +++ raw_spin_unlock_irqrestore(&b->lock, flags); +++ +++ if (ret) +++ bpf_lru_push_free(&htab->lru, &l_new->lru_node); +++ else if (l_old) +++ bpf_lru_push_free(&htab->lru, &l_old->lru_node); +++ +++ return ret; +++} +++ +++static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, +++ void *value, u64 map_flags, +++ bool onallcpus) +++{ +++ struct bpf_htab *htab = container_of(map, struct bpf_htab, map); +++ struct htab_elem *l_new = NULL, *l_old; +++ struct hlist_nulls_head *head; +++ unsigned long flags; +++ struct bucket *b; +++ u32 key_size, hash; +++ int ret; +++ +++ if (unlikely(map_flags > BPF_EXIST)) +++ /* unknown flags */ +++ return -EINVAL; +++ +++ WARN_ON_ONCE(!rcu_read_lock_held()); +++ +++ key_size = map->key_size; +++ +++ hash = htab_map_hash(key, key_size, htab->hashrnd); +++ +++ b = __select_bucket(htab, hash); +++ head = &b->head; +++ +++ /* bpf_map_update_elem() can be called in_irq() */ +++ raw_spin_lock_irqsave(&b->lock, flags); +++ +++ l_old = lookup_elem_raw(head, hash, key, key_size); +++ +++ ret = check_flags(htab, l_old, map_flags); +++ if (ret) +++ goto err; +++ ++ if (l_old) { ++- hlist_del_rcu(&l_old->hash_node); ++- kfree_rcu(l_old, rcu); +++ /* per-cpu hash map can update value in-place */ +++ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), +++ value, onallcpus); ++ } else { ++- htab->count++; +++ l_new = alloc_htab_elem(htab, key, value, key_size, +++ hash, true, onallcpus, NULL); +++ if (IS_ERR(l_new)) { +++ ret = PTR_ERR(l_new); +++ goto err; +++ } +++ hlist_nulls_add_head_rcu(&l_new->hash_node, head); ++ } ++- raw_spin_unlock_irqrestore(&htab->lock, flags); +++ ret = 0; +++err: +++ raw_spin_unlock_irqrestore(&b->lock, flags); +++ return ret; +++} ++ ++- return 0; +++static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, +++ void *value, u64 map_flags, +++ bool onallcpus) +++{ +++ struct bpf_htab *htab = container_of(map, struct bpf_htab, map); +++ struct htab_elem *l_new = NULL, *l_old; +++ struct hlist_nulls_head *head; +++ unsigned long flags; +++ struct bucket *b; +++ u32 key_size, hash; +++ int ret; +++ +++ if (unlikely(map_flags > BPF_EXIST)) +++ /* unknown flags */ +++ return -EINVAL; +++ +++ WARN_ON_ONCE(!rcu_read_lock_held()); +++ +++ key_size = map->key_size; +++ +++ hash = htab_map_hash(key, key_size, htab->hashrnd); +++ +++ b = __select_bucket(htab, hash); +++ head = &b->head; +++ +++ /* For LRU, we need to alloc before taking bucket's +++ * spinlock because LRU's elem alloc may need +++ * to remove older elem from htab and this removal +++ * operation will need a bucket lock. 
+++ */ +++ if (map_flags != BPF_EXIST) { +++ l_new = prealloc_lru_pop(htab, key, hash); +++ if (!l_new) +++ return -ENOMEM; +++ } +++ +++ /* bpf_map_update_elem() can be called in_irq() */ +++ raw_spin_lock_irqsave(&b->lock, flags); +++ +++ l_old = lookup_elem_raw(head, hash, key, key_size); +++ +++ ret = check_flags(htab, l_old, map_flags); +++ if (ret) +++ goto err; +++ +++ if (l_old) { +++ bpf_lru_node_set_ref(&l_old->lru_node); +++ +++ /* per-cpu hash map can update value in-place */ +++ pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), +++ value, onallcpus); +++ } else { +++ pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), +++ value, onallcpus); +++ hlist_nulls_add_head_rcu(&l_new->hash_node, head); +++ l_new = NULL; +++ } +++ ret = 0; ++ err: ++- raw_spin_unlock_irqrestore(&htab->lock, flags); ++- kfree(l_new); +++ raw_spin_unlock_irqrestore(&b->lock, flags); +++ if (l_new) +++ bpf_lru_push_free(&htab->lru, &l_new->lru_node); ++ return ret; ++ } ++ +++static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, +++ void *value, u64 map_flags) +++{ +++ return __htab_percpu_map_update_elem(map, key, value, map_flags, false); +++} +++ +++static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, +++ void *value, u64 map_flags) +++{ +++ return __htab_lru_percpu_map_update_elem(map, key, value, map_flags, +++ false); +++} +++ ++ /* Called from syscall or from eBPF program */ ++ static int htab_map_delete_elem(struct bpf_map *map, void *key) ++ { ++ struct bpf_htab *htab = container_of(map, struct bpf_htab, map); ++- struct hlist_head *head; +++ struct hlist_nulls_head *head; +++ struct bucket *b; ++ struct htab_elem *l; ++ unsigned long flags; ++ u32 hash, key_size; ++@@ -309,22 +1142,54 @@ static int htab_map_delete_elem(struct b ++ ++ key_size = map->key_size; ++ ++- hash = htab_map_hash(key, key_size); +++ hash = htab_map_hash(key, key_size, htab->hashrnd); +++ b = __select_bucket(htab, hash); +++ head = &b->head; ++ ++- raw_spin_lock_irqsave(&htab->lock, flags); +++ raw_spin_lock_irqsave(&b->lock, flags); ++ ++- head = select_bucket(htab, hash); +++ l = lookup_elem_raw(head, hash, key, key_size); +++ +++ if (l) { +++ hlist_nulls_del_rcu(&l->hash_node); +++ free_htab_elem(htab, l); +++ ret = 0; +++ } +++ +++ raw_spin_unlock_irqrestore(&b->lock, flags); +++ return ret; +++} +++ +++static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) +++{ +++ struct bpf_htab *htab = container_of(map, struct bpf_htab, map); +++ struct hlist_nulls_head *head; +++ struct bucket *b; +++ struct htab_elem *l; +++ unsigned long flags; +++ u32 hash, key_size; +++ int ret = -ENOENT; +++ +++ WARN_ON_ONCE(!rcu_read_lock_held()); +++ +++ key_size = map->key_size; +++ +++ hash = htab_map_hash(key, key_size, htab->hashrnd); +++ b = __select_bucket(htab, hash); +++ head = &b->head; +++ +++ raw_spin_lock_irqsave(&b->lock, flags); ++ ++ l = lookup_elem_raw(head, hash, key, key_size); ++ ++ if (l) { ++- hlist_del_rcu(&l->hash_node); ++- htab->count--; ++- kfree_rcu(l, rcu); +++ hlist_nulls_del_rcu(&l->hash_node); ++ ret = 0; ++ } ++ ++- raw_spin_unlock_irqrestore(&htab->lock, flags); +++ raw_spin_unlock_irqrestore(&b->lock, flags); +++ if (l) +++ bpf_lru_push_free(&htab->lru, &l->lru_node); ++ return ret; ++ } ++ ++@@ -333,14 +1198,13 @@ static void delete_all_elements(struct b ++ int i; ++ ++ for (i = 0; i < htab->n_buckets; i++) { ++- struct hlist_head *head = select_bucket(htab, i); ++- struct hlist_node *n; +++ struct hlist_nulls_head *head = 
select_bucket(htab, i); +++ struct hlist_nulls_node *n; ++ struct htab_elem *l; ++ ++- hlist_for_each_entry_safe(l, n, head, hash_node) { ++- hlist_del_rcu(&l->hash_node); ++- htab->count--; ++- kfree(l); +++ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { +++ hlist_nulls_del_rcu(&l->hash_node); +++ htab_elem_free(htab, l); ++ } ++ } ++ } ++@@ -357,31 +1221,320 @@ static void htab_map_free(struct bpf_map ++ */ ++ synchronize_rcu(); ++ ++- /* some of kfree_rcu() callbacks for elements of this map may not have ++- * executed. It's ok. Proceed to free residual elements and map itself +++ /* some of free_htab_elem() callbacks for elements of this map may +++ * not have executed. Wait for them. ++ */ ++- delete_all_elements(htab); ++- kvfree(htab->buckets); +++ rcu_barrier(); +++ if (!htab_is_prealloc(htab)) +++ delete_all_elements(htab); +++ else +++ prealloc_destroy(htab); +++ +++ free_percpu(htab->extra_elems); +++ bpf_map_area_free(htab->buckets); ++ kfree(htab); ++ } ++ ++-static const struct bpf_map_ops htab_ops = { +++static void htab_map_seq_show_elem(struct bpf_map *map, void *key, +++ struct seq_file *m) +++{ +++ void *value; +++ +++ rcu_read_lock(); +++ +++ value = htab_map_lookup_elem(map, key); +++ if (!value) { +++ rcu_read_unlock(); +++ return; +++ } +++ +++ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); +++ seq_puts(m, ": "); +++ btf_type_seq_show(map->btf, map->btf_value_type_id, value, m); +++ seq_puts(m, "\n"); +++ +++ rcu_read_unlock(); +++} +++ +++const struct bpf_map_ops htab_map_ops = { +++ .map_alloc_check = htab_map_alloc_check, ++ .map_alloc = htab_map_alloc, ++ .map_free = htab_map_free, ++ .map_get_next_key = htab_map_get_next_key, ++ .map_lookup_elem = htab_map_lookup_elem, ++ .map_update_elem = htab_map_update_elem, ++ .map_delete_elem = htab_map_delete_elem, +++ .map_gen_lookup = htab_map_gen_lookup, +++ .map_seq_show_elem = htab_map_seq_show_elem, ++ }; ++ ++-static struct bpf_map_type_list htab_type __read_mostly = { ++- .ops = &htab_ops, ++- .type = BPF_MAP_TYPE_HASH, +++const struct bpf_map_ops htab_lru_map_ops = { +++ .map_alloc_check = htab_map_alloc_check, +++ .map_alloc = htab_map_alloc, +++ .map_free = htab_map_free, +++ .map_get_next_key = htab_map_get_next_key, +++ .map_lookup_elem = htab_lru_map_lookup_elem, +++ .map_lookup_elem_sys_only = htab_lru_map_lookup_elem_sys, +++ .map_update_elem = htab_lru_map_update_elem, +++ .map_delete_elem = htab_lru_map_delete_elem, +++ .map_gen_lookup = htab_lru_map_gen_lookup, +++ .map_seq_show_elem = htab_map_seq_show_elem, ++ }; ++ ++-static int __init register_htab_map(void) +++/* Called from eBPF program */ +++static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) ++ { ++- bpf_register_map_type(&htab_type); ++- return 0; +++ struct htab_elem *l = __htab_map_lookup_elem(map, key); +++ +++ if (l) +++ return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); +++ else +++ return NULL; ++ } ++-late_initcall(register_htab_map); +++ +++static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key) +++{ +++ struct htab_elem *l = __htab_map_lookup_elem(map, key); +++ +++ if (l) { +++ bpf_lru_node_set_ref(&l->lru_node); +++ return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); +++ } +++ +++ return NULL; +++} +++ +++int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +++{ +++ struct htab_elem *l; +++ void __percpu *pptr; +++ int ret = -ENOENT; +++ int cpu, off = 0; +++ u32 size; +++ +++ /* per_cpu areas are zero-filled and bpf programs can only +++ * 
access 'value_size' of them, so copying rounded areas +++ * will not leak any kernel data +++ */ +++ size = round_up(map->value_size, 8); +++ rcu_read_lock(); +++ l = __htab_map_lookup_elem(map, key); +++ if (!l) +++ goto out; +++ /* We do not mark LRU map element here in order to not mess up +++ * eviction heuristics when user space does a map walk. +++ */ +++ pptr = htab_elem_get_ptr(l, map->key_size); +++ for_each_possible_cpu(cpu) { +++ bpf_long_memcpy(value + off, +++ per_cpu_ptr(pptr, cpu), size); +++ off += size; +++ } +++ ret = 0; +++out: +++ rcu_read_unlock(); +++ return ret; +++} +++ +++int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, +++ u64 map_flags) +++{ +++ struct bpf_htab *htab = container_of(map, struct bpf_htab, map); +++ int ret; +++ +++ rcu_read_lock(); +++ if (htab_is_lru(htab)) +++ ret = __htab_lru_percpu_map_update_elem(map, key, value, +++ map_flags, true); +++ else +++ ret = __htab_percpu_map_update_elem(map, key, value, map_flags, +++ true); +++ rcu_read_unlock(); +++ +++ return ret; +++} +++ +++static void htab_percpu_map_seq_show_elem(struct bpf_map *map, void *key, +++ struct seq_file *m) +++{ +++ struct htab_elem *l; +++ void __percpu *pptr; +++ int cpu; +++ +++ rcu_read_lock(); +++ +++ l = __htab_map_lookup_elem(map, key); +++ if (!l) { +++ rcu_read_unlock(); +++ return; +++ } +++ +++ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); +++ seq_puts(m, ": {\n"); +++ pptr = htab_elem_get_ptr(l, map->key_size); +++ for_each_possible_cpu(cpu) { +++ seq_printf(m, "\tcpu%d: ", cpu); +++ btf_type_seq_show(map->btf, map->btf_value_type_id, +++ per_cpu_ptr(pptr, cpu), m); +++ seq_puts(m, "\n"); +++ } +++ seq_puts(m, "}\n"); +++ +++ rcu_read_unlock(); +++} +++ +++const struct bpf_map_ops htab_percpu_map_ops = { +++ .map_alloc_check = htab_map_alloc_check, +++ .map_alloc = htab_map_alloc, +++ .map_free = htab_map_free, +++ .map_get_next_key = htab_map_get_next_key, +++ .map_lookup_elem = htab_percpu_map_lookup_elem, +++ .map_update_elem = htab_percpu_map_update_elem, +++ .map_delete_elem = htab_map_delete_elem, +++ .map_seq_show_elem = htab_percpu_map_seq_show_elem, +++}; +++ +++const struct bpf_map_ops htab_lru_percpu_map_ops = { +++ .map_alloc_check = htab_map_alloc_check, +++ .map_alloc = htab_map_alloc, +++ .map_free = htab_map_free, +++ .map_get_next_key = htab_map_get_next_key, +++ .map_lookup_elem = htab_lru_percpu_map_lookup_elem, +++ .map_update_elem = htab_lru_percpu_map_update_elem, +++ .map_delete_elem = htab_lru_map_delete_elem, +++ .map_seq_show_elem = htab_percpu_map_seq_show_elem, +++}; +++ +++static int fd_htab_map_alloc_check(union bpf_attr *attr) +++{ +++ if (attr->value_size != sizeof(u32)) +++ return -EINVAL; +++ return htab_map_alloc_check(attr); +++} +++ +++static void fd_htab_map_free(struct bpf_map *map) +++{ +++ struct bpf_htab *htab = container_of(map, struct bpf_htab, map); +++ struct hlist_nulls_node *n; +++ struct hlist_nulls_head *head; +++ struct htab_elem *l; +++ int i; +++ +++ for (i = 0; i < htab->n_buckets; i++) { +++ head = select_bucket(htab, i); +++ +++ hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { +++ void *ptr = fd_htab_map_get_ptr(map, l); +++ +++ map->ops->map_fd_put_ptr(ptr); +++ } +++ } +++ +++ htab_map_free(map); +++} +++ +++/* only called from syscall */ +++int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value) +++{ +++ void **ptr; +++ int ret = 0; +++ +++ if (!map->ops->map_fd_sys_lookup_elem) +++ return -ENOTSUPP; +++ +++ rcu_read_lock(); +++ ptr = 
htab_map_lookup_elem(map, key); +++ if (ptr) +++ *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr)); +++ else +++ ret = -ENOENT; +++ rcu_read_unlock(); +++ +++ return ret; +++} +++ +++/* only called from syscall */ +++int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, +++ void *key, void *value, u64 map_flags) +++{ +++ void *ptr; +++ int ret; +++ u32 ufd = *(u32 *)value; +++ +++ ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); +++ if (IS_ERR(ptr)) +++ return PTR_ERR(ptr); +++ +++ ret = htab_map_update_elem(map, key, &ptr, map_flags); +++ if (ret) +++ map->ops->map_fd_put_ptr(ptr); +++ +++ return ret; +++} +++ +++static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) +++{ +++ struct bpf_map *map, *inner_map_meta; +++ +++ inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); +++ if (IS_ERR(inner_map_meta)) +++ return inner_map_meta; +++ +++ map = htab_map_alloc(attr); +++ if (IS_ERR(map)) { +++ bpf_map_meta_free(inner_map_meta); +++ return map; +++ } +++ +++ map->inner_map_meta = inner_map_meta; +++ +++ return map; +++} +++ +++static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) +++{ +++ struct bpf_map **inner_map = htab_map_lookup_elem(map, key); +++ +++ if (!inner_map) +++ return NULL; +++ +++ return READ_ONCE(*inner_map); +++} +++ +++static u32 htab_of_map_gen_lookup(struct bpf_map *map, +++ struct bpf_insn *insn_buf) +++{ +++ struct bpf_insn *insn = insn_buf; +++ const int ret = BPF_REG_0; +++ +++ BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem, +++ (void *(*)(struct bpf_map *map, void *key))NULL)); +++ *insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem)); +++ *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2); +++ *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, +++ offsetof(struct htab_elem, key) + +++ round_up(map->key_size, 8)); +++ *insn++ = BPF_LDX_MEM(BPF_DW, ret, ret, 0); +++ +++ return insn - insn_buf; +++} +++ +++static void htab_of_map_free(struct bpf_map *map) +++{ +++ bpf_map_meta_free(map->inner_map_meta); +++ fd_htab_map_free(map); +++} +++ +++const struct bpf_map_ops htab_of_maps_map_ops = { +++ .map_alloc_check = fd_htab_map_alloc_check, +++ .map_alloc = htab_of_map_alloc, +++ .map_free = htab_of_map_free, +++ .map_get_next_key = htab_map_get_next_key, +++ .map_lookup_elem = htab_of_map_lookup_elem, +++ .map_delete_elem = htab_map_delete_elem, +++ .map_fd_get_ptr = bpf_map_fd_get_ptr, +++ .map_fd_put_ptr = bpf_map_fd_put_ptr, +++ .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem, +++ .map_gen_lookup = htab_of_map_gen_lookup, +++ .map_check_btf = map_check_no_btf, +++}; ++--- a/kernel/bpf/helpers.c +++++ b/kernel/bpf/helpers.c ++@@ -1,21 +1,18 @@ +++// SPDX-License-Identifier: GPL-2.0-only ++ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com ++- * ++- * This program is free software; you can redistribute it and/or ++- * modify it under the terms of version 2 of the GNU General Public ++- * License as published by the Free Software Foundation. ++- * ++- * This program is distributed in the hope that it will be useful, but ++- * WITHOUT ANY WARRANTY; without even the implied warranty of ++- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++- * General Public License for more details. 
++ */ ++ #include ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include +++#include +++#include +++ +++#include "../../lib/kstrtox.h" ++ ++ /* If kernel subsystem is allowing eBPF programs to call this function, ++ * inside its own verifier_ops->get_func_proto() callback it should return ++@@ -26,48 +23,32 @@ ++ * if program is allowed to access maps, so check rcu_read_lock_held in ++ * all three functions. ++ */ ++-static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +++BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) ++ { ++- /* verifier checked that R1 contains a valid pointer to bpf_map ++- * and R2 points to a program stack and map->key_size bytes were ++- * initialized ++- */ ++- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; ++- void *key = (void *) (unsigned long) r2; ++- void *value; ++- ++ WARN_ON_ONCE(!rcu_read_lock_held()); ++- ++- value = map->ops->map_lookup_elem(map, key); ++- ++- /* lookup() returns either pointer to element value or NULL ++- * which is the meaning of PTR_TO_MAP_VALUE_OR_NULL type ++- */ ++- return (unsigned long) value; +++ return (unsigned long) map->ops->map_lookup_elem(map, key); ++ } ++ ++ const struct bpf_func_proto bpf_map_lookup_elem_proto = { ++ .func = bpf_map_lookup_elem, ++ .gpl_only = false, +++ .pkt_access = true, ++ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, ++ .arg1_type = ARG_CONST_MAP_PTR, ++ .arg2_type = ARG_PTR_TO_MAP_KEY, ++ }; ++ ++-static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +++BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, +++ void *, value, u64, flags) ++ { ++- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; ++- void *key = (void *) (unsigned long) r2; ++- void *value = (void *) (unsigned long) r3; ++- ++ WARN_ON_ONCE(!rcu_read_lock_held()); ++- ++- return map->ops->map_update_elem(map, key, value, r4); +++ return map->ops->map_update_elem(map, key, value, flags); ++ } ++ ++ const struct bpf_func_proto bpf_map_update_elem_proto = { ++ .func = bpf_map_update_elem, ++ .gpl_only = false, +++ .pkt_access = true, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_CONST_MAP_PTR, ++ .arg2_type = ARG_PTR_TO_MAP_KEY, ++@@ -75,33 +56,71 @@ const struct bpf_func_proto bpf_map_upda ++ .arg4_type = ARG_ANYTHING, ++ }; ++ ++-static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +++BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) ++ { ++- struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; ++- void *key = (void *) (unsigned long) r2; ++- ++ WARN_ON_ONCE(!rcu_read_lock_held()); ++- ++ return map->ops->map_delete_elem(map, key); ++ } ++ ++ const struct bpf_func_proto bpf_map_delete_elem_proto = { ++ .func = bpf_map_delete_elem, ++ .gpl_only = false, +++ .pkt_access = true, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_CONST_MAP_PTR, ++ .arg2_type = ARG_PTR_TO_MAP_KEY, ++ }; ++ +++BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) +++{ +++ return map->ops->map_push_elem(map, value, flags); +++} +++ +++const struct bpf_func_proto bpf_map_push_elem_proto = { +++ .func = bpf_map_push_elem, +++ .gpl_only = false, +++ .pkt_access = true, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_CONST_MAP_PTR, +++ .arg2_type = ARG_PTR_TO_MAP_VALUE, +++ .arg3_type = ARG_ANYTHING, +++}; +++ +++BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) +++{ +++ return map->ops->map_pop_elem(map, value); +++} +++ +++const struct bpf_func_proto 
bpf_map_pop_elem_proto = { +++ .func = bpf_map_pop_elem, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_CONST_MAP_PTR, +++ .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, +++}; +++ +++BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) +++{ +++ return map->ops->map_peek_elem(map, value); +++} +++ +++const struct bpf_func_proto bpf_map_peek_elem_proto = { +++ .func = bpf_map_peek_elem, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_CONST_MAP_PTR, +++ .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, +++}; +++ ++ const struct bpf_func_proto bpf_get_prandom_u32_proto = { ++ .func = bpf_user_rnd_u32, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ }; ++ ++-static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +++BPF_CALL_0(bpf_get_smp_processor_id) ++ { ++- return raw_smp_processor_id(); +++ return smp_processor_id(); ++ } ++ ++ const struct bpf_func_proto bpf_get_smp_processor_id_proto = { ++@@ -110,7 +129,18 @@ const struct bpf_func_proto bpf_get_smp_ ++ .ret_type = RET_INTEGER, ++ }; ++ ++-static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +++BPF_CALL_0(bpf_get_numa_node_id) +++{ +++ return numa_node_id(); +++} +++ +++const struct bpf_func_proto bpf_get_numa_node_id_proto = { +++ .func = bpf_get_numa_node_id, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++}; +++ +++BPF_CALL_0(bpf_ktime_get_ns) ++ { ++ /* NMI safe access to clock monotonic */ ++ return ktime_get_mono_fast_ns(); ++@@ -122,11 +152,11 @@ const struct bpf_func_proto bpf_ktime_ge ++ .ret_type = RET_INTEGER, ++ }; ++ ++-static u64 bpf_get_current_pid_tgid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +++BPF_CALL_0(bpf_get_current_pid_tgid) ++ { ++ struct task_struct *task = current; ++ ++- if (!task) +++ if (unlikely(!task)) ++ return -EINVAL; ++ ++ return (u64) task->tgid << 32 | task->pid; ++@@ -138,18 +168,18 @@ const struct bpf_func_proto bpf_get_curr ++ .ret_type = RET_INTEGER, ++ }; ++ ++-static u64 bpf_get_current_uid_gid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +++BPF_CALL_0(bpf_get_current_uid_gid) ++ { ++ struct task_struct *task = current; ++ kuid_t uid; ++ kgid_t gid; ++ ++- if (!task) +++ if (unlikely(!task)) ++ return -EINVAL; ++ ++ current_uid_gid(&uid, &gid); ++ return (u64) from_kgid(&init_user_ns, gid) << 32 | ++- from_kuid(&init_user_ns, uid); +++ from_kuid(&init_user_ns, uid); ++ } ++ ++ const struct bpf_func_proto bpf_get_current_uid_gid_proto = { ++@@ -158,22 +188,254 @@ const struct bpf_func_proto bpf_get_curr ++ .ret_type = RET_INTEGER, ++ }; ++ ++-static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) +++BPF_CALL_2(bpf_get_current_comm, char *, buf, u32, size) ++ { ++ struct task_struct *task = current; ++- char *buf = (char *) (long) r1; ++ ++- if (!task) ++- return -EINVAL; +++ if (unlikely(!task)) +++ goto err_clear; +++ +++ strncpy(buf, task->comm, size); ++ ++- strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); +++ /* Verifier guarantees that size > 0. For task->comm exceeding +++ * size, guarantee that buf is %NUL-terminated. Unconditionally +++ * done here to save the size test. 
+++ */ +++ buf[size - 1] = 0; ++ return 0; +++err_clear: +++ memset(buf, 0, size); +++ return -EINVAL; ++ } ++ ++ const struct bpf_func_proto bpf_get_current_comm_proto = { ++ .func = bpf_get_current_comm, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++- .arg1_type = ARG_PTR_TO_STACK, ++- .arg2_type = ARG_CONST_STACK_SIZE, +++ .arg1_type = ARG_PTR_TO_UNINIT_MEM, +++ .arg2_type = ARG_CONST_SIZE, +++}; +++ +++#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) +++ +++static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +++{ +++ arch_spinlock_t *l = (void *)lock; +++ union { +++ __u32 val; +++ arch_spinlock_t lock; +++ } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; +++ +++ compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); +++ BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); +++ BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); +++ arch_spin_lock(l); +++} +++ +++static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +++{ +++ arch_spinlock_t *l = (void *)lock; +++ +++ arch_spin_unlock(l); +++} +++ +++#else +++ +++static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +++{ +++ atomic_t *l = (void *)lock; +++ +++ BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); +++ do { +++ smp_cond_load_relaxed(&l->counter, !VAL); +++ } while (atomic_xchg(l, 1)); +++} +++ +++static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +++{ +++ atomic_t *l = (void *)lock; +++ +++ atomic_set_release(l, 0); +++} +++ +++#endif +++ +++static DEFINE_PER_CPU(unsigned long, irqsave_flags); +++ +++notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +++{ +++ unsigned long flags; +++ +++ local_irq_save(flags); +++ __bpf_spin_lock(lock); +++ __this_cpu_write(irqsave_flags, flags); +++ return 0; +++} +++ +++const struct bpf_func_proto bpf_spin_lock_proto = { +++ .func = bpf_spin_lock, +++ .gpl_only = false, +++ .ret_type = RET_VOID, +++ .arg1_type = ARG_PTR_TO_SPIN_LOCK, +++}; +++ +++notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +++{ +++ unsigned long flags; +++ +++ flags = __this_cpu_read(irqsave_flags); +++ __bpf_spin_unlock(lock); +++ local_irq_restore(flags); +++ return 0; +++} +++ +++const struct bpf_func_proto bpf_spin_unlock_proto = { +++ .func = bpf_spin_unlock, +++ .gpl_only = false, +++ .ret_type = RET_VOID, +++ .arg1_type = ARG_PTR_TO_SPIN_LOCK, +++}; +++ +++void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, +++ bool lock_src) +++{ +++ struct bpf_spin_lock *lock; +++ +++ if (lock_src) +++ lock = src + map->spin_lock_off; +++ else +++ lock = dst + map->spin_lock_off; +++ preempt_disable(); +++ ____bpf_spin_lock(lock); +++ copy_map_value(map, dst, src); +++ ____bpf_spin_unlock(lock); +++ preempt_enable(); +++} +++ +++#define BPF_STRTOX_BASE_MASK 0x1F +++ +++static int __bpf_strtoull(const char *buf, size_t buf_len, u64 flags, +++ unsigned long long *res, bool *is_negative) +++{ +++ unsigned int base = flags & BPF_STRTOX_BASE_MASK; +++ const char *cur_buf = buf; +++ size_t cur_len = buf_len; +++ unsigned int consumed; +++ size_t val_len; +++ char str[64]; +++ +++ if (!buf || !buf_len || !res || !is_negative) +++ return -EINVAL; +++ +++ if (base != 0 && base != 8 && base != 10 && base != 16) +++ return -EINVAL; +++ +++ if (flags & ~BPF_STRTOX_BASE_MASK) +++ return -EINVAL; +++ +++ while (cur_buf < buf + buf_len && isspace(*cur_buf)) +++ ++cur_buf; +++ +++ *is_negative = (cur_buf < buf + buf_len && *cur_buf == '-'); +++ if (*is_negative) +++ ++cur_buf; +++ +++ consumed = cur_buf - buf; +++ cur_len -= consumed; 
+++ if (!cur_len) +++ return -EINVAL; +++ +++ cur_len = min(cur_len, sizeof(str) - 1); +++ memcpy(str, cur_buf, cur_len); +++ str[cur_len] = '\0'; +++ cur_buf = str; +++ +++ cur_buf = _parse_integer_fixup_radix(cur_buf, &base); +++ val_len = _parse_integer(cur_buf, base, res); +++ +++ if (val_len & KSTRTOX_OVERFLOW) +++ return -ERANGE; +++ +++ if (val_len == 0) +++ return -EINVAL; +++ +++ cur_buf += val_len; +++ consumed += cur_buf - str; +++ +++ return consumed; +++} +++ +++static int __bpf_strtoll(const char *buf, size_t buf_len, u64 flags, +++ long long *res) +++{ +++ unsigned long long _res; +++ bool is_negative; +++ int err; +++ +++ err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); +++ if (err < 0) +++ return err; +++ if (is_negative) { +++ if ((long long)-_res > 0) +++ return -ERANGE; +++ *res = -_res; +++ } else { +++ if ((long long)_res < 0) +++ return -ERANGE; +++ *res = _res; +++ } +++ return err; +++} +++ +++BPF_CALL_4(bpf_strtol, const char *, buf, size_t, buf_len, u64, flags, +++ long *, res) +++{ +++ long long _res; +++ int err; +++ +++ err = __bpf_strtoll(buf, buf_len, flags, &_res); +++ if (err < 0) +++ return err; +++ if (_res != (long)_res) +++ return -ERANGE; +++ *res = _res; +++ return err; +++} +++ +++const struct bpf_func_proto bpf_strtol_proto = { +++ .func = bpf_strtol, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_MEM, +++ .arg2_type = ARG_CONST_SIZE, +++ .arg3_type = ARG_ANYTHING, +++ .arg4_type = ARG_PTR_TO_LONG, +++}; +++ +++BPF_CALL_4(bpf_strtoul, const char *, buf, size_t, buf_len, u64, flags, +++ unsigned long *, res) +++{ +++ unsigned long long _res; +++ bool is_negative; +++ int err; +++ +++ err = __bpf_strtoull(buf, buf_len, flags, &_res, &is_negative); +++ if (err < 0) +++ return err; +++ if (is_negative) +++ return -EINVAL; +++ if (_res != (unsigned long)_res) +++ return -ERANGE; +++ *res = _res; +++ return err; +++} +++ +++const struct bpf_func_proto bpf_strtoul_proto = { +++ .func = bpf_strtoul, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_MEM, +++ .arg2_type = ARG_CONST_SIZE, +++ .arg3_type = ARG_ANYTHING, +++ .arg4_type = ARG_PTR_TO_LONG, ++ }; ++--- a/kernel/bpf/inode.c +++++ b/kernel/bpf/inode.c ++@@ -1,3 +1,4 @@ +++// SPDX-License-Identifier: GPL-2.0-only ++ /* ++ * Minimal file system backend for holding eBPF maps and programs, ++ * used by bpf(2) object pinning. ++@@ -5,21 +6,19 @@ ++ * Authors: ++ * ++ * Daniel Borkmann ++- * ++- * This program is free software; you can redistribute it and/or ++- * modify it under the terms of the GNU General Public License ++- * version 2 as published by the Free Software Foundation. 
++ */ ++ ++-#include +++#include ++ #include ++ #include ++ #include ++ #include ++ #include ++ #include +++#include ++ #include ++ #include +++#include ++ ++ enum bpf_type { ++ BPF_TYPE_UNSPEC = 0, ++@@ -87,6 +86,7 @@ static struct inode *bpf_get_inode(struc ++ switch (mode & S_IFMT) { ++ case S_IFDIR: ++ case S_IFREG: +++ case S_IFLNK: ++ break; ++ default: ++ return ERR_PTR(-EINVAL); ++@@ -119,18 +119,20 @@ static int bpf_inode_type(const struct i ++ return 0; ++ } ++ ++-static bool bpf_dname_reserved(const struct dentry *dentry) +++static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, +++ struct inode *dir) ++ { ++- return strchr(dentry->d_name.name, '.'); +++ d_instantiate(dentry, inode); +++ dget(dentry); +++ +++ dir->i_mtime = CURRENT_TIME; +++ dir->i_ctime = dir->i_mtime; ++ } ++ ++ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ++ { ++ struct inode *inode; ++ ++- if (bpf_dname_reserved(dentry)) ++- return -EPERM; ++- ++ inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++@@ -141,30 +143,30 @@ static int bpf_mkdir(struct inode *dir, ++ inc_nlink(inode); ++ inc_nlink(dir); ++ ++- d_instantiate(dentry, inode); ++- dget(dentry); ++- +++ bpf_dentry_finalize(dentry, inode, dir); ++ return 0; ++ } ++ ++-static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, ++- umode_t mode, const struct inode_operations *iops) +++static int bpffs_obj_open(struct inode *inode, struct file *file) ++ { ++- struct inode *inode; +++ return -EIO; +++} ++ ++- if (bpf_dname_reserved(dentry)) ++- return -EPERM; +++static const struct file_operations bpffs_obj_fops = { +++ .open = bpffs_obj_open, +++}; ++ ++- inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); +++static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, +++ umode_t mode, const struct inode_operations *iops) +++{ +++ struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ inode->i_op = iops; ++ inode->i_private = dentry->d_fsdata; ++ ++- d_instantiate(dentry, inode); ++- dget(dentry); ++- +++ bpf_dentry_finalize(dentry, inode, dir); ++ return 0; ++ } ++ ++@@ -187,11 +189,48 @@ static int bpf_mkobj(struct inode *dir, ++ } ++ } ++ +++static struct dentry * +++bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) +++{ +++ /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future +++ * extensions. 
+++ */ +++ if (strchr(dentry->d_name.name, '.')) +++ return ERR_PTR(-EPERM); +++ +++ return simple_lookup(dir, dentry, flags); +++} +++ +++static int bpf_symlink(struct inode *dir, struct dentry *dentry, +++ const char *target) +++{ +++ char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); +++ struct inode *inode; +++ +++ if (!link) +++ return -ENOMEM; +++ +++ inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); +++ if (IS_ERR(inode)) { +++ kfree(link); +++ return PTR_ERR(inode); +++ } +++ +++ inode->i_op = &simple_symlink_inode_operations; +++ inode->i_link = link; +++ +++ bpf_dentry_finalize(dentry, inode, dir); +++ return 0; +++} +++ ++ static const struct inode_operations bpf_dir_iops = { ++- .lookup = simple_lookup, +++ .lookup = bpf_lookup, ++ .mknod = bpf_mkobj, ++ .mkdir = bpf_mkdir, +++ .symlink = bpf_symlink, ++ .rmdir = simple_rmdir, +++ .rename = simple_rename, +++ .link = simple_link, ++ .unlink = simple_unlink, ++ }; ++ ++@@ -256,7 +295,7 @@ out: ++ } ++ ++ static void *bpf_obj_do_get(const struct filename *pathname, ++- enum bpf_type *type) +++ enum bpf_type *type, int flags) ++ { ++ struct inode *inode; ++ struct path path; ++@@ -268,7 +307,7 @@ static void *bpf_obj_do_get(const struct ++ return ERR_PTR(ret); ++ ++ inode = d_backing_inode(path.dentry); ++- ret = inode_permission(inode, MAY_WRITE); +++ ret = inode_permission(inode, ACC_MODE(flags)); ++ if (ret) ++ goto out; ++ ++@@ -287,18 +326,23 @@ out: ++ return ERR_PTR(ret); ++ } ++ ++-int bpf_obj_get_user(const char __user *pathname) +++int bpf_obj_get_user(const char __user *pathname, int flags) ++ { ++ enum bpf_type type = BPF_TYPE_UNSPEC; ++ struct filename *pname; ++ int ret = -ENOENT; +++ int f_flags; ++ void *raw; ++ +++ f_flags = bpf_get_file_flag(flags); +++ if (f_flags < 0) +++ return f_flags; +++ ++ pname = getname(pathname); ++ if (IS_ERR(pname)) ++ return PTR_ERR(pname); ++ ++- raw = bpf_obj_do_get(pname, &type); +++ raw = bpf_obj_do_get(pname, &type, f_flags); ++ if (IS_ERR(raw)) { ++ ret = PTR_ERR(raw); ++ goto out; ++@@ -307,7 +351,7 @@ int bpf_obj_get_user(const char __user * ++ if (type == BPF_TYPE_PROG) ++ ret = bpf_prog_new_fd(raw); ++ else if (type == BPF_TYPE_MAP) ++- ret = bpf_map_new_fd(raw); +++ ret = bpf_map_new_fd(raw, f_flags); ++ else ++ goto out; ++ ++@@ -318,29 +362,131 @@ out: ++ return ret; ++ } ++ ++-static void bpf_evict_inode(struct inode *inode) +++static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) ++ { ++- enum bpf_type type; +++ struct bpf_prog *prog; +++ int ret = inode_permission(inode, MAY_READ); +++ if (ret) +++ return ERR_PTR(ret); +++ +++ if (inode->i_op == &bpf_map_iops) +++ return ERR_PTR(-EINVAL); +++ if (inode->i_op != &bpf_prog_iops) +++ return ERR_PTR(-EACCES); ++ ++- truncate_inode_pages_final(&inode->i_data); ++- clear_inode(inode); +++ prog = inode->i_private; ++ +++ if (!bpf_prog_get_ok(prog, &type, false)) +++ return ERR_PTR(-EINVAL); +++ +++ return bpf_prog_inc(prog); +++} +++ +++struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) +++{ +++ struct bpf_prog *prog; +++ struct path path; +++ int ret = kern_path(name, LOOKUP_FOLLOW, &path); +++ if (ret) +++ return ERR_PTR(ret); +++ prog = __get_prog_inode(d_backing_inode(path.dentry), type); +++ if (!IS_ERR(prog)) +++ touch_atime(&path); +++ path_put(&path); +++ return prog; +++} +++EXPORT_SYMBOL(bpf_prog_get_type_path); +++ +++/* +++ * Display the mount options in /proc/mounts. 
+++ */ +++static int bpf_show_options(struct seq_file *m, struct dentry *root) +++{ +++ umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX; +++ +++ if (mode != S_IRWXUGO) +++ seq_printf(m, ",mode=%o", mode); +++ return 0; +++} +++ +++static void bpf_destroy_inode_deferred(struct rcu_head *head) +++{ +++ struct inode *inode = container_of(head, struct inode, i_rcu); +++ enum bpf_type type; +++ +++ if (S_ISLNK(inode->i_mode)) +++ kfree(inode->i_link); ++ if (!bpf_inode_type(inode, &type)) ++ bpf_any_put(inode->i_private, type); +++ free_inode_nonrcu(inode); +++} +++ +++static void bpf_destroy_inode(struct inode *inode) +++{ +++ call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred); ++ } ++ ++ static const struct super_operations bpf_super_ops = { ++ .statfs = simple_statfs, ++ .drop_inode = generic_delete_inode, ++- .evict_inode = bpf_evict_inode, +++ .show_options = bpf_show_options, +++ .destroy_inode = bpf_destroy_inode, +++}; +++ +++enum { +++ OPT_MODE, +++ OPT_ERR, +++}; +++ +++static const match_table_t bpf_mount_tokens = { +++ { OPT_MODE, "mode=%o" }, +++ { OPT_ERR, NULL }, +++}; +++ +++struct bpf_mount_opts { +++ umode_t mode; ++ }; ++ +++static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) +++{ +++ substring_t args[MAX_OPT_ARGS]; +++ int option, token; +++ char *ptr; +++ +++ opts->mode = S_IRWXUGO; +++ +++ while ((ptr = strsep(&data, ",")) != NULL) { +++ if (!*ptr) +++ continue; +++ +++ token = match_token(ptr, bpf_mount_tokens, args); +++ switch (token) { +++ case OPT_MODE: +++ if (match_octal(&args[0], &option)) +++ return -EINVAL; +++ opts->mode = option & S_IALLUGO; +++ break; +++ /* We might like to report bad mount options here, but +++ * traditionally we've ignored all mount options, so we'd +++ * better continue to ignore non-existing options for bpf. 
+++ */ +++ } +++ } +++ +++ return 0; +++} +++ ++ static int bpf_fill_super(struct super_block *sb, void *data, int silent) ++ { ++ static struct tree_descr bpf_rfiles[] = { { "" } }; +++ struct bpf_mount_opts opts; ++ struct inode *inode; ++ int ret; ++ +++ ret = bpf_parse_options(data, &opts); +++ if (ret) +++ return ret; +++ ++ ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); ++ if (ret) ++ return ret; ++@@ -350,7 +496,7 @@ static int bpf_fill_super(struct super_b ++ inode = sb->s_root->d_inode; ++ inode->i_op = &bpf_dir_iops; ++ inode->i_mode &= ~S_IALLUGO; ++- inode->i_mode |= S_ISVTX | S_IRWXUGO; +++ inode->i_mode |= S_ISVTX | opts.mode; ++ ++ return 0; ++ } ++@@ -368,8 +514,6 @@ static struct file_system_type bpf_fs_ty ++ .kill_sb = kill_litter_super, ++ }; ++ ++-MODULE_ALIAS_FS("bpf"); ++- ++ static int __init bpf_init(void) ++ { ++ int ret; ++--- /dev/null +++++ b/kernel/bpf/local_storage.c ++@@ -0,0 +1,600 @@ +++//SPDX-License-Identifier: GPL-2.0 +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++ +++DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); +++ +++#ifdef CONFIG_CGROUP_BPF +++ +++#define LOCAL_STORAGE_CREATE_FLAG_MASK \ +++ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) +++ +++struct bpf_cgroup_storage_map { +++ struct bpf_map map; +++ +++ spinlock_t lock; +++ struct bpf_prog *prog; +++ struct rb_root root; +++ struct list_head list; +++}; +++ +++static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) +++{ +++ return container_of(map, struct bpf_cgroup_storage_map, map); +++} +++ +++static int bpf_cgroup_storage_key_cmp( +++ const struct bpf_cgroup_storage_key *key1, +++ const struct bpf_cgroup_storage_key *key2) +++{ +++ if (key1->cgroup_inode_id < key2->cgroup_inode_id) +++ return -1; +++ else if (key1->cgroup_inode_id > key2->cgroup_inode_id) +++ return 1; +++ else if (key1->attach_type < key2->attach_type) +++ return -1; +++ else if (key1->attach_type > key2->attach_type) +++ return 1; +++ return 0; +++} +++ +++static struct bpf_cgroup_storage *cgroup_storage_lookup( +++ struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, +++ bool locked) +++{ +++ struct rb_root *root = &map->root; +++ struct rb_node *node; +++ +++ if (!locked) +++ spin_lock_bh(&map->lock); +++ +++ node = root->rb_node; +++ while (node) { +++ struct bpf_cgroup_storage *storage; +++ +++ storage = container_of(node, struct bpf_cgroup_storage, node); +++ +++ switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { +++ case -1: +++ node = node->rb_left; +++ break; +++ case 1: +++ node = node->rb_right; +++ break; +++ default: +++ if (!locked) +++ spin_unlock_bh(&map->lock); +++ return storage; +++ } +++ } +++ +++ if (!locked) +++ spin_unlock_bh(&map->lock); +++ +++ return NULL; +++} +++ +++static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, +++ struct bpf_cgroup_storage *storage) +++{ +++ struct rb_root *root = &map->root; +++ struct rb_node **new = &(root->rb_node), *parent = NULL; +++ +++ while (*new) { +++ struct bpf_cgroup_storage *this; +++ +++ this = container_of(*new, struct bpf_cgroup_storage, node); +++ +++ parent = *new; +++ switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { +++ case -1: +++ new = &((*new)->rb_left); +++ break; +++ case 1: +++ new = &((*new)->rb_right); +++ break; +++ default: +++ return -EEXIST; +++ } +++ } +++ +++ rb_link_node(&storage->node, parent, new); +++ rb_insert_color(&storage->node, root); +++ +++ 
return 0; +++} +++ +++static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) +++{ +++ struct bpf_cgroup_storage_map *map = map_to_storage(_map); +++ struct bpf_cgroup_storage_key *key = _key; +++ struct bpf_cgroup_storage *storage; +++ +++ storage = cgroup_storage_lookup(map, key, false); +++ if (!storage) +++ return NULL; +++ +++ return &READ_ONCE(storage->buf)->data[0]; +++} +++ +++static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, +++ void *value, u64 flags) +++{ +++ struct bpf_cgroup_storage_key *key = _key; +++ struct bpf_cgroup_storage *storage; +++ struct bpf_storage_buffer *new; +++ +++ if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) +++ return -EINVAL; +++ +++ if (unlikely(flags & BPF_NOEXIST)) +++ return -EINVAL; +++ +++ if (unlikely((flags & BPF_F_LOCK) && +++ !map_value_has_spin_lock(map))) +++ return -EINVAL; +++ +++ storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, +++ key, false); +++ if (!storage) +++ return -ENOENT; +++ +++ if (flags & BPF_F_LOCK) { +++ copy_map_value_locked(map, storage->buf->data, value, false); +++ return 0; +++ } +++ +++ new = kmalloc_node(sizeof(struct bpf_storage_buffer) + +++ map->value_size, +++ __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, +++ map->numa_node); +++ if (!new) +++ return -ENOMEM; +++ +++ memcpy(&new->data[0], value, map->value_size); +++ check_and_init_map_lock(map, new->data); +++ +++ new = xchg(&storage->buf, new); +++ kfree_rcu(new, rcu); +++ +++ return 0; +++} +++ +++int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, +++ void *value) +++{ +++ struct bpf_cgroup_storage_map *map = map_to_storage(_map); +++ struct bpf_cgroup_storage_key *key = _key; +++ struct bpf_cgroup_storage *storage; +++ int cpu, off = 0; +++ u32 size; +++ +++ rcu_read_lock(); +++ storage = cgroup_storage_lookup(map, key, false); +++ if (!storage) { +++ rcu_read_unlock(); +++ return -ENOENT; +++ } +++ +++ /* per_cpu areas are zero-filled and bpf programs can only +++ * access 'value_size' of them, so copying rounded areas +++ * will not leak any kernel data +++ */ +++ size = round_up(_map->value_size, 8); +++ for_each_possible_cpu(cpu) { +++ bpf_long_memcpy(value + off, +++ per_cpu_ptr(storage->percpu_buf, cpu), size); +++ off += size; +++ } +++ rcu_read_unlock(); +++ return 0; +++} +++ +++int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, +++ void *value, u64 map_flags) +++{ +++ struct bpf_cgroup_storage_map *map = map_to_storage(_map); +++ struct bpf_cgroup_storage_key *key = _key; +++ struct bpf_cgroup_storage *storage; +++ int cpu, off = 0; +++ u32 size; +++ +++ if (map_flags != BPF_ANY && map_flags != BPF_EXIST) +++ return -EINVAL; +++ +++ rcu_read_lock(); +++ storage = cgroup_storage_lookup(map, key, false); +++ if (!storage) { +++ rcu_read_unlock(); +++ return -ENOENT; +++ } +++ +++ /* the user space will provide round_up(value_size, 8) bytes that +++ * will be copied into per-cpu area. bpf programs can only access +++ * value_size of it. 
During lookup the same extra bytes will be +++ * returned or zeros which were zero-filled by percpu_alloc, +++ * so no kernel data leaks possible +++ */ +++ size = round_up(_map->value_size, 8); +++ for_each_possible_cpu(cpu) { +++ bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), +++ value + off, size); +++ off += size; +++ } +++ rcu_read_unlock(); +++ return 0; +++} +++ +++static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, +++ void *_next_key) +++{ +++ struct bpf_cgroup_storage_map *map = map_to_storage(_map); +++ struct bpf_cgroup_storage_key *key = _key; +++ struct bpf_cgroup_storage_key *next = _next_key; +++ struct bpf_cgroup_storage *storage; +++ +++ spin_lock_bh(&map->lock); +++ +++ if (list_empty(&map->list)) +++ goto enoent; +++ +++ if (key) { +++ storage = cgroup_storage_lookup(map, key, true); +++ if (!storage) +++ goto enoent; +++ +++ storage = list_next_entry(storage, list); +++ if (!storage) +++ goto enoent; +++ } else { +++ storage = list_first_entry(&map->list, +++ struct bpf_cgroup_storage, list); +++ } +++ +++ spin_unlock_bh(&map->lock); +++ next->attach_type = storage->key.attach_type; +++ next->cgroup_inode_id = storage->key.cgroup_inode_id; +++ return 0; +++ +++enoent: +++ spin_unlock_bh(&map->lock); +++ return -ENOENT; +++} +++ +++static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) +++{ +++ int numa_node = bpf_map_attr_numa_node(attr); +++ struct bpf_cgroup_storage_map *map; +++ struct bpf_map_memory mem; +++ int ret; +++ +++ if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) +++ return ERR_PTR(-EINVAL); +++ +++ if (attr->value_size == 0) +++ return ERR_PTR(-EINVAL); +++ +++ if (attr->value_size > PAGE_SIZE) +++ return ERR_PTR(-E2BIG); +++ +++ if (attr->map_flags & ~LOCAL_STORAGE_CREATE_FLAG_MASK || +++ !bpf_map_flags_access_ok(attr->map_flags)) +++ return ERR_PTR(-EINVAL); +++ +++ if (attr->max_entries) +++ /* max_entries is not used and enforced to be 0 */ +++ return ERR_PTR(-EINVAL); +++ +++ ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); +++ if (ret < 0) +++ return ERR_PTR(ret); +++ +++ map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), +++ __GFP_ZERO | GFP_USER, numa_node); +++ if (!map) { +++ bpf_map_charge_finish(&mem); +++ return ERR_PTR(-ENOMEM); +++ } +++ +++ bpf_map_charge_move(&map->map.memory, &mem); +++ +++ /* copy mandatory map attributes */ +++ bpf_map_init_from_attr(&map->map, attr); +++ +++ spin_lock_init(&map->lock); +++ map->root = RB_ROOT; +++ INIT_LIST_HEAD(&map->list); +++ +++ return &map->map; +++} +++ +++static void cgroup_storage_map_free(struct bpf_map *_map) +++{ +++ struct bpf_cgroup_storage_map *map = map_to_storage(_map); +++ +++ WARN_ON(!RB_EMPTY_ROOT(&map->root)); +++ WARN_ON(!list_empty(&map->list)); +++ +++ kfree(map); +++} +++ +++static int cgroup_storage_delete_elem(struct bpf_map *map, void *key) +++{ +++ return -EINVAL; +++} +++ +++static int cgroup_storage_check_btf(const struct bpf_map *map, +++ const struct btf *btf, +++ const struct btf_type *key_type, +++ const struct btf_type *value_type) +++{ +++ struct btf_member *m; +++ u32 offset, size; +++ +++ /* Key is expected to be of struct bpf_cgroup_storage_key type, +++ * which is: +++ * struct bpf_cgroup_storage_key { +++ * __u64 cgroup_inode_id; +++ * __u32 attach_type; +++ * }; +++ */ +++ +++ /* +++ * Key_type must be a structure with two fields. 
+++ */ +++ if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || +++ BTF_INFO_VLEN(key_type->info) != 2) +++ return -EINVAL; +++ +++ /* +++ * The first field must be a 64 bit integer at 0 offset. +++ */ +++ m = (struct btf_member *)(key_type + 1); +++ size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, cgroup_inode_id); +++ if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) +++ return -EINVAL; +++ +++ /* +++ * The second field must be a 32 bit integer at 64 bit offset. +++ */ +++ m++; +++ offset = offsetof(struct bpf_cgroup_storage_key, attach_type); +++ size = FIELD_SIZEOF(struct bpf_cgroup_storage_key, attach_type); +++ if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) +++ return -EINVAL; +++ +++ return 0; +++} +++ +++static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, +++ struct seq_file *m) +++{ +++ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); +++ struct bpf_cgroup_storage_key *key = _key; +++ struct bpf_cgroup_storage *storage; +++ int cpu; +++ +++ rcu_read_lock(); +++ storage = cgroup_storage_lookup(map_to_storage(map), key, false); +++ if (!storage) { +++ rcu_read_unlock(); +++ return; +++ } +++ +++ btf_type_seq_show(map->btf, map->btf_key_type_id, key, m); +++ stype = cgroup_storage_type(map); +++ if (stype == BPF_CGROUP_STORAGE_SHARED) { +++ seq_puts(m, ": "); +++ btf_type_seq_show(map->btf, map->btf_value_type_id, +++ &READ_ONCE(storage->buf)->data[0], m); +++ seq_puts(m, "\n"); +++ } else { +++ seq_puts(m, ": {\n"); +++ for_each_possible_cpu(cpu) { +++ seq_printf(m, "\tcpu%d: ", cpu); +++ btf_type_seq_show(map->btf, map->btf_value_type_id, +++ per_cpu_ptr(storage->percpu_buf, cpu), +++ m); +++ seq_puts(m, "\n"); +++ } +++ seq_puts(m, "}\n"); +++ } +++ rcu_read_unlock(); +++} +++ +++const struct bpf_map_ops cgroup_storage_map_ops = { +++ .map_alloc = cgroup_storage_map_alloc, +++ .map_free = cgroup_storage_map_free, +++ .map_get_next_key = cgroup_storage_get_next_key, +++ .map_lookup_elem = cgroup_storage_lookup_elem, +++ .map_update_elem = cgroup_storage_update_elem, +++ .map_delete_elem = cgroup_storage_delete_elem, +++ .map_check_btf = cgroup_storage_check_btf, +++ .map_seq_show_elem = cgroup_storage_seq_show_elem, +++}; +++ +++int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *_map) +++{ +++ enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); +++ struct bpf_cgroup_storage_map *map = map_to_storage(_map); +++ int ret = -EBUSY; +++ +++ spin_lock_bh(&map->lock); +++ +++ if (map->prog && map->prog != prog) +++ goto unlock; +++ if (prog->aux->cgroup_storage[stype] && +++ prog->aux->cgroup_storage[stype] != _map) +++ goto unlock; +++ +++ map->prog = prog; +++ prog->aux->cgroup_storage[stype] = _map; +++ ret = 0; +++unlock: +++ spin_unlock_bh(&map->lock); +++ +++ return ret; +++} +++ +++void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) +++{ +++ enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); +++ struct bpf_cgroup_storage_map *map = map_to_storage(_map); +++ +++ spin_lock_bh(&map->lock); +++ if (map->prog == prog) { +++ WARN_ON(prog->aux->cgroup_storage[stype] != _map); +++ map->prog = NULL; +++ prog->aux->cgroup_storage[stype] = NULL; +++ } +++ spin_unlock_bh(&map->lock); +++} +++ +++static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) +++{ +++ size_t size; +++ +++ if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { +++ size = sizeof(struct bpf_storage_buffer) + map->value_size; +++ *pages = round_up(sizeof(struct 
bpf_cgroup_storage) + size, +++ PAGE_SIZE) >> PAGE_SHIFT; +++ } else { +++ size = map->value_size; +++ *pages = round_up(round_up(size, 8) * num_possible_cpus(), +++ PAGE_SIZE) >> PAGE_SHIFT; +++ } +++ +++ return size; +++} +++ +++struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, +++ enum bpf_cgroup_storage_type stype) +++{ +++ struct bpf_cgroup_storage *storage; +++ struct bpf_map *map; +++ gfp_t flags; +++ size_t size; +++ u32 pages; +++ +++ map = prog->aux->cgroup_storage[stype]; +++ if (!map) +++ return NULL; +++ +++ size = bpf_cgroup_storage_calculate_size(map, &pages); +++ +++ if (bpf_map_charge_memlock(map, pages)) +++ return ERR_PTR(-EPERM); +++ +++ storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), +++ __GFP_ZERO | GFP_USER, map->numa_node); +++ if (!storage) +++ goto enomem; +++ +++ flags = __GFP_ZERO | GFP_USER; +++ +++ if (stype == BPF_CGROUP_STORAGE_SHARED) { +++ storage->buf = kmalloc_node(size, flags, map->numa_node); +++ if (!storage->buf) +++ goto enomem; +++ check_and_init_map_lock(map, storage->buf->data); +++ } else { +++ storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); +++ if (!storage->percpu_buf) +++ goto enomem; +++ } +++ +++ storage->map = (struct bpf_cgroup_storage_map *)map; +++ +++ return storage; +++ +++enomem: +++ bpf_map_uncharge_memlock(map, pages); +++ kfree(storage); +++ return ERR_PTR(-ENOMEM); +++} +++ +++static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) +++{ +++ struct bpf_cgroup_storage *storage = +++ container_of(rcu, struct bpf_cgroup_storage, rcu); +++ +++ kfree(storage->buf); +++ kfree(storage); +++} +++ +++static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) +++{ +++ struct bpf_cgroup_storage *storage = +++ container_of(rcu, struct bpf_cgroup_storage, rcu); +++ +++ free_percpu(storage->percpu_buf); +++ kfree(storage); +++} +++ +++void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) +++{ +++ enum bpf_cgroup_storage_type stype; +++ struct bpf_map *map; +++ u32 pages; +++ +++ if (!storage) +++ return; +++ +++ map = &storage->map->map; +++ +++ bpf_cgroup_storage_calculate_size(map, &pages); +++ bpf_map_uncharge_memlock(map, pages); +++ +++ stype = cgroup_storage_type(map); +++ if (stype == BPF_CGROUP_STORAGE_SHARED) +++ call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); +++ else +++ call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); +++} +++ +++void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, +++ struct cgroup *cgroup, +++ enum bpf_attach_type type) +++{ +++ struct bpf_cgroup_storage_map *map; +++ +++ if (!storage) +++ return; +++ +++ storage->key.attach_type = type; +++ storage->key.cgroup_inode_id = cgroup->kn->id.id; +++ +++ map = storage->map; +++ +++ spin_lock_bh(&map->lock); +++ WARN_ON(cgroup_storage_insert(map, storage)); +++ list_add(&storage->list, &map->list); +++ spin_unlock_bh(&map->lock); +++} +++ +++void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) +++{ +++ struct bpf_cgroup_storage_map *map; +++ struct rb_root *root; +++ +++ if (!storage) +++ return; +++ +++ map = storage->map; +++ +++ spin_lock_bh(&map->lock); +++ root = &map->root; +++ rb_erase(&storage->node, root); +++ +++ list_del(&storage->list); +++ spin_unlock_bh(&map->lock); +++} +++ +++#endif ++--- /dev/null +++++ b/kernel/bpf/lpm_trie.c ++@@ -0,0 +1,746 @@ +++// SPDX-License-Identifier: GPL-2.0-only +++/* +++ * Longest prefix match list implementation +++ * +++ * Copyright (c) 2016,2017 Daniel Mack +++ * Copyright (c) 2016 David Herrmann +++ */ +++ 
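(Editor's sketch, not part of the patch.) For context on how the cgroup storage map implemented above is consumed: a BPF program declares a single BPF_MAP_TYPE_CGROUP_STORAGE map (max_entries stays 0, as cgroup_storage_map_alloc() enforces) and reaches its per-cgroup slot through the bpf_get_local_storage() helper. The snippet below is a minimal libbpf-style example; the map and program names are illustrative.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
	__type(key, struct bpf_cgroup_storage_key);
	__type(value, __u64);
} egress_cnt SEC(".maps");

SEC("cgroup_skb/egress")
int count_egress(struct __sk_buff *skb)
{
	__u64 *cnt = bpf_get_local_storage(&egress_cnt, 0);

	__sync_fetch_and_add(cnt, 1);
	return 1;	/* allow the packet */
}

char _license[] SEC("license") = "GPL";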
+++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++ +++/* Intermediate node */ +++#define LPM_TREE_NODE_FLAG_IM BIT(0) +++ +++struct lpm_trie_node; +++ +++struct lpm_trie_node { +++ struct rcu_head rcu; +++ struct lpm_trie_node __rcu *child[2]; +++ u32 prefixlen; +++ u32 flags; +++ u8 data[0]; +++}; +++ +++struct lpm_trie { +++ struct bpf_map map; +++ struct lpm_trie_node __rcu *root; +++ size_t n_entries; +++ size_t max_prefixlen; +++ size_t data_size; +++ raw_spinlock_t lock; +++}; +++ +++/* This trie implements a longest prefix match algorithm that can be used to +++ * match IP addresses to a stored set of ranges. +++ * +++ * Data stored in @data of struct bpf_lpm_key and struct lpm_trie_node is +++ * interpreted as big endian, so data[0] stores the most significant byte. +++ * +++ * Match ranges are internally stored in instances of struct lpm_trie_node +++ * which each contain their prefix length as well as two pointers that may +++ * lead to more nodes containing more specific matches. Each node also stores +++ * a value that is defined by and returned to userspace via the update_elem +++ * and lookup functions. +++ * +++ * For instance, let's start with a trie that was created with a prefix length +++ * of 32, so it can be used for IPv4 addresses, and one single element that +++ * matches 192.168.0.0/16. The data array would hence contain +++ * [0xc0, 0xa8, 0x00, 0x00] in big-endian notation. This documentation will +++ * stick to IP-address notation for readability though. +++ * +++ * As the trie is empty initially, the new node (1) will be places as root +++ * node, denoted as (R) in the example below. As there are no other node, both +++ * child pointers are %NULL. +++ * +++ * +----------------+ +++ * | (1) (R) | +++ * | 192.168.0.0/16 | +++ * | value: 1 | +++ * | [0] [1] | +++ * +----------------+ +++ * +++ * Next, let's add a new node (2) matching 192.168.0.0/24. As there is already +++ * a node with the same data and a smaller prefix (ie, a less specific one), +++ * node (2) will become a child of (1). In child index depends on the next bit +++ * that is outside of what (1) matches, and that bit is 0, so (2) will be +++ * child[0] of (1): +++ * +++ * +----------------+ +++ * | (1) (R) | +++ * | 192.168.0.0/16 | +++ * | value: 1 | +++ * | [0] [1] | +++ * +----------------+ +++ * | +++ * +----------------+ +++ * | (2) | +++ * | 192.168.0.0/24 | +++ * | value: 2 | +++ * | [0] [1] | +++ * +----------------+ +++ * +++ * The child[1] slot of (1) could be filled with another node which has bit #17 +++ * (the next bit after the ones that (1) matches on) set to 1. For instance, +++ * 192.168.128.0/24: +++ * +++ * +----------------+ +++ * | (1) (R) | +++ * | 192.168.0.0/16 | +++ * | value: 1 | +++ * | [0] [1] | +++ * +----------------+ +++ * | | +++ * +----------------+ +------------------+ +++ * | (2) | | (3) | +++ * | 192.168.0.0/24 | | 192.168.128.0/24 | +++ * | value: 2 | | value: 3 | +++ * | [0] [1] | | [0] [1] | +++ * +----------------+ +------------------+ +++ * +++ * Let's add another node (4) to the game for 192.168.1.0/24. In order to place +++ * it, node (1) is looked at first, and because (4) of the semantics laid out +++ * above (bit #17 is 0), it would normally be attached to (1) as child[0]. +++ * However, that slot is already allocated, so a new node is needed in between. +++ * That node does not have a value attached to it and it will never be +++ * returned to users as result of a lookup. 
It is only there to differentiate +++ * the traversal further. It will get a prefix as wide as necessary to +++ * distinguish its two children: +++ * +++ * +----------------+ +++ * | (1) (R) | +++ * | 192.168.0.0/16 | +++ * | value: 1 | +++ * | [0] [1] | +++ * +----------------+ +++ * | | +++ * +----------------+ +------------------+ +++ * | (4) (I) | | (3) | +++ * | 192.168.0.0/23 | | 192.168.128.0/24 | +++ * | value: --- | | value: 3 | +++ * | [0] [1] | | [0] [1] | +++ * +----------------+ +------------------+ +++ * | | +++ * +----------------+ +----------------+ +++ * | (2) | | (5) | +++ * | 192.168.0.0/24 | | 192.168.1.0/24 | +++ * | value: 2 | | value: 5 | +++ * | [0] [1] | | [0] [1] | +++ * +----------------+ +----------------+ +++ * +++ * 192.168.1.1/32 would be a child of (5) etc. +++ * +++ * An intermediate node will be turned into a 'real' node on demand. In the +++ * example above, (4) would be re-used if 192.168.0.0/23 is added to the trie. +++ * +++ * A fully populated trie would have a height of 32 nodes, as the trie was +++ * created with a prefix length of 32. +++ * +++ * The lookup starts at the root node. If the current node matches and if there +++ * is a child that can be used to become more specific, the trie is traversed +++ * downwards. The last node in the traversal that is a non-intermediate one is +++ * returned. +++ */ +++ +++static inline int extract_bit(const u8 *data, size_t index) +++{ +++ return !!(data[index / 8] & (1 << (7 - (index % 8)))); +++} +++ +++/** +++ * longest_prefix_match() - determine the longest prefix +++ * @trie: The trie to get internal sizes from +++ * @node: The node to operate on +++ * @key: The key to compare to @node +++ * +++ * Determine the longest prefix of @node that matches the bits in @key. +++ */ +++static size_t longest_prefix_match(const struct lpm_trie *trie, +++ const struct lpm_trie_node *node, +++ const struct bpf_lpm_trie_key *key) +++{ +++ u32 limit = min(node->prefixlen, key->prefixlen); +++ u32 prefixlen = 0, i = 0; +++ +++ BUILD_BUG_ON(offsetof(struct lpm_trie_node, data) % sizeof(u32)); +++ BUILD_BUG_ON(offsetof(struct bpf_lpm_trie_key, data) % sizeof(u32)); +++ +++#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && defined(CONFIG_64BIT) +++ +++ /* data_size >= 16 has very small probability. +++ * We do not use a loop for optimal code generation. 
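(Editor's sketch, not from the patch.) From the syscall side, the trie semantics documented above look roughly like this, assuming libbpf's legacy bpf_create_map() wrapper and root privileges (trie_alloc() requires CAP_SYS_ADMIN and BPF_F_NO_PREALLOC):

#include <arpa/inet.h>
#include <stdio.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

struct ipv4_lpm_key {
	__u32 prefixlen;		/* struct bpf_lpm_trie_key header */
	__u8 data[4];			/* IPv4 address, big endian */
};

int main(void)
{
	struct ipv4_lpm_key key = { .prefixlen = 16 };
	__u64 value = 1;
	int fd;

	fd = bpf_create_map(BPF_MAP_TYPE_LPM_TRIE, sizeof(key), sizeof(value),
			    255, BPF_F_NO_PREALLOC);
	if (fd < 0)
		return 1;

	/* Insert 192.168.0.0/16, the root node (1) of the example above. */
	inet_pton(AF_INET, "192.168.0.0", key.data);
	bpf_map_update_elem(fd, &key, &value, BPF_ANY);

	/* A /32 lookup key returns the longest matching stored prefix. */
	key.prefixlen = 32;
	inet_pton(AF_INET, "192.168.1.1", key.data);
	if (!bpf_map_lookup_elem(fd, &key, &value))
		printf("matched, value=%llu\n", (unsigned long long)value);

	return 0;
}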
+++ */ +++ if (trie->data_size >= 8) { +++ u64 diff = be64_to_cpu(*(__be64 *)node->data ^ +++ *(__be64 *)key->data); +++ +++ prefixlen = 64 - fls64(diff); +++ if (prefixlen >= limit) +++ return limit; +++ if (diff) +++ return prefixlen; +++ i = 8; +++ } +++#endif +++ +++ while (trie->data_size >= i + 4) { +++ u32 diff = be32_to_cpu(*(__be32 *)&node->data[i] ^ +++ *(__be32 *)&key->data[i]); +++ +++ prefixlen += 32 - fls(diff); +++ if (prefixlen >= limit) +++ return limit; +++ if (diff) +++ return prefixlen; +++ i += 4; +++ } +++ +++ if (trie->data_size >= i + 2) { +++ u16 diff = be16_to_cpu(*(__be16 *)&node->data[i] ^ +++ *(__be16 *)&key->data[i]); +++ +++ prefixlen += 16 - fls(diff); +++ if (prefixlen >= limit) +++ return limit; +++ if (diff) +++ return prefixlen; +++ i += 2; +++ } +++ +++ if (trie->data_size >= i + 1) { +++ prefixlen += 8 - fls(node->data[i] ^ key->data[i]); +++ +++ if (prefixlen >= limit) +++ return limit; +++ } +++ +++ return prefixlen; +++} +++ +++/* Called from syscall or from eBPF program */ +++static void *trie_lookup_elem(struct bpf_map *map, void *_key) +++{ +++ struct lpm_trie *trie = container_of(map, struct lpm_trie, map); +++ struct lpm_trie_node *node, *found = NULL; +++ struct bpf_lpm_trie_key *key = _key; +++ +++ /* Start walking the trie from the root node ... */ +++ +++ for (node = rcu_dereference(trie->root); node;) { +++ unsigned int next_bit; +++ size_t matchlen; +++ +++ /* Determine the longest prefix of @node that matches @key. +++ * If it's the maximum possible prefix for this trie, we have +++ * an exact match and can return it directly. +++ */ +++ matchlen = longest_prefix_match(trie, node, key); +++ if (matchlen == trie->max_prefixlen) { +++ found = node; +++ break; +++ } +++ +++ /* If the number of bits that match is smaller than the prefix +++ * length of @node, bail out and return the node we have seen +++ * last in the traversal (ie, the parent). +++ */ +++ if (matchlen < node->prefixlen) +++ break; +++ +++ /* Consider this node as return candidate unless it is an +++ * artificially added intermediate one. +++ */ +++ if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) +++ found = node; +++ +++ /* If the node match is fully satisfied, let's see if we can +++ * become more specific. Determine the next bit in the key and +++ * traverse down. 
+++ */ +++ next_bit = extract_bit(key->data, node->prefixlen); +++ node = rcu_dereference(node->child[next_bit]); +++ } +++ +++ if (!found) +++ return NULL; +++ +++ return found->data + trie->data_size; +++} +++ +++static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, +++ const void *value) +++{ +++ struct lpm_trie_node *node; +++ size_t size = sizeof(struct lpm_trie_node) + trie->data_size; +++ +++ if (value) +++ size += trie->map.value_size; +++ +++ node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN, +++ trie->map.numa_node); +++ if (!node) +++ return NULL; +++ +++ node->flags = 0; +++ +++ if (value) +++ memcpy(node->data + trie->data_size, value, +++ trie->map.value_size); +++ +++ return node; +++} +++ +++/* Called from syscall or from eBPF program */ +++static int trie_update_elem(struct bpf_map *map, +++ void *_key, void *value, u64 flags) +++{ +++ struct lpm_trie *trie = container_of(map, struct lpm_trie, map); +++ struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; +++ struct lpm_trie_node __rcu **slot; +++ struct bpf_lpm_trie_key *key = _key; +++ unsigned long irq_flags; +++ unsigned int next_bit; +++ size_t matchlen = 0; +++ int ret = 0; +++ +++ if (unlikely(flags > BPF_EXIST)) +++ return -EINVAL; +++ +++ if (key->prefixlen > trie->max_prefixlen) +++ return -EINVAL; +++ +++ raw_spin_lock_irqsave(&trie->lock, irq_flags); +++ +++ /* Allocate and fill a new node */ +++ +++ if (trie->n_entries == trie->map.max_entries) { +++ ret = -ENOSPC; +++ goto out; +++ } +++ +++ new_node = lpm_trie_node_alloc(trie, value); +++ if (!new_node) { +++ ret = -ENOMEM; +++ goto out; +++ } +++ +++ trie->n_entries++; +++ +++ new_node->prefixlen = key->prefixlen; +++ RCU_INIT_POINTER(new_node->child[0], NULL); +++ RCU_INIT_POINTER(new_node->child[1], NULL); +++ memcpy(new_node->data, key->data, trie->data_size); +++ +++ /* Now find a slot to attach the new node. To do that, walk the tree +++ * from the root and match as many bits as possible for each node until +++ * we either find an empty slot or a slot that needs to be replaced by +++ * an intermediate node. +++ */ +++ slot = &trie->root; +++ +++ while ((node = rcu_dereference_protected(*slot, +++ lockdep_is_held(&trie->lock)))) { +++ matchlen = longest_prefix_match(trie, node, key); +++ +++ if (node->prefixlen != matchlen || +++ node->prefixlen == key->prefixlen || +++ node->prefixlen == trie->max_prefixlen) +++ break; +++ +++ next_bit = extract_bit(key->data, node->prefixlen); +++ slot = &node->child[next_bit]; +++ } +++ +++ /* If the slot is empty (a free child pointer or an empty root), +++ * simply assign the @new_node to that slot and be done. +++ */ +++ if (!node) { +++ rcu_assign_pointer(*slot, new_node); +++ goto out; +++ } +++ +++ /* If the slot we picked already exists, replace it with @new_node +++ * which already has the correct data array set. +++ */ +++ if (node->prefixlen == matchlen) { +++ new_node->child[0] = node->child[0]; +++ new_node->child[1] = node->child[1]; +++ +++ if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) +++ trie->n_entries--; +++ +++ rcu_assign_pointer(*slot, new_node); +++ kfree_rcu(node, rcu); +++ +++ goto out; +++ } +++ +++ /* If the new node matches the prefix completely, it must be inserted +++ * as an ancestor. Simply insert it between @node and *@slot. 
+++ */ +++ if (matchlen == key->prefixlen) { +++ next_bit = extract_bit(node->data, matchlen); +++ rcu_assign_pointer(new_node->child[next_bit], node); +++ rcu_assign_pointer(*slot, new_node); +++ goto out; +++ } +++ +++ im_node = lpm_trie_node_alloc(trie, NULL); +++ if (!im_node) { +++ ret = -ENOMEM; +++ goto out; +++ } +++ +++ im_node->prefixlen = matchlen; +++ im_node->flags |= LPM_TREE_NODE_FLAG_IM; +++ memcpy(im_node->data, node->data, trie->data_size); +++ +++ /* Now determine which child to install in which slot */ +++ if (extract_bit(key->data, matchlen)) { +++ rcu_assign_pointer(im_node->child[0], node); +++ rcu_assign_pointer(im_node->child[1], new_node); +++ } else { +++ rcu_assign_pointer(im_node->child[0], new_node); +++ rcu_assign_pointer(im_node->child[1], node); +++ } +++ +++ /* Finally, assign the intermediate node to the determined spot */ +++ rcu_assign_pointer(*slot, im_node); +++ +++out: +++ if (ret) { +++ if (new_node) +++ trie->n_entries--; +++ +++ kfree(new_node); +++ kfree(im_node); +++ } +++ +++ raw_spin_unlock_irqrestore(&trie->lock, irq_flags); +++ +++ return ret; +++} +++ +++/* Called from syscall or from eBPF program */ +++static int trie_delete_elem(struct bpf_map *map, void *_key) +++{ +++ struct lpm_trie *trie = container_of(map, struct lpm_trie, map); +++ struct bpf_lpm_trie_key *key = _key; +++ struct lpm_trie_node __rcu **trim, **trim2; +++ struct lpm_trie_node *node, *parent; +++ unsigned long irq_flags; +++ unsigned int next_bit; +++ size_t matchlen = 0; +++ int ret = 0; +++ +++ if (key->prefixlen > trie->max_prefixlen) +++ return -EINVAL; +++ +++ raw_spin_lock_irqsave(&trie->lock, irq_flags); +++ +++ /* Walk the tree looking for an exact key/length match and keeping +++ * track of the path we traverse. We will need to know the node +++ * we wish to delete, and the slot that points to the node we want +++ * to delete. We may also need to know the nodes parent and the +++ * slot that contains it. +++ */ +++ trim = &trie->root; +++ trim2 = trim; +++ parent = NULL; +++ while ((node = rcu_dereference_protected( +++ *trim, lockdep_is_held(&trie->lock)))) { +++ matchlen = longest_prefix_match(trie, node, key); +++ +++ if (node->prefixlen != matchlen || +++ node->prefixlen == key->prefixlen) +++ break; +++ +++ parent = node; +++ trim2 = trim; +++ next_bit = extract_bit(key->data, node->prefixlen); +++ trim = &node->child[next_bit]; +++ } +++ +++ if (!node || node->prefixlen != key->prefixlen || +++ node->prefixlen != matchlen || +++ (node->flags & LPM_TREE_NODE_FLAG_IM)) { +++ ret = -ENOENT; +++ goto out; +++ } +++ +++ trie->n_entries--; +++ +++ /* If the node we are removing has two children, simply mark it +++ * as intermediate and we are done. +++ */ +++ if (rcu_access_pointer(node->child[0]) && +++ rcu_access_pointer(node->child[1])) { +++ node->flags |= LPM_TREE_NODE_FLAG_IM; +++ goto out; +++ } +++ +++ /* If the parent of the node we are about to delete is an intermediate +++ * node, and the deleted node doesn't have any children, we can delete +++ * the intermediate parent as well and promote its other child +++ * up the tree. Doing this maintains the invariant that all +++ * intermediate nodes have exactly 2 children and that there are no +++ * unnecessary intermediate nodes in the tree. 
+++ */ +++ if (parent && (parent->flags & LPM_TREE_NODE_FLAG_IM) && +++ !node->child[0] && !node->child[1]) { +++ if (node == rcu_access_pointer(parent->child[0])) +++ rcu_assign_pointer( +++ *trim2, rcu_access_pointer(parent->child[1])); +++ else +++ rcu_assign_pointer( +++ *trim2, rcu_access_pointer(parent->child[0])); +++ kfree_rcu(parent, rcu); +++ kfree_rcu(node, rcu); +++ goto out; +++ } +++ +++ /* The node we are removing has either zero or one child. If there +++ * is a child, move it into the removed node's slot then delete +++ * the node. Otherwise just clear the slot and delete the node. +++ */ +++ if (node->child[0]) +++ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[0])); +++ else if (node->child[1]) +++ rcu_assign_pointer(*trim, rcu_access_pointer(node->child[1])); +++ else +++ RCU_INIT_POINTER(*trim, NULL); +++ kfree_rcu(node, rcu); +++ +++out: +++ raw_spin_unlock_irqrestore(&trie->lock, irq_flags); +++ +++ return ret; +++} +++ +++#define LPM_DATA_SIZE_MAX 256 +++#define LPM_DATA_SIZE_MIN 1 +++ +++#define LPM_VAL_SIZE_MAX (KMALLOC_MAX_SIZE - LPM_DATA_SIZE_MAX - \ +++ sizeof(struct lpm_trie_node)) +++#define LPM_VAL_SIZE_MIN 1 +++ +++#define LPM_KEY_SIZE(X) (sizeof(struct bpf_lpm_trie_key) + (X)) +++#define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) +++#define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) +++ +++#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE | \ +++ BPF_F_ACCESS_MASK) +++ +++static struct bpf_map *trie_alloc(union bpf_attr *attr) +++{ +++ struct lpm_trie *trie; +++ u64 cost = sizeof(*trie), cost_per_node; +++ int ret; +++ +++ if (!capable(CAP_SYS_ADMIN)) +++ return ERR_PTR(-EPERM); +++ +++ /* check sanity of attributes */ +++ if (attr->max_entries == 0 || +++ !(attr->map_flags & BPF_F_NO_PREALLOC) || +++ attr->map_flags & ~LPM_CREATE_FLAG_MASK || +++ !bpf_map_flags_access_ok(attr->map_flags) || +++ attr->key_size < LPM_KEY_SIZE_MIN || +++ attr->key_size > LPM_KEY_SIZE_MAX || +++ attr->value_size < LPM_VAL_SIZE_MIN || +++ attr->value_size > LPM_VAL_SIZE_MAX) +++ return ERR_PTR(-EINVAL); +++ +++ trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); +++ if (!trie) +++ return ERR_PTR(-ENOMEM); +++ +++ /* copy mandatory map attributes */ +++ bpf_map_init_from_attr(&trie->map, attr); +++ trie->data_size = attr->key_size - +++ offsetof(struct bpf_lpm_trie_key, data); +++ trie->max_prefixlen = trie->data_size * 8; +++ +++ cost_per_node = sizeof(struct lpm_trie_node) + +++ attr->value_size + trie->data_size; +++ cost += (u64) attr->max_entries * cost_per_node; +++ +++ ret = bpf_map_charge_init(&trie->map.memory, cost); +++ if (ret) +++ goto out_err; +++ +++ raw_spin_lock_init(&trie->lock); +++ +++ return &trie->map; +++out_err: +++ kfree(trie); +++ return ERR_PTR(ret); +++} +++ +++static void trie_free(struct bpf_map *map) +++{ +++ struct lpm_trie *trie = container_of(map, struct lpm_trie, map); +++ struct lpm_trie_node __rcu **slot; +++ struct lpm_trie_node *node; +++ +++ /* Wait for outstanding programs to complete +++ * update/lookup/delete/get_next_key and free the trie. +++ */ +++ synchronize_rcu(); +++ +++ /* Always start at the root and walk down to a node that has no +++ * children. Then free that node, nullify its reference in the parent +++ * and start over. 
+++ */ +++ +++ for (;;) { +++ slot = &trie->root; +++ +++ for (;;) { +++ node = rcu_dereference_protected(*slot, 1); +++ if (!node) +++ goto out; +++ +++ if (rcu_access_pointer(node->child[0])) { +++ slot = &node->child[0]; +++ continue; +++ } +++ +++ if (rcu_access_pointer(node->child[1])) { +++ slot = &node->child[1]; +++ continue; +++ } +++ +++ kfree(node); +++ RCU_INIT_POINTER(*slot, NULL); +++ break; +++ } +++ } +++ +++out: +++ kfree(trie); +++} +++ +++static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) +++{ +++ struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; +++ struct lpm_trie *trie = container_of(map, struct lpm_trie, map); +++ struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; +++ struct lpm_trie_node **node_stack = NULL; +++ int err = 0, stack_ptr = -1; +++ unsigned int next_bit; +++ size_t matchlen; +++ +++ /* The get_next_key follows postorder. For the 4 node example in +++ * the top of this file, the trie_get_next_key() returns the following +++ * one after another: +++ * 192.168.0.0/24 +++ * 192.168.1.0/24 +++ * 192.168.128.0/24 +++ * 192.168.0.0/16 +++ * +++ * The idea is to return more specific keys before less specific ones. +++ */ +++ +++ /* Empty trie */ +++ search_root = rcu_dereference(trie->root); +++ if (!search_root) +++ return -ENOENT; +++ +++ /* For invalid key, find the leftmost node in the trie */ +++ if (!key || key->prefixlen > trie->max_prefixlen) +++ goto find_leftmost; +++ +++ node_stack = kmalloc_array(trie->max_prefixlen, +++ sizeof(struct lpm_trie_node *), +++ GFP_ATOMIC | __GFP_NOWARN); +++ if (!node_stack) +++ return -ENOMEM; +++ +++ /* Try to find the exact node for the given key */ +++ for (node = search_root; node;) { +++ node_stack[++stack_ptr] = node; +++ matchlen = longest_prefix_match(trie, node, key); +++ if (node->prefixlen != matchlen || +++ node->prefixlen == key->prefixlen) +++ break; +++ +++ next_bit = extract_bit(key->data, node->prefixlen); +++ node = rcu_dereference(node->child[next_bit]); +++ } +++ if (!node || node->prefixlen != key->prefixlen || +++ (node->flags & LPM_TREE_NODE_FLAG_IM)) +++ goto find_leftmost; +++ +++ /* The node with the exactly-matching key has been found, +++ * find the first node in postorder after the matched node. +++ */ +++ node = node_stack[stack_ptr]; +++ while (stack_ptr > 0) { +++ parent = node_stack[stack_ptr - 1]; +++ if (rcu_dereference(parent->child[0]) == node) { +++ search_root = rcu_dereference(parent->child[1]); +++ if (search_root) +++ goto find_leftmost; +++ } +++ if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { +++ next_node = parent; +++ goto do_copy; +++ } +++ +++ node = parent; +++ stack_ptr--; +++ } +++ +++ /* did not find anything */ +++ err = -ENOENT; +++ goto free_stack; +++ +++find_leftmost: +++ /* Find the leftmost non-intermediate node, all intermediate nodes +++ * have exact two children, so this function will never return NULL. 
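(Editor's sketch, not from the patch.) The postorder enumeration described above is what a userspace walk over the map observes: iterating with bpf_map_get_next_key() starting from a NULL key yields more specific prefixes before less specific ones. A minimal loop, reusing the hypothetical ipv4_lpm_key layout from the earlier sketch:

#include <stdio.h>
#include <linux/types.h>
#include <bpf/bpf.h>

struct ipv4_lpm_key {
	__u32 prefixlen;
	__u8 data[4];
};

void dump_trie(int fd)
{
	struct ipv4_lpm_key key, next;
	void *prev = NULL;

	/* For the 4-node example above this prints 192.168.0.0/24,
	 * 192.168.1.0/24, 192.168.128.0/24 and finally 192.168.0.0/16.
	 */
	while (!bpf_map_get_next_key(fd, prev, &next)) {
		printf("%d.%d.%d.%d/%u\n", next.data[0], next.data[1],
		       next.data[2], next.data[3], next.prefixlen);
		key = next;
		prev = &key;
	}
}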
+++ */ +++ for (node = search_root; node;) { +++ if (node->flags & LPM_TREE_NODE_FLAG_IM) { +++ node = rcu_dereference(node->child[0]); +++ } else { +++ next_node = node; +++ node = rcu_dereference(node->child[0]); +++ if (!node) +++ node = rcu_dereference(next_node->child[1]); +++ } +++ } +++do_copy: +++ next_key->prefixlen = next_node->prefixlen; +++ memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), +++ next_node->data, trie->data_size); +++free_stack: +++ kfree(node_stack); +++ return err; +++} +++ +++static int trie_check_btf(const struct bpf_map *map, +++ const struct btf *btf, +++ const struct btf_type *key_type, +++ const struct btf_type *value_type) +++{ +++ /* Keys must have struct bpf_lpm_trie_key embedded. */ +++ return BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT ? +++ -EINVAL : 0; +++} +++ +++const struct bpf_map_ops trie_map_ops = { +++ .map_alloc = trie_alloc, +++ .map_free = trie_free, +++ .map_get_next_key = trie_get_next_key, +++ .map_lookup_elem = trie_lookup_elem, +++ .map_update_elem = trie_update_elem, +++ .map_delete_elem = trie_delete_elem, +++ .map_check_btf = trie_check_btf, +++}; ++--- a/kernel/bpf/Makefile +++++ b/kernel/bpf/Makefile ++@@ -1,4 +1,23 @@ +++# SPDX-License-Identifier: GPL-2.0 ++ obj-y := core.o +++ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y) +++# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details +++cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse +++endif +++CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) ++ ++-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o ++-obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o +++obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o +++obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +++obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o +++obj-$(CONFIG_BPF_SYSCALL) += disasm.o +++obj-$(CONFIG_BPF_SYSCALL) += btf.o +++ifeq ($(CONFIG_NET),y) +++obj-$(CONFIG_BPF_SYSCALL) += devmap.o +++endif +++ifeq ($(CONFIG_PERF_EVENTS),y) +++obj-$(CONFIG_BPF_SYSCALL) += stackmap.o +++endif +++obj-$(CONFIG_CGROUP_BPF) += cgroup.o +++ifeq ($(CONFIG_SYSFS),y) +++obj-$(CONFIG_DEBUG_INFO_BTF) += sysfs_btf.o +++endif ++--- /dev/null +++++ b/kernel/bpf/map_in_map.c ++@@ -0,0 +1,120 @@ +++// SPDX-License-Identifier: GPL-2.0-only +++/* Copyright (c) 2017 Facebook +++ */ +++#include +++#include +++ +++#include "map_in_map.h" +++ +++struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) +++{ +++ struct bpf_map *inner_map, *inner_map_meta; +++ u32 inner_map_meta_size; +++ struct fd f; +++ +++ f = fdget(inner_map_ufd); +++ inner_map = __bpf_map_get(f); +++ if (IS_ERR(inner_map)) +++ return inner_map; +++ +++ /* prog_array->owner_prog_type and owner_jited +++ * is a runtime binding. Doing static check alone +++ * in the verifier is not enough. +++ */ +++ if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY || +++ inner_map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || +++ inner_map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { +++ fdput(f); +++ return ERR_PTR(-ENOTSUPP); +++ } +++ +++ /* Does not support >1 level map-in-map */ +++ if (inner_map->inner_map_meta) { +++ fdput(f); +++ return ERR_PTR(-EINVAL); +++ } +++ +++ if (map_value_has_spin_lock(inner_map)) { +++ fdput(f); +++ return ERR_PTR(-ENOTSUPP); +++ } +++ +++ inner_map_meta_size = sizeof(*inner_map_meta); +++ /* In some cases verifier needs to access beyond just base map. 
*/ +++ if (inner_map->ops == &array_map_ops) +++ inner_map_meta_size = sizeof(struct bpf_array); +++ +++ inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); +++ if (!inner_map_meta) { +++ fdput(f); +++ return ERR_PTR(-ENOMEM); +++ } +++ +++ inner_map_meta->map_type = inner_map->map_type; +++ inner_map_meta->key_size = inner_map->key_size; +++ inner_map_meta->value_size = inner_map->value_size; +++ inner_map_meta->map_flags = inner_map->map_flags; +++ inner_map_meta->max_entries = inner_map->max_entries; +++ inner_map_meta->spin_lock_off = inner_map->spin_lock_off; +++ +++ /* Misc members not needed in bpf_map_meta_equal() check. */ +++ inner_map_meta->ops = inner_map->ops; +++ if (inner_map->ops == &array_map_ops) { +++ inner_map_meta->unpriv_array = inner_map->unpriv_array; +++ container_of(inner_map_meta, struct bpf_array, map)->index_mask = +++ container_of(inner_map, struct bpf_array, map)->index_mask; +++ } +++ +++ fdput(f); +++ return inner_map_meta; +++} +++ +++void bpf_map_meta_free(struct bpf_map *map_meta) +++{ +++ kfree(map_meta); +++} +++ +++bool bpf_map_meta_equal(const struct bpf_map *meta0, +++ const struct bpf_map *meta1) +++{ +++ /* No need to compare ops because it is covered by map_type */ +++ return meta0->map_type == meta1->map_type && +++ meta0->key_size == meta1->key_size && +++ meta0->value_size == meta1->value_size && +++ meta0->map_flags == meta1->map_flags && +++ meta0->max_entries == meta1->max_entries; +++} +++ +++void *bpf_map_fd_get_ptr(struct bpf_map *map, +++ struct file *map_file /* not used */, +++ int ufd) +++{ +++ struct bpf_map *inner_map; +++ struct fd f; +++ +++ f = fdget(ufd); +++ inner_map = __bpf_map_get(f); +++ if (IS_ERR(inner_map)) +++ return inner_map; +++ +++ if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) +++ inner_map = bpf_map_inc(inner_map, false); +++ else +++ inner_map = ERR_PTR(-EINVAL); +++ +++ fdput(f); +++ return inner_map; +++} +++ +++void bpf_map_fd_put_ptr(void *ptr) +++{ +++ /* ptr->ops->map_free() has to go through one +++ * rcu grace period by itself. 
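(Editor's sketch, not from the patch.) The contract enforced by bpf_map_meta_alloc()/bpf_map_meta_equal() above is driven from userspace roughly as follows: the outer map is created against a template inner-map fd, and any fd later stored as a value must belong to a map with identical type, key/value sizes, flags and max_entries. A sketch using libbpf's pre-1.0 wrappers:

#include <unistd.h>
#include <bpf/bpf.h>

int create_outer_map(void)
{
	int inner_fd, outer_fd;

	/* Template inner map: its attributes become the metadata that
	 * bpf_map_meta_equal() checks against every future inner map fd.
	 */
	inner_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(__u32),
				  sizeof(__u64), 64, 0);
	if (inner_fd < 0)
		return -1;

	outer_fd = bpf_create_map_in_map(BPF_MAP_TYPE_ARRAY_OF_MAPS, "outer",
					 sizeof(__u32), inner_fd, 8, 0);

	/* Populating a slot later looks like:
	 *   __u32 idx = 0;  int fd2 = <fd of a compatible array map>;
	 *   bpf_map_update_elem(outer_fd, &idx, &fd2, BPF_ANY);
	 */
	close(inner_fd);	/* the template was only needed for metadata */
	return outer_fd;
}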
+++ */ +++ bpf_map_put(ptr); +++} +++ +++u32 bpf_map_fd_sys_lookup_elem(void *ptr) +++{ +++ return ((struct bpf_map *)ptr)->id; +++} ++--- /dev/null +++++ b/kernel/bpf/map_in_map.h ++@@ -0,0 +1,21 @@ +++/* SPDX-License-Identifier: GPL-2.0-only */ +++/* Copyright (c) 2017 Facebook +++ */ +++#ifndef __MAP_IN_MAP_H__ +++#define __MAP_IN_MAP_H__ +++ +++#include +++ +++struct file; +++struct bpf_map; +++ +++struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); +++void bpf_map_meta_free(struct bpf_map *map_meta); +++bool bpf_map_meta_equal(const struct bpf_map *meta0, +++ const struct bpf_map *meta1); +++void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, +++ int ufd); +++void bpf_map_fd_put_ptr(void *ptr); +++u32 bpf_map_fd_sys_lookup_elem(void *ptr); +++ +++#endif ++--- /dev/null +++++ b/kernel/bpf/percpu_freelist.c ++@@ -0,0 +1,118 @@ +++// SPDX-License-Identifier: GPL-2.0-only +++/* Copyright (c) 2016 Facebook +++ */ +++#include "percpu_freelist.h" +++ +++int pcpu_freelist_init(struct pcpu_freelist *s) +++{ +++ int cpu; +++ +++ s->freelist = alloc_percpu(struct pcpu_freelist_head); +++ if (!s->freelist) +++ return -ENOMEM; +++ +++ for_each_possible_cpu(cpu) { +++ struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); +++ +++ raw_spin_lock_init(&head->lock); +++ head->first = NULL; +++ } +++ return 0; +++} +++ +++void pcpu_freelist_destroy(struct pcpu_freelist *s) +++{ +++ free_percpu(s->freelist); +++} +++ +++static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, +++ struct pcpu_freelist_node *node) +++{ +++ raw_spin_lock(&head->lock); +++ node->next = head->first; +++ head->first = node; +++ raw_spin_unlock(&head->lock); +++} +++ +++void __pcpu_freelist_push(struct pcpu_freelist *s, +++ struct pcpu_freelist_node *node) +++{ +++ struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); +++ +++ ___pcpu_freelist_push(head, node); +++} +++ +++void pcpu_freelist_push(struct pcpu_freelist *s, +++ struct pcpu_freelist_node *node) +++{ +++ unsigned long flags; +++ +++ local_irq_save(flags); +++ __pcpu_freelist_push(s, node); +++ local_irq_restore(flags); +++} +++ +++void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, +++ u32 nr_elems) +++{ +++ struct pcpu_freelist_head *head; +++ unsigned long flags; +++ int i, cpu, pcpu_entries; +++ +++ pcpu_entries = nr_elems / num_possible_cpus() + 1; +++ i = 0; +++ +++ /* disable irq to workaround lockdep false positive +++ * in bpf usage pcpu_freelist_populate() will never race +++ * with pcpu_freelist_push() +++ */ +++ local_irq_save(flags); +++ for_each_possible_cpu(cpu) { +++again: +++ head = per_cpu_ptr(s->freelist, cpu); +++ ___pcpu_freelist_push(head, buf); +++ i++; +++ buf += elem_size; +++ if (i == nr_elems) +++ break; +++ if (i % pcpu_entries) +++ goto again; +++ } +++ local_irq_restore(flags); +++} +++ +++struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) +++{ +++ struct pcpu_freelist_head *head; +++ struct pcpu_freelist_node *node; +++ int orig_cpu, cpu; +++ +++ orig_cpu = cpu = raw_smp_processor_id(); +++ while (1) { +++ head = per_cpu_ptr(s->freelist, cpu); +++ raw_spin_lock(&head->lock); +++ node = head->first; +++ if (node) { +++ head->first = node->next; +++ raw_spin_unlock(&head->lock); +++ return node; +++ } +++ raw_spin_unlock(&head->lock); +++ cpu = cpumask_next(cpu, cpu_possible_mask); +++ if (cpu >= nr_cpu_ids) +++ cpu = 0; +++ if (cpu == orig_cpu) +++ return NULL; +++ } +++} +++ +++struct pcpu_freelist_node *pcpu_freelist_pop(struct 
pcpu_freelist *s) +++{ +++ struct pcpu_freelist_node *ret; +++ unsigned long flags; +++ +++ local_irq_save(flags); +++ ret = __pcpu_freelist_pop(s); +++ local_irq_restore(flags); +++ return ret; +++} ++--- /dev/null +++++ b/kernel/bpf/percpu_freelist.h ++@@ -0,0 +1,32 @@ +++/* SPDX-License-Identifier: GPL-2.0-only */ +++/* Copyright (c) 2016 Facebook +++ */ +++#ifndef __PERCPU_FREELIST_H__ +++#define __PERCPU_FREELIST_H__ +++#include +++#include +++ +++struct pcpu_freelist_head { +++ struct pcpu_freelist_node *first; +++ raw_spinlock_t lock; +++}; +++ +++struct pcpu_freelist { +++ struct pcpu_freelist_head __percpu *freelist; +++}; +++ +++struct pcpu_freelist_node { +++ struct pcpu_freelist_node *next; +++}; +++ +++/* pcpu_freelist_* do spin_lock_irqsave. */ +++void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +++struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +++/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */ +++void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +++struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *); +++void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, +++ u32 nr_elems); +++int pcpu_freelist_init(struct pcpu_freelist *); +++void pcpu_freelist_destroy(struct pcpu_freelist *s); +++#endif ++--- /dev/null +++++ b/kernel/bpf/queue_stack_maps.c ++@@ -0,0 +1,289 @@ +++// SPDX-License-Identifier: GPL-2.0 +++/* +++ * queue_stack_maps.c: BPF queue and stack maps +++ * +++ * Copyright (c) 2018 Politecnico di Torino +++ */ +++#include +++#include +++#include +++#include +++#include "percpu_freelist.h" +++ +++#define QUEUE_STACK_CREATE_FLAG_MASK \ +++ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) +++ +++struct bpf_queue_stack { +++ struct bpf_map map; +++ raw_spinlock_t lock; +++ u32 head, tail; +++ u32 size; /* max_entries + 1 */ +++ +++ char elements[0] __aligned(8); +++}; +++ +++static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) +++{ +++ return container_of(map, struct bpf_queue_stack, map); +++} +++ +++static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs) +++{ +++ return qs->head == qs->tail; +++} +++ +++static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) +++{ +++ u32 head = qs->head + 1; +++ +++ if (unlikely(head >= qs->size)) +++ head = 0; +++ +++ return head == qs->tail; +++} +++ +++/* Called from syscall */ +++static int queue_stack_map_alloc_check(union bpf_attr *attr) +++{ +++ if (!capable(CAP_SYS_ADMIN)) +++ return -EPERM; +++ +++ /* check sanity of attributes */ +++ if (attr->max_entries == 0 || attr->key_size != 0 || +++ attr->value_size == 0 || +++ attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK || +++ !bpf_map_flags_access_ok(attr->map_flags)) +++ return -EINVAL; +++ +++ if (attr->value_size > KMALLOC_MAX_SIZE) +++ /* if value_size is bigger, the user space won't be able to +++ * access the elements. 
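(Editor's sketch, not from the patch.) From userspace the queue/stack maps validated above are driven entirely through NULL keys: update pushes, lookup peeks and lookup-and-delete pops, matching the push/peek/pop ops wired up at the end of this file. A minimal queue example, assuming libbpf's legacy wrappers and CAP_SYS_ADMIN:

#include <stdio.h>
#include <bpf/bpf.h>

int demo_queue(void)
{
	__u64 v = 42, out = 0;
	int fd;

	/* key_size must be 0 for queue/stack maps (see the check above). */
	fd = bpf_create_map(BPF_MAP_TYPE_QUEUE, 0, sizeof(v), 16, 0);
	if (fd < 0)
		return -1;

	/* Push; passing BPF_EXIST instead would overwrite the oldest
	 * element once all max_entries slots of the ring are in use.
	 */
	bpf_map_update_elem(fd, NULL, &v, 0);

	/* Peek leaves the element in place, pop removes it. */
	bpf_map_lookup_elem(fd, NULL, &out);
	bpf_map_lookup_and_delete_elem(fd, NULL, &out);

	printf("popped %llu\n", (unsigned long long)out);
	return 0;
}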
+++ */ +++ return -E2BIG; +++ +++ return 0; +++} +++ +++static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) +++{ +++ int ret, numa_node = bpf_map_attr_numa_node(attr); +++ struct bpf_map_memory mem = {0}; +++ struct bpf_queue_stack *qs; +++ u64 size, queue_size, cost; +++ +++ size = (u64) attr->max_entries + 1; +++ cost = queue_size = sizeof(*qs) + size * attr->value_size; +++ +++ ret = bpf_map_charge_init(&mem, cost); +++ if (ret < 0) +++ return ERR_PTR(ret); +++ +++ qs = bpf_map_area_alloc(queue_size, numa_node); +++ if (!qs) { +++ bpf_map_charge_finish(&mem); +++ return ERR_PTR(-ENOMEM); +++ } +++ +++ memset(qs, 0, sizeof(*qs)); +++ +++ bpf_map_init_from_attr(&qs->map, attr); +++ +++ bpf_map_charge_move(&qs->map.memory, &mem); +++ qs->size = size; +++ +++ raw_spin_lock_init(&qs->lock); +++ +++ return &qs->map; +++} +++ +++/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +++static void queue_stack_map_free(struct bpf_map *map) +++{ +++ struct bpf_queue_stack *qs = bpf_queue_stack(map); +++ +++ /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, +++ * so the programs (can be more than one that used this map) were +++ * disconnected from events. Wait for outstanding critical sections in +++ * these programs to complete +++ */ +++ synchronize_rcu(); +++ +++ bpf_map_area_free(qs); +++} +++ +++static int __queue_map_get(struct bpf_map *map, void *value, bool delete) +++{ +++ struct bpf_queue_stack *qs = bpf_queue_stack(map); +++ unsigned long flags; +++ int err = 0; +++ void *ptr; +++ +++ raw_spin_lock_irqsave(&qs->lock, flags); +++ +++ if (queue_stack_map_is_empty(qs)) { +++ memset(value, 0, qs->map.value_size); +++ err = -ENOENT; +++ goto out; +++ } +++ +++ ptr = &qs->elements[qs->tail * qs->map.value_size]; +++ memcpy(value, ptr, qs->map.value_size); +++ +++ if (delete) { +++ if (unlikely(++qs->tail >= qs->size)) +++ qs->tail = 0; +++ } +++ +++out: +++ raw_spin_unlock_irqrestore(&qs->lock, flags); +++ return err; +++} +++ +++ +++static int __stack_map_get(struct bpf_map *map, void *value, bool delete) +++{ +++ struct bpf_queue_stack *qs = bpf_queue_stack(map); +++ unsigned long flags; +++ int err = 0; +++ void *ptr; +++ u32 index; +++ +++ raw_spin_lock_irqsave(&qs->lock, flags); +++ +++ if (queue_stack_map_is_empty(qs)) { +++ memset(value, 0, qs->map.value_size); +++ err = -ENOENT; +++ goto out; +++ } +++ +++ index = qs->head - 1; +++ if (unlikely(index >= qs->size)) +++ index = qs->size - 1; +++ +++ ptr = &qs->elements[index * qs->map.value_size]; +++ memcpy(value, ptr, qs->map.value_size); +++ +++ if (delete) +++ qs->head = index; +++ +++out: +++ raw_spin_unlock_irqrestore(&qs->lock, flags); +++ return err; +++} +++ +++/* Called from syscall or from eBPF program */ +++static int queue_map_peek_elem(struct bpf_map *map, void *value) +++{ +++ return __queue_map_get(map, value, false); +++} +++ +++/* Called from syscall or from eBPF program */ +++static int stack_map_peek_elem(struct bpf_map *map, void *value) +++{ +++ return __stack_map_get(map, value, false); +++} +++ +++/* Called from syscall or from eBPF program */ +++static int queue_map_pop_elem(struct bpf_map *map, void *value) +++{ +++ return __queue_map_get(map, value, true); +++} +++ +++/* Called from syscall or from eBPF program */ +++static int stack_map_pop_elem(struct bpf_map *map, void *value) +++{ +++ return __stack_map_get(map, value, true); +++} +++ +++/* Called from syscall or from eBPF program */ +++static int queue_stack_map_push_elem(struct bpf_map 
*map, void *value, +++ u64 flags) +++{ +++ struct bpf_queue_stack *qs = bpf_queue_stack(map); +++ unsigned long irq_flags; +++ int err = 0; +++ void *dst; +++ +++ /* BPF_EXIST is used to force making room for a new element in case the +++ * map is full +++ */ +++ bool replace = (flags & BPF_EXIST); +++ +++ /* Check supported flags for queue and stack maps */ +++ if (flags & BPF_NOEXIST || flags > BPF_EXIST) +++ return -EINVAL; +++ +++ raw_spin_lock_irqsave(&qs->lock, irq_flags); +++ +++ if (queue_stack_map_is_full(qs)) { +++ if (!replace) { +++ err = -E2BIG; +++ goto out; +++ } +++ /* advance tail pointer to overwrite oldest element */ +++ if (unlikely(++qs->tail >= qs->size)) +++ qs->tail = 0; +++ } +++ +++ dst = &qs->elements[qs->head * qs->map.value_size]; +++ memcpy(dst, value, qs->map.value_size); +++ +++ if (unlikely(++qs->head >= qs->size)) +++ qs->head = 0; +++ +++out: +++ raw_spin_unlock_irqrestore(&qs->lock, irq_flags); +++ return err; +++} +++ +++/* Called from syscall or from eBPF program */ +++static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key) +++{ +++ return NULL; +++} +++ +++/* Called from syscall or from eBPF program */ +++static int queue_stack_map_update_elem(struct bpf_map *map, void *key, +++ void *value, u64 flags) +++{ +++ return -EINVAL; +++} +++ +++/* Called from syscall or from eBPF program */ +++static int queue_stack_map_delete_elem(struct bpf_map *map, void *key) +++{ +++ return -EINVAL; +++} +++ +++/* Called from syscall */ +++static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, +++ void *next_key) +++{ +++ return -EINVAL; +++} +++ +++const struct bpf_map_ops queue_map_ops = { +++ .map_alloc_check = queue_stack_map_alloc_check, +++ .map_alloc = queue_stack_map_alloc, +++ .map_free = queue_stack_map_free, +++ .map_lookup_elem = queue_stack_map_lookup_elem, +++ .map_update_elem = queue_stack_map_update_elem, +++ .map_delete_elem = queue_stack_map_delete_elem, +++ .map_push_elem = queue_stack_map_push_elem, +++ .map_pop_elem = queue_map_pop_elem, +++ .map_peek_elem = queue_map_peek_elem, +++ .map_get_next_key = queue_stack_map_get_next_key, +++}; +++ +++const struct bpf_map_ops stack_map_ops = { +++ .map_alloc_check = queue_stack_map_alloc_check, +++ .map_alloc = queue_stack_map_alloc, +++ .map_free = queue_stack_map_free, +++ .map_lookup_elem = queue_stack_map_lookup_elem, +++ .map_update_elem = queue_stack_map_update_elem, +++ .map_delete_elem = queue_stack_map_delete_elem, +++ .map_push_elem = queue_stack_map_push_elem, +++ .map_pop_elem = stack_map_pop_elem, +++ .map_peek_elem = stack_map_peek_elem, +++ .map_get_next_key = queue_stack_map_get_next_key, +++}; ++--- /dev/null +++++ b/kernel/bpf/stackmap.c ++@@ -0,0 +1,634 @@ +++// SPDX-License-Identifier: GPL-2.0-only +++/* Copyright (c) 2016 Facebook +++ */ +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include "percpu_freelist.h" +++ +++#define STACK_CREATE_FLAG_MASK \ +++ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \ +++ BPF_F_STACK_BUILD_ID) +++ +++struct stack_map_bucket { +++ struct pcpu_freelist_node fnode; +++ u32 hash; +++ u32 nr; +++ u64 data[]; +++}; +++ +++struct bpf_stack_map { +++ struct bpf_map map; +++ void *elems; +++ struct pcpu_freelist freelist; +++ u32 n_buckets; +++ struct stack_map_bucket *buckets[]; +++}; +++ +++/* irq_work to run up_read() for build_id lookup in nmi context */ +++struct stack_map_irq_work { +++ struct irq_work irq_work; +++ struct rw_semaphore *sem; +++}; +++ +++static void 
do_up_read(struct irq_work *entry) +++{ +++ struct stack_map_irq_work *work; +++ +++ work = container_of(entry, struct stack_map_irq_work, irq_work); +++ up_read_non_owner(work->sem); +++ work->sem = NULL; +++} +++ +++static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work); +++ +++static inline bool stack_map_use_build_id(struct bpf_map *map) +++{ +++ return (map->map_flags & BPF_F_STACK_BUILD_ID); +++} +++ +++static inline int stack_map_data_size(struct bpf_map *map) +++{ +++ return stack_map_use_build_id(map) ? +++ sizeof(struct bpf_stack_build_id) : sizeof(u64); +++} +++ +++static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) +++{ +++ u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; +++ int err; +++ +++ smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, +++ smap->map.numa_node); +++ if (!smap->elems) +++ return -ENOMEM; +++ +++ err = pcpu_freelist_init(&smap->freelist); +++ if (err) +++ goto free_elems; +++ +++ pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, +++ smap->map.max_entries); +++ return 0; +++ +++free_elems: +++ bpf_map_area_free(smap->elems); +++ return err; +++} +++ +++/* Called from syscall */ +++static struct bpf_map *stack_map_alloc(union bpf_attr *attr) +++{ +++ u32 value_size = attr->value_size; +++ struct bpf_stack_map *smap; +++ struct bpf_map_memory mem; +++ u64 cost, n_buckets; +++ int err; +++ +++ if (!capable(CAP_SYS_ADMIN)) +++ return ERR_PTR(-EPERM); +++ +++ if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) +++ return ERR_PTR(-EINVAL); +++ +++ /* check sanity of attributes */ +++ if (attr->max_entries == 0 || attr->key_size != 4 || +++ value_size < 8 || value_size % 8) +++ return ERR_PTR(-EINVAL); +++ +++ BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64)); +++ if (attr->map_flags & BPF_F_STACK_BUILD_ID) { +++ if (value_size % sizeof(struct bpf_stack_build_id) || +++ value_size / sizeof(struct bpf_stack_build_id) +++ > sysctl_perf_event_max_stack) +++ return ERR_PTR(-EINVAL); +++ } else if (value_size / 8 > sysctl_perf_event_max_stack) +++ return ERR_PTR(-EINVAL); +++ +++ /* hash table size must be power of 2 */ +++ n_buckets = roundup_pow_of_two(attr->max_entries); +++ if (!n_buckets) +++ return ERR_PTR(-E2BIG); +++ +++ cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); +++ cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); +++ err = bpf_map_charge_init(&mem, cost); +++ if (err) +++ return ERR_PTR(err); +++ +++ smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); +++ if (!smap) { +++ bpf_map_charge_finish(&mem); +++ return ERR_PTR(-ENOMEM); +++ } +++ +++ bpf_map_init_from_attr(&smap->map, attr); +++ smap->map.value_size = value_size; +++ smap->n_buckets = n_buckets; +++ +++ err = get_callchain_buffers(sysctl_perf_event_max_stack); +++ if (err) +++ goto free_charge; +++ +++ err = prealloc_elems_and_freelist(smap); +++ if (err) +++ goto put_buffers; +++ +++ bpf_map_charge_move(&smap->map.memory, &mem); +++ +++ return &smap->map; +++ +++put_buffers: +++ put_callchain_buffers(); +++free_charge: +++ bpf_map_charge_finish(&mem); +++ bpf_map_area_free(smap); +++ return ERR_PTR(err); +++} +++ +++#define BPF_BUILD_ID 3 +++/* +++ * Parse build id from the note segment. This logic can be shared between +++ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are +++ * identical. 
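(Editor's illustration, not from the patch.) The note segment walked by stack_map_parse_build_id() below is a sequence of Elf32_Nhdr headers, each followed by a 4-byte-aligned name and descriptor; for the GNU build-id note the name is "GNU" and n_type is NT_GNU_BUILD_ID (3, i.e. BPF_BUILD_ID above). The offsets the parser computes can be reproduced with plain ELF headers:

#include <elf.h>
#include <stdio.h>

/* Offsets inside one note entry, mirroring the pointer arithmetic in
 * stack_map_parse_build_id(). build_id_len is typically 20 (SHA-1).
 */
void dump_note_layout(unsigned int build_id_len)
{
	size_t name_off = sizeof(Elf32_Nhdr);				/* 12 */
	size_t desc_off = name_off + ((sizeof("GNU") + 3) & ~3UL);	/* 16 */
	size_t next_off = desc_off + ((build_id_len + 3) & ~3UL);

	printf("name \"GNU\" at +%zu, build id at +%zu, next note at +%zu\n",
	       name_off, desc_off, next_off);
}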
+++ */ +++static inline int stack_map_parse_build_id(void *page_addr, +++ unsigned char *build_id, +++ void *note_start, +++ Elf32_Word note_size) +++{ +++ Elf32_Word note_offs = 0, new_offs; +++ +++ /* check for overflow */ +++ if (note_start < page_addr || note_start + note_size < note_start) +++ return -EINVAL; +++ +++ /* only supports note that fits in the first page */ +++ if (note_start + note_size > page_addr + PAGE_SIZE) +++ return -EINVAL; +++ +++ while (note_offs + sizeof(Elf32_Nhdr) < note_size) { +++ Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); +++ +++ if (nhdr->n_type == BPF_BUILD_ID && +++ nhdr->n_namesz == sizeof("GNU") && +++ nhdr->n_descsz > 0 && +++ nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { +++ memcpy(build_id, +++ note_start + note_offs + +++ ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), +++ nhdr->n_descsz); +++ memset(build_id + nhdr->n_descsz, 0, +++ BPF_BUILD_ID_SIZE - nhdr->n_descsz); +++ return 0; +++ } +++ new_offs = note_offs + sizeof(Elf32_Nhdr) + +++ ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); +++ if (new_offs <= note_offs) /* overflow */ +++ break; +++ note_offs = new_offs; +++ } +++ return -EINVAL; +++} +++ +++/* Parse build ID from 32-bit ELF */ +++static int stack_map_get_build_id_32(void *page_addr, +++ unsigned char *build_id) +++{ +++ Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; +++ Elf32_Phdr *phdr; +++ int i; +++ +++ /* only supports phdr that fits in one page */ +++ if (ehdr->e_phnum > +++ (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) +++ return -EINVAL; +++ +++ phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); +++ +++ for (i = 0; i < ehdr->e_phnum; ++i) +++ if (phdr[i].p_type == PT_NOTE) +++ return stack_map_parse_build_id(page_addr, build_id, +++ page_addr + phdr[i].p_offset, +++ phdr[i].p_filesz); +++ return -EINVAL; +++} +++ +++/* Parse build ID from 64-bit ELF */ +++static int stack_map_get_build_id_64(void *page_addr, +++ unsigned char *build_id) +++{ +++ Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; +++ Elf64_Phdr *phdr; +++ int i; +++ +++ /* only supports phdr that fits in one page */ +++ if (ehdr->e_phnum > +++ (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) +++ return -EINVAL; +++ +++ phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); +++ +++ for (i = 0; i < ehdr->e_phnum; ++i) +++ if (phdr[i].p_type == PT_NOTE) +++ return stack_map_parse_build_id(page_addr, build_id, +++ page_addr + phdr[i].p_offset, +++ phdr[i].p_filesz); +++ return -EINVAL; +++} +++ +++/* Parse build ID of ELF file mapped to vma */ +++static int stack_map_get_build_id(struct vm_area_struct *vma, +++ unsigned char *build_id) +++{ +++ Elf32_Ehdr *ehdr; +++ struct page *page; +++ void *page_addr; +++ int ret; +++ +++ /* only works for page backed storage */ +++ if (!vma->vm_file) +++ return -EINVAL; +++ +++ page = find_get_page(vma->vm_file->f_mapping, 0); +++ if (!page) +++ return -EFAULT; /* page not mapped */ +++ +++ ret = -EINVAL; +++ page_addr = kmap_atomic(page); +++ ehdr = (Elf32_Ehdr *)page_addr; +++ +++ /* compare magic x7f "ELF" */ +++ if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) +++ goto out; +++ +++ /* only support executable file and shared object file */ +++ if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) +++ goto out; +++ +++ if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) +++ ret = stack_map_get_build_id_32(page_addr, build_id); +++ else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) +++ ret = stack_map_get_build_id_64(page_addr, build_id); +++out: +++ kunmap_atomic(page_addr); +++ put_page(page); +++ return ret; 
+++} +++ +++static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, +++ u64 *ips, u32 trace_nr, bool user) +++{ +++ int i; +++ struct vm_area_struct *vma; +++ bool irq_work_busy = false; +++ struct stack_map_irq_work *work = NULL; +++ +++ if (irqs_disabled()) { +++ work = this_cpu_ptr(&up_read_work); +++ if (work->irq_work.flags & IRQ_WORK_BUSY) +++ /* cannot queue more up_read, fallback */ +++ irq_work_busy = true; +++ } +++ +++ /* +++ * We cannot do up_read() when the irq is disabled, because of +++ * risk to deadlock with rq_lock. To do build_id lookup when the +++ * irqs are disabled, we need to run up_read() in irq_work. We use +++ * a percpu variable to do the irq_work. If the irq_work is +++ * already used by another lookup, we fall back to report ips. +++ * +++ * Same fallback is used for kernel stack (!user) on a stackmap +++ * with build_id. +++ */ +++ if (!user || !current || !current->mm || irq_work_busy || +++ down_read_trylock(¤t->mm->mmap_sem) == 0) { +++ /* cannot access current->mm, fall back to ips */ +++ for (i = 0; i < trace_nr; i++) { +++ id_offs[i].status = BPF_STACK_BUILD_ID_IP; +++ id_offs[i].ip = ips[i]; +++ memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); +++ } +++ return; +++ } +++ +++ for (i = 0; i < trace_nr; i++) { +++ vma = find_vma(current->mm, ips[i]); +++ if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { +++ /* per entry fall back to ips */ +++ id_offs[i].status = BPF_STACK_BUILD_ID_IP; +++ id_offs[i].ip = ips[i]; +++ memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); +++ continue; +++ } +++ id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] +++ - vma->vm_start; +++ id_offs[i].status = BPF_STACK_BUILD_ID_VALID; +++ } +++ +++ if (!work) { +++ up_read(¤t->mm->mmap_sem); +++ } else { +++ work->sem = ¤t->mm->mmap_sem; +++ irq_work_queue(&work->irq_work); +++ /* +++ * The irq_work will release the mmap_sem with +++ * up_read_non_owner(). The rwsem_release() is called +++ * here to release the lock from lockdep's perspective. 
+++ */ +++ rwsem_release(¤t->mm->mmap_sem.dep_map, 1, _RET_IP_); +++ } +++} +++ +++BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, +++ u64, flags) +++{ +++ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); +++ struct perf_callchain_entry *trace; +++ struct stack_map_bucket *bucket, *new_bucket, *old_bucket; +++ u32 max_depth = map->value_size / stack_map_data_size(map); +++ /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ +++ u32 init_nr = sysctl_perf_event_max_stack - max_depth; +++ u32 skip = flags & BPF_F_SKIP_FIELD_MASK; +++ u32 hash, id, trace_nr, trace_len; +++ bool user = flags & BPF_F_USER_STACK; +++ bool kernel = !user; +++ u64 *ips; +++ bool hash_matches; +++ +++ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | +++ BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) +++ return -EINVAL; +++ +++ trace = get_perf_callchain(regs, init_nr, kernel, user, +++ sysctl_perf_event_max_stack, false, false); +++ +++ if (unlikely(!trace)) +++ /* couldn't fetch the stack trace */ +++ return -EFAULT; +++ +++ /* get_perf_callchain() guarantees that trace->nr >= init_nr +++ * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth +++ */ +++ trace_nr = trace->nr - init_nr; +++ +++ if (trace_nr <= skip) +++ /* skipping more than usable stack trace */ +++ return -EFAULT; +++ +++ trace_nr -= skip; +++ trace_len = trace_nr * sizeof(u64); +++ ips = trace->ip + skip + init_nr; +++ hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); +++ id = hash & (smap->n_buckets - 1); +++ bucket = READ_ONCE(smap->buckets[id]); +++ +++ hash_matches = bucket && bucket->hash == hash; +++ /* fast cmp */ +++ if (hash_matches && flags & BPF_F_FAST_STACK_CMP) +++ return id; +++ +++ if (stack_map_use_build_id(map)) { +++ /* for build_id+offset, pop a bucket before slow cmp */ +++ new_bucket = (struct stack_map_bucket *) +++ pcpu_freelist_pop(&smap->freelist); +++ if (unlikely(!new_bucket)) +++ return -ENOMEM; +++ new_bucket->nr = trace_nr; +++ stack_map_get_build_id_offset( +++ (struct bpf_stack_build_id *)new_bucket->data, +++ ips, trace_nr, user); +++ trace_len = trace_nr * sizeof(struct bpf_stack_build_id); +++ if (hash_matches && bucket->nr == trace_nr && +++ memcmp(bucket->data, new_bucket->data, trace_len) == 0) { +++ pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); +++ return id; +++ } +++ if (bucket && !(flags & BPF_F_REUSE_STACKID)) { +++ pcpu_freelist_push(&smap->freelist, &new_bucket->fnode); +++ return -EEXIST; +++ } +++ } else { +++ if (hash_matches && bucket->nr == trace_nr && +++ memcmp(bucket->data, ips, trace_len) == 0) +++ return id; +++ if (bucket && !(flags & BPF_F_REUSE_STACKID)) +++ return -EEXIST; +++ +++ new_bucket = (struct stack_map_bucket *) +++ pcpu_freelist_pop(&smap->freelist); +++ if (unlikely(!new_bucket)) +++ return -ENOMEM; +++ memcpy(new_bucket->data, ips, trace_len); +++ } +++ +++ new_bucket->hash = hash; +++ new_bucket->nr = trace_nr; +++ +++ old_bucket = xchg(&smap->buckets[id], new_bucket); +++ if (old_bucket) +++ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); +++ return id; +++} +++ +++const struct bpf_func_proto bpf_get_stackid_proto = { +++ .func = bpf_get_stackid, +++ .gpl_only = true, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_CONST_MAP_PTR, +++ .arg3_type = ARG_ANYTHING, +++}; +++ +++BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, +++ u64, flags) +++{ +++ u32 init_nr, trace_nr, copy_len, 
elem_size, num_elem; +++ bool user_build_id = flags & BPF_F_USER_BUILD_ID; +++ u32 skip = flags & BPF_F_SKIP_FIELD_MASK; +++ bool user = flags & BPF_F_USER_STACK; +++ struct perf_callchain_entry *trace; +++ bool kernel = !user; +++ int err = -EINVAL; +++ u64 *ips; +++ +++ if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | +++ BPF_F_USER_BUILD_ID))) +++ goto clear; +++ if (kernel && user_build_id) +++ goto clear; +++ +++ elem_size = (user && user_build_id) ? sizeof(struct bpf_stack_build_id) +++ : sizeof(u64); +++ if (unlikely(size % elem_size)) +++ goto clear; +++ +++ num_elem = size / elem_size; +++ if (sysctl_perf_event_max_stack < num_elem) +++ init_nr = 0; +++ else +++ init_nr = sysctl_perf_event_max_stack - num_elem; +++ trace = get_perf_callchain(regs, init_nr, kernel, user, +++ sysctl_perf_event_max_stack, false, false); +++ if (unlikely(!trace)) +++ goto err_fault; +++ +++ trace_nr = trace->nr - init_nr; +++ if (trace_nr < skip) +++ goto err_fault; +++ +++ trace_nr -= skip; +++ trace_nr = (trace_nr <= num_elem) ? trace_nr : num_elem; +++ copy_len = trace_nr * elem_size; +++ ips = trace->ip + skip + init_nr; +++ if (user && user_build_id) +++ stack_map_get_build_id_offset(buf, ips, trace_nr, user); +++ else +++ memcpy(buf, ips, copy_len); +++ +++ if (size > copy_len) +++ memset(buf + copy_len, 0, size - copy_len); +++ return copy_len; +++ +++err_fault: +++ err = -EFAULT; +++clear: +++ memset(buf, 0, size); +++ return err; +++} +++ +++const struct bpf_func_proto bpf_get_stack_proto = { +++ .func = bpf_get_stack, +++ .gpl_only = true, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_PTR_TO_UNINIT_MEM, +++ .arg3_type = ARG_CONST_SIZE_OR_ZERO, +++ .arg4_type = ARG_ANYTHING, +++}; +++ +++/* Called from eBPF program */ +++static void *stack_map_lookup_elem(struct bpf_map *map, void *key) +++{ +++ return ERR_PTR(-EOPNOTSUPP); +++} +++ +++/* Called from syscall */ +++int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +++{ +++ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); +++ struct stack_map_bucket *bucket, *old_bucket; +++ u32 id = *(u32 *)key, trace_len; +++ +++ if (unlikely(id >= smap->n_buckets)) +++ return -ENOENT; +++ +++ bucket = xchg(&smap->buckets[id], NULL); +++ if (!bucket) +++ return -ENOENT; +++ +++ trace_len = bucket->nr * stack_map_data_size(map); +++ memcpy(value, bucket->data, trace_len); +++ memset(value + trace_len, 0, map->value_size - trace_len); +++ +++ old_bucket = xchg(&smap->buckets[id], bucket); +++ if (old_bucket) +++ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); +++ return 0; +++} +++ +++static int stack_map_get_next_key(struct bpf_map *map, void *key, +++ void *next_key) +++{ +++ struct bpf_stack_map *smap = container_of(map, +++ struct bpf_stack_map, map); +++ u32 id; +++ +++ WARN_ON_ONCE(!rcu_read_lock_held()); +++ +++ if (!key) { +++ id = 0; +++ } else { +++ id = *(u32 *)key; +++ if (id >= smap->n_buckets || !smap->buckets[id]) +++ id = 0; +++ else +++ id++; +++ } +++ +++ while (id < smap->n_buckets && !smap->buckets[id]) +++ id++; +++ +++ if (id >= smap->n_buckets) +++ return -ENOENT; +++ +++ *(u32 *)next_key = id; +++ return 0; +++} +++ +++static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, +++ u64 map_flags) +++{ +++ return -EINVAL; +++} +++ +++/* Called from syscall or from eBPF program */ +++static int stack_map_delete_elem(struct bpf_map *map, void *key) +++{ +++ struct bpf_stack_map *smap = container_of(map, struct 
bpf_stack_map, map); +++ struct stack_map_bucket *old_bucket; +++ u32 id = *(u32 *)key; +++ +++ if (unlikely(id >= smap->n_buckets)) +++ return -E2BIG; +++ +++ old_bucket = xchg(&smap->buckets[id], NULL); +++ if (old_bucket) { +++ pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); +++ return 0; +++ } else { +++ return -ENOENT; +++ } +++} +++ +++/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +++static void stack_map_free(struct bpf_map *map) +++{ +++ struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); +++ +++ /* wait for bpf programs to complete before freeing stack map */ +++ synchronize_rcu(); +++ +++ bpf_map_area_free(smap->elems); +++ pcpu_freelist_destroy(&smap->freelist); +++ bpf_map_area_free(smap); +++ put_callchain_buffers(); +++} +++ +++const struct bpf_map_ops stack_trace_map_ops = { +++ .map_alloc = stack_map_alloc, +++ .map_free = stack_map_free, +++ .map_get_next_key = stack_map_get_next_key, +++ .map_lookup_elem = stack_map_lookup_elem, +++ .map_update_elem = stack_map_update_elem, +++ .map_delete_elem = stack_map_delete_elem, +++ .map_check_btf = map_check_no_btf, +++}; +++ +++static int __init stack_map_init(void) +++{ +++ int cpu; +++ struct stack_map_irq_work *work; +++ +++ for_each_possible_cpu(cpu) { +++ work = per_cpu_ptr(&up_read_work, cpu); +++ init_irq_work(&work->irq_work, do_up_read); +++ } +++ return 0; +++} +++subsys_initcall(stack_map_init); ++--- a/kernel/bpf/syscall.c +++++ b/kernel/bpf/syscall.c ++@@ -1,106 +1,333 @@ +++// SPDX-License-Identifier: GPL-2.0-only ++ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com ++- * ++- * This program is free software; you can redistribute it and/or ++- * modify it under the terms of version 2 of the GNU General Public ++- * License as published by the Free Software Foundation. ++- * ++- * This program is distributed in the hope that it will be useful, but ++- * WITHOUT ANY WARRANTY; without even the implied warranty of ++- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++- * General Public License for more details. ++ */ ++ #include +++#include +++#include ++ #include ++ #include +++#include +++#include ++ #include +++#include ++ #include +++#include ++ #include ++ #include ++ #include +++#include +++#include +++#include +++#include +++#include +++ +++#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \ +++ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ +++ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ +++ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +++#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) +++#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map)) +++ +++#define BPF_OBJ_FLAG_MASK (BPF_F_RDONLY | BPF_F_WRONLY) +++ +++DEFINE_PER_CPU(int, bpf_prog_active); +++static DEFINE_IDR(prog_idr); +++static DEFINE_SPINLOCK(prog_idr_lock); +++static DEFINE_IDR(map_idr); +++static DEFINE_SPINLOCK(map_idr_lock); ++ ++ int sysctl_unprivileged_bpf_disabled __read_mostly; ++ ++-static LIST_HEAD(bpf_map_types); +++static const struct bpf_map_ops * const bpf_map_types[] = { +++#define BPF_PROG_TYPE(_id, _ops) +++#define BPF_MAP_TYPE(_id, _ops) \ +++ [_id] = &_ops, +++#include +++#undef BPF_PROG_TYPE +++#undef BPF_MAP_TYPE +++}; +++ +++/* +++ * If we're handed a bigger struct than we know of, ensure all the unknown bits +++ * are 0 - i.e. new user-space does not rely on any kernel feature extensions +++ * we don't know about yet. 
+++ * +++ * There is a ToCToU between this function call and the following +++ * copy_from_user() call. However, this is not a concern since this function is +++ * meant to be a future-proofing of bits. +++ */ +++int bpf_check_uarg_tail_zero(void __user *uaddr, +++ size_t expected_size, +++ size_t actual_size) +++{ +++ unsigned char __user *addr; +++ unsigned char __user *end; +++ unsigned char val; +++ int err; +++ +++ if (unlikely(actual_size > PAGE_SIZE)) /* silly large */ +++ return -E2BIG; +++ +++ if (unlikely(!access_ok(VERIFY_READ, uaddr, actual_size))) +++ return -EFAULT; +++ +++ if (actual_size <= expected_size) +++ return 0; +++ +++ addr = uaddr + expected_size; +++ end = uaddr + actual_size; +++ +++ for (; addr < end; addr++) { +++ err = get_user(val, addr); +++ if (err) +++ return err; +++ if (val) +++ return -E2BIG; +++ } +++ +++ return 0; +++} ++ ++ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) ++ { ++- struct bpf_map_type_list *tl; +++ const struct bpf_map_ops *ops; +++ u32 type = attr->map_type; ++ struct bpf_map *map; +++ int err; ++ ++- list_for_each_entry(tl, &bpf_map_types, list_node) { ++- if (tl->type == attr->map_type) { ++- map = tl->ops->map_alloc(attr); ++- if (IS_ERR(map)) ++- return map; ++- map->ops = tl->ops; ++- map->map_type = attr->map_type; ++- return map; ++- } +++ if (type >= ARRAY_SIZE(bpf_map_types)) +++ return ERR_PTR(-EINVAL); +++ ops = bpf_map_types[type]; +++ if (!ops) +++ return ERR_PTR(-EINVAL); +++ +++ if (ops->map_alloc_check) { +++ err = ops->map_alloc_check(attr); +++ if (err) +++ return ERR_PTR(err); +++ } +++ map = ops->map_alloc(attr); +++ if (IS_ERR(map)) +++ return map; +++ map->ops = ops; +++ map->map_type = type; +++ return map; +++} +++ +++void *bpf_map_area_alloc(u64 size, int numa_node) +++{ +++ /* We really just want to fail instead of triggering OOM killer +++ * under memory pressure, therefore we set __GFP_NORETRY to kmalloc, +++ * which is used for lower order allocation requests. +++ * +++ * It has been observed that higher order allocation requests done by +++ * vmalloc with __GFP_NORETRY being set might fail due to not trying +++ * to reclaim memory from the page cache, thus we set +++ * __GFP_RETRY_MAYFAIL to avoid such situations. +++ */ +++ +++ const gfp_t flags = __GFP_NOWARN | __GFP_ZERO; +++ void *area; +++ +++ if (size >= SIZE_MAX) +++ return NULL; +++ +++ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { +++ area = kmalloc_node(size, GFP_USER | __GFP_NORETRY | flags, +++ numa_node); +++ if (area != NULL) +++ return area; ++ } ++- return ERR_PTR(-EINVAL); +++ +++ return __vmalloc_node_range(size, 1, +++ VMALLOC_START, VMALLOC_END, +++ GFP_KERNEL | flags, +++ PAGE_KERNEL, 0, numa_node, +++ __builtin_return_address(0)); ++ } ++ ++-/* boot time registration of different map implementations */ ++-void bpf_register_map_type(struct bpf_map_type_list *tl) +++void bpf_map_area_free(void *area) ++ { ++- list_add(&tl->list_node, &bpf_map_types); +++ kvfree(area); ++ } ++ ++-static int bpf_map_charge_memlock(struct bpf_map *map) +++static u32 bpf_map_flags_retain_permanent(u32 flags) ++ { ++- struct user_struct *user = get_current_user(); ++- unsigned long memlock_limit; +++ /* Some map creation flags are not tied to the map object but +++ * rather to the map fd instead, so they have no meaning upon +++ * map object inspection since multiple file descriptors with +++ * different (access) properties can exist here. Thus, given +++ * this has zero meaning for the map itself, lets clear these +++ * from here. 
+++ */ +++ return flags & ~(BPF_F_RDONLY | BPF_F_WRONLY); +++} +++ +++void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) +++{ +++ map->map_type = attr->map_type; +++ map->key_size = attr->key_size; +++ map->value_size = attr->value_size; +++ map->max_entries = attr->max_entries; +++ map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags); +++ map->numa_node = bpf_map_attr_numa_node(attr); +++} +++ +++static int bpf_charge_memlock(struct user_struct *user, u32 pages) +++{ +++ unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; +++ +++ if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { +++ atomic_long_sub(pages, &user->locked_vm); +++ return -EPERM; +++ } +++ return 0; +++} +++ +++static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) +++{ +++ if (user) +++ atomic_long_sub(pages, &user->locked_vm); +++} ++ ++- memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; +++int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size) +++{ +++ u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; +++ struct user_struct *user; +++ int ret; ++ ++- atomic_long_add(map->pages, &user->locked_vm); +++ if (size >= U32_MAX - PAGE_SIZE) +++ return -E2BIG; ++ ++- if (atomic_long_read(&user->locked_vm) > memlock_limit) { ++- atomic_long_sub(map->pages, &user->locked_vm); +++ user = get_current_user(); +++ ret = bpf_charge_memlock(user, pages); +++ if (ret) { ++ free_uid(user); ++- return -EPERM; +++ return ret; ++ } ++- map->user = user; +++ +++ mem->pages = pages; +++ mem->user = user; +++ ++ return 0; ++ } ++ ++-static void bpf_map_uncharge_memlock(struct bpf_map *map) +++void bpf_map_charge_finish(struct bpf_map_memory *mem) ++ { ++- struct user_struct *user = map->user; +++ bpf_uncharge_memlock(mem->user, mem->pages); +++ free_uid(mem->user); +++} ++ ++- atomic_long_sub(map->pages, &user->locked_vm); ++- free_uid(user); +++void bpf_map_charge_move(struct bpf_map_memory *dst, +++ struct bpf_map_memory *src) +++{ +++ *dst = *src; +++ +++ /* Make sure src will not be used for the redundant uncharging. */ +++ memset(src, 0, sizeof(struct bpf_map_memory)); +++} +++ +++int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) +++{ +++ int ret; +++ +++ ret = bpf_charge_memlock(map->memory.user, pages); +++ if (ret) +++ return ret; +++ map->memory.pages += pages; +++ return ret; +++} +++ +++void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) +++{ +++ bpf_uncharge_memlock(map->memory.user, pages); +++ map->memory.pages -= pages; +++} +++ +++static int bpf_map_alloc_id(struct bpf_map *map) +++{ +++ int id; +++ +++ idr_preload(GFP_KERNEL); +++ spin_lock_bh(&map_idr_lock); +++ id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); +++ if (id > 0) +++ map->id = id; +++ spin_unlock_bh(&map_idr_lock); +++ idr_preload_end(); +++ +++ if (WARN_ON_ONCE(!id)) +++ return -ENOSPC; +++ +++ return id > 0 ? 0 : id; +++} +++ +++void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) +++{ +++ unsigned long flags; +++ +++ /* Offloaded maps are removed from the IDR store when their device +++ * disappears - even if someone holds an fd to them they are unusable, +++ * the memory is gone, all ops will fail; they are simply waiting for +++ * refcnt to drop to be freed. 
+++ */ +++ if (!map->id) +++ return; +++ +++ if (do_idr_lock) +++ spin_lock_irqsave(&map_idr_lock, flags); +++ else +++ __acquire(&map_idr_lock); +++ +++ idr_remove(&map_idr, map->id); +++ map->id = 0; +++ +++ if (do_idr_lock) +++ spin_unlock_irqrestore(&map_idr_lock, flags); +++ else +++ __release(&map_idr_lock); ++ } ++ ++ /* called from workqueue */ ++ static void bpf_map_free_deferred(struct work_struct *work) ++ { ++ struct bpf_map *map = container_of(work, struct bpf_map, work); +++ struct bpf_map_memory mem; ++ ++- bpf_map_uncharge_memlock(map); +++ bpf_map_charge_move(&mem, &map->memory); ++ /* implementation dependent freeing */ ++ map->ops->map_free(map); +++ bpf_map_charge_finish(&mem); ++ } ++ ++ static void bpf_map_put_uref(struct bpf_map *map) ++ { ++ if (atomic_dec_and_test(&map->usercnt)) { ++- if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) ++- bpf_fd_array_map_clear(map); +++ if (map->ops->map_release_uref) +++ map->ops->map_release_uref(map); ++ } ++ } ++ ++ /* decrement map refcnt and schedule it for freeing via workqueue ++ * (unrelying map implementation ops->map_free() might sleep) ++ */ ++-void bpf_map_put(struct bpf_map *map) +++static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock) ++ { ++ if (atomic_dec_and_test(&map->refcnt)) { +++ /* bpf_map_free_id() must be called first */ +++ bpf_map_free_id(map, do_idr_lock); +++ btf_put(map->btf); ++ INIT_WORK(&map->work, bpf_map_free_deferred); ++ schedule_work(&map->work); ++ } ++ } ++ +++void bpf_map_put(struct bpf_map *map) +++{ +++ __bpf_map_put(map, true); +++} +++EXPORT_SYMBOL_GPL(bpf_map_put); +++ ++ void bpf_map_put_with_uref(struct bpf_map *map) ++ { ++ bpf_map_put_uref(map); ++@@ -109,18 +336,110 @@ void bpf_map_put_with_uref(struct bpf_ma ++ ++ static int bpf_map_release(struct inode *inode, struct file *filp) ++ { ++- bpf_map_put_with_uref(filp->private_data); +++ struct bpf_map *map = filp->private_data; +++ +++ if (map->ops->map_release) +++ map->ops->map_release(map, filp); +++ +++ bpf_map_put_with_uref(map); ++ return 0; ++ } ++ ++-static const struct file_operations bpf_map_fops = { ++- .release = bpf_map_release, +++static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) +++{ +++ fmode_t mode = f.file->f_mode; +++ +++ /* Our file permissions may have been overridden by global +++ * map permissions facing syscall side. 
+++ */ +++ if (READ_ONCE(map->frozen)) +++ mode &= ~FMODE_CAN_WRITE; +++ return mode; +++} +++ +++#ifdef CONFIG_PROC_FS +++static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) +++{ +++ const struct bpf_map *map = filp->private_data; +++ const struct bpf_array *array; +++ u32 owner_prog_type = 0; +++ u32 owner_jited = 0; +++ +++ if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { +++ array = container_of(map, struct bpf_array, map); +++ owner_prog_type = array->owner_prog_type; +++ owner_jited = array->owner_jited; +++ } +++ +++ seq_printf(m, +++ "map_type:\t%u\n" +++ "key_size:\t%u\n" +++ "value_size:\t%u\n" +++ "max_entries:\t%u\n" +++ "map_flags:\t%#x\n" +++ "memlock:\t%llu\n" +++ "map_id:\t%u\n" +++ "frozen:\t%u\n", +++ map->map_type, +++ map->key_size, +++ map->value_size, +++ map->max_entries, +++ map->map_flags, +++ map->memory.pages * 1ULL << PAGE_SHIFT, +++ map->id, +++ READ_ONCE(map->frozen)); +++ +++ if (owner_prog_type) { +++ seq_printf(m, "owner_prog_type:\t%u\n", +++ owner_prog_type); +++ seq_printf(m, "owner_jited:\t%u\n", +++ owner_jited); +++ } +++} +++#endif +++ +++static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz, +++ loff_t *ppos) +++{ +++ /* We need this handler such that alloc_file() enables +++ * f_mode with FMODE_CAN_READ. +++ */ +++ return -EINVAL; +++} +++ +++static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf, +++ size_t siz, loff_t *ppos) +++{ +++ /* We need this handler such that alloc_file() enables +++ * f_mode with FMODE_CAN_WRITE. +++ */ +++ return -EINVAL; +++} +++ +++const struct file_operations bpf_map_fops = { +++#ifdef CONFIG_PROC_FS +++ .show_fdinfo = bpf_map_show_fdinfo, +++#endif +++ .release = bpf_map_release, +++ .read = bpf_dummy_read, +++ .write = bpf_dummy_write, ++ }; ++ ++-int bpf_map_new_fd(struct bpf_map *map) +++int bpf_map_new_fd(struct bpf_map *map, int flags) ++ { ++ return anon_inode_getfd("bpf-map", &bpf_map_fops, map, ++- O_RDWR | O_CLOEXEC); +++ flags | O_CLOEXEC); +++} +++ +++int bpf_get_file_flag(int flags) +++{ +++ if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY)) +++ return -EINVAL; +++ if (flags & BPF_F_RDONLY) +++ return O_RDONLY; +++ if (flags & BPF_F_WRONLY) +++ return O_WRONLY; +++ return O_RDWR; ++ } ++ ++ /* helper macro to check that unused fields 'union bpf_attr' are zero */ ++@@ -131,38 +450,171 @@ int bpf_map_new_fd(struct bpf_map *map) ++ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ ++ sizeof(attr->CMD##_LAST_FIELD)) != NULL ++ ++-#define BPF_MAP_CREATE_LAST_FIELD max_entries +++/* dst and src must have at least BPF_OBJ_NAME_LEN number of bytes. +++ * Return 0 on success and < 0 on error. +++ */ +++static int bpf_obj_name_cpy(char *dst, const char *src) +++{ +++ const char *end = src + BPF_OBJ_NAME_LEN; +++ +++ memset(dst, 0, BPF_OBJ_NAME_LEN); +++ /* Copy all isalnum(), '_' and '.' chars. 
*/ +++ while (src < end && *src) { +++ if (!isalnum(*src) && +++ *src != '_' && *src != '.') +++ return -EINVAL; +++ *dst++ = *src++; +++ } +++ +++ /* No '\0' found in BPF_OBJ_NAME_LEN number of bytes */ +++ if (src == end) +++ return -EINVAL; +++ +++ return 0; +++} +++ +++int map_check_no_btf(const struct bpf_map *map, +++ const struct btf *btf, +++ const struct btf_type *key_type, +++ const struct btf_type *value_type) +++{ +++ return -ENOTSUPP; +++} +++ +++static int map_check_btf(struct bpf_map *map, const struct btf *btf, +++ u32 btf_key_id, u32 btf_value_id) +++{ +++ const struct btf_type *key_type, *value_type; +++ u32 key_size, value_size; +++ int ret = 0; +++ +++ /* Some maps allow key to be unspecified. */ +++ if (btf_key_id) { +++ key_type = btf_type_id_size(btf, &btf_key_id, &key_size); +++ if (!key_type || key_size != map->key_size) +++ return -EINVAL; +++ } else { +++ key_type = btf_type_by_id(btf, 0); +++ if (!map->ops->map_check_btf) +++ return -EINVAL; +++ } +++ +++ value_type = btf_type_id_size(btf, &btf_value_id, &value_size); +++ if (!value_type || value_size != map->value_size) +++ return -EINVAL; +++ +++ map->spin_lock_off = btf_find_spin_lock(btf, value_type); +++ +++ if (map_value_has_spin_lock(map)) { +++ if (map->map_flags & BPF_F_RDONLY_PROG) +++ return -EACCES; +++ if (map->map_type != BPF_MAP_TYPE_HASH && +++ map->map_type != BPF_MAP_TYPE_ARRAY && +++ map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && +++ map->map_type != BPF_MAP_TYPE_SK_STORAGE) +++ return -ENOTSUPP; +++ if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > +++ map->value_size) { +++ WARN_ONCE(1, +++ "verifier bug spin_lock_off %d value_size %d\n", +++ map->spin_lock_off, map->value_size); +++ return -EFAULT; +++ } +++ } +++ +++ if (map->ops->map_check_btf) +++ ret = map->ops->map_check_btf(map, btf, key_type, value_type); +++ +++ return ret; +++} +++ +++#define BPF_MAP_CREATE_LAST_FIELD btf_value_type_id ++ /* called via syscall */ ++ static int map_create(union bpf_attr *attr) ++ { +++ int numa_node = bpf_map_attr_numa_node(attr); +++ struct bpf_map_memory mem; ++ struct bpf_map *map; +++ int f_flags; ++ int err; ++ ++ err = CHECK_ATTR(BPF_MAP_CREATE); ++ if (err) ++ return -EINVAL; ++ +++ f_flags = bpf_get_file_flag(attr->map_flags); +++ if (f_flags < 0) +++ return f_flags; +++ +++ if (numa_node != NUMA_NO_NODE && +++ ((unsigned int)numa_node >= nr_node_ids || +++ !node_online(numa_node))) +++ return -EINVAL; +++ ++ /* find map type and init map: hashtable vs rbtree vs bloom vs ... 
*/ ++ map = find_and_alloc_map(attr); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++ +++ err = bpf_obj_name_cpy(map->name, attr->map_name); +++ if (err) +++ goto free_map; +++ ++ atomic_set(&map->refcnt, 1); ++ atomic_set(&map->usercnt, 1); ++ ++- err = bpf_map_charge_memlock(map); +++ if (attr->btf_key_type_id || attr->btf_value_type_id) { +++ struct btf *btf; +++ +++ if (!attr->btf_value_type_id) { +++ err = -EINVAL; +++ goto free_map; +++ } +++ +++ btf = btf_get_by_fd(attr->btf_fd); +++ if (IS_ERR(btf)) { +++ err = PTR_ERR(btf); +++ goto free_map; +++ } +++ +++ err = map_check_btf(map, btf, attr->btf_key_type_id, +++ attr->btf_value_type_id); +++ if (err) { +++ btf_put(btf); +++ goto free_map; +++ } +++ +++ map->btf = btf; +++ map->btf_key_type_id = attr->btf_key_type_id; +++ map->btf_value_type_id = attr->btf_value_type_id; +++ } else { +++ map->spin_lock_off = -EINVAL; +++ } +++ +++ err = bpf_map_alloc_id(map); ++ if (err) ++- goto free_map; +++ goto free_map_sec; ++ ++- err = bpf_map_new_fd(map); ++- if (err < 0) ++- /* failed to allocate fd */ ++- goto free_map; +++ err = bpf_map_new_fd(map, f_flags); +++ if (err < 0) { +++ /* failed to allocate fd. +++ * bpf_map_put_with_uref() is needed because the above +++ * bpf_map_alloc_id() has published the map +++ * to the userspace and the userspace may +++ * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. +++ */ +++ bpf_map_put_with_uref(map); +++ return err; +++ } ++ ++ return err; ++ +++free_map_sec: ++ free_map: +++ btf_put(map->btf); +++ bpf_map_charge_move(&mem, &map->memory); ++ map->ops->map_free(map); +++ bpf_map_charge_finish(&mem); ++ return err; ++ } ++ ++@@ -194,6 +646,7 @@ struct bpf_map *bpf_map_inc(struct bpf_m ++ atomic_inc(&map->usercnt); ++ return map; ++ } +++EXPORT_SYMBOL_GPL(bpf_map_inc); ++ ++ struct bpf_map *bpf_map_get_with_uref(u32 ufd) ++ { ++@@ -210,59 +663,155 @@ struct bpf_map *bpf_map_get_with_uref(u3 ++ return map; ++ } ++ ++-/* helper to convert user pointers passed inside __aligned_u64 fields */ ++-static void __user *u64_to_ptr(__u64 val) +++/* map_idr_lock should have been held */ +++static struct bpf_map *__bpf_map_inc_not_zero(struct bpf_map *map, +++ bool uref) +++{ +++ int refold; +++ +++ refold = atomic_fetch_add_unless(&map->refcnt, 1, 0); +++ +++ if (refold >= BPF_MAX_REFCNT) { +++ __bpf_map_put(map, false); +++ return ERR_PTR(-EBUSY); +++ } +++ +++ if (!refold) +++ return ERR_PTR(-ENOENT); +++ +++ if (uref) +++ atomic_inc(&map->usercnt); +++ +++ return map; +++} +++ +++struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map, bool uref) +++{ +++ spin_lock_bh(&map_idr_lock); +++ map = __bpf_map_inc_not_zero(map, uref); +++ spin_unlock_bh(&map_idr_lock); +++ +++ return map; +++} +++EXPORT_SYMBOL_GPL(bpf_map_inc_not_zero); +++ +++int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +++{ +++ return -ENOTSUPP; +++} +++ +++static void *__bpf_copy_key(void __user *ukey, u64 key_size) ++ { ++- return (void __user *) (unsigned long) val; +++ if (key_size) +++ return memdup_user(ukey, key_size); +++ +++ if (ukey) +++ return ERR_PTR(-EINVAL); +++ +++ return NULL; ++ } ++ ++ /* last field in 'union bpf_attr' used by this command */ ++-#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value +++#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags ++ ++ static int map_lookup_elem(union bpf_attr *attr) ++ { ++- void __user *ukey = u64_to_ptr(attr->key); ++- void __user *uvalue = u64_to_ptr(attr->value); +++ void __user *ukey = u64_to_user_ptr(attr->key); +++ void __user *uvalue = u64_to_user_ptr(attr->value); 
++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value, *ptr; +++ u32 value_size; ++ struct fd f; ++ int err; ++ ++ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) ++ return -EINVAL; ++ +++ if (attr->flags & ~BPF_F_LOCK) +++ return -EINVAL; +++ ++ f = fdget(ufd); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); +++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { +++ err = -EPERM; +++ goto err_put; +++ } ++ ++- err = -ENOMEM; ++- key = kmalloc(map->key_size, GFP_USER); ++- if (!key) +++ if ((attr->flags & BPF_F_LOCK) && +++ !map_value_has_spin_lock(map)) { +++ err = -EINVAL; ++ goto err_put; +++ } ++ ++- err = -EFAULT; ++- if (copy_from_user(key, ukey, map->key_size) != 0) ++- goto free_key; +++ key = __bpf_copy_key(ukey, map->key_size); +++ if (IS_ERR(key)) { +++ err = PTR_ERR(key); +++ goto err_put; +++ } +++ +++ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || +++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || +++ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) +++ value_size = round_up(map->value_size, 8) * num_possible_cpus(); +++ else if (IS_FD_MAP(map)) +++ value_size = sizeof(u32); +++ else +++ value_size = map->value_size; ++ ++ err = -ENOMEM; ++- value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); +++ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++- rcu_read_lock(); ++- ptr = map->ops->map_lookup_elem(map, key); ++- if (ptr) ++- memcpy(value, ptr, map->value_size); ++- rcu_read_unlock(); +++ preempt_disable(); +++ this_cpu_inc(bpf_prog_active); +++ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || +++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { +++ err = bpf_percpu_hash_copy(map, key, value); +++ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { +++ err = bpf_percpu_array_copy(map, key, value); +++ } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { +++ err = bpf_stackmap_copy(map, key, value); +++ } else if (IS_FD_ARRAY(map)) { +++ err = bpf_fd_array_map_lookup_elem(map, key, value); +++ } else if (IS_FD_HASH(map)) { +++ err = bpf_fd_htab_map_lookup_elem(map, key, value); +++ } else if (map->map_type == BPF_MAP_TYPE_QUEUE || +++ map->map_type == BPF_MAP_TYPE_STACK) { +++ err = map->ops->map_peek_elem(map, value); +++ } else { +++ rcu_read_lock(); +++ if (map->ops->map_lookup_elem_sys_only) +++ ptr = map->ops->map_lookup_elem_sys_only(map, key); +++ else +++ ptr = map->ops->map_lookup_elem(map, key); +++ if (IS_ERR(ptr)) { +++ err = PTR_ERR(ptr); +++ } else if (!ptr) { +++ err = -ENOENT; +++ } else { +++ err = 0; +++ if (attr->flags & BPF_F_LOCK) +++ /* lock 'ptr' and copy everything but lock */ +++ copy_map_value_locked(map, value, ptr, true); +++ else +++ copy_map_value(map, value, ptr); +++ /* mask lock, since value wasn't zero inited */ +++ check_and_init_map_lock(map, value); +++ } +++ rcu_read_unlock(); +++ } +++ this_cpu_dec(bpf_prog_active); +++ preempt_enable(); ++ ++- err = -ENOENT; ++- if (!ptr) +++ if (err) ++ goto free_value; ++ ++ err = -EFAULT; ++- if (copy_to_user(uvalue, value, map->value_size) != 0) +++ if (copy_to_user(uvalue, value, value_size) != 0) ++ goto free_value; ++ ++ err = 0; ++@@ -276,15 +825,27 @@ err_put: ++ return err; ++ } ++ +++static void maybe_wait_bpf_programs(struct bpf_map *map) +++{ +++ /* Wait for any running BPF programs to complete so that +++ * userspace, when we return to it, knows that all programs +++ * that could be running use the new map value. 
+++ */ +++ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || +++ map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) +++ synchronize_rcu(); +++} +++ ++ #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags ++ ++ static int map_update_elem(union bpf_attr *attr) ++ { ++- void __user *ukey = u64_to_ptr(attr->key); ++- void __user *uvalue = u64_to_ptr(attr->value); +++ void __user *ukey = u64_to_user_ptr(attr->key); +++ void __user *uvalue = u64_to_user_ptr(attr->value); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *value; +++ u32 value_size; ++ struct fd f; ++ int err; ++ ++@@ -295,32 +856,79 @@ static int map_update_elem(union bpf_att ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); +++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { +++ err = -EPERM; +++ goto err_put; +++ } ++ ++- err = -ENOMEM; ++- key = kmalloc(map->key_size, GFP_USER); ++- if (!key) +++ if ((attr->flags & BPF_F_LOCK) && +++ !map_value_has_spin_lock(map)) { +++ err = -EINVAL; +++ goto err_put; +++ } +++ +++ key = __bpf_copy_key(ukey, map->key_size); +++ if (IS_ERR(key)) { +++ err = PTR_ERR(key); ++ goto err_put; +++ } ++ ++- err = -EFAULT; ++- if (copy_from_user(key, ukey, map->key_size) != 0) ++- goto free_key; +++ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || +++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || +++ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) +++ value_size = round_up(map->value_size, 8) * num_possible_cpus(); +++ else +++ value_size = map->value_size; ++ ++ err = -ENOMEM; ++- value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); +++ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); ++ if (!value) ++ goto free_key; ++ ++ err = -EFAULT; ++- if (copy_from_user(value, uvalue, map->value_size) != 0) +++ if (copy_from_user(value, uvalue, value_size) != 0) ++ goto free_value; ++ ++- /* eBPF program that use maps are running under rcu_read_lock(), ++- * therefore all map accessors rely on this fact, so do the same here ++- */ ++- rcu_read_lock(); ++- err = map->ops->map_update_elem(map, key, value, attr->flags); ++- rcu_read_unlock(); +++ /* Need to create a kthread, thus must support schedule */ +++ if (map->map_type == BPF_MAP_TYPE_CPUMAP || +++ map->map_type == BPF_MAP_TYPE_SOCKHASH || +++ map->map_type == BPF_MAP_TYPE_SOCKMAP) { +++ err = map->ops->map_update_elem(map, key, value, attr->flags); +++ goto out; +++ } ++ +++ /* must increment bpf_prog_active to avoid kprobe+bpf triggering from +++ * inside bpf map update or delete otherwise deadlocks are possible +++ */ +++ preempt_disable(); +++ __this_cpu_inc(bpf_prog_active); +++ if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || +++ map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { +++ err = bpf_percpu_hash_update(map, key, value, attr->flags); +++ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { +++ err = bpf_percpu_array_update(map, key, value, attr->flags); +++ } else if (IS_FD_ARRAY(map)) { +++ rcu_read_lock(); +++ err = bpf_fd_array_map_update_elem(map, f.file, key, value, +++ attr->flags); +++ rcu_read_unlock(); +++ } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { +++ rcu_read_lock(); +++ err = bpf_fd_htab_map_update_elem(map, f.file, key, value, +++ attr->flags); +++ rcu_read_unlock(); +++ } else if (map->map_type == BPF_MAP_TYPE_QUEUE || +++ map->map_type == BPF_MAP_TYPE_STACK) { +++ err = map->ops->map_push_elem(map, value, attr->flags); +++ } else { +++ rcu_read_lock(); +++ err = map->ops->map_update_elem(map, key, value, attr->flags); +++ rcu_read_unlock(); +++ } +++ 
__this_cpu_dec(bpf_prog_active); +++ preempt_enable(); +++ maybe_wait_bpf_programs(map); +++out: ++ free_value: ++ kfree(value); ++ free_key: ++@@ -334,7 +942,7 @@ err_put: ++ ++ static int map_delete_elem(union bpf_attr *attr) ++ { ++- void __user *ukey = u64_to_ptr(attr->key); +++ void __user *ukey = u64_to_user_ptr(attr->key); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ struct fd f; ++@@ -348,21 +956,25 @@ static int map_delete_elem(union bpf_att ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++- ++- err = -ENOMEM; ++- key = kmalloc(map->key_size, GFP_USER); ++- if (!key) +++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { +++ err = -EPERM; ++ goto err_put; +++ } ++ ++- err = -EFAULT; ++- if (copy_from_user(key, ukey, map->key_size) != 0) ++- goto free_key; +++ key = __bpf_copy_key(ukey, map->key_size); +++ if (IS_ERR(key)) { +++ err = PTR_ERR(key); +++ goto err_put; +++ } ++ +++ preempt_disable(); +++ __this_cpu_inc(bpf_prog_active); ++ rcu_read_lock(); ++ err = map->ops->map_delete_elem(map, key); ++ rcu_read_unlock(); ++- ++-free_key: +++ __this_cpu_dec(bpf_prog_active); +++ preempt_enable(); +++ maybe_wait_bpf_programs(map); ++ kfree(key); ++ err_put: ++ fdput(f); ++@@ -374,8 +986,8 @@ err_put: ++ ++ static int map_get_next_key(union bpf_attr *attr) ++ { ++- void __user *ukey = u64_to_ptr(attr->key); ++- void __user *unext_key = u64_to_ptr(attr->next_key); +++ void __user *ukey = u64_to_user_ptr(attr->key); +++ void __user *unext_key = u64_to_user_ptr(attr->next_key); ++ int ufd = attr->map_fd; ++ struct bpf_map *map; ++ void *key, *next_key; ++@@ -389,15 +1001,20 @@ static int map_get_next_key(union bpf_at ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) ++ return PTR_ERR(map); ++- ++- err = -ENOMEM; ++- key = kmalloc(map->key_size, GFP_USER); ++- if (!key) +++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) { +++ err = -EPERM; ++ goto err_put; +++ } ++ ++- err = -EFAULT; ++- if (copy_from_user(key, ukey, map->key_size) != 0) ++- goto free_key; +++ if (ukey) { +++ key = __bpf_copy_key(ukey, map->key_size); +++ if (IS_ERR(key)) { +++ err = PTR_ERR(key); +++ goto err_put; +++ } +++ } else { +++ key = NULL; +++ } ++ ++ err = -ENOMEM; ++ next_key = kmalloc(map->key_size, GFP_USER); ++@@ -425,77 +1042,126 @@ err_put: ++ return err; ++ } ++ ++-static LIST_HEAD(bpf_prog_types); +++#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value ++ ++-static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) +++static int map_lookup_and_delete_elem(union bpf_attr *attr) ++ { ++- struct bpf_prog_type_list *tl; +++ void __user *ukey = u64_to_user_ptr(attr->key); +++ void __user *uvalue = u64_to_user_ptr(attr->value); +++ int ufd = attr->map_fd; +++ struct bpf_map *map; +++ void *key, *value; +++ u32 value_size; +++ struct fd f; +++ int err; ++ ++- list_for_each_entry(tl, &bpf_prog_types, list_node) { ++- if (tl->type == type) { ++- prog->aux->ops = tl->ops; ++- prog->type = type; ++- return 0; ++- } +++ if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) +++ return -EINVAL; +++ +++ f = fdget(ufd); +++ map = __bpf_map_get(f); +++ if (IS_ERR(map)) +++ return PTR_ERR(map); +++ if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ) || +++ !(map_get_sys_perms(map, f) & FMODE_CAN_WRITE)) { +++ err = -EPERM; +++ goto err_put; ++ } ++ ++- return -EINVAL; ++-} +++ key = __bpf_copy_key(ukey, map->key_size); +++ if (IS_ERR(key)) { +++ err = PTR_ERR(key); +++ goto err_put; +++ } ++ ++-void bpf_register_prog_type(struct bpf_prog_type_list *tl) ++-{ ++- 
list_add(&tl->list_node, &bpf_prog_types); +++ value_size = map->value_size; +++ +++ err = -ENOMEM; +++ value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); +++ if (!value) +++ goto free_key; +++ +++ if (map->map_type == BPF_MAP_TYPE_QUEUE || +++ map->map_type == BPF_MAP_TYPE_STACK) { +++ err = map->ops->map_pop_elem(map, value); +++ } else { +++ err = -ENOTSUPP; +++ } +++ +++ if (err) +++ goto free_value; +++ +++ if (copy_to_user(uvalue, value, value_size) != 0) { +++ err = -EFAULT; +++ goto free_value; +++ } +++ +++ err = 0; +++ +++free_value: +++ kfree(value); +++free_key: +++ kfree(key); +++err_put: +++ fdput(f); +++ return err; ++ } ++ ++-/* fixup insn->imm field of bpf_call instructions: ++- * if (insn->imm == BPF_FUNC_map_lookup_elem) ++- * insn->imm = bpf_map_lookup_elem - __bpf_call_base; ++- * else if (insn->imm == BPF_FUNC_map_update_elem) ++- * insn->imm = bpf_map_update_elem - __bpf_call_base; ++- * else ... ++- * ++- * this function is called after eBPF program passed verification ++- */ ++-static void fixup_bpf_calls(struct bpf_prog *prog) +++#define BPF_MAP_FREEZE_LAST_FIELD map_fd +++ +++static int map_freeze(const union bpf_attr *attr) ++ { ++- const struct bpf_func_proto *fn; ++- int i; +++ int err = 0, ufd = attr->map_fd; +++ struct bpf_map *map; +++ struct fd f; ++ ++- for (i = 0; i < prog->len; i++) { ++- struct bpf_insn *insn = &prog->insnsi[i]; +++ if (CHECK_ATTR(BPF_MAP_FREEZE)) +++ return -EINVAL; ++ ++- if (insn->code == (BPF_JMP | BPF_CALL)) { ++- /* we reach here when program has bpf_call instructions ++- * and it passed bpf_check(), means that ++- * ops->get_func_proto must have been supplied, check it ++- */ ++- BUG_ON(!prog->aux->ops->get_func_proto); +++ f = fdget(ufd); +++ map = __bpf_map_get(f); +++ if (IS_ERR(map)) +++ return PTR_ERR(map); +++ if (READ_ONCE(map->frozen)) { +++ err = -EBUSY; +++ goto err_put; +++ } +++ if (!capable(CAP_SYS_ADMIN)) { +++ err = -EPERM; +++ goto err_put; +++ } ++ ++- if (insn->imm == BPF_FUNC_get_route_realm) ++- prog->dst_needed = 1; ++- if (insn->imm == BPF_FUNC_get_prandom_u32) ++- bpf_user_rnd_init_once(); ++- if (insn->imm == BPF_FUNC_tail_call) { ++- /* mark bpf_tail_call as different opcode ++- * to avoid conditional branch in ++- * interpeter for every normal call ++- * and to prevent accidental JITing by ++- * JIT compiler that doesn't support ++- * bpf_tail_call yet ++- */ ++- insn->imm = 0; ++- insn->code |= BPF_X; ++- continue; ++- } +++ WRITE_ONCE(map->frozen, true); +++err_put: +++ fdput(f); +++ return err; +++} ++ ++- fn = prog->aux->ops->get_func_proto(insn->imm); ++- /* all functions that have prototype and verifier allowed ++- * programs to call them, must be real in-kernel functions ++- */ ++- BUG_ON(!fn->func); ++- insn->imm = fn->func - __bpf_call_base; ++- } ++- } +++static const struct bpf_prog_ops * const bpf_prog_types[] = { +++#define BPF_PROG_TYPE(_id, _name) \ +++ [_id] = & _name ## _prog_ops, +++#define BPF_MAP_TYPE(_id, _ops) +++#include +++#undef BPF_PROG_TYPE +++#undef BPF_MAP_TYPE +++}; +++ +++static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) +++{ +++ const struct bpf_prog_ops *ops; +++ +++ if (type >= ARRAY_SIZE(bpf_prog_types)) +++ return -EINVAL; +++ ops = bpf_prog_types[type]; +++ if (!ops) +++ return -EINVAL; +++ +++ if (!bpf_prog_is_dev_bound(prog->aux)) +++ prog->aux->ops = ops; +++ else +++ return -EINVAL; +++ prog->type = type; +++ return 0; ++ } ++ ++ /* drop refcnt on maps used by eBPF program and free auxilary data */ ++@@ -509,19 +1175,39 @@ static void 
free_used_maps(struct bpf_pr ++ kfree(aux->used_maps); ++ } ++ +++int __bpf_prog_charge(struct user_struct *user, u32 pages) +++{ +++ unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; +++ unsigned long user_bufs; +++ +++ if (user) { +++ user_bufs = atomic_long_add_return(pages, &user->locked_vm); +++ if (user_bufs > memlock_limit) { +++ atomic_long_sub(pages, &user->locked_vm); +++ return -EPERM; +++ } +++ } +++ +++ return 0; +++} +++ +++void __bpf_prog_uncharge(struct user_struct *user, u32 pages) +++{ +++ if (user) +++ atomic_long_sub(pages, &user->locked_vm); +++} +++ ++ static int bpf_prog_charge_memlock(struct bpf_prog *prog) ++ { ++ struct user_struct *user = get_current_user(); ++- unsigned long memlock_limit; +++ int ret; ++ ++- memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; ++- ++- atomic_long_add(prog->pages, &user->locked_vm); ++- if (atomic_long_read(&user->locked_vm) > memlock_limit) { ++- atomic_long_sub(prog->pages, &user->locked_vm); +++ ret = __bpf_prog_charge(user, prog->pages); +++ if (ret) { ++ free_uid(user); ++- return -EPERM; +++ return ret; ++ } +++ ++ prog->aux->user = user; ++ return 0; ++ } ++@@ -530,30 +1216,87 @@ static void bpf_prog_uncharge_memlock(st ++ { ++ struct user_struct *user = prog->aux->user; ++ ++- atomic_long_sub(prog->pages, &user->locked_vm); +++ __bpf_prog_uncharge(user, prog->pages); ++ free_uid(user); ++ } ++ ++-static void __prog_put_common(struct rcu_head *rcu) +++static int bpf_prog_alloc_id(struct bpf_prog *prog) +++{ +++ int id; +++ +++ idr_preload(GFP_KERNEL); +++ spin_lock_bh(&prog_idr_lock); +++ id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); +++ if (id > 0) +++ prog->aux->id = id; +++ spin_unlock_bh(&prog_idr_lock); +++ idr_preload_end(); +++ +++ /* id is in [1, INT_MAX) */ +++ if (WARN_ON_ONCE(!id)) +++ return -ENOSPC; +++ +++ return id > 0 ? 0 : id; +++} +++ +++void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) +++{ +++ /* cBPF to eBPF migrations are currently not in the idr store. +++ * Offloaded programs are removed from the store when their device +++ * disappears - even if someone grabs an fd to them they are unusable, +++ * simply waiting for refcnt to drop to be freed. 
+++ */ +++ if (!prog->aux->id) +++ return; +++ +++ if (do_idr_lock) +++ spin_lock_bh(&prog_idr_lock); +++ else +++ __acquire(&prog_idr_lock); +++ +++ idr_remove(&prog_idr, prog->aux->id); +++ prog->aux->id = 0; +++ +++ if (do_idr_lock) +++ spin_unlock_bh(&prog_idr_lock); +++ else +++ __release(&prog_idr_lock); +++} +++ +++static void __bpf_prog_put_rcu(struct rcu_head *rcu) ++ { ++ struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu); ++ +++ kvfree(aux->func_info); ++ free_used_maps(aux); ++ bpf_prog_uncharge_memlock(aux->prog); ++ bpf_prog_free(aux->prog); ++ } ++ ++-/* version of bpf_prog_put() that is called after a grace period */ ++-void bpf_prog_put_rcu(struct bpf_prog *prog) +++static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) +++{ +++ bpf_prog_kallsyms_del_all(prog); +++ btf_put(prog->aux->btf); +++ bpf_prog_free_linfo(prog); +++ +++ if (deferred) +++ call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); +++ else +++ __bpf_prog_put_rcu(&prog->aux->rcu); +++} +++ +++static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) ++ { ++- if (atomic_dec_and_test(&prog->aux->refcnt)) ++- call_rcu(&prog->aux->rcu, __prog_put_common); +++ if (atomic_dec_and_test(&prog->aux->refcnt)) { +++ /* bpf_prog_free_id() must be called first */ +++ bpf_prog_free_id(prog, do_idr_lock); +++ __bpf_prog_put_noref(prog, true); +++ } ++ } ++ ++ void bpf_prog_put(struct bpf_prog *prog) ++ { ++- if (atomic_dec_and_test(&prog->aux->refcnt)) ++- __prog_put_common(&prog->aux->rcu); +++ __bpf_prog_put(prog, true); ++ } ++ EXPORT_SYMBOL_GPL(bpf_prog_put); ++ ++@@ -561,12 +1304,68 @@ static int bpf_prog_release(struct inode ++ { ++ struct bpf_prog *prog = filp->private_data; ++ ++- bpf_prog_put_rcu(prog); +++ bpf_prog_put(prog); ++ return 0; ++ } ++ ++-static const struct file_operations bpf_prog_fops = { ++- .release = bpf_prog_release, +++static void bpf_prog_get_stats(const struct bpf_prog *prog, +++ struct bpf_prog_stats *stats) +++{ +++ u64 nsecs = 0, cnt = 0; +++ int cpu; +++ +++ for_each_possible_cpu(cpu) { +++ const struct bpf_prog_stats *st; +++ unsigned int start; +++ u64 tnsecs, tcnt; +++ +++ st = per_cpu_ptr(prog->aux->stats, cpu); +++ do { +++ start = u64_stats_fetch_begin_irq(&st->syncp); +++ tnsecs = st->nsecs; +++ tcnt = st->cnt; +++ } while (u64_stats_fetch_retry_irq(&st->syncp, start)); +++ nsecs += tnsecs; +++ cnt += tcnt; +++ } +++ stats->nsecs = nsecs; +++ stats->cnt = cnt; +++} +++ +++#ifdef CONFIG_PROC_FS +++static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) +++{ +++ const struct bpf_prog *prog = filp->private_data; +++ char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; +++ struct bpf_prog_stats stats; +++ +++ bpf_prog_get_stats(prog, &stats); +++ bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); +++ seq_printf(m, +++ "prog_type:\t%u\n" +++ "prog_jited:\t%u\n" +++ "prog_tag:\t%s\n" +++ "memlock:\t%llu\n" +++ "prog_id:\t%u\n" +++ "run_time_ns:\t%llu\n" +++ "run_cnt:\t%llu\n", +++ prog->type, +++ prog->jited, +++ prog_tag, +++ prog->pages * 1ULL << PAGE_SHIFT, +++ prog->aux->id, +++ stats.nsecs, +++ stats.cnt); +++} +++#endif +++ +++const struct file_operations bpf_prog_fops = { +++#ifdef CONFIG_PROC_FS +++ .show_fdinfo = bpf_prog_show_fdinfo, +++#endif +++ .release = bpf_prog_release, +++ .read = bpf_dummy_read, +++ .write = bpf_dummy_write, ++ }; ++ ++ int bpf_prog_new_fd(struct bpf_prog *prog) ++@@ -575,7 +1374,7 @@ int bpf_prog_new_fd(struct bpf_prog *pro ++ O_RDWR | O_CLOEXEC); ++ } ++ ++-static struct bpf_prog *__bpf_prog_get(struct 
fd f) +++static struct bpf_prog *____bpf_prog_get(struct fd f) ++ { ++ if (!f.file) ++ return ERR_PTR(-EBADF); ++@@ -587,38 +1386,178 @@ static struct bpf_prog *__bpf_prog_get(s ++ return f.file->private_data; ++ } ++ +++struct bpf_prog *bpf_prog_add(struct bpf_prog *prog, int i) +++{ +++ if (atomic_add_return(i, &prog->aux->refcnt) > BPF_MAX_REFCNT) { +++ atomic_sub(i, &prog->aux->refcnt); +++ return ERR_PTR(-EBUSY); +++ } +++ return prog; +++} +++EXPORT_SYMBOL_GPL(bpf_prog_add); +++ +++void bpf_prog_sub(struct bpf_prog *prog, int i) +++{ +++ /* Only to be used for undoing previous bpf_prog_add() in some +++ * error path. We still know that another entity in our call +++ * path holds a reference to the program, thus atomic_sub() can +++ * be safely used in such cases! +++ */ +++ WARN_ON(atomic_sub_return(i, &prog->aux->refcnt) == 0); +++} +++EXPORT_SYMBOL_GPL(bpf_prog_sub); +++ ++ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog) ++ { ++- if (atomic_inc_return(&prog->aux->refcnt) > BPF_MAX_REFCNT) { ++- atomic_dec(&prog->aux->refcnt); +++ return bpf_prog_add(prog, 1); +++} +++EXPORT_SYMBOL_GPL(bpf_prog_inc); +++ +++/* prog_idr_lock should have been held */ +++struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) +++{ +++ int refold; +++ +++ refold = atomic_fetch_add_unless(&prog->aux->refcnt, 1, 0); +++ +++ if (refold >= BPF_MAX_REFCNT) { +++ __bpf_prog_put(prog, false); ++ return ERR_PTR(-EBUSY); ++ } +++ +++ if (!refold) +++ return ERR_PTR(-ENOENT); +++ ++ return prog; ++ } +++EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); ++ ++-/* called by sockets/tracing/seccomp before attaching program to an event ++- * pairs with bpf_prog_put() ++- */ ++-struct bpf_prog *bpf_prog_get(u32 ufd) +++bool bpf_prog_get_ok(struct bpf_prog *prog, +++ enum bpf_prog_type *attach_type, bool attach_drv) +++{ +++ /* not an attachment, just a refcount inc, always allow */ +++ if (!attach_type) +++ return true; +++ +++ if (prog->type != *attach_type) +++ return false; +++ if (bpf_prog_is_dev_bound(prog->aux) && !attach_drv) +++ return false; +++ +++ return true; +++} +++ +++static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *attach_type, +++ bool attach_drv) ++ { ++ struct fd f = fdget(ufd); ++ struct bpf_prog *prog; ++ ++- prog = __bpf_prog_get(f); +++ prog = ____bpf_prog_get(f); ++ if (IS_ERR(prog)) ++ return prog; +++ if (!bpf_prog_get_ok(prog, attach_type, attach_drv)) { +++ prog = ERR_PTR(-EINVAL); +++ goto out; +++ } ++ ++ prog = bpf_prog_inc(prog); +++out: ++ fdput(f); ++- ++ return prog; ++ } ++-EXPORT_SYMBOL_GPL(bpf_prog_get); +++ +++struct bpf_prog *bpf_prog_get(u32 ufd) +++{ +++ return __bpf_prog_get(ufd, NULL, false); +++} +++ +++struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, +++ bool attach_drv) +++{ +++ return __bpf_prog_get(ufd, &type, attach_drv); +++} +++EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); +++ +++/* Initially all BPF programs could be loaded w/o specifying +++ * expected_attach_type. Later for some of them specifying expected_attach_type +++ * at load time became required so that program could be validated properly. +++ * Programs of types that are allowed to be loaded both w/ and w/o (for +++ * backward compatibility) expected_attach_type, should have the default attach +++ * type assigned to expected_attach_type for the latter case, so that it can be +++ * validated later at attach time. 
+++ * +++ * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if +++ * prog type requires it but has some attach types that have to be backward +++ * compatible. +++ */ +++static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) +++{ +++ switch (attr->prog_type) { +++ case BPF_PROG_TYPE_CGROUP_SOCK: +++ /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't +++ * exist so checking for non-zero is the way to go here. +++ */ +++ if (!attr->expected_attach_type) +++ attr->expected_attach_type = +++ BPF_CGROUP_INET_SOCK_CREATE; +++ break; +++ } +++} +++ +++static int +++bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, +++ enum bpf_attach_type expected_attach_type) +++{ +++ switch (prog_type) { +++ case BPF_PROG_TYPE_CGROUP_SOCK: +++ switch (expected_attach_type) { +++ case BPF_CGROUP_INET_SOCK_CREATE: +++ case BPF_CGROUP_INET4_POST_BIND: +++ case BPF_CGROUP_INET6_POST_BIND: +++ return 0; +++ default: +++ return -EINVAL; +++ } +++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: +++ switch (expected_attach_type) { +++ case BPF_CGROUP_INET4_BIND: +++ case BPF_CGROUP_INET6_BIND: +++ case BPF_CGROUP_INET4_CONNECT: +++ case BPF_CGROUP_INET6_CONNECT: +++ case BPF_CGROUP_UDP4_SENDMSG: +++ case BPF_CGROUP_UDP6_SENDMSG: +++ case BPF_CGROUP_UDP4_RECVMSG: +++ case BPF_CGROUP_UDP6_RECVMSG: +++ return 0; +++ default: +++ return -EINVAL; +++ } +++ case BPF_PROG_TYPE_CGROUP_SKB: +++ switch (expected_attach_type) { +++ case BPF_CGROUP_INET_INGRESS: +++ case BPF_CGROUP_INET_EGRESS: +++ return 0; +++ default: +++ return -EINVAL; +++ } +++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: +++ switch (expected_attach_type) { +++ case BPF_CGROUP_SETSOCKOPT: +++ case BPF_CGROUP_GETSOCKOPT: +++ return 0; +++ default: +++ return -EINVAL; +++ } +++ default: +++ return 0; +++ } +++} ++ ++ /* last field in 'union bpf_attr' used by this command */ ++-#define BPF_PROG_LOAD_LAST_FIELD kern_version +++#define BPF_PROG_LOAD_LAST_FIELD line_info_cnt ++ ++-static int bpf_prog_load(union bpf_attr *attr) +++static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) ++ { ++ enum bpf_prog_type type = attr->prog_type; ++ struct bpf_prog *prog; ++@@ -629,8 +1568,19 @@ static int bpf_prog_load(union bpf_attr ++ if (CHECK_ATTR(BPF_PROG_LOAD)) ++ return -EINVAL; ++ +++ if (attr->prog_flags & ~(BPF_F_STRICT_ALIGNMENT | +++ BPF_F_ANY_ALIGNMENT | +++ BPF_F_TEST_STATE_FREQ | +++ BPF_F_TEST_RND_HI32)) +++ return -EINVAL; +++ +++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && +++ (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && +++ !capable(CAP_SYS_ADMIN)) +++ return -EPERM; +++ ++ /* copy eBPF program license from user space */ ++- if (strncpy_from_user(license, u64_to_ptr(attr->license), +++ if (strncpy_from_user(license, u64_to_user_ptr(attr->license), ++ sizeof(license) - 1) < 0) ++ return -EFAULT; ++ license[sizeof(license) - 1] = 0; ++@@ -638,30 +1588,36 @@ static int bpf_prog_load(union bpf_attr ++ /* eBPF programs must be GPL compatible to use GPL-ed functions */ ++ is_gpl = license_is_gpl_compatible(license); ++ ++- if (attr->insn_cnt >= BPF_MAXINSNS) ++- return -EINVAL; +++ if (attr->insn_cnt == 0 || +++ attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? 
BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) +++ return -E2BIG; +++ if (type != BPF_PROG_TYPE_SOCKET_FILTER && +++ type != BPF_PROG_TYPE_CGROUP_SKB && +++ !capable(CAP_SYS_ADMIN)) +++ return -EPERM; ++ ++- if (type == BPF_PROG_TYPE_KPROBE && ++- attr->kern_version != LINUX_VERSION_CODE) +++ bpf_prog_load_fixup_attach_type(attr); +++ if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) ++ return -EINVAL; ++ ++- if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) ++- return -EPERM; ++- ++ /* plain bpf_prog allocation */ ++ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); ++ if (!prog) ++ return -ENOMEM; ++ +++ prog->expected_attach_type = attr->expected_attach_type; +++ +++ prog->aux->offload_requested = !!attr->prog_ifindex; +++ ++ err = bpf_prog_charge_memlock(prog); ++ if (err) ++- goto free_prog_nouncharge; +++ goto free_prog_sec; ++ ++ prog->len = attr->insn_cnt; ++ ++ err = -EFAULT; ++- if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), ++- prog->len * sizeof(struct bpf_insn)) != 0) +++ if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), +++ bpf_prog_insn_size(prog)) != 0) ++ goto free_prog; ++ ++ prog->orig_prog = NULL; ++@@ -675,91 +1631,720 @@ static int bpf_prog_load(union bpf_attr ++ if (err < 0) ++ goto free_prog; ++ +++ prog->aux->load_time = ktime_get_boot_ns(); +++ err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name); +++ if (err) +++ goto free_prog; +++ ++ /* run eBPF verifier */ ++- err = bpf_check(&prog, attr); +++ err = bpf_check(&prog, attr, uattr); ++ if (err < 0) ++ goto free_used_maps; ++ ++- /* fixup BPF_CALL->imm field */ ++- fixup_bpf_calls(prog); ++- ++- /* eBPF program is ready to be JITed */ ++- err = bpf_prog_select_runtime(prog); +++ prog = bpf_prog_select_runtime(prog, &err); ++ if (err < 0) ++ goto free_used_maps; ++ ++- err = bpf_prog_new_fd(prog); ++- if (err < 0) ++- /* failed to allocate fd */ +++ err = bpf_prog_alloc_id(prog); +++ if (err) ++ goto free_used_maps; ++ +++ /* Upon success of bpf_prog_alloc_id(), the BPF prog is +++ * effectively publicly exposed. However, retrieving via +++ * bpf_prog_get_fd_by_id() will take another reference, +++ * therefore it cannot be gone underneath us. +++ * +++ * Only for the time /after/ successful bpf_prog_new_fd() +++ * and before returning to userspace, we might just hold +++ * one reference and any parallel close on that fd could +++ * rip everything out. Hence, below notifications must +++ * happen before bpf_prog_new_fd(). +++ * +++ * Also, any failure handling from this point onwards must +++ * be using bpf_prog_put() given the program is exposed. +++ */ +++ bpf_prog_kallsyms_add(prog); +++ +++ err = bpf_prog_new_fd(prog); +++ if (err < 0) +++ bpf_prog_put(prog); ++ return err; ++ ++ free_used_maps: ++- free_used_maps(prog->aux); +++ /* In case we have subprogs, we need to wait for a grace +++ * period before we can tear down JIT memory since symbols +++ * are already exposed under kallsyms. 
+++ */ +++ __bpf_prog_put_noref(prog, prog->aux->func_cnt); +++ return err; ++ free_prog: ++ bpf_prog_uncharge_memlock(prog); ++-free_prog_nouncharge: +++free_prog_sec: ++ bpf_prog_free(prog); ++ return err; ++ } ++ ++-#define BPF_OBJ_LAST_FIELD bpf_fd +++#define BPF_OBJ_LAST_FIELD file_flags ++ ++ static int bpf_obj_pin(const union bpf_attr *attr) ++ { ++- if (CHECK_ATTR(BPF_OBJ)) +++ if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0) ++ return -EINVAL; ++ ++- return bpf_obj_pin_user(attr->bpf_fd, u64_to_ptr(attr->pathname)); +++ return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname)); ++ } ++ ++ static int bpf_obj_get(const union bpf_attr *attr) ++ { ++- if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0) +++ if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 || +++ attr->file_flags & ~BPF_OBJ_FLAG_MASK) ++ return -EINVAL; ++ ++- return bpf_obj_get_user(u64_to_ptr(attr->pathname)); +++ return bpf_obj_get_user(u64_to_user_ptr(attr->pathname), +++ attr->file_flags); ++ } ++ ++-SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +++ +++#define BPF_PROG_ATTACH_LAST_FIELD attach_flags +++ +++#define BPF_F_ATTACH_MASK \ +++ (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI) +++ +++ +++#define BPF_PROG_DETACH_LAST_FIELD attach_type +++ +++ +++#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt +++ +++ +++#define BPF_PROG_TEST_RUN_LAST_FIELD test.ctx_out +++ +++static int bpf_prog_test_run(const union bpf_attr *attr, +++ union bpf_attr __user *uattr) ++ { ++- union bpf_attr attr = {}; ++- int err; +++ struct bpf_prog *prog; +++ int ret = -ENOTSUPP; ++ ++- if (!capable(CAP_SYS_ADMIN) && sysctl_unprivileged_bpf_disabled) +++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; +++ if (CHECK_ATTR(BPF_PROG_TEST_RUN)) +++ return -EINVAL; +++ +++ if ((attr->test.ctx_size_in && !attr->test.ctx_in) || +++ (!attr->test.ctx_size_in && attr->test.ctx_in)) +++ return -EINVAL; +++ +++ if ((attr->test.ctx_size_out && !attr->test.ctx_out) || +++ (!attr->test.ctx_size_out && attr->test.ctx_out)) +++ return -EINVAL; +++ +++ prog = bpf_prog_get(attr->test.prog_fd); +++ if (IS_ERR(prog)) +++ return PTR_ERR(prog); +++ +++ if (prog->aux->ops->test_run) +++ ret = prog->aux->ops->test_run(prog, attr, uattr); +++ +++ bpf_prog_put(prog); +++ return ret; +++} +++ +++#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id +++ +++static int bpf_obj_get_next_id(const union bpf_attr *attr, +++ union bpf_attr __user *uattr, +++ struct idr *idr, +++ spinlock_t *lock) +++{ +++ u32 next_id = attr->start_id; +++ int err = 0; +++ +++ if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX) +++ return -EINVAL; +++ +++ if (!capable(CAP_SYS_ADMIN)) +++ return -EPERM; +++ +++ next_id++; +++ spin_lock_bh(lock); +++ if (!idr_get_next(idr, &next_id)) +++ err = -ENOENT; +++ spin_unlock_bh(lock); +++ +++ if (!err) +++ err = put_user(next_id, &uattr->next_id); +++ +++ return err; +++} +++ +++#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id +++ +++static int bpf_prog_get_fd_by_id(const union bpf_attr *attr) +++{ +++ struct bpf_prog *prog; +++ u32 id = attr->prog_id; +++ int fd; +++ +++ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID)) +++ return -EINVAL; +++ +++ if (!capable(CAP_SYS_ADMIN)) +++ return -EPERM; +++ +++ spin_lock_bh(&prog_idr_lock); +++ prog = idr_find(&prog_idr, id); +++ if (prog) +++ prog = bpf_prog_inc_not_zero(prog); +++ else +++ prog = ERR_PTR(-ENOENT); +++ spin_unlock_bh(&prog_idr_lock); +++ +++ if (IS_ERR(prog)) +++ return PTR_ERR(prog); +++ +++ fd = bpf_prog_new_fd(prog); +++ if (fd < 0) +++ bpf_prog_put(prog); +++ +++ 
return fd; +++} +++ +++#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags +++ +++static int bpf_map_get_fd_by_id(const union bpf_attr *attr) +++{ +++ struct bpf_map *map; +++ u32 id = attr->map_id; +++ int f_flags; +++ int fd; +++ +++ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) || +++ attr->open_flags & ~BPF_OBJ_FLAG_MASK) +++ return -EINVAL; +++ +++ if (!capable(CAP_SYS_ADMIN)) +++ return -EPERM; +++ +++ f_flags = bpf_get_file_flag(attr->open_flags); +++ if (f_flags < 0) +++ return f_flags; +++ +++ spin_lock_bh(&map_idr_lock); +++ map = idr_find(&map_idr, id); +++ if (map) +++ map = __bpf_map_inc_not_zero(map, true); +++ else +++ map = ERR_PTR(-ENOENT); +++ spin_unlock_bh(&map_idr_lock); +++ +++ if (IS_ERR(map)) +++ return PTR_ERR(map); ++ ++- if (!access_ok(VERIFY_READ, uattr, 1)) +++ fd = bpf_map_new_fd(map, f_flags); +++ if (fd < 0) +++ bpf_map_put_with_uref(map); +++ +++ return fd; +++} +++ +++static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, +++ unsigned long addr, u32 *off, +++ u32 *type) +++{ +++ const struct bpf_map *map; +++ int i; +++ +++ for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) { +++ map = prog->aux->used_maps[i]; +++ if (map == (void *)addr) { +++ *type = BPF_PSEUDO_MAP_FD; +++ return map; +++ } +++ if (!map->ops->map_direct_value_meta) +++ continue; +++ if (!map->ops->map_direct_value_meta(map, addr, off)) { +++ *type = BPF_PSEUDO_MAP_VALUE; +++ return map; +++ } +++ } +++ +++ return NULL; +++} +++ +++static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog, +++ const struct cred *f_cred) +++{ +++ const struct bpf_map *map; +++ struct bpf_insn *insns; +++ u32 off, type; +++ u64 imm; +++ int i; +++ +++ insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), +++ GFP_USER); +++ if (!insns) +++ return insns; +++ +++ for (i = 0; i < prog->len; i++) { +++ if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { +++ insns[i].code = BPF_JMP | BPF_CALL; +++ insns[i].imm = BPF_FUNC_tail_call; +++ /* fall-through */ +++ } +++ if (insns[i].code == (BPF_JMP | BPF_CALL) || +++ insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { +++ if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) +++ insns[i].code = BPF_JMP | BPF_CALL; +++ if (!bpf_dump_raw_ok(f_cred)) +++ insns[i].imm = 0; +++ continue; +++ } +++ +++ if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) +++ continue; +++ +++ imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; +++ map = bpf_map_from_imm(prog, imm, &off, &type); +++ if (map) { +++ insns[i].src_reg = type; +++ insns[i].imm = map->id; +++ insns[i + 1].imm = off; +++ continue; +++ } +++ } +++ +++ return insns; +++} +++ +++static int set_info_rec_size(struct bpf_prog_info *info) +++{ +++ /* +++ * Ensure info.*_rec_size is the same as kernel expected size +++ * +++ * or +++ * +++ * Only allow zero *_rec_size if both _rec_size and _cnt are +++ * zero. In this case, the kernel will set the expected +++ * _rec_size back to the info. 
+++ */ +++ +++ if ((info->nr_func_info || info->func_info_rec_size) && +++ info->func_info_rec_size != sizeof(struct bpf_func_info)) +++ return -EINVAL; +++ +++ if ((info->nr_line_info || info->line_info_rec_size) && +++ info->line_info_rec_size != sizeof(struct bpf_line_info)) +++ return -EINVAL; +++ +++ if ((info->nr_jited_line_info || info->jited_line_info_rec_size) && +++ info->jited_line_info_rec_size != sizeof(__u64)) +++ return -EINVAL; +++ +++ info->func_info_rec_size = sizeof(struct bpf_func_info); +++ info->line_info_rec_size = sizeof(struct bpf_line_info); +++ info->jited_line_info_rec_size = sizeof(__u64); +++ +++ return 0; +++} +++ +++static int bpf_prog_get_info_by_fd(struct file *file, +++ struct bpf_prog *prog, +++ const union bpf_attr *attr, +++ union bpf_attr __user *uattr) +++{ +++ struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); +++ struct bpf_prog_info info; +++ u32 info_len = attr->info.info_len; +++ struct bpf_prog_stats stats; +++ char __user *uinsns; +++ u32 ulen; +++ int err; +++ +++ err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); +++ if (err) +++ return err; +++ info_len = min_t(u32, sizeof(info), info_len); +++ +++ memset(&info, 0, sizeof(info)); +++ if (copy_from_user(&info, uinfo, info_len)) ++ return -EFAULT; ++ ++- if (size > PAGE_SIZE) /* silly large */ ++- return -E2BIG; +++ info.type = prog->type; +++ info.id = prog->aux->id; +++ info.load_time = prog->aux->load_time; +++ info.created_by_uid = from_kuid_munged(current_user_ns(), +++ prog->aux->user->uid); +++ info.gpl_compatible = prog->gpl_compatible; +++ +++ memcpy(info.tag, prog->tag, sizeof(prog->tag)); +++ memcpy(info.name, prog->aux->name, sizeof(prog->aux->name)); +++ +++ ulen = info.nr_map_ids; +++ info.nr_map_ids = prog->aux->used_map_cnt; +++ ulen = min_t(u32, info.nr_map_ids, ulen); +++ if (ulen) { +++ u32 __user *user_map_ids = u64_to_user_ptr(info.map_ids); +++ u32 i; +++ +++ for (i = 0; i < ulen; i++) +++ if (put_user(prog->aux->used_maps[i]->id, +++ &user_map_ids[i])) +++ return -EFAULT; +++ } +++ +++ err = set_info_rec_size(&info); +++ if (err) +++ return err; +++ +++ bpf_prog_get_stats(prog, &stats); +++ info.run_time_ns = stats.nsecs; +++ info.run_cnt = stats.cnt; +++ +++ if (!capable(CAP_SYS_ADMIN)) { +++ info.jited_prog_len = 0; +++ info.xlated_prog_len = 0; +++ info.nr_jited_ksyms = 0; +++ info.nr_jited_func_lens = 0; +++ info.nr_func_info = 0; +++ info.nr_line_info = 0; +++ info.nr_jited_line_info = 0; +++ goto done; +++ } +++ +++ ulen = info.xlated_prog_len; +++ info.xlated_prog_len = bpf_prog_insn_size(prog); +++ if (info.xlated_prog_len && ulen) { +++ struct bpf_insn *insns_sanitized; +++ bool fault; +++ +++ if (prog->blinded && !bpf_dump_raw_ok(file->f_cred)) { +++ info.xlated_prog_insns = 0; +++ goto done; +++ } +++ insns_sanitized = bpf_insn_prepare_dump(prog, file->f_cred); +++ if (!insns_sanitized) +++ return -ENOMEM; +++ uinsns = u64_to_user_ptr(info.xlated_prog_insns); +++ ulen = min_t(u32, info.xlated_prog_len, ulen); +++ fault = copy_to_user(uinsns, insns_sanitized, ulen); +++ kfree(insns_sanitized); +++ if (fault) +++ return -EFAULT; +++ } +++ +++ /* NOTE: the following code is supposed to be skipped for offload. +++ * bpf_prog_offload_info_fill() is the place to fill similar fields +++ * for offload. 
+++ */ +++ ulen = info.jited_prog_len; +++ if (prog->aux->func_cnt) { +++ u32 i; +++ +++ info.jited_prog_len = 0; +++ for (i = 0; i < prog->aux->func_cnt; i++) +++ info.jited_prog_len += prog->aux->func[i]->jited_len; +++ } else { +++ info.jited_prog_len = prog->jited_len; +++ } +++ +++ if (info.jited_prog_len && ulen) { +++ if (bpf_dump_raw_ok(file->f_cred)) { +++ uinsns = u64_to_user_ptr(info.jited_prog_insns); +++ ulen = min_t(u32, info.jited_prog_len, ulen); +++ +++ /* for multi-function programs, copy the JITed +++ * instructions for all the functions +++ */ +++ if (prog->aux->func_cnt) { +++ u32 len, free, i; +++ u8 *img; +++ +++ free = ulen; +++ for (i = 0; i < prog->aux->func_cnt; i++) { +++ len = prog->aux->func[i]->jited_len; +++ len = min_t(u32, len, free); +++ img = (u8 *) prog->aux->func[i]->bpf_func; +++ if (copy_to_user(uinsns, img, len)) +++ return -EFAULT; +++ uinsns += len; +++ free -= len; +++ if (!free) +++ break; +++ } +++ } else { +++ if (copy_to_user(uinsns, prog->bpf_func, ulen)) +++ return -EFAULT; +++ } +++ } else { +++ info.jited_prog_insns = 0; +++ } +++ } +++ +++ ulen = info.nr_jited_ksyms; +++ info.nr_jited_ksyms = prog->aux->func_cnt ? : 1; +++ if (ulen) { +++ if (bpf_dump_raw_ok(file->f_cred)) { +++ unsigned long ksym_addr; +++ u64 __user *user_ksyms; +++ u32 i; +++ +++ /* copy the address of the kernel symbol +++ * corresponding to each function +++ */ +++ ulen = min_t(u32, info.nr_jited_ksyms, ulen); +++ user_ksyms = u64_to_user_ptr(info.jited_ksyms); +++ if (prog->aux->func_cnt) { +++ for (i = 0; i < ulen; i++) { +++ ksym_addr = (unsigned long) +++ prog->aux->func[i]->bpf_func; +++ if (put_user((u64) ksym_addr, +++ &user_ksyms[i])) +++ return -EFAULT; +++ } +++ } else { +++ ksym_addr = (unsigned long) prog->bpf_func; +++ if (put_user((u64) ksym_addr, &user_ksyms[0])) +++ return -EFAULT; +++ } +++ } else { +++ info.jited_ksyms = 0; +++ } +++ } +++ +++ ulen = info.nr_jited_func_lens; +++ info.nr_jited_func_lens = prog->aux->func_cnt ? : 1; +++ if (ulen) { +++ if (bpf_dump_raw_ok(file->f_cred)) { +++ u32 __user *user_lens; +++ u32 func_len, i; +++ +++ /* copy the JITed image lengths for each function */ +++ ulen = min_t(u32, info.nr_jited_func_lens, ulen); +++ user_lens = u64_to_user_ptr(info.jited_func_lens); +++ if (prog->aux->func_cnt) { +++ for (i = 0; i < ulen; i++) { +++ func_len = +++ prog->aux->func[i]->jited_len; +++ if (put_user(func_len, &user_lens[i])) +++ return -EFAULT; +++ } +++ } else { +++ func_len = prog->jited_len; +++ if (put_user(func_len, &user_lens[0])) +++ return -EFAULT; +++ } +++ } else { +++ info.jited_func_lens = 0; +++ } +++ } +++ +++ if (prog->aux->btf) +++ info.btf_id = btf_id(prog->aux->btf); +++ +++ ulen = info.nr_func_info; +++ info.nr_func_info = prog->aux->func_info_cnt; +++ if (info.nr_func_info && ulen) { +++ char __user *user_finfo; +++ +++ user_finfo = u64_to_user_ptr(info.func_info); +++ ulen = min_t(u32, info.nr_func_info, ulen); +++ if (copy_to_user(user_finfo, prog->aux->func_info, +++ info.func_info_rec_size * ulen)) +++ return -EFAULT; +++ } +++ +++ ulen = info.nr_line_info; +++ info.nr_line_info = prog->aux->nr_linfo; +++ if (info.nr_line_info && ulen) { +++ __u8 __user *user_linfo; +++ +++ user_linfo = u64_to_user_ptr(info.line_info); +++ ulen = min_t(u32, info.nr_line_info, ulen); +++ if (copy_to_user(user_linfo, prog->aux->linfo, +++ info.line_info_rec_size * ulen)) +++ return -EFAULT; +++ } ++ ++- /* If we're handed a bigger struct than we know of, ++- * ensure all the unknown bits are 0 - i.e. 
new ++- * user-space does not rely on any kernel feature ++- * extensions we dont know about yet. ++- */ ++- if (size > sizeof(attr)) { ++- unsigned char __user *addr; ++- unsigned char __user *end; ++- unsigned char val; ++- ++- addr = (void __user *)uattr + sizeof(attr); ++- end = (void __user *)uattr + size; ++- ++- for (; addr < end; addr++) { ++- err = get_user(val, addr); ++- if (err) ++- return err; ++- if (val) ++- return -E2BIG; +++ ulen = info.nr_jited_line_info; +++ if (prog->aux->jited_linfo) +++ info.nr_jited_line_info = prog->aux->nr_linfo; +++ else +++ info.nr_jited_line_info = 0; +++ if (info.nr_jited_line_info && ulen) { +++ if (bpf_dump_raw_ok(file->f_cred)) { +++ __u64 __user *user_linfo; +++ u32 i; +++ +++ user_linfo = u64_to_user_ptr(info.jited_line_info); +++ ulen = min_t(u32, info.nr_jited_line_info, ulen); +++ for (i = 0; i < ulen; i++) { +++ if (put_user((__u64)(long)prog->aux->jited_linfo[i], +++ &user_linfo[i])) +++ return -EFAULT; +++ } +++ } else { +++ info.jited_line_info = 0; ++ } ++- size = sizeof(attr); ++ } ++ +++ ulen = info.nr_prog_tags; +++ info.nr_prog_tags = prog->aux->func_cnt ? : 1; +++ if (ulen) { +++ __u8 __user (*user_prog_tags)[BPF_TAG_SIZE]; +++ u32 i; +++ +++ user_prog_tags = u64_to_user_ptr(info.prog_tags); +++ ulen = min_t(u32, info.nr_prog_tags, ulen); +++ if (prog->aux->func_cnt) { +++ for (i = 0; i < ulen; i++) { +++ if (copy_to_user(user_prog_tags[i], +++ prog->aux->func[i]->tag, +++ BPF_TAG_SIZE)) +++ return -EFAULT; +++ } +++ } else { +++ if (copy_to_user(user_prog_tags[0], +++ prog->tag, BPF_TAG_SIZE)) +++ return -EFAULT; +++ } +++ } +++ +++done: +++ if (copy_to_user(uinfo, &info, info_len) || +++ put_user(info_len, &uattr->info.info_len)) +++ return -EFAULT; +++ +++ return 0; +++} +++ +++static int bpf_map_get_info_by_fd(struct file *file, +++ struct bpf_map *map, +++ const union bpf_attr *attr, +++ union bpf_attr __user *uattr) +++{ +++ struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info); +++ struct bpf_map_info info; +++ u32 info_len = attr->info.info_len; +++ int err; +++ +++ err = bpf_check_uarg_tail_zero(uinfo, sizeof(info), info_len); +++ if (err) +++ return err; +++ info_len = min_t(u32, sizeof(info), info_len); +++ +++ memset(&info, 0, sizeof(info)); +++ info.type = map->map_type; +++ info.id = map->id; +++ info.key_size = map->key_size; +++ info.value_size = map->value_size; +++ info.max_entries = map->max_entries; +++ info.map_flags = map->map_flags; +++ memcpy(info.name, map->name, sizeof(map->name)); +++ +++ if (map->btf) { +++ info.btf_id = btf_id(map->btf); +++ info.btf_key_type_id = map->btf_key_type_id; +++ info.btf_value_type_id = map->btf_value_type_id; +++ } +++ +++ if (copy_to_user(uinfo, &info, info_len) || +++ put_user(info_len, &uattr->info.info_len)) +++ return -EFAULT; +++ +++ return 0; +++} +++ +++static int bpf_btf_get_info_by_fd(struct file *file, +++ struct btf *btf, +++ const union bpf_attr *attr, +++ union bpf_attr __user *uattr) +++{ +++ struct bpf_btf_info __user *uinfo = u64_to_user_ptr(attr->info.info); +++ u32 info_len = attr->info.info_len; +++ int err; +++ +++ err = bpf_check_uarg_tail_zero(uinfo, sizeof(*uinfo), info_len); +++ if (err) +++ return err; +++ +++ return btf_get_info_by_fd(btf, attr, uattr); +++} +++ +++#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info +++ +++static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, +++ union bpf_attr __user *uattr) +++{ +++ int ufd = attr->info.bpf_fd; +++ struct fd f; +++ int err; +++ +++ if 
(CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD)) +++ return -EINVAL; +++ +++ f = fdget(ufd); +++ if (!f.file) +++ return -EBADFD; +++ +++ if (f.file->f_op == &bpf_prog_fops) +++ err = bpf_prog_get_info_by_fd(f.file, f.file->private_data, attr, +++ uattr); +++ else if (f.file->f_op == &bpf_map_fops) +++ err = bpf_map_get_info_by_fd(f.file, f.file->private_data, attr, +++ uattr); +++ else if (f.file->f_op == &btf_fops) +++ err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); +++ else +++ err = -EINVAL; +++ +++ fdput(f); +++ return err; +++} +++ +++#define BPF_BTF_LOAD_LAST_FIELD btf_log_level +++ +++static int bpf_btf_load(const union bpf_attr *attr) +++{ +++ if (CHECK_ATTR(BPF_BTF_LOAD)) +++ return -EINVAL; +++ +++ if (!capable(CAP_SYS_ADMIN)) +++ return -EPERM; +++ +++ return btf_new_fd(attr); +++} +++ +++#define BPF_BTF_GET_FD_BY_ID_LAST_FIELD btf_id +++ +++static int bpf_btf_get_fd_by_id(const union bpf_attr *attr) +++{ +++ if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID)) +++ return -EINVAL; +++ +++ if (!capable(CAP_SYS_ADMIN)) +++ return -EPERM; +++ +++ return btf_get_fd_by_id(attr->btf_id); +++} +++ +++ +++#define BPF_TASK_FD_QUERY_LAST_FIELD task_fd_query.probe_addr +++ +++SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) +++{ +++ union bpf_attr attr; +++ int err; +++ +++ if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) +++ return -EPERM; +++ +++ err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); +++ if (err) +++ return err; +++ size = min_t(u32, size, sizeof(attr)); +++ ++ /* copy attributes from user space, may be less than sizeof(bpf_attr) */ +++ memset(&attr, 0, sizeof(attr)); ++ if (copy_from_user(&attr, uattr, size) != 0) ++ return -EFAULT; ++ ++@@ -779,8 +2364,11 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf ++ case BPF_MAP_GET_NEXT_KEY: ++ err = map_get_next_key(&attr); ++ break; +++ case BPF_MAP_FREEZE: +++ err = map_freeze(&attr); +++ break; ++ case BPF_PROG_LOAD: ++- err = bpf_prog_load(&attr); +++ err = bpf_prog_load(&attr, uattr); ++ break; ++ case BPF_OBJ_PIN: ++ err = bpf_obj_pin(&attr); ++@@ -788,6 +2376,39 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf ++ case BPF_OBJ_GET: ++ err = bpf_obj_get(&attr); ++ break; +++ case BPF_PROG_TEST_RUN: +++ err = bpf_prog_test_run(&attr, uattr); +++ break; +++ case BPF_PROG_GET_NEXT_ID: +++ err = bpf_obj_get_next_id(&attr, uattr, +++ &prog_idr, &prog_idr_lock); +++ break; +++ case BPF_MAP_GET_NEXT_ID: +++ err = bpf_obj_get_next_id(&attr, uattr, +++ &map_idr, &map_idr_lock); +++ break; +++ case BPF_BTF_GET_NEXT_ID: +++ err = bpf_obj_get_next_id(&attr, uattr, +++ &btf_idr, &btf_idr_lock); +++ break; +++ case BPF_PROG_GET_FD_BY_ID: +++ err = bpf_prog_get_fd_by_id(&attr); +++ break; +++ case BPF_MAP_GET_FD_BY_ID: +++ err = bpf_map_get_fd_by_id(&attr); +++ break; +++ case BPF_OBJ_GET_INFO_BY_FD: +++ err = bpf_obj_get_info_by_fd(&attr, uattr); +++ break; +++ case BPF_BTF_LOAD: +++ err = bpf_btf_load(&attr); +++ break; +++ case BPF_BTF_GET_FD_BY_ID: +++ err = bpf_btf_get_fd_by_id(&attr); +++ break; +++ case BPF_MAP_LOOKUP_AND_DELETE_ELEM: +++ err = map_lookup_and_delete_elem(&attr); +++ break; ++ default: ++ err = -EINVAL; ++ break; ++--- /dev/null +++++ b/kernel/bpf/sysfs_btf.c ++@@ -0,0 +1,45 @@ +++// SPDX-License-Identifier: GPL-2.0 +++/* +++ * Provide kernel BTF information for introspection and use by eBPF tools. 
+++ */ +++#include +++#include +++#include +++#include +++#include +++ +++/* See scripts/link-vmlinux.sh, gen_btf() func for details */ +++extern char __weak __start_BTF[]; +++extern char __weak __stop_BTF[]; +++ +++static ssize_t +++btf_vmlinux_read(struct file *file, struct kobject *kobj, +++ struct bin_attribute *bin_attr, +++ char *buf, loff_t off, size_t len) +++{ +++ memcpy(buf, __start_BTF + off, len); +++ return len; +++} +++ +++static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { +++ .attr = { .name = "vmlinux", .mode = 0444, }, +++ .read = btf_vmlinux_read, +++}; +++ +++static struct kobject *btf_kobj; +++ +++static int __init btf_vmlinux_init(void) +++{ +++ bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF; +++ +++ if (!__start_BTF || bin_attr_btf_vmlinux.size == 0) +++ return 0; +++ +++ btf_kobj = kobject_create_and_add("btf", kernel_kobj); +++ if (!btf_kobj) +++ return -ENOMEM; +++ +++ return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux); +++} +++ +++subsys_initcall(btf_vmlinux_init); ++--- /dev/null +++++ b/kernel/bpf/tnum.c ++@@ -0,0 +1,196 @@ +++// SPDX-License-Identifier: GPL-2.0-only +++/* tnum: tracked (or tristate) numbers +++ * +++ * A tnum tracks knowledge about the bits of a value. Each bit can be either +++ * known (0 or 1), or unknown (x). Arithmetic operations on tnums will +++ * propagate the unknown bits such that the tnum result represents all the +++ * possible results for possible values of the operands. +++ */ +++#include +++#include +++ +++#define TNUM(_v, _m) (struct tnum){.value = _v, .mask = _m} +++/* A completely unknown value */ +++const struct tnum tnum_unknown = { .value = 0, .mask = -1 }; +++ +++struct tnum tnum_const(u64 value) +++{ +++ return TNUM(value, 0); +++} +++ +++struct tnum tnum_range(u64 min, u64 max) +++{ +++ u64 chi = min ^ max, delta; +++ u8 bits = fls64(chi); +++ +++ /* special case, needed because 1ULL << 64 is undefined */ +++ if (bits > 63) +++ return tnum_unknown; +++ /* e.g. if chi = 4, bits = 3, delta = (1<<3) - 1 = 7. +++ * if chi = 0, bits = 0, delta = (1<<0) - 1 = 0, so we return +++ * constant min (since min == max). +++ */ +++ delta = (1ULL << bits) - 1; +++ return TNUM(min & ~delta, delta); +++} +++ +++struct tnum tnum_lshift(struct tnum a, u8 shift) +++{ +++ return TNUM(a.value << shift, a.mask << shift); +++} +++ +++struct tnum tnum_rshift(struct tnum a, u8 shift) +++{ +++ return TNUM(a.value >> shift, a.mask >> shift); +++} +++ +++struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness) +++{ +++ /* if a.value is negative, arithmetic shifting by minimum shift +++ * will have larger negative offset compared to more shifting. +++ * If a.value is nonnegative, arithmetic shifting by minimum shift +++ * will have larger positive offset compare to more shifting. 
+++ */ +++ if (insn_bitness == 32) +++ return TNUM((u32)(((s32)a.value) >> min_shift), +++ (u32)(((s32)a.mask) >> min_shift)); +++ else +++ return TNUM((s64)a.value >> min_shift, +++ (s64)a.mask >> min_shift); +++} +++ +++struct tnum tnum_add(struct tnum a, struct tnum b) +++{ +++ u64 sm, sv, sigma, chi, mu; +++ +++ sm = a.mask + b.mask; +++ sv = a.value + b.value; +++ sigma = sm + sv; +++ chi = sigma ^ sv; +++ mu = chi | a.mask | b.mask; +++ return TNUM(sv & ~mu, mu); +++} +++ +++struct tnum tnum_sub(struct tnum a, struct tnum b) +++{ +++ u64 dv, alpha, beta, chi, mu; +++ +++ dv = a.value - b.value; +++ alpha = dv + a.mask; +++ beta = dv - b.mask; +++ chi = alpha ^ beta; +++ mu = chi | a.mask | b.mask; +++ return TNUM(dv & ~mu, mu); +++} +++ +++struct tnum tnum_and(struct tnum a, struct tnum b) +++{ +++ u64 alpha, beta, v; +++ +++ alpha = a.value | a.mask; +++ beta = b.value | b.mask; +++ v = a.value & b.value; +++ return TNUM(v, alpha & beta & ~v); +++} +++ +++struct tnum tnum_or(struct tnum a, struct tnum b) +++{ +++ u64 v, mu; +++ +++ v = a.value | b.value; +++ mu = a.mask | b.mask; +++ return TNUM(v, mu & ~v); +++} +++ +++struct tnum tnum_xor(struct tnum a, struct tnum b) +++{ +++ u64 v, mu; +++ +++ v = a.value ^ b.value; +++ mu = a.mask | b.mask; +++ return TNUM(v & ~mu, mu); +++} +++ +++/* half-multiply add: acc += (unknown * mask * value). +++ * An intermediate step in the multiply algorithm. +++ */ +++static struct tnum hma(struct tnum acc, u64 value, u64 mask) +++{ +++ while (mask) { +++ if (mask & 1) +++ acc = tnum_add(acc, TNUM(0, value)); +++ mask >>= 1; +++ value <<= 1; +++ } +++ return acc; +++} +++ +++struct tnum tnum_mul(struct tnum a, struct tnum b) +++{ +++ struct tnum acc; +++ u64 pi; +++ +++ pi = a.value * b.value; +++ acc = hma(TNUM(pi, 0), a.mask, b.mask | b.value); +++ return hma(acc, b.mask, a.value); +++} +++ +++/* Note that if a and b disagree - i.e. one has a 'known 1' where the other has +++ * a 'known 0' - this will return a 'known 1' for that bit. 
+++ */ +++struct tnum tnum_intersect(struct tnum a, struct tnum b) +++{ +++ u64 v, mu; +++ +++ v = a.value | b.value; +++ mu = a.mask & b.mask; +++ return TNUM(v & ~mu, mu); +++} +++ +++struct tnum tnum_cast(struct tnum a, u8 size) +++{ +++ a.value &= (1ULL << (size * 8)) - 1; +++ a.mask &= (1ULL << (size * 8)) - 1; +++ return a; +++} +++ +++bool tnum_is_aligned(struct tnum a, u64 size) +++{ +++ if (!size) +++ return true; +++ return !((a.value | a.mask) & (size - 1)); +++} +++ +++bool tnum_in(struct tnum a, struct tnum b) +++{ +++ if (b.mask & ~a.mask) +++ return false; +++ b.value &= ~a.mask; +++ return a.value == b.value; +++} +++ +++int tnum_strn(char *str, size_t size, struct tnum a) +++{ +++ return snprintf(str, size, "(%#llx; %#llx)", a.value, a.mask); +++} +++EXPORT_SYMBOL_GPL(tnum_strn); +++ +++int tnum_sbin(char *str, size_t size, struct tnum a) +++{ +++ size_t n; +++ +++ for (n = 64; n; n--) { +++ if (n < size) { +++ if (a.mask & 1) +++ str[n - 1] = 'x'; +++ else if (a.value & 1) +++ str[n - 1] = '1'; +++ else +++ str[n - 1] = '0'; +++ } +++ a.mask >>= 1; +++ a.value >>= 1; +++ } +++ str[min(size - 1, (size_t)64)] = 0; +++ return 64; +++} ++--- a/kernel/bpf/verifier.c +++++ b/kernel/bpf/verifier.c ++@@ -1,22 +1,36 @@ +++// SPDX-License-Identifier: GPL-2.0-only ++ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com ++- * ++- * This program is free software; you can redistribute it and/or ++- * modify it under the terms of version 2 of the GNU General Public ++- * License as published by the Free Software Foundation. ++- * ++- * This program is distributed in the hope that it will be useful, but ++- * WITHOUT ANY WARRANTY; without even the implied warranty of ++- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++- * General Public License for more details. +++ * Copyright (c) 2016 Facebook +++ * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io ++ */ +++#include ++ #include ++ #include ++ #include ++ #include +++#include +++#include ++ #include ++ #include ++ #include ++ #include +++#include +++#include +++#include +++#include +++#include +++#include +++ +++#include "disasm.h" +++ +++static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { +++#define BPF_PROG_TYPE(_id, _name) \ +++ [_id] = & _name ## _verifier_ops, +++#define BPF_MAP_TYPE(_id, _ops) +++#include +++#undef BPF_PROG_TYPE +++#undef BPF_MAP_TYPE +++}; ++ ++ /* bpf_check() is a static code analyzer that walks eBPF program ++ * instruction by instruction and updates register/stack state. ++@@ -30,7 +44,7 @@ ++ * - out of bounds or malformed jumps ++ * The second pass is all possible path descent from the 1st insn. ++ * Since it's analyzing all pathes through the program, the length of the ++- * analysis is limited to 32k insn, which may be hit even if total number of +++ * analysis is limited to 64k insn, which may be hit even if total number of ++ * insn is less then 4K, but there are too many branches that change stack/regs. ++ * Number of 'branches to be analyzed' is limited to 1k ++ * ++@@ -58,13 +72,13 @@ ++ * (and -20 constant is saved for further stack bounds checking). ++ * Meaning that this reg is a pointer to stack plus known immediate constant. ++ * ++- * Most of the time the registers have UNKNOWN_VALUE type, which +++ * Most of the time the registers have SCALAR_VALUE type, which ++ * means the register has some value, but it's not a valid pointer. 
++- * (like pointer plus pointer becomes UNKNOWN_VALUE type) +++ * (like pointer plus pointer becomes SCALAR_VALUE type) ++ * ++ * When verifier sees load or store instructions the type of base register ++- * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer ++- * types recognized by check_mem_access() function. +++ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, PTR_TO_STACK, PTR_TO_SOCKET. These are +++ * four pointer types recognized by check_mem_access() function. ++ * ++ * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value' ++ * and the range of [ptr, ptr + map's value_size) is accessible. ++@@ -123,346 +137,713 @@ ++ * ++ * After the call R0 is set to return type of the function and registers R1-R5 ++ * are set to NOT_INIT to indicate that they are no longer readable. +++ * +++ * The following reference types represent a potential reference to a kernel +++ * resource which, after first being allocated, must be checked and freed by +++ * the BPF program: +++ * - PTR_TO_SOCKET_OR_NULL, PTR_TO_SOCKET +++ * +++ * When the verifier sees a helper call return a reference type, it allocates a +++ * pointer id for the reference and stores it in the current function state. +++ * Similar to the way that PTR_TO_MAP_VALUE_OR_NULL is converted into +++ * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type +++ * passes through a NULL-check conditional. For the branch wherein the state is +++ * changed to CONST_IMM, the verifier releases the reference. +++ * +++ * For each helper function that allocates a reference, such as +++ * bpf_sk_lookup_tcp(), there is a corresponding release function, such as +++ * bpf_sk_release(). When a reference type passes into the release function, +++ * the verifier also releases the reference. If any unchecked or unreleased +++ * reference remains at the end of the program, the verifier rejects it. 
++ */ ++ ++-/* types of values stored in eBPF registers */ ++-enum bpf_reg_type { ++- NOT_INIT = 0, /* nothing was written into register */ ++- UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ ++- PTR_TO_CTX, /* reg points to bpf_context */ ++- CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ ++- PTR_TO_MAP_VALUE, /* reg points to map element value */ ++- PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ ++- FRAME_PTR, /* reg == frame_pointer */ ++- PTR_TO_STACK, /* reg == frame_pointer + imm */ ++- CONST_IMM, /* constant integer value */ ++-}; ++- ++-struct reg_state { ++- enum bpf_reg_type type; ++- union { ++- /* valid when type == CONST_IMM | PTR_TO_STACK */ ++- int imm; ++- ++- /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | ++- * PTR_TO_MAP_VALUE_OR_NULL ++- */ ++- struct bpf_map *map_ptr; ++- }; ++-}; ++- ++-enum bpf_stack_slot_type { ++- STACK_INVALID, /* nothing was stored in this stack slot */ ++- STACK_SPILL, /* register spilled into stack */ ++- STACK_MISC /* BPF program wrote some data into this slot */ ++-}; ++- ++-#define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ ++- ++-/* state of the program: ++- * type of all registers and stack info ++- */ ++-struct verifier_state { ++- struct reg_state regs[MAX_BPF_REG]; ++- u8 stack_slot_type[MAX_BPF_STACK]; ++- struct reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE]; ++-}; ++- ++-/* linked list of verifier states used to prune search */ ++-struct verifier_state_list { ++- struct verifier_state state; ++- struct verifier_state_list *next; ++-}; ++- ++ /* verifier_state + insn_idx are pushed to stack when branch is encountered */ ++-struct verifier_stack_elem { +++struct bpf_verifier_stack_elem { ++ /* verifer state is 'st' ++ * before processing instruction 'insn_idx' ++ * and after processing instruction 'prev_insn_idx' ++ */ ++- struct verifier_state st; +++ struct bpf_verifier_state st; ++ int insn_idx; ++ int prev_insn_idx; ++- struct verifier_stack_elem *next; +++ struct bpf_verifier_stack_elem *next; ++ }; ++ ++-#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +++#define BPF_COMPLEXITY_LIMIT_JMP_SEQ 8192 +++#define BPF_COMPLEXITY_LIMIT_STATES 64 ++ ++-/* single container for all structs ++- * one verifier_env per bpf_check() call ++- */ ++-struct verifier_env { ++- struct bpf_prog *prog; /* eBPF program being verified */ ++- struct verifier_stack_elem *head; /* stack of verifier states to be processed */ ++- int stack_size; /* number of states to be processed */ ++- struct verifier_state cur_state; /* current verifier state */ ++- struct verifier_state_list **explored_states; /* search pruning optimization */ ++- struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ ++- u32 used_map_cnt; /* number of used maps */ ++- bool allow_ptr_leaks; ++-}; +++#define BPF_MAP_PTR_UNPRIV 1UL +++#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ +++ POISON_POINTER_DELTA)) +++#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) ++ ++-/* verbose verifier prints what it's seeing ++- * bpf_check() is called under lock, so no race to access these global vars ++- */ ++-static u32 log_level, log_size, log_len; ++-static char *log_buf; +++static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) +++{ +++ return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON; +++} +++ +++static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) +++{ +++ return aux->map_state & BPF_MAP_PTR_UNPRIV; +++} +++ 
+++static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, +++ const struct bpf_map *map, bool unpriv) +++{ +++ BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); +++ unpriv |= bpf_map_ptr_unpriv(aux); +++ aux->map_state = (unsigned long)map | +++ (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); +++} +++ +++struct bpf_call_arg_meta { +++ struct bpf_map *map_ptr; +++ bool raw_mode; +++ bool pkt_access; +++ int regno; +++ int access_size; +++ u64 msize_max_value; +++ int ref_obj_id; +++ int func_id; +++}; ++ ++ static DEFINE_MUTEX(bpf_verifier_lock); ++ +++static const struct bpf_line_info * +++find_linfo(const struct bpf_verifier_env *env, u32 insn_off) +++{ +++ const struct bpf_line_info *linfo; +++ const struct bpf_prog *prog; +++ u32 i, nr_linfo; +++ +++ prog = env->prog; +++ nr_linfo = prog->aux->nr_linfo; +++ +++ if (!nr_linfo || insn_off >= prog->len) +++ return NULL; +++ +++ linfo = prog->aux->linfo; +++ for (i = 1; i < nr_linfo; i++) +++ if (insn_off < linfo[i].insn_off) +++ break; +++ +++ return &linfo[i - 1]; +++} +++ +++void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, +++ va_list args) +++{ +++ unsigned int n; +++ +++ n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); +++ +++ WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, +++ "verifier log line truncated - local buffer too short\n"); +++ +++ n = min(log->len_total - log->len_used - 1, n); +++ log->kbuf[n] = '\0'; +++ +++ if (!copy_to_user(log->ubuf + log->len_used, log->kbuf, n + 1)) +++ log->len_used += n; +++ else +++ log->ubuf = NULL; +++} +++ ++ /* log_level controls verbosity level of eBPF verifier. ++- * verbose() is used to dump the verification trace to the log, so the user ++- * can figure out what's wrong with the program +++ * bpf_verifier_log_write() is used to dump the verification trace to the log, +++ * so the user can figure out what's wrong with the program ++ */ ++-static __printf(1, 2) void verbose(const char *fmt, ...) +++__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, +++ const char *fmt, ...) ++ { ++ va_list args; ++ ++- if (log_level == 0 || log_len >= log_size - 1) +++ if (!bpf_verifier_log_needed(&env->log)) ++ return; ++ ++ va_start(args, fmt); ++- log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args); +++ bpf_verifier_vlog(&env->log, fmt, args); ++ va_end(args); ++ } +++EXPORT_SYMBOL_GPL(bpf_verifier_log_write); +++ +++__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...) +++{ +++ struct bpf_verifier_env *env = private_data; +++ va_list args; +++ +++ if (!bpf_verifier_log_needed(&env->log)) +++ return; +++ +++ va_start(args, fmt); +++ bpf_verifier_vlog(&env->log, fmt, args); +++ va_end(args); +++} +++ +++static const char *ltrim(const char *s) +++{ +++ while (isspace(*s)) +++ s++; +++ +++ return s; +++} +++ +++__printf(3, 4) static void verbose_linfo(struct bpf_verifier_env *env, +++ u32 insn_off, +++ const char *prefix_fmt, ...) 
+++{ +++ const struct bpf_line_info *linfo; +++ +++ if (!bpf_verifier_log_needed(&env->log)) +++ return; +++ +++ linfo = find_linfo(env, insn_off); +++ if (!linfo || linfo == env->prev_linfo) +++ return; +++ +++ if (prefix_fmt) { +++ va_list args; +++ +++ va_start(args, prefix_fmt); +++ bpf_verifier_vlog(&env->log, prefix_fmt, args); +++ va_end(args); +++ } +++ +++ verbose(env, "%s\n", +++ ltrim(btf_name_by_offset(env->prog->aux->btf, +++ linfo->line_off))); +++ +++ env->prev_linfo = linfo; +++} +++ +++static bool type_is_pkt_pointer(enum bpf_reg_type type) +++{ +++ return type == PTR_TO_PACKET || +++ type == PTR_TO_PACKET_META; +++} +++ +++static bool type_is_sk_pointer(enum bpf_reg_type type) +++{ +++ return type == PTR_TO_SOCKET || +++ type == PTR_TO_SOCK_COMMON || +++ type == PTR_TO_TCP_SOCK || +++ type == PTR_TO_XDP_SOCK; +++} +++ +++static bool reg_type_may_be_null(enum bpf_reg_type type) +++{ +++ return type == PTR_TO_MAP_VALUE_OR_NULL || +++ type == PTR_TO_SOCKET_OR_NULL || +++ type == PTR_TO_SOCK_COMMON_OR_NULL || +++ type == PTR_TO_TCP_SOCK_OR_NULL; +++} +++ +++static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) +++{ +++ return reg->type == PTR_TO_MAP_VALUE && +++ map_value_has_spin_lock(reg->map_ptr); +++} +++ +++static bool reg_type_may_be_refcounted_or_null(enum bpf_reg_type type) +++{ +++ return type == PTR_TO_SOCKET || +++ type == PTR_TO_SOCKET_OR_NULL || +++ type == PTR_TO_TCP_SOCK || +++ type == PTR_TO_TCP_SOCK_OR_NULL; +++} +++ +++static bool arg_type_may_be_refcounted(enum bpf_arg_type type) +++{ +++ return type == ARG_PTR_TO_SOCK_COMMON; +++} +++ +++/* Determine whether the function releases some resources allocated by another +++ * function call. The first reference type argument will be assumed to be +++ * released by release_reference(). 
+++ */ +++static bool is_release_function(enum bpf_func_id func_id) +++{ +++ return func_id == BPF_FUNC_sk_release; +++} +++ +++static bool is_acquire_function(enum bpf_func_id func_id) +++{ +++ return func_id == BPF_FUNC_sk_lookup_tcp || +++ func_id == BPF_FUNC_sk_lookup_udp || +++ func_id == BPF_FUNC_skc_lookup_tcp; +++} +++ +++static bool is_ptr_cast_function(enum bpf_func_id func_id) +++{ +++ return func_id == BPF_FUNC_tcp_sock || +++ func_id == BPF_FUNC_sk_fullsock; +++} ++ ++ /* string representation of 'enum bpf_reg_type' */ ++ static const char * const reg_type_str[] = { ++ [NOT_INIT] = "?", ++- [UNKNOWN_VALUE] = "inv", +++ [SCALAR_VALUE] = "inv", ++ [PTR_TO_CTX] = "ctx", ++ [CONST_PTR_TO_MAP] = "map_ptr", ++ [PTR_TO_MAP_VALUE] = "map_value", ++ [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null", ++- [FRAME_PTR] = "fp", ++ [PTR_TO_STACK] = "fp", ++- [CONST_IMM] = "imm", +++ [PTR_TO_PACKET] = "pkt", +++ [PTR_TO_PACKET_META] = "pkt_meta", +++ [PTR_TO_PACKET_END] = "pkt_end", +++ [PTR_TO_FLOW_KEYS] = "flow_keys", +++ [PTR_TO_SOCKET] = "sock", +++ [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", +++ [PTR_TO_SOCK_COMMON] = "sock_common", +++ [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", +++ [PTR_TO_TCP_SOCK] = "tcp_sock", +++ [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", +++ [PTR_TO_TP_BUFFER] = "tp_buffer", +++ [PTR_TO_XDP_SOCK] = "xdp_sock", ++ }; ++ ++-static void print_verifier_state(struct verifier_env *env) +++static char slot_type_char[] = { +++ [STACK_INVALID] = '?', +++ [STACK_SPILL] = 'r', +++ [STACK_MISC] = 'm', +++ [STACK_ZERO] = '0', +++}; +++ +++static void print_liveness(struct bpf_verifier_env *env, +++ enum bpf_reg_liveness live) +++{ +++ if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN | REG_LIVE_DONE)) +++ verbose(env, "_"); +++ if (live & REG_LIVE_READ) +++ verbose(env, "r"); +++ if (live & REG_LIVE_WRITTEN) +++ verbose(env, "w"); +++ if (live & REG_LIVE_DONE) +++ verbose(env, "D"); +++} +++ +++static struct bpf_func_state *func(struct bpf_verifier_env *env, +++ const struct bpf_reg_state *reg) ++ { +++ struct bpf_verifier_state *cur = env->cur_state; +++ +++ return cur->frame[reg->frameno]; +++} +++ +++static void print_verifier_state(struct bpf_verifier_env *env, +++ const struct bpf_func_state *state) +++{ +++ const struct bpf_reg_state *reg; ++ enum bpf_reg_type t; ++ int i; ++ +++ if (state->frameno) +++ verbose(env, " frame%d:", state->frameno); ++ for (i = 0; i < MAX_BPF_REG; i++) { ++- t = env->cur_state.regs[i].type; +++ reg = &state->regs[i]; +++ t = reg->type; ++ if (t == NOT_INIT) ++ continue; ++- verbose(" R%d=%s", i, reg_type_str[t]); ++- if (t == CONST_IMM || t == PTR_TO_STACK) ++- verbose("%d", env->cur_state.regs[i].imm); ++- else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE || ++- t == PTR_TO_MAP_VALUE_OR_NULL) ++- verbose("(ks=%d,vs=%d)", ++- env->cur_state.regs[i].map_ptr->key_size, ++- env->cur_state.regs[i].map_ptr->value_size); ++- } ++- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { ++- if (env->cur_state.stack_slot_type[i] == STACK_SPILL) ++- verbose(" fp%d=%s", -MAX_BPF_STACK + i, ++- reg_type_str[env->cur_state.spilled_regs[i / BPF_REG_SIZE].type]); ++- } ++- verbose("\n"); ++-} ++- ++-static const char *const bpf_class_string[] = { ++- [BPF_LD] = "ld", ++- [BPF_LDX] = "ldx", ++- [BPF_ST] = "st", ++- [BPF_STX] = "stx", ++- [BPF_ALU] = "alu", ++- [BPF_JMP] = "jmp", ++- [BPF_RET] = "BUG", ++- [BPF_ALU64] = "alu64", ++-}; +++ verbose(env, " R%d", i); +++ print_liveness(env, reg->live); +++ verbose(env, "=%s", reg_type_str[t]); +++ if 
(t == SCALAR_VALUE && reg->precise) +++ verbose(env, "P"); +++ if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && +++ tnum_is_const(reg->var_off)) { +++ /* reg->off should be 0 for SCALAR_VALUE */ +++ verbose(env, "%lld", reg->var_off.value + reg->off); +++ } else { +++ verbose(env, "(id=%d", reg->id); +++ if (reg_type_may_be_refcounted_or_null(t)) +++ verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); +++ if (t != SCALAR_VALUE) +++ verbose(env, ",off=%d", reg->off); +++ if (type_is_pkt_pointer(t)) +++ verbose(env, ",r=%d", reg->range); +++ else if (t == CONST_PTR_TO_MAP || +++ t == PTR_TO_MAP_VALUE || +++ t == PTR_TO_MAP_VALUE_OR_NULL) +++ verbose(env, ",ks=%d,vs=%d", +++ reg->map_ptr->key_size, +++ reg->map_ptr->value_size); +++ if (tnum_is_const(reg->var_off)) { +++ /* Typically an immediate SCALAR_VALUE, but +++ * could be a pointer whose offset is too big +++ * for reg->off +++ */ +++ verbose(env, ",imm=%llx", reg->var_off.value); +++ } else { +++ if (reg->smin_value != reg->umin_value && +++ reg->smin_value != S64_MIN) +++ verbose(env, ",smin_value=%lld", +++ (long long)reg->smin_value); +++ if (reg->smax_value != reg->umax_value && +++ reg->smax_value != S64_MAX) +++ verbose(env, ",smax_value=%lld", +++ (long long)reg->smax_value); +++ if (reg->umin_value != 0) +++ verbose(env, ",umin_value=%llu", +++ (unsigned long long)reg->umin_value); +++ if (reg->umax_value != U64_MAX) +++ verbose(env, ",umax_value=%llu", +++ (unsigned long long)reg->umax_value); +++ if (!tnum_is_unknown(reg->var_off)) { +++ char tn_buf[48]; ++ ++-static const char *const bpf_alu_string[16] = { ++- [BPF_ADD >> 4] = "+=", ++- [BPF_SUB >> 4] = "-=", ++- [BPF_MUL >> 4] = "*=", ++- [BPF_DIV >> 4] = "/=", ++- [BPF_OR >> 4] = "|=", ++- [BPF_AND >> 4] = "&=", ++- [BPF_LSH >> 4] = "<<=", ++- [BPF_RSH >> 4] = ">>=", ++- [BPF_NEG >> 4] = "neg", ++- [BPF_MOD >> 4] = "%=", ++- [BPF_XOR >> 4] = "^=", ++- [BPF_MOV >> 4] = "=", ++- [BPF_ARSH >> 4] = "s>>=", ++- [BPF_END >> 4] = "endian", ++-}; +++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +++ verbose(env, ",var_off=%s", tn_buf); +++ } +++ } +++ verbose(env, ")"); +++ } +++ } +++ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { +++ char types_buf[BPF_REG_SIZE + 1]; +++ bool valid = false; +++ int j; +++ +++ for (j = 0; j < BPF_REG_SIZE; j++) { +++ if (state->stack[i].slot_type[j] != STACK_INVALID) +++ valid = true; +++ types_buf[j] = slot_type_char[ +++ state->stack[i].slot_type[j]]; +++ } +++ types_buf[BPF_REG_SIZE] = 0; +++ if (!valid) +++ continue; +++ verbose(env, " fp%d", (-i - 1) * BPF_REG_SIZE); +++ print_liveness(env, state->stack[i].spilled_ptr.live); +++ if (state->stack[i].slot_type[0] == STACK_SPILL) { +++ reg = &state->stack[i].spilled_ptr; +++ t = reg->type; +++ verbose(env, "=%s", reg_type_str[t]); +++ if (t == SCALAR_VALUE && reg->precise) +++ verbose(env, "P"); +++ if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) +++ verbose(env, "%lld", reg->var_off.value + reg->off); +++ } else { +++ verbose(env, "=%s", types_buf); +++ } +++ } +++ if (state->acquired_refs && state->refs[0].id) { +++ verbose(env, " refs=%d", state->refs[0].id); +++ for (i = 1; i < state->acquired_refs; i++) +++ if (state->refs[i].id) +++ verbose(env, ",%d", state->refs[i].id); +++ } +++ verbose(env, "\n"); +++} ++ ++-static const char *const bpf_ldst_string[] = { ++- [BPF_W >> 3] = "u32", ++- [BPF_H >> 3] = "u16", ++- [BPF_B >> 3] = "u8", ++- [BPF_DW >> 3] = "u64", ++-}; +++#define COPY_STATE_FN(NAME, COUNT, FIELD, SIZE) \ +++static int copy_##NAME##_state(struct 
bpf_func_state *dst, \ +++ const struct bpf_func_state *src) \ +++{ \ +++ if (!src->FIELD) \ +++ return 0; \ +++ if (WARN_ON_ONCE(dst->COUNT < src->COUNT)) { \ +++ /* internal bug, make state invalid to reject the program */ \ +++ memset(dst, 0, sizeof(*dst)); \ +++ return -EFAULT; \ +++ } \ +++ memcpy(dst->FIELD, src->FIELD, \ +++ sizeof(*src->FIELD) * (src->COUNT / SIZE)); \ +++ return 0; \ +++} +++/* copy_reference_state() */ +++COPY_STATE_FN(reference, acquired_refs, refs, 1) +++/* copy_stack_state() */ +++COPY_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) +++#undef COPY_STATE_FN +++ +++#define REALLOC_STATE_FN(NAME, COUNT, FIELD, SIZE) \ +++static int realloc_##NAME##_state(struct bpf_func_state *state, int size, \ +++ bool copy_old) \ +++{ \ +++ u32 old_size = state->COUNT; \ +++ struct bpf_##NAME##_state *new_##FIELD; \ +++ int slot = size / SIZE; \ +++ \ +++ if (size <= old_size || !size) { \ +++ if (copy_old) \ +++ return 0; \ +++ state->COUNT = slot * SIZE; \ +++ if (!size && old_size) { \ +++ kfree(state->FIELD); \ +++ state->FIELD = NULL; \ +++ } \ +++ return 0; \ +++ } \ +++ new_##FIELD = kmalloc_array(slot, sizeof(struct bpf_##NAME##_state), \ +++ GFP_KERNEL); \ +++ if (!new_##FIELD) \ +++ return -ENOMEM; \ +++ if (copy_old) { \ +++ if (state->FIELD) \ +++ memcpy(new_##FIELD, state->FIELD, \ +++ sizeof(*new_##FIELD) * (old_size / SIZE)); \ +++ memset(new_##FIELD + old_size / SIZE, 0, \ +++ sizeof(*new_##FIELD) * (size - old_size) / SIZE); \ +++ } \ +++ state->COUNT = slot * SIZE; \ +++ kfree(state->FIELD); \ +++ state->FIELD = new_##FIELD; \ +++ return 0; \ +++} +++/* realloc_reference_state() */ +++REALLOC_STATE_FN(reference, acquired_refs, refs, 1) +++/* realloc_stack_state() */ +++REALLOC_STATE_FN(stack, allocated_stack, stack, BPF_REG_SIZE) +++#undef REALLOC_STATE_FN +++ +++/* do_check() starts with zero-sized stack in struct bpf_verifier_state to +++ * make it consume minimal amount of memory. check_stack_write() access from +++ * the program calls into realloc_func_state() to grow the stack size. +++ * Note there is a non-zero 'parent' pointer inside bpf_verifier_state +++ * which realloc_stack_state() copies over. It points to previous +++ * bpf_verifier_state which is never reallocated. +++ */ +++static int realloc_func_state(struct bpf_func_state *state, int stack_size, +++ int refs_size, bool copy_old) +++{ +++ int err = realloc_reference_state(state, refs_size, copy_old); +++ if (err) +++ return err; +++ return realloc_stack_state(state, stack_size, copy_old); +++} ++ ++-static const char *const bpf_jmp_string[16] = { ++- [BPF_JA >> 4] = "jmp", ++- [BPF_JEQ >> 4] = "==", ++- [BPF_JGT >> 4] = ">", ++- [BPF_JGE >> 4] = ">=", ++- [BPF_JSET >> 4] = "&", ++- [BPF_JNE >> 4] = "!=", ++- [BPF_JSGT >> 4] = "s>", ++- [BPF_JSGE >> 4] = "s>=", ++- [BPF_CALL >> 4] = "call", ++- [BPF_EXIT >> 4] = "exit", ++-}; +++/* Acquire a pointer id from the env and update the state->refs to include +++ * this new pointer reference. +++ * On success, returns a valid pointer id to associate with the register +++ * On failure, returns a negative errno. 
+++ */ +++static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) +++{ +++ struct bpf_func_state *state = cur_func(env); +++ int new_ofs = state->acquired_refs; +++ int id, err; ++ ++-static void print_bpf_insn(const struct verifier_env *env, ++- const struct bpf_insn *insn) +++ err = realloc_reference_state(state, state->acquired_refs + 1, true); +++ if (err) +++ return err; +++ id = ++env->id_gen; +++ state->refs[new_ofs].id = id; +++ state->refs[new_ofs].insn_idx = insn_idx; +++ +++ return id; +++} +++ +++/* release function corresponding to acquire_reference_state(). Idempotent. */ +++static int release_reference_state(struct bpf_func_state *state, int ptr_id) ++ { ++- u8 class = BPF_CLASS(insn->code); +++ int i, last_idx; ++ ++- if (class == BPF_ALU || class == BPF_ALU64) { ++- if (BPF_SRC(insn->code) == BPF_X) ++- verbose("(%02x) %sr%d %s %sr%d\n", ++- insn->code, class == BPF_ALU ? "(u32) " : "", ++- insn->dst_reg, ++- bpf_alu_string[BPF_OP(insn->code) >> 4], ++- class == BPF_ALU ? "(u32) " : "", ++- insn->src_reg); ++- else ++- verbose("(%02x) %sr%d %s %s%d\n", ++- insn->code, class == BPF_ALU ? "(u32) " : "", ++- insn->dst_reg, ++- bpf_alu_string[BPF_OP(insn->code) >> 4], ++- class == BPF_ALU ? "(u32) " : "", ++- insn->imm); ++- } else if (class == BPF_STX) { ++- if (BPF_MODE(insn->code) == BPF_MEM) ++- verbose("(%02x) *(%s *)(r%d %+d) = r%d\n", ++- insn->code, ++- bpf_ldst_string[BPF_SIZE(insn->code) >> 3], ++- insn->dst_reg, ++- insn->off, insn->src_reg); ++- else if (BPF_MODE(insn->code) == BPF_XADD) ++- verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n", ++- insn->code, ++- bpf_ldst_string[BPF_SIZE(insn->code) >> 3], ++- insn->dst_reg, insn->off, ++- insn->src_reg); ++- else ++- verbose("BUG_%02x\n", insn->code); ++- } else if (class == BPF_ST) { ++- if (BPF_MODE(insn->code) != BPF_MEM) { ++- verbose("BUG_st_%02x\n", insn->code); ++- return; ++- } ++- verbose("(%02x) *(%s *)(r%d %+d) = %d\n", ++- insn->code, ++- bpf_ldst_string[BPF_SIZE(insn->code) >> 3], ++- insn->dst_reg, ++- insn->off, insn->imm); ++- } else if (class == BPF_LDX) { ++- if (BPF_MODE(insn->code) != BPF_MEM) { ++- verbose("BUG_ldx_%02x\n", insn->code); ++- return; +++ last_idx = state->acquired_refs - 1; +++ for (i = 0; i < state->acquired_refs; i++) { +++ if (state->refs[i].id == ptr_id) { +++ if (last_idx && i != last_idx) +++ memcpy(&state->refs[i], &state->refs[last_idx], +++ sizeof(*state->refs)); +++ memset(&state->refs[last_idx], 0, sizeof(*state->refs)); +++ state->acquired_refs--; +++ return 0; ++ } ++- verbose("(%02x) r%d = *(%s *)(r%d %+d)\n", ++- insn->code, insn->dst_reg, ++- bpf_ldst_string[BPF_SIZE(insn->code) >> 3], ++- insn->src_reg, insn->off); ++- } else if (class == BPF_LD) { ++- if (BPF_MODE(insn->code) == BPF_ABS) { ++- verbose("(%02x) r0 = *(%s *)skb[%d]\n", ++- insn->code, ++- bpf_ldst_string[BPF_SIZE(insn->code) >> 3], ++- insn->imm); ++- } else if (BPF_MODE(insn->code) == BPF_IND) { ++- verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n", ++- insn->code, ++- bpf_ldst_string[BPF_SIZE(insn->code) >> 3], ++- insn->src_reg, insn->imm); ++- } else if (BPF_MODE(insn->code) == BPF_IMM && ++- BPF_SIZE(insn->code) == BPF_DW) { ++- /* At this point, we already made sure that the second ++- * part of the ldimm64 insn is accessible. 
++- */ ++- u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; ++- bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; +++ } +++ return -EINVAL; +++} ++ ++- if (map_ptr && !env->allow_ptr_leaks) ++- imm = 0; +++static int transfer_reference_state(struct bpf_func_state *dst, +++ struct bpf_func_state *src) +++{ +++ int err = realloc_reference_state(dst, src->acquired_refs, false); +++ if (err) +++ return err; +++ err = copy_reference_state(dst, src); +++ if (err) +++ return err; +++ return 0; +++} ++ ++- verbose("(%02x) r%d = 0x%llx\n", insn->code, ++- insn->dst_reg, (unsigned long long)imm); ++- } else { ++- verbose("BUG_ld_%02x\n", insn->code); ++- return; ++- } ++- } else if (class == BPF_JMP) { ++- u8 opcode = BPF_OP(insn->code); +++static void free_func_state(struct bpf_func_state *state) +++{ +++ if (!state) +++ return; +++ kfree(state->refs); +++ kfree(state->stack); +++ kfree(state); +++} ++ ++- if (opcode == BPF_CALL) { ++- verbose("(%02x) call %d\n", insn->code, insn->imm); ++- } else if (insn->code == (BPF_JMP | BPF_JA)) { ++- verbose("(%02x) goto pc%+d\n", ++- insn->code, insn->off); ++- } else if (insn->code == (BPF_JMP | BPF_EXIT)) { ++- verbose("(%02x) exit\n", insn->code); ++- } else if (BPF_SRC(insn->code) == BPF_X) { ++- verbose("(%02x) if r%d %s r%d goto pc%+d\n", ++- insn->code, insn->dst_reg, ++- bpf_jmp_string[BPF_OP(insn->code) >> 4], ++- insn->src_reg, insn->off); ++- } else { ++- verbose("(%02x) if r%d %s 0x%x goto pc%+d\n", ++- insn->code, insn->dst_reg, ++- bpf_jmp_string[BPF_OP(insn->code) >> 4], ++- insn->imm, insn->off); +++static void clear_jmp_history(struct bpf_verifier_state *state) +++{ +++ kfree(state->jmp_history); +++ state->jmp_history = NULL; +++ state->jmp_history_cnt = 0; +++} +++ +++static void free_verifier_state(struct bpf_verifier_state *state, +++ bool free_self) +++{ +++ int i; +++ +++ for (i = 0; i <= state->curframe; i++) { +++ free_func_state(state->frame[i]); +++ state->frame[i] = NULL; +++ } +++ clear_jmp_history(state); +++ if (free_self) +++ kfree(state); +++} +++ +++/* copy verifier state from src to dst growing dst stack space +++ * when necessary to accommodate larger src stack +++ */ +++static int copy_func_state(struct bpf_func_state *dst, +++ const struct bpf_func_state *src) +++{ +++ int err; +++ +++ err = realloc_func_state(dst, src->allocated_stack, src->acquired_refs, +++ false); +++ if (err) +++ return err; +++ memcpy(dst, src, offsetof(struct bpf_func_state, acquired_refs)); +++ err = copy_reference_state(dst, src); +++ if (err) +++ return err; +++ return copy_stack_state(dst, src); +++} +++ +++static int copy_verifier_state(struct bpf_verifier_state *dst_state, +++ const struct bpf_verifier_state *src) +++{ +++ struct bpf_func_state *dst; +++ u32 jmp_sz = sizeof(struct bpf_idx_pair) * src->jmp_history_cnt; +++ int i, err; +++ +++ if (dst_state->jmp_history_cnt < src->jmp_history_cnt) { +++ kfree(dst_state->jmp_history); +++ dst_state->jmp_history = kmalloc(jmp_sz, GFP_USER); +++ if (!dst_state->jmp_history) +++ return -ENOMEM; +++ } +++ memcpy(dst_state->jmp_history, src->jmp_history, jmp_sz); +++ dst_state->jmp_history_cnt = src->jmp_history_cnt; +++ +++ /* if dst has more stack frames then src frame, free them */ +++ for (i = src->curframe + 1; i <= dst_state->curframe; i++) { +++ free_func_state(dst_state->frame[i]); +++ dst_state->frame[i] = NULL; +++ } +++ dst_state->speculative = src->speculative; +++ dst_state->curframe = src->curframe; +++ dst_state->active_spin_lock = src->active_spin_lock; +++ 
dst_state->branches = src->branches; +++ dst_state->parent = src->parent; +++ dst_state->first_insn_idx = src->first_insn_idx; +++ dst_state->last_insn_idx = src->last_insn_idx; +++ for (i = 0; i <= src->curframe; i++) { +++ dst = dst_state->frame[i]; +++ if (!dst) { +++ dst = kzalloc(sizeof(*dst), GFP_KERNEL); +++ if (!dst) +++ return -ENOMEM; +++ dst_state->frame[i] = dst; ++ } ++- } else { ++- verbose("(%02x) %s\n", insn->code, bpf_class_string[class]); +++ err = copy_func_state(dst, src->frame[i]); +++ if (err) +++ return err; ++ } +++ return 0; ++ } ++ ++-static int pop_stack(struct verifier_env *env, int *prev_insn_idx) +++static void update_branch_counts(struct bpf_verifier_env *env, struct bpf_verifier_state *st) ++ { ++- struct verifier_stack_elem *elem; ++- int insn_idx; +++ while (st) { +++ u32 br = --st->branches; +++ +++ /* WARN_ON(br > 1) technically makes sense here, +++ * but see comment in push_stack(), hence: +++ */ +++ WARN_ONCE((int)br < 0, +++ "BUG update_branch_counts:branches_to_explore=%d\n", +++ br); +++ if (br) +++ break; +++ st = st->parent; +++ } +++} +++ +++static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, +++ int *insn_idx) +++{ +++ struct bpf_verifier_state *cur = env->cur_state; +++ struct bpf_verifier_stack_elem *elem, *head = env->head; +++ int err; ++ ++ if (env->head == NULL) ++- return -1; +++ return -ENOENT; ++ ++- memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); ++- insn_idx = env->head->insn_idx; +++ if (cur) { +++ err = copy_verifier_state(cur, &head->st); +++ if (err) +++ return err; +++ } +++ if (insn_idx) +++ *insn_idx = head->insn_idx; ++ if (prev_insn_idx) ++- *prev_insn_idx = env->head->prev_insn_idx; ++- elem = env->head->next; ++- kfree(env->head); +++ *prev_insn_idx = head->prev_insn_idx; +++ elem = head->next; +++ free_verifier_state(&head->st, false); +++ kfree(head); ++ env->head = elem; ++ env->stack_size--; ++- return insn_idx; +++ return 0; ++ } ++ ++-static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx, ++- int prev_insn_idx) +++static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, +++ int insn_idx, int prev_insn_idx, +++ bool speculative) ++ { ++- struct verifier_stack_elem *elem; +++ struct bpf_verifier_state *cur = env->cur_state; +++ struct bpf_verifier_stack_elem *elem; +++ int err; ++ ++- elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL); +++ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); ++ if (!elem) ++ goto err; ++ ++- memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); ++ elem->insn_idx = insn_idx; ++ elem->prev_insn_idx = prev_insn_idx; ++ elem->next = env->head; ++ env->head = elem; ++ env->stack_size++; ++- if (env->stack_size > 1024) { ++- verbose("BPF program is too complex\n"); +++ err = copy_verifier_state(&elem->st, cur); +++ if (err) +++ goto err; +++ elem->st.speculative |= speculative; +++ if (env->stack_size > BPF_COMPLEXITY_LIMIT_JMP_SEQ) { +++ verbose(env, "The sequence of %d jumps is too complex.\n", +++ env->stack_size); ++ goto err; ++ } +++ if (elem->st.parent) { +++ ++elem->st.parent->branches; +++ /* WARN_ON(branches > 2) technically makes sense here, +++ * but +++ * 1. speculative states will bump 'branches' for non-branch +++ * instructions +++ * 2. 
is_state_visited() heuristics may decide not to create +++ * a new state for a sequence of branches and all such current +++ * and cloned states will be pointing to a single parent state +++ * which might have large 'branches' count. +++ */ +++ } ++ return &elem->st; ++ err: +++ free_verifier_state(env->cur_state, true); +++ env->cur_state = NULL; ++ /* pop all elements and return */ ++- while (pop_stack(env, NULL) >= 0); +++ while (!pop_stack(env, NULL, NULL)); ++ return NULL; ++ } ++ ++@@ -471,29 +852,225 @@ static const int caller_saved[CALLER_SAV ++ BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 ++ }; ++ ++-static void init_reg_state(struct reg_state *regs) +++static void __mark_reg_not_init(const struct bpf_verifier_env *env, +++ struct bpf_reg_state *reg); +++ +++/* Mark the unknown part of a register (variable offset or scalar value) as +++ * known to have the value @imm. +++ */ +++static void __mark_reg_known(struct bpf_reg_state *reg, u64 imm) +++{ +++ /* Clear id, off, and union(map_ptr, range) */ +++ memset(((u8 *)reg) + sizeof(reg->type), 0, +++ offsetof(struct bpf_reg_state, var_off) - sizeof(reg->type)); +++ reg->var_off = tnum_const(imm); +++ reg->smin_value = (s64)imm; +++ reg->smax_value = (s64)imm; +++ reg->umin_value = imm; +++ reg->umax_value = imm; +++} +++ +++/* Mark the 'variable offset' part of a register as zero. This should be +++ * used only on registers holding a pointer type. +++ */ +++static void __mark_reg_known_zero(struct bpf_reg_state *reg) +++{ +++ __mark_reg_known(reg, 0); +++} +++ +++static void __mark_reg_const_zero(struct bpf_reg_state *reg) +++{ +++ __mark_reg_known(reg, 0); +++ reg->type = SCALAR_VALUE; +++} +++ +++static void mark_reg_known_zero(struct bpf_verifier_env *env, +++ struct bpf_reg_state *regs, u32 regno) +++{ +++ if (WARN_ON(regno >= MAX_BPF_REG)) { +++ verbose(env, "mark_reg_known_zero(regs, %u)\n", regno); +++ /* Something bad happened, let's kill all regs */ +++ for (regno = 0; regno < MAX_BPF_REG; regno++) +++ __mark_reg_not_init(env, regs + regno); +++ return; +++ } +++ __mark_reg_known_zero(regs + regno); +++} +++ +++static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) +++{ +++ return type_is_pkt_pointer(reg->type); +++} +++ +++static bool reg_is_pkt_pointer_any(const struct bpf_reg_state *reg) +++{ +++ return reg_is_pkt_pointer(reg) || +++ reg->type == PTR_TO_PACKET_END; +++} +++ +++/* Unmodified PTR_TO_PACKET[_META,_END] register from ctx access. */ +++static bool reg_is_init_pkt_pointer(const struct bpf_reg_state *reg, +++ enum bpf_reg_type which) +++{ +++ /* The register can already have a range from prior markings. +++ * This is fine as long as it hasn't been advanced from its +++ * origin. 
+++ */ +++ return reg->type == which && +++ reg->id == 0 && +++ reg->off == 0 && +++ tnum_equals_const(reg->var_off, 0); +++} +++ +++/* Attempts to improve min/max values based on var_off information */ +++static void __update_reg_bounds(struct bpf_reg_state *reg) +++{ +++ /* min signed is max(sign bit) | min(other bits) */ +++ reg->smin_value = max_t(s64, reg->smin_value, +++ reg->var_off.value | (reg->var_off.mask & S64_MIN)); +++ /* max signed is min(sign bit) | max(other bits) */ +++ reg->smax_value = min_t(s64, reg->smax_value, +++ reg->var_off.value | (reg->var_off.mask & S64_MAX)); +++ reg->umin_value = max(reg->umin_value, reg->var_off.value); +++ reg->umax_value = min(reg->umax_value, +++ reg->var_off.value | reg->var_off.mask); +++} +++ +++/* Uses signed min/max values to inform unsigned, and vice-versa */ +++static void __reg_deduce_bounds(struct bpf_reg_state *reg) ++ { +++ /* Learn sign from signed bounds. +++ * If we cannot cross the sign boundary, then signed and unsigned bounds +++ * are the same, so combine. This works even in the negative case, e.g. +++ * -3 s<= x s<= -1 implies 0xf...fd u<= x u<= 0xf...ff. +++ */ +++ if (reg->smin_value >= 0 || reg->smax_value < 0) { +++ reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value, +++ reg->umin_value); +++ reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value, +++ reg->umax_value); +++ return; +++ } +++ /* Learn sign from unsigned bounds. Signed bounds cross the sign +++ * boundary, so we must be careful. +++ */ +++ if ((s64)reg->umax_value >= 0) { +++ /* Positive. We can't learn anything from the smin, but smax +++ * is positive, hence safe. +++ */ +++ reg->smin_value = reg->umin_value; +++ reg->smax_value = reg->umax_value = min_t(u64, reg->smax_value, +++ reg->umax_value); +++ } else if ((s64)reg->umin_value < 0) { +++ /* Negative. We can't learn anything from the smax, but smin +++ * is negative, hence safe. +++ */ +++ reg->smin_value = reg->umin_value = max_t(u64, reg->smin_value, +++ reg->umin_value); +++ reg->smax_value = reg->umax_value; +++ } +++} +++ +++/* Attempts to improve var_off based on unsigned min/max information */ +++static void __reg_bound_offset(struct bpf_reg_state *reg) +++{ +++ reg->var_off = tnum_intersect(reg->var_off, +++ tnum_range(reg->umin_value, +++ reg->umax_value)); +++} +++ +++/* Reset the min/max bounds of a register */ +++static void __mark_reg_unbounded(struct bpf_reg_state *reg) +++{ +++ reg->smin_value = S64_MIN; +++ reg->smax_value = S64_MAX; +++ reg->umin_value = 0; +++ reg->umax_value = U64_MAX; +++} +++ +++/* Mark a register as having a completely unknown (scalar) value. */ +++static void __mark_reg_unknown(const struct bpf_verifier_env *env, +++ struct bpf_reg_state *reg) +++{ +++ /* +++ * Clear type, id, off, and union(map_ptr, range) and +++ * padding between 'type' and union +++ */ +++ memset(reg, 0, offsetof(struct bpf_reg_state, var_off)); +++ reg->type = SCALAR_VALUE; +++ reg->var_off = tnum_unknown; +++ reg->frameno = 0; +++ reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks ? 
+++ true : false; +++ __mark_reg_unbounded(reg); +++} +++ +++static void mark_reg_unknown(struct bpf_verifier_env *env, +++ struct bpf_reg_state *regs, u32 regno) +++{ +++ if (WARN_ON(regno >= MAX_BPF_REG)) { +++ verbose(env, "mark_reg_unknown(regs, %u)\n", regno); +++ /* Something bad happened, let's kill all regs except FP */ +++ for (regno = 0; regno < BPF_REG_FP; regno++) +++ __mark_reg_not_init(env, regs + regno); +++ return; +++ } +++ __mark_reg_unknown(env, regs + regno); +++} +++ +++static void __mark_reg_not_init(const struct bpf_verifier_env *env, +++ struct bpf_reg_state *reg) +++{ +++ __mark_reg_unknown(env, reg); +++ reg->type = NOT_INIT; +++} +++ +++static void mark_reg_not_init(struct bpf_verifier_env *env, +++ struct bpf_reg_state *regs, u32 regno) +++{ +++ if (WARN_ON(regno >= MAX_BPF_REG)) { +++ verbose(env, "mark_reg_not_init(regs, %u)\n", regno); +++ /* Something bad happened, let's kill all regs except FP */ +++ for (regno = 0; regno < BPF_REG_FP; regno++) +++ __mark_reg_not_init(env, regs + regno); +++ return; +++ } +++ __mark_reg_not_init(env, regs + regno); +++} +++ +++#define DEF_NOT_SUBREG (0) +++static void init_reg_state(struct bpf_verifier_env *env, +++ struct bpf_func_state *state) +++{ +++ struct bpf_reg_state *regs = state->regs; ++ int i; ++ ++ for (i = 0; i < MAX_BPF_REG; i++) { ++- regs[i].type = NOT_INIT; ++- regs[i].imm = 0; ++- regs[i].map_ptr = NULL; +++ mark_reg_not_init(env, regs, i); +++ regs[i].live = REG_LIVE_NONE; +++ regs[i].parent = NULL; +++ regs[i].subreg_def = DEF_NOT_SUBREG; ++ } ++ ++ /* frame pointer */ ++- regs[BPF_REG_FP].type = FRAME_PTR; +++ regs[BPF_REG_FP].type = PTR_TO_STACK; +++ mark_reg_known_zero(env, regs, BPF_REG_FP); +++ regs[BPF_REG_FP].frameno = state->frameno; ++ ++ /* 1st arg to a function */ ++ regs[BPF_REG_1].type = PTR_TO_CTX; +++ mark_reg_known_zero(env, regs, BPF_REG_1); ++ } ++ ++-static void mark_reg_unknown_value(struct reg_state *regs, u32 regno) +++#define BPF_MAIN_FUNC (-1) +++static void init_func_state(struct bpf_verifier_env *env, +++ struct bpf_func_state *state, +++ int callsite, int frameno, int subprogno) ++ { ++- BUG_ON(regno >= MAX_BPF_REG); ++- regs[regno].type = UNKNOWN_VALUE; ++- regs[regno].imm = 0; ++- regs[regno].map_ptr = NULL; +++ state->callsite = callsite; +++ state->frameno = frameno; +++ state->subprogno = subprogno; +++ init_reg_state(env, state); ++ } ++ ++ enum reg_arg_type { ++@@ -502,44 +1079,760 @@ enum reg_arg_type { ++ DST_OP_NO_MARK /* same as above, check only, don't mark */ ++ }; ++ ++-static int check_reg_arg(struct reg_state *regs, u32 regno, +++static int cmp_subprogs(const void *a, const void *b) +++{ +++ return ((struct bpf_subprog_info *)a)->start - +++ ((struct bpf_subprog_info *)b)->start; +++} +++ +++static int find_subprog(struct bpf_verifier_env *env, int off) +++{ +++ struct bpf_subprog_info *p; +++ +++ p = bsearch(&off, env->subprog_info, env->subprog_cnt, +++ sizeof(env->subprog_info[0]), cmp_subprogs); +++ if (!p) +++ return -ENOENT; +++ return p - env->subprog_info; +++ +++} +++ +++static int add_subprog(struct bpf_verifier_env *env, int off) +++{ +++ int insn_cnt = env->prog->len; +++ int ret; +++ +++ if (off >= insn_cnt || off < 0) { +++ verbose(env, "call to invalid destination\n"); +++ return -EINVAL; +++ } +++ ret = find_subprog(env, off); +++ if (ret >= 0) +++ return 0; +++ if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { +++ verbose(env, "too many subprograms\n"); +++ return -E2BIG; +++ } +++ env->subprog_info[env->subprog_cnt++].start = off; +++ 
sort(env->subprog_info, env->subprog_cnt, +++ sizeof(env->subprog_info[0]), cmp_subprogs, NULL); +++ return 0; +++} +++ +++static int check_subprogs(struct bpf_verifier_env *env) +++{ +++ int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; +++ struct bpf_subprog_info *subprog = env->subprog_info; +++ struct bpf_insn *insn = env->prog->insnsi; +++ int insn_cnt = env->prog->len; +++ +++ /* Add entry function. */ +++ ret = add_subprog(env, 0); +++ if (ret < 0) +++ return ret; +++ +++ /* determine subprog starts. The end is one before the next starts */ +++ for (i = 0; i < insn_cnt; i++) { +++ if (insn[i].code != (BPF_JMP | BPF_CALL)) +++ continue; +++ if (insn[i].src_reg != BPF_PSEUDO_CALL) +++ continue; +++ if (!env->allow_ptr_leaks) { +++ verbose(env, "function calls to other bpf functions are allowed for root only\n"); +++ return -EPERM; +++ } +++ ret = add_subprog(env, i + insn[i].imm + 1); +++ if (ret < 0) +++ return ret; +++ } +++ +++ /* Add a fake 'exit' subprog which could simplify subprog iteration +++ * logic. 'subprog_cnt' should not be increased. +++ */ +++ subprog[env->subprog_cnt].start = insn_cnt; +++ +++ if (env->log.level & BPF_LOG_LEVEL2) +++ for (i = 0; i < env->subprog_cnt; i++) +++ verbose(env, "func#%d @%d\n", i, subprog[i].start); +++ +++ /* now check that all jumps are within the same subprog */ +++ subprog_start = subprog[cur_subprog].start; +++ subprog_end = subprog[cur_subprog + 1].start; +++ for (i = 0; i < insn_cnt; i++) { +++ u8 code = insn[i].code; +++ +++ if (code == (BPF_JMP | BPF_CALL) && +++ insn[i].imm == BPF_FUNC_tail_call && +++ insn[i].src_reg != BPF_PSEUDO_CALL) +++ subprog[cur_subprog].has_tail_call = true; +++ if (BPF_CLASS(code) != BPF_JMP && BPF_CLASS(code) != BPF_JMP32) +++ goto next; +++ if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) +++ goto next; +++ off = i + insn[i].off + 1; +++ if (off < subprog_start || off >= subprog_end) { +++ verbose(env, "jump out of range from insn %d to %d\n", i, off); +++ return -EINVAL; +++ } +++next: +++ if (i == subprog_end - 1) { +++ /* to avoid fall-through from one subprog into another +++ * the last insn of the subprog should be either exit +++ * or unconditional jump back +++ */ +++ if (code != (BPF_JMP | BPF_EXIT) && +++ code != (BPF_JMP | BPF_JA)) { +++ verbose(env, "last insn is not an exit or jmp\n"); +++ return -EINVAL; +++ } +++ subprog_start = subprog_end; +++ cur_subprog++; +++ if (cur_subprog < env->subprog_cnt) +++ subprog_end = subprog[cur_subprog + 1].start; +++ } +++ } +++ return 0; +++} +++ +++/* Parentage chain of this register (or stack slot) should take care of all +++ * issues like callee-saved registers, stack slot allocation time, etc. +++ */ +++static int mark_reg_read(struct bpf_verifier_env *env, +++ const struct bpf_reg_state *state, +++ struct bpf_reg_state *parent, u8 flag) +++{ +++ bool writes = parent == state->parent; /* Observe write marks */ +++ int cnt = 0; +++ +++ while (parent) { +++ /* if read wasn't screened by an earlier write ... */ +++ if (writes && state->live & REG_LIVE_WRITTEN) +++ break; +++ if (parent->live & REG_LIVE_DONE) { +++ verbose(env, "verifier BUG type %s var_off %lld off %d\n", +++ reg_type_str[parent->type], +++ parent->var_off.value, parent->off); +++ return -EFAULT; +++ } +++ /* The first condition is more likely to be true than the +++ * second, checked it first. 
+++ */ +++ if ((parent->live & REG_LIVE_READ) == flag || +++ parent->live & REG_LIVE_READ64) +++ /* The parentage chain never changes and +++ * this parent was already marked as LIVE_READ. +++ * There is no need to keep walking the chain again and +++ * keep re-marking all parents as LIVE_READ. +++ * This case happens when the same register is read +++ * multiple times without writes into it in-between. +++ * Also, if parent has the stronger REG_LIVE_READ64 set, +++ * then no need to set the weak REG_LIVE_READ32. +++ */ +++ break; +++ /* ... then we depend on parent's value */ +++ parent->live |= flag; +++ /* REG_LIVE_READ64 overrides REG_LIVE_READ32. */ +++ if (flag == REG_LIVE_READ64) +++ parent->live &= ~REG_LIVE_READ32; +++ state = parent; +++ parent = state->parent; +++ writes = true; +++ cnt++; +++ } +++ +++ if (env->longest_mark_read_walk < cnt) +++ env->longest_mark_read_walk = cnt; +++ return 0; +++} +++ +++/* This function is supposed to be used by the following 32-bit optimization +++ * code only. It returns TRUE if the source or destination register operates +++ * on 64-bit, otherwise return FALSE. +++ */ +++static bool is_reg64(struct bpf_verifier_env *env, struct bpf_insn *insn, +++ u32 regno, struct bpf_reg_state *reg, enum reg_arg_type t) +++{ +++ u8 code, class, op; +++ +++ code = insn->code; +++ class = BPF_CLASS(code); +++ op = BPF_OP(code); +++ if (class == BPF_JMP) { +++ /* BPF_EXIT for "main" will reach here. Return TRUE +++ * conservatively. +++ */ +++ if (op == BPF_EXIT) +++ return true; +++ if (op == BPF_CALL) { +++ /* BPF to BPF call will reach here because of marking +++ * caller saved clobber with DST_OP_NO_MARK for which we +++ * don't care the register def because they are anyway +++ * marked as NOT_INIT already. +++ */ +++ if (insn->src_reg == BPF_PSEUDO_CALL) +++ return false; +++ /* Helper call will reach here because of arg type +++ * check, conservatively return TRUE. +++ */ +++ if (t == SRC_OP) +++ return true; +++ +++ return false; +++ } +++ } +++ +++ if (class == BPF_ALU64 || class == BPF_JMP || +++ /* BPF_END always use BPF_ALU class. */ +++ (class == BPF_ALU && op == BPF_END && insn->imm == 64)) +++ return true; +++ +++ if (class == BPF_ALU || class == BPF_JMP32) +++ return false; +++ +++ if (class == BPF_LDX) { +++ if (t != SRC_OP) +++ return BPF_SIZE(code) == BPF_DW; +++ /* LDX source must be ptr. */ +++ return true; +++ } +++ +++ if (class == BPF_STX) { +++ if (reg->type != SCALAR_VALUE) +++ return true; +++ return BPF_SIZE(code) == BPF_DW; +++ } +++ +++ if (class == BPF_LD) { +++ u8 mode = BPF_MODE(code); +++ +++ /* LD_IMM64 */ +++ if (mode == BPF_IMM) +++ return true; +++ +++ /* Both LD_IND and LD_ABS return 32-bit data. */ +++ if (t != SRC_OP) +++ return false; +++ +++ /* Implicit ctx ptr. */ +++ if (regno == BPF_REG_6) +++ return true; +++ +++ /* Explicit source could be any width. */ +++ return true; +++ } +++ +++ if (class == BPF_ST) +++ /* The only source register for BPF_ST is a ptr. */ +++ return true; +++ +++ /* Conservatively return true at default. */ +++ return true; +++} +++ +++/* Return TRUE if INSN doesn't have explicit value define. */ +++static bool insn_no_def(struct bpf_insn *insn) +++{ +++ u8 class = BPF_CLASS(insn->code); +++ +++ return (class == BPF_JMP || class == BPF_JMP32 || +++ class == BPF_STX || class == BPF_ST); +++} +++ +++/* Return TRUE if INSN has defined any 32-bit value explicitly. 
*/ +++static bool insn_has_def32(struct bpf_verifier_env *env, struct bpf_insn *insn) +++{ +++ if (insn_no_def(insn)) +++ return false; +++ +++ return !is_reg64(env, insn, insn->dst_reg, NULL, DST_OP); +++} +++ +++static void mark_insn_zext(struct bpf_verifier_env *env, +++ struct bpf_reg_state *reg) +++{ +++ s32 def_idx = reg->subreg_def; +++ +++ if (def_idx == DEF_NOT_SUBREG) +++ return; +++ +++ env->insn_aux_data[def_idx - 1].zext_dst = true; +++ /* The dst will be zero extended, so won't be sub-register anymore. */ +++ reg->subreg_def = DEF_NOT_SUBREG; +++} +++ +++static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, ++ enum reg_arg_type t) ++ { +++ struct bpf_verifier_state *vstate = env->cur_state; +++ struct bpf_func_state *state = vstate->frame[vstate->curframe]; +++ struct bpf_insn *insn = env->prog->insnsi + env->insn_idx; +++ struct bpf_reg_state *reg, *regs = state->regs; +++ bool rw64; +++ ++ if (regno >= MAX_BPF_REG) { ++- verbose("R%d is invalid\n", regno); +++ verbose(env, "R%d is invalid\n", regno); ++ return -EINVAL; ++ } ++ +++ reg = ®s[regno]; +++ rw64 = is_reg64(env, insn, regno, reg, t); ++ if (t == SRC_OP) { ++ /* check whether register used as source operand can be read */ ++- if (regs[regno].type == NOT_INIT) { ++- verbose("R%d !read_ok\n", regno); +++ if (reg->type == NOT_INIT) { +++ verbose(env, "R%d !read_ok\n", regno); ++ return -EACCES; ++ } +++ /* We don't need to worry about FP liveness because it's read-only */ +++ if (regno == BPF_REG_FP) +++ return 0; +++ +++ if (rw64) +++ mark_insn_zext(env, reg); +++ +++ return mark_reg_read(env, reg, reg->parent, +++ rw64 ? REG_LIVE_READ64 : REG_LIVE_READ32); ++ } else { ++ /* check whether register used as dest operand can be written to */ ++ if (regno == BPF_REG_FP) { ++- verbose("frame pointer is read only\n"); +++ verbose(env, "frame pointer is read only\n"); ++ return -EACCES; ++ } +++ reg->live |= REG_LIVE_WRITTEN; +++ reg->subreg_def = rw64 ? DEF_NOT_SUBREG : env->insn_idx + 1; ++ if (t == DST_OP) ++- mark_reg_unknown_value(regs, regno); +++ mark_reg_unknown(env, regs, regno); ++ } ++ return 0; ++ } ++ ++-static int bpf_size_to_bytes(int bpf_size) +++/* for any branch, call, exit record the history of jmps in the given state */ +++static int push_jmp_history(struct bpf_verifier_env *env, +++ struct bpf_verifier_state *cur) ++ { ++- if (bpf_size == BPF_W) ++- return 4; ++- else if (bpf_size == BPF_H) ++- return 2; ++- else if (bpf_size == BPF_B) ++- return 1; ++- else if (bpf_size == BPF_DW) ++- return 8; ++- else ++- return -EINVAL; +++ u32 cnt = cur->jmp_history_cnt; +++ struct bpf_idx_pair *p; +++ +++ cnt++; +++ p = krealloc(cur->jmp_history, cnt * sizeof(*p), GFP_USER); +++ if (!p) +++ return -ENOMEM; +++ p[cnt - 1].idx = env->insn_idx; +++ p[cnt - 1].prev_idx = env->prev_insn_idx; +++ cur->jmp_history = p; +++ cur->jmp_history_cnt = cnt; +++ return 0; +++} +++ +++/* Backtrack one insn at a time. If idx is not at the top of recorded +++ * history then previous instruction came from straight line execution. +++ */ +++static int get_prev_insn_idx(struct bpf_verifier_state *st, int i, +++ u32 *history) +++{ +++ u32 cnt = *history; +++ +++ if (cnt && st->jmp_history[cnt - 1].idx == i) { +++ i = st->jmp_history[cnt - 1].prev_idx; +++ (*history)--; +++ } else { +++ i--; +++ } +++ return i; +++} +++ +++/* For given verifier state backtrack_insn() is called from the last insn to +++ * the first insn. 
Its purpose is to compute a bitmask of registers and +++ * stack slots that needs precision in the parent verifier state. +++ */ +++static int backtrack_insn(struct bpf_verifier_env *env, int idx, +++ u32 *reg_mask, u64 *stack_mask) +++{ +++ const struct bpf_insn_cbs cbs = { +++ .cb_print = verbose, +++ .private_data = env, +++ }; +++ struct bpf_insn *insn = env->prog->insnsi + idx; +++ u8 class = BPF_CLASS(insn->code); +++ u8 opcode = BPF_OP(insn->code); +++ u8 mode = BPF_MODE(insn->code); +++ u32 dreg = 1u << insn->dst_reg; +++ u32 sreg = 1u << insn->src_reg; +++ u32 spi; +++ +++ if (insn->code == 0) +++ return 0; +++ if (env->log.level & BPF_LOG_LEVEL) { +++ verbose(env, "regs=%x stack=%llx before ", *reg_mask, *stack_mask); +++ verbose(env, "%d: ", idx); +++ print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); +++ } +++ +++ if (class == BPF_ALU || class == BPF_ALU64) { +++ if (!(*reg_mask & dreg)) +++ return 0; +++ if (opcode == BPF_MOV) { +++ if (BPF_SRC(insn->code) == BPF_X) { +++ /* dreg = sreg +++ * dreg needs precision after this insn +++ * sreg needs precision before this insn +++ */ +++ *reg_mask &= ~dreg; +++ *reg_mask |= sreg; +++ } else { +++ /* dreg = K +++ * dreg needs precision after this insn. +++ * Corresponding register is already marked +++ * as precise=true in this verifier state. +++ * No further markings in parent are necessary +++ */ +++ *reg_mask &= ~dreg; +++ } +++ } else { +++ if (BPF_SRC(insn->code) == BPF_X) { +++ /* dreg += sreg +++ * both dreg and sreg need precision +++ * before this insn +++ */ +++ *reg_mask |= sreg; +++ } /* else dreg += K +++ * dreg still needs precision before this insn +++ */ +++ } +++ } else if (class == BPF_LDX) { +++ if (!(*reg_mask & dreg)) +++ return 0; +++ *reg_mask &= ~dreg; +++ +++ /* scalars can only be spilled into stack w/o losing precision. +++ * Load from any other memory can be zero extended. +++ * The desire to keep that precision is already indicated +++ * by 'precise' mark in corresponding register of this state. +++ * No further tracking necessary. +++ */ +++ if (insn->src_reg != BPF_REG_FP) +++ return 0; +++ if (BPF_SIZE(insn->code) != BPF_DW) +++ return 0; +++ +++ /* dreg = *(u64 *)[fp - off] was a fill from the stack. +++ * that [fp - off] slot contains scalar that needs to be +++ * tracked with precision +++ */ +++ spi = (-insn->off - 1) / BPF_REG_SIZE; +++ if (spi >= 64) { +++ verbose(env, "BUG spi %d\n", spi); +++ WARN_ONCE(1, "verifier backtracking bug"); +++ return -EFAULT; +++ } +++ *stack_mask |= 1ull << spi; +++ } else if (class == BPF_STX || class == BPF_ST) { +++ if (*reg_mask & dreg) +++ /* stx & st shouldn't be using _scalar_ dst_reg +++ * to access memory. It means backtracking +++ * encountered a case of pointer subtraction. 
+++ */ +++ return -ENOTSUPP; +++ /* scalars can only be spilled into stack */ +++ if (insn->dst_reg != BPF_REG_FP) +++ return 0; +++ if (BPF_SIZE(insn->code) != BPF_DW) +++ return 0; +++ spi = (-insn->off - 1) / BPF_REG_SIZE; +++ if (spi >= 64) { +++ verbose(env, "BUG spi %d\n", spi); +++ WARN_ONCE(1, "verifier backtracking bug"); +++ return -EFAULT; +++ } +++ if (!(*stack_mask & (1ull << spi))) +++ return 0; +++ *stack_mask &= ~(1ull << spi); +++ if (class == BPF_STX) +++ *reg_mask |= sreg; +++ } else if (class == BPF_JMP || class == BPF_JMP32) { +++ if (opcode == BPF_CALL) { +++ if (insn->src_reg == BPF_PSEUDO_CALL) +++ return -ENOTSUPP; +++ /* regular helper call sets R0 */ +++ *reg_mask &= ~1; +++ if (*reg_mask & 0x3f) { +++ /* if backtracing was looking for registers R1-R5 +++ * they should have been found already. +++ */ +++ verbose(env, "BUG regs %x\n", *reg_mask); +++ WARN_ONCE(1, "verifier backtracking bug"); +++ return -EFAULT; +++ } +++ } else if (opcode == BPF_EXIT) { +++ return -ENOTSUPP; +++ } +++ } else if (class == BPF_LD) { +++ if (!(*reg_mask & dreg)) +++ return 0; +++ *reg_mask &= ~dreg; +++ /* It's ld_imm64 or ld_abs or ld_ind. +++ * For ld_imm64 no further tracking of precision +++ * into parent is necessary +++ */ +++ if (mode == BPF_IND || mode == BPF_ABS) +++ /* to be analyzed */ +++ return -ENOTSUPP; +++ } +++ return 0; +++} +++ +++/* the scalar precision tracking algorithm: +++ * . at the start all registers have precise=false. +++ * . scalar ranges are tracked as normal through alu and jmp insns. +++ * . once precise value of the scalar register is used in: +++ * . ptr + scalar alu +++ * . if (scalar cond K|scalar) +++ * . helper_call(.., scalar, ...) where ARG_CONST is expected +++ * backtrack through the verifier states and mark all registers and +++ * stack slots with spilled constants that these scalar regisers +++ * should be precise. +++ * . during state pruning two registers (or spilled stack slots) +++ * are equivalent if both are not precise. +++ * +++ * Note the verifier cannot simply walk register parentage chain, +++ * since many different registers and stack slots could have been +++ * used to compute single precise scalar. +++ * +++ * The approach of starting with precise=true for all registers and then +++ * backtrack to mark a register as not precise when the verifier detects +++ * that program doesn't care about specific value (e.g., when helper +++ * takes register as ARG_ANYTHING parameter) is not safe. +++ * +++ * It's ok to walk single parentage chain of the verifier states. +++ * It's possible that this backtracking will go all the way till 1st insn. +++ * All other branches will be explored for needing precision later. +++ * +++ * The backtracking needs to deal with cases like: +++ * R8=map_value(id=0,off=0,ks=4,vs=1952,imm=0) R9_w=map_value(id=0,off=40,ks=4,vs=1952,imm=0) +++ * r9 -= r8 +++ * r5 = r9 +++ * if r5 > 0x79f goto pc+7 +++ * R5_w=inv(id=0,umax_value=1951,var_off=(0x0; 0x7ff)) +++ * r5 += 1 +++ * ... +++ * call bpf_perf_event_output#25 +++ * where .arg5_type = ARG_CONST_SIZE_OR_ZERO +++ * +++ * and this case: +++ * r6 = 1 +++ * call foo // uses callee's r6 inside to compute r0 +++ * r0 += r6 +++ * if r0 == 0 goto +++ * +++ * to track above reg_mask/stack_mask needs to be independent for each frame. +++ * +++ * Also if parent's curframe > frame where backtracking started, +++ * the verifier need to mark registers in both frames, otherwise callees +++ * may incorrectly prune callers. 
This is similar to +++ * commit 7640ead93924 ("bpf: verifier: make sure callees don't prune with caller differences") +++ * +++ * For now backtracking falls back into conservative marking. +++ */ +++static void mark_all_scalars_precise(struct bpf_verifier_env *env, +++ struct bpf_verifier_state *st) +++{ +++ struct bpf_func_state *func; +++ struct bpf_reg_state *reg; +++ int i, j; +++ +++ /* big hammer: mark all scalars precise in this path. +++ * pop_stack may still get !precise scalars. +++ */ +++ for (; st; st = st->parent) +++ for (i = 0; i <= st->curframe; i++) { +++ func = st->frame[i]; +++ for (j = 0; j < BPF_REG_FP; j++) { +++ reg = &func->regs[j]; +++ if (reg->type != SCALAR_VALUE) +++ continue; +++ reg->precise = true; +++ } +++ for (j = 0; j < func->allocated_stack / BPF_REG_SIZE; j++) { +++ if (func->stack[j].slot_type[0] != STACK_SPILL) +++ continue; +++ reg = &func->stack[j].spilled_ptr; +++ if (reg->type != SCALAR_VALUE) +++ continue; +++ reg->precise = true; +++ } +++ } +++} +++ +++static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, +++ int spi) +++{ +++ struct bpf_verifier_state *st = env->cur_state; +++ int first_idx = st->first_insn_idx; +++ int last_idx = env->insn_idx; +++ struct bpf_func_state *func; +++ struct bpf_reg_state *reg; +++ u32 reg_mask = regno >= 0 ? 1u << regno : 0; +++ u64 stack_mask = spi >= 0 ? 1ull << spi : 0; +++ bool skip_first = true; +++ bool new_marks = false; +++ int i, err; +++ +++ if (!env->allow_ptr_leaks) +++ /* backtracking is root only for now */ +++ return 0; +++ +++ func = st->frame[st->curframe]; +++ if (regno >= 0) { +++ reg = &func->regs[regno]; +++ if (reg->type != SCALAR_VALUE) { +++ WARN_ONCE(1, "backtracing misuse"); +++ return -EFAULT; +++ } +++ if (!reg->precise) +++ new_marks = true; +++ else +++ reg_mask = 0; +++ reg->precise = true; +++ } +++ +++ while (spi >= 0) { +++ if (func->stack[spi].slot_type[0] != STACK_SPILL) { +++ stack_mask = 0; +++ break; +++ } +++ reg = &func->stack[spi].spilled_ptr; +++ if (reg->type != SCALAR_VALUE) { +++ stack_mask = 0; +++ break; +++ } +++ if (!reg->precise) +++ new_marks = true; +++ else +++ stack_mask = 0; +++ reg->precise = true; +++ break; +++ } +++ +++ if (!new_marks) +++ return 0; +++ if (!reg_mask && !stack_mask) +++ return 0; +++ for (;;) { +++ DECLARE_BITMAP(mask, 64); +++ u32 history = st->jmp_history_cnt; +++ +++ if (env->log.level & BPF_LOG_LEVEL) +++ verbose(env, "last_idx %d first_idx %d\n", last_idx, first_idx); +++ for (i = last_idx;;) { +++ if (skip_first) { +++ err = 0; +++ skip_first = false; +++ } else { +++ err = backtrack_insn(env, i, ®_mask, &stack_mask); +++ } +++ if (err == -ENOTSUPP) { +++ mark_all_scalars_precise(env, st); +++ return 0; +++ } else if (err) { +++ return err; +++ } +++ if (!reg_mask && !stack_mask) +++ /* Found assignment(s) into tracked register in this state. +++ * Since this state is already marked, just return. +++ * Nothing to be tracked further in the parent state. +++ */ +++ return 0; +++ if (i == first_idx) +++ break; +++ i = get_prev_insn_idx(st, i, &history); +++ if (i >= env->prog->len) { +++ /* This can happen if backtracking reached insn 0 +++ * and there are still reg_mask or stack_mask +++ * to backtrack. +++ * It means the backtracking missed the spot where +++ * particular register was initialized with a constant. 
+++ */ +++ verbose(env, "BUG backtracking idx %d\n", i); +++ WARN_ONCE(1, "verifier backtracking bug"); +++ return -EFAULT; +++ } +++ } +++ st = st->parent; +++ if (!st) +++ break; +++ +++ new_marks = false; +++ func = st->frame[st->curframe]; +++ bitmap_from_u64(mask, reg_mask); +++ for_each_set_bit(i, mask, 32) { +++ reg = &func->regs[i]; +++ if (reg->type != SCALAR_VALUE) { +++ reg_mask &= ~(1u << i); +++ continue; +++ } +++ if (!reg->precise) +++ new_marks = true; +++ reg->precise = true; +++ } +++ +++ bitmap_from_u64(mask, stack_mask); +++ for_each_set_bit(i, mask, 64) { +++ if (i >= func->allocated_stack / BPF_REG_SIZE) { +++ /* the sequence of instructions: +++ * 2: (bf) r3 = r10 +++ * 3: (7b) *(u64 *)(r3 -8) = r0 +++ * 4: (79) r4 = *(u64 *)(r10 -8) +++ * doesn't contain jmps. It's backtracked +++ * as a single block. +++ * During backtracking insn 3 is not recognized as +++ * stack access, so at the end of backtracking +++ * stack slot fp-8 is still marked in stack_mask. +++ * However the parent state may not have accessed +++ * fp-8 and it's "unallocated" stack space. +++ * In such case fallback to conservative. +++ */ +++ mark_all_scalars_precise(env, st); +++ return 0; +++ } +++ +++ if (func->stack[i].slot_type[0] != STACK_SPILL) { +++ stack_mask &= ~(1ull << i); +++ continue; +++ } +++ reg = &func->stack[i].spilled_ptr; +++ if (reg->type != SCALAR_VALUE) { +++ stack_mask &= ~(1ull << i); +++ continue; +++ } +++ if (!reg->precise) +++ new_marks = true; +++ reg->precise = true; +++ } +++ if (env->log.level & BPF_LOG_LEVEL) { +++ print_verifier_state(env, func); +++ verbose(env, "parent %s regs=%x stack=%llx marks\n", +++ new_marks ? "didn't have" : "already had", +++ reg_mask, stack_mask); +++ } +++ +++ if (!reg_mask && !stack_mask) +++ break; +++ if (!new_marks) +++ break; +++ +++ last_idx = st->last_insn_idx; +++ first_idx = st->first_insn_idx; +++ } +++ return 0; +++} +++ +++static int mark_chain_precision(struct bpf_verifier_env *env, int regno) +++{ +++ return __mark_chain_precision(env, regno, -1); +++} +++ +++static int mark_chain_precision_stack(struct bpf_verifier_env *env, int spi) +++{ +++ return __mark_chain_precision(env, -1, spi); ++ } ++ ++ static bool is_spillable_regtype(enum bpf_reg_type type) ++@@ -549,129 +1842,932 @@ static bool is_spillable_regtype(enum bp ++ case PTR_TO_MAP_VALUE_OR_NULL: ++ case PTR_TO_STACK: ++ case PTR_TO_CTX: ++- case FRAME_PTR: +++ case PTR_TO_PACKET: +++ case PTR_TO_PACKET_META: +++ case PTR_TO_PACKET_END: +++ case PTR_TO_FLOW_KEYS: ++ case CONST_PTR_TO_MAP: +++ case PTR_TO_SOCKET: +++ case PTR_TO_SOCKET_OR_NULL: +++ case PTR_TO_SOCK_COMMON: +++ case PTR_TO_SOCK_COMMON_OR_NULL: +++ case PTR_TO_TCP_SOCK: +++ case PTR_TO_TCP_SOCK_OR_NULL: +++ case PTR_TO_XDP_SOCK: ++ return true; ++ default: ++ return false; ++ } ++ } ++ +++/* Does this register contain a constant zero? 
*/ +++static bool register_is_null(struct bpf_reg_state *reg) +++{ +++ return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); +++} +++ +++static bool register_is_const(struct bpf_reg_state *reg) +++{ +++ return reg->type == SCALAR_VALUE && tnum_is_const(reg->var_off); +++} +++ +++static bool __is_pointer_value(bool allow_ptr_leaks, +++ const struct bpf_reg_state *reg) +++{ +++ if (allow_ptr_leaks) +++ return false; +++ +++ return reg->type != SCALAR_VALUE; +++} +++ +++static void save_register_state(struct bpf_func_state *state, +++ int spi, struct bpf_reg_state *reg) +++{ +++ int i; +++ +++ state->stack[spi].spilled_ptr = *reg; +++ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; +++ +++ for (i = 0; i < BPF_REG_SIZE; i++) +++ state->stack[spi].slot_type[i] = STACK_SPILL; +++} +++ ++ /* check_stack_read/write functions track spill/fill of registers, ++ * stack boundary and alignment are checked in check_mem_access() ++ */ ++-static int check_stack_write(struct verifier_state *state, int off, int size, ++- int value_regno) +++static int check_stack_write(struct bpf_verifier_env *env, +++ struct bpf_func_state *state, /* func where register points to */ +++ int off, int size, int value_regno, int insn_idx) ++ { ++- int i; +++ struct bpf_func_state *cur; /* state of the current function */ +++ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; +++ u32 dst_reg = env->prog->insnsi[insn_idx].dst_reg; +++ struct bpf_reg_state *reg = NULL; +++ +++ err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), +++ state->acquired_refs, true); +++ if (err) +++ return err; ++ /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, ++ * so it's aligned access and [off, off + size) are within stack limits ++ */ +++ if (!env->allow_ptr_leaks && +++ state->stack[spi].slot_type[0] == STACK_SPILL && +++ size != BPF_REG_SIZE) { +++ verbose(env, "attempt to corrupt spilled pointer on stack\n"); +++ return -EACCES; +++ } ++ ++- if (value_regno >= 0 && ++- is_spillable_regtype(state->regs[value_regno].type)) { ++- +++ cur = env->cur_state->frame[env->cur_state->curframe]; +++ if (value_regno >= 0) +++ reg = &cur->regs[value_regno]; +++ +++ if (reg && size == BPF_REG_SIZE && register_is_const(reg) && +++ !register_is_null(reg) && env->allow_ptr_leaks) { +++ if (dst_reg != BPF_REG_FP) { +++ /* The backtracking logic can only recognize explicit +++ * stack slot address like [fp - 8]. Other spill of +++ * scalar via different register has to be conervative. +++ * Backtrack from here and mark all registers as precise +++ * that contributed into 'reg' being a constant. 
+++ */ +++ err = mark_chain_precision(env, value_regno); +++ if (err) +++ return err; +++ } +++ save_register_state(state, spi, reg); +++ } else if (reg && is_spillable_regtype(reg->type)) { ++ /* register containing pointer is being spilled into stack */ ++ if (size != BPF_REG_SIZE) { ++- verbose("invalid size of register spill\n"); +++ verbose_linfo(env, insn_idx, "; "); +++ verbose(env, "invalid size of register spill\n"); ++ return -EACCES; ++ } ++ ++- /* save register state */ ++- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = ++- state->regs[value_regno]; +++ if (state != cur && reg->type == PTR_TO_STACK) { +++ verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); +++ return -EINVAL; +++ } ++ ++- for (i = 0; i < BPF_REG_SIZE; i++) ++- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; +++ if (!env->allow_ptr_leaks) { +++ bool sanitize = false; +++ +++ if (state->stack[spi].slot_type[0] == STACK_SPILL && +++ register_is_const(&state->stack[spi].spilled_ptr)) +++ sanitize = true; +++ for (i = 0; i < BPF_REG_SIZE; i++) +++ if (state->stack[spi].slot_type[i] == STACK_MISC) { +++ sanitize = true; +++ break; +++ } +++ if (sanitize) { +++ int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; +++ int soff = (-spi - 1) * BPF_REG_SIZE; +++ +++ /* detected reuse of integer stack slot with a pointer +++ * which means either llvm is reusing stack slot or +++ * an attacker is trying to exploit CVE-2018-3639 +++ * (speculative store bypass) +++ * Have to sanitize that slot with preemptive +++ * store of zero. +++ */ +++ if (*poff && *poff != soff) { +++ /* disallow programs where single insn stores +++ * into two different stack slots, since verifier +++ * cannot sanitize them +++ */ +++ verbose(env, +++ "insn %d cannot access two stack slots fp%d and fp%d", +++ insn_idx, *poff, soff); +++ return -EINVAL; +++ } +++ *poff = soff; +++ } +++ } +++ save_register_state(state, spi, reg); ++ } else { ++- /* regular write of data into stack */ ++- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE] = ++- (struct reg_state) {}; +++ u8 type = STACK_MISC; +++ +++ /* regular write of data into stack destroys any spilled ptr */ +++ state->stack[spi].spilled_ptr.type = NOT_INIT; +++ /* Mark slots as STACK_MISC if they belonged to spilled ptr. */ +++ if (state->stack[spi].slot_type[0] == STACK_SPILL) +++ for (i = 0; i < BPF_REG_SIZE; i++) +++ state->stack[spi].slot_type[i] = STACK_MISC; +++ +++ /* only mark the slot as written if all 8 bytes were written +++ * otherwise read propagation may incorrectly stop too soon +++ * when stack slots are partially written. +++ * This heuristic means that read propagation will be +++ * conservative, since it will add reg_live_read marks +++ * to stack slots all the way to first state when programs +++ * writes+reads less than 8 bytes +++ */ +++ if (size == BPF_REG_SIZE) +++ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; +++ +++ /* when we zero initialize stack slots mark them as such */ +++ if (reg && register_is_null(reg)) { +++ /* backtracking doesn't work for STACK_ZERO yet. */ +++ err = mark_chain_precision(env, value_regno); +++ if (err) +++ return err; +++ type = STACK_ZERO; +++ } ++ +++ /* Mark slots affected by this stack write. 
*/ ++ for (i = 0; i < size; i++) ++- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; +++ state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = +++ type; ++ } ++ return 0; ++ } ++ ++-static int check_stack_read(struct verifier_state *state, int off, int size, ++- int value_regno) +++static int check_stack_read(struct bpf_verifier_env *env, +++ struct bpf_func_state *reg_state /* func where register points to */, +++ int off, int size, int value_regno) ++ { ++- u8 *slot_type; ++- int i; ++- ++- slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; +++ struct bpf_verifier_state *vstate = env->cur_state; +++ struct bpf_func_state *state = vstate->frame[vstate->curframe]; +++ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; +++ struct bpf_reg_state *reg; +++ u8 *stype; +++ +++ if (reg_state->allocated_stack <= slot) { +++ verbose(env, "invalid read from stack off %d+0 size %d\n", +++ off, size); +++ return -EACCES; +++ } +++ stype = reg_state->stack[spi].slot_type; +++ reg = ®_state->stack[spi].spilled_ptr; ++ ++- if (slot_type[0] == STACK_SPILL) { +++ if (stype[0] == STACK_SPILL) { ++ if (size != BPF_REG_SIZE) { ++- verbose("invalid size of register spill\n"); ++- return -EACCES; +++ if (reg->type != SCALAR_VALUE) { +++ verbose_linfo(env, env->insn_idx, "; "); +++ verbose(env, "invalid size of register fill\n"); +++ return -EACCES; +++ } +++ if (value_regno >= 0) { +++ mark_reg_unknown(env, state->regs, value_regno); +++ state->regs[value_regno].live |= REG_LIVE_WRITTEN; +++ } +++ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); +++ return 0; ++ } ++ for (i = 1; i < BPF_REG_SIZE; i++) { ++- if (slot_type[i] != STACK_SPILL) { ++- verbose("corrupted spill memory\n"); +++ if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { +++ verbose(env, "corrupted spill memory\n"); ++ return -EACCES; ++ } ++ } ++ ++- if (value_regno >= 0) +++ if (value_regno >= 0) { ++ /* restore register state from stack */ ++- state->regs[value_regno] = ++- state->spilled_regs[(MAX_BPF_STACK + off) / BPF_REG_SIZE]; ++- return 0; +++ state->regs[value_regno] = *reg; +++ /* mark reg as written since spilled pointer state likely +++ * has its liveness marks cleared by is_state_visited() +++ * which resets stack/reg liveness for state transitions +++ */ +++ state->regs[value_regno].live |= REG_LIVE_WRITTEN; +++ } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { +++ /* If value_regno==-1, the caller is asking us whether +++ * it is acceptable to use this value as a SCALAR_VALUE +++ * (e.g. for XADD). +++ * We must not allow unprivileged callers to do that +++ * with spilled pointers. 
+++ */ +++ verbose(env, "leaking pointer from stack off %d\n", +++ off); +++ return -EACCES; +++ } +++ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); ++ } else { +++ int zeros = 0; +++ ++ for (i = 0; i < size; i++) { ++- if (slot_type[i] != STACK_MISC) { ++- verbose("invalid read from stack off %d+%d size %d\n", ++- off, i, size); ++- return -EACCES; +++ if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) +++ continue; +++ if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { +++ zeros++; +++ continue; ++ } +++ verbose(env, "invalid read from stack off %d+%d size %d\n", +++ off, i, size); +++ return -EACCES; +++ } +++ mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); +++ if (value_regno >= 0) { +++ if (zeros == size) { +++ /* any size read into register is zero extended, +++ * so the whole register == const_zero +++ */ +++ __mark_reg_const_zero(&state->regs[value_regno]); +++ /* backtracking doesn't support STACK_ZERO yet, +++ * so mark it precise here, so that later +++ * backtracking can stop here. +++ * Backtracking may not need this if this register +++ * doesn't participate in pointer adjustment. +++ * Forward propagation of precise flag is not +++ * necessary either. This mark is only to stop +++ * backtracking. Any register that contributed +++ * to const 0 was marked precise before spill. +++ */ +++ state->regs[value_regno].precise = true; +++ } else { +++ /* have read misc data from the stack */ +++ mark_reg_unknown(env, state->regs, value_regno); +++ } +++ state->regs[value_regno].live |= REG_LIVE_WRITTEN; ++ } ++- if (value_regno >= 0) ++- /* have read misc data from the stack */ ++- mark_reg_unknown_value(state->regs, value_regno); ++- return 0; ++ } +++ return 0; +++} +++ +++static int check_stack_access(struct bpf_verifier_env *env, +++ const struct bpf_reg_state *reg, +++ int off, int size) +++{ +++ /* Stack accesses must be at a fixed offset, so that we +++ * can determine what type of data were returned. See +++ * check_stack_read(). 
+++ */ +++ if (!tnum_is_const(reg->var_off)) { +++ char tn_buf[48]; +++ +++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +++ verbose(env, "variable stack access var_off=%s off=%d size=%d\n", +++ tn_buf, off, size); +++ return -EACCES; +++ } +++ +++ if (off >= 0 || off < -MAX_BPF_STACK) { +++ verbose(env, "invalid stack off=%d size=%d\n", off, size); +++ return -EACCES; +++ } +++ +++ return 0; +++} +++ +++static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, +++ int off, int size, enum bpf_access_type type) +++{ +++ struct bpf_reg_state *regs = cur_regs(env); +++ struct bpf_map *map = regs[regno].map_ptr; +++ u32 cap = bpf_map_flags_to_cap(map); +++ +++ if (type == BPF_WRITE && !(cap & BPF_MAP_CAN_WRITE)) { +++ verbose(env, "write into map forbidden, value_size=%d off=%d size=%d\n", +++ map->value_size, off, size); +++ return -EACCES; +++ } +++ +++ if (type == BPF_READ && !(cap & BPF_MAP_CAN_READ)) { +++ verbose(env, "read from map forbidden, value_size=%d off=%d size=%d\n", +++ map->value_size, off, size); +++ return -EACCES; +++ } +++ +++ return 0; ++ } ++ ++ /* check read/write into map element returned by bpf_map_lookup_elem() */ ++-static int check_map_access(struct verifier_env *env, u32 regno, int off, ++- int size) +++static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, +++ int size, bool zero_size_allowed) ++ { ++- struct bpf_map *map = env->cur_state.regs[regno].map_ptr; +++ struct bpf_reg_state *regs = cur_regs(env); +++ struct bpf_map *map = regs[regno].map_ptr; ++ ++- if (off < 0 || off + size > map->value_size) { ++- verbose("invalid access to map value, value_size=%d off=%d size=%d\n", +++ if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || +++ off + size > map->value_size) { +++ verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n", ++ map->value_size, off, size); ++ return -EACCES; ++ } ++ return 0; ++ } ++ ++-/* check access to 'struct bpf_context' fields */ ++-static int check_ctx_access(struct verifier_env *env, int off, int size, ++- enum bpf_access_type t) +++/* check read/write into a map element with possible variable offset */ +++static int check_map_access(struct bpf_verifier_env *env, u32 regno, +++ int off, int size, bool zero_size_allowed) +++{ +++ struct bpf_verifier_state *vstate = env->cur_state; +++ struct bpf_func_state *state = vstate->frame[vstate->curframe]; +++ struct bpf_reg_state *reg = &state->regs[regno]; +++ int err; +++ +++ /* We may have adjusted the register to this map value, so we +++ * need to try adding each of min_value and max_value to off +++ * to make sure our theoretical access will be safe. +++ */ +++ if (env->log.level & BPF_LOG_LEVEL) +++ print_verifier_state(env, state); +++ +++ /* The minimum value is only important with signed +++ * comparisons where we can't assume the floor of a +++ * value is 0. If we are using signed variables for our +++ * index'es we need to make sure that whatever we use +++ * will have a set floor within our range. 
+++ */ +++ if (reg->smin_value < 0 && +++ (reg->smin_value == S64_MIN || +++ (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || +++ reg->smin_value + off < 0)) { +++ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", +++ regno); +++ return -EACCES; +++ } +++ err = __check_map_access(env, regno, reg->smin_value + off, size, +++ zero_size_allowed); +++ if (err) { +++ verbose(env, "R%d min value is outside of the array range\n", +++ regno); +++ return err; +++ } +++ +++ /* If we haven't set a max value then we need to bail since we can't be +++ * sure we won't do bad things. +++ * If reg->umax_value + off could overflow, treat that as unbounded too. +++ */ +++ if (reg->umax_value >= BPF_MAX_VAR_OFF) { +++ verbose(env, "R%d unbounded memory access, make sure to bounds check any array access into a map\n", +++ regno); +++ return -EACCES; +++ } +++ err = __check_map_access(env, regno, reg->umax_value + off, size, +++ zero_size_allowed); +++ if (err) +++ verbose(env, "R%d max value is outside of the array range\n", +++ regno); +++ +++ if (map_value_has_spin_lock(reg->map_ptr)) { +++ u32 lock = reg->map_ptr->spin_lock_off; +++ +++ /* if any part of struct bpf_spin_lock can be touched by +++ * load/store reject this program. +++ * To check that [x1, x2) overlaps with [y1, y2) +++ * it is sufficient to check x1 < y2 && y1 < x2. +++ */ +++ if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && +++ lock < reg->umax_value + off + size) { +++ verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); +++ return -EACCES; +++ } +++ } +++ return err; +++} +++ +++#define MAX_PACKET_OFF 0xffff +++ +++static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, +++ const struct bpf_call_arg_meta *meta, +++ enum bpf_access_type t) +++{ +++ switch (env->prog->type) { +++ /* Program types only with direct read access go here! */ +++ case BPF_PROG_TYPE_LWT_IN: +++ case BPF_PROG_TYPE_LWT_OUT: +++ case BPF_PROG_TYPE_LWT_SEG6LOCAL: +++ case BPF_PROG_TYPE_SK_REUSEPORT: +++ case BPF_PROG_TYPE_FLOW_DISSECTOR: +++ case BPF_PROG_TYPE_CGROUP_SKB: +++ if (t == BPF_WRITE) +++ return false; +++ /* fallthrough */ +++ +++ /* Program types with direct read + write access go here! 
*/ +++ case BPF_PROG_TYPE_SCHED_CLS: +++ case BPF_PROG_TYPE_SCHED_ACT: +++ case BPF_PROG_TYPE_XDP: +++ case BPF_PROG_TYPE_LWT_XMIT: +++ case BPF_PROG_TYPE_SK_SKB: +++ case BPF_PROG_TYPE_SK_MSG: +++ if (meta) +++ return meta->pkt_access; +++ +++ env->seen_direct_write = true; +++ return true; +++ +++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: +++ if (t == BPF_WRITE) +++ env->seen_direct_write = true; +++ +++ return true; +++ +++ default: +++ return false; +++ } +++} +++ +++static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, +++ int off, int size, bool zero_size_allowed) +++{ +++ struct bpf_reg_state *regs = cur_regs(env); +++ struct bpf_reg_state *reg = ®s[regno]; +++ +++ if (off < 0 || size < 0 || (size == 0 && !zero_size_allowed) || +++ (u64)off + size > reg->range) { +++ verbose(env, "invalid access to packet, off=%d size=%d, R%d(id=%d,off=%d,r=%d)\n", +++ off, size, regno, reg->id, reg->off, reg->range); +++ return -EACCES; +++ } +++ return 0; +++} +++ +++static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, +++ int size, bool zero_size_allowed) +++{ +++ struct bpf_reg_state *regs = cur_regs(env); +++ struct bpf_reg_state *reg = ®s[regno]; +++ int err; +++ +++ /* We may have added a variable offset to the packet pointer; but any +++ * reg->range we have comes after that. We are only checking the fixed +++ * offset. +++ */ +++ +++ /* We don't allow negative numbers, because we aren't tracking enough +++ * detail to prove they're safe. +++ */ +++ if (reg->smin_value < 0) { +++ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", +++ regno); +++ return -EACCES; +++ } +++ err = __check_packet_access(env, regno, off, size, zero_size_allowed); +++ if (err) { +++ verbose(env, "R%d offset is outside of the packet\n", regno); +++ return err; +++ } +++ +++ /* __check_packet_access has made sure "off + size - 1" is within u16. +++ * reg->umax_value can't be bigger than MAX_PACKET_OFF which is 0xffff, +++ * otherwise find_good_pkt_pointers would have refused to set range info +++ * that __check_packet_access would have rejected this pkt access. +++ * Therefore, "off + reg->umax_value + size - 1" won't overflow u32. +++ */ +++ env->prog->aux->max_pkt_offset = +++ max_t(u32, env->prog->aux->max_pkt_offset, +++ off + reg->umax_value + size - 1); +++ +++ return err; +++} +++ +++/* check access to 'struct bpf_context' fields. Supports fixed offsets only */ +++static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, +++ enum bpf_access_type t, enum bpf_reg_type *reg_type) ++ { ++- if (env->prog->aux->ops->is_valid_access && ++- env->prog->aux->ops->is_valid_access(off, size, t)) +++ struct bpf_insn_access_aux info = { +++ .reg_type = *reg_type, +++ }; +++ +++ if (env->ops->is_valid_access && +++ env->ops->is_valid_access(off, size, t, env->prog, &info)) { +++ /* A non zero info.ctx_field_size indicates that this field is a +++ * candidate for later verifier transformation to load the whole +++ * field and then apply a mask when accessed with a narrower +++ * access than actual ctx access size. A zero info.ctx_field_size +++ * will only allow for whole field access and rejects any other +++ * type of narrower access. 
+++ */ +++ *reg_type = info.reg_type; +++ +++ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; +++ /* remember the offset of last byte accessed in ctx */ +++ if (env->prog->aux->max_ctx_offset < off + size) +++ env->prog->aux->max_ctx_offset = off + size; ++ return 0; +++ } ++ ++- verbose("invalid bpf_context access off=%d size=%d\n", off, size); +++ verbose(env, "invalid bpf_context access off=%d size=%d\n", off, size); ++ return -EACCES; ++ } ++ ++-static bool is_pointer_value(struct verifier_env *env, int regno) +++static int check_flow_keys_access(struct bpf_verifier_env *env, int off, +++ int size) ++ { ++- if (env->allow_ptr_leaks) ++- return false; +++ if (size < 0 || off < 0 || +++ (u64)off + size > sizeof(struct bpf_flow_keys)) { +++ verbose(env, "invalid access to flow keys off=%d size=%d\n", +++ off, size); +++ return -EACCES; +++ } +++ return 0; +++} ++ ++- switch (env->cur_state.regs[regno].type) { ++- case UNKNOWN_VALUE: ++- case CONST_IMM: ++- return false; +++static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, +++ u32 regno, int off, int size, +++ enum bpf_access_type t) +++{ +++ struct bpf_reg_state *regs = cur_regs(env); +++ struct bpf_reg_state *reg = ®s[regno]; +++ struct bpf_insn_access_aux info = {}; +++ bool valid; +++ +++ if (reg->smin_value < 0) { +++ verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", +++ regno); +++ return -EACCES; +++ } +++ +++ switch (reg->type) { +++ case PTR_TO_SOCK_COMMON: +++ valid = bpf_sock_common_is_valid_access(off, size, t, &info); +++ break; ++ default: ++- return true; +++ valid = false; +++ } +++ +++ +++ if (valid) { +++ env->insn_aux_data[insn_idx].ctx_field_size = +++ info.ctx_field_size; +++ return 0; +++ } +++ +++ verbose(env, "R%d invalid %s access off=%d size=%d\n", +++ regno, reg_type_str[reg->type], off, size); +++ +++ return -EACCES; +++} +++ +++static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) +++{ +++ return cur_regs(env) + regno; +++} +++ +++static bool is_pointer_value(struct bpf_verifier_env *env, int regno) +++{ +++ return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno)); +++} +++ +++static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) +++{ +++ const struct bpf_reg_state *reg = reg_state(env, regno); +++ +++ return reg->type == PTR_TO_CTX; +++} +++ +++static bool is_sk_reg(struct bpf_verifier_env *env, int regno) +++{ +++ const struct bpf_reg_state *reg = reg_state(env, regno); +++ +++ return type_is_sk_pointer(reg->type); +++} +++ +++static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) +++{ +++ const struct bpf_reg_state *reg = reg_state(env, regno); +++ +++ return type_is_pkt_pointer(reg->type); +++} +++ +++static bool is_flow_key_reg(struct bpf_verifier_env *env, int regno) +++{ +++ const struct bpf_reg_state *reg = reg_state(env, regno); +++ +++ /* Separate to is_ctx_reg() since we still want to allow BPF_ST here. */ +++ return reg->type == PTR_TO_FLOW_KEYS; +++} +++ +++static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, +++ const struct bpf_reg_state *reg, +++ int off, int size, bool strict) +++{ +++ struct tnum reg_off; +++ int ip_align; +++ +++ /* Byte size accesses are always allowed. */ +++ if (!strict || size == 1) +++ return 0; +++ +++ /* For platforms that do not have a Kconfig enabling +++ * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS the value of +++ * NET_IP_ALIGN is universally set to '2'. 
And on platforms +++ * that do set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS, we get +++ * to this code only in strict mode where we want to emulate +++ * the NET_IP_ALIGN==2 checking. Therefore use an +++ * unconditional IP align value of '2'. +++ */ +++ ip_align = 2; +++ +++ reg_off = tnum_add(reg->var_off, tnum_const(ip_align + reg->off + off)); +++ if (!tnum_is_aligned(reg_off, size)) { +++ char tn_buf[48]; +++ +++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +++ verbose(env, +++ "misaligned packet access off %d+%s+%d+%d size %d\n", +++ ip_align, tn_buf, reg->off, off, size); +++ return -EACCES; +++ } +++ +++ return 0; +++} +++ +++static int check_generic_ptr_alignment(struct bpf_verifier_env *env, +++ const struct bpf_reg_state *reg, +++ const char *pointer_desc, +++ int off, int size, bool strict) +++{ +++ struct tnum reg_off; +++ +++ /* Byte size accesses are always allowed. */ +++ if (!strict || size == 1) +++ return 0; +++ +++ reg_off = tnum_add(reg->var_off, tnum_const(reg->off + off)); +++ if (!tnum_is_aligned(reg_off, size)) { +++ char tn_buf[48]; +++ +++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +++ verbose(env, "misaligned %saccess off %s+%d+%d size %d\n", +++ pointer_desc, tn_buf, reg->off, off, size); +++ return -EACCES; +++ } +++ +++ return 0; +++} +++ +++static int check_ptr_alignment(struct bpf_verifier_env *env, +++ const struct bpf_reg_state *reg, int off, +++ int size, bool strict_alignment_once) +++{ +++ bool strict = env->strict_alignment || strict_alignment_once; +++ const char *pointer_desc = ""; +++ +++ switch (reg->type) { +++ case PTR_TO_PACKET: +++ case PTR_TO_PACKET_META: +++ /* Special case, because of NET_IP_ALIGN. Given metadata sits +++ * right in front, treat it the very same way. +++ */ +++ return check_pkt_ptr_alignment(env, reg, off, size, strict); +++ case PTR_TO_FLOW_KEYS: +++ pointer_desc = "flow keys "; +++ break; +++ case PTR_TO_MAP_VALUE: +++ pointer_desc = "value "; +++ break; +++ case PTR_TO_CTX: +++ pointer_desc = "context "; +++ break; +++ case PTR_TO_STACK: +++ pointer_desc = "stack "; +++ /* The stack spill tracking logic in check_stack_write() +++ * and check_stack_read() relies on stack accesses being +++ * aligned. +++ */ +++ strict = true; +++ break; +++ case PTR_TO_SOCKET: +++ pointer_desc = "sock "; +++ break; +++ case PTR_TO_SOCK_COMMON: +++ pointer_desc = "sock_common "; +++ break; +++ case PTR_TO_TCP_SOCK: +++ pointer_desc = "tcp_sock "; +++ break; +++ case PTR_TO_XDP_SOCK: +++ pointer_desc = "xdp_sock "; +++ break; +++ default: +++ break; +++ } +++ return check_generic_ptr_alignment(env, reg, pointer_desc, off, size, +++ strict); +++} +++ +++static int update_stack_depth(struct bpf_verifier_env *env, +++ const struct bpf_func_state *func, +++ int off) +++{ +++ u16 stack = env->subprog_info[func->subprogno].stack_depth; +++ +++ if (stack >= -off) +++ return 0; +++ +++ /* update known max for given subprogram */ +++ env->subprog_info[func->subprogno].stack_depth = -off; +++ return 0; +++} +++ +++/* starting from main bpf function walk all instructions of the function +++ * and recursively walk all callees that given function can call. +++ * Ignore jump and exit insns. 
+++ * Since recursion is prevented by check_cfg() this algorithm +++ * only needs a local stack of MAX_CALL_FRAMES to remember callsites +++ */ +++static int check_max_stack_depth(struct bpf_verifier_env *env) +++{ +++ int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; +++ struct bpf_subprog_info *subprog = env->subprog_info; +++ struct bpf_insn *insn = env->prog->insnsi; +++ int ret_insn[MAX_CALL_FRAMES]; +++ int ret_prog[MAX_CALL_FRAMES]; +++ +++process_func: +++ /* protect against potential stack overflow that might happen when +++ * bpf2bpf calls get combined with tailcalls. Limit the caller's stack +++ * depth for such case down to 256 so that the worst case scenario +++ * would result in 8k stack size (32 which is tailcall limit * 256 = +++ * 8k). +++ * +++ * To get the idea what might happen, see an example: +++ * func1 -> sub rsp, 128 +++ * subfunc1 -> sub rsp, 256 +++ * tailcall1 -> add rsp, 256 +++ * func2 -> sub rsp, 192 (total stack size = 128 + 192 = 320) +++ * subfunc2 -> sub rsp, 64 +++ * subfunc22 -> sub rsp, 128 +++ * tailcall2 -> add rsp, 128 +++ * func3 -> sub rsp, 32 (total stack size 128 + 192 + 64 + 32 = 416) +++ * +++ * tailcall will unwind the current stack frame but it will not get rid +++ * of caller's stack as shown on the example above. +++ */ +++ if (idx && subprog[idx].has_tail_call && depth >= 256) { +++ verbose(env, +++ "tail_calls are not allowed when call stack of previous frames is %d bytes. Too large\n", +++ depth); +++ return -EACCES; +++ } +++ /* round up to 32-bytes, since this is granularity +++ * of interpreter stack size +++ */ +++ depth += round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); +++ if (depth > MAX_BPF_STACK) { +++ verbose(env, "combined stack size of %d calls is %d. Too large\n", +++ frame + 1, depth); +++ return -EACCES; +++ } +++continue_func: +++ subprog_end = subprog[idx + 1].start; +++ for (; i < subprog_end; i++) { +++ if (insn[i].code != (BPF_JMP | BPF_CALL)) +++ continue; +++ if (insn[i].src_reg != BPF_PSEUDO_CALL) +++ continue; +++ /* remember insn and function to return to */ +++ ret_insn[frame] = i + 1; +++ ret_prog[frame] = idx; +++ +++ /* find the callee */ +++ i = i + insn[i].imm + 1; +++ idx = find_subprog(env, i); +++ if (idx < 0) { +++ WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", +++ i); +++ return -EFAULT; +++ } +++ frame++; +++ if (frame >= MAX_CALL_FRAMES) { +++ verbose(env, "the call stack of %d frames is too deep !\n", +++ frame); +++ return -E2BIG; +++ } +++ goto process_func; +++ } +++ /* end of for() loop means the last insn of the 'subprog' +++ * was reached. Doesn't matter whether it was JA or EXIT +++ */ +++ if (frame == 0) +++ return 0; +++ depth -= round_up(max_t(u32, subprog[idx].stack_depth, 1), 32); +++ frame--; +++ i = ret_insn[frame]; +++ idx = ret_prog[frame]; +++ goto continue_func; +++} +++ +++#ifndef CONFIG_BPF_JIT_ALWAYS_ON +++static int get_callee_stack_depth(struct bpf_verifier_env *env, +++ const struct bpf_insn *insn, int idx) +++{ +++ int start = idx + insn->imm + 1, subprog; +++ +++ subprog = find_subprog(env, start); +++ if (subprog < 0) { +++ WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", +++ start); +++ return -EFAULT; ++ } +++ return env->subprog_info[subprog].stack_depth; +++} +++#endif +++ +++static int check_ctx_reg(struct bpf_verifier_env *env, +++ const struct bpf_reg_state *reg, int regno) +++{ +++ /* Access to ctx or passing it to a helper is only allowed in +++ * its original, unmodified form. 
+++ */ +++ +++ if (reg->off) { +++ verbose(env, "dereference of modified ctx ptr R%d off=%d disallowed\n", +++ regno, reg->off); +++ return -EACCES; +++ } +++ +++ if (!tnum_is_const(reg->var_off) || reg->var_off.value) { +++ char tn_buf[48]; +++ +++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +++ verbose(env, "variable ctx access var_off=%s disallowed\n", tn_buf); +++ return -EACCES; +++ } +++ +++ return 0; +++} +++ +++static int check_tp_buffer_access(struct bpf_verifier_env *env, +++ const struct bpf_reg_state *reg, +++ int regno, int off, int size) +++{ +++ if (off < 0) { +++ verbose(env, +++ "R%d invalid tracepoint buffer access: off=%d, size=%d", +++ regno, off, size); +++ return -EACCES; +++ } +++ if (!tnum_is_const(reg->var_off) || reg->var_off.value) { +++ char tn_buf[48]; +++ +++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +++ verbose(env, +++ "R%d invalid variable buffer offset: off=%d, var_off=%s", +++ regno, off, tn_buf); +++ return -EACCES; +++ } +++ if (off + size > env->prog->aux->max_tp_access) +++ env->prog->aux->max_tp_access = off + size; +++ +++ return 0; +++} +++ +++ +++/* truncate register to smaller size (in bytes) +++ * must be called with size < BPF_REG_SIZE +++ */ +++static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) +++{ +++ u64 mask; +++ +++ /* clear high bits in bit representation */ +++ reg->var_off = tnum_cast(reg->var_off, size); +++ +++ /* fix arithmetic bounds */ +++ mask = ((u64)1 << (size * 8)) - 1; +++ if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { +++ reg->umin_value &= mask; +++ reg->umax_value &= mask; +++ } else { +++ reg->umin_value = 0; +++ reg->umax_value = mask; +++ } +++ reg->smin_value = reg->umin_value; +++ reg->smax_value = reg->umax_value; ++ } ++ ++ /* check whether memory at (regno + off) is accessible for t = (read | write) ++@@ -680,225 +2776,692 @@ static bool is_pointer_value(struct veri ++ * if t==write && value_regno==-1, some unknown value is stored into memory ++ * if t==read && value_regno==-1, don't care what we read from memory ++ */ ++-static int check_mem_access(struct verifier_env *env, u32 regno, int off, ++- int bpf_size, enum bpf_access_type t, ++- int value_regno) +++static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, +++ int off, int bpf_size, enum bpf_access_type t, +++ int value_regno, bool strict_alignment_once) ++ { ++- struct verifier_state *state = &env->cur_state; +++ struct bpf_reg_state *regs = cur_regs(env); +++ struct bpf_reg_state *reg = regs + regno; +++ struct bpf_func_state *state; ++ int size, err = 0; ++ ++- if (state->regs[regno].type == PTR_TO_STACK) ++- off += state->regs[regno].imm; ++- ++ size = bpf_size_to_bytes(bpf_size); ++ if (size < 0) ++ return size; ++ ++- if (off % size != 0) { ++- verbose("misaligned access off %d size %d\n", off, size); ++- return -EACCES; ++- } +++ /* alignment checks will add in reg->off themselves */ +++ err = check_ptr_alignment(env, reg, off, size, strict_alignment_once); +++ if (err) +++ return err; +++ +++ /* for access checks, reg->off is just part of off */ +++ off += reg->off; ++ ++- if (state->regs[regno].type == PTR_TO_MAP_VALUE) { +++ if (reg->type == PTR_TO_MAP_VALUE) { ++ if (t == BPF_WRITE && value_regno >= 0 && ++ is_pointer_value(env, value_regno)) { ++- verbose("R%d leaks addr into map\n", value_regno); +++ verbose(env, "R%d leaks addr into map\n", value_regno); ++ return -EACCES; ++ } ++- err = check_map_access(env, regno, off, size); +++ err = check_map_access_type(env, regno, 
off, size, t); +++ if (err) +++ return err; +++ err = check_map_access(env, regno, off, size, false); ++ if (!err && t == BPF_READ && value_regno >= 0) ++- mark_reg_unknown_value(state->regs, value_regno); +++ mark_reg_unknown(env, regs, value_regno); +++ +++ } else if (reg->type == PTR_TO_CTX) { +++ enum bpf_reg_type reg_type = SCALAR_VALUE; ++ ++- } else if (state->regs[regno].type == PTR_TO_CTX) { ++ if (t == BPF_WRITE && value_regno >= 0 && ++ is_pointer_value(env, value_regno)) { ++- verbose("R%d leaks addr into ctx\n", value_regno); +++ verbose(env, "R%d leaks addr into ctx\n", value_regno); ++ return -EACCES; ++ } ++- err = check_ctx_access(env, off, size, t); ++- if (!err && t == BPF_READ && value_regno >= 0) ++- mark_reg_unknown_value(state->regs, value_regno); ++ ++- } else if (state->regs[regno].type == FRAME_PTR || ++- state->regs[regno].type == PTR_TO_STACK) { ++- if (off >= 0 || off < -MAX_BPF_STACK) { ++- verbose("invalid stack off=%d size=%d\n", off, size); +++ err = check_ctx_reg(env, reg, regno); +++ if (err < 0) +++ return err; +++ +++ err = check_ctx_access(env, insn_idx, off, size, t, ®_type); +++ if (!err && t == BPF_READ && value_regno >= 0) { +++ /* ctx access returns either a scalar, or a +++ * PTR_TO_PACKET[_META,_END]. In the latter +++ * case, we know the offset is zero. +++ */ +++ if (reg_type == SCALAR_VALUE) { +++ mark_reg_unknown(env, regs, value_regno); +++ } else { +++ mark_reg_known_zero(env, regs, +++ value_regno); +++ if (reg_type_may_be_null(reg_type)) +++ regs[value_regno].id = ++env->id_gen; +++ /* A load of ctx field could have different +++ * actual load size with the one encoded in the +++ * insn. When the dst is PTR, it is for sure not +++ * a sub-register. +++ */ +++ regs[value_regno].subreg_def = DEF_NOT_SUBREG; +++ } +++ regs[value_regno].type = reg_type; +++ } +++ +++ } else if (reg->type == PTR_TO_STACK) { +++ off += reg->var_off.value; +++ err = check_stack_access(env, reg, off, size); +++ if (err) +++ return err; +++ +++ state = func(env, reg); +++ err = update_stack_depth(env, state, off); +++ if (err) +++ return err; +++ +++ if (t == BPF_WRITE) +++ err = check_stack_write(env, state, off, size, +++ value_regno, insn_idx); +++ else +++ err = check_stack_read(env, state, off, size, +++ value_regno); +++ } else if (reg_is_pkt_pointer(reg)) { +++ if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { +++ verbose(env, "cannot write into packet\n"); +++ return -EACCES; +++ } +++ if (t == BPF_WRITE && value_regno >= 0 && +++ is_pointer_value(env, value_regno)) { +++ verbose(env, "R%d leaks addr into packet\n", +++ value_regno); +++ return -EACCES; +++ } +++ err = check_packet_access(env, regno, off, size, false); +++ if (!err && t == BPF_READ && value_regno >= 0) +++ mark_reg_unknown(env, regs, value_regno); +++ } else if (reg->type == PTR_TO_FLOW_KEYS) { +++ if (t == BPF_WRITE && value_regno >= 0 && +++ is_pointer_value(env, value_regno)) { +++ verbose(env, "R%d leaks addr into flow keys\n", +++ value_regno); ++ return -EACCES; ++ } +++ +++ err = check_flow_keys_access(env, off, size); +++ if (!err && t == BPF_READ && value_regno >= 0) +++ mark_reg_unknown(env, regs, value_regno); +++ } else if (type_is_sk_pointer(reg->type)) { ++ if (t == BPF_WRITE) { ++- if (!env->allow_ptr_leaks && ++- state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && ++- size != BPF_REG_SIZE) { ++- verbose("attempt to corrupt spilled pointer on stack\n"); ++- return -EACCES; ++- } ++- err = check_stack_write(state, off, size, value_regno); ++- } 
else { ++- err = check_stack_read(state, off, size, value_regno); +++ verbose(env, "R%d cannot write into %s\n", +++ regno, reg_type_str[reg->type]); +++ return -EACCES; ++ } +++ err = check_sock_access(env, insn_idx, regno, off, size, t); +++ if (!err && value_regno >= 0) +++ mark_reg_unknown(env, regs, value_regno); +++ } else if (reg->type == PTR_TO_TP_BUFFER) { +++ err = check_tp_buffer_access(env, reg, regno, off, size); +++ if (!err && t == BPF_READ && value_regno >= 0) +++ mark_reg_unknown(env, regs, value_regno); ++ } else { ++- verbose("R%d invalid mem access '%s'\n", ++- regno, reg_type_str[state->regs[regno].type]); +++ verbose(env, "R%d invalid mem access '%s'\n", regno, +++ reg_type_str[reg->type]); ++ return -EACCES; ++ } +++ +++ if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && +++ regs[value_regno].type == SCALAR_VALUE) { +++ /* b/h/w load zero-extends, mark upper bits as known 0 */ +++ coerce_reg_to_size(®s[value_regno], size); +++ } ++ return err; ++ } ++ ++-static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) +++static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) ++ { ++- struct reg_state *regs = env->cur_state.regs; ++ int err; ++ ++ if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || ++ insn->imm != 0) { ++- verbose("BPF_XADD uses reserved fields\n"); +++ verbose(env, "BPF_XADD uses reserved fields\n"); ++ return -EINVAL; ++ } ++ ++ /* check src1 operand */ ++- err = check_reg_arg(regs, insn->src_reg, SRC_OP); +++ err = check_reg_arg(env, insn->src_reg, SRC_OP); ++ if (err) ++ return err; ++ ++ /* check src2 operand */ ++- err = check_reg_arg(regs, insn->dst_reg, SRC_OP); +++ err = check_reg_arg(env, insn->dst_reg, SRC_OP); ++ if (err) ++ return err; ++ +++ if (is_pointer_value(env, insn->src_reg)) { +++ verbose(env, "R%d leaks addr into mem\n", insn->src_reg); +++ return -EACCES; +++ } +++ +++ if (is_ctx_reg(env, insn->dst_reg) || +++ is_pkt_reg(env, insn->dst_reg) || +++ is_flow_key_reg(env, insn->dst_reg) || +++ is_sk_reg(env, insn->dst_reg)) { +++ verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", +++ insn->dst_reg, +++ reg_type_str[reg_state(env, insn->dst_reg)->type]); +++ return -EACCES; +++ } +++ ++ /* check whether atomic_add can read the memory */ ++- err = check_mem_access(env, insn->dst_reg, insn->off, ++- BPF_SIZE(insn->code), BPF_READ, -1); +++ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, +++ BPF_SIZE(insn->code), BPF_READ, -1, true); ++ if (err) ++ return err; ++ ++ /* check whether atomic_add can write into the same memory */ ++- return check_mem_access(env, insn->dst_reg, insn->off, ++- BPF_SIZE(insn->code), BPF_WRITE, -1); +++ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, +++ BPF_SIZE(insn->code), BPF_WRITE, -1, true); +++} +++ +++static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, +++ int off, int access_size, +++ bool zero_size_allowed) +++{ +++ struct bpf_reg_state *reg = reg_state(env, regno); +++ +++ if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || +++ access_size < 0 || (access_size == 0 && !zero_size_allowed)) { +++ if (tnum_is_const(reg->var_off)) { +++ verbose(env, "invalid stack type R%d off=%d access_size=%d\n", +++ regno, off, access_size); +++ } else { +++ char tn_buf[48]; +++ +++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +++ verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", +++ regno, tn_buf, access_size); +++ } +++ return -EACCES; 
+++ } +++ return 0; ++ } ++ ++ /* when register 'regno' is passed into function that will read 'access_size' ++ * bytes from that pointer, make sure that it's within stack boundary ++- * and all elements of stack are initialized +++ * and all elements of stack are initialized. +++ * Unlike most pointer bounds-checking functions, this one doesn't take an +++ * 'off' argument, so it has to add in reg->off itself. ++ */ ++-static int check_stack_boundary(struct verifier_env *env, ++- int regno, int access_size) +++static int check_stack_boundary(struct bpf_verifier_env *env, int regno, +++ int access_size, bool zero_size_allowed, +++ struct bpf_call_arg_meta *meta) ++ { ++- struct verifier_state *state = &env->cur_state; ++- struct reg_state *regs = state->regs; ++- int off, i; +++ struct bpf_reg_state *reg = reg_state(env, regno); +++ struct bpf_func_state *state = func(env, reg); +++ int err, min_off, max_off, i, j, slot, spi; +++ +++ if (reg->type != PTR_TO_STACK) { +++ /* Allow zero-byte read from NULL, regardless of pointer type */ +++ if (zero_size_allowed && access_size == 0 && +++ register_is_null(reg)) +++ return 0; ++ ++- if (regs[regno].type != PTR_TO_STACK) +++ verbose(env, "R%d type=%s expected=%s\n", regno, +++ reg_type_str[reg->type], +++ reg_type_str[PTR_TO_STACK]); ++ return -EACCES; +++ } ++ ++- off = regs[regno].imm; ++- if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || ++- access_size <= 0) { ++- verbose("invalid stack type R%d off=%d access_size=%d\n", ++- regno, off, access_size); +++ if (tnum_is_const(reg->var_off)) { +++ min_off = max_off = reg->var_off.value + reg->off; +++ err = __check_stack_boundary(env, regno, min_off, access_size, +++ zero_size_allowed); +++ if (err) +++ return err; +++ } else { +++ /* Variable offset is prohibited for unprivileged mode for +++ * simplicity since it requires corresponding support in +++ * Spectre masking for stack ALU. +++ * See also retrieve_ptr_limit(). +++ */ +++ if (!env->allow_ptr_leaks) { +++ char tn_buf[48]; +++ +++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +++ verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", +++ regno, tn_buf); +++ return -EACCES; +++ } +++ /* Only initialized buffer on stack is allowed to be accessed +++ * with variable offset. With uninitialized buffer it's hard to +++ * guarantee that whole memory is marked as initialized on +++ * helper return since specific bounds are unknown what may +++ * cause uninitialized stack leaking. 
+++ */ +++ if (meta && meta->raw_mode) +++ meta = NULL; +++ +++ if (reg->smax_value >= BPF_MAX_VAR_OFF || +++ reg->smax_value <= -BPF_MAX_VAR_OFF) { +++ verbose(env, "R%d unbounded indirect variable offset stack access\n", +++ regno); +++ return -EACCES; +++ } +++ min_off = reg->smin_value + reg->off; +++ max_off = reg->smax_value + reg->off; +++ err = __check_stack_boundary(env, regno, min_off, access_size, +++ zero_size_allowed); +++ if (err) { +++ verbose(env, "R%d min value is outside of stack bound\n", +++ regno); +++ return err; +++ } +++ err = __check_stack_boundary(env, regno, max_off, access_size, +++ zero_size_allowed); +++ if (err) { +++ verbose(env, "R%d max value is outside of stack bound\n", +++ regno); +++ return err; +++ } +++ } +++ +++ if (meta && meta->raw_mode) { +++ meta->access_size = access_size; +++ meta->regno = regno; +++ return 0; +++ } +++ +++ for (i = min_off; i < max_off + access_size; i++) { +++ u8 *stype; +++ +++ slot = -i - 1; +++ spi = slot / BPF_REG_SIZE; +++ if (state->allocated_stack <= slot) +++ goto err; +++ stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; +++ if (*stype == STACK_MISC) +++ goto mark; +++ if (*stype == STACK_ZERO) { +++ /* helper can write anything into the stack */ +++ *stype = STACK_MISC; +++ goto mark; +++ } +++ if (state->stack[spi].slot_type[0] == STACK_SPILL && +++ state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { +++ __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); +++ for (j = 0; j < BPF_REG_SIZE; j++) +++ state->stack[spi].slot_type[j] = STACK_MISC; +++ goto mark; +++ } +++ +++err: +++ if (tnum_is_const(reg->var_off)) { +++ verbose(env, "invalid indirect read from stack off %d+%d size %d\n", +++ min_off, i - min_off, access_size); +++ } else { +++ char tn_buf[48]; +++ +++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +++ verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", +++ tn_buf, i - min_off, access_size); +++ } ++ return -EACCES; +++mark: +++ /* reading any byte out of 8-byte 'spill_slot' will cause +++ * the whole slot to be marked as 'read' +++ */ +++ mark_reg_read(env, &state->stack[spi].spilled_ptr, +++ state->stack[spi].spilled_ptr.parent, +++ REG_LIVE_READ64); ++ } +++ return update_stack_depth(env, state, min_off); +++} +++ +++static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, +++ int access_size, bool zero_size_allowed, +++ struct bpf_call_arg_meta *meta) +++{ +++ struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; ++ ++- for (i = 0; i < access_size; i++) { ++- if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { ++- verbose("invalid indirect read from stack off %d+%d size %d\n", ++- off, i, access_size); +++ switch (reg->type) { +++ case PTR_TO_PACKET: +++ case PTR_TO_PACKET_META: +++ return check_packet_access(env, regno, reg->off, access_size, +++ zero_size_allowed); +++ case PTR_TO_MAP_VALUE: +++ if (check_map_access_type(env, regno, reg->off, access_size, +++ meta && meta->raw_mode ? BPF_WRITE : +++ BPF_READ)) ++ return -EACCES; +++ return check_map_access(env, regno, reg->off, access_size, +++ zero_size_allowed); +++ default: /* scalar_value|ptr_to_stack or invalid ptr */ +++ return check_stack_boundary(env, regno, access_size, +++ zero_size_allowed, meta); +++ } +++} +++ +++/* Implementation details: +++ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL +++ * Two bpf_map_lookups (even with the same key) will have different reg->id. 
+++ * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after +++ * value_or_null->value transition, since the verifier only cares about +++ * the range of access to valid map value pointer and doesn't care about actual +++ * address of the map element. +++ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps +++ * reg->id > 0 after value_or_null->value transition. By doing so +++ * two bpf_map_lookups will be considered two different pointers that +++ * point to different bpf_spin_locks. +++ * The verifier allows taking only one bpf_spin_lock at a time to avoid +++ * dead-locks. +++ * Since only one bpf_spin_lock is allowed the checks are simpler than +++ * reg_is_refcounted() logic. The verifier needs to remember only +++ * one spin_lock instead of array of acquired_refs. +++ * cur_state->active_spin_lock remembers which map value element got locked +++ * and clears it after bpf_spin_unlock. +++ */ +++static int process_spin_lock(struct bpf_verifier_env *env, int regno, +++ bool is_lock) +++{ +++ struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; +++ struct bpf_verifier_state *cur = env->cur_state; +++ bool is_const = tnum_is_const(reg->var_off); +++ struct bpf_map *map = reg->map_ptr; +++ u64 val = reg->var_off.value; +++ +++ if (reg->type != PTR_TO_MAP_VALUE) { +++ verbose(env, "R%d is not a pointer to map_value\n", regno); +++ return -EINVAL; +++ } +++ if (!is_const) { +++ verbose(env, +++ "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", +++ regno); +++ return -EINVAL; +++ } +++ if (!map->btf) { +++ verbose(env, +++ "map '%s' has to have BTF in order to use bpf_spin_lock\n", +++ map->name); +++ return -EINVAL; +++ } +++ if (!map_value_has_spin_lock(map)) { +++ if (map->spin_lock_off == -E2BIG) +++ verbose(env, +++ "map '%s' has more than one 'struct bpf_spin_lock'\n", +++ map->name); +++ else if (map->spin_lock_off == -ENOENT) +++ verbose(env, +++ "map '%s' doesn't have 'struct bpf_spin_lock'\n", +++ map->name); +++ else +++ verbose(env, +++ "map '%s' is not a struct type or bpf_spin_lock is mangled\n", +++ map->name); +++ return -EINVAL; +++ } +++ if (map->spin_lock_off != val + reg->off) { +++ verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", +++ val + reg->off); +++ return -EINVAL; +++ } +++ if (is_lock) { +++ if (cur->active_spin_lock) { +++ verbose(env, +++ "Locking two bpf_spin_locks are not allowed\n"); +++ return -EINVAL; +++ } +++ cur->active_spin_lock = reg->id; +++ } else { +++ if (!cur->active_spin_lock) { +++ verbose(env, "bpf_spin_unlock without taking a lock\n"); +++ return -EINVAL; ++ } +++ if (cur->active_spin_lock != reg->id) { +++ verbose(env, "bpf_spin_unlock of different lock\n"); +++ return -EINVAL; +++ } +++ cur->active_spin_lock = 0; ++ } ++ return 0; ++ } ++ ++-static int check_func_arg(struct verifier_env *env, u32 regno, ++- enum bpf_arg_type arg_type, struct bpf_map **mapp) +++static bool arg_type_is_mem_ptr(enum bpf_arg_type type) +++{ +++ return type == ARG_PTR_TO_MEM || +++ type == ARG_PTR_TO_MEM_OR_NULL || +++ type == ARG_PTR_TO_UNINIT_MEM; +++} +++ +++static bool arg_type_is_mem_size(enum bpf_arg_type type) +++{ +++ return type == ARG_CONST_SIZE || +++ type == ARG_CONST_SIZE_OR_ZERO; +++} +++ +++static bool arg_type_is_int_ptr(enum bpf_arg_type type) +++{ +++ return type == ARG_PTR_TO_INT || +++ type == ARG_PTR_TO_LONG; +++} +++ +++static int int_ptr_type_to_size(enum bpf_arg_type type) +++{ +++ if (type == ARG_PTR_TO_INT) +++ return sizeof(u32); +++ else 
if (type == ARG_PTR_TO_LONG) +++ return sizeof(u64); +++ +++ return -EINVAL; +++} +++ +++static int check_func_arg(struct bpf_verifier_env *env, u32 regno, +++ enum bpf_arg_type arg_type, +++ struct bpf_call_arg_meta *meta) ++ { ++- struct reg_state *reg = env->cur_state.regs + regno; ++- enum bpf_reg_type expected_type; +++ struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; +++ enum bpf_reg_type expected_type, type = reg->type; ++ int err = 0; ++ ++ if (arg_type == ARG_DONTCARE) ++ return 0; ++ ++- if (reg->type == NOT_INIT) { ++- verbose("R%d !read_ok\n", regno); ++- return -EACCES; ++- } +++ err = check_reg_arg(env, regno, SRC_OP); +++ if (err) +++ return err; ++ ++ if (arg_type == ARG_ANYTHING) { ++ if (is_pointer_value(env, regno)) { ++- verbose("R%d leaks addr into helper function\n", regno); +++ verbose(env, "R%d leaks addr into helper function\n", +++ regno); ++ return -EACCES; ++ } ++ return 0; ++ } ++ ++- if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || ++- arg_type == ARG_PTR_TO_MAP_VALUE) { +++ if (type_is_pkt_pointer(type) && +++ !may_access_direct_pkt_data(env, meta, BPF_READ)) { +++ verbose(env, "helper access to the packet is not allowed\n"); +++ return -EACCES; +++ } +++ +++ if (arg_type == ARG_PTR_TO_MAP_KEY || +++ arg_type == ARG_PTR_TO_MAP_VALUE || +++ arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE || +++ arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) { ++ expected_type = PTR_TO_STACK; ++- } else if (arg_type == ARG_CONST_STACK_SIZE) { ++- expected_type = CONST_IMM; +++ if (register_is_null(reg) && +++ arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL) +++ /* final test in check_stack_boundary() */; +++ else if (!type_is_pkt_pointer(type) && +++ type != PTR_TO_MAP_VALUE && +++ type != expected_type) +++ goto err_type; +++ } else if (arg_type == ARG_CONST_SIZE || +++ arg_type == ARG_CONST_SIZE_OR_ZERO) { +++ expected_type = SCALAR_VALUE; +++ if (type != expected_type) +++ goto err_type; ++ } else if (arg_type == ARG_CONST_MAP_PTR) { ++ expected_type = CONST_PTR_TO_MAP; +++ if (type != expected_type) +++ goto err_type; ++ } else if (arg_type == ARG_PTR_TO_CTX) { ++ expected_type = PTR_TO_CTX; +++ if (type != expected_type) +++ goto err_type; +++ err = check_ctx_reg(env, reg, regno); +++ if (err < 0) +++ return err; +++ } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { +++ expected_type = PTR_TO_SOCK_COMMON; +++ /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ +++ if (!type_is_sk_pointer(type)) +++ goto err_type; +++ if (reg->ref_obj_id) { +++ if (meta->ref_obj_id) { +++ verbose(env, "verifier internal error: more than one arg with ref_obj_id R%d %u %u\n", +++ regno, reg->ref_obj_id, +++ meta->ref_obj_id); +++ return -EFAULT; +++ } +++ meta->ref_obj_id = reg->ref_obj_id; +++ } +++ } else if (arg_type == ARG_PTR_TO_SOCKET) { +++ expected_type = PTR_TO_SOCKET; +++ if (type != expected_type) +++ goto err_type; +++ } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { +++ if (meta->func_id == BPF_FUNC_spin_lock) { +++ if (process_spin_lock(env, regno, true)) +++ return -EACCES; +++ } else if (meta->func_id == BPF_FUNC_spin_unlock) { +++ if (process_spin_lock(env, regno, false)) +++ return -EACCES; +++ } else { +++ verbose(env, "verifier internal error\n"); +++ return -EFAULT; +++ } +++ } else if (arg_type_is_mem_ptr(arg_type)) { +++ expected_type = PTR_TO_STACK; +++ /* One exception here. In case function allows for NULL to be +++ * passed in as argument, it's a SCALAR_VALUE type. Final test +++ * happens during stack boundary checking. 
+++ */ +++ if (register_is_null(reg) && +++ arg_type == ARG_PTR_TO_MEM_OR_NULL) +++ /* final test in check_stack_boundary() */; +++ else if (!type_is_pkt_pointer(type) && +++ type != PTR_TO_MAP_VALUE && +++ type != expected_type) +++ goto err_type; +++ meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM; +++ } else if (arg_type_is_int_ptr(arg_type)) { +++ expected_type = PTR_TO_STACK; +++ if (!type_is_pkt_pointer(type) && +++ type != PTR_TO_MAP_VALUE && +++ type != expected_type) +++ goto err_type; ++ } else { ++- verbose("unsupported arg_type %d\n", arg_type); +++ verbose(env, "unsupported arg_type %d\n", arg_type); ++ return -EFAULT; ++ } ++ ++- if (reg->type != expected_type) { ++- verbose("R%d type=%s expected=%s\n", regno, ++- reg_type_str[reg->type], reg_type_str[expected_type]); ++- return -EACCES; ++- } ++- ++ if (arg_type == ARG_CONST_MAP_PTR) { ++ /* bpf_map_xxx(map_ptr) call: remember that map_ptr */ ++- *mapp = reg->map_ptr; ++- +++ meta->map_ptr = reg->map_ptr; ++ } else if (arg_type == ARG_PTR_TO_MAP_KEY) { ++ /* bpf_map_xxx(..., map_ptr, ..., key) call: ++ * check that [key, key + map->key_size) are within ++ * stack limits and initialized ++ */ ++- if (!*mapp) { +++ if (!meta->map_ptr) { ++ /* in function declaration map_ptr must come before ++ * map_key, so that it's verified and known before ++ * we have to check map_key here. Otherwise it means ++ * that kernel subsystem misconfigured verifier ++ */ ++- verbose("invalid map_ptr to access map->key\n"); +++ verbose(env, "invalid map_ptr to access map->key\n"); ++ return -EACCES; ++ } ++- err = check_stack_boundary(env, regno, (*mapp)->key_size); ++- ++- } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { +++ err = check_helper_mem_access(env, regno, +++ meta->map_ptr->key_size, false, +++ NULL); +++ } else if (arg_type == ARG_PTR_TO_MAP_VALUE || +++ (arg_type == ARG_PTR_TO_MAP_VALUE_OR_NULL && +++ !register_is_null(reg)) || +++ arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE) { ++ /* bpf_map_xxx(..., map_ptr, ..., value) call: ++ * check [value, value + map->value_size) validity ++ */ ++- if (!*mapp) { +++ if (!meta->map_ptr) { ++ /* kernel subsystem misconfigured verifier */ ++- verbose("invalid map_ptr to access map->value\n"); +++ verbose(env, "invalid map_ptr to access map->value\n"); ++ return -EACCES; ++ } ++- err = check_stack_boundary(env, regno, (*mapp)->value_size); +++ meta->raw_mode = (arg_type == ARG_PTR_TO_UNINIT_MAP_VALUE); +++ err = check_helper_mem_access(env, regno, +++ meta->map_ptr->value_size, false, +++ meta); +++ } else if (arg_type_is_mem_size(arg_type)) { +++ bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); ++ ++- } else if (arg_type == ARG_CONST_STACK_SIZE) { ++- /* bpf_xxx(..., buf, len) call will access 'len' bytes ++- * from stack pointer 'buf'. Check it ++- * note: regno == len, regno - 1 == buf +++ /* remember the mem_size which may be used later +++ * to refine return values. ++ */ ++- if (regno == 0) { ++- /* kernel subsystem misconfigured verifier */ ++- verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); +++ meta->msize_max_value = reg->umax_value; +++ +++ /* The register is SCALAR_VALUE; the access check +++ * happens using its boundaries. +++ */ +++ if (!tnum_is_const(reg->var_off)) +++ /* For unprivileged variable accesses, disable raw +++ * mode so that the program is required to +++ * initialize all the memory that the helper could +++ * just partially fill up. 
+++ */ +++ meta = NULL; +++ +++ if (reg->smin_value < 0) { +++ verbose(env, "R%d min value is negative, either use unsigned or 'var &= const'\n", +++ regno); +++ return -EACCES; +++ } +++ +++ if (reg->umin_value == 0) { +++ err = check_helper_mem_access(env, regno - 1, 0, +++ zero_size_allowed, +++ meta); +++ if (err) +++ return err; +++ } +++ +++ if (reg->umax_value >= BPF_MAX_VAR_SIZ) { +++ verbose(env, "R%d unbounded memory access, use 'var &= const' or 'if (var < const)'\n", +++ regno); ++ return -EACCES; ++ } ++- err = check_stack_boundary(env, regno - 1, reg->imm); +++ err = check_helper_mem_access(env, regno - 1, +++ reg->umax_value, +++ zero_size_allowed, meta); +++ if (!err) +++ err = mark_chain_precision(env, regno); +++ } else if (arg_type_is_int_ptr(arg_type)) { +++ int size = int_ptr_type_to_size(arg_type); +++ +++ err = check_helper_mem_access(env, regno, size, false, meta); +++ if (err) +++ return err; +++ err = check_ptr_alignment(env, reg, 0, size, true); ++ } ++ ++ return err; +++err_type: +++ verbose(env, "R%d type=%s expected=%s\n", regno, +++ reg_type_str[type], reg_type_str[expected_type]); +++ return -EACCES; ++ } ++ ++-static int check_map_func_compatibility(struct bpf_map *map, int func_id) +++static int check_map_func_compatibility(struct bpf_verifier_env *env, +++ struct bpf_map *map, int func_id) ++ { ++ if (!map) ++ return 0; ++@@ -911,7 +3474,74 @@ static int check_map_func_compatibility( ++ break; ++ case BPF_MAP_TYPE_PERF_EVENT_ARRAY: ++ if (func_id != BPF_FUNC_perf_event_read && ++- func_id != BPF_FUNC_perf_event_output) +++ func_id != BPF_FUNC_perf_event_output && +++ func_id != BPF_FUNC_perf_event_read_value) +++ goto error; +++ break; +++ case BPF_MAP_TYPE_STACK_TRACE: +++ if (func_id != BPF_FUNC_get_stackid) +++ goto error; +++ break; +++ case BPF_MAP_TYPE_CGROUP_ARRAY: +++ if (func_id != BPF_FUNC_skb_under_cgroup && +++ func_id != BPF_FUNC_current_task_under_cgroup) +++ goto error; +++ break; +++ case BPF_MAP_TYPE_CGROUP_STORAGE: +++ if (func_id != BPF_FUNC_get_local_storage) +++ goto error; +++ break; +++ case BPF_MAP_TYPE_DEVMAP: +++ case BPF_MAP_TYPE_DEVMAP_HASH: +++ if (func_id != BPF_FUNC_redirect_map && +++ func_id != BPF_FUNC_map_lookup_elem) +++ goto error; +++ break; +++ /* Restrict bpf side of cpumap and xskmap, open when use-cases +++ * appear. 
+++ */ +++ case BPF_MAP_TYPE_CPUMAP: +++ if (func_id != BPF_FUNC_redirect_map) +++ goto error; +++ break; +++ case BPF_MAP_TYPE_XSKMAP: +++ if (func_id != BPF_FUNC_redirect_map && +++ func_id != BPF_FUNC_map_lookup_elem) +++ goto error; +++ break; +++ case BPF_MAP_TYPE_ARRAY_OF_MAPS: +++ case BPF_MAP_TYPE_HASH_OF_MAPS: +++ if (func_id != BPF_FUNC_map_lookup_elem) +++ goto error; +++ break; +++ case BPF_MAP_TYPE_SOCKMAP: +++ if (func_id != BPF_FUNC_sk_redirect_map && +++ func_id != BPF_FUNC_sock_map_update && +++ func_id != BPF_FUNC_map_delete_elem && +++ func_id != BPF_FUNC_msg_redirect_map) +++ goto error; +++ break; +++ case BPF_MAP_TYPE_SOCKHASH: +++ if (func_id != BPF_FUNC_sk_redirect_hash && +++ func_id != BPF_FUNC_sock_hash_update && +++ func_id != BPF_FUNC_map_delete_elem && +++ func_id != BPF_FUNC_msg_redirect_hash) +++ goto error; +++ break; +++ case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: +++ if (func_id != BPF_FUNC_sk_select_reuseport) +++ goto error; +++ break; +++ case BPF_MAP_TYPE_QUEUE: +++ case BPF_MAP_TYPE_STACK: +++ if (func_id != BPF_FUNC_map_peek_elem && +++ func_id != BPF_FUNC_map_pop_elem && +++ func_id != BPF_FUNC_map_push_elem) +++ goto error; +++ break; +++ case BPF_MAP_TYPE_SK_STORAGE: +++ if (func_id != BPF_FUNC_sk_storage_get && +++ func_id != BPF_FUNC_sk_storage_delete) ++ goto error; ++ break; ++ default: ++@@ -923,109 +3553,1579 @@ static int check_map_func_compatibility( ++ case BPF_FUNC_tail_call: ++ if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) ++ goto error; +++ if (env->subprog_cnt > 1) { +++ verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); +++ return -EINVAL; +++ } ++ break; ++ case BPF_FUNC_perf_event_read: ++ case BPF_FUNC_perf_event_output: +++ case BPF_FUNC_perf_event_read_value: ++ if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) ++ goto error; ++ break; +++ case BPF_FUNC_get_stackid: +++ if (map->map_type != BPF_MAP_TYPE_STACK_TRACE) +++ goto error; +++ break; +++ case BPF_FUNC_current_task_under_cgroup: +++ case BPF_FUNC_skb_under_cgroup: +++ if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) +++ goto error; +++ break; +++ case BPF_FUNC_redirect_map: +++ if (map->map_type != BPF_MAP_TYPE_DEVMAP && +++ map->map_type != BPF_MAP_TYPE_DEVMAP_HASH && +++ map->map_type != BPF_MAP_TYPE_CPUMAP && +++ map->map_type != BPF_MAP_TYPE_XSKMAP) +++ goto error; +++ break; +++ case BPF_FUNC_sk_redirect_map: +++ case BPF_FUNC_msg_redirect_map: +++ case BPF_FUNC_sock_map_update: +++ if (map->map_type != BPF_MAP_TYPE_SOCKMAP) +++ goto error; +++ break; +++ case BPF_FUNC_sk_redirect_hash: +++ case BPF_FUNC_msg_redirect_hash: +++ case BPF_FUNC_sock_hash_update: +++ if (map->map_type != BPF_MAP_TYPE_SOCKHASH) +++ goto error; +++ break; +++ case BPF_FUNC_get_local_storage: +++ if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && +++ map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) +++ goto error; +++ break; +++ case BPF_FUNC_sk_select_reuseport: +++ if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) +++ goto error; +++ break; +++ case BPF_FUNC_map_peek_elem: +++ case BPF_FUNC_map_pop_elem: +++ case BPF_FUNC_map_push_elem: +++ if (map->map_type != BPF_MAP_TYPE_QUEUE && +++ map->map_type != BPF_MAP_TYPE_STACK) +++ goto error; +++ break; +++ case BPF_FUNC_sk_storage_get: +++ case BPF_FUNC_sk_storage_delete: +++ if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) +++ goto error; +++ break; ++ default: ++ break; ++ } ++ ++ return 0; ++ error: ++- verbose("cannot pass map_type %d into func %d\n", ++- map->map_type, func_id); +++ verbose(env, "cannot 
pass map_type %d into func %s#%d\n", +++ map->map_type, func_id_name(func_id), func_id); ++ return -EINVAL; ++ } ++ ++-static int check_call(struct verifier_env *env, int func_id) +++static bool check_raw_mode_ok(const struct bpf_func_proto *fn) +++{ +++ int count = 0; +++ +++ if (fn->arg1_type == ARG_PTR_TO_UNINIT_MEM) +++ count++; +++ if (fn->arg2_type == ARG_PTR_TO_UNINIT_MEM) +++ count++; +++ if (fn->arg3_type == ARG_PTR_TO_UNINIT_MEM) +++ count++; +++ if (fn->arg4_type == ARG_PTR_TO_UNINIT_MEM) +++ count++; +++ if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) +++ count++; +++ +++ /* We only support one arg being in raw mode at the moment, +++ * which is sufficient for the helper functions we have +++ * right now. +++ */ +++ return count <= 1; +++} +++ +++static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, +++ enum bpf_arg_type arg_next) +++{ +++ return (arg_type_is_mem_ptr(arg_curr) && +++ !arg_type_is_mem_size(arg_next)) || +++ (!arg_type_is_mem_ptr(arg_curr) && +++ arg_type_is_mem_size(arg_next)); +++} +++ +++static bool check_arg_pair_ok(const struct bpf_func_proto *fn) +++{ +++ /* bpf_xxx(..., buf, len) call will access 'len' +++ * bytes from memory 'buf'. Both arg types need +++ * to be paired, so make sure there's no buggy +++ * helper function specification. +++ */ +++ if (arg_type_is_mem_size(fn->arg1_type) || +++ arg_type_is_mem_ptr(fn->arg5_type) || +++ check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || +++ check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || +++ check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || +++ check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) +++ return false; +++ +++ return true; +++} +++ +++static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) +++{ +++ int count = 0; +++ +++ if (arg_type_may_be_refcounted(fn->arg1_type)) +++ count++; +++ if (arg_type_may_be_refcounted(fn->arg2_type)) +++ count++; +++ if (arg_type_may_be_refcounted(fn->arg3_type)) +++ count++; +++ if (arg_type_may_be_refcounted(fn->arg4_type)) +++ count++; +++ if (arg_type_may_be_refcounted(fn->arg5_type)) +++ count++; +++ +++ /* A reference acquiring function cannot acquire +++ * another refcounted ptr. +++ */ +++ if (is_acquire_function(func_id) && count) +++ return false; +++ +++ /* We only support one arg being unreferenced at the moment, +++ * which is sufficient for the helper functions we have right now. +++ */ +++ return count <= 1; +++} +++ +++static int check_func_proto(const struct bpf_func_proto *fn, int func_id) +++{ +++ return check_raw_mode_ok(fn) && +++ check_arg_pair_ok(fn) && +++ check_refcount_ok(fn, func_id) ? 0 : -EINVAL; +++} +++ +++/* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] +++ * are now invalid, so turn them into unknown SCALAR_VALUE. 
+++ */ +++static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, +++ struct bpf_func_state *state) +++{ +++ struct bpf_reg_state *regs = state->regs, *reg; +++ int i; +++ +++ for (i = 0; i < MAX_BPF_REG; i++) +++ if (reg_is_pkt_pointer_any(®s[i])) +++ mark_reg_unknown(env, regs, i); +++ +++ bpf_for_each_spilled_reg(i, state, reg) { +++ if (!reg) +++ continue; +++ if (reg_is_pkt_pointer_any(reg)) +++ __mark_reg_unknown(env, reg); +++ } +++} +++ +++static void clear_all_pkt_pointers(struct bpf_verifier_env *env) +++{ +++ struct bpf_verifier_state *vstate = env->cur_state; +++ int i; +++ +++ for (i = 0; i <= vstate->curframe; i++) +++ __clear_all_pkt_pointers(env, vstate->frame[i]); +++} +++ +++static void release_reg_references(struct bpf_verifier_env *env, +++ struct bpf_func_state *state, +++ int ref_obj_id) +++{ +++ struct bpf_reg_state *regs = state->regs, *reg; +++ int i; +++ +++ for (i = 0; i < MAX_BPF_REG; i++) +++ if (regs[i].ref_obj_id == ref_obj_id) +++ mark_reg_unknown(env, regs, i); +++ +++ bpf_for_each_spilled_reg(i, state, reg) { +++ if (!reg) +++ continue; +++ if (reg->ref_obj_id == ref_obj_id) +++ __mark_reg_unknown(env, reg); +++ } +++} +++ +++/* The pointer with the specified id has released its reference to kernel +++ * resources. Identify all copies of the same pointer and clear the reference. +++ */ +++static int release_reference(struct bpf_verifier_env *env, +++ int ref_obj_id) +++{ +++ struct bpf_verifier_state *vstate = env->cur_state; +++ int err; +++ int i; +++ +++ err = release_reference_state(cur_func(env), ref_obj_id); +++ if (err) +++ return err; +++ +++ for (i = 0; i <= vstate->curframe; i++) +++ release_reg_references(env, vstate->frame[i], ref_obj_id); +++ +++ return 0; +++} +++ +++static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, +++ int *insn_idx) +++{ +++ struct bpf_verifier_state *state = env->cur_state; +++ struct bpf_func_state *caller, *callee; +++ int i, err, subprog, target_insn; +++ +++ if (state->curframe + 1 >= MAX_CALL_FRAMES) { +++ verbose(env, "the call stack of %d frames is too deep\n", +++ state->curframe + 2); +++ return -E2BIG; +++ } +++ +++ target_insn = *insn_idx + insn->imm; +++ subprog = find_subprog(env, target_insn + 1); +++ if (subprog < 0) { +++ verbose(env, "verifier bug. No program starts at insn %d\n", +++ target_insn + 1); +++ return -EFAULT; +++ } +++ +++ caller = state->frame[state->curframe]; +++ if (state->frame[state->curframe + 1]) { +++ verbose(env, "verifier bug. Frame %d already allocated\n", +++ state->curframe + 1); +++ return -EFAULT; +++ } +++ +++ callee = kzalloc(sizeof(*callee), GFP_KERNEL); +++ if (!callee) +++ return -ENOMEM; +++ state->frame[state->curframe + 1] = callee; +++ +++ /* callee cannot access r0, r6 - r9 for reading and has to write +++ * into its own stack before reading from it. +++ * callee can read/write into caller's stack +++ */ +++ init_func_state(env, callee, +++ /* remember the callsite, it will be used by bpf_exit */ +++ *insn_idx /* callsite */, +++ state->curframe + 1 /* frameno within this callchain */, +++ subprog /* subprog number within this prog */); +++ +++ /* Transfer references to the callee */ +++ err = transfer_reference_state(callee, caller); +++ if (err) +++ return err; +++ +++ /* copy r1 - r5 args that callee can access. 
The copy includes parent +++ * pointers, which connects us up to the liveness chain +++ */ +++ for (i = BPF_REG_1; i <= BPF_REG_5; i++) +++ callee->regs[i] = caller->regs[i]; +++ +++ /* after the call registers r0 - r5 were scratched */ +++ for (i = 0; i < CALLER_SAVED_REGS; i++) { +++ mark_reg_not_init(env, caller->regs, caller_saved[i]); +++ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); +++ } +++ +++ /* only increment it after check_reg_arg() finished */ +++ state->curframe++; +++ +++ /* and go analyze first insn of the callee */ +++ *insn_idx = target_insn; +++ +++ if (env->log.level & BPF_LOG_LEVEL) { +++ verbose(env, "caller:\n"); +++ print_verifier_state(env, caller); +++ verbose(env, "callee:\n"); +++ print_verifier_state(env, callee); +++ } +++ return 0; +++} +++ +++static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) +++{ +++ struct bpf_verifier_state *state = env->cur_state; +++ struct bpf_func_state *caller, *callee; +++ struct bpf_reg_state *r0; +++ int err; +++ +++ callee = state->frame[state->curframe]; +++ r0 = &callee->regs[BPF_REG_0]; +++ if (r0->type == PTR_TO_STACK) { +++ /* technically it's ok to return caller's stack pointer +++ * (or caller's caller's pointer) back to the caller, +++ * since these pointers are valid. Only current stack +++ * pointer will be invalid as soon as function exits, +++ * but let's be conservative +++ */ +++ verbose(env, "cannot return stack pointer to the caller\n"); +++ return -EINVAL; +++ } +++ +++ state->curframe--; +++ caller = state->frame[state->curframe]; +++ /* return to the caller whatever r0 had in the callee */ +++ caller->regs[BPF_REG_0] = *r0; +++ +++ /* Transfer references to the caller */ +++ err = transfer_reference_state(caller, callee); +++ if (err) +++ return err; +++ +++ *insn_idx = callee->callsite + 1; +++ if (env->log.level & BPF_LOG_LEVEL) { +++ verbose(env, "returning from callee:\n"); +++ print_verifier_state(env, callee); +++ verbose(env, "to caller at %d:\n", *insn_idx); +++ print_verifier_state(env, caller); +++ } +++ /* clear everything in the callee */ +++ free_func_state(callee); +++ state->frame[state->curframe + 1] = NULL; +++ return 0; +++} +++ +++static int do_refine_retval_range(struct bpf_verifier_env *env, +++ struct bpf_reg_state *regs, int ret_type, +++ int func_id, struct bpf_call_arg_meta *meta) +++{ +++ struct bpf_reg_state *ret_reg = ®s[BPF_REG_0]; +++ struct bpf_reg_state tmp_reg = *ret_reg; +++ bool ret; +++ +++ if (ret_type != RET_INTEGER || +++ (func_id != BPF_FUNC_get_stack && +++ func_id != BPF_FUNC_probe_read_str)) +++ return 0; +++ +++ /* Error case where ret is in interval [S32MIN, -1]. */ +++ ret_reg->smin_value = S32_MIN; +++ ret_reg->smax_value = -1; +++ +++ __reg_deduce_bounds(ret_reg); +++ __reg_bound_offset(ret_reg); +++ __update_reg_bounds(ret_reg); +++ +++ ret = push_stack(env, env->insn_idx + 1, env->insn_idx, false); +++ if (!ret) +++ return -EFAULT; +++ +++ *ret_reg = tmp_reg; +++ +++ /* Success case where ret is in range [0, msize_max_value]. 
*/ +++ ret_reg->smin_value = 0; +++ ret_reg->smax_value = meta->msize_max_value; +++ ret_reg->umin_value = ret_reg->smin_value; +++ ret_reg->umax_value = ret_reg->smax_value; +++ +++ __reg_deduce_bounds(ret_reg); +++ __reg_bound_offset(ret_reg); +++ __update_reg_bounds(ret_reg); +++ +++ return 0; +++} +++ +++static int +++record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, +++ int func_id, int insn_idx) +++{ +++ struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; +++ struct bpf_map *map = meta->map_ptr; +++ +++ if (func_id != BPF_FUNC_tail_call && +++ func_id != BPF_FUNC_map_lookup_elem && +++ func_id != BPF_FUNC_map_update_elem && +++ func_id != BPF_FUNC_map_delete_elem && +++ func_id != BPF_FUNC_map_push_elem && +++ func_id != BPF_FUNC_map_pop_elem && +++ func_id != BPF_FUNC_map_peek_elem) +++ return 0; +++ +++ if (map == NULL) { +++ verbose(env, "kernel subsystem misconfigured verifier\n"); +++ return -EINVAL; +++ } +++ +++ /* In case of read-only, some additional restrictions +++ * need to be applied in order to prevent altering the +++ * state of the map from program side. +++ */ +++ if ((map->map_flags & BPF_F_RDONLY_PROG) && +++ (func_id == BPF_FUNC_map_delete_elem || +++ func_id == BPF_FUNC_map_update_elem || +++ func_id == BPF_FUNC_map_push_elem || +++ func_id == BPF_FUNC_map_pop_elem)) { +++ verbose(env, "write into map forbidden\n"); +++ return -EACCES; +++ } +++ +++ if (!BPF_MAP_PTR(aux->map_state)) +++ bpf_map_ptr_store(aux, meta->map_ptr, +++ meta->map_ptr->unpriv_array); +++ else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr) +++ bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, +++ meta->map_ptr->unpriv_array); +++ return 0; +++} +++ +++static int check_reference_leak(struct bpf_verifier_env *env) +++{ +++ struct bpf_func_state *state = cur_func(env); +++ int i; +++ +++ for (i = 0; i < state->acquired_refs; i++) { +++ verbose(env, "Unreleased reference id=%d alloc_insn=%d\n", +++ state->refs[i].id, state->refs[i].insn_idx); +++ } +++ return state->acquired_refs ? -EINVAL : 0; +++} +++ +++static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) ++ { ++- struct verifier_state *state = &env->cur_state; ++ const struct bpf_func_proto *fn = NULL; ++- struct reg_state *regs = state->regs; ++- struct bpf_map *map = NULL; ++- struct reg_state *reg; +++ struct bpf_reg_state *regs; +++ struct bpf_call_arg_meta meta; +++ bool changes_data; ++ int i, err; ++ ++ /* find function prototype */ ++ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) { ++- verbose("invalid func %d\n", func_id); +++ verbose(env, "invalid func %s#%d\n", func_id_name(func_id), +++ func_id); ++ return -EINVAL; ++ } ++ ++- if (env->prog->aux->ops->get_func_proto) ++- fn = env->prog->aux->ops->get_func_proto(func_id); ++- +++ if (env->ops->get_func_proto) +++ fn = env->ops->get_func_proto(func_id, env->prog); ++ if (!fn) { ++- verbose("unknown func %d\n", func_id); +++ verbose(env, "unknown func %s#%d\n", func_id_name(func_id), +++ func_id); ++ return -EINVAL; ++ } ++ ++ /* eBPF programs must be GPL compatible to use GPL-ed functions */ ++ if (!env->prog->gpl_compatible && fn->gpl_only) { ++- verbose("cannot call GPL only function from proprietary program\n"); +++ verbose(env, "cannot call GPL-restricted function from non-GPL compatible program\n"); +++ return -EINVAL; +++ } +++ +++ /* With LD_ABS/IND some JITs save/restore skb from r1. 
*/ +++ changes_data = bpf_helper_changes_pkt_data(fn->func); +++ if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { +++ verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", +++ func_id_name(func_id), func_id); ++ return -EINVAL; ++ } ++ +++ memset(&meta, 0, sizeof(meta)); +++ meta.pkt_access = fn->pkt_access; +++ +++ err = check_func_proto(fn, func_id); +++ if (err) { +++ verbose(env, "kernel subsystem misconfigured func %s#%d\n", +++ func_id_name(func_id), func_id); +++ return err; +++ } +++ +++ meta.func_id = func_id; ++ /* check args */ ++- err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map); +++ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); ++ if (err) ++ return err; ++- err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map); +++ err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); ++ if (err) ++ return err; ++- err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map); +++ err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); ++ if (err) ++ return err; ++- err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map); +++ err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &meta); ++ if (err) ++ return err; ++- err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map); +++ err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &meta); ++ if (err) ++ return err; ++ +++ err = record_func_map(env, &meta, func_id, insn_idx); +++ if (err) +++ return err; +++ +++ /* Mark slots with STACK_MISC in case of raw mode, stack offset +++ * is inferred from register state. +++ */ +++ for (i = 0; i < meta.access_size; i++) { +++ err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, +++ BPF_WRITE, -1, false); +++ if (err) +++ return err; +++ } +++ +++ if (func_id == BPF_FUNC_tail_call) { +++ err = check_reference_leak(env); +++ if (err) { +++ verbose(env, "tail_call would lead to reference leak\n"); +++ return err; +++ } +++ } else if (is_release_function(func_id)) { +++ err = release_reference(env, meta.ref_obj_id); +++ if (err) { +++ verbose(env, "func %s#%d reference has not been acquired before\n", +++ func_id_name(func_id), func_id); +++ return err; +++ } +++ } +++ +++ regs = cur_regs(env); +++ +++ /* check that flags argument in get_local_storage(map, flags) is 0, +++ * this is required because get_local_storage() can't return an error. +++ */ +++ if (func_id == BPF_FUNC_get_local_storage && +++ !register_is_null(®s[BPF_REG_2])) { +++ verbose(env, "get_local_storage() doesn't support non-zero flags\n"); +++ return -EINVAL; +++ } +++ ++ /* reset caller saved regs */ ++ for (i = 0; i < CALLER_SAVED_REGS; i++) { ++- reg = regs + caller_saved[i]; ++- reg->type = NOT_INIT; ++- reg->imm = 0; +++ mark_reg_not_init(env, regs, caller_saved[i]); +++ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); ++ } ++ ++- /* update return register */ +++ /* helper call returns 64-bit value. 
*/ +++ regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; +++ +++ /* update return register (already marked as written above) */ ++ if (fn->ret_type == RET_INTEGER) { ++- regs[BPF_REG_0].type = UNKNOWN_VALUE; +++ /* sets type to SCALAR_VALUE */ +++ mark_reg_unknown(env, regs, BPF_REG_0); ++ } else if (fn->ret_type == RET_VOID) { ++ regs[BPF_REG_0].type = NOT_INIT; ++- } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { ++- regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; +++ } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL || +++ fn->ret_type == RET_PTR_TO_MAP_VALUE) { +++ /* There is no offset yet applied, variable or fixed */ +++ mark_reg_known_zero(env, regs, BPF_REG_0); ++ /* remember map_ptr, so that check_map_access() ++ * can check 'value_size' boundary of memory access ++ * to map element returned from bpf_map_lookup_elem() ++ */ ++- if (map == NULL) { ++- verbose("kernel subsystem misconfigured verifier\n"); +++ if (meta.map_ptr == NULL) { +++ verbose(env, +++ "kernel subsystem misconfigured verifier\n"); ++ return -EINVAL; ++ } ++- regs[BPF_REG_0].map_ptr = map; +++ regs[BPF_REG_0].map_ptr = meta.map_ptr; +++ if (fn->ret_type == RET_PTR_TO_MAP_VALUE) { +++ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE; +++ if (map_value_has_spin_lock(meta.map_ptr)) +++ regs[BPF_REG_0].id = ++env->id_gen; +++ } else { +++ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; +++ regs[BPF_REG_0].id = ++env->id_gen; +++ } +++ } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { +++ mark_reg_known_zero(env, regs, BPF_REG_0); +++ regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; +++ regs[BPF_REG_0].id = ++env->id_gen; +++ } else if (fn->ret_type == RET_PTR_TO_SOCK_COMMON_OR_NULL) { +++ mark_reg_known_zero(env, regs, BPF_REG_0); +++ regs[BPF_REG_0].type = PTR_TO_SOCK_COMMON_OR_NULL; +++ regs[BPF_REG_0].id = ++env->id_gen; +++ } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { +++ mark_reg_known_zero(env, regs, BPF_REG_0); +++ regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; +++ regs[BPF_REG_0].id = ++env->id_gen; ++ } else { ++- verbose("unknown return type %d of func %d\n", ++- fn->ret_type, func_id); +++ verbose(env, "unknown return type %d of func %s#%d\n", +++ fn->ret_type, func_id_name(func_id), func_id); ++ return -EINVAL; ++ } ++ ++- err = check_map_func_compatibility(map, func_id); +++ if (is_ptr_cast_function(func_id)) { +++ /* For release_reference() */ +++ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; +++ } else if (is_acquire_function(func_id)) { +++ int id = acquire_reference_state(env, insn_idx); +++ +++ if (id < 0) +++ return id; +++ /* For mark_ptr_or_null_reg() */ +++ regs[BPF_REG_0].id = id; +++ /* For release_reference() */ +++ regs[BPF_REG_0].ref_obj_id = id; +++ } +++ +++ err = do_refine_retval_range(env, regs, fn->ret_type, func_id, &meta); +++ if (err) +++ return err; +++ +++ err = check_map_func_compatibility(env, meta.map_ptr, func_id); ++ if (err) ++ return err; ++ +++ if (func_id == BPF_FUNC_get_stack && !env->prog->has_callchain_buf) { +++ const char *err_str; +++ +++#ifdef CONFIG_PERF_EVENTS +++ err = get_callchain_buffers(sysctl_perf_event_max_stack); +++ err_str = "cannot get callchain buffer for func %s#%d\n"; +++#else +++ err = -ENOTSUPP; +++ err_str = "func %s#%d not supported without CONFIG_PERF_EVENTS\n"; +++#endif +++ if (err) { +++ verbose(env, err_str, func_id_name(func_id), func_id); +++ return err; +++ } +++ +++ env->prog->has_callchain_buf = true; +++ } +++ +++ if (changes_data) +++ clear_all_pkt_pointers(env); ++ return 0; ++ } ++ +++static bool 
signed_add_overflows(s64 a, s64 b) +++{ +++ /* Do the add in u64, where overflow is well-defined */ +++ s64 res = (s64)((u64)a + (u64)b); +++ +++ if (b < 0) +++ return res > a; +++ return res < a; +++} +++ +++static bool signed_sub_overflows(s64 a, s64 b) +++{ +++ /* Do the sub in u64, where overflow is well-defined */ +++ s64 res = (s64)((u64)a - (u64)b); +++ +++ if (b < 0) +++ return res < a; +++ return res > a; +++} +++ +++static bool check_reg_sane_offset(struct bpf_verifier_env *env, +++ const struct bpf_reg_state *reg, +++ enum bpf_reg_type type) +++{ +++ bool known = tnum_is_const(reg->var_off); +++ s64 val = reg->var_off.value; +++ s64 smin = reg->smin_value; +++ +++ if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { +++ verbose(env, "math between %s pointer and %lld is not allowed\n", +++ reg_type_str[type], val); +++ return false; +++ } +++ +++ if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { +++ verbose(env, "%s pointer offset %d is not allowed\n", +++ reg_type_str[type], reg->off); +++ return false; +++ } +++ +++ if (smin == S64_MIN) { +++ verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", +++ reg_type_str[type]); +++ return false; +++ } +++ +++ if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { +++ verbose(env, "value %lld makes %s pointer be out of bounds\n", +++ smin, reg_type_str[type]); +++ return false; +++ } +++ +++ return true; +++} +++ +++static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +++{ +++ return &env->insn_aux_data[env->insn_idx]; +++} +++ +++enum { +++ REASON_BOUNDS = -1, +++ REASON_TYPE = -2, +++ REASON_PATHS = -3, +++ REASON_LIMIT = -4, +++ REASON_STACK = -5, +++}; +++ +++static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, +++ u32 *alu_limit, bool mask_to_left) +++{ +++ u32 max = 0, ptr_limit = 0; +++ +++ switch (ptr_reg->type) { +++ case PTR_TO_STACK: +++ /* Offset 0 is out-of-bounds, but acceptable start for the +++ * left direction, see BPF_REG_FP. Also, unknown scalar +++ * offset where we would need to deal with min/max bounds is +++ * currently prohibited for unprivileged. +++ */ +++ max = MAX_BPF_STACK + mask_to_left; +++ ptr_limit = -(ptr_reg->var_off.value + ptr_reg->off); +++ break; +++ case PTR_TO_MAP_VALUE: +++ max = ptr_reg->map_ptr->value_size; +++ ptr_limit = (mask_to_left ? +++ ptr_reg->smin_value : +++ ptr_reg->umax_value) + ptr_reg->off; +++ break; +++ default: +++ return REASON_TYPE; +++ } +++ +++ if (ptr_limit >= max) +++ return REASON_LIMIT; +++ *alu_limit = ptr_limit; +++ return 0; +++} +++ +++static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, +++ const struct bpf_insn *insn) +++{ +++ return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; +++} +++ +++static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, +++ u32 alu_state, u32 alu_limit) +++{ +++ /* If we arrived here from different branches with different +++ * state or limits to sanitize, then this won't work. +++ */ +++ if (aux->alu_state && +++ (aux->alu_state != alu_state || +++ aux->alu_limit != alu_limit)) +++ return REASON_PATHS; +++ +++ /* Corresponding fixup done in fixup_bpf_calls(). 
*/ +++ aux->alu_state = alu_state; +++ aux->alu_limit = alu_limit; +++ return 0; +++} +++ +++static int sanitize_val_alu(struct bpf_verifier_env *env, +++ struct bpf_insn *insn) +++{ +++ struct bpf_insn_aux_data *aux = cur_aux(env); +++ +++ if (can_skip_alu_sanitation(env, insn)) +++ return 0; +++ +++ return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0); +++} +++ +++static bool sanitize_needed(u8 opcode) +++{ +++ return opcode == BPF_ADD || opcode == BPF_SUB; +++} +++ +++struct bpf_sanitize_info { +++ struct bpf_insn_aux_data aux; +++ bool mask_to_left; +++}; +++ +++static struct bpf_verifier_state * +++sanitize_speculative_path(struct bpf_verifier_env *env, +++ const struct bpf_insn *insn, +++ u32 next_idx, u32 curr_idx) +++{ +++ struct bpf_verifier_state *branch; +++ struct bpf_reg_state *regs; +++ +++ branch = push_stack(env, next_idx, curr_idx, true); +++ if (branch && insn) { +++ regs = branch->frame[branch->curframe]->regs; +++ if (BPF_SRC(insn->code) == BPF_K) { +++ mark_reg_unknown(env, regs, insn->dst_reg); +++ } else if (BPF_SRC(insn->code) == BPF_X) { +++ mark_reg_unknown(env, regs, insn->dst_reg); +++ mark_reg_unknown(env, regs, insn->src_reg); +++ } +++ } +++ return branch; +++} +++ +++static int sanitize_ptr_alu(struct bpf_verifier_env *env, +++ struct bpf_insn *insn, +++ const struct bpf_reg_state *ptr_reg, +++ const struct bpf_reg_state *off_reg, +++ struct bpf_reg_state *dst_reg, +++ struct bpf_sanitize_info *info, +++ const bool commit_window) +++{ +++ struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux; +++ struct bpf_verifier_state *vstate = env->cur_state; +++ bool off_is_imm = tnum_is_const(off_reg->var_off); +++ bool off_is_neg = off_reg->smin_value < 0; +++ bool ptr_is_dst_reg = ptr_reg == dst_reg; +++ u8 opcode = BPF_OP(insn->code); +++ u32 alu_state, alu_limit; +++ struct bpf_reg_state tmp; +++ bool ret; +++ int err; +++ +++ if (can_skip_alu_sanitation(env, insn)) +++ return 0; +++ +++ /* We already marked aux for masking from non-speculative +++ * paths, thus we got here in the first place. We only care +++ * to explore bad access from here. +++ */ +++ if (vstate->speculative) +++ goto do_sim; +++ +++ if (!commit_window) { +++ if (!tnum_is_const(off_reg->var_off) && +++ (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) +++ return REASON_BOUNDS; +++ +++ info->mask_to_left = (opcode == BPF_ADD && off_is_neg) || +++ (opcode == BPF_SUB && !off_is_neg); +++ } +++ +++ err = retrieve_ptr_limit(ptr_reg, &alu_limit, info->mask_to_left); +++ if (err < 0) +++ return err; +++ +++ if (commit_window) { +++ /* In commit phase we narrow the masking window based on +++ * the observed pointer move after the simulated operation. +++ */ +++ alu_state = info->aux.alu_state; +++ alu_limit = abs(info->aux.alu_limit - alu_limit); +++ } else { +++ alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; +++ alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; +++ alu_state |= ptr_is_dst_reg ? +++ BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; +++ } +++ +++ err = update_alu_sanitation_state(aux, alu_state, alu_limit); +++ if (err < 0) +++ return err; +++do_sim: +++ /* If we're in commit phase, we're done here given we already +++ * pushed the truncated dst_reg into the speculative verification +++ * stack. +++ * +++ * Also, when register is a known constant, we rewrite register-based +++ * operation to immediate-based, and thus do not need masking (and as +++ * a consequence, do not need to simulate the zero-truncation either). 
+++ */ +++ if (commit_window || off_is_imm) +++ return 0; +++ +++ /* Simulate and find potential out-of-bounds access under +++ * speculative execution from truncation as a result of +++ * masking when off was not within expected range. If off +++ * sits in dst, then we temporarily need to move ptr there +++ * to simulate dst (== 0) +/-= ptr. Needed, for example, +++ * for cases where we use K-based arithmetic in one direction +++ * and truncated reg-based in the other in order to explore +++ * bad access. +++ */ +++ if (!ptr_is_dst_reg) { +++ tmp = *dst_reg; +++ *dst_reg = *ptr_reg; +++ } +++ ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, +++ env->insn_idx); +++ if (!ptr_is_dst_reg && ret) +++ *dst_reg = tmp; +++ return !ret ? REASON_STACK : 0; +++} +++ +++static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) +++{ +++ struct bpf_verifier_state *vstate = env->cur_state; +++ +++ /* If we simulate paths under speculation, we don't update the +++ * insn as 'seen' such that when we verify unreachable paths in +++ * the non-speculative domain, sanitize_dead_code() can still +++ * rewrite/sanitize them. +++ */ +++ if (!vstate->speculative) +++ env->insn_aux_data[env->insn_idx].seen = true; +++} +++ +++static int sanitize_err(struct bpf_verifier_env *env, +++ const struct bpf_insn *insn, int reason, +++ const struct bpf_reg_state *off_reg, +++ const struct bpf_reg_state *dst_reg) +++{ +++ static const char *err = "pointer arithmetic with it prohibited for !root"; +++ const char *op = BPF_OP(insn->code) == BPF_ADD ? "add" : "sub"; +++ u32 dst = insn->dst_reg, src = insn->src_reg; +++ +++ switch (reason) { +++ case REASON_BOUNDS: +++ verbose(env, "R%d has unknown scalar with mixed signed bounds, %s\n", +++ off_reg == dst_reg ? dst : src, err); +++ break; +++ case REASON_TYPE: +++ verbose(env, "R%d has pointer with unsupported alu operation, %s\n", +++ off_reg == dst_reg ? src : dst, err); +++ break; +++ case REASON_PATHS: +++ verbose(env, "R%d tried to %s from different maps, paths or scalars, %s\n", +++ dst, op, err); +++ break; +++ case REASON_LIMIT: +++ verbose(env, "R%d tried to %s beyond pointer bounds, %s\n", +++ dst, op, err); +++ break; +++ case REASON_STACK: +++ verbose(env, "R%d could not be pushed for speculative verification, %s\n", +++ dst, err); +++ break; +++ default: +++ verbose(env, "verifier internal error: unknown reason (%d)\n", +++ reason); +++ break; +++ } +++ +++ return -EACCES; +++} +++ +++static int sanitize_check_bounds(struct bpf_verifier_env *env, +++ const struct bpf_insn *insn, +++ const struct bpf_reg_state *dst_reg) +++{ +++ u32 dst = insn->dst_reg; +++ +++ /* For unprivileged we require that resulting offset must be in bounds +++ * in order to be able to sanitize access later on. +++ */ +++ if (env->allow_ptr_leaks) +++ return 0; +++ +++ switch (dst_reg->type) { +++ case PTR_TO_STACK: +++ if (check_stack_access(env, dst_reg, dst_reg->off + +++ dst_reg->var_off.value, 1)) { +++ verbose(env, "R%d stack pointer arithmetic goes out of range, " +++ "prohibited for !root\n", dst); +++ return -EACCES; +++ } +++ break; +++ case PTR_TO_MAP_VALUE: +++ if (check_map_access(env, dst, dst_reg->off, 1, false)) { +++ verbose(env, "R%d pointer arithmetic of map value goes out of range, " +++ "prohibited for !root\n", dst); +++ return -EACCES; +++ } +++ break; +++ default: +++ break; +++ } +++ +++ return 0; +++} +++ +++/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. +++ * Caller should also handle BPF_MOV case separately. 
+++ * If we return -EACCES, caller may want to try again treating pointer as a +++ * scalar. So we only emit a diagnostic if !env->allow_ptr_leaks. +++ */ +++static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, +++ struct bpf_insn *insn, +++ const struct bpf_reg_state *ptr_reg, +++ const struct bpf_reg_state *off_reg) +++{ +++ struct bpf_verifier_state *vstate = env->cur_state; +++ struct bpf_func_state *state = vstate->frame[vstate->curframe]; +++ struct bpf_reg_state *regs = state->regs, *dst_reg; +++ bool known = tnum_is_const(off_reg->var_off); +++ s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, +++ smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; +++ u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, +++ umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; +++ struct bpf_sanitize_info info = {}; +++ u8 opcode = BPF_OP(insn->code); +++ u32 dst = insn->dst_reg; +++ int ret; +++ +++ dst_reg = ®s[dst]; +++ +++ if ((known && (smin_val != smax_val || umin_val != umax_val)) || +++ smin_val > smax_val || umin_val > umax_val) { +++ /* Taint dst register if offset had invalid bounds derived from +++ * e.g. dead branches. +++ */ +++ __mark_reg_unknown(env, dst_reg); +++ return 0; +++ } +++ +++ if (BPF_CLASS(insn->code) != BPF_ALU64) { +++ /* 32-bit ALU ops on pointers produce (meaningless) scalars */ +++ verbose(env, +++ "R%d 32-bit pointer arithmetic prohibited\n", +++ dst); +++ return -EACCES; +++ } +++ +++ switch (ptr_reg->type) { +++ case PTR_TO_MAP_VALUE_OR_NULL: +++ verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", +++ dst, reg_type_str[ptr_reg->type]); +++ return -EACCES; +++ case CONST_PTR_TO_MAP: +++ /* smin_val represents the known value */ +++ if (known && smin_val == 0 && opcode == BPF_ADD) +++ break; +++ /* fall-through */ +++ case PTR_TO_PACKET_END: +++ case PTR_TO_SOCKET: +++ case PTR_TO_SOCKET_OR_NULL: +++ case PTR_TO_SOCK_COMMON: +++ case PTR_TO_SOCK_COMMON_OR_NULL: +++ case PTR_TO_TCP_SOCK: +++ case PTR_TO_TCP_SOCK_OR_NULL: +++ case PTR_TO_XDP_SOCK: +++ verbose(env, "R%d pointer arithmetic on %s prohibited\n", +++ dst, reg_type_str[ptr_reg->type]); +++ return -EACCES; +++ default: +++ break; +++ } +++ +++ /* In case of 'scalar += pointer', dst_reg inherits pointer type and id. +++ * The id may be overwritten later if we create a new variable offset. +++ */ +++ dst_reg->type = ptr_reg->type; +++ dst_reg->id = ptr_reg->id; +++ +++ if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) || +++ !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) +++ return -EINVAL; +++ +++ if (sanitize_needed(opcode)) { +++ ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg, +++ &info, false); +++ if (ret < 0) +++ return sanitize_err(env, insn, ret, off_reg, dst_reg); +++ } +++ +++ switch (opcode) { +++ case BPF_ADD: +++ /* We can take a fixed offset as long as it doesn't overflow +++ * the s32 'off' field +++ */ +++ if (known && (ptr_reg->off + smin_val == +++ (s64)(s32)(ptr_reg->off + smin_val))) { +++ /* pointer += K. Accumulate it into fixed offset */ +++ dst_reg->smin_value = smin_ptr; +++ dst_reg->smax_value = smax_ptr; +++ dst_reg->umin_value = umin_ptr; +++ dst_reg->umax_value = umax_ptr; +++ dst_reg->var_off = ptr_reg->var_off; +++ dst_reg->off = ptr_reg->off + smin_val; +++ dst_reg->raw = ptr_reg->raw; +++ break; +++ } +++ /* A new variable offset is created. Note that off_reg->off +++ * == 0, since it's a scalar. 
+++ * dst_reg gets the pointer type and since some positive +++ * integer value was added to the pointer, give it a new 'id' +++ * if it's a PTR_TO_PACKET. +++ * this creates a new 'base' pointer, off_reg (variable) gets +++ * added into the variable offset, and we copy the fixed offset +++ * from ptr_reg. +++ */ +++ if (signed_add_overflows(smin_ptr, smin_val) || +++ signed_add_overflows(smax_ptr, smax_val)) { +++ dst_reg->smin_value = S64_MIN; +++ dst_reg->smax_value = S64_MAX; +++ } else { +++ dst_reg->smin_value = smin_ptr + smin_val; +++ dst_reg->smax_value = smax_ptr + smax_val; +++ } +++ if (umin_ptr + umin_val < umin_ptr || +++ umax_ptr + umax_val < umax_ptr) { +++ dst_reg->umin_value = 0; +++ dst_reg->umax_value = U64_MAX; +++ } else { +++ dst_reg->umin_value = umin_ptr + umin_val; +++ dst_reg->umax_value = umax_ptr + umax_val; +++ } +++ dst_reg->var_off = tnum_add(ptr_reg->var_off, off_reg->var_off); +++ dst_reg->off = ptr_reg->off; +++ dst_reg->raw = ptr_reg->raw; +++ if (reg_is_pkt_pointer(ptr_reg)) { +++ dst_reg->id = ++env->id_gen; +++ /* something was added to pkt_ptr, set range to zero */ +++ dst_reg->raw = 0; +++ } +++ break; +++ case BPF_SUB: +++ if (dst_reg == off_reg) { +++ /* scalar -= pointer. Creates an unknown scalar */ +++ verbose(env, "R%d tried to subtract pointer from scalar\n", +++ dst); +++ return -EACCES; +++ } +++ /* We don't allow subtraction from FP, because (according to +++ * test_verifier.c test "invalid fp arithmetic", JITs might not +++ * be able to deal with it. +++ */ +++ if (ptr_reg->type == PTR_TO_STACK) { +++ verbose(env, "R%d subtraction from stack pointer prohibited\n", +++ dst); +++ return -EACCES; +++ } +++ if (known && (ptr_reg->off - smin_val == +++ (s64)(s32)(ptr_reg->off - smin_val))) { +++ /* pointer -= K. Subtract it from fixed offset */ +++ dst_reg->smin_value = smin_ptr; +++ dst_reg->smax_value = smax_ptr; +++ dst_reg->umin_value = umin_ptr; +++ dst_reg->umax_value = umax_ptr; +++ dst_reg->var_off = ptr_reg->var_off; +++ dst_reg->id = ptr_reg->id; +++ dst_reg->off = ptr_reg->off - smin_val; +++ dst_reg->raw = ptr_reg->raw; +++ break; +++ } +++ /* A new variable offset is created. If the subtrahend is known +++ * nonnegative, then any reg->range we had before is still good. +++ */ +++ if (signed_sub_overflows(smin_ptr, smax_val) || +++ signed_sub_overflows(smax_ptr, smin_val)) { +++ /* Overflow possible, we know nothing */ +++ dst_reg->smin_value = S64_MIN; +++ dst_reg->smax_value = S64_MAX; +++ } else { +++ dst_reg->smin_value = smin_ptr - smax_val; +++ dst_reg->smax_value = smax_ptr - smin_val; +++ } +++ if (umin_ptr < umax_val) { +++ /* Overflow possible, we know nothing */ +++ dst_reg->umin_value = 0; +++ dst_reg->umax_value = U64_MAX; +++ } else { +++ /* Cannot overflow (as long as bounds are consistent) */ +++ dst_reg->umin_value = umin_ptr - umax_val; +++ dst_reg->umax_value = umax_ptr - umin_val; +++ } +++ dst_reg->var_off = tnum_sub(ptr_reg->var_off, off_reg->var_off); +++ dst_reg->off = ptr_reg->off; +++ dst_reg->raw = ptr_reg->raw; +++ if (reg_is_pkt_pointer(ptr_reg)) { +++ dst_reg->id = ++env->id_gen; +++ /* something was added to pkt_ptr, set range to zero */ +++ if (smin_val < 0) +++ dst_reg->raw = 0; +++ } +++ break; +++ case BPF_AND: +++ case BPF_OR: +++ case BPF_XOR: +++ /* bitwise ops on pointers are troublesome, prohibit. */ +++ verbose(env, "R%d bitwise operator %s on pointer prohibited\n", +++ dst, bpf_alu_string[opcode >> 4]); +++ return -EACCES; +++ default: +++ /* other operators (e.g. 
MUL,LSH) produce non-pointer results */ +++ verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", +++ dst, bpf_alu_string[opcode >> 4]); +++ return -EACCES; +++ } +++ +++ if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) +++ return -EINVAL; +++ +++ __update_reg_bounds(dst_reg); +++ __reg_deduce_bounds(dst_reg); +++ __reg_bound_offset(dst_reg); +++ +++ if (sanitize_check_bounds(env, insn, dst_reg) < 0) +++ return -EACCES; +++ if (sanitize_needed(opcode)) { +++ ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, +++ &info, true); +++ if (ret < 0) +++ return sanitize_err(env, insn, ret, off_reg, dst_reg); +++ } +++ +++ return 0; +++} +++ +++/* WARNING: This function does calculations on 64-bit values, but the actual +++ * execution may occur on 32-bit values. Therefore, things like bitshifts +++ * need extra checks in the 32-bit case. +++ */ +++static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, +++ struct bpf_insn *insn, +++ struct bpf_reg_state *dst_reg, +++ struct bpf_reg_state src_reg) +++{ +++ struct bpf_reg_state *regs = cur_regs(env); +++ u8 opcode = BPF_OP(insn->code); +++ bool src_known, dst_known; +++ s64 smin_val, smax_val; +++ u64 umin_val, umax_val; +++ u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; +++ int ret; +++ +++ if (insn_bitness == 32) { +++ /* Relevant for 32-bit RSH: Information can propagate towards +++ * LSB, so it isn't sufficient to only truncate the output to +++ * 32 bits. +++ */ +++ coerce_reg_to_size(dst_reg, 4); +++ coerce_reg_to_size(&src_reg, 4); +++ } +++ +++ smin_val = src_reg.smin_value; +++ smax_val = src_reg.smax_value; +++ umin_val = src_reg.umin_value; +++ umax_val = src_reg.umax_value; +++ src_known = tnum_is_const(src_reg.var_off); +++ dst_known = tnum_is_const(dst_reg->var_off); +++ +++ if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || +++ smin_val > smax_val || umin_val > umax_val) { +++ /* Taint dst register if offset had invalid bounds derived from +++ * e.g. dead branches. 
+++ */ +++ __mark_reg_unknown(env, dst_reg); +++ return 0; +++ } +++ +++ if (!src_known && +++ opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { +++ __mark_reg_unknown(env, dst_reg); +++ return 0; +++ } +++ +++ if (sanitize_needed(opcode)) { +++ ret = sanitize_val_alu(env, insn); +++ if (ret < 0) +++ return sanitize_err(env, insn, ret, NULL, NULL); +++ } +++ +++ switch (opcode) { +++ case BPF_ADD: +++ if (signed_add_overflows(dst_reg->smin_value, smin_val) || +++ signed_add_overflows(dst_reg->smax_value, smax_val)) { +++ dst_reg->smin_value = S64_MIN; +++ dst_reg->smax_value = S64_MAX; +++ } else { +++ dst_reg->smin_value += smin_val; +++ dst_reg->smax_value += smax_val; +++ } +++ if (dst_reg->umin_value + umin_val < umin_val || +++ dst_reg->umax_value + umax_val < umax_val) { +++ dst_reg->umin_value = 0; +++ dst_reg->umax_value = U64_MAX; +++ } else { +++ dst_reg->umin_value += umin_val; +++ dst_reg->umax_value += umax_val; +++ } +++ dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); +++ break; +++ case BPF_SUB: +++ if (signed_sub_overflows(dst_reg->smin_value, smax_val) || +++ signed_sub_overflows(dst_reg->smax_value, smin_val)) { +++ /* Overflow possible, we know nothing */ +++ dst_reg->smin_value = S64_MIN; +++ dst_reg->smax_value = S64_MAX; +++ } else { +++ dst_reg->smin_value -= smax_val; +++ dst_reg->smax_value -= smin_val; +++ } +++ if (dst_reg->umin_value < umax_val) { +++ /* Overflow possible, we know nothing */ +++ dst_reg->umin_value = 0; +++ dst_reg->umax_value = U64_MAX; +++ } else { +++ /* Cannot overflow (as long as bounds are consistent) */ +++ dst_reg->umin_value -= umax_val; +++ dst_reg->umax_value -= umin_val; +++ } +++ dst_reg->var_off = tnum_sub(dst_reg->var_off, src_reg.var_off); +++ break; +++ case BPF_MUL: +++ dst_reg->var_off = tnum_mul(dst_reg->var_off, src_reg.var_off); +++ if (smin_val < 0 || dst_reg->smin_value < 0) { +++ /* Ain't nobody got time to multiply that sign */ +++ __mark_reg_unbounded(dst_reg); +++ __update_reg_bounds(dst_reg); +++ break; +++ } +++ /* Both values are positive, so we can work with unsigned and +++ * copy the result to signed (unless it exceeds S64_MAX). +++ */ +++ if (umax_val > U32_MAX || dst_reg->umax_value > U32_MAX) { +++ /* Potential overflow, we know nothing */ +++ __mark_reg_unbounded(dst_reg); +++ /* (except what we can learn from the var_off) */ +++ __update_reg_bounds(dst_reg); +++ break; +++ } +++ dst_reg->umin_value *= umin_val; +++ dst_reg->umax_value *= umax_val; +++ if (dst_reg->umax_value > S64_MAX) { +++ /* Overflow possible, we know nothing */ +++ dst_reg->smin_value = S64_MIN; +++ dst_reg->smax_value = S64_MAX; +++ } else { +++ dst_reg->smin_value = dst_reg->umin_value; +++ dst_reg->smax_value = dst_reg->umax_value; +++ } +++ break; +++ case BPF_AND: +++ if (src_known && dst_known) { +++ __mark_reg_known(dst_reg, dst_reg->var_off.value & +++ src_reg.var_off.value); +++ break; +++ } +++ /* We get our minimum from the var_off, since that's inherently +++ * bitwise. Our maximum is the minimum of the operands' maxima. +++ */ +++ dst_reg->var_off = tnum_and(dst_reg->var_off, src_reg.var_off); +++ dst_reg->umin_value = dst_reg->var_off.value; +++ dst_reg->umax_value = min(dst_reg->umax_value, umax_val); +++ if (dst_reg->smin_value < 0 || smin_val < 0) { +++ /* Lose signed bounds when ANDing negative numbers, +++ * ain't nobody got time for that. 
+++ */ +++ dst_reg->smin_value = S64_MIN; +++ dst_reg->smax_value = S64_MAX; +++ } else { +++ /* ANDing two positives gives a positive, so safe to +++ * cast result into s64. +++ */ +++ dst_reg->smin_value = dst_reg->umin_value; +++ dst_reg->smax_value = dst_reg->umax_value; +++ } +++ /* We may learn something more from the var_off */ +++ __update_reg_bounds(dst_reg); +++ break; +++ case BPF_OR: +++ if (src_known && dst_known) { +++ __mark_reg_known(dst_reg, dst_reg->var_off.value | +++ src_reg.var_off.value); +++ break; +++ } +++ /* We get our maximum from the var_off, and our minimum is the +++ * maximum of the operands' minima +++ */ +++ dst_reg->var_off = tnum_or(dst_reg->var_off, src_reg.var_off); +++ dst_reg->umin_value = max(dst_reg->umin_value, umin_val); +++ dst_reg->umax_value = dst_reg->var_off.value | +++ dst_reg->var_off.mask; +++ if (dst_reg->smin_value < 0 || smin_val < 0) { +++ /* Lose signed bounds when ORing negative numbers, +++ * ain't nobody got time for that. +++ */ +++ dst_reg->smin_value = S64_MIN; +++ dst_reg->smax_value = S64_MAX; +++ } else { +++ /* ORing two positives gives a positive, so safe to +++ * cast result into s64. +++ */ +++ dst_reg->smin_value = dst_reg->umin_value; +++ dst_reg->smax_value = dst_reg->umax_value; +++ } +++ /* We may learn something more from the var_off */ +++ __update_reg_bounds(dst_reg); +++ break; +++ case BPF_LSH: +++ if (umax_val >= insn_bitness) { +++ /* Shifts greater than 31 or 63 are undefined. +++ * This includes shifts by a negative number. +++ */ +++ mark_reg_unknown(env, regs, insn->dst_reg); +++ break; +++ } +++ /* We lose all sign bit information (except what we can pick +++ * up from var_off) +++ */ +++ dst_reg->smin_value = S64_MIN; +++ dst_reg->smax_value = S64_MAX; +++ /* If we might shift our top bit out, then we know nothing */ +++ if (dst_reg->umax_value > 1ULL << (63 - umax_val)) { +++ dst_reg->umin_value = 0; +++ dst_reg->umax_value = U64_MAX; +++ } else { +++ dst_reg->umin_value <<= umin_val; +++ dst_reg->umax_value <<= umax_val; +++ } +++ dst_reg->var_off = tnum_lshift(dst_reg->var_off, umin_val); +++ /* We may learn something more from the var_off */ +++ __update_reg_bounds(dst_reg); +++ break; +++ case BPF_RSH: +++ if (umax_val >= insn_bitness) { +++ /* Shifts greater than 31 or 63 are undefined. +++ * This includes shifts by a negative number. +++ */ +++ mark_reg_unknown(env, regs, insn->dst_reg); +++ break; +++ } +++ /* BPF_RSH is an unsigned shift. If the value in dst_reg might +++ * be negative, then either: +++ * 1) src_reg might be zero, so the sign bit of the result is +++ * unknown, so we lose our signed bounds +++ * 2) it's known negative, thus the unsigned bounds capture the +++ * signed bounds +++ * 3) the signed bounds cross zero, so they tell us nothing +++ * about the result +++ * If the value in dst_reg is known nonnegative, then again the +++ * unsigned bounts capture the signed bounds. +++ * Thus, in all cases it suffices to blow away our signed bounds +++ * and rely on inferring new ones from the unsigned bounds and +++ * var_off of the result. +++ */ +++ dst_reg->smin_value = S64_MIN; +++ dst_reg->smax_value = S64_MAX; +++ dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); +++ dst_reg->umin_value >>= umax_val; +++ dst_reg->umax_value >>= umin_val; +++ /* We may learn something more from the var_off */ +++ __update_reg_bounds(dst_reg); +++ break; +++ case BPF_ARSH: +++ if (umax_val >= insn_bitness) { +++ /* Shifts greater than 31 or 63 are undefined. 
+++ * This includes shifts by a negative number. +++ */ +++ mark_reg_unknown(env, regs, insn->dst_reg); +++ break; +++ } +++ +++ /* Upon reaching here, src_known is true and +++ * umax_val is equal to umin_val. +++ */ +++ if (insn_bitness == 32) { +++ dst_reg->smin_value = (u32)(((s32)dst_reg->smin_value) >> umin_val); +++ dst_reg->smax_value = (u32)(((s32)dst_reg->smax_value) >> umin_val); +++ } else { +++ dst_reg->smin_value >>= umin_val; +++ dst_reg->smax_value >>= umin_val; +++ } +++ +++ dst_reg->var_off = tnum_arshift(dst_reg->var_off, umin_val, +++ insn_bitness); +++ +++ /* blow away the dst_reg umin_value/umax_value and rely on +++ * dst_reg var_off to refine the result. +++ */ +++ dst_reg->umin_value = 0; +++ dst_reg->umax_value = U64_MAX; +++ __update_reg_bounds(dst_reg); +++ break; +++ default: +++ mark_reg_unknown(env, regs, insn->dst_reg); +++ break; +++ } +++ +++ if (BPF_CLASS(insn->code) != BPF_ALU64) { +++ /* 32-bit ALU ops are (32,32)->32 */ +++ coerce_reg_to_size(dst_reg, 4); +++ } +++ +++ __reg_deduce_bounds(dst_reg); +++ __reg_bound_offset(dst_reg); +++ return 0; +++} +++ +++/* Handles ALU ops other than BPF_END, BPF_NEG and BPF_MOV: computes new min/max +++ * and var_off. +++ */ +++static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, +++ struct bpf_insn *insn) +++{ +++ struct bpf_verifier_state *vstate = env->cur_state; +++ struct bpf_func_state *state = vstate->frame[vstate->curframe]; +++ struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; +++ struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; +++ u8 opcode = BPF_OP(insn->code); +++ int err; +++ +++ dst_reg = ®s[insn->dst_reg]; +++ src_reg = NULL; +++ if (dst_reg->type != SCALAR_VALUE) +++ ptr_reg = dst_reg; +++ if (BPF_SRC(insn->code) == BPF_X) { +++ src_reg = ®s[insn->src_reg]; +++ if (src_reg->type != SCALAR_VALUE) { +++ if (dst_reg->type != SCALAR_VALUE) { +++ /* Combining two pointers by any ALU op yields +++ * an arbitrary scalar. Disallow all math except +++ * pointer subtraction +++ */ +++ if (opcode == BPF_SUB && env->allow_ptr_leaks) { +++ mark_reg_unknown(env, regs, insn->dst_reg); +++ return 0; +++ } +++ verbose(env, "R%d pointer %s pointer prohibited\n", +++ insn->dst_reg, +++ bpf_alu_string[opcode >> 4]); +++ return -EACCES; +++ } else { +++ /* scalar += pointer +++ * This is legal, but we have to reverse our +++ * src/dest handling in computing the range +++ */ +++ err = mark_chain_precision(env, insn->dst_reg); +++ if (err) +++ return err; +++ return adjust_ptr_min_max_vals(env, insn, +++ src_reg, dst_reg); +++ } +++ } else if (ptr_reg) { +++ /* pointer += scalar */ +++ err = mark_chain_precision(env, insn->src_reg); +++ if (err) +++ return err; +++ return adjust_ptr_min_max_vals(env, insn, +++ dst_reg, src_reg); +++ } +++ } else { +++ /* Pretend the src is a reg with a known value, since we only +++ * need to be able to read from this state. 
+++ */ +++ off_reg.type = SCALAR_VALUE; +++ __mark_reg_known(&off_reg, insn->imm); +++ src_reg = &off_reg; +++ if (ptr_reg) /* pointer += K */ +++ return adjust_ptr_min_max_vals(env, insn, +++ ptr_reg, src_reg); +++ } +++ +++ /* Got here implies adding two SCALAR_VALUEs */ +++ if (WARN_ON_ONCE(ptr_reg)) { +++ print_verifier_state(env, state); +++ verbose(env, "verifier internal error: unexpected ptr_reg\n"); +++ return -EINVAL; +++ } +++ if (WARN_ON(!src_reg)) { +++ print_verifier_state(env, state); +++ verbose(env, "verifier internal error: no src_reg\n"); +++ return -EINVAL; +++ } +++ return adjust_scalar_min_max_vals(env, insn, dst_reg, *src_reg); +++} +++ ++ /* check validity of 32-bit and 64-bit arithmetic operations */ ++-static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) +++static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) ++ { ++- struct reg_state *regs = env->cur_state.regs; +++ struct bpf_reg_state *regs = cur_regs(env); ++ u8 opcode = BPF_OP(insn->code); ++ int err; ++ ++@@ -1034,30 +5134,31 @@ static int check_alu_op(struct verifier_ ++ if (BPF_SRC(insn->code) != 0 || ++ insn->src_reg != BPF_REG_0 || ++ insn->off != 0 || insn->imm != 0) { ++- verbose("BPF_NEG uses reserved fields\n"); +++ verbose(env, "BPF_NEG uses reserved fields\n"); ++ return -EINVAL; ++ } ++ } else { ++ if (insn->src_reg != BPF_REG_0 || insn->off != 0 || ++- (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { ++- verbose("BPF_END uses reserved fields\n"); +++ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || +++ BPF_CLASS(insn->code) == BPF_ALU64) { +++ verbose(env, "BPF_END uses reserved fields\n"); ++ return -EINVAL; ++ } ++ } ++ ++ /* check src operand */ ++- err = check_reg_arg(regs, insn->dst_reg, SRC_OP); +++ err = check_reg_arg(env, insn->dst_reg, SRC_OP); ++ if (err) ++ return err; ++ ++ if (is_pointer_value(env, insn->dst_reg)) { ++- verbose("R%d pointer arithmetic prohibited\n", +++ verbose(env, "R%d pointer arithmetic prohibited\n", ++ insn->dst_reg); ++ return -EACCES; ++ } ++ ++ /* check dest operand */ ++- err = check_reg_arg(regs, insn->dst_reg, DST_OP); +++ err = check_reg_arg(env, insn->dst_reg, DST_OP); ++ if (err) ++ return err; ++ ++@@ -1065,81 +5166,100 @@ static int check_alu_op(struct verifier_ ++ ++ if (BPF_SRC(insn->code) == BPF_X) { ++ if (insn->imm != 0 || insn->off != 0) { ++- verbose("BPF_MOV uses reserved fields\n"); +++ verbose(env, "BPF_MOV uses reserved fields\n"); ++ return -EINVAL; ++ } ++ ++ /* check src operand */ ++- err = check_reg_arg(regs, insn->src_reg, SRC_OP); +++ err = check_reg_arg(env, insn->src_reg, SRC_OP); ++ if (err) ++ return err; ++ } else { ++ if (insn->src_reg != BPF_REG_0 || insn->off != 0) { ++- verbose("BPF_MOV uses reserved fields\n"); +++ verbose(env, "BPF_MOV uses reserved fields\n"); ++ return -EINVAL; ++ } ++ } ++ ++- /* check dest operand */ ++- err = check_reg_arg(regs, insn->dst_reg, DST_OP); +++ /* check dest operand, mark as required later */ +++ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); ++ if (err) ++ return err; ++ ++ if (BPF_SRC(insn->code) == BPF_X) { +++ struct bpf_reg_state *src_reg = regs + insn->src_reg; +++ struct bpf_reg_state *dst_reg = regs + insn->dst_reg; +++ ++ if (BPF_CLASS(insn->code) == BPF_ALU64) { ++ /* case: R1 = R2 ++ * copy register state to dest reg ++ */ ++- regs[insn->dst_reg] = regs[insn->src_reg]; +++ *dst_reg = *src_reg; +++ dst_reg->live |= REG_LIVE_WRITTEN; +++ dst_reg->subreg_def = DEF_NOT_SUBREG; ++ } else { +++ /* R1 = (u32) R2 
*/ ++ if (is_pointer_value(env, insn->src_reg)) { ++- verbose("R%d partial copy of pointer\n", +++ verbose(env, +++ "R%d partial copy of pointer\n", ++ insn->src_reg); ++ return -EACCES; +++ } else if (src_reg->type == SCALAR_VALUE) { +++ *dst_reg = *src_reg; +++ dst_reg->live |= REG_LIVE_WRITTEN; +++ dst_reg->subreg_def = env->insn_idx + 1; +++ } else { +++ mark_reg_unknown(env, regs, +++ insn->dst_reg); ++ } ++- regs[insn->dst_reg].type = UNKNOWN_VALUE; ++- regs[insn->dst_reg].map_ptr = NULL; +++ coerce_reg_to_size(dst_reg, 4); ++ } ++ } else { ++ /* case: R = imm ++ * remember the value we stored into this reg ++ */ ++- regs[insn->dst_reg].type = CONST_IMM; ++- regs[insn->dst_reg].imm = insn->imm; +++ /* clear any state __mark_reg_known doesn't set */ +++ mark_reg_unknown(env, regs, insn->dst_reg); +++ regs[insn->dst_reg].type = SCALAR_VALUE; +++ if (BPF_CLASS(insn->code) == BPF_ALU64) { +++ __mark_reg_known(regs + insn->dst_reg, +++ insn->imm); +++ } else { +++ __mark_reg_known(regs + insn->dst_reg, +++ (u32)insn->imm); +++ } ++ } ++ ++ } else if (opcode > BPF_END) { ++- verbose("invalid BPF_ALU opcode %x\n", opcode); +++ verbose(env, "invalid BPF_ALU opcode %x\n", opcode); ++ return -EINVAL; ++ ++ } else { /* all other ALU ops: and, sub, xor, add, ... */ ++ ++- bool stack_relative = false; ++- ++ if (BPF_SRC(insn->code) == BPF_X) { ++ if (insn->imm != 0 || insn->off != 0) { ++- verbose("BPF_ALU uses reserved fields\n"); +++ verbose(env, "BPF_ALU uses reserved fields\n"); ++ return -EINVAL; ++ } ++ /* check src1 operand */ ++- err = check_reg_arg(regs, insn->src_reg, SRC_OP); +++ err = check_reg_arg(env, insn->src_reg, SRC_OP); ++ if (err) ++ return err; ++ } else { ++ if (insn->src_reg != BPF_REG_0 || insn->off != 0) { ++- verbose("BPF_ALU uses reserved fields\n"); +++ verbose(env, "BPF_ALU uses reserved fields\n"); ++ return -EINVAL; ++ } ++ } ++ ++ /* check src2 operand */ ++- err = check_reg_arg(regs, insn->dst_reg, SRC_OP); +++ err = check_reg_arg(env, insn->dst_reg, SRC_OP); ++ if (err) ++ return err; ++ ++ if ((opcode == BPF_MOD || opcode == BPF_DIV) && ++ BPF_SRC(insn->code) == BPF_K && insn->imm == 0) { ++- verbose("div by zero\n"); +++ verbose(env, "div by zero\n"); ++ return -EINVAL; ++ } ++ ++@@ -1148,185 +5268,980 @@ static int check_alu_op(struct verifier_ ++ int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 
64 : 32; ++ ++ if (insn->imm < 0 || insn->imm >= size) { ++- verbose("invalid shift %d\n", insn->imm); +++ verbose(env, "invalid shift %d\n", insn->imm); ++ return -EINVAL; ++ } ++ } ++ ++- /* pattern match 'bpf_add Rx, imm' instruction */ ++- if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && ++- regs[insn->dst_reg].type == FRAME_PTR && ++- BPF_SRC(insn->code) == BPF_K) { ++- stack_relative = true; ++- } else if (is_pointer_value(env, insn->dst_reg)) { ++- verbose("R%d pointer arithmetic prohibited\n", ++- insn->dst_reg); ++- return -EACCES; ++- } else if (BPF_SRC(insn->code) == BPF_X && ++- is_pointer_value(env, insn->src_reg)) { ++- verbose("R%d pointer arithmetic prohibited\n", ++- insn->src_reg); ++- return -EACCES; ++- } ++- ++ /* check dest operand */ ++- err = check_reg_arg(regs, insn->dst_reg, DST_OP); +++ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); ++ if (err) ++ return err; ++ ++- if (stack_relative) { ++- regs[insn->dst_reg].type = PTR_TO_STACK; ++- regs[insn->dst_reg].imm = insn->imm; ++- } +++ return adjust_reg_min_max_vals(env, insn); ++ } ++ ++ return 0; ++ } ++ ++-static int check_cond_jmp_op(struct verifier_env *env, +++static void __find_good_pkt_pointers(struct bpf_func_state *state, +++ struct bpf_reg_state *dst_reg, +++ enum bpf_reg_type type, u16 new_range) +++{ +++ struct bpf_reg_state *reg; +++ int i; +++ +++ for (i = 0; i < MAX_BPF_REG; i++) { +++ reg = &state->regs[i]; +++ if (reg->type == type && reg->id == dst_reg->id) +++ /* keep the maximum range already checked */ +++ reg->range = max(reg->range, new_range); +++ } +++ +++ bpf_for_each_spilled_reg(i, state, reg) { +++ if (!reg) +++ continue; +++ if (reg->type == type && reg->id == dst_reg->id) +++ reg->range = max(reg->range, new_range); +++ } +++} +++ +++static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, +++ struct bpf_reg_state *dst_reg, +++ enum bpf_reg_type type, +++ bool range_right_open) +++{ +++ u16 new_range; +++ int i; +++ +++ if (dst_reg->off < 0 || +++ (dst_reg->off == 0 && range_right_open)) +++ /* This doesn't give us any range */ +++ return; +++ +++ if (dst_reg->umax_value > MAX_PACKET_OFF || +++ dst_reg->umax_value + dst_reg->off > MAX_PACKET_OFF) +++ /* Risk of overflow. For instance, ptr + (1<<63) may be less +++ * than pkt_end, but that's because it's also less than pkt. +++ */ +++ return; +++ +++ new_range = dst_reg->off; +++ if (range_right_open) +++ new_range--; +++ +++ /* Examples for register markings: +++ * +++ * pkt_data in dst register: +++ * +++ * r2 = r3; +++ * r2 += 8; +++ * if (r2 > pkt_end) goto +++ * +++ * +++ * r2 = r3; +++ * r2 += 8; +++ * if (r2 < pkt_end) goto +++ * +++ * +++ * Where: +++ * r2 == dst_reg, pkt_end == src_reg +++ * r2=pkt(id=n,off=8,r=0) +++ * r3=pkt(id=n,off=0,r=0) +++ * +++ * pkt_data in src register: +++ * +++ * r2 = r3; +++ * r2 += 8; +++ * if (pkt_end >= r2) goto +++ * +++ * +++ * r2 = r3; +++ * r2 += 8; +++ * if (pkt_end <= r2) goto +++ * +++ * +++ * Where: +++ * pkt_end == dst_reg, r2 == src_reg +++ * r2=pkt(id=n,off=8,r=0) +++ * r3=pkt(id=n,off=0,r=0) +++ * +++ * Find register r3 and mark its range as r3=pkt(id=n,off=0,r=8) +++ * or r3=pkt(id=n,off=0,r=8-1), so that range of bytes [r3, r3 + 8) +++ * and [r3, r3 + 8-1) respectively is safe to access depending on +++ * the check. +++ */ +++ +++ /* If our ids match, then we must have the same max_value. And we +++ * don't care about the other reg's fixed offset, since if it's too big +++ * the range won't allow anything. 
+++ * dst_reg->off is known < MAX_PACKET_OFF, therefore it fits in a u16. +++ */ +++ for (i = 0; i <= vstate->curframe; i++) +++ __find_good_pkt_pointers(vstate->frame[i], dst_reg, type, +++ new_range); +++} +++ +++/* compute branch direction of the expression "if (reg opcode val) goto target;" +++ * and return: +++ * 1 - branch will be taken and "goto target" will be executed +++ * 0 - branch will not be taken and fall-through to next insn +++ * -1 - unknown. Example: "if (reg < 5)" is unknown when register value range [0,10] +++ */ +++static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, +++ bool is_jmp32) +++{ +++ struct bpf_reg_state reg_lo; +++ s64 sval; +++ +++ if (__is_pointer_value(false, reg)) +++ return -1; +++ +++ if (is_jmp32) { +++ reg_lo = *reg; +++ reg = ®_lo; +++ /* For JMP32, only low 32 bits are compared, coerce_reg_to_size +++ * could truncate high bits and update umin/umax according to +++ * information of low bits. +++ */ +++ coerce_reg_to_size(reg, 4); +++ /* smin/smax need special handling. For example, after coerce, +++ * if smin_value is 0x00000000ffffffffLL, the value is -1 when +++ * used as operand to JMP32. It is a negative number from s32's +++ * point of view, while it is a positive number when seen as +++ * s64. The smin/smax are kept as s64, therefore, when used with +++ * JMP32, they need to be transformed into s32, then sign +++ * extended back to s64. +++ * +++ * Also, smin/smax were copied from umin/umax. If umin/umax has +++ * different sign bit, then min/max relationship doesn't +++ * maintain after casting into s32, for this case, set smin/smax +++ * to safest range. +++ */ +++ if ((reg->umax_value ^ reg->umin_value) & +++ (1ULL << 31)) { +++ reg->smin_value = S32_MIN; +++ reg->smax_value = S32_MAX; +++ } +++ reg->smin_value = (s64)(s32)reg->smin_value; +++ reg->smax_value = (s64)(s32)reg->smax_value; +++ +++ val = (u32)val; +++ sval = (s64)(s32)val; +++ } else { +++ sval = (s64)val; +++ } +++ +++ switch (opcode) { +++ case BPF_JEQ: +++ if (tnum_is_const(reg->var_off)) +++ return !!tnum_equals_const(reg->var_off, val); +++ break; +++ case BPF_JNE: +++ if (tnum_is_const(reg->var_off)) +++ return !tnum_equals_const(reg->var_off, val); +++ break; +++ case BPF_JSET: +++ if ((~reg->var_off.mask & reg->var_off.value) & val) +++ return 1; +++ if (!((reg->var_off.mask | reg->var_off.value) & val)) +++ return 0; +++ break; +++ case BPF_JGT: +++ if (reg->umin_value > val) +++ return 1; +++ else if (reg->umax_value <= val) +++ return 0; +++ break; +++ case BPF_JSGT: +++ if (reg->smin_value > sval) +++ return 1; +++ else if (reg->smax_value < sval) +++ return 0; +++ break; +++ case BPF_JLT: +++ if (reg->umax_value < val) +++ return 1; +++ else if (reg->umin_value >= val) +++ return 0; +++ break; +++ case BPF_JSLT: +++ if (reg->smax_value < sval) +++ return 1; +++ else if (reg->smin_value >= sval) +++ return 0; +++ break; +++ case BPF_JGE: +++ if (reg->umin_value >= val) +++ return 1; +++ else if (reg->umax_value < val) +++ return 0; +++ break; +++ case BPF_JSGE: +++ if (reg->smin_value >= sval) +++ return 1; +++ else if (reg->smax_value < sval) +++ return 0; +++ break; +++ case BPF_JLE: +++ if (reg->umax_value <= val) +++ return 1; +++ else if (reg->umin_value > val) +++ return 0; +++ break; +++ case BPF_JSLE: +++ if (reg->smax_value <= sval) +++ return 1; +++ else if (reg->smin_value > sval) +++ return 0; +++ break; +++ } +++ +++ return -1; +++} +++ +++/* Generate min value of the high 32-bit from TNUM info. 
*/ +++static u64 gen_hi_min(struct tnum var) +++{ +++ return var.value & ~0xffffffffULL; +++} +++ +++/* Generate max value of the high 32-bit from TNUM info. */ +++static u64 gen_hi_max(struct tnum var) +++{ +++ return (var.value | var.mask) & ~0xffffffffULL; +++} +++ +++/* Return true if VAL is compared with a s64 sign extended from s32, and they +++ * are with the same signedness. +++ */ +++static bool cmp_val_with_extended_s64(s64 sval, struct bpf_reg_state *reg) +++{ +++ return ((s32)sval >= 0 && +++ reg->smin_value >= 0 && reg->smax_value <= S32_MAX) || +++ ((s32)sval < 0 && +++ reg->smax_value <= 0 && reg->smin_value >= S32_MIN); +++} +++ +++/* Constrain the possible values of @reg with unsigned upper bound @bound. +++ * If @is_exclusive, @bound is an exclusive limit, otherwise it is inclusive. +++ * If @is_jmp32, @bound is a 32-bit value that only constrains the low 32 bits +++ * of @reg. +++ */ +++static void set_upper_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32, +++ bool is_exclusive) +++{ +++ if (is_exclusive) { +++ /* There are no values for `reg` that make `reg<0` true. */ +++ if (bound == 0) +++ return; +++ bound--; +++ } +++ if (is_jmp32) { +++ /* Constrain the register's value in the tnum representation. +++ * For 64-bit comparisons this happens later in +++ * __reg_bound_offset(), but for 32-bit comparisons, we can be +++ * more precise than what can be derived from the updated +++ * numeric bounds. +++ */ +++ struct tnum t = tnum_range(0, bound); +++ +++ t.mask |= ~0xffffffffULL; /* upper half is unknown */ +++ reg->var_off = tnum_intersect(reg->var_off, t); +++ +++ /* Compute the 64-bit bound from the 32-bit bound. */ +++ bound += gen_hi_max(reg->var_off); +++ } +++ reg->umax_value = min(reg->umax_value, bound); +++} +++ +++/* Constrain the possible values of @reg with unsigned lower bound @bound. +++ * If @is_exclusive, @bound is an exclusive limit, otherwise it is inclusive. +++ * If @is_jmp32, @bound is a 32-bit value that only constrains the low 32 bits +++ * of @reg. +++ */ +++static void set_lower_bound(struct bpf_reg_state *reg, u64 bound, bool is_jmp32, +++ bool is_exclusive) +++{ +++ if (is_exclusive) { +++ /* There are no values for `reg` that make `reg>MAX` true. */ +++ if (bound == (is_jmp32 ? U32_MAX : U64_MAX)) +++ return; +++ bound++; +++ } +++ if (is_jmp32) { +++ /* Constrain the register's value in the tnum representation. +++ * For 64-bit comparisons this happens later in +++ * __reg_bound_offset(), but for 32-bit comparisons, we can be +++ * more precise than what can be derived from the updated +++ * numeric bounds. +++ */ +++ struct tnum t = tnum_range(bound, U32_MAX); +++ +++ t.mask |= ~0xffffffffULL; /* upper half is unknown */ +++ reg->var_off = tnum_intersect(reg->var_off, t); +++ +++ /* Compute the 64-bit bound from the 32-bit bound. */ +++ bound += gen_hi_min(reg->var_off); +++ } +++ reg->umin_value = max(reg->umin_value, bound); +++} +++ +++/* Adjusts the register min/max values in the case that the dst_reg is the +++ * variable register that we are working on, and src_reg is a constant or we're +++ * simply doing a BPF_K check. +++ * In JEQ/JNE cases we also adjust the var_off values. 
+++ */ +++static void reg_set_min_max(struct bpf_reg_state *true_reg, +++ struct bpf_reg_state *false_reg, u64 val, +++ u8 opcode, bool is_jmp32) +++{ +++ s64 sval; +++ +++ /* If the dst_reg is a pointer, we can't learn anything about its +++ * variable offset from the compare (unless src_reg were a pointer into +++ * the same object, but we don't bother with that. +++ * Since false_reg and true_reg have the same type by construction, we +++ * only need to check one of them for pointerness. +++ */ +++ if (__is_pointer_value(false, false_reg)) +++ return; +++ +++ val = is_jmp32 ? (u32)val : val; +++ sval = is_jmp32 ? (s64)(s32)val : (s64)val; +++ +++ switch (opcode) { +++ case BPF_JEQ: +++ case BPF_JNE: +++ { +++ struct bpf_reg_state *reg = +++ opcode == BPF_JEQ ? true_reg : false_reg; +++ +++ /* For BPF_JEQ, if this is false we know nothing Jon Snow, but +++ * if it is true we know the value for sure. Likewise for +++ * BPF_JNE. +++ */ +++ if (is_jmp32) { +++ u64 old_v = reg->var_off.value; +++ u64 hi_mask = ~0xffffffffULL; +++ +++ reg->var_off.value = (old_v & hi_mask) | val; +++ reg->var_off.mask &= hi_mask; +++ } else { +++ __mark_reg_known(reg, val); +++ } +++ break; +++ } +++ case BPF_JSET: +++ false_reg->var_off = tnum_and(false_reg->var_off, +++ tnum_const(~val)); +++ if (is_power_of_2(val)) +++ true_reg->var_off = tnum_or(true_reg->var_off, +++ tnum_const(val)); +++ break; +++ case BPF_JGE: +++ case BPF_JGT: +++ { +++ set_upper_bound(false_reg, val, is_jmp32, opcode == BPF_JGE); +++ set_lower_bound(true_reg, val, is_jmp32, opcode == BPF_JGT); +++ break; +++ } +++ case BPF_JSGE: +++ case BPF_JSGT: +++ { +++ s64 false_smax = opcode == BPF_JSGT ? sval : sval - 1; +++ s64 true_smin = opcode == BPF_JSGT ? sval + 1 : sval; +++ +++ /* If the full s64 was not sign-extended from s32 then don't +++ * deduct further info. +++ */ +++ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) +++ break; +++ false_reg->smax_value = min(false_reg->smax_value, false_smax); +++ true_reg->smin_value = max(true_reg->smin_value, true_smin); +++ break; +++ } +++ case BPF_JLE: +++ case BPF_JLT: +++ { +++ set_lower_bound(false_reg, val, is_jmp32, opcode == BPF_JLE); +++ set_upper_bound(true_reg, val, is_jmp32, opcode == BPF_JLT); +++ break; +++ } +++ case BPF_JSLE: +++ case BPF_JSLT: +++ { +++ s64 false_smin = opcode == BPF_JSLT ? sval : sval + 1; +++ s64 true_smax = opcode == BPF_JSLT ? sval - 1 : sval; +++ +++ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) +++ break; +++ false_reg->smin_value = max(false_reg->smin_value, false_smin); +++ true_reg->smax_value = min(true_reg->smax_value, true_smax); +++ break; +++ } +++ default: +++ break; +++ } +++ +++ __reg_deduce_bounds(false_reg); +++ __reg_deduce_bounds(true_reg); +++ /* We might have learned some bits from the bounds. */ +++ __reg_bound_offset(false_reg); +++ __reg_bound_offset(true_reg); +++ /* Intersecting with the old var_off might have improved our bounds +++ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), +++ * then new var_off is (0; 0x7f...fc) which improves our umax. +++ */ +++ __update_reg_bounds(false_reg); +++ __update_reg_bounds(true_reg); +++} +++ +++/* Same as above, but for the case that dst_reg holds a constant and src_reg is +++ * the variable reg. 
+++ */ +++static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, +++ struct bpf_reg_state *false_reg, u64 val, +++ u8 opcode, bool is_jmp32) +++{ +++ s64 sval; +++ +++ if (__is_pointer_value(false, false_reg)) +++ return; +++ +++ val = is_jmp32 ? (u32)val : val; +++ sval = is_jmp32 ? (s64)(s32)val : (s64)val; +++ +++ switch (opcode) { +++ case BPF_JEQ: +++ case BPF_JNE: +++ { +++ struct bpf_reg_state *reg = +++ opcode == BPF_JEQ ? true_reg : false_reg; +++ +++ if (is_jmp32) { +++ u64 old_v = reg->var_off.value; +++ u64 hi_mask = ~0xffffffffULL; +++ +++ reg->var_off.value = (old_v & hi_mask) | val; +++ reg->var_off.mask &= hi_mask; +++ } else { +++ __mark_reg_known(reg, val); +++ } +++ break; +++ } +++ case BPF_JSET: +++ false_reg->var_off = tnum_and(false_reg->var_off, +++ tnum_const(~val)); +++ if (is_power_of_2(val)) +++ true_reg->var_off = tnum_or(true_reg->var_off, +++ tnum_const(val)); +++ break; +++ case BPF_JGE: +++ case BPF_JGT: +++ { +++ set_lower_bound(false_reg, val, is_jmp32, opcode == BPF_JGE); +++ set_upper_bound(true_reg, val, is_jmp32, opcode == BPF_JGT); +++ break; +++ } +++ case BPF_JSGE: +++ case BPF_JSGT: +++ { +++ s64 false_smin = opcode == BPF_JSGT ? sval : sval + 1; +++ s64 true_smax = opcode == BPF_JSGT ? sval - 1 : sval; +++ +++ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) +++ break; +++ false_reg->smin_value = max(false_reg->smin_value, false_smin); +++ true_reg->smax_value = min(true_reg->smax_value, true_smax); +++ break; +++ } +++ case BPF_JLE: +++ case BPF_JLT: +++ { +++ set_upper_bound(false_reg, val, is_jmp32, opcode == BPF_JLE); +++ set_lower_bound(true_reg, val, is_jmp32, opcode == BPF_JLT); +++ break; +++ } +++ case BPF_JSLE: +++ case BPF_JSLT: +++ { +++ s64 false_smax = opcode == BPF_JSLT ? sval : sval - 1; +++ s64 true_smin = opcode == BPF_JSLT ? sval + 1 : sval; +++ +++ if (is_jmp32 && !cmp_val_with_extended_s64(sval, false_reg)) +++ break; +++ false_reg->smax_value = min(false_reg->smax_value, false_smax); +++ true_reg->smin_value = max(true_reg->smin_value, true_smin); +++ break; +++ } +++ default: +++ break; +++ } +++ +++ __reg_deduce_bounds(false_reg); +++ __reg_deduce_bounds(true_reg); +++ /* We might have learned some bits from the bounds. */ +++ __reg_bound_offset(false_reg); +++ __reg_bound_offset(true_reg); +++ /* Intersecting with the old var_off might have improved our bounds +++ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), +++ * then new var_off is (0; 0x7f...fc) which improves our umax. +++ */ +++ __update_reg_bounds(false_reg); +++ __update_reg_bounds(true_reg); +++} +++ +++/* Regs are known to be equal, so intersect their min/max/var_off */ +++static void __reg_combine_min_max(struct bpf_reg_state *src_reg, +++ struct bpf_reg_state *dst_reg) +++{ +++ src_reg->umin_value = dst_reg->umin_value = max(src_reg->umin_value, +++ dst_reg->umin_value); +++ src_reg->umax_value = dst_reg->umax_value = min(src_reg->umax_value, +++ dst_reg->umax_value); +++ src_reg->smin_value = dst_reg->smin_value = max(src_reg->smin_value, +++ dst_reg->smin_value); +++ src_reg->smax_value = dst_reg->smax_value = min(src_reg->smax_value, +++ dst_reg->smax_value); +++ src_reg->var_off = dst_reg->var_off = tnum_intersect(src_reg->var_off, +++ dst_reg->var_off); +++ /* We might have learned new bounds from the var_off. */ +++ __update_reg_bounds(src_reg); +++ __update_reg_bounds(dst_reg); +++ /* We might have learned something about the sign bit. 
*/ +++ __reg_deduce_bounds(src_reg); +++ __reg_deduce_bounds(dst_reg); +++ /* We might have learned some bits from the bounds. */ +++ __reg_bound_offset(src_reg); +++ __reg_bound_offset(dst_reg); +++ /* Intersecting with the old var_off might have improved our bounds +++ * slightly. e.g. if umax was 0x7f...f and var_off was (0; 0xf...fc), +++ * then new var_off is (0; 0x7f...fc) which improves our umax. +++ */ +++ __update_reg_bounds(src_reg); +++ __update_reg_bounds(dst_reg); +++} +++ +++static void reg_combine_min_max(struct bpf_reg_state *true_src, +++ struct bpf_reg_state *true_dst, +++ struct bpf_reg_state *false_src, +++ struct bpf_reg_state *false_dst, +++ u8 opcode) +++{ +++ switch (opcode) { +++ case BPF_JEQ: +++ __reg_combine_min_max(true_src, true_dst); +++ break; +++ case BPF_JNE: +++ __reg_combine_min_max(false_src, false_dst); +++ break; +++ } +++} +++ +++static void mark_ptr_or_null_reg(struct bpf_func_state *state, +++ struct bpf_reg_state *reg, u32 id, +++ bool is_null) +++{ +++ if (reg_type_may_be_null(reg->type) && reg->id == id) { +++ /* Old offset (both fixed and variable parts) should +++ * have been known-zero, because we don't allow pointer +++ * arithmetic on pointers that might be NULL. +++ */ +++ if (WARN_ON_ONCE(reg->smin_value || reg->smax_value || +++ !tnum_equals_const(reg->var_off, 0) || +++ reg->off)) { +++ __mark_reg_known_zero(reg); +++ reg->off = 0; +++ } +++ if (is_null) { +++ reg->type = SCALAR_VALUE; +++ } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { +++ if (reg->map_ptr->inner_map_meta) { +++ reg->type = CONST_PTR_TO_MAP; +++ reg->map_ptr = reg->map_ptr->inner_map_meta; +++ } else if (reg->map_ptr->map_type == +++ BPF_MAP_TYPE_XSKMAP) { +++ reg->type = PTR_TO_XDP_SOCK; +++ } else { +++ reg->type = PTR_TO_MAP_VALUE; +++ } +++ } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { +++ reg->type = PTR_TO_SOCKET; +++ } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { +++ reg->type = PTR_TO_SOCK_COMMON; +++ } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { +++ reg->type = PTR_TO_TCP_SOCK; +++ } +++ if (is_null) { +++ /* We don't need id and ref_obj_id from this point +++ * onwards anymore, thus we should better reset it, +++ * so that state pruning has chances to take effect. +++ */ +++ reg->id = 0; +++ reg->ref_obj_id = 0; +++ } else if (!reg_may_point_to_spin_lock(reg)) { +++ /* For not-NULL ptr, reg->ref_obj_id will be reset +++ * in release_reg_references(). +++ * +++ * reg->id is still used by spin_lock ptr. Other +++ * than spin_lock ptr type, reg->id can be reset. +++ */ +++ reg->id = 0; +++ } +++ } +++} +++ +++static void __mark_ptr_or_null_regs(struct bpf_func_state *state, u32 id, +++ bool is_null) +++{ +++ struct bpf_reg_state *reg; +++ int i; +++ +++ for (i = 0; i < MAX_BPF_REG; i++) +++ mark_ptr_or_null_reg(state, &state->regs[i], id, is_null); +++ +++ bpf_for_each_spilled_reg(i, state, reg) { +++ if (!reg) +++ continue; +++ mark_ptr_or_null_reg(state, reg, id, is_null); +++ } +++} +++ +++/* The logic is similar to find_good_pkt_pointers(), both could eventually +++ * be folded together at some point. +++ */ +++static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, +++ bool is_null) +++{ +++ struct bpf_func_state *state = vstate->frame[vstate->curframe]; +++ struct bpf_reg_state *regs = state->regs; +++ u32 ref_obj_id = regs[regno].ref_obj_id; +++ u32 id = regs[regno].id; +++ int i; +++ +++ if (ref_obj_id && ref_obj_id == id && is_null) +++ /* regs[regno] is in the " == NULL" branch. 
+++ * No one could have freed the reference state before +++ * doing the NULL check. +++ */ +++ WARN_ON_ONCE(release_reference_state(state, id)); +++ +++ for (i = 0; i <= vstate->curframe; i++) +++ __mark_ptr_or_null_regs(vstate->frame[i], id, is_null); +++} +++ +++static bool try_match_pkt_pointers(const struct bpf_insn *insn, +++ struct bpf_reg_state *dst_reg, +++ struct bpf_reg_state *src_reg, +++ struct bpf_verifier_state *this_branch, +++ struct bpf_verifier_state *other_branch) +++{ +++ if (BPF_SRC(insn->code) != BPF_X) +++ return false; +++ +++ /* Pointers are always 64-bit. */ +++ if (BPF_CLASS(insn->code) == BPF_JMP32) +++ return false; +++ +++ switch (BPF_OP(insn->code)) { +++ case BPF_JGT: +++ if ((dst_reg->type == PTR_TO_PACKET && +++ src_reg->type == PTR_TO_PACKET_END) || +++ (dst_reg->type == PTR_TO_PACKET_META && +++ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { +++ /* pkt_data' > pkt_end, pkt_meta' > pkt_data */ +++ find_good_pkt_pointers(this_branch, dst_reg, +++ dst_reg->type, false); +++ } else if ((dst_reg->type == PTR_TO_PACKET_END && +++ src_reg->type == PTR_TO_PACKET) || +++ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && +++ src_reg->type == PTR_TO_PACKET_META)) { +++ /* pkt_end > pkt_data', pkt_data > pkt_meta' */ +++ find_good_pkt_pointers(other_branch, src_reg, +++ src_reg->type, true); +++ } else { +++ return false; +++ } +++ break; +++ case BPF_JLT: +++ if ((dst_reg->type == PTR_TO_PACKET && +++ src_reg->type == PTR_TO_PACKET_END) || +++ (dst_reg->type == PTR_TO_PACKET_META && +++ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { +++ /* pkt_data' < pkt_end, pkt_meta' < pkt_data */ +++ find_good_pkt_pointers(other_branch, dst_reg, +++ dst_reg->type, true); +++ } else if ((dst_reg->type == PTR_TO_PACKET_END && +++ src_reg->type == PTR_TO_PACKET) || +++ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && +++ src_reg->type == PTR_TO_PACKET_META)) { +++ /* pkt_end < pkt_data', pkt_data > pkt_meta' */ +++ find_good_pkt_pointers(this_branch, src_reg, +++ src_reg->type, false); +++ } else { +++ return false; +++ } +++ break; +++ case BPF_JGE: +++ if ((dst_reg->type == PTR_TO_PACKET && +++ src_reg->type == PTR_TO_PACKET_END) || +++ (dst_reg->type == PTR_TO_PACKET_META && +++ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { +++ /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ +++ find_good_pkt_pointers(this_branch, dst_reg, +++ dst_reg->type, true); +++ } else if ((dst_reg->type == PTR_TO_PACKET_END && +++ src_reg->type == PTR_TO_PACKET) || +++ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && +++ src_reg->type == PTR_TO_PACKET_META)) { +++ /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ +++ find_good_pkt_pointers(other_branch, src_reg, +++ src_reg->type, false); +++ } else { +++ return false; +++ } +++ break; +++ case BPF_JLE: +++ if ((dst_reg->type == PTR_TO_PACKET && +++ src_reg->type == PTR_TO_PACKET_END) || +++ (dst_reg->type == PTR_TO_PACKET_META && +++ reg_is_init_pkt_pointer(src_reg, PTR_TO_PACKET))) { +++ /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ +++ find_good_pkt_pointers(other_branch, dst_reg, +++ dst_reg->type, false); +++ } else if ((dst_reg->type == PTR_TO_PACKET_END && +++ src_reg->type == PTR_TO_PACKET) || +++ (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && +++ src_reg->type == PTR_TO_PACKET_META)) { +++ /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ +++ find_good_pkt_pointers(this_branch, src_reg, +++ src_reg->type, true); +++ } else { +++ return false; +++ } +++ break; +++ default: +++ return false; +++ } +++ +++ 
return true; +++} +++ +++static int check_cond_jmp_op(struct bpf_verifier_env *env, ++ struct bpf_insn *insn, int *insn_idx) ++ { ++- struct reg_state *regs = env->cur_state.regs; ++- struct verifier_state *other_branch; +++ struct bpf_verifier_state *this_branch = env->cur_state; +++ struct bpf_verifier_state *other_branch; +++ struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; +++ struct bpf_reg_state *dst_reg, *other_branch_regs, *src_reg = NULL; ++ u8 opcode = BPF_OP(insn->code); +++ bool is_jmp32; +++ int pred = -1; ++ int err; ++ ++- if (opcode > BPF_EXIT) { ++- verbose("invalid BPF_JMP opcode %x\n", opcode); +++ /* Only conditional jumps are expected to reach here. */ +++ if (opcode == BPF_JA || opcode > BPF_JSLE) { +++ verbose(env, "invalid BPF_JMP/JMP32 opcode %x\n", opcode); ++ return -EINVAL; ++ } ++ ++ if (BPF_SRC(insn->code) == BPF_X) { ++ if (insn->imm != 0) { ++- verbose("BPF_JMP uses reserved fields\n"); +++ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); ++ return -EINVAL; ++ } ++ ++ /* check src1 operand */ ++- err = check_reg_arg(regs, insn->src_reg, SRC_OP); +++ err = check_reg_arg(env, insn->src_reg, SRC_OP); ++ if (err) ++ return err; ++ ++ if (is_pointer_value(env, insn->src_reg)) { ++- verbose("R%d pointer comparison prohibited\n", +++ verbose(env, "R%d pointer comparison prohibited\n", ++ insn->src_reg); ++ return -EACCES; ++ } +++ src_reg = ®s[insn->src_reg]; ++ } else { ++ if (insn->src_reg != BPF_REG_0) { ++- verbose("BPF_JMP uses reserved fields\n"); +++ verbose(env, "BPF_JMP/JMP32 uses reserved fields\n"); ++ return -EINVAL; ++ } ++ } ++ ++ /* check src2 operand */ ++- err = check_reg_arg(regs, insn->dst_reg, SRC_OP); +++ err = check_reg_arg(env, insn->dst_reg, SRC_OP); ++ if (err) ++ return err; ++ ++- /* detect if R == 0 where R was initialized to zero earlier */ ++- if (BPF_SRC(insn->code) == BPF_K && ++- (opcode == BPF_JEQ || opcode == BPF_JNE) && ++- regs[insn->dst_reg].type == CONST_IMM && ++- regs[insn->dst_reg].imm == insn->imm) { ++- if (opcode == BPF_JEQ) { ++- /* if (imm == imm) goto pc+off; ++- * only follow the goto, ignore fall-through ++- */ ++- *insn_idx += insn->off; ++- return 0; ++- } else { ++- /* if (imm != imm) goto pc+off; ++- * only follow fall-through branch, since ++- * that's where the program will go ++- */ ++- return 0; ++- } +++ dst_reg = ®s[insn->dst_reg]; +++ is_jmp32 = BPF_CLASS(insn->code) == BPF_JMP32; +++ +++ if (BPF_SRC(insn->code) == BPF_K) +++ pred = is_branch_taken(dst_reg, insn->imm, +++ opcode, is_jmp32); +++ else if (src_reg->type == SCALAR_VALUE && +++ tnum_is_const(src_reg->var_off)) +++ pred = is_branch_taken(dst_reg, src_reg->var_off.value, +++ opcode, is_jmp32); +++ if (pred >= 0) { +++ err = mark_chain_precision(env, insn->dst_reg); +++ if (BPF_SRC(insn->code) == BPF_X && !err) +++ err = mark_chain_precision(env, insn->src_reg); +++ if (err) +++ return err; +++ } +++ +++ if (pred == 1) { +++ /* Only follow the goto, ignore fall-through. If needed, push +++ * the fall-through branch for simulation under speculative +++ * execution. +++ */ +++ if (!env->allow_ptr_leaks && +++ !sanitize_speculative_path(env, insn, *insn_idx + 1, +++ *insn_idx)) +++ return -EFAULT; +++ *insn_idx += insn->off; +++ return 0; +++ } else if (pred == 0) { +++ /* Only follow the fall-through branch, since that's where the +++ * program will go. If needed, push the goto branch for +++ * simulation under speculative execution. 
+++ */ +++ if (!env->allow_ptr_leaks && +++ !sanitize_speculative_path(env, insn, +++ *insn_idx + insn->off + 1, +++ *insn_idx)) +++ return -EFAULT; +++ return 0; ++ } ++ ++- other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); +++ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, +++ false); ++ if (!other_branch) ++ return -EFAULT; +++ other_branch_regs = other_branch->frame[other_branch->curframe]->regs; ++ ++- /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */ ++- if (BPF_SRC(insn->code) == BPF_K && ++- insn->imm == 0 && (opcode == BPF_JEQ || ++- opcode == BPF_JNE) && ++- regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) { ++- if (opcode == BPF_JEQ) { ++- /* next fallthrough insn can access memory via ++- * this register ++- */ ++- regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; ++- /* branch targer cannot access it, since reg == 0 */ ++- other_branch->regs[insn->dst_reg].type = CONST_IMM; ++- other_branch->regs[insn->dst_reg].imm = 0; ++- } else { ++- other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; ++- regs[insn->dst_reg].type = CONST_IMM; ++- regs[insn->dst_reg].imm = 0; ++- } ++- } else if (is_pointer_value(env, insn->dst_reg)) { ++- verbose("R%d pointer comparison prohibited\n", insn->dst_reg); +++ /* detect if we are comparing against a constant value so we can adjust +++ * our min/max values for our dst register. +++ * this is only legit if both are scalars (or pointers to the same +++ * object, I suppose, but we don't support that right now), because +++ * otherwise the different base pointers mean the offsets aren't +++ * comparable. +++ */ +++ if (BPF_SRC(insn->code) == BPF_X) { +++ struct bpf_reg_state *src_reg = ®s[insn->src_reg]; +++ struct bpf_reg_state lo_reg0 = *dst_reg; +++ struct bpf_reg_state lo_reg1 = *src_reg; +++ struct bpf_reg_state *src_lo, *dst_lo; +++ +++ dst_lo = &lo_reg0; +++ src_lo = &lo_reg1; +++ coerce_reg_to_size(dst_lo, 4); +++ coerce_reg_to_size(src_lo, 4); +++ +++ if (dst_reg->type == SCALAR_VALUE && +++ src_reg->type == SCALAR_VALUE) { +++ if (tnum_is_const(src_reg->var_off) || +++ (is_jmp32 && tnum_is_const(src_lo->var_off))) +++ reg_set_min_max(&other_branch_regs[insn->dst_reg], +++ dst_reg, +++ is_jmp32 +++ ? src_lo->var_off.value +++ : src_reg->var_off.value, +++ opcode, is_jmp32); +++ else if (tnum_is_const(dst_reg->var_off) || +++ (is_jmp32 && tnum_is_const(dst_lo->var_off))) +++ reg_set_min_max_inv(&other_branch_regs[insn->src_reg], +++ src_reg, +++ is_jmp32 +++ ? dst_lo->var_off.value +++ : dst_reg->var_off.value, +++ opcode, is_jmp32); +++ else if (!is_jmp32 && +++ (opcode == BPF_JEQ || opcode == BPF_JNE)) +++ /* Comparing for equality, we can combine knowledge */ +++ reg_combine_min_max(&other_branch_regs[insn->src_reg], +++ &other_branch_regs[insn->dst_reg], +++ src_reg, dst_reg, opcode); +++ } +++ } else if (dst_reg->type == SCALAR_VALUE) { +++ reg_set_min_max(&other_branch_regs[insn->dst_reg], +++ dst_reg, insn->imm, opcode, is_jmp32); +++ } +++ +++ /* detect if R == 0 where R is returned from bpf_map_lookup_elem(). +++ * NOTE: these optimizations below are related with pointer comparison +++ * which will never be JMP32. +++ */ +++ if (!is_jmp32 && BPF_SRC(insn->code) == BPF_K && +++ insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) && +++ reg_type_may_be_null(dst_reg->type)) { +++ /* Mark all identical registers in each branch as either +++ * safe or unknown depending R == 0 or R != 0 conditional. 
+++ */ +++ mark_ptr_or_null_regs(this_branch, insn->dst_reg, +++ opcode == BPF_JNE); +++ mark_ptr_or_null_regs(other_branch, insn->dst_reg, +++ opcode == BPF_JEQ); +++ } else if (!try_match_pkt_pointers(insn, dst_reg, ®s[insn->src_reg], +++ this_branch, other_branch) && +++ is_pointer_value(env, insn->dst_reg)) { +++ verbose(env, "R%d pointer comparison prohibited\n", +++ insn->dst_reg); ++ return -EACCES; ++- } else if (BPF_SRC(insn->code) == BPF_K && ++- (opcode == BPF_JEQ || opcode == BPF_JNE)) { ++- ++- if (opcode == BPF_JEQ) { ++- /* detect if (R == imm) goto ++- * and in the target state recognize that R = imm ++- */ ++- other_branch->regs[insn->dst_reg].type = CONST_IMM; ++- other_branch->regs[insn->dst_reg].imm = insn->imm; ++- } else { ++- /* detect if (R != imm) goto ++- * and in the fall-through state recognize that R = imm ++- */ ++- regs[insn->dst_reg].type = CONST_IMM; ++- regs[insn->dst_reg].imm = insn->imm; ++- } ++ } ++- if (log_level) ++- print_verifier_state(env); +++ if (env->log.level & BPF_LOG_LEVEL) +++ print_verifier_state(env, this_branch->frame[this_branch->curframe]); ++ return 0; ++ } ++ ++-/* return the map pointer stored inside BPF_LD_IMM64 instruction */ ++-static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) ++-{ ++- u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32; ++- ++- return (struct bpf_map *) (unsigned long) imm64; ++-} ++- ++ /* verify BPF_LD_IMM64 instruction */ ++-static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) +++static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) ++ { ++- struct reg_state *regs = env->cur_state.regs; +++ struct bpf_insn_aux_data *aux = cur_aux(env); +++ struct bpf_reg_state *regs = cur_regs(env); +++ struct bpf_map *map; ++ int err; ++ ++ if (BPF_SIZE(insn->code) != BPF_DW) { ++- verbose("invalid BPF_LD_IMM insn\n"); +++ verbose(env, "invalid BPF_LD_IMM insn\n"); ++ return -EINVAL; ++ } ++ if (insn->off != 0) { ++- verbose("BPF_LD_IMM64 uses reserved fields\n"); +++ verbose(env, "BPF_LD_IMM64 uses reserved fields\n"); ++ return -EINVAL; ++ } ++ ++- err = check_reg_arg(regs, insn->dst_reg, DST_OP); +++ err = check_reg_arg(env, insn->dst_reg, DST_OP); ++ if (err) ++ return err; ++ ++- if (insn->src_reg == 0) ++- /* generic move 64-bit immediate into a register */ +++ if (insn->src_reg == 0) { +++ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; +++ +++ regs[insn->dst_reg].type = SCALAR_VALUE; +++ __mark_reg_known(®s[insn->dst_reg], imm); ++ return 0; +++ } ++ ++- /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ ++- BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); +++ map = env->used_maps[aux->map_index]; +++ mark_reg_known_zero(env, regs, insn->dst_reg); +++ regs[insn->dst_reg].map_ptr = map; +++ +++ if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) { +++ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE; +++ regs[insn->dst_reg].off = aux->map_off; +++ if (map_value_has_spin_lock(map)) +++ regs[insn->dst_reg].id = ++env->id_gen; +++ } else if (insn->src_reg == BPF_PSEUDO_MAP_FD) { +++ regs[insn->dst_reg].type = CONST_PTR_TO_MAP; +++ } else { +++ verbose(env, "bpf verifier is misconfigured\n"); +++ return -EINVAL; +++ } ++ ++- regs[insn->dst_reg].type = CONST_PTR_TO_MAP; ++- regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); ++ return 0; ++ } ++ ++@@ -1357,53 +6272,148 @@ static bool may_access_skb(enum bpf_prog ++ * Output: ++ * R0 - 8/16/32-bit skb data converted to cpu endianness ++ */ ++-static int check_ld_abs(struct 
verifier_env *env, struct bpf_insn *insn) +++static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) ++ { ++- struct reg_state *regs = env->cur_state.regs; +++ struct bpf_reg_state *regs = cur_regs(env); +++ static const int ctx_reg = BPF_REG_6; ++ u8 mode = BPF_MODE(insn->code); ++- struct reg_state *reg; ++ int i, err; ++ ++ if (!may_access_skb(env->prog->type)) { ++- verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); +++ verbose(env, "BPF_LD_[ABS|IND] instructions not allowed for this program type\n"); +++ return -EINVAL; +++ } +++ +++ if (!env->ops->gen_ld_abs) { +++ verbose(env, "bpf verifier is misconfigured\n"); +++ return -EINVAL; +++ } +++ +++ if (env->subprog_cnt > 1) { +++ /* when program has LD_ABS insn JITs and interpreter assume +++ * that r1 == ctx == skb which is not the case for callees +++ * that can have arbitrary arguments. It's problematic +++ * for main prog as well since JITs would need to analyze +++ * all functions in order to make proper register save/restore +++ * decisions in the main prog. Hence disallow LD_ABS with calls +++ */ +++ verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); ++ return -EINVAL; ++ } ++ ++ if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || ++ BPF_SIZE(insn->code) == BPF_DW || ++ (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { ++- verbose("BPF_LD_ABS uses reserved fields\n"); +++ verbose(env, "BPF_LD_[ABS|IND] uses reserved fields\n"); ++ return -EINVAL; ++ } ++ ++ /* check whether implicit source operand (register R6) is readable */ ++- err = check_reg_arg(regs, BPF_REG_6, SRC_OP); +++ err = check_reg_arg(env, ctx_reg, SRC_OP); ++ if (err) ++ return err; ++ ++- if (regs[BPF_REG_6].type != PTR_TO_CTX) { ++- verbose("at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); +++ /* Disallow usage of BPF_LD_[ABS|IND] with reference tracking, as +++ * gen_ld_abs() may terminate the program at runtime, leading to +++ * reference leak. +++ */ +++ err = check_reference_leak(env); +++ if (err) { +++ verbose(env, "BPF_LD_[ABS|IND] cannot be mixed with socket references\n"); +++ return err; +++ } +++ +++ if (env->cur_state->active_spin_lock) { +++ verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); +++ return -EINVAL; +++ } +++ +++ if (regs[ctx_reg].type != PTR_TO_CTX) { +++ verbose(env, +++ "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); ++ return -EINVAL; ++ } ++ ++ if (mode == BPF_IND) { ++ /* check explicit source operand */ ++- err = check_reg_arg(regs, insn->src_reg, SRC_OP); +++ err = check_reg_arg(env, insn->src_reg, SRC_OP); ++ if (err) ++ return err; ++ } ++ +++ err = check_ctx_reg(env, &regs[ctx_reg], ctx_reg); +++ if (err < 0) +++ return err; +++ ++ /* reset caller saved regs to unreadable */ ++ for (i = 0; i < CALLER_SAVED_REGS; i++) { ++- reg = regs + caller_saved[i]; ++- reg->type = NOT_INIT; ++- reg->imm = 0; +++ mark_reg_not_init(env, regs, caller_saved[i]); +++ check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); ++ } ++ ++ /* mark destination R0 register as readable, since it contains ++- * the value fetched from the packet +++ * the value fetched from the packet. +++ * Already marked as written above. ++ */ ++- regs[BPF_REG_0].type = UNKNOWN_VALUE; +++ mark_reg_unknown(env, regs, BPF_REG_0); +++ /* ld_abs load up to 32-bit skb data.
*/ +++ regs[BPF_REG_0].subreg_def = env->insn_idx + 1; +++ return 0; +++} +++ +++static int check_return_code(struct bpf_verifier_env *env) +++{ +++ struct tnum enforce_attach_type_range = tnum_unknown; +++ struct bpf_reg_state *reg; +++ struct tnum range = tnum_range(0, 1); +++ +++ switch (env->prog->type) { +++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: +++ if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || +++ env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) +++ range = tnum_range(1, 1); +++ break; +++ case BPF_PROG_TYPE_CGROUP_SKB: +++ if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { +++ range = tnum_range(0, 3); +++ enforce_attach_type_range = tnum_range(2, 3); +++ } +++ break; +++ case BPF_PROG_TYPE_CGROUP_SOCK: +++ case BPF_PROG_TYPE_SOCK_OPS: +++ case BPF_PROG_TYPE_CGROUP_DEVICE: +++ case BPF_PROG_TYPE_CGROUP_SYSCTL: +++ case BPF_PROG_TYPE_CGROUP_SOCKOPT: +++ break; +++ default: +++ return 0; +++ } +++ +++ reg = cur_regs(env) + BPF_REG_0; +++ if (reg->type != SCALAR_VALUE) { +++ verbose(env, "At program exit the register R0 is not a known value (%s)\n", +++ reg_type_str[reg->type]); +++ return -EINVAL; +++ } +++ +++ if (!tnum_in(range, reg->var_off)) { +++ char tn_buf[48]; +++ +++ verbose(env, "At program exit the register R0 "); +++ if (!tnum_is_unknown(reg->var_off)) { +++ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); +++ verbose(env, "has value %s", tn_buf); +++ } else { +++ verbose(env, "has unknown scalar value"); +++ } +++ tnum_strn(tn_buf, sizeof(tn_buf), range); +++ verbose(env, " should have been in %s\n", tn_buf); +++ return -EINVAL; +++ } +++ +++ if (!tnum_is_unknown(enforce_attach_type_range) && +++ tnum_in(enforce_attach_type_range, reg->var_off)) +++ env->prog->enforce_expected_attach_type = 1; ++ return 0; ++ } ++ ++@@ -1447,19 +6457,37 @@ enum { ++ BRANCH = 2, ++ }; ++ ++-#define STATE_LIST_MARK ((struct verifier_state_list *) -1L) +++static u32 state_htab_size(struct bpf_verifier_env *env) +++{ +++ return env->prog->len; +++} +++ +++static struct bpf_verifier_state_list **explored_state( +++ struct bpf_verifier_env *env, +++ int idx) +++{ +++ struct bpf_verifier_state *cur = env->cur_state; +++ struct bpf_func_state *state = cur->frame[cur->curframe]; +++ +++ return &env->explored_states[(idx ^ state->callsite) % state_htab_size(env)]; +++} ++ ++-static int *insn_stack; /* stack of insns to process */ ++-static int cur_stack; /* current stack index */ ++-static int *insn_state; +++static void init_explored_state(struct bpf_verifier_env *env, int idx) +++{ +++ env->insn_aux_data[idx].prune_point = true; +++} ++ ++ /* t, w, e - match pseudo-code above: ++ * t - index of current instruction ++ * w - next instruction ++ * e - edge ++ */ ++-static int push_insn(int t, int w, int e, struct verifier_env *env) +++static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, +++ bool loop_ok) ++ { +++ int *insn_stack = env->cfg.insn_stack; +++ int *insn_state = env->cfg.insn_state; +++ ++ if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) ++ return 0; ++ ++@@ -1467,30 +6495,35 @@ static int push_insn(int t, int w, int e ++ return 0; ++ ++ if (w < 0 || w >= env->prog->len) { ++- verbose("jump out of range from insn %d to %d\n", t, w); +++ verbose_linfo(env, t, "%d: ", t); +++ verbose(env, "jump out of range from insn %d to %d\n", t, w); ++ return -EINVAL; ++ } ++ ++ if (e == BRANCH) ++ /* mark branch target for state pruning */ ++- env->explored_states[w] = STATE_LIST_MARK; +++ init_explored_state(env, w); ++ 
++ if (insn_state[w] == 0) { ++ /* tree-edge */ ++ insn_state[t] = DISCOVERED | e; ++ insn_state[w] = DISCOVERED; ++- if (cur_stack >= env->prog->len) +++ if (env->cfg.cur_stack >= env->prog->len) ++ return -E2BIG; ++- insn_stack[cur_stack++] = w; +++ insn_stack[env->cfg.cur_stack++] = w; ++ return 1; ++ } else if ((insn_state[w] & 0xF0) == DISCOVERED) { ++- verbose("back-edge from insn %d to %d\n", t, w); +++ if (loop_ok && env->allow_ptr_leaks) +++ return 0; +++ verbose_linfo(env, t, "%d: ", t); +++ verbose_linfo(env, w, "%d: ", w); +++ verbose(env, "back-edge from insn %d to %d\n", t, w); ++ return -EINVAL; ++ } else if (insn_state[w] == EXPLORED) { ++ /* forward- or cross-edge */ ++ insn_state[t] = DISCOVERED | e; ++ } else { ++- verbose("insn state internal bug\n"); +++ verbose(env, "insn state internal bug\n"); ++ return -EFAULT; ++ } ++ return 0; ++@@ -1499,43 +6532,56 @@ static int push_insn(int t, int w, int e ++ /* non-recursive depth-first-search to detect loops in BPF program ++ * loop == back-edge in directed graph ++ */ ++-static int check_cfg(struct verifier_env *env) +++static int check_cfg(struct bpf_verifier_env *env) ++ { ++ struct bpf_insn *insns = env->prog->insnsi; ++ int insn_cnt = env->prog->len; +++ int *insn_stack, *insn_state; ++ int ret = 0; ++ int i, t; ++ ++- insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); +++ insn_state = env->cfg.insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); ++ if (!insn_state) ++ return -ENOMEM; ++ ++- insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); +++ insn_stack = env->cfg.insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); ++ if (!insn_stack) { ++- kfree(insn_state); +++ kvfree(insn_state); ++ return -ENOMEM; ++ } ++ ++ insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */ ++ insn_stack[0] = 0; /* 0 is the first instruction */ ++- cur_stack = 1; +++ env->cfg.cur_stack = 1; ++ ++ peek_stack: ++- if (cur_stack == 0) +++ if (env->cfg.cur_stack == 0) ++ goto check_state; ++- t = insn_stack[cur_stack - 1]; +++ t = insn_stack[env->cfg.cur_stack - 1]; ++ ++- if (BPF_CLASS(insns[t].code) == BPF_JMP) { +++ if (BPF_CLASS(insns[t].code) == BPF_JMP || +++ BPF_CLASS(insns[t].code) == BPF_JMP32) { ++ u8 opcode = BPF_OP(insns[t].code); ++ ++ if (opcode == BPF_EXIT) { ++ goto mark_explored; ++ } else if (opcode == BPF_CALL) { ++- ret = push_insn(t, t + 1, FALLTHROUGH, env); +++ ret = push_insn(t, t + 1, FALLTHROUGH, env, false); ++ if (ret == 1) ++ goto peek_stack; ++ else if (ret < 0) ++ goto err_free; +++ if (t + 1 < insn_cnt) +++ init_explored_state(env, t + 1); +++ if (insns[t].src_reg == BPF_PSEUDO_CALL) { +++ init_explored_state(env, t); +++ ret = push_insn(t, t + insns[t].imm + 1, BRANCH, +++ env, false); +++ if (ret == 1) +++ goto peek_stack; +++ else if (ret < 0) +++ goto err_free; +++ } ++ } else if (opcode == BPF_JA) { ++ if (BPF_SRC(insns[t].code) != BPF_K) { ++ ret = -EINVAL; ++@@ -1543,25 +6589,31 @@ peek_stack: ++ } ++ /* unconditional jump with single edge */ ++ ret = push_insn(t, t + insns[t].off + 1, ++- FALLTHROUGH, env); +++ FALLTHROUGH, env, true); ++ if (ret == 1) ++ goto peek_stack; ++ else if (ret < 0) ++ goto err_free; +++ /* unconditional jmp is not a good pruning point, +++ * but it's marked, since backtracking needs +++ * to record jmp history in is_state_visited(). 
+++ */ +++ init_explored_state(env, t + insns[t].off + 1); ++ /* tell verifier to check for equivalent states ++ * after every call and jump ++ */ ++ if (t + 1 < insn_cnt) ++- env->explored_states[t + 1] = STATE_LIST_MARK; +++ init_explored_state(env, t + 1); ++ } else { ++ /* conditional jump with two edges */ ++- ret = push_insn(t, t + 1, FALLTHROUGH, env); +++ init_explored_state(env, t); +++ ret = push_insn(t, t + 1, FALLTHROUGH, env, true); ++ if (ret == 1) ++ goto peek_stack; ++ else if (ret < 0) ++ goto err_free; ++ ++- ret = push_insn(t, t + insns[t].off + 1, BRANCH, env); +++ ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); ++ if (ret == 1) ++ goto peek_stack; ++ else if (ret < 0) ++@@ -1571,7 +6623,7 @@ peek_stack: ++ /* all other non-branch instructions with single ++ * fall-through edge ++ */ ++- ret = push_insn(t, t + 1, FALLTHROUGH, env); +++ ret = push_insn(t, t + 1, FALLTHROUGH, env, false); ++ if (ret == 1) ++ goto peek_stack; ++ else if (ret < 0) ++@@ -1580,8 +6632,8 @@ peek_stack: ++ ++ mark_explored: ++ insn_state[t] = EXPLORED; ++- if (cur_stack-- <= 0) { ++- verbose("pop stack internal bug\n"); +++ if (env->cfg.cur_stack-- <= 0) { +++ verbose(env, "pop stack internal bug\n"); ++ ret = -EFAULT; ++ goto err_free; ++ } ++@@ -1590,7 +6642,7 @@ mark_explored: ++ check_state: ++ for (i = 0; i < insn_cnt; i++) { ++ if (insn_state[i] != EXPLORED) { ++- verbose("unreachable insn %d\n", i); +++ verbose(env, "unreachable insn %d\n", i); ++ ret = -EINVAL; ++ goto err_free; ++ } ++@@ -1598,11 +6650,616 @@ check_state: ++ ret = 0; /* cfg looks good */ ++ ++ err_free: ++- kfree(insn_state); ++- kfree(insn_stack); +++ kvfree(insn_state); +++ kvfree(insn_stack); +++ env->cfg.insn_state = env->cfg.insn_stack = NULL; ++ return ret; ++ } ++ +++/* The minimum supported BTF func info size */ +++#define MIN_BPF_FUNCINFO_SIZE 8 +++#define MAX_FUNCINFO_REC_SIZE 252 +++ +++static int check_btf_func(struct bpf_verifier_env *env, +++ const union bpf_attr *attr, +++ union bpf_attr __user *uattr) +++{ +++ u32 i, nfuncs, urec_size, min_size; +++ u32 krec_size = sizeof(struct bpf_func_info); +++ struct bpf_func_info *krecord; +++ const struct btf_type *type; +++ struct bpf_prog *prog; +++ const struct btf *btf; +++ void __user *urecord; +++ u32 prev_offset = 0; +++ int ret = 0; +++ +++ nfuncs = attr->func_info_cnt; +++ if (!nfuncs) +++ return 0; +++ +++ if (nfuncs != env->subprog_cnt) { +++ verbose(env, "number of funcs in func_info doesn't match number of subprogs\n"); +++ return -EINVAL; +++ } +++ +++ urec_size = attr->func_info_rec_size; +++ if (urec_size < MIN_BPF_FUNCINFO_SIZE || +++ urec_size > MAX_FUNCINFO_REC_SIZE || +++ urec_size % sizeof(u32)) { +++ verbose(env, "invalid func info rec size %u\n", urec_size); +++ return -EINVAL; +++ } +++ +++ prog = env->prog; +++ btf = prog->aux->btf; +++ +++ urecord = u64_to_user_ptr(attr->func_info); +++ min_size = min_t(u32, krec_size, urec_size); +++ +++ krecord = kcalloc(nfuncs, krec_size, GFP_KERNEL | __GFP_NOWARN); +++ if (!krecord) +++ return -ENOMEM; +++ +++ for (i = 0; i < nfuncs; i++) { +++ ret = bpf_check_uarg_tail_zero(urecord, krec_size, urec_size); +++ if (ret) { +++ if (ret == -E2BIG) { +++ verbose(env, "nonzero tailing record in func info"); +++ /* set the size kernel expects so loader can zero +++ * out the rest of the record. 
+++ */ +++ if (put_user(min_size, &uattr->func_info_rec_size)) +++ ret = -EFAULT; +++ } +++ goto err_free; +++ } +++ +++ if (copy_from_user(&krecord[i], urecord, min_size)) { +++ ret = -EFAULT; +++ goto err_free; +++ } +++ +++ /* check insn_off */ +++ if (i == 0) { +++ if (krecord[i].insn_off) { +++ verbose(env, +++ "nonzero insn_off %u for the first func info record", +++ krecord[i].insn_off); +++ ret = -EINVAL; +++ goto err_free; +++ } +++ } else if (krecord[i].insn_off <= prev_offset) { +++ verbose(env, +++ "same or smaller insn offset (%u) than previous func info record (%u)", +++ krecord[i].insn_off, prev_offset); +++ ret = -EINVAL; +++ goto err_free; +++ } +++ +++ if (env->subprog_info[i].start != krecord[i].insn_off) { +++ verbose(env, "func_info BTF section doesn't match subprog layout in BPF program\n"); +++ ret = -EINVAL; +++ goto err_free; +++ } +++ +++ /* check type_id */ +++ type = btf_type_by_id(btf, krecord[i].type_id); +++ if (!type || BTF_INFO_KIND(type->info) != BTF_KIND_FUNC) { +++ verbose(env, "invalid type id %d in func info", +++ krecord[i].type_id); +++ ret = -EINVAL; +++ goto err_free; +++ } +++ +++ prev_offset = krecord[i].insn_off; +++ urecord += urec_size; +++ } +++ +++ prog->aux->func_info = krecord; +++ prog->aux->func_info_cnt = nfuncs; +++ return 0; +++ +++err_free: +++ kvfree(krecord); +++ return ret; +++} +++ +++static void adjust_btf_func(struct bpf_verifier_env *env) +++{ +++ int i; +++ +++ if (!env->prog->aux->func_info) +++ return; +++ +++ for (i = 0; i < env->subprog_cnt; i++) +++ env->prog->aux->func_info[i].insn_off = env->subprog_info[i].start; +++} +++ +++#define MIN_BPF_LINEINFO_SIZE (offsetof(struct bpf_line_info, line_col) + \ +++ sizeof(((struct bpf_line_info *)(0))->line_col)) +++#define MAX_LINEINFO_REC_SIZE MAX_FUNCINFO_REC_SIZE +++ +++static int check_btf_line(struct bpf_verifier_env *env, +++ const union bpf_attr *attr, +++ union bpf_attr __user *uattr) +++{ +++ u32 i, s, nr_linfo, ncopy, expected_size, rec_size, prev_offset = 0; +++ struct bpf_subprog_info *sub; +++ struct bpf_line_info *linfo; +++ struct bpf_prog *prog; +++ const struct btf *btf; +++ void __user *ulinfo; +++ int err; +++ +++ nr_linfo = attr->line_info_cnt; +++ if (!nr_linfo) +++ return 0; +++ +++ rec_size = attr->line_info_rec_size; +++ if (rec_size < MIN_BPF_LINEINFO_SIZE || +++ rec_size > MAX_LINEINFO_REC_SIZE || +++ rec_size & (sizeof(u32) - 1)) +++ return -EINVAL; +++ +++ /* Need to zero it in case the userspace may +++ * pass in a smaller bpf_line_info object. +++ */ +++ linfo = kcalloc(nr_linfo, sizeof(struct bpf_line_info), +++ GFP_KERNEL | __GFP_NOWARN); +++ if (!linfo) +++ return -ENOMEM; +++ +++ prog = env->prog; +++ btf = prog->aux->btf; +++ +++ s = 0; +++ sub = env->subprog_info; +++ ulinfo = u64_to_user_ptr(attr->line_info); +++ expected_size = sizeof(struct bpf_line_info); +++ ncopy = min_t(u32, expected_size, rec_size); +++ for (i = 0; i < nr_linfo; i++) { +++ err = bpf_check_uarg_tail_zero(ulinfo, expected_size, rec_size); +++ if (err) { +++ if (err == -E2BIG) { +++ verbose(env, "nonzero tailing record in line_info"); +++ if (put_user(expected_size, +++ &uattr->line_info_rec_size)) +++ err = -EFAULT; +++ } +++ goto err_free; +++ } +++ +++ if (copy_from_user(&linfo[i], ulinfo, ncopy)) { +++ err = -EFAULT; +++ goto err_free; +++ } +++ +++ /* +++ * Check insn_off to ensure +++ * 1) strictly increasing AND +++ * 2) bounded by prog->len +++ * +++ * The linfo[0].insn_off == 0 check logically falls into +++ * the later "missing bpf_line_info for func..." 
case +++ * because the first linfo[0].insn_off must be the +++ * first sub also and the first sub must have +++ * subprog_info[0].start == 0. +++ */ +++ if ((i && linfo[i].insn_off <= prev_offset) || +++ linfo[i].insn_off >= prog->len) { +++ verbose(env, "Invalid line_info[%u].insn_off:%u (prev_offset:%u prog->len:%u)\n", +++ i, linfo[i].insn_off, prev_offset, +++ prog->len); +++ err = -EINVAL; +++ goto err_free; +++ } +++ +++ if (!prog->insnsi[linfo[i].insn_off].code) { +++ verbose(env, +++ "Invalid insn code at line_info[%u].insn_off\n", +++ i); +++ err = -EINVAL; +++ goto err_free; +++ } +++ +++ if (!btf_name_by_offset(btf, linfo[i].line_off) || +++ !btf_name_by_offset(btf, linfo[i].file_name_off)) { +++ verbose(env, "Invalid line_info[%u].line_off or .file_name_off\n", i); +++ err = -EINVAL; +++ goto err_free; +++ } +++ +++ if (s != env->subprog_cnt) { +++ if (linfo[i].insn_off == sub[s].start) { +++ sub[s].linfo_idx = i; +++ s++; +++ } else if (sub[s].start < linfo[i].insn_off) { +++ verbose(env, "missing bpf_line_info for func#%u\n", s); +++ err = -EINVAL; +++ goto err_free; +++ } +++ } +++ +++ prev_offset = linfo[i].insn_off; +++ ulinfo += rec_size; +++ } +++ +++ if (s != env->subprog_cnt) { +++ verbose(env, "missing bpf_line_info for %u funcs starting from func#%u\n", +++ env->subprog_cnt - s, s); +++ err = -EINVAL; +++ goto err_free; +++ } +++ +++ prog->aux->linfo = linfo; +++ prog->aux->nr_linfo = nr_linfo; +++ +++ return 0; +++ +++err_free: +++ kvfree(linfo); +++ return err; +++} +++ +++static int check_btf_info(struct bpf_verifier_env *env, +++ const union bpf_attr *attr, +++ union bpf_attr __user *uattr) +++{ +++ struct btf *btf; +++ int err; +++ +++ if (!attr->func_info_cnt && !attr->line_info_cnt) +++ return 0; +++ +++ btf = btf_get_by_fd(attr->prog_btf_fd); +++ if (IS_ERR(btf)) +++ return PTR_ERR(btf); +++ env->prog->aux->btf = btf; +++ +++ err = check_btf_func(env, attr, uattr); +++ if (err) +++ return err; +++ +++ err = check_btf_line(env, attr, uattr); +++ if (err) +++ return err; +++ +++ return 0; +++} +++ +++/* check %cur's range satisfies %old's */ +++static bool range_within(struct bpf_reg_state *old, +++ struct bpf_reg_state *cur) +++{ +++ return old->umin_value <= cur->umin_value && +++ old->umax_value >= cur->umax_value && +++ old->smin_value <= cur->smin_value && +++ old->smax_value >= cur->smax_value; +++} +++ +++/* Maximum number of register states that can exist at once */ +++#define ID_MAP_SIZE (MAX_BPF_REG + MAX_BPF_STACK / BPF_REG_SIZE) +++struct idpair { +++ u32 old; +++ u32 cur; +++}; +++ +++/* If in the old state two registers had the same id, then they need to have +++ * the same id in the new state as well. But that id could be different from +++ * the old state, so we need to track the mapping from old to new ids. +++ * Once we have seen that, say, a reg with old id 5 had new id 9, any subsequent +++ * regs with old id 5 must also have new id 9 for the new state to be safe. But +++ * regs with a different old id could still have new id 9, we don't care about +++ * that. +++ * So we look through our idmap to see if this old id has been seen before. If +++ * so, we require the new id to match; otherwise, we add the id pair to the map. 
+++ */ +++static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) +++{ +++ unsigned int i; +++ +++ for (i = 0; i < ID_MAP_SIZE; i++) { +++ if (!idmap[i].old) { +++ /* Reached an empty slot; haven't seen this id before */ +++ idmap[i].old = old_id; +++ idmap[i].cur = cur_id; +++ return true; +++ } +++ if (idmap[i].old == old_id) +++ return idmap[i].cur == cur_id; +++ } +++ /* We ran out of idmap slots, which should be impossible */ +++ WARN_ON_ONCE(1); +++ return false; +++} +++ +++static void clean_func_state(struct bpf_verifier_env *env, +++ struct bpf_func_state *st) +++{ +++ enum bpf_reg_liveness live; +++ int i, j; +++ +++ for (i = 0; i < BPF_REG_FP; i++) { +++ live = st->regs[i].live; +++ /* liveness must not touch this register anymore */ +++ st->regs[i].live |= REG_LIVE_DONE; +++ if (!(live & REG_LIVE_READ)) +++ /* since the register is unused, clear its state +++ * to make further comparison simpler +++ */ +++ __mark_reg_not_init(env, &st->regs[i]); +++ } +++ +++ for (i = 0; i < st->allocated_stack / BPF_REG_SIZE; i++) { +++ live = st->stack[i].spilled_ptr.live; +++ /* liveness must not touch this stack slot anymore */ +++ st->stack[i].spilled_ptr.live |= REG_LIVE_DONE; +++ if (!(live & REG_LIVE_READ)) { +++ __mark_reg_not_init(env, &st->stack[i].spilled_ptr); +++ for (j = 0; j < BPF_REG_SIZE; j++) +++ st->stack[i].slot_type[j] = STACK_INVALID; +++ } +++ } +++} +++ +++static void clean_verifier_state(struct bpf_verifier_env *env, +++ struct bpf_verifier_state *st) +++{ +++ int i; +++ +++ if (st->frame[0]->regs[0].live & REG_LIVE_DONE) +++ /* all regs in this state in all frames were already marked */ +++ return; +++ +++ for (i = 0; i <= st->curframe; i++) +++ clean_func_state(env, st->frame[i]); +++} +++ +++/* the parentage chains form a tree. +++ * the verifier states are added to state lists at given insn and +++ * pushed into state stack for future exploration. +++ * when the verifier reaches bpf_exit insn some of the verifer states +++ * stored in the state lists have their final liveness state already, +++ * but a lot of states will get revised from liveness point of view when +++ * the verifier explores other branches. +++ * Example: +++ * 1: r0 = 1 +++ * 2: if r1 == 100 goto pc+1 +++ * 3: r0 = 2 +++ * 4: exit +++ * when the verifier reaches exit insn the register r0 in the state list of +++ * insn 2 will be seen as !REG_LIVE_READ. Then the verifier pops the other_branch +++ * of insn 2 and goes exploring further. At the insn 4 it will walk the +++ * parentage chain from insn 4 into insn 2 and will mark r0 as REG_LIVE_READ. +++ * +++ * Since the verifier pushes the branch states as it sees them while exploring +++ * the program the condition of walking the branch instruction for the second +++ * time means that all states below this branch were already explored and +++ * their final liveness markes are already propagated. +++ * Hence when the verifier completes the search of state list in is_state_visited() +++ * we can call this clean_live_states() function to mark all liveness states +++ * as REG_LIVE_DONE to indicate that 'parent' pointers of 'struct bpf_reg_state' +++ * will not be used. +++ * This function also clears the registers and stack for states that !READ +++ * to simplify state merging. +++ * +++ * Important note here that walking the same branch instruction in the callee +++ * doesn't meant that the states are DONE. 
The verifier has to compare +++ * the callsites +++ */ +++static void clean_live_states(struct bpf_verifier_env *env, int insn, +++ struct bpf_verifier_state *cur) +++{ +++ struct bpf_verifier_state_list *sl; +++ int i; +++ +++ sl = *explored_state(env, insn); +++ while (sl) { +++ if (sl->state.branches) +++ goto next; +++ if (sl->state.insn_idx != insn || +++ sl->state.curframe != cur->curframe) +++ goto next; +++ for (i = 0; i <= cur->curframe; i++) +++ if (sl->state.frame[i]->callsite != cur->frame[i]->callsite) +++ goto next; +++ clean_verifier_state(env, &sl->state); +++next: +++ sl = sl->next; +++ } +++} +++ +++/* Returns true if (rold safe implies rcur safe) */ +++static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, +++ struct idpair *idmap) +++{ +++ bool equal; +++ +++ if (!(rold->live & REG_LIVE_READ)) +++ /* explored state didn't use this */ +++ return true; +++ +++ equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, parent)) == 0; +++ +++ if (rold->type == PTR_TO_STACK) +++ /* two stack pointers are equal only if they're pointing to +++ * the same stack frame, since fp-8 in foo != fp-8 in bar +++ */ +++ return equal && rold->frameno == rcur->frameno; +++ +++ if (equal) +++ return true; +++ +++ if (rold->type == NOT_INIT) +++ /* explored state can't have used this */ +++ return true; +++ if (rcur->type == NOT_INIT) +++ return false; +++ switch (rold->type) { +++ case SCALAR_VALUE: +++ if (rcur->type == SCALAR_VALUE) { +++ if (!rold->precise && !rcur->precise) +++ return true; +++ /* new val must satisfy old val knowledge */ +++ return range_within(rold, rcur) && +++ tnum_in(rold->var_off, rcur->var_off); +++ } else { +++ /* We're trying to use a pointer in place of a scalar. +++ * Even if the scalar was unbounded, this could lead to +++ * pointer leaks because scalars are allowed to leak +++ * while pointers are not. We could make this safe in +++ * special cases if root is calling us, but it's +++ * probably not worth the hassle. +++ */ +++ return false; +++ } +++ case PTR_TO_MAP_VALUE: +++ /* If the new min/max/var_off satisfy the old ones and +++ * everything else matches, we are OK. +++ * 'id' is not compared, since it's only used for maps with +++ * bpf_spin_lock inside map element and in such cases if +++ * the rest of the prog is valid for one map element then +++ * it's valid for all map elements regardless of the key +++ * used in bpf_map_lookup() +++ */ +++ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && +++ range_within(rold, rcur) && +++ tnum_in(rold->var_off, rcur->var_off); +++ case PTR_TO_MAP_VALUE_OR_NULL: +++ /* a PTR_TO_MAP_VALUE could be safe to use as a +++ * PTR_TO_MAP_VALUE_OR_NULL into the same map. +++ * However, if the old PTR_TO_MAP_VALUE_OR_NULL then got NULL- +++ * checked, doing so could have affected others with the same +++ * id, and we can't check for that because we lost the id when +++ * we converted to a PTR_TO_MAP_VALUE. +++ */ +++ if (rcur->type != PTR_TO_MAP_VALUE_OR_NULL) +++ return false; +++ if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, id))) +++ return false; +++ /* Check our ids match any regs they're supposed to */ +++ return check_ids(rold->id, rcur->id, idmap); +++ case PTR_TO_PACKET_META: +++ case PTR_TO_PACKET: +++ if (rcur->type != rold->type) +++ return false; +++ /* We must have at least as much range as the old ptr +++ * did, so that any accesses which were safe before are +++ * still safe. 
This is true even if old range < old off, +++ * since someone could have accessed through (ptr - k), or +++ * even done ptr -= k in a register, to get a safe access. +++ */ +++ if (rold->range > rcur->range) +++ return false; +++ /* If the offsets don't match, we can't trust our alignment; +++ * nor can we be sure that we won't fall out of range. +++ */ +++ if (rold->off != rcur->off) +++ return false; +++ /* id relations must be preserved */ +++ if (rold->id && !check_ids(rold->id, rcur->id, idmap)) +++ return false; +++ /* new val must satisfy old val knowledge */ +++ return range_within(rold, rcur) && +++ tnum_in(rold->var_off, rcur->var_off); +++ case PTR_TO_CTX: +++ case CONST_PTR_TO_MAP: +++ case PTR_TO_PACKET_END: +++ case PTR_TO_FLOW_KEYS: +++ case PTR_TO_SOCKET: +++ case PTR_TO_SOCKET_OR_NULL: +++ case PTR_TO_SOCK_COMMON: +++ case PTR_TO_SOCK_COMMON_OR_NULL: +++ case PTR_TO_TCP_SOCK: +++ case PTR_TO_TCP_SOCK_OR_NULL: +++ case PTR_TO_XDP_SOCK: +++ /* Only valid matches are exact, which memcmp() above +++ * would have accepted +++ */ +++ default: +++ /* Don't know what's going on, just say it's not safe */ +++ return false; +++ } +++ +++ /* Shouldn't get here; if we do, say it's not safe */ +++ WARN_ON_ONCE(1); +++ return false; +++} +++ +++static bool stacksafe(struct bpf_func_state *old, +++ struct bpf_func_state *cur, +++ struct idpair *idmap) +++{ +++ int i, spi; +++ +++ /* walk slots of the explored stack and ignore any additional +++ * slots in the current stack, since explored(safe) state +++ * didn't use them +++ */ +++ for (i = 0; i < old->allocated_stack; i++) { +++ spi = i / BPF_REG_SIZE; +++ +++ if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) { +++ i += BPF_REG_SIZE - 1; +++ /* explored state didn't use this */ +++ continue; +++ } +++ +++ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) +++ continue; +++ +++ /* explored stack has more populated slots than current stack +++ * and these slots were used +++ */ +++ if (i >= cur->allocated_stack) +++ return false; +++ +++ /* if old state was safe with misc data in the stack +++ * it will be safe with zero-initialized stack. +++ * The opposite is not true +++ */ +++ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC && +++ cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO) +++ continue; +++ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != +++ cur->stack[spi].slot_type[i % BPF_REG_SIZE]) +++ /* Ex: old explored (safe) state has STACK_SPILL in +++ * this stack slot, but current has has STACK_MISC -> +++ * this verifier states are not equivalent, +++ * return false to continue verification of this path +++ */ +++ return false; +++ if (i % BPF_REG_SIZE) +++ continue; +++ if (old->stack[spi].slot_type[0] != STACK_SPILL) +++ continue; +++ if (!regsafe(&old->stack[spi].spilled_ptr, +++ &cur->stack[spi].spilled_ptr, +++ idmap)) +++ /* when explored and current stack slot are both storing +++ * spilled registers, check that stored pointers types +++ * are the same as well. +++ * Ex: explored safe path could have stored +++ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} +++ * but current path has stored: +++ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} +++ * such verifier states are not equivalent. 
+++ * return false to continue verification of this path +++ */ +++ return false; +++ } +++ return true; +++} +++ +++static bool refsafe(struct bpf_func_state *old, struct bpf_func_state *cur) +++{ +++ if (old->acquired_refs != cur->acquired_refs) +++ return false; +++ return !memcmp(old->refs, cur->refs, +++ sizeof(*old->refs) * old->acquired_refs); +++} +++ ++ /* compare two verifier states ++ * ++ * all states stored in state_list are known to be valid, since ++@@ -1629,165 +7286,562 @@ err_free: ++ * whereas register type in current state is meaningful, it means that ++ * the current state will reach 'bpf_exit' instruction safely ++ */ ++-static bool states_equal(struct verifier_state *old, struct verifier_state *cur) +++static bool func_states_equal(struct bpf_func_state *old, +++ struct bpf_func_state *cur) ++ { +++ struct idpair *idmap; +++ bool ret = false; ++ int i; ++ +++ idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); +++ /* If we failed to allocate the idmap, just say it's not safe */ +++ if (!idmap) +++ return false; +++ ++ for (i = 0; i < MAX_BPF_REG; i++) { ++- if (memcmp(&old->regs[i], &cur->regs[i], ++- sizeof(old->regs[0])) != 0) { ++- if (old->regs[i].type == NOT_INIT || ++- (old->regs[i].type == UNKNOWN_VALUE && ++- cur->regs[i].type != NOT_INIT)) ++- continue; +++ if (!regsafe(&old->regs[i], &cur->regs[i], idmap)) +++ goto out_free; +++ } +++ +++ if (!stacksafe(old, cur, idmap)) +++ goto out_free; +++ +++ if (!refsafe(old, cur)) +++ goto out_free; +++ ret = true; +++out_free: +++ kfree(idmap); +++ return ret; +++} +++ +++static bool states_equal(struct bpf_verifier_env *env, +++ struct bpf_verifier_state *old, +++ struct bpf_verifier_state *cur) +++{ +++ int i; +++ +++ if (old->curframe != cur->curframe) +++ return false; +++ +++ /* Verification state from speculative execution simulation +++ * must never prune a non-speculative execution one. +++ */ +++ if (old->speculative && !cur->speculative) +++ return false; +++ +++ if (old->active_spin_lock != cur->active_spin_lock) +++ return false; +++ +++ /* for states to be equal callsites have to be the same +++ * and all frame states need to be equivalent +++ */ +++ for (i = 0; i <= old->curframe; i++) { +++ if (old->frame[i]->callsite != cur->frame[i]->callsite) +++ return false; +++ if (!func_states_equal(old->frame[i], cur->frame[i])) ++ return false; +++ } +++ return true; +++} +++ +++/* Return 0 if no propagation happened. Return negative error code if error +++ * happened. Otherwise, return the propagated bit. +++ */ +++static int propagate_liveness_reg(struct bpf_verifier_env *env, +++ struct bpf_reg_state *reg, +++ struct bpf_reg_state *parent_reg) +++{ +++ u8 parent_flag = parent_reg->live & REG_LIVE_READ; +++ u8 flag = reg->live & REG_LIVE_READ; +++ int err; +++ +++ /* When comes here, read flags of PARENT_REG or REG could be any of +++ * REG_LIVE_READ64, REG_LIVE_READ32, REG_LIVE_NONE. There is no need +++ * of propagation if PARENT_REG has strongest REG_LIVE_READ64. +++ */ +++ if (parent_flag == REG_LIVE_READ64 || +++ /* Or if there is no read flag from REG. */ +++ !flag || +++ /* Or if the read flag from REG is the same as PARENT_REG. */ +++ parent_flag == flag) +++ return 0; +++ +++ err = mark_reg_read(env, reg, parent_reg, flag); +++ if (err) +++ return err; +++ +++ return flag; +++} +++ +++/* A write screens off any subsequent reads; but write marks come from the +++ * straight-line code between a state and its parent. 
When we arrive at an +++ * equivalent state (jump target or such) we didn't arrive by the straight-line +++ * code, so read marks in the state must propagate to the parent regardless +++ * of the state's write marks. That's what 'parent == state->parent' comparison +++ * in mark_reg_read() is for. +++ */ +++static int propagate_liveness(struct bpf_verifier_env *env, +++ const struct bpf_verifier_state *vstate, +++ struct bpf_verifier_state *vparent) +++{ +++ struct bpf_reg_state *state_reg, *parent_reg; +++ struct bpf_func_state *state, *parent; +++ int i, frame, err = 0; +++ +++ if (vparent->curframe != vstate->curframe) { +++ WARN(1, "propagate_live: parent frame %d current frame %d\n", +++ vparent->curframe, vstate->curframe); +++ return -EFAULT; +++ } +++ /* Propagate read liveness of registers... */ +++ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); +++ for (frame = 0; frame <= vstate->curframe; frame++) { +++ parent = vparent->frame[frame]; +++ state = vstate->frame[frame]; +++ parent_reg = parent->regs; +++ state_reg = state->regs; +++ /* We don't need to worry about FP liveness, it's read-only */ +++ for (i = frame < vstate->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) { +++ err = propagate_liveness_reg(env, &state_reg[i], +++ &parent_reg[i]); +++ if (err < 0) +++ return err; +++ if (err == REG_LIVE_READ64) +++ mark_insn_zext(env, &parent_reg[i]); +++ } +++ +++ /* Propagate stack slots. */ +++ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && +++ i < parent->allocated_stack / BPF_REG_SIZE; i++) { +++ parent_reg = &parent->stack[i].spilled_ptr; +++ state_reg = &state->stack[i].spilled_ptr; +++ err = propagate_liveness_reg(env, state_reg, +++ parent_reg); +++ if (err < 0) +++ return err; ++ } ++ } +++ return 0; +++} ++ ++- for (i = 0; i < MAX_BPF_STACK; i++) { ++- if (old->stack_slot_type[i] == STACK_INVALID) +++/* find precise scalars in the previous equivalent state and +++ * propagate them into the current state +++ */ +++static int propagate_precision(struct bpf_verifier_env *env, +++ const struct bpf_verifier_state *old) +++{ +++ struct bpf_reg_state *state_reg; +++ struct bpf_func_state *state; +++ int i, err = 0; +++ +++ state = old->frame[old->curframe]; +++ state_reg = state->regs; +++ for (i = 0; i < BPF_REG_FP; i++, state_reg++) { +++ if (state_reg->type != SCALAR_VALUE || +++ !state_reg->precise) ++ continue; ++- if (old->stack_slot_type[i] != cur->stack_slot_type[i]) ++- /* Ex: old explored (safe) state has STACK_SPILL in ++- * this stack slot, but current has has STACK_MISC -> ++- * this verifier states are not equivalent, ++- * return false to continue verification of this path ++- */ ++- return false; ++- if (i % BPF_REG_SIZE) +++ if (env->log.level & BPF_LOG_LEVEL2) +++ verbose(env, "propagating r%d\n", i); +++ err = mark_chain_precision(env, i); +++ if (err < 0) +++ return err; +++ } +++ +++ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { +++ if (state->stack[i].slot_type[0] != STACK_SPILL) ++ continue; ++- if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE], ++- &cur->spilled_regs[i / BPF_REG_SIZE], ++- sizeof(old->spilled_regs[0]))) ++- /* when explored and current stack slot types are ++- * the same, check that stored pointers types ++- * are the same as well. ++- * Ex: explored safe path could have stored ++- * (struct reg_state) {.type = PTR_TO_STACK, .imm = -8} ++- * but current path has stored: ++- * (struct reg_state) {.type = PTR_TO_STACK, .imm = -16} ++- * such verifier states are not equivalent. 
++- * return false to continue verification of this path ++- */ ++- return false; ++- else +++ state_reg = &state->stack[i].spilled_ptr; +++ if (state_reg->type != SCALAR_VALUE || +++ !state_reg->precise) ++ continue; +++ if (env->log.level & BPF_LOG_LEVEL2) +++ verbose(env, "propagating fp%d\n", +++ (-i - 1) * BPF_REG_SIZE); +++ err = mark_chain_precision_stack(env, i); +++ if (err < 0) +++ return err; ++ } +++ return 0; +++} +++ +++static bool states_maybe_looping(struct bpf_verifier_state *old, +++ struct bpf_verifier_state *cur) +++{ +++ struct bpf_func_state *fold, *fcur; +++ int i, fr = cur->curframe; +++ +++ if (old->curframe != fr) +++ return false; +++ +++ fold = old->frame[fr]; +++ fcur = cur->frame[fr]; +++ for (i = 0; i < MAX_BPF_REG; i++) +++ if (memcmp(&fold->regs[i], &fcur->regs[i], +++ offsetof(struct bpf_reg_state, parent))) +++ return false; ++ return true; ++ } ++ ++-static int is_state_visited(struct verifier_env *env, int insn_idx) +++ +++static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) ++ { ++- struct verifier_state_list *new_sl; ++- struct verifier_state_list *sl; +++ struct bpf_verifier_state_list *new_sl; +++ struct bpf_verifier_state_list *sl, **pprev; +++ struct bpf_verifier_state *cur = env->cur_state, *new; +++ int i, j, err, states_cnt = 0; +++ bool add_new_state = env->test_state_freq ? true : false; ++ ++- sl = env->explored_states[insn_idx]; ++- if (!sl) +++ cur->last_insn_idx = env->prev_insn_idx; +++ if (!env->insn_aux_data[insn_idx].prune_point) ++ /* this 'insn_idx' instruction wasn't marked, so we will not ++ * be doing state search here ++ */ ++ return 0; ++ ++- while (sl != STATE_LIST_MARK) { ++- if (states_equal(&sl->state, &env->cur_state)) +++ /* bpf progs typically have pruning point every 4 instructions +++ * http://vger.kernel.org/bpfconf2019.html#session-1 +++ * Do not add new state for future pruning if the verifier hasn't seen +++ * at least 2 jumps and at least 8 instructions. +++ * This heuristics helps decrease 'total_states' and 'peak_states' metric. +++ * In tests that amounts to up to 50% reduction into total verifier +++ * memory consumption and 20% verifier time speedup. +++ */ +++ if (env->jmps_processed - env->prev_jmps_processed >= 2 && +++ env->insn_processed - env->prev_insn_processed >= 8) +++ add_new_state = true; +++ +++ pprev = explored_state(env, insn_idx); +++ sl = *pprev; +++ +++ clean_live_states(env, insn_idx, cur); +++ +++ while (sl) { +++ states_cnt++; +++ if (sl->state.insn_idx != insn_idx) +++ goto next; +++ if (sl->state.branches) { +++ if (states_maybe_looping(&sl->state, cur) && +++ states_equal(env, &sl->state, cur)) { +++ verbose_linfo(env, insn_idx, "; "); +++ verbose(env, "infinite loop detected at insn %d\n", insn_idx); +++ return -EINVAL; +++ } +++ /* if the verifier is processing a loop, avoid adding new state +++ * too often, since different loop iterations have distinct +++ * states and may not help future pruning. +++ * This threshold shouldn't be too low to make sure that +++ * a loop with large bound will be rejected quickly. +++ * The most abusive loop will be: +++ * r1 += 1 +++ * if r1 < 1000000 goto pc-2 +++ * 1M insn_procssed limit / 100 == 10k peak states. +++ * This threshold shouldn't be too high either, since states +++ * at the end of the loop are likely to be useful in pruning. 
+++ */ +++ if (env->jmps_processed - env->prev_jmps_processed < 20 && +++ env->insn_processed - env->prev_insn_processed < 100) +++ add_new_state = false; +++ goto miss; +++ } +++ if (states_equal(env, &sl->state, cur)) { +++ sl->hit_cnt++; ++ /* reached equivalent register/stack state, ++- * prune the search +++ * prune the search. +++ * Registers read by the continuation are read by us. +++ * If we have any write marks in env->cur_state, they +++ * will prevent corresponding reads in the continuation +++ * from reaching our parent (an explored_state). Our +++ * own state will get the read marks recorded, but +++ * they'll be immediately forgotten as we're pruning +++ * this state and will pop a new one. ++ */ ++- return 1; ++- sl = sl->next; ++- } +++ err = propagate_liveness(env, &sl->state, cur); ++ ++- /* there were no equivalent states, remember current one. ++- * technically the current state is not proven to be safe yet, ++- * but it will either reach bpf_exit (which means it's safe) or ++- * it will be rejected. Since there are no loops, we won't be ++- * seeing this 'insn_idx' instruction again on the way to bpf_exit +++ /* if previous state reached the exit with precision and +++ * current state is equivalent to it (except precsion marks) +++ * the precision needs to be propagated back in +++ * the current state. +++ */ +++ err = err ? : push_jmp_history(env, cur); +++ err = err ? : propagate_precision(env, &sl->state); +++ if (err) +++ return err; +++ return 1; +++ } +++miss: +++ /* when new state is not going to be added do not increase miss count. +++ * Otherwise several loop iterations will remove the state +++ * recorded earlier. The goal of these heuristics is to have +++ * states from some iterations of the loop (some in the beginning +++ * and some at the end) to help pruning. +++ */ +++ if (add_new_state) +++ sl->miss_cnt++; +++ /* heuristic to determine whether this state is beneficial +++ * to keep checking from state equivalence point of view. +++ * Higher numbers increase max_states_per_insn and verification time, +++ * but do not meaningfully decrease insn_processed. +++ */ +++ if (sl->miss_cnt > sl->hit_cnt * 3 + 3) { +++ /* the state is unlikely to be useful. Remove it to +++ * speed up verification +++ */ +++ *pprev = sl->next; +++ if (sl->state.frame[0]->regs[0].live & REG_LIVE_DONE) { +++ u32 br = sl->state.branches; +++ +++ WARN_ONCE(br, +++ "BUG live_done but branches_to_explore %d\n", +++ br); +++ free_verifier_state(&sl->state, false); +++ kfree(sl); +++ env->peak_states--; +++ } else { +++ /* cannot free this state, since parentage chain may +++ * walk it later. Add it for free_list instead to +++ * be freed at the end of verification +++ */ +++ sl->next = env->free_list; +++ env->free_list = sl; +++ } +++ sl = *pprev; +++ continue; +++ } +++next: +++ pprev = &sl->next; +++ sl = *pprev; +++ } +++ +++ if (env->max_states_per_insn < states_cnt) +++ env->max_states_per_insn = states_cnt; +++ +++ if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) +++ return push_jmp_history(env, cur); +++ +++ if (!add_new_state) +++ return push_jmp_history(env, cur); +++ +++ /* There were no equivalent states, remember the current one. +++ * Technically the current state is not proven to be safe yet, +++ * but it will either reach outer most bpf_exit (which means it's safe) +++ * or it will be rejected. When there are no loops the verifier won't be +++ * seeing this tuple (frame[0].callsite, frame[1].callsite, .. 
insn_idx) +++ * again on the way to bpf_exit. +++ * When looping the sl->state.branches will be > 0 and this state +++ * will not be considered for equivalence until branches == 0. ++ */ ++- new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER); +++ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); ++ if (!new_sl) ++ return -ENOMEM; +++ env->total_states++; +++ env->peak_states++; +++ env->prev_jmps_processed = env->jmps_processed; +++ env->prev_insn_processed = env->insn_processed; ++ ++ /* add new state to the head of linked list */ ++- memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); ++- new_sl->next = env->explored_states[insn_idx]; ++- env->explored_states[insn_idx] = new_sl; +++ new = &new_sl->state; +++ err = copy_verifier_state(new, cur); +++ if (err) { +++ free_verifier_state(new, false); +++ kfree(new_sl); +++ return err; +++ } +++ new->insn_idx = insn_idx; +++ WARN_ONCE(new->branches != 1, +++ "BUG is_state_visited:branches_to_explore=%d insn %d\n", new->branches, insn_idx); +++ +++ cur->parent = new; +++ cur->first_insn_idx = insn_idx; +++ clear_jmp_history(cur); +++ new_sl->next = *explored_state(env, insn_idx); +++ *explored_state(env, insn_idx) = new_sl; +++ /* connect new state to parentage chain. Current frame needs all +++ * registers connected. Only r6 - r9 of the callers are alive (pushed +++ * to the stack implicitly by JITs) so in callers' frames connect just +++ * r6 - r9 as an optimization. Callers will have r1 - r5 connected to +++ * the state of the call instruction (with WRITTEN set), and r0 comes +++ * from callee with its full parentage chain, anyway. +++ */ +++ /* clear write marks in current state: the writes we did are not writes +++ * our child did, so they don't screen off its reads from us. +++ * (There are no read marks in current state, because reads always mark +++ * their parent and current state never has children yet. Only +++ * explored_states can get read marks.) +++ */ +++ for (j = 0; j <= cur->curframe; j++) { +++ for (i = j < cur->curframe ? BPF_REG_6 : 0; i < BPF_REG_FP; i++) +++ cur->frame[j]->regs[i].parent = &new->frame[j]->regs[i]; +++ for (i = 0; i < BPF_REG_FP; i++) +++ cur->frame[j]->regs[i].live = REG_LIVE_NONE; +++ } +++ +++ /* all stack frames are accessible from callee, clear them all */ +++ for (j = 0; j <= cur->curframe; j++) { +++ struct bpf_func_state *frame = cur->frame[j]; +++ struct bpf_func_state *newframe = new->frame[j]; +++ +++ for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) { +++ frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; +++ frame->stack[i].spilled_ptr.parent = +++ &newframe->stack[i].spilled_ptr; +++ } +++ } ++ return 0; ++ } ++ ++-static int do_check(struct verifier_env *env) +++/* Return true if it's OK to have the same insn return a different type. 
*/ +++static bool reg_type_mismatch_ok(enum bpf_reg_type type) ++ { ++- struct verifier_state *state = &env->cur_state; +++ switch (type) { +++ case PTR_TO_CTX: +++ case PTR_TO_SOCKET: +++ case PTR_TO_SOCKET_OR_NULL: +++ case PTR_TO_SOCK_COMMON: +++ case PTR_TO_SOCK_COMMON_OR_NULL: +++ case PTR_TO_TCP_SOCK: +++ case PTR_TO_TCP_SOCK_OR_NULL: +++ case PTR_TO_XDP_SOCK: +++ return false; +++ default: +++ return true; +++ } +++} +++ +++/* If an instruction was previously used with particular pointer types, then we +++ * need to be careful to avoid cases such as the below, where it may be ok +++ * for one branch accessing the pointer, but not ok for the other branch: +++ * +++ * R1 = sock_ptr +++ * goto X; +++ * ... +++ * R1 = some_other_valid_ptr; +++ * goto X; +++ * ... +++ * R2 = *(u32 *)(R1 + 0); +++ */ +++static bool reg_type_mismatch(enum bpf_reg_type src, enum bpf_reg_type prev) +++{ +++ return src != prev && (!reg_type_mismatch_ok(src) || +++ !reg_type_mismatch_ok(prev)); +++} +++ +++static int do_check(struct bpf_verifier_env *env) +++{ +++ struct bpf_verifier_state *state; ++ struct bpf_insn *insns = env->prog->insnsi; ++- struct reg_state *regs = state->regs; +++ struct bpf_reg_state *regs; ++ int insn_cnt = env->prog->len; ++- int insn_idx, prev_insn_idx = 0; ++- int insn_processed = 0; ++ bool do_print_state = false; +++ int prev_insn_idx = -1; +++ +++ env->prev_linfo = NULL; +++ +++ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); +++ if (!state) +++ return -ENOMEM; +++ state->curframe = 0; +++ state->speculative = false; +++ state->branches = 1; +++ state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); +++ if (!state->frame[0]) { +++ kfree(state); +++ return -ENOMEM; +++ } +++ env->cur_state = state; +++ init_func_state(env, state->frame[0], +++ BPF_MAIN_FUNC /* callsite */, +++ 0 /* frameno */, +++ 0 /* subprogno, zero == main subprog */); ++ ++- init_reg_state(regs); ++- insn_idx = 0; ++ for (;;) { ++ struct bpf_insn *insn; ++ u8 class; ++ int err; ++ ++- if (insn_idx >= insn_cnt) { ++- verbose("invalid insn idx %d insn_cnt %d\n", ++- insn_idx, insn_cnt); +++ env->prev_insn_idx = prev_insn_idx; +++ if (env->insn_idx >= insn_cnt) { +++ verbose(env, "invalid insn idx %d insn_cnt %d\n", +++ env->insn_idx, insn_cnt); ++ return -EFAULT; ++ } ++ ++- insn = &insns[insn_idx]; +++ insn = &insns[env->insn_idx]; ++ class = BPF_CLASS(insn->code); ++ ++- if (++insn_processed > 32768) { ++- verbose("BPF program is too large. Proccessed %d insn\n", ++- insn_processed); +++ if (++env->insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { +++ verbose(env, +++ "BPF program is too large. Processed %d insn\n", +++ env->insn_processed); ++ return -E2BIG; ++ } ++ ++- err = is_state_visited(env, insn_idx); +++ err = is_state_visited(env, env->insn_idx); ++ if (err < 0) ++ return err; ++ if (err == 1) { ++ /* found equivalent state, can prune the search */ ++- if (log_level) { +++ if (env->log.level & BPF_LOG_LEVEL) { ++ if (do_print_state) ++- verbose("\nfrom %d to %d: safe\n", ++- prev_insn_idx, insn_idx); +++ verbose(env, "\nfrom %d to %d%s: safe\n", +++ env->prev_insn_idx, env->insn_idx, +++ env->cur_state->speculative ? 
+++ " (speculative execution)" : ""); ++ else ++- verbose("%d: safe\n", insn_idx); +++ verbose(env, "%d: safe\n", env->insn_idx); ++ } ++ goto process_bpf_exit; ++ } ++ ++- if (log_level && do_print_state) { ++- verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); ++- print_verifier_state(env); +++ if (signal_pending(current)) +++ return -EAGAIN; +++ +++ if (need_resched()) +++ cond_resched(); +++ +++ if (env->log.level & BPF_LOG_LEVEL2 || +++ (env->log.level & BPF_LOG_LEVEL && do_print_state)) { +++ if (env->log.level & BPF_LOG_LEVEL2) +++ verbose(env, "%d:", env->insn_idx); +++ else +++ verbose(env, "\nfrom %d to %d%s:", +++ env->prev_insn_idx, env->insn_idx, +++ env->cur_state->speculative ? +++ " (speculative execution)" : ""); +++ print_verifier_state(env, state->frame[state->curframe]); ++ do_print_state = false; ++ } ++ ++- if (log_level) { ++- verbose("%d: ", insn_idx); ++- print_bpf_insn(env, insn); +++ if (env->log.level & BPF_LOG_LEVEL) { +++ const struct bpf_insn_cbs cbs = { +++ .cb_print = verbose, +++ .private_data = env, +++ }; +++ +++ verbose_linfo(env, env->insn_idx, "; "); +++ verbose(env, "%d: ", env->insn_idx); +++ print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); ++ } ++ +++ regs = cur_regs(env); +++ sanitize_mark_insn_seen(env); +++ prev_insn_idx = env->insn_idx; +++ ++ if (class == BPF_ALU || class == BPF_ALU64) { ++ err = check_alu_op(env, insn); ++ if (err) ++ return err; ++ ++ } else if (class == BPF_LDX) { ++- enum bpf_reg_type src_reg_type; +++ enum bpf_reg_type *prev_src_type, src_reg_type; ++ ++ /* check for reserved fields is already done */ ++ ++ /* check src operand */ ++- err = check_reg_arg(regs, insn->src_reg, SRC_OP); +++ err = check_reg_arg(env, insn->src_reg, SRC_OP); ++ if (err) ++ return err; ++ ++- err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); +++ err = check_reg_arg(env, insn->dst_reg, DST_OP_NO_MARK); ++ if (err) ++ return err; ++ ++@@ -1796,27 +7850,22 @@ static int do_check(struct verifier_env ++ /* check that memory (src_reg + off) is readable, ++ * the state of dst_reg will be updated by this func ++ */ ++- err = check_mem_access(env, insn->src_reg, insn->off, ++- BPF_SIZE(insn->code), BPF_READ, ++- insn->dst_reg); +++ err = check_mem_access(env, env->insn_idx, insn->src_reg, +++ insn->off, BPF_SIZE(insn->code), +++ BPF_READ, insn->dst_reg, false); ++ if (err) ++ return err; ++ ++- if (BPF_SIZE(insn->code) != BPF_W) { ++- insn_idx++; ++- continue; ++- } +++ prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; ++ ++- if (insn->imm == 0) { +++ if (*prev_src_type == NOT_INIT) { ++ /* saw a valid insn ++ * dst_reg = *(u32 *)(src_reg + off) ++- * use reserved 'imm' field to mark this insn +++ * save type to validate intersecting paths ++ */ ++- insn->imm = src_reg_type; +++ *prev_src_type = src_reg_type; ++ ++- } else if (src_reg_type != insn->imm && ++- (src_reg_type == PTR_TO_CTX || ++- insn->imm == PTR_TO_CTX)) { +++ } else if (reg_type_mismatch(src_reg_type, *prev_src_type)) { ++ /* ABuser program is trying to use the same insn ++ * dst_reg = *(u32*) (src_reg + off) ++ * with different pointer types: ++@@ -1824,79 +7873,98 @@ static int do_check(struct verifier_env ++ * src_reg == stack|map in some other branch. ++ * Reject it. 
++ */ ++- verbose("same insn cannot be used with different pointers\n"); +++ verbose(env, "same insn cannot be used with different pointers\n"); ++ return -EINVAL; ++ } ++ ++ } else if (class == BPF_STX) { ++- enum bpf_reg_type dst_reg_type; +++ enum bpf_reg_type *prev_dst_type, dst_reg_type; ++ ++ if (BPF_MODE(insn->code) == BPF_XADD) { ++- err = check_xadd(env, insn); +++ err = check_xadd(env, env->insn_idx, insn); ++ if (err) ++ return err; ++- insn_idx++; +++ env->insn_idx++; ++ continue; ++ } ++ ++ /* check src1 operand */ ++- err = check_reg_arg(regs, insn->src_reg, SRC_OP); +++ err = check_reg_arg(env, insn->src_reg, SRC_OP); ++ if (err) ++ return err; ++ /* check src2 operand */ ++- err = check_reg_arg(regs, insn->dst_reg, SRC_OP); +++ err = check_reg_arg(env, insn->dst_reg, SRC_OP); ++ if (err) ++ return err; ++ ++ dst_reg_type = regs[insn->dst_reg].type; ++ ++ /* check that memory (dst_reg + off) is writeable */ ++- err = check_mem_access(env, insn->dst_reg, insn->off, ++- BPF_SIZE(insn->code), BPF_WRITE, ++- insn->src_reg); +++ err = check_mem_access(env, env->insn_idx, insn->dst_reg, +++ insn->off, BPF_SIZE(insn->code), +++ BPF_WRITE, insn->src_reg, false); ++ if (err) ++ return err; ++ ++- if (insn->imm == 0) { ++- insn->imm = dst_reg_type; ++- } else if (dst_reg_type != insn->imm && ++- (dst_reg_type == PTR_TO_CTX || ++- insn->imm == PTR_TO_CTX)) { ++- verbose("same insn cannot be used with different pointers\n"); +++ prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; +++ +++ if (*prev_dst_type == NOT_INIT) { +++ *prev_dst_type = dst_reg_type; +++ } else if (reg_type_mismatch(dst_reg_type, *prev_dst_type)) { +++ verbose(env, "same insn cannot be used with different pointers\n"); ++ return -EINVAL; ++ } ++ ++ } else if (class == BPF_ST) { ++ if (BPF_MODE(insn->code) != BPF_MEM || ++ insn->src_reg != BPF_REG_0) { ++- verbose("BPF_ST uses reserved fields\n"); +++ verbose(env, "BPF_ST uses reserved fields\n"); ++ return -EINVAL; ++ } ++ /* check src operand */ ++- err = check_reg_arg(regs, insn->dst_reg, SRC_OP); +++ err = check_reg_arg(env, insn->dst_reg, SRC_OP); ++ if (err) ++ return err; ++ +++ if (is_ctx_reg(env, insn->dst_reg)) { +++ verbose(env, "BPF_ST stores into R%d %s is not allowed\n", +++ insn->dst_reg, +++ reg_type_str[reg_state(env, insn->dst_reg)->type]); +++ return -EACCES; +++ } +++ ++ /* check that memory (dst_reg + off) is writeable */ ++- err = check_mem_access(env, insn->dst_reg, insn->off, ++- BPF_SIZE(insn->code), BPF_WRITE, ++- -1); +++ err = check_mem_access(env, env->insn_idx, insn->dst_reg, +++ insn->off, BPF_SIZE(insn->code), +++ BPF_WRITE, -1, false); ++ if (err) ++ return err; ++ ++- } else if (class == BPF_JMP) { +++ } else if (class == BPF_JMP || class == BPF_JMP32) { ++ u8 opcode = BPF_OP(insn->code); ++ +++ env->jmps_processed++; ++ if (opcode == BPF_CALL) { ++ if (BPF_SRC(insn->code) != BPF_K || ++ insn->off != 0 || ++- insn->src_reg != BPF_REG_0 || ++- insn->dst_reg != BPF_REG_0) { ++- verbose("BPF_CALL uses reserved fields\n"); +++ (insn->src_reg != BPF_REG_0 && +++ insn->src_reg != BPF_PSEUDO_CALL) || +++ insn->dst_reg != BPF_REG_0 || +++ class == BPF_JMP32) { +++ verbose(env, "BPF_CALL uses reserved fields\n"); ++ return -EINVAL; ++ } ++ ++- err = check_call(env, insn->imm); +++ if (env->cur_state->active_spin_lock && +++ (insn->src_reg == BPF_PSEUDO_CALL || +++ insn->imm != BPF_FUNC_spin_unlock)) { +++ verbose(env, "function calls are not allowed while holding a lock\n"); +++ return -EINVAL; +++ } +++ if (insn->src_reg == 
BPF_PSEUDO_CALL) +++ err = check_func_call(env, insn, &env->insn_idx); +++ else +++ err = check_helper_call(env, insn->imm, env->insn_idx); ++ if (err) ++ return err; ++ ++@@ -1904,48 +7972,75 @@ static int do_check(struct verifier_env ++ if (BPF_SRC(insn->code) != BPF_K || ++ insn->imm != 0 || ++ insn->src_reg != BPF_REG_0 || ++- insn->dst_reg != BPF_REG_0) { ++- verbose("BPF_JA uses reserved fields\n"); +++ insn->dst_reg != BPF_REG_0 || +++ class == BPF_JMP32) { +++ verbose(env, "BPF_JA uses reserved fields\n"); ++ return -EINVAL; ++ } ++ ++- insn_idx += insn->off + 1; +++ env->insn_idx += insn->off + 1; ++ continue; ++ ++ } else if (opcode == BPF_EXIT) { ++ if (BPF_SRC(insn->code) != BPF_K || ++ insn->imm != 0 || ++ insn->src_reg != BPF_REG_0 || ++- insn->dst_reg != BPF_REG_0) { ++- verbose("BPF_EXIT uses reserved fields\n"); +++ insn->dst_reg != BPF_REG_0 || +++ class == BPF_JMP32) { +++ verbose(env, "BPF_EXIT uses reserved fields\n"); ++ return -EINVAL; ++ } ++ +++ if (env->cur_state->active_spin_lock) { +++ verbose(env, "bpf_spin_unlock is missing\n"); +++ return -EINVAL; +++ } +++ +++ if (state->curframe) { +++ /* exit from nested function */ +++ err = prepare_func_exit(env, &env->insn_idx); +++ if (err) +++ return err; +++ do_print_state = true; +++ continue; +++ } +++ +++ err = check_reference_leak(env); +++ if (err) +++ return err; +++ ++ /* eBPF calling convetion is such that R0 is used ++ * to return the value from eBPF program. ++ * Make sure that it's readable at this time ++ * of bpf_exit, which means that program wrote ++ * something into it earlier ++ */ ++- err = check_reg_arg(regs, BPF_REG_0, SRC_OP); +++ err = check_reg_arg(env, BPF_REG_0, SRC_OP); ++ if (err) ++ return err; ++ ++ if (is_pointer_value(env, BPF_REG_0)) { ++- verbose("R0 leaks addr as return value\n"); +++ verbose(env, "R0 leaks addr as return value\n"); ++ return -EACCES; ++ } ++ +++ err = check_return_code(env); +++ if (err) +++ return err; ++ process_bpf_exit: ++- insn_idx = pop_stack(env, &prev_insn_idx); ++- if (insn_idx < 0) { +++ update_branch_counts(env, env->cur_state); +++ err = pop_stack(env, &prev_insn_idx, +++ &env->insn_idx); +++ if (err < 0) { +++ if (err != -ENOENT) +++ return err; ++ break; ++ } else { ++ do_print_state = true; ++ continue; ++ } ++ } else { ++- err = check_cond_jmp_op(env, insn, &insn_idx); +++ err = check_cond_jmp_op(env, insn, &env->insn_idx); ++ if (err) ++ return err; ++ } ++@@ -1962,83 +8057,194 @@ process_bpf_exit: ++ if (err) ++ return err; ++ ++- insn_idx++; +++ env->insn_idx++; +++ sanitize_mark_insn_seen(env); ++ } else { ++- verbose("invalid BPF_LD mode\n"); +++ verbose(env, "invalid BPF_LD mode\n"); ++ return -EINVAL; ++ } ++ } else { ++- verbose("unknown insn class %d\n", class); +++ verbose(env, "unknown insn class %d\n", class); ++ return -EINVAL; ++ } ++ ++- insn_idx++; +++ env->insn_idx++; ++ } ++ +++ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth; ++ return 0; ++ } ++ +++static int check_map_prealloc(struct bpf_map *map) +++{ +++ return (map->map_type != BPF_MAP_TYPE_HASH && +++ map->map_type != BPF_MAP_TYPE_PERCPU_HASH && +++ map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || +++ !(map->map_flags & BPF_F_NO_PREALLOC); +++} +++ +++static bool is_tracing_prog_type(enum bpf_prog_type type) +++{ +++ switch (type) { +++ case BPF_PROG_TYPE_KPROBE: +++ case BPF_PROG_TYPE_TRACEPOINT: +++ case BPF_PROG_TYPE_PERF_EVENT: +++ case BPF_PROG_TYPE_RAW_TRACEPOINT: +++ return true; +++ default: +++ return false; +++ } +++} +++ +++static int 
check_map_prog_compatibility(struct bpf_verifier_env *env, +++ struct bpf_map *map, +++ struct bpf_prog *prog) +++ +++{ +++ /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use +++ * preallocated hash maps, since doing memory allocation +++ * in overflow_handler can crash depending on where nmi got +++ * triggered. +++ */ +++ if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { +++ if (!check_map_prealloc(map)) { +++ verbose(env, "perf_event programs can only use preallocated hash map\n"); +++ return -EINVAL; +++ } +++ if (map->inner_map_meta && +++ !check_map_prealloc(map->inner_map_meta)) { +++ verbose(env, "perf_event programs can only use preallocated inner hash map\n"); +++ return -EINVAL; +++ } +++ } +++ +++ if ((is_tracing_prog_type(prog->type) || +++ prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && +++ map_value_has_spin_lock(map)) { +++ verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); +++ return -EINVAL; +++ } +++ +++ return 0; +++} +++ +++static bool bpf_map_is_cgroup_storage(struct bpf_map *map) +++{ +++ return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || +++ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); +++} +++ ++ /* look for pseudo eBPF instructions that access map FDs and ++ * replace them with actual map pointers ++ */ ++-static int replace_map_fd_with_map_ptr(struct verifier_env *env) +++static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) ++ { ++ struct bpf_insn *insn = env->prog->insnsi; ++ int insn_cnt = env->prog->len; ++- int i, j; +++ int i, j, err; +++ +++ err = bpf_prog_calc_tag(env->prog); +++ if (err) +++ return err; ++ ++ for (i = 0; i < insn_cnt; i++, insn++) { ++ if (BPF_CLASS(insn->code) == BPF_LDX && ++ (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0)) { ++- verbose("BPF_LDX uses reserved fields\n"); +++ verbose(env, "BPF_LDX uses reserved fields\n"); ++ return -EINVAL; ++ } ++ ++ if (BPF_CLASS(insn->code) == BPF_STX && ++ ((BPF_MODE(insn->code) != BPF_MEM && ++ BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { ++- verbose("BPF_STX uses reserved fields\n"); +++ verbose(env, "BPF_STX uses reserved fields\n"); ++ return -EINVAL; ++ } ++ ++ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { +++ struct bpf_insn_aux_data *aux; ++ struct bpf_map *map; ++ struct fd f; +++ u64 addr; ++ ++ if (i == insn_cnt - 1 || insn[1].code != 0 || ++ insn[1].dst_reg != 0 || insn[1].src_reg != 0 || ++ insn[1].off != 0) { ++- verbose("invalid bpf_ld_imm64 insn\n"); +++ verbose(env, "invalid bpf_ld_imm64 insn\n"); ++ return -EINVAL; ++ } ++ ++- if (insn->src_reg == 0) +++ if (insn[0].src_reg == 0) ++ /* valid generic load 64-bit imm */ ++ goto next_insn; ++ ++- if (insn->src_reg != BPF_PSEUDO_MAP_FD) { ++- verbose("unrecognized bpf_ld_imm64 insn\n"); +++ /* In final convert_pseudo_ld_imm64() step, this is +++ * converted into regular 64-bit imm load insn. 
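/*
 * Editorial sketch, not part of the backported diff: the kind of map that
 * check_map_prog_compatibility() above refuses to pair with a
 * BPF_PROG_TYPE_PERF_EVENT program - a hash map created with
 * BPF_F_NO_PREALLOC, because a perf_event program may run in NMI context
 * where on-demand element allocation is unsafe.  Userspace view only, error
 * handling omitted; create_no_prealloc_hash() is a hypothetical helper.
 */
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int create_no_prealloc_hash(void)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.map_type    = BPF_MAP_TYPE_HASH;
        attr.key_size    = sizeof(__u32);
        attr.value_size  = sizeof(__u64);
        attr.max_entries = 128;
        attr.map_flags   = BPF_F_NO_PREALLOC;   /* lazy element allocation */

        return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}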
+++ */ +++ if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD && +++ insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) || +++ (insn[0].src_reg == BPF_PSEUDO_MAP_FD && +++ insn[1].imm != 0)) { +++ verbose(env, +++ "unrecognized bpf_ld_imm64 insn\n"); ++ return -EINVAL; ++ } ++ ++- f = fdget(insn->imm); +++ f = fdget(insn[0].imm); ++ map = __bpf_map_get(f); ++ if (IS_ERR(map)) { ++- verbose("fd %d is not pointing to valid bpf_map\n", ++- insn->imm); +++ verbose(env, "fd %d is not pointing to valid bpf_map\n", +++ insn[0].imm); ++ return PTR_ERR(map); ++ } ++ ++- /* store map pointer inside BPF_LD_IMM64 instruction */ ++- insn[0].imm = (u32) (unsigned long) map; ++- insn[1].imm = ((u64) (unsigned long) map) >> 32; +++ err = check_map_prog_compatibility(env, map, env->prog); +++ if (err) { +++ fdput(f); +++ return err; +++ } +++ +++ aux = &env->insn_aux_data[i]; +++ if (insn->src_reg == BPF_PSEUDO_MAP_FD) { +++ addr = (unsigned long)map; +++ } else { +++ u32 off = insn[1].imm; +++ +++ if (off >= BPF_MAX_VAR_OFF) { +++ verbose(env, "direct value offset of %u is not allowed\n", off); +++ fdput(f); +++ return -EINVAL; +++ } +++ +++ if (!map->ops->map_direct_value_addr) { +++ verbose(env, "no direct value access support for this map type\n"); +++ fdput(f); +++ return -EINVAL; +++ } +++ +++ err = map->ops->map_direct_value_addr(map, &addr, off); +++ if (err) { +++ verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n", +++ map->value_size, off); +++ fdput(f); +++ return err; +++ } +++ +++ aux->map_off = off; +++ addr += off; +++ } +++ +++ insn[0].imm = (u32)addr; +++ insn[1].imm = addr >> 32; ++ ++ /* check whether we recorded this map already */ ++- for (j = 0; j < env->used_map_cnt; j++) +++ for (j = 0; j < env->used_map_cnt; j++) { ++ if (env->used_maps[j] == map) { +++ aux->map_index = j; ++ fdput(f); ++ goto next_insn; ++ } +++ } ++ ++ if (env->used_map_cnt >= MAX_USED_MAPS) { ++ fdput(f); ++@@ -2048,19 +8254,31 @@ static int replace_map_fd_with_map_ptr(s ++ /* hold the map. If the program is rejected by verifier, ++ * the map will be released by release_maps() or it ++ * will be used by the valid program until it's unloaded ++- * and all maps are released in free_bpf_prog_info() +++ * and all maps are released in free_used_maps() ++ */ ++ map = bpf_map_inc(map, false); ++ if (IS_ERR(map)) { ++ fdput(f); ++ return PTR_ERR(map); ++ } +++ +++ aux->map_index = env->used_map_cnt; ++ env->used_maps[env->used_map_cnt++] = map; ++ +++ if (bpf_map_is_cgroup_storage(map)) +++ return -EINVAL; +++ ++ fdput(f); ++ next_insn: ++ insn++; ++ i++; +++ continue; +++ } +++ +++ /* Basic sanity check before we invest more work here. 
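/*
 * Editorial sketch, not part of the backported diff: the two-instruction
 * BPF_LD_IMM64 form that replace_map_fd_with_map_ptr() above rewrites.  A
 * loader places the map file descriptor in insn[0].imm with
 * src_reg == BPF_PSEUDO_MAP_FD and keeps insn[1].imm at zero; the verifier
 * then substitutes the kernel map pointer, split across the two imm fields.
 * emit_ld_map_fd() is a hypothetical helper name.
 */
#include <linux/bpf.h>

static void emit_ld_map_fd(struct bpf_insn *insn, int map_fd)
{
        insn[0] = (struct bpf_insn) {
                .code    = BPF_LD | BPF_DW | BPF_IMM,
                .dst_reg = BPF_REG_1,
                .src_reg = BPF_PSEUDO_MAP_FD,
                .imm     = map_fd,
        };
        /* second half of the 64-bit immediate; non-zero is rejected above */
        insn[1] = (struct bpf_insn) { .imm = 0 };
}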
*/ +++ if (!bpf_opcode_in_insntable(insn->code)) { +++ verbose(env, "unknown opcode %02x\n", insn->code); +++ return -EINVAL; ++ } ++ } ++ ++@@ -2072,7 +8290,7 @@ next_insn: ++ } ++ ++ /* drop refcnt of maps used by the rejected program */ ++-static void release_maps(struct verifier_env *env) +++static void release_maps(struct bpf_verifier_env *env) ++ { ++ int i; ++ ++@@ -2081,7 +8299,7 @@ static void release_maps(struct verifier ++ } ++ ++ /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ ++-static void convert_pseudo_ld_imm64(struct verifier_env *env) +++static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) ++ { ++ struct bpf_insn *insn = env->prog->insnsi; ++ int insn_cnt = env->prog->len; ++@@ -2092,201 +8310,1266 @@ static void convert_pseudo_ld_imm64(stru ++ insn->src_reg = 0; ++ } ++ ++-static void adjust_branches(struct bpf_prog *prog, int pos, int delta) +++/* single env->prog->insni[off] instruction was replaced with the range +++ * insni[off, off + cnt). Adjust corresponding insn_aux_data by copying +++ * [0, off) and [off, end) to new locations, so the patched range stays zero +++ */ +++static int adjust_insn_aux_data(struct bpf_verifier_env *env, +++ struct bpf_prog *new_prog, u32 off, u32 cnt) ++ { ++- struct bpf_insn *insn = prog->insnsi; ++- int insn_cnt = prog->len; +++ struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; +++ struct bpf_insn *insn = new_prog->insnsi; +++ bool old_seen = old_data[off].seen; +++ u32 prog_len; ++ int i; ++ ++- for (i = 0; i < insn_cnt; i++, insn++) { ++- if (BPF_CLASS(insn->code) != BPF_JMP || ++- BPF_OP(insn->code) == BPF_CALL || ++- BPF_OP(insn->code) == BPF_EXIT) +++ /* aux info at OFF always needs adjustment, no matter fast path +++ * (cnt == 1) is taken or not. There is no guarantee INSN at OFF is the +++ * original insn at old prog. +++ */ +++ old_data[off].zext_dst = insn_has_def32(env, insn + off + cnt - 1); +++ +++ if (cnt == 1) +++ return 0; +++ prog_len = new_prog->len; +++ new_data = vzalloc(array_size(prog_len, +++ sizeof(struct bpf_insn_aux_data))); +++ if (!new_data) +++ return -ENOMEM; +++ memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); +++ memcpy(new_data + off + cnt - 1, old_data + off, +++ sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); +++ for (i = off; i < off + cnt - 1; i++) { +++ /* Expand insni[off]'s seen count to the patched range. */ +++ new_data[i].seen = old_seen; +++ new_data[i].zext_dst = insn_has_def32(env, insn + i); +++ } +++ env->insn_aux_data = new_data; +++ vfree(old_data); +++ return 0; +++} +++ +++static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) +++{ +++ int i; +++ +++ if (len == 1) +++ return; +++ /* NOTE: fake 'exit' subprog should be updated as well. 
*/ +++ for (i = 0; i <= env->subprog_cnt; i++) { +++ if (env->subprog_info[i].start <= off) ++ continue; +++ env->subprog_info[i].start += len - 1; +++ } +++} +++ +++static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, +++ const struct bpf_insn *patch, u32 len) +++{ +++ struct bpf_prog *new_prog; +++ +++ new_prog = bpf_patch_insn_single(env->prog, off, patch, len); +++ if (IS_ERR(new_prog)) { +++ if (PTR_ERR(new_prog) == -ERANGE) +++ verbose(env, +++ "insn %d cannot be patched due to 16-bit range\n", +++ env->insn_aux_data[off].orig_idx); +++ return NULL; +++ } +++ if (adjust_insn_aux_data(env, new_prog, off, len)) +++ return NULL; +++ adjust_subprog_starts(env, off, len); +++ return new_prog; +++} ++ ++- /* adjust offset of jmps if necessary */ ++- if (i < pos && i + insn->off + 1 > pos) ++- insn->off += delta; ++- else if (i > pos + delta && i + insn->off + 1 <= pos + delta) ++- insn->off -= delta; +++static int adjust_subprog_starts_after_remove(struct bpf_verifier_env *env, +++ u32 off, u32 cnt) +++{ +++ int i, j; +++ +++ /* find first prog starting at or after off (first to remove) */ +++ for (i = 0; i < env->subprog_cnt; i++) +++ if (env->subprog_info[i].start >= off) +++ break; +++ /* find first prog starting at or after off + cnt (first to stay) */ +++ for (j = i; j < env->subprog_cnt; j++) +++ if (env->subprog_info[j].start >= off + cnt) +++ break; +++ /* if j doesn't start exactly at off + cnt, we are just removing +++ * the front of previous prog +++ */ +++ if (env->subprog_info[j].start != off + cnt) +++ j--; +++ +++ if (j > i) { +++ struct bpf_prog_aux *aux = env->prog->aux; +++ int move; +++ +++ /* move fake 'exit' subprog as well */ +++ move = env->subprog_cnt + 1 - j; +++ +++ memmove(env->subprog_info + i, +++ env->subprog_info + j, +++ sizeof(*env->subprog_info) * move); +++ env->subprog_cnt -= j - i; +++ +++ /* remove func_info */ +++ if (aux->func_info) { +++ move = aux->func_info_cnt - j; +++ +++ memmove(aux->func_info + i, +++ aux->func_info + j, +++ sizeof(*aux->func_info) * move); +++ aux->func_info_cnt -= j - i; +++ /* func_info->insn_off is set after all code rewrites, +++ * in adjust_btf_func() - no need to adjust +++ */ +++ } +++ } else { +++ /* convert i from "first prog to remove" to "first to adjust" */ +++ if (env->subprog_info[i].start == off) +++ i++; ++ } +++ +++ /* update fake 'exit' subprog as well */ +++ for (; i <= env->subprog_cnt; i++) +++ env->subprog_info[i].start -= cnt; +++ +++ return 0; +++} +++ +++static int bpf_adj_linfo_after_remove(struct bpf_verifier_env *env, u32 off, +++ u32 cnt) +++{ +++ struct bpf_prog *prog = env->prog; +++ u32 i, l_off, l_cnt, nr_linfo; +++ struct bpf_line_info *linfo; +++ +++ nr_linfo = prog->aux->nr_linfo; +++ if (!nr_linfo) +++ return 0; +++ +++ linfo = prog->aux->linfo; +++ +++ /* find first line info to remove, count lines to be removed */ +++ for (i = 0; i < nr_linfo; i++) +++ if (linfo[i].insn_off >= off) +++ break; +++ +++ l_off = i; +++ l_cnt = 0; +++ for (; i < nr_linfo; i++) +++ if (linfo[i].insn_off < off + cnt) +++ l_cnt++; +++ else +++ break; +++ +++ /* First live insn doesn't match first live linfo, it needs to "inherit" +++ * last removed linfo. prog is already modified, so prog->len == off +++ * means no live instructions after (tail of the program was removed). 
+++ */ +++ if (prog->len != off && l_cnt && +++ (i == nr_linfo || linfo[i].insn_off != off + cnt)) { +++ l_cnt--; +++ linfo[--i].insn_off = off + cnt; +++ } +++ +++ /* remove the line info which refer to the removed instructions */ +++ if (l_cnt) { +++ memmove(linfo + l_off, linfo + i, +++ sizeof(*linfo) * (nr_linfo - i)); +++ +++ prog->aux->nr_linfo -= l_cnt; +++ nr_linfo = prog->aux->nr_linfo; +++ } +++ +++ /* pull all linfo[i].insn_off >= off + cnt in by cnt */ +++ for (i = l_off; i < nr_linfo; i++) +++ linfo[i].insn_off -= cnt; +++ +++ /* fix up all subprogs (incl. 'exit') which start >= off */ +++ for (i = 0; i <= env->subprog_cnt; i++) +++ if (env->subprog_info[i].linfo_idx > l_off) { +++ /* program may have started in the removed region but +++ * may not be fully removed +++ */ +++ if (env->subprog_info[i].linfo_idx >= l_off + l_cnt) +++ env->subprog_info[i].linfo_idx -= l_cnt; +++ else +++ env->subprog_info[i].linfo_idx = l_off; +++ } +++ +++ return 0; +++} +++ +++static int verifier_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt) +++{ +++ struct bpf_insn_aux_data *aux_data = env->insn_aux_data; +++ unsigned int orig_prog_len = env->prog->len; +++ int err; +++ +++ err = bpf_remove_insns(env->prog, off, cnt); +++ if (err) +++ return err; +++ +++ err = adjust_subprog_starts_after_remove(env, off, cnt); +++ if (err) +++ return err; +++ +++ err = bpf_adj_linfo_after_remove(env, off, cnt); +++ if (err) +++ return err; +++ +++ memmove(aux_data + off, aux_data + off + cnt, +++ sizeof(*aux_data) * (orig_prog_len - off - cnt)); +++ +++ return 0; ++ } ++ ++-/* convert load instructions that access fields of 'struct __sk_buff' ++- * into sequence of instructions that access fields of 'struct sk_buff' +++/* The verifier does more data flow analysis than llvm and will not +++ * explore branches that are dead at run time. Malicious programs can +++ * have dead code too. Therefore replace all dead at-run-time code +++ * with 'ja -1'. +++ * +++ * Just nops are not optimal, e.g. if they would sit at the end of the +++ * program and through another bug we would manage to jump there, then +++ * we'd execute beyond program memory otherwise. Returning exception +++ * code also wouldn't work since we can have subprogs where the dead +++ * code could be located. 
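/*
 * Editorial sketch, not part of the backported diff: a toy program with a
 * branch the verifier can prove is always taken, so the two fall-through
 * insns are never marked seen.  For unprivileged loads sanitize_dead_code()
 * below turns them into 'ja -1' traps; for privileged loads
 * opt_hard_wire_dead_code_branches() and opt_remove_dead_code() rewrite the
 * branch and drop the dead insns instead.  Kernel-side sketch using the insn
 * macros from the include/linux/filter.h hunk later in this patch;
 * build_dead_tail_example() is a hypothetical name.
 */
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/string.h>

static unsigned int build_dead_tail_example(struct bpf_insn *out /* >= 5 slots */)
{
        const struct bpf_insn prog[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 1, 2),  /* always taken: r0 == 1 */
                BPF_MOV64_IMM(BPF_REG_0, 2),            /* never visited */
                BPF_ALU64_IMM(BPF_ADD, BPF_REG_0, 1),   /* never visited */
                BPF_EXIT_INSN(),
        };

        memcpy(out, prog, sizeof(prog));
        return ARRAY_SIZE(prog);
}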
++ */ ++-static int convert_ctx_accesses(struct verifier_env *env) +++static void sanitize_dead_code(struct bpf_verifier_env *env) ++ { +++ struct bpf_insn_aux_data *aux_data = env->insn_aux_data; +++ struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1); ++ struct bpf_insn *insn = env->prog->insnsi; +++ const int insn_cnt = env->prog->len; +++ int i; +++ +++ for (i = 0; i < insn_cnt; i++) { +++ if (aux_data[i].seen) +++ continue; +++ memcpy(insn + i, &trap, sizeof(trap)); +++ } +++} +++ +++static bool insn_is_cond_jump(u8 code) +++{ +++ u8 op; +++ +++ if (BPF_CLASS(code) == BPF_JMP32) +++ return true; +++ +++ if (BPF_CLASS(code) != BPF_JMP) +++ return false; +++ +++ op = BPF_OP(code); +++ return op != BPF_JA && op != BPF_EXIT && op != BPF_CALL; +++} +++ +++static void opt_hard_wire_dead_code_branches(struct bpf_verifier_env *env) +++{ +++ struct bpf_insn_aux_data *aux_data = env->insn_aux_data; +++ struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); +++ struct bpf_insn *insn = env->prog->insnsi; +++ const int insn_cnt = env->prog->len; +++ int i; +++ +++ for (i = 0; i < insn_cnt; i++, insn++) { +++ if (!insn_is_cond_jump(insn->code)) +++ continue; +++ +++ if (!aux_data[i + 1].seen) +++ ja.off = insn->off; +++ else if (!aux_data[i + 1 + insn->off].seen) +++ ja.off = 0; +++ else +++ continue; +++ +++ memcpy(insn, &ja, sizeof(ja)); +++ } +++} +++ +++static int opt_remove_dead_code(struct bpf_verifier_env *env) +++{ +++ struct bpf_insn_aux_data *aux_data = env->insn_aux_data; ++ int insn_cnt = env->prog->len; ++- struct bpf_insn insn_buf[16]; +++ int i, err; +++ +++ for (i = 0; i < insn_cnt; i++) { +++ int j; +++ +++ j = 0; +++ while (i + j < insn_cnt && !aux_data[i + j].seen) +++ j++; +++ if (!j) +++ continue; +++ +++ err = verifier_remove_insns(env, i, j); +++ if (err) +++ return err; +++ insn_cnt = env->prog->len; +++ } +++ +++ return 0; +++} +++ +++static int opt_remove_nops(struct bpf_verifier_env *env) +++{ +++ const struct bpf_insn ja = BPF_JMP_IMM(BPF_JA, 0, 0, 0); +++ struct bpf_insn *insn = env->prog->insnsi; +++ int insn_cnt = env->prog->len; +++ int i, err; +++ +++ for (i = 0; i < insn_cnt; i++) { +++ if (memcmp(&insn[i], &ja, sizeof(ja))) +++ continue; +++ +++ err = verifier_remove_insns(env, i, 1); +++ if (err) +++ return err; +++ insn_cnt--; +++ i--; +++ } +++ +++ return 0; +++} +++ +++static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, +++ const union bpf_attr *attr) +++{ +++ struct bpf_insn *patch, zext_patch[2], rnd_hi32_patch[4]; +++ struct bpf_insn_aux_data *aux = env->insn_aux_data; +++ int i, patch_len, delta = 0, len = env->prog->len; +++ struct bpf_insn *insns = env->prog->insnsi; +++ struct bpf_prog *new_prog; +++ bool rnd_hi32; +++ +++ rnd_hi32 = attr->prog_flags & BPF_F_TEST_RND_HI32; +++ zext_patch[1] = BPF_ZEXT_REG(0); +++ rnd_hi32_patch[1] = BPF_ALU64_IMM(BPF_MOV, BPF_REG_AX, 0); +++ rnd_hi32_patch[2] = BPF_ALU64_IMM(BPF_LSH, BPF_REG_AX, 32); +++ rnd_hi32_patch[3] = BPF_ALU64_REG(BPF_OR, 0, BPF_REG_AX); +++ for (i = 0; i < len; i++) { +++ int adj_idx = i + delta; +++ struct bpf_insn insn; +++ +++ insn = insns[adj_idx]; +++ if (!aux[adj_idx].zext_dst) { +++ u8 code, class; +++ u32 imm_rnd; +++ +++ if (!rnd_hi32) +++ continue; +++ +++ code = insn.code; +++ class = BPF_CLASS(code); +++ if (insn_no_def(&insn)) +++ continue; +++ +++ /* NOTE: arg "reg" (the fourth one) is only used for +++ * BPF_STX which has been ruled out in above +++ * check, it is safe to pass NULL here. 
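/*
 * Editorial sketch, not part of the backported diff: the shape of the patch
 * that the sub-register pass defined here (opt_subreg_zext_lo32_rnd_hi32())
 * inserts after a 32-bit definition when bpf_jit_needs_zext() is true.
 * BPF_ZEXT_REG comes from the include/linux/filter.h hunk later in this
 * patch: a mov32 of the register onto itself with imm == 1, which
 * insn_is_zext() recognises.  emit_alu32_with_zext() is a hypothetical name.
 */
#include <linux/filter.h>

static void emit_alu32_with_zext(struct bpf_insn out[2])
{
        out[0] = BPF_ALU32_IMM(BPF_ADD, BPF_REG_2, 1);  /* 32-bit def of r2 */
        out[1] = BPF_ZEXT_REG(BPF_REG_2);               /* explicit upper-32 clear */
}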
+++ */ +++ if (is_reg64(env, &insn, insn.dst_reg, NULL, DST_OP)) { +++ if (class == BPF_LD && +++ BPF_MODE(code) == BPF_IMM) +++ i++; +++ continue; +++ } +++ +++ /* ctx load could be transformed into wider load. */ +++ if (class == BPF_LDX && +++ aux[adj_idx].ptr_type == PTR_TO_CTX) +++ continue; +++ +++ imm_rnd = get_random_int(); +++ rnd_hi32_patch[0] = insn; +++ rnd_hi32_patch[1].imm = imm_rnd; +++ rnd_hi32_patch[3].dst_reg = insn.dst_reg; +++ patch = rnd_hi32_patch; +++ patch_len = 4; +++ goto apply_patch_buffer; +++ } +++ +++ if (!bpf_jit_needs_zext()) +++ continue; +++ +++ zext_patch[0] = insn; +++ zext_patch[1].dst_reg = insn.dst_reg; +++ zext_patch[1].src_reg = insn.dst_reg; +++ patch = zext_patch; +++ patch_len = 2; +++apply_patch_buffer: +++ new_prog = bpf_patch_insn_data(env, adj_idx, patch, patch_len); +++ if (!new_prog) +++ return -ENOMEM; +++ env->prog = new_prog; +++ insns = new_prog->insnsi; +++ aux = env->insn_aux_data; +++ delta += patch_len - 1; +++ } +++ +++ return 0; +++} +++ +++/* convert load instructions that access fields of a context type into a +++ * sequence of instructions that access fields of the underlying structure: +++ * struct __sk_buff -> struct sk_buff +++ * struct bpf_sock_ops -> struct sock +++ */ +++static int convert_ctx_accesses(struct bpf_verifier_env *env) +++{ +++ const struct bpf_verifier_ops *ops = env->ops; +++ int i, cnt, size, ctx_field_size, delta = 0; +++ const int insn_cnt = env->prog->len; +++ struct bpf_insn insn_buf[16], *insn; +++ u32 target_size, size_default, off; ++ struct bpf_prog *new_prog; ++- u32 cnt; ++- int i; ++ enum bpf_access_type type; +++ bool is_narrower_load; +++ +++ if (ops->gen_prologue || env->seen_direct_write) { +++ if (!ops->gen_prologue) { +++ verbose(env, "bpf verifier is misconfigured\n"); +++ return -EINVAL; +++ } +++ cnt = ops->gen_prologue(insn_buf, env->seen_direct_write, +++ env->prog); +++ if (cnt >= ARRAY_SIZE(insn_buf)) { +++ verbose(env, "bpf verifier is misconfigured\n"); +++ return -EINVAL; +++ } else if (cnt) { +++ new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); +++ if (!new_prog) +++ return -ENOMEM; +++ +++ env->prog = new_prog; +++ delta += cnt - 1; +++ } +++ } ++ ++- if (!env->prog->aux->ops->convert_ctx_access) +++ if (bpf_prog_is_dev_bound(env->prog->aux)) ++ return 0; ++ +++ insn = env->prog->insnsi + delta; +++ ++ for (i = 0; i < insn_cnt; i++, insn++) { ++- if (insn->code == (BPF_LDX | BPF_MEM | BPF_W)) +++ bpf_convert_ctx_access_t convert_ctx_access; +++ +++ if (insn->code == (BPF_LDX | BPF_MEM | BPF_B) || +++ insn->code == (BPF_LDX | BPF_MEM | BPF_H) || +++ insn->code == (BPF_LDX | BPF_MEM | BPF_W) || +++ insn->code == (BPF_LDX | BPF_MEM | BPF_DW)) ++ type = BPF_READ; ++- else if (insn->code == (BPF_STX | BPF_MEM | BPF_W)) +++ else if (insn->code == (BPF_STX | BPF_MEM | BPF_B) || +++ insn->code == (BPF_STX | BPF_MEM | BPF_H) || +++ insn->code == (BPF_STX | BPF_MEM | BPF_W) || +++ insn->code == (BPF_STX | BPF_MEM | BPF_DW)) ++ type = BPF_WRITE; ++ else ++ continue; ++ ++- if (insn->imm != PTR_TO_CTX) { ++- /* clear internal mark */ ++- insn->imm = 0; +++ if (type == BPF_WRITE && +++ env->insn_aux_data[i + delta].sanitize_stack_off) { +++ struct bpf_insn patch[] = { +++ /* Sanitize suspicious stack slot with zero. 
+++ * There are no memory dependencies for this store, +++ * since it's only using frame pointer and immediate +++ * constant of zero +++ */ +++ BPF_ST_MEM(BPF_DW, BPF_REG_FP, +++ env->insn_aux_data[i + delta].sanitize_stack_off, +++ 0), +++ /* the original STX instruction will immediately +++ * overwrite the same stack slot with appropriate value +++ */ +++ *insn, +++ }; +++ +++ cnt = ARRAY_SIZE(patch); +++ new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); +++ if (!new_prog) +++ return -ENOMEM; +++ +++ delta += cnt - 1; +++ env->prog = new_prog; +++ insn = new_prog->insnsi + i + delta; +++ continue; +++ } +++ +++ switch (env->insn_aux_data[i + delta].ptr_type) { +++ case PTR_TO_CTX: +++ if (!ops->convert_ctx_access) +++ continue; +++ convert_ctx_access = ops->convert_ctx_access; +++ break; +++ default: ++ continue; ++ } ++ ++- cnt = env->prog->aux->ops-> ++- convert_ctx_access(type, insn->dst_reg, insn->src_reg, ++- insn->off, insn_buf, env->prog); ++- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { ++- verbose("bpf verifier is misconfigured\n"); +++ ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size; +++ size = BPF_LDST_BYTES(insn); +++ +++ /* If the read access is a narrower load of the field, +++ * convert to a 4/8-byte load, to minimum program type specific +++ * convert_ctx_access changes. If conversion is successful, +++ * we will apply proper mask to the result. +++ */ +++ is_narrower_load = size < ctx_field_size; +++ size_default = bpf_ctx_off_adjust_machine(ctx_field_size); +++ off = insn->off; +++ if (is_narrower_load) { +++ u8 size_code; +++ +++ if (type == BPF_WRITE) { +++ verbose(env, "bpf verifier narrow ctx access misconfigured\n"); +++ return -EINVAL; +++ } +++ +++ size_code = BPF_H; +++ if (ctx_field_size == 4) +++ size_code = BPF_W; +++ else if (ctx_field_size == 8) +++ size_code = BPF_DW; +++ +++ insn->off = off & ~(size_default - 1); +++ insn->code = BPF_LDX | BPF_MEM | size_code; +++ } +++ +++ target_size = 0; +++ cnt = convert_ctx_access(type, insn, insn_buf, env->prog, +++ &target_size); +++ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) || +++ (ctx_field_size && !target_size)) { +++ verbose(env, "bpf verifier is misconfigured\n"); ++ return -EINVAL; ++ } ++ ++- if (cnt == 1) { ++- memcpy(insn, insn_buf, sizeof(*insn)); ++- continue; +++ if (is_narrower_load && size < target_size) { +++ u8 shift = bpf_ctx_narrow_access_offset( +++ off, size, size_default) * 8; +++ if (ctx_field_size <= 4) { +++ if (shift) +++ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH, +++ insn->dst_reg, +++ shift); +++ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg, +++ (1 << size * 8) - 1); +++ } else { +++ if (shift) +++ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_RSH, +++ insn->dst_reg, +++ shift); +++ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg, +++ (1ULL << size * 8) - 1); +++ } ++ } ++ ++- /* several new insns need to be inserted. 
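/*
 * Editorial sketch, not part of the backported diff: the arithmetic behind
 * the narrow-load rewrite above, written out for a little-endian machine.  A
 * one-byte read of a wider context field is widened to an aligned 4-byte
 * load, then shifted and masked back down, matching the BPF_RSH/BPF_AND pair
 * appended to insn_buf.  bpf_ctx_narrow_access_offset() hides the endianness
 * detail; narrow_ctx_read_u8() is a hypothetical name.
 */
#include <stdint.h>

static uint8_t narrow_ctx_read_u8(const void *ctx, unsigned int off)
{
        const uint32_t *words = ctx;
        uint32_t full = words[off / 4];         /* widened, aligned 4-byte load */
        unsigned int shift = (off % 4) * 8;     /* little-endian byte offset */

        return (full >> shift) & 0xff;          /* mask back to the narrow size */
}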
Make room for them */ ++- insn_cnt += cnt - 1; ++- new_prog = bpf_prog_realloc(env->prog, ++- bpf_prog_size(insn_cnt), ++- GFP_USER); +++ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); ++ if (!new_prog) ++ return -ENOMEM; ++ ++- new_prog->len = insn_cnt; +++ delta += cnt - 1; ++ ++- memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, ++- sizeof(*insn) * (insn_cnt - i - cnt)); +++ /* keep walking new program and skip insns we just inserted */ +++ env->prog = new_prog; +++ insn = new_prog->insnsi + i + delta; +++ } ++ ++- /* copy substitute insns in place of load instruction */ ++- memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); +++ return 0; +++} ++ ++- /* adjust branches in the whole program */ ++- adjust_branches(new_prog, i, cnt - 1); +++static int jit_subprogs(struct bpf_verifier_env *env) +++{ +++ struct bpf_prog *prog = env->prog, **func, *tmp; +++ int i, j, subprog_start, subprog_end = 0, len, subprog; +++ struct bpf_insn *insn; +++ void *old_bpf_func; +++ int err; ++ ++- /* keep walking new program and skip insns we just inserted */ ++- env->prog = new_prog; ++- insn = new_prog->insnsi + i + cnt - 1; ++- i += cnt - 1; +++ if (env->subprog_cnt <= 1) +++ return 0; +++ +++ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { +++ if (insn->code != (BPF_JMP | BPF_CALL) || +++ insn->src_reg != BPF_PSEUDO_CALL) +++ continue; +++ /* Upon error here we cannot fall back to interpreter but +++ * need a hard reject of the program. Thus -EFAULT is +++ * propagated in any case. +++ */ +++ subprog = find_subprog(env, i + insn->imm + 1); +++ if (subprog < 0) { +++ WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", +++ i + insn->imm + 1); +++ return -EFAULT; +++ } +++ /* temporarily remember subprog id inside insn instead of +++ * aux_data, since next loop will split up all insns into funcs +++ */ +++ insn->off = subprog; +++ /* remember original imm in case JIT fails and fallback +++ * to interpreter will be needed +++ */ +++ env->insn_aux_data[i].call_imm = insn->imm; +++ /* point imm to __bpf_call_base+1 from JITs point of view */ +++ insn->imm = 1; +++ } +++ +++ err = bpf_prog_alloc_jited_linfo(prog); +++ if (err) +++ goto out_undo_insn; +++ +++ err = -ENOMEM; +++ func = kcalloc(env->subprog_cnt, sizeof(prog), GFP_KERNEL); +++ if (!func) +++ goto out_undo_insn; +++ +++ for (i = 0; i < env->subprog_cnt; i++) { +++ subprog_start = subprog_end; +++ subprog_end = env->subprog_info[i + 1].start; +++ +++ len = subprog_end - subprog_start; +++ /* BPF_PROG_RUN doesn't call subprogs directly, +++ * hence main prog stats include the runtime of subprogs. +++ * subprogs don't have IDs and not reachable via prog_get_next_id +++ * func[i]->aux->stats will never be accessed and stays NULL +++ */ +++ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); +++ if (!func[i]) +++ goto out_free; +++ memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], +++ len * sizeof(struct bpf_insn)); +++ func[i]->type = prog->type; +++ func[i]->len = len; +++ if (bpf_prog_calc_tag(func[i])) +++ goto out_free; +++ func[i]->is_func = 1; +++ func[i]->aux->func_idx = i; +++ /* the btf and func_info will be freed only at prog->aux */ +++ func[i]->aux->btf = prog->aux->btf; +++ func[i]->aux->func_info = prog->aux->func_info; +++ +++ /* Use bpf_prog_F_tag to indicate functions in stack traces. 
+++ * Long term would need debug info to populate names +++ */ +++ func[i]->aux->name[0] = 'F'; +++ func[i]->aux->stack_depth = env->subprog_info[i].stack_depth; +++ func[i]->jit_requested = 1; +++ func[i]->aux->linfo = prog->aux->linfo; +++ func[i]->aux->nr_linfo = prog->aux->nr_linfo; +++ func[i]->aux->jited_linfo = prog->aux->jited_linfo; +++ func[i]->aux->linfo_idx = env->subprog_info[i].linfo_idx; +++ func[i] = bpf_int_jit_compile(func[i]); +++ if (!func[i]->jited) { +++ err = -ENOTSUPP; +++ goto out_free; +++ } +++ cond_resched(); +++ } +++ /* at this point all bpf functions were successfully JITed +++ * now populate all bpf_calls with correct addresses and +++ * run last pass of JIT +++ */ +++ for (i = 0; i < env->subprog_cnt; i++) { +++ insn = func[i]->insnsi; +++ for (j = 0; j < func[i]->len; j++, insn++) { +++ if (insn->code != (BPF_JMP | BPF_CALL) || +++ insn->src_reg != BPF_PSEUDO_CALL) +++ continue; +++ subprog = insn->off; +++ insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - +++ __bpf_call_base; +++ } +++ +++ /* we use the aux data to keep a list of the start addresses +++ * of the JITed images for each function in the program +++ * +++ * for some architectures, such as powerpc64, the imm field +++ * might not be large enough to hold the offset of the start +++ * address of the callee's JITed image from __bpf_call_base +++ * +++ * in such cases, we can lookup the start address of a callee +++ * by using its subprog id, available from the off field of +++ * the call instruction, as an index for this list +++ */ +++ func[i]->aux->func = func; +++ func[i]->aux->func_cnt = env->subprog_cnt; +++ } +++ for (i = 0; i < env->subprog_cnt; i++) { +++ old_bpf_func = func[i]->bpf_func; +++ tmp = bpf_int_jit_compile(func[i]); +++ if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { +++ verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); +++ err = -ENOTSUPP; +++ goto out_free; +++ } +++ cond_resched(); +++ } +++ +++ /* finally lock prog and jit images for all functions and +++ * populate kallsysm +++ */ +++ for (i = 0; i < env->subprog_cnt; i++) { +++ bpf_prog_lock_ro(func[i]); +++ bpf_prog_kallsyms_add(func[i]); +++ } +++ +++ /* Last step: make now unused interpreter insns from main +++ * prog consistent for later dump requests, so they can +++ * later look the same as if they were interpreted only. 
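/*
 * Editorial sketch, not part of the backported diff: what a bpf-to-bpf call
 * looks like before jit_subprogs() above rewrites its imm to a JITed
 * address.  BPF_CALL_REL comes from the include/linux/filter.h hunk later in
 * this patch; imm is instruction-relative, so the callee begins at
 * (call index + imm + 1), which is exactly what find_subprog() is asked for.
 * make_bpf2bpf_call() is a hypothetical name.
 */
#include <linux/filter.h>

/* build a call to a subprogram that starts (delta + 1) insns after the call */
static struct bpf_insn make_bpf2bpf_call(int delta)
{
        return BPF_CALL_REL(delta);
}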
+++ */ +++ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { +++ if (insn->code != (BPF_JMP | BPF_CALL) || +++ insn->src_reg != BPF_PSEUDO_CALL) +++ continue; +++ insn->off = env->insn_aux_data[i].call_imm; +++ subprog = find_subprog(env, i + insn->off + 1); +++ insn->imm = subprog; ++ } ++ +++ prog->jited = 1; +++ prog->bpf_func = func[0]->bpf_func; +++ prog->aux->func = func; +++ prog->aux->func_cnt = env->subprog_cnt; +++ bpf_prog_free_unused_jited_linfo(prog); ++ return 0; +++out_free: +++ for (i = 0; i < env->subprog_cnt; i++) +++ if (func[i]) +++ bpf_jit_free(func[i]); +++ kfree(func); +++out_undo_insn: +++ /* cleanup main prog to be interpreted */ +++ prog->jit_requested = 0; +++ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { +++ if (insn->code != (BPF_JMP | BPF_CALL) || +++ insn->src_reg != BPF_PSEUDO_CALL) +++ continue; +++ insn->off = 0; +++ insn->imm = env->insn_aux_data[i].call_imm; +++ } +++ bpf_prog_free_jited_linfo(prog); +++ return err; ++ } ++ ++-static void free_states(struct verifier_env *env) +++static int fixup_call_args(struct bpf_verifier_env *env) ++ { ++- struct verifier_state_list *sl, *sln; +++#ifndef CONFIG_BPF_JIT_ALWAYS_ON +++ struct bpf_prog *prog = env->prog; +++ struct bpf_insn *insn = prog->insnsi; +++ int i, depth; +++#endif +++ int err = 0; +++ +++ if (env->prog->jit_requested && +++ !bpf_prog_is_dev_bound(env->prog->aux)) { +++ err = jit_subprogs(env); +++ if (err == 0) +++ return 0; +++ if (err == -EFAULT) +++ return err; +++ } +++#ifndef CONFIG_BPF_JIT_ALWAYS_ON +++ for (i = 0; i < prog->len; i++, insn++) { +++ if (insn->code != (BPF_JMP | BPF_CALL) || +++ insn->src_reg != BPF_PSEUDO_CALL) +++ continue; +++ depth = get_callee_stack_depth(env, insn, i); +++ if (depth < 0) +++ return depth; +++ bpf_patch_call_args(insn, depth); +++ } +++ err = 0; +++#endif +++ return err; +++} +++ +++/* fixup insn->imm field of bpf_call instructions +++ * and inline eligible helpers as explicit sequence of BPF instructions +++ * +++ * this function is called after eBPF program passed verification +++ */ +++static int fixup_bpf_calls(struct bpf_verifier_env *env) +++{ +++ struct bpf_prog *prog = env->prog; +++ struct bpf_insn *insn = prog->insnsi; +++ const struct bpf_func_proto *fn; +++ const int insn_cnt = prog->len; +++ const struct bpf_map_ops *ops; +++ struct bpf_insn_aux_data *aux; +++ struct bpf_insn insn_buf[16]; +++ struct bpf_prog *new_prog; +++ struct bpf_map *map_ptr; +++ int i, cnt, delta = 0; +++ +++ for (i = 0; i < insn_cnt; i++, insn++) { +++ if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || +++ insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || +++ insn->code == (BPF_ALU | BPF_MOD | BPF_X) || +++ insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { +++ bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; +++ bool isdiv = BPF_OP(insn->code) == BPF_DIV; +++ struct bpf_insn *patchlet; +++ struct bpf_insn chk_and_div[] = { +++ /* [R,W]x div 0 -> 0 */ +++ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | +++ BPF_JNE | BPF_K, insn->src_reg, +++ 0, 2, 0), +++ BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), +++ BPF_JMP_IMM(BPF_JA, 0, 0, 1), +++ *insn, +++ }; +++ struct bpf_insn chk_and_mod[] = { +++ /* [R,W]x mod 0 -> [R,W]x */ +++ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | +++ BPF_JEQ | BPF_K, insn->src_reg, +++ 0, 1 + (is64 ? 0 : 1), 0), +++ *insn, +++ BPF_JMP_IMM(BPF_JA, 0, 0, 1), +++ BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), +++ }; +++ +++ patchlet = isdiv ? chk_and_div : chk_and_mod; +++ cnt = isdiv ? 
ARRAY_SIZE(chk_and_div) : +++ ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0); +++ +++ new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); +++ if (!new_prog) +++ return -ENOMEM; +++ +++ delta += cnt - 1; +++ env->prog = prog = new_prog; +++ insn = new_prog->insnsi + i + delta; +++ continue; +++ } +++ +++ if (BPF_CLASS(insn->code) == BPF_LD && +++ (BPF_MODE(insn->code) == BPF_ABS || +++ BPF_MODE(insn->code) == BPF_IND)) { +++ cnt = env->ops->gen_ld_abs(insn, insn_buf); +++ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { +++ verbose(env, "bpf verifier is misconfigured\n"); +++ return -EINVAL; +++ } +++ +++ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); +++ if (!new_prog) +++ return -ENOMEM; +++ +++ delta += cnt - 1; +++ env->prog = prog = new_prog; +++ insn = new_prog->insnsi + i + delta; +++ continue; +++ } +++ +++ if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || +++ insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { +++ const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; +++ const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; +++ struct bpf_insn insn_buf[16]; +++ struct bpf_insn *patch = &insn_buf[0]; +++ bool issrc, isneg, isimm; +++ u32 off_reg; +++ +++ aux = &env->insn_aux_data[i + delta]; +++ if (!aux->alu_state || +++ aux->alu_state == BPF_ALU_NON_POINTER) +++ continue; +++ +++ isneg = aux->alu_state & BPF_ALU_NEG_VALUE; +++ issrc = (aux->alu_state & BPF_ALU_SANITIZE) == +++ BPF_ALU_SANITIZE_SRC; +++ isimm = aux->alu_state & BPF_ALU_IMMEDIATE; +++ +++ off_reg = issrc ? insn->src_reg : insn->dst_reg; +++ if (isimm) { +++ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); +++ } else { +++ if (isneg) +++ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); +++ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit); +++ *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); +++ *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); +++ *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); +++ *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); +++ *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, off_reg); +++ } +++ if (!issrc) +++ *patch++ = BPF_MOV64_REG(insn->dst_reg, insn->src_reg); +++ insn->src_reg = BPF_REG_AX; +++ if (isneg) +++ insn->code = insn->code == code_add ? +++ code_sub : code_add; +++ *patch++ = *insn; +++ if (issrc && isneg && !isimm) +++ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); +++ cnt = patch - insn_buf; +++ +++ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); +++ if (!new_prog) +++ return -ENOMEM; +++ +++ delta += cnt - 1; +++ env->prog = prog = new_prog; +++ insn = new_prog->insnsi + i + delta; +++ continue; +++ } +++ +++ if (insn->code != (BPF_JMP | BPF_CALL)) +++ continue; +++ if (insn->src_reg == BPF_PSEUDO_CALL) +++ continue; +++ +++ if (insn->imm == BPF_FUNC_get_route_realm) +++ prog->dst_needed = 1; +++ if (insn->imm == BPF_FUNC_get_prandom_u32) +++ bpf_user_rnd_init_once(); +++ if (insn->imm == BPF_FUNC_override_return) +++ prog->kprobe_override = 1; +++ if (insn->imm == BPF_FUNC_tail_call) { +++ /* If we tail call into other programs, we +++ * cannot make any assumptions since they can +++ * be replaced dynamically during runtime in +++ * the program array. 
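/*
 * Editorial sketch, not part of the backported diff: the net effect of the
 * BPF_REG_AX masking sequence emitted above for sanitized pointer arithmetic
 * (the isneg/issrc/isimm variants are ignored).  Roughly, the offset
 * survives when it lies within alu_limit and collapses to zero otherwise, so
 * even a speculatively executed out-of-bounds offset cannot form a wild
 * pointer.  masked_alu_offset() is a hypothetical name.
 */
#include <stdint.h>

static uint64_t masked_alu_offset(uint64_t off, uint64_t alu_limit)
{
        uint64_t ax = alu_limit;                /* BPF_MOV32_IMM(BPF_REG_AX, limit) */

        ax -= off;                              /* BPF_SUB  */
        ax |= off;                              /* BPF_OR   */
        ax = -ax;                               /* BPF_NEG  */
        ax = (uint64_t)((int64_t)ax >> 63);     /* BPF_ARSH 63: all-ones or zero */

        return ax & off;                        /* BPF_AND: off if in range, else 0 */
}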
+++ */ +++ prog->cb_access = 1; +++ env->prog->aux->stack_depth = MAX_BPF_STACK; +++ env->prog->aux->max_pkt_offset = MAX_PACKET_OFF; +++ +++ /* mark bpf_tail_call as different opcode to avoid +++ * conditional branch in the interpeter for every normal +++ * call and to prevent accidental JITing by JIT compiler +++ * that doesn't support bpf_tail_call yet +++ */ +++ insn->imm = 0; +++ insn->code = BPF_JMP | BPF_TAIL_CALL; +++ +++ aux = &env->insn_aux_data[i + delta]; +++ if (!bpf_map_ptr_unpriv(aux)) +++ continue; +++ +++ /* instead of changing every JIT dealing with tail_call +++ * emit two extra insns: +++ * if (index >= max_entries) goto out; +++ * index &= array->index_mask; +++ * to avoid out-of-bounds cpu speculation +++ */ +++ if (bpf_map_ptr_poisoned(aux)) { +++ verbose(env, "tail_call abusing map_ptr\n"); +++ return -EINVAL; +++ } +++ +++ map_ptr = BPF_MAP_PTR(aux->map_state); +++ insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, +++ map_ptr->max_entries, 2); +++ insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, +++ container_of(map_ptr, +++ struct bpf_array, +++ map)->index_mask); +++ insn_buf[2] = *insn; +++ cnt = 3; +++ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); +++ if (!new_prog) +++ return -ENOMEM; +++ +++ delta += cnt - 1; +++ env->prog = prog = new_prog; +++ insn = new_prog->insnsi + i + delta; +++ continue; +++ } +++ +++ /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup +++ * and other inlining handlers are currently limited to 64 bit +++ * only. +++ */ +++ if (prog->jit_requested && BITS_PER_LONG == 64 && +++ (insn->imm == BPF_FUNC_map_lookup_elem || +++ insn->imm == BPF_FUNC_map_update_elem || +++ insn->imm == BPF_FUNC_map_delete_elem || +++ insn->imm == BPF_FUNC_map_push_elem || +++ insn->imm == BPF_FUNC_map_pop_elem || +++ insn->imm == BPF_FUNC_map_peek_elem)) { +++ aux = &env->insn_aux_data[i + delta]; +++ if (bpf_map_ptr_poisoned(aux)) +++ goto patch_call_imm; +++ +++ map_ptr = BPF_MAP_PTR(aux->map_state); +++ ops = map_ptr->ops; +++ if (insn->imm == BPF_FUNC_map_lookup_elem && +++ ops->map_gen_lookup) { +++ cnt = ops->map_gen_lookup(map_ptr, insn_buf); +++ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { +++ verbose(env, "bpf verifier is misconfigured\n"); +++ return -EINVAL; +++ } +++ +++ new_prog = bpf_patch_insn_data(env, i + delta, +++ insn_buf, cnt); +++ if (!new_prog) +++ return -ENOMEM; +++ +++ delta += cnt - 1; +++ env->prog = prog = new_prog; +++ insn = new_prog->insnsi + i + delta; +++ continue; +++ } +++ +++ BUILD_BUG_ON(!__same_type(ops->map_lookup_elem, +++ (void *(*)(struct bpf_map *map, void *key))NULL)); +++ BUILD_BUG_ON(!__same_type(ops->map_delete_elem, +++ (int (*)(struct bpf_map *map, void *key))NULL)); +++ BUILD_BUG_ON(!__same_type(ops->map_update_elem, +++ (int (*)(struct bpf_map *map, void *key, void *value, +++ u64 flags))NULL)); +++ BUILD_BUG_ON(!__same_type(ops->map_push_elem, +++ (int (*)(struct bpf_map *map, void *value, +++ u64 flags))NULL)); +++ BUILD_BUG_ON(!__same_type(ops->map_pop_elem, +++ (int (*)(struct bpf_map *map, void *value))NULL)); +++ BUILD_BUG_ON(!__same_type(ops->map_peek_elem, +++ (int (*)(struct bpf_map *map, void *value))NULL)); +++ +++ switch (insn->imm) { +++ case BPF_FUNC_map_lookup_elem: +++ insn->imm = BPF_CAST_CALL(ops->map_lookup_elem) - +++ __bpf_call_base; +++ continue; +++ case BPF_FUNC_map_update_elem: +++ insn->imm = BPF_CAST_CALL(ops->map_update_elem) - +++ __bpf_call_base; +++ continue; +++ case BPF_FUNC_map_delete_elem: +++ insn->imm = BPF_CAST_CALL(ops->map_delete_elem) - +++ 
__bpf_call_base; +++ continue; +++ case BPF_FUNC_map_push_elem: +++ insn->imm = BPF_CAST_CALL(ops->map_push_elem) - +++ __bpf_call_base; +++ continue; +++ case BPF_FUNC_map_pop_elem: +++ insn->imm = BPF_CAST_CALL(ops->map_pop_elem) - +++ __bpf_call_base; +++ continue; +++ case BPF_FUNC_map_peek_elem: +++ insn->imm = BPF_CAST_CALL(ops->map_peek_elem) - +++ __bpf_call_base; +++ continue; +++ } +++ +++ goto patch_call_imm; +++ } +++ +++patch_call_imm: +++ fn = env->ops->get_func_proto(insn->imm, env->prog); +++ /* all functions that have prototype and verifier allowed +++ * programs to call them, must be real in-kernel functions +++ */ +++ if (!fn->func) { +++ verbose(env, +++ "kernel subsystem misconfigured func %s#%d\n", +++ func_id_name(insn->imm), insn->imm); +++ return -EFAULT; +++ } +++ insn->imm = fn->func - __bpf_call_base; +++ } +++ +++ return 0; +++} +++ +++static void free_states(struct bpf_verifier_env *env) +++{ +++ struct bpf_verifier_state_list *sl, *sln; ++ int i; ++ +++ sl = env->free_list; +++ while (sl) { +++ sln = sl->next; +++ free_verifier_state(&sl->state, false); +++ kfree(sl); +++ sl = sln; +++ } +++ ++ if (!env->explored_states) ++ return; ++ ++- for (i = 0; i < env->prog->len; i++) { +++ for (i = 0; i < state_htab_size(env); i++) { ++ sl = env->explored_states[i]; ++ ++- if (sl) ++- while (sl != STATE_LIST_MARK) { ++- sln = sl->next; ++- kfree(sl); ++- sl = sln; ++- } +++ while (sl) { +++ sln = sl->next; +++ free_verifier_state(&sl->state, false); +++ kfree(sl); +++ sl = sln; +++ } ++ } ++ ++- kfree(env->explored_states); +++ kvfree(env->explored_states); ++ } ++ ++-int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) +++static void print_verification_stats(struct bpf_verifier_env *env) ++ { ++- char __user *log_ubuf = NULL; ++- struct verifier_env *env; ++- int ret = -EINVAL; +++ int i; ++ ++- if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) ++- return -E2BIG; +++ if (env->log.level & BPF_LOG_STATS) { +++ verbose(env, "verification time %lld usec\n", +++ div_u64(env->verification_time, 1000)); +++ verbose(env, "stack depth "); +++ for (i = 0; i < env->subprog_cnt; i++) { +++ u32 depth = env->subprog_info[i].stack_depth; +++ +++ verbose(env, "%d", depth); +++ if (i + 1 < env->subprog_cnt) +++ verbose(env, "+"); +++ } +++ verbose(env, "\n"); +++ } +++ verbose(env, "processed %d insns (limit %d) max_states_per_insn %d " +++ "total_states %d peak_states %d mark_read %d\n", +++ env->insn_processed, BPF_COMPLEXITY_LIMIT_INSNS, +++ env->max_states_per_insn, env->total_states, +++ env->peak_states, env->longest_mark_read_walk); +++} +++ +++int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, +++ union bpf_attr __user *uattr) +++{ +++ u64 start_time = ktime_get_ns(); +++ struct bpf_verifier_env *env; +++ struct bpf_verifier_log *log; +++ int i, len, ret = -EINVAL; +++ bool is_priv; ++ ++- /* 'struct verifier_env' can be global, but since it's not small, +++ /* no program is valid */ +++ if (ARRAY_SIZE(bpf_verifier_ops) == 0) +++ return -EINVAL; +++ +++ /* 'struct bpf_verifier_env' can be global, but since it's not small, ++ * allocate/free it every time bpf_check() is called ++ */ ++- env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); +++ env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); ++ if (!env) ++ return -ENOMEM; +++ log = &env->log; ++ +++ len = (*prog)->len; +++ env->insn_aux_data = +++ vzalloc(array_size(sizeof(struct bpf_insn_aux_data), len)); +++ ret = -ENOMEM; +++ if (!env->insn_aux_data) +++ goto err_free_env; +++ for (i = 0; i 
< len; i++) +++ env->insn_aux_data[i].orig_idx = i; ++ env->prog = *prog; +++ env->ops = bpf_verifier_ops[env->prog->type]; +++ is_priv = capable(CAP_SYS_ADMIN); ++ ++ /* grab the mutex to protect few globals used by verifier */ ++- mutex_lock(&bpf_verifier_lock); +++ if (!is_priv) +++ mutex_lock(&bpf_verifier_lock); ++ ++ if (attr->log_level || attr->log_buf || attr->log_size) { ++ /* user requested verbose verifier output ++ * and supplied buffer to store the verification trace ++ */ ++- log_level = attr->log_level; ++- log_ubuf = (char __user *) (unsigned long) attr->log_buf; ++- log_size = attr->log_size; ++- log_len = 0; +++ log->level = attr->log_level; +++ log->ubuf = (char __user *) (unsigned long) attr->log_buf; +++ log->len_total = attr->log_size; ++ ++ ret = -EINVAL; ++- /* log_* values have to be sane */ ++- if (log_size < 128 || log_size > UINT_MAX >> 8 || ++- log_level == 0 || log_ubuf == NULL) ++- goto free_env; ++- ++- ret = -ENOMEM; ++- log_buf = vmalloc(log_size); ++- if (!log_buf) ++- goto free_env; ++- } else { ++- log_level = 0; ++- } +++ /* log attributes have to be sane */ +++ if (log->len_total < 128 || log->len_total > UINT_MAX >> 2 || +++ !log->level || !log->ubuf || log->level & ~BPF_LOG_MASK) +++ goto err_unlock; +++ } +++ +++ env->strict_alignment = !!(attr->prog_flags & BPF_F_STRICT_ALIGNMENT); +++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) +++ env->strict_alignment = true; +++ if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) +++ env->strict_alignment = false; +++ +++ env->allow_ptr_leaks = is_priv; +++ +++ if (is_priv) +++ env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; ++ ++ ret = replace_map_fd_with_map_ptr(env); ++ if (ret < 0) ++ goto skip_full_check; ++ ++- env->explored_states = kcalloc(env->prog->len, ++- sizeof(struct verifier_state_list *), +++ env->explored_states = kcalloc(state_htab_size(env), +++ sizeof(struct bpf_verifier_state_list *), ++ GFP_USER); ++ ret = -ENOMEM; ++ if (!env->explored_states) ++ goto skip_full_check; ++ ++- ret = check_cfg(env); +++ ret = check_subprogs(env); +++ if (ret < 0) +++ goto skip_full_check; +++ +++ ret = check_btf_info(env, attr, uattr); ++ if (ret < 0) ++ goto skip_full_check; ++ ++- env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); +++ ret = check_cfg(env); +++ if (ret < 0) +++ goto skip_full_check; ++ ++ ret = do_check(env); +++ if (env->cur_state) { +++ free_verifier_state(env->cur_state, true); +++ env->cur_state = NULL; +++ } ++ ++ skip_full_check: ++- while (pop_stack(env, NULL) >= 0); +++ while (!pop_stack(env, NULL, NULL)); ++ free_states(env); ++ ++ if (ret == 0) +++ ret = check_max_stack_depth(env); +++ +++ /* instruction rewrites happen after this point */ +++ if (is_priv) { +++ if (ret == 0) +++ opt_hard_wire_dead_code_branches(env); +++ if (ret == 0) +++ ret = opt_remove_dead_code(env); +++ if (ret == 0) +++ ret = opt_remove_nops(env); +++ } else { +++ if (ret == 0) +++ sanitize_dead_code(env); +++ } +++ +++ if (ret == 0) ++ /* program is valid, convert *(u32*)(ctx + off) accesses */ ++ ret = convert_ctx_accesses(env); ++ ++- if (log_level && log_len >= log_size - 1) { ++- BUG_ON(log_len >= log_size); ++- /* verifier log exceeded user supplied buffer */ ++- ret = -ENOSPC; ++- /* fall through to return what was recorded */ +++ if (ret == 0) +++ ret = fixup_bpf_calls(env); +++ +++ /* do 32-bit optimization after insn patching has done so those patched +++ * insns could be handled correctly. 
+++ */ +++ if (ret == 0 && !bpf_prog_is_dev_bound(env->prog->aux)) { +++ ret = opt_subreg_zext_lo32_rnd_hi32(env, attr); +++ env->prog->aux->verifier_zext = bpf_jit_needs_zext() ? !ret +++ : false; ++ } ++ ++- /* copy verifier log back to user space including trailing zero */ ++- if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { +++ if (ret == 0) +++ ret = fixup_call_args(env); +++ +++ env->verification_time = ktime_get_ns() - start_time; +++ print_verification_stats(env); +++ +++ if (log->level && bpf_verifier_log_full(log)) +++ ret = -ENOSPC; +++ if (log->level && !log->ubuf) { ++ ret = -EFAULT; ++- goto free_log_buf; +++ goto err_release_maps; ++ } ++ ++ if (ret == 0 && env->used_map_cnt) { ++@@ -2297,7 +9580,7 @@ skip_full_check: ++ ++ if (!env->prog->aux->used_maps) { ++ ret = -ENOMEM; ++- goto free_log_buf; +++ goto err_release_maps; ++ } ++ ++ memcpy(env->prog->aux->used_maps, env->used_maps, ++@@ -2310,17 +9593,21 @@ skip_full_check: ++ convert_pseudo_ld_imm64(env); ++ } ++ ++-free_log_buf: ++- if (log_level) ++- vfree(log_buf); ++-free_env: +++ if (ret == 0) +++ adjust_btf_func(env); +++ +++err_release_maps: ++ if (!env->prog->aux->used_maps) ++ /* if we didn't copy map pointers into bpf_prog_info, release ++- * them now. Otherwise free_bpf_prog_info() will release them. +++ * them now. Otherwise free_used_maps() will release them. ++ */ ++ release_maps(env); ++ *prog = env->prog; +++err_unlock: +++ if (!is_priv) +++ mutex_unlock(&bpf_verifier_lock); +++ vfree(env->insn_aux_data); +++err_free_env: ++ kfree(env); ++- mutex_unlock(&bpf_verifier_lock); ++ return ret; ++ } ++--- a/include/linux/filter.h +++++ b/include/linux/filter.h ++@@ -1,3 +1,4 @@ +++/* SPDX-License-Identifier: GPL-2.0 */ ++ /* ++ * Linux Socket Filter Data Structures ++ */ ++@@ -7,16 +8,22 @@ ++ #include ++ ++ #include +++#include ++ #include ++ #include ++ #include ++ #include ++ #include ++ #include ++-#include +++#include +++#include +++#include +++#include +++#include ++ ++-#include +++#include ++ +++#include ++ #include ++ #include ++ ++@@ -24,6 +31,11 @@ struct sk_buff; ++ struct sock; ++ struct seccomp_data; ++ struct bpf_prog_aux; +++struct xdp_rxq_info; +++struct xdp_buff; +++struct sock_reuseport; +++struct ctl_table; +++struct ctl_table_header; ++ ++ /* ArgX, context and stack frame pointer register positions. Note, ++ * Arg1, Arg2, Arg3, etc are used as argument mappings of function ++@@ -40,7 +52,26 @@ struct bpf_prog_aux; ++ /* Additional register mappings for converted user programs. */ ++ #define BPF_REG_A BPF_REG_0 ++ #define BPF_REG_X BPF_REG_7 ++-#define BPF_REG_TMP BPF_REG_8 +++#define BPF_REG_TMP BPF_REG_2 /* scratch reg */ +++#define BPF_REG_D BPF_REG_8 /* data, callee-saved */ +++#define BPF_REG_H BPF_REG_9 /* hlen, callee-saved */ +++ +++/* Kernel hidden auxiliary/helper register. */ +++#define BPF_REG_AX MAX_BPF_REG +++#define MAX_BPF_EXT_REG (MAX_BPF_REG + 1) +++#define MAX_BPF_JIT_REG MAX_BPF_EXT_REG +++ +++/* unused opcode to mark special call to bpf_tail_call() helper */ +++#define BPF_TAIL_CALL 0xf0 +++ +++/* unused opcode to mark call to interpreter with arguments */ +++#define BPF_CALL_ARGS 0xe0 +++ +++/* As per nm, we expose JITed images as text (code) section for +++ * kallsyms. That way, tools like perf can find it to match +++ * addresses. +++ */ +++#define BPF_SYM_ELF_TYPE 't' ++ ++ /* BPF program can access up to 512 bytes of stack space. 
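/*
 * Editorial sketch, not part of the backported diff: the userspace side of
 * the verifier-log attributes that bpf_check() validates above (a supplied
 * buffer must come with a non-zero log_level and a log_size of at least
 * 128).  Minimal loader fragment; the instructions themselves and error
 * handling are omitted, and load_with_verifier_log() is a hypothetical name.
 */
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

static int load_with_verifier_log(const struct bpf_insn *insns, unsigned int cnt,
                                  char *log, unsigned int log_size)
{
        union bpf_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER;
        attr.insns     = (unsigned long)insns;
        attr.insn_cnt  = cnt;
        attr.license   = (unsigned long)"GPL";
        attr.log_level = 1;                     /* basic verifier trace */
        attr.log_size  = log_size;              /* >= 128, see the check above */
        attr.log_buf   = (unsigned long)log;

        return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}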
*/ ++ #define MAX_BPF_STACK 512 ++@@ -129,6 +160,20 @@ struct bpf_prog_aux; ++ .off = 0, \ ++ .imm = IMM }) ++ +++/* Special form of mov32, used for doing explicit zero extension on dst. */ +++#define BPF_ZEXT_REG(DST) \ +++ ((struct bpf_insn) { \ +++ .code = BPF_ALU | BPF_MOV | BPF_X, \ +++ .dst_reg = DST, \ +++ .src_reg = DST, \ +++ .off = 0, \ +++ .imm = 1 }) +++ +++static inline bool insn_is_zext(const struct bpf_insn *insn) +++{ +++ return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1; +++} +++ ++ /* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */ ++ #define BPF_LD_IMM64(DST, IMM) \ ++ BPF_LD_IMM64_RAW(DST, 0, IMM) ++@@ -249,8 +294,51 @@ struct bpf_prog_aux; ++ .off = OFF, \ ++ .imm = IMM }) ++ +++/* Like BPF_JMP_REG, but with 32-bit wide operands for comparison. */ +++ +++#define BPF_JMP32_REG(OP, DST, SRC, OFF) \ +++ ((struct bpf_insn) { \ +++ .code = BPF_JMP32 | BPF_OP(OP) | BPF_X, \ +++ .dst_reg = DST, \ +++ .src_reg = SRC, \ +++ .off = OFF, \ +++ .imm = 0 }) +++ +++/* Like BPF_JMP_IMM, but with 32-bit wide operands for comparison. */ +++ +++#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ +++ ((struct bpf_insn) { \ +++ .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ +++ .dst_reg = DST, \ +++ .src_reg = 0, \ +++ .off = OFF, \ +++ .imm = IMM }) +++ +++/* Unconditional jumps, goto pc + off16 */ +++ +++#define BPF_JMP_A(OFF) \ +++ ((struct bpf_insn) { \ +++ .code = BPF_JMP | BPF_JA, \ +++ .dst_reg = 0, \ +++ .src_reg = 0, \ +++ .off = OFF, \ +++ .imm = 0 }) +++ +++/* Relative call */ +++ +++#define BPF_CALL_REL(TGT) \ +++ ((struct bpf_insn) { \ +++ .code = BPF_JMP | BPF_CALL, \ +++ .dst_reg = 0, \ +++ .src_reg = BPF_PSEUDO_CALL, \ +++ .off = 0, \ +++ .imm = TGT }) +++ ++ /* Function call */ ++ +++#define BPF_CAST_CALL(x) \ +++ ((u64 (*)(u64, u64, u64, u64, u64))(x)) +++ ++ #define BPF_EMIT_CALL(FUNC) \ ++ ((struct bpf_insn) { \ ++ .code = BPF_JMP | BPF_CALL, \ ++@@ -303,6 +391,112 @@ struct bpf_prog_aux; ++ bpf_size; \ ++ }) ++ +++#define bpf_size_to_bytes(bpf_size) \ +++({ \ +++ int bytes = -EINVAL; \ +++ \ +++ if (bpf_size == BPF_B) \ +++ bytes = sizeof(u8); \ +++ else if (bpf_size == BPF_H) \ +++ bytes = sizeof(u16); \ +++ else if (bpf_size == BPF_W) \ +++ bytes = sizeof(u32); \ +++ else if (bpf_size == BPF_DW) \ +++ bytes = sizeof(u64); \ +++ \ +++ bytes; \ +++}) +++ +++#define BPF_SIZEOF(type) \ +++ ({ \ +++ const int __size = bytes_to_bpf_size(sizeof(type)); \ +++ BUILD_BUG_ON(__size < 0); \ +++ __size; \ +++ }) +++ +++#define BPF_FIELD_SIZEOF(type, field) \ +++ ({ \ +++ const int __size = bytes_to_bpf_size(FIELD_SIZEOF(type, field)); \ +++ BUILD_BUG_ON(__size < 0); \ +++ __size; \ +++ }) +++ +++#define BPF_LDST_BYTES(insn) \ +++ ({ \ +++ const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \ +++ WARN_ON(__size < 0); \ +++ __size; \ +++ }) +++ +++#define __BPF_MAP_0(m, v, ...) v +++#define __BPF_MAP_1(m, v, t, a, ...) m(t, a) +++#define __BPF_MAP_2(m, v, t, a, ...) m(t, a), __BPF_MAP_1(m, v, __VA_ARGS__) +++#define __BPF_MAP_3(m, v, t, a, ...) m(t, a), __BPF_MAP_2(m, v, __VA_ARGS__) +++#define __BPF_MAP_4(m, v, t, a, ...) m(t, a), __BPF_MAP_3(m, v, __VA_ARGS__) +++#define __BPF_MAP_5(m, v, t, a, ...) m(t, a), __BPF_MAP_4(m, v, __VA_ARGS__) +++ +++#define __BPF_REG_0(...) __BPF_PAD(5) +++#define __BPF_REG_1(...) __BPF_MAP(1, __VA_ARGS__), __BPF_PAD(4) +++#define __BPF_REG_2(...) __BPF_MAP(2, __VA_ARGS__), __BPF_PAD(3) +++#define __BPF_REG_3(...) __BPF_MAP(3, __VA_ARGS__), __BPF_PAD(2) +++#define __BPF_REG_4(...) 
__BPF_MAP(4, __VA_ARGS__), __BPF_PAD(1) +++#define __BPF_REG_5(...) __BPF_MAP(5, __VA_ARGS__) +++ +++#define __BPF_MAP(n, ...) __BPF_MAP_##n(__VA_ARGS__) +++#define __BPF_REG(n, ...) __BPF_REG_##n(__VA_ARGS__) +++ +++#define __BPF_CAST(t, a) \ +++ (__force t) \ +++ (__force \ +++ typeof(__builtin_choose_expr(sizeof(t) == sizeof(unsigned long), \ +++ (unsigned long)0, (t)0))) a +++#define __BPF_V void +++#define __BPF_N +++ +++#define __BPF_DECL_ARGS(t, a) t a +++#define __BPF_DECL_REGS(t, a) u64 a +++ +++#define __BPF_PAD(n) \ +++ __BPF_MAP(n, __BPF_DECL_ARGS, __BPF_N, u64, __ur_1, u64, __ur_2, \ +++ u64, __ur_3, u64, __ur_4, u64, __ur_5) +++ +++#define BPF_CALL_x(x, name, ...) \ +++ static __always_inline \ +++ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)); \ +++ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)); \ +++ u64 name(__BPF_REG(x, __BPF_DECL_REGS, __BPF_N, __VA_ARGS__)) \ +++ { \ +++ return ____##name(__BPF_MAP(x,__BPF_CAST,__BPF_N,__VA_ARGS__));\ +++ } \ +++ static __always_inline \ +++ u64 ____##name(__BPF_MAP(x, __BPF_DECL_ARGS, __BPF_V, __VA_ARGS__)) +++ +++#define BPF_CALL_0(name, ...) BPF_CALL_x(0, name, __VA_ARGS__) +++#define BPF_CALL_1(name, ...) BPF_CALL_x(1, name, __VA_ARGS__) +++#define BPF_CALL_2(name, ...) BPF_CALL_x(2, name, __VA_ARGS__) +++#define BPF_CALL_3(name, ...) BPF_CALL_x(3, name, __VA_ARGS__) +++#define BPF_CALL_4(name, ...) BPF_CALL_x(4, name, __VA_ARGS__) +++#define BPF_CALL_5(name, ...) BPF_CALL_x(5, name, __VA_ARGS__) +++ +++#define bpf_ctx_range(TYPE, MEMBER) \ +++ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 +++#define bpf_ctx_range_till(TYPE, MEMBER1, MEMBER2) \ +++ offsetof(TYPE, MEMBER1) ... offsetofend(TYPE, MEMBER2) - 1 +++#if BITS_PER_LONG == 64 +++# define bpf_ctx_range_ptr(TYPE, MEMBER) \ +++ offsetof(TYPE, MEMBER) ... offsetofend(TYPE, MEMBER) - 1 +++#else +++# define bpf_ctx_range_ptr(TYPE, MEMBER) \ +++ offsetof(TYPE, MEMBER) ... offsetof(TYPE, MEMBER) + 8 - 1 +++#endif /* BITS_PER_LONG == 64 */ +++ +++#define bpf_target_off(TYPE, MEMBER, SIZE, PTR_SIZE) \ +++ ({ \ +++ BUILD_BUG_ON(FIELD_SIZEOF(TYPE, MEMBER) != (SIZE)); \ +++ *(PTR_SIZE) = (SIZE); \ +++ offsetof(TYPE, MEMBER); \ +++ }) +++ ++ #ifdef CONFIG_COMPAT ++ /* A struct sock_filter is architecture independent. */ ++ struct compat_sock_fprog { ++@@ -317,24 +511,33 @@ struct sock_fprog_kern { ++ }; ++ ++ struct bpf_binary_header { ++- unsigned int pages; ++- u8 image[]; +++ u32 pages; +++ /* Some arches need word alignment for their instructions */ +++ u8 image[] __aligned(4); ++ }; ++ ++ struct bpf_prog { ++ u16 pages; /* Number of allocated pages */ ++- kmemcheck_bitfield_begin(meta); ++ u16 jited:1, /* Is our filter JIT'ed? */ +++ jit_requested:1,/* archs need to JIT the prog */ +++ undo_set_mem:1, /* Passed set_memory_ro() checkpoint */ ++ gpl_compatible:1, /* Is filter GPL compatible? */ ++ cb_access:1, /* Is control block accessed? */ ++- dst_needed:1; /* Do we need dst entry? */ ++- kmemcheck_bitfield_end(meta); ++- u32 len; /* Number of filter blocks */ +++ dst_needed:1, /* Do we need dst entry? */ +++ blinded:1, /* Was blinded */ +++ is_func:1, /* program is a bpf function */ +++ kprobe_override:1, /* Do we override a kprobe? */ +++ has_callchain_buf:1, /* callchain buffer allocated? 
*/ +++ enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */ ++ enum bpf_prog_type type; /* Type of BPF program */ +++ enum bpf_attach_type expected_attach_type; /* For some prog types */ +++ u32 len; /* Number of filter blocks */ +++ u32 jited_len; /* Size of jited insns in bytes */ +++ u8 tag[BPF_TAG_SIZE]; ++ struct bpf_prog_aux *aux; /* Auxiliary fields */ ++ struct sock_fprog_kern *orig_prog; /* Original BPF program */ ++- unsigned int (*bpf_func)(const struct sk_buff *skb, ++- const struct bpf_insn *filter); +++ unsigned int (*bpf_func)(const void *ctx, +++ const struct bpf_insn *insn); ++ /* Instructions for interpreter */ ++ union { ++ struct sock_filter insns[0]; ++@@ -343,44 +546,160 @@ struct bpf_prog { ++ }; ++ ++ struct sk_filter { ++- atomic_t refcnt; +++ refcount_t refcnt; ++ struct rcu_head rcu; ++ struct bpf_prog *prog; ++ }; ++ ++-#define BPF_PROG_RUN(filter, ctx) (*filter->bpf_func)(ctx, filter->insnsi) +++#define BPF_PROG_RUN(prog, ctx) ({ \ +++ u32 ret; \ +++ ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi); \ +++ ret; }) +++ +++#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN +++ +++struct bpf_skb_data_end { +++ struct qdisc_skb_cb qdisc_cb; +++ void *data_meta; +++ void *data_end; +++}; ++ ++-static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, ++- struct sk_buff *skb) +++struct bpf_redirect_info { +++ u32 flags; +++ u32 tgt_index; +++ void *tgt_value; +++ struct bpf_map *map; +++ struct bpf_map *map_to_flush; +++ u32 kern_flags; +++}; +++ +++DECLARE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +++ +++/* flags for bpf_redirect_info kern_flags */ +++#define BPF_RI_F_RF_NO_DIRECT BIT(0) /* no napi_direct on return_frame */ +++ +++/* Compute the linear packet data range [data, data_end) which +++ * will be accessed by various program types (cls_bpf, act_bpf, +++ * lwt, ...). Subsystems allowing direct data access must (!) +++ * ensure that cb[] area can be written to when BPF program is +++ * invoked (otherwise cb[] save/restore is necessary). +++ */ +++static inline void bpf_compute_data_pointers(struct sk_buff *skb) ++ { ++- u8 *cb_data = qdisc_skb_cb(skb)->data; ++- u8 saved_cb[QDISC_CB_PRIV_LEN]; ++- u32 res; +++ struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; +++ +++ BUILD_BUG_ON(sizeof(*cb) > FIELD_SIZEOF(struct sk_buff, cb)); +++ cb->data_meta = skb->data; +++ cb->data_end = skb->data + skb_headlen(skb); +++} ++ +++/* Similar to bpf_compute_data_pointers(), except that save orginal +++ * data in cb->data and cb->meta_data for restore. +++ */ +++static inline void bpf_compute_and_save_data_end( +++ struct sk_buff *skb, void **saved_data_end) +++{ +++ struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; +++ +++ *saved_data_end = cb->data_end; +++ cb->data_end = skb->data + skb_headlen(skb); +++} +++ +++/* Restore data saved by bpf_compute_data_pointers(). */ +++static inline void bpf_restore_data_end( +++ struct sk_buff *skb, void *saved_data_end) +++{ +++ struct bpf_skb_data_end *cb = (struct bpf_skb_data_end *)skb->cb; +++ +++ cb->data_end = saved_data_end; +++} +++ +++static inline u8 *bpf_skb_cb(struct sk_buff *skb) +++{ +++ /* eBPF programs may read/write skb->cb[] area to transfer meta +++ * data between tail calls. Since this also needs to work with +++ * tc, that scratch memory is mapped to qdisc_skb_cb's data area. +++ * +++ * In some socket filter cases, the cb unfortunately needs to be +++ * saved/restored so that protocol specific skb->cb[] data won't +++ * be lost. 
In any case, due to unpriviledged eBPF programs +++ * attached to sockets, we need to clear the bpf_skb_cb() area +++ * to not leak previous contents to user space. +++ */ +++ BUILD_BUG_ON(FIELD_SIZEOF(struct __sk_buff, cb) != BPF_SKB_CB_LEN); ++ BUILD_BUG_ON(FIELD_SIZEOF(struct __sk_buff, cb) != ++- QDISC_CB_PRIV_LEN); +++ FIELD_SIZEOF(struct qdisc_skb_cb, data)); +++ +++ return qdisc_skb_cb(skb)->data; +++} +++ +++static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog, +++ struct sk_buff *skb) +++{ +++ u8 *cb_data = bpf_skb_cb(skb); +++ u8 cb_saved[BPF_SKB_CB_LEN]; +++ u32 res; ++ ++ if (unlikely(prog->cb_access)) { ++- memcpy(saved_cb, cb_data, sizeof(saved_cb)); ++- memset(cb_data, 0, sizeof(saved_cb)); +++ memcpy(cb_saved, cb_data, sizeof(cb_saved)); +++ memset(cb_data, 0, sizeof(cb_saved)); ++ } ++ ++ res = BPF_PROG_RUN(prog, skb); ++ ++ if (unlikely(prog->cb_access)) ++- memcpy(cb_data, saved_cb, sizeof(saved_cb)); +++ memcpy(cb_data, cb_saved, sizeof(cb_saved)); +++ +++ return res; +++} +++ +++static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog, +++ struct sk_buff *skb) +++{ +++ u32 res; ++ +++ preempt_disable(); +++ res = __bpf_prog_run_save_cb(prog, skb); +++ preempt_enable(); ++ return res; ++ } ++ ++ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog, ++ struct sk_buff *skb) ++ { ++- u8 *cb_data = qdisc_skb_cb(skb)->data; +++ u8 *cb_data = bpf_skb_cb(skb); +++ u32 res; ++ ++ if (unlikely(prog->cb_access)) ++- memset(cb_data, 0, QDISC_CB_PRIV_LEN); ++- return BPF_PROG_RUN(prog, skb); +++ memset(cb_data, 0, BPF_SKB_CB_LEN); +++ +++ preempt_disable(); +++ res = BPF_PROG_RUN(prog, skb); +++ preempt_enable(); +++ return res; +++} +++ +++static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, +++ struct xdp_buff *xdp) +++{ +++ /* Caller needs to hold rcu_read_lock() (!), otherwise program +++ * can be released while still running, or map elements could be +++ * freed early while still having concurrent users. XDP fastpath +++ * already takes rcu_read_lock() when fetching the program, so +++ * it's not necessary here anymore. 
+++ */ +++ return BPF_PROG_RUN(prog, xdp); +++} +++ +++static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog) +++{ +++ return prog->len * sizeof(struct bpf_insn); +++} +++ +++static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog) +++{ +++ return round_up(bpf_prog_insn_size(prog) + +++ sizeof(__be64) + 1, SHA_MESSAGE_BYTES); ++ } ++ ++ static inline unsigned int bpf_prog_size(unsigned int proglen) ++@@ -399,27 +718,77 @@ static inline bool bpf_prog_was_classic( ++ return prog->type == BPF_PROG_TYPE_UNSPEC; ++ } ++ +++static inline u32 bpf_ctx_off_adjust_machine(u32 size) +++{ +++ const u32 size_machine = sizeof(unsigned long); +++ +++ if (size > size_machine && size % size_machine == 0) +++ size = size_machine; +++ +++ return size; +++} +++ +++static inline bool +++bpf_ctx_narrow_access_ok(u32 off, u32 size, u32 size_default) +++{ +++ return size <= size_default && (size & (size - 1)) == 0; +++} +++ +++static inline u8 +++bpf_ctx_narrow_access_offset(u32 off, u32 size, u32 size_default) +++{ +++ u8 access_off = off & (size_default - 1); +++ +++#ifdef __LITTLE_ENDIAN +++ return access_off; +++#else +++ return size_default - (access_off + size); +++#endif +++} +++ +++#define bpf_ctx_wide_access_ok(off, size, type, field) \ +++ (size == sizeof(__u64) && \ +++ off >= offsetof(type, field) && \ +++ off + sizeof(__u64) <= offsetofend(type, field) && \ +++ off % sizeof(__u64) == 0) +++ ++ #define bpf_classic_proglen(fprog) (fprog->len * sizeof(fprog->filter[0])) ++ ++-#ifdef CONFIG_DEBUG_SET_MODULE_RONX ++ static inline void bpf_prog_lock_ro(struct bpf_prog *fp) ++ { ++- set_memory_ro((unsigned long)fp, fp->pages); +++#ifndef CONFIG_BPF_JIT_ALWAYS_ON +++ if (!fp->jited) { +++ fp->undo_set_mem = 1; +++ set_memory_ro((unsigned long)fp, fp->pages); +++ } +++#endif ++ } ++ ++ static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) ++ { ++- set_memory_rw((unsigned long)fp, fp->pages); +++ if (fp->undo_set_mem) +++ set_memory_rw((unsigned long)fp, fp->pages); ++ } ++-#else ++-static inline void bpf_prog_lock_ro(struct bpf_prog *fp) +++ +++static inline void bpf_jit_binary_lock_ro(struct bpf_binary_header *hdr) ++ { +++ set_memory_ro((unsigned long)hdr, hdr->pages); +++ set_memory_x((unsigned long)hdr, hdr->pages); ++ } ++ ++-static inline void bpf_prog_unlock_ro(struct bpf_prog *fp) +++static inline void bpf_jit_binary_unlock_ro(struct bpf_binary_header *hdr) +++{ +++ set_memory_rw((unsigned long)hdr, hdr->pages); +++} +++ +++static inline struct bpf_binary_header * +++bpf_jit_binary_hdr(const struct bpf_prog *fp) ++ { +++ unsigned long real_start = (unsigned long)fp->bpf_func; +++ unsigned long addr = real_start & PAGE_MASK; +++ +++ return (void *)addr; ++ } ++-#endif /* CONFIG_DEBUG_SET_MODULE_RONX */ ++ ++ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap); ++ static inline int sk_filter(struct sock *sk, struct sk_buff *skb) ++@@ -427,10 +796,20 @@ static inline int sk_filter(struct sock ++ return sk_filter_trim_cap(sk, skb, 1); ++ } ++ ++-int bpf_prog_select_runtime(struct bpf_prog *fp); +++struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); ++ void bpf_prog_free(struct bpf_prog *fp); ++ +++bool bpf_opcode_in_insntable(u8 code); +++ +++void bpf_prog_free_linfo(struct bpf_prog *prog); +++void bpf_prog_fill_jited_linfo(struct bpf_prog *prog, +++ const u32 *insn_to_jit_off); +++int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog); +++void bpf_prog_free_jited_linfo(struct bpf_prog *prog); +++void 
bpf_prog_free_unused_jited_linfo(struct bpf_prog *prog); +++ ++ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); +++struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags); ++ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, ++ gfp_t gfp_extra_flags); ++ void __bpf_prog_free(struct bpf_prog *fp); ++@@ -450,12 +829,11 @@ int bpf_prog_create_from_user(struct bpf ++ void bpf_prog_destroy(struct bpf_prog *fp); ++ ++ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); ++-int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk, ++- bool locked); ++ int sk_attach_bpf(u32 ufd, struct sock *sk); +++int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk); +++int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk); +++void sk_reuseport_prog_free(struct bpf_prog *prog); ++ int sk_detach_filter(struct sock *sk); ++-int __sk_detach_filter(struct sock *sk, bool locked); ++- ++ int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, ++ unsigned int len); ++ ++@@ -463,10 +841,100 @@ bool sk_filter_charge(struct sock *sk, s ++ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); ++ ++ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); ++-void bpf_int_jit_compile(struct bpf_prog *fp); ++-bool bpf_helper_changes_skb_data(void *func); +++#define __bpf_call_base_args \ +++ ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ +++ (void *)__bpf_call_base) +++ +++struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); +++void bpf_jit_compile(struct bpf_prog *prog); +++bool bpf_jit_needs_zext(void); +++bool bpf_helper_changes_pkt_data(void *func); +++ +++static inline bool bpf_dump_raw_ok(const struct cred *cred) +++{ +++ /* Reconstruction of call-sites is dependent on kallsyms, +++ * thus make dump the same restriction. +++ */ +++ return true; +++} +++ +++struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, +++ const struct bpf_insn *patch, u32 len); +++int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt); +++ +++void bpf_clear_redirect_map(struct bpf_map *map); +++ +++static inline bool xdp_return_frame_no_direct(void) +++{ +++ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +++ +++ return ri->kern_flags & BPF_RI_F_RF_NO_DIRECT; +++} +++ +++static inline void xdp_set_return_frame_no_direct(void) +++{ +++ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +++ +++ ri->kern_flags |= BPF_RI_F_RF_NO_DIRECT; +++} +++ +++static inline void xdp_clear_return_frame_no_direct(void) +++{ +++ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +++ +++ ri->kern_flags &= ~BPF_RI_F_RF_NO_DIRECT; +++} +++ +++static inline int xdp_ok_fwd_dev(const struct net_device *fwd, +++ unsigned int pktlen) +++{ +++ unsigned int len; +++ +++ if (unlikely(!(fwd->flags & IFF_UP))) +++ return -ENETDOWN; +++ +++ len = fwd->mtu + fwd->hard_header_len + VLAN_HLEN; +++ if (pktlen > len) +++ return -EMSGSIZE; +++ +++ return 0; +++} +++ +++/* The pair of xdp_do_redirect and xdp_do_flush_map MUST be called in the +++ * same cpu context. Further for best results no more than a single map +++ * for the do_redirect/do_flush pair should be used. This limitation is +++ * because we only track one map and force a flush when the map changes. +++ * This does not appear to be a real limitation for existing software. 
+++ */ +++int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, +++ struct xdp_buff *xdp, struct bpf_prog *prog); +++int xdp_do_redirect(struct net_device *dev, +++ struct xdp_buff *xdp, +++ struct bpf_prog *prog); +++void xdp_do_flush_map(void); +++ +++void bpf_warn_invalid_xdp_action(u32 act); +++ +++#ifdef CONFIG_INET +++struct sock *bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, +++ struct bpf_prog *prog, struct sk_buff *skb, +++ u32 hash); +++#else +++static inline struct sock * +++bpf_run_sk_reuseport(struct sock_reuseport *reuse, struct sock *sk, +++ struct bpf_prog *prog, struct sk_buff *skb, +++ u32 hash) +++{ +++ return NULL; +++} +++#endif ++ ++ #ifdef CONFIG_BPF_JIT +++extern int bpf_jit_enable; +++extern int bpf_jit_harden; +++extern int bpf_jit_kallsyms; +++extern long bpf_jit_limit; +++ ++ typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size); ++ ++ struct bpf_binary_header * ++@@ -474,10 +942,18 @@ bpf_jit_binary_alloc(unsigned int progle ++ unsigned int alignment, ++ bpf_jit_fill_hole_t bpf_fill_ill_insns); ++ void bpf_jit_binary_free(struct bpf_binary_header *hdr); ++- ++-void bpf_jit_compile(struct bpf_prog *fp); +++u64 bpf_jit_alloc_exec_limit(void); +++void *bpf_jit_alloc_exec(unsigned long size); +++void bpf_jit_free_exec(void *addr); ++ void bpf_jit_free(struct bpf_prog *fp); ++ +++int bpf_jit_get_func_addr(const struct bpf_prog *prog, +++ const struct bpf_insn *insn, bool extra_pass, +++ u64 *func_addr, bool *func_addr_fixed); +++ +++struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *fp); +++void bpf_jit_prog_release_other(struct bpf_prog *fp, struct bpf_prog *fp_other); +++ ++ static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, ++ u32 pass, void *image) ++ { ++@@ -488,17 +964,144 @@ static inline void bpf_jit_dump(unsigned ++ print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_OFFSET, ++ 16, 1, image, proglen, false); ++ } ++-#else ++-static inline void bpf_jit_compile(struct bpf_prog *fp) +++ +++static inline bool bpf_jit_is_ebpf(void) +++{ +++# ifdef CONFIG_HAVE_EBPF_JIT +++ return true; +++# else +++ return false; +++# endif +++} +++ +++static inline bool ebpf_jit_enabled(void) +++{ +++ return bpf_jit_enable && bpf_jit_is_ebpf(); +++} +++ +++static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) +++{ +++ return fp->jited && bpf_jit_is_ebpf(); +++} +++ +++static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog) +++{ +++ /* These are the prerequisites, should someone ever have the +++ * idea to call blinding outside of them, we make sure to +++ * bail out. +++ */ +++ if (!bpf_jit_is_ebpf()) +++ return false; +++ if (!prog->jit_requested) +++ return false; +++ if (!bpf_jit_harden) +++ return false; +++ if (bpf_jit_harden == 1 && capable(CAP_SYS_ADMIN)) +++ return false; +++ +++ return true; +++} +++ +++static inline bool bpf_jit_kallsyms_enabled(void) ++ { +++ /* There are a couple of corner cases where kallsyms should +++ * not be enabled f.e. on hardening. 
+++ */ +++ if (bpf_jit_harden) +++ return false; +++ if (!bpf_jit_kallsyms) +++ return false; +++ if (bpf_jit_kallsyms == 1) +++ return true; +++ +++ return false; +++} +++ +++const char *__bpf_address_lookup(unsigned long addr, unsigned long *size, +++ unsigned long *off, char *sym); +++bool is_bpf_text_address(unsigned long addr); +++int bpf_get_kallsym(unsigned int symnum, unsigned long *value, char *type, +++ char *sym); +++ +++static inline const char * +++bpf_address_lookup(unsigned long addr, unsigned long *size, +++ unsigned long *off, char **modname, char *sym) +++{ +++ const char *ret = __bpf_address_lookup(addr, size, off, sym); +++ +++ if (ret && modname) +++ *modname = NULL; +++ return ret; +++} +++ +++void bpf_prog_kallsyms_add(struct bpf_prog *fp); +++void bpf_prog_kallsyms_del(struct bpf_prog *fp); +++void bpf_get_prog_name(const struct bpf_prog *prog, char *sym); +++ +++#else /* CONFIG_BPF_JIT */ +++ +++static inline bool ebpf_jit_enabled(void) +++{ +++ return false; +++} +++ +++static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp) +++{ +++ return false; ++ } ++ ++ static inline void bpf_jit_free(struct bpf_prog *fp) ++ { ++ bpf_prog_unlock_free(fp); ++ } +++ +++static inline bool bpf_jit_kallsyms_enabled(void) +++{ +++ return false; +++} +++ +++static inline const char * +++__bpf_address_lookup(unsigned long addr, unsigned long *size, +++ unsigned long *off, char *sym) +++{ +++ return NULL; +++} +++ +++static inline bool is_bpf_text_address(unsigned long addr) +++{ +++ return false; +++} +++ +++static inline int bpf_get_kallsym(unsigned int symnum, unsigned long *value, +++ char *type, char *sym) +++{ +++ return -ERANGE; +++} +++ +++static inline const char * +++bpf_address_lookup(unsigned long addr, unsigned long *size, +++ unsigned long *off, char **modname, char *sym) +++{ +++ return NULL; +++} +++ +++static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) +++{ +++} +++ +++static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) +++{ +++} +++ +++static inline void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +++{ +++ sym[0] = '\0'; +++} +++ ++ #endif /* CONFIG_BPF_JIT */ ++ +++void bpf_prog_kallsyms_del_all(struct bpf_prog *fp); +++ ++ #define BPF_ANC BIT(15) ++ ++ static inline bool bpf_needs_clear_a(const struct sock_filter *first) ++@@ -571,4 +1174,59 @@ static inline int bpf_tell_extensions(vo ++ return SKF_AD_MAX; ++ } ++ +++struct bpf_sock_addr_kern { +++ struct sock *sk; +++ struct sockaddr *uaddr; +++ /* Temporary "register" to make indirect stores to nested structures +++ * defined above. We need three registers to make such a store, but +++ * only two (src and dst) are available at convert_ctx_access time +++ */ +++ u64 tmp_reg; +++ void *t_ctx; /* Attach type specific context. */ +++}; +++ +++struct bpf_sock_ops_kern { +++ struct sock *sk; +++ u32 op; +++ union { +++ u32 args[4]; +++ u32 reply; +++ u32 replylong[4]; +++ }; +++ u32 is_fullsock; +++ u64 temp; /* temp and everything after is not +++ * initialized to 0 before calling +++ * the BPF program. New fields that +++ * should be initialized to 0 should +++ * be inserted before temp. +++ * temp is scratch storage used by +++ * sock_ops_convert_ctx_access +++ * as temporary storage of a register. 
+++ */ +++}; +++ +++struct bpf_sysctl_kern { +++ struct ctl_table_header *head; +++ struct ctl_table *table; +++ void *cur_val; +++ size_t cur_len; +++ void *new_val; +++ size_t new_len; +++ int new_updated; +++ int write; +++ loff_t *ppos; +++ /* Temporary "register" for indirect stores to ppos. */ +++ u64 tmp_reg; +++}; +++ +++struct bpf_sockopt_kern { +++ struct sock *sk; +++ u8 *optval; +++ u8 *optval_end; +++ s32 level; +++ s32 optname; +++ s32 optlen; +++ s32 retval; +++}; +++ ++ #endif /* __LINUX_FILTER_H__ */ ++--- /dev/null +++++ b/include/linux/set_memory.h ++@@ -0,0 +1,47 @@ +++/* SPDX-License-Identifier: GPL-2.0-only */ +++/* +++ * Copyright 2017, Michael Ellerman, IBM Corporation. +++ */ +++#ifndef _LINUX_SET_MEMORY_H_ +++#define _LINUX_SET_MEMORY_H_ +++ +++#include +++ +++#ifndef CONFIG_ARCH_HAS_SET_DIRECT_MAP +++static inline int set_direct_map_invalid_noflush(struct page *page) +++{ +++ return 0; +++} +++static inline int set_direct_map_default_noflush(struct page *page) +++{ +++ return 0; +++} +++#endif +++ +++#ifndef set_mce_nospec +++static inline int set_mce_nospec(unsigned long pfn, bool unmap) +++{ +++ return 0; +++} +++#endif +++ +++#ifndef clear_mce_nospec +++static inline int clear_mce_nospec(unsigned long pfn) +++{ +++ return 0; +++} +++#endif +++ +++#ifndef CONFIG_ARCH_HAS_MEM_ENCRYPT +++static inline int set_memory_encrypted(unsigned long addr, int numpages) +++{ +++ return 0; +++} +++ +++static inline int set_memory_decrypted(unsigned long addr, int numpages) +++{ +++ return 0; +++} +++#endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ +++ +++#endif /* _LINUX_SET_MEMORY_H_ */ ++--- /dev/null +++++ b/include/trace/events/xdp.h ++@@ -0,0 +1,407 @@ +++/* SPDX-License-Identifier: GPL-2.0 */ +++#undef TRACE_SYSTEM +++#define TRACE_SYSTEM xdp +++ +++#if !defined(_TRACE_XDP_H) || defined(TRACE_HEADER_MULTI_READ) +++#define _TRACE_XDP_H +++ +++#include +++#include +++#include +++#include +++ +++#define __XDP_ACT_MAP(FN) \ +++ FN(ABORTED) \ +++ FN(DROP) \ +++ FN(PASS) \ +++ FN(TX) \ +++ FN(REDIRECT) +++ +++#define __XDP_ACT_TP_FN(x) \ +++ TRACE_DEFINE_ENUM(XDP_##x); +++#define __XDP_ACT_SYM_FN(x) \ +++ { XDP_##x, #x }, +++#define __XDP_ACT_SYM_TAB \ +++ __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, 0 } +++__XDP_ACT_MAP(__XDP_ACT_TP_FN) +++ +++TRACE_EVENT(xdp_exception, +++ +++ TP_PROTO(const struct net_device *dev, +++ const struct bpf_prog *xdp, u32 act), +++ +++ TP_ARGS(dev, xdp, act), +++ +++ TP_STRUCT__entry( +++ __field(int, prog_id) +++ __field(u32, act) +++ __field(int, ifindex) +++ ), +++ +++ TP_fast_assign( +++ __entry->prog_id = xdp->aux->id; +++ __entry->act = act; +++ __entry->ifindex = dev->ifindex; +++ ), +++ +++ TP_printk("prog_id=%d action=%s ifindex=%d", +++ __entry->prog_id, +++ __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), +++ __entry->ifindex) +++); +++ +++TRACE_EVENT(xdp_bulk_tx, +++ +++ TP_PROTO(const struct net_device *dev, +++ int sent, int drops, int err), +++ +++ TP_ARGS(dev, sent, drops, err), +++ +++ TP_STRUCT__entry( +++ __field(int, ifindex) +++ __field(u32, act) +++ __field(int, drops) +++ __field(int, sent) +++ __field(int, err) +++ ), +++ +++ TP_fast_assign( +++ __entry->ifindex = dev->ifindex; +++ __entry->act = XDP_TX; +++ __entry->drops = drops; +++ __entry->sent = sent; +++ __entry->err = err; +++ ), +++ +++ TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d", +++ __entry->ifindex, +++ __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), +++ __entry->sent, __entry->drops, __entry->err) +++); +++ +++DECLARE_EVENT_CLASS(xdp_redirect_template, 
+++ +++ TP_PROTO(const struct net_device *dev, +++ const struct bpf_prog *xdp, +++ int to_ifindex, int err, +++ const struct bpf_map *map, u32 map_index), +++ +++ TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), +++ +++ TP_STRUCT__entry( +++ __field(int, prog_id) +++ __field(u32, act) +++ __field(int, ifindex) +++ __field(int, err) +++ __field(int, to_ifindex) +++ __field(u32, map_id) +++ __field(int, map_index) +++ ), +++ +++ TP_fast_assign( +++ __entry->prog_id = xdp->aux->id; +++ __entry->act = XDP_REDIRECT; +++ __entry->ifindex = dev->ifindex; +++ __entry->err = err; +++ __entry->to_ifindex = to_ifindex; +++ __entry->map_id = map ? map->id : 0; +++ __entry->map_index = map_index; +++ ), +++ +++ TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d", +++ __entry->prog_id, +++ __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), +++ __entry->ifindex, __entry->to_ifindex, +++ __entry->err) +++); +++ +++DEFINE_EVENT(xdp_redirect_template, xdp_redirect, +++ TP_PROTO(const struct net_device *dev, +++ const struct bpf_prog *xdp, +++ int to_ifindex, int err, +++ const struct bpf_map *map, u32 map_index), +++ TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) +++); +++ +++DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, +++ TP_PROTO(const struct net_device *dev, +++ const struct bpf_prog *xdp, +++ int to_ifindex, int err, +++ const struct bpf_map *map, u32 map_index), +++ TP_ARGS(dev, xdp, to_ifindex, err, map, map_index) +++); +++ +++#define _trace_xdp_redirect(dev, xdp, to) \ +++ trace_xdp_redirect(dev, xdp, to, 0, NULL, 0); +++ +++#define _trace_xdp_redirect_err(dev, xdp, to, err) \ +++ trace_xdp_redirect_err(dev, xdp, to, err, NULL, 0); +++ +++DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map, +++ TP_PROTO(const struct net_device *dev, +++ const struct bpf_prog *xdp, +++ int to_ifindex, int err, +++ const struct bpf_map *map, u32 map_index), +++ TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), +++ TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" +++ " map_id=%d map_index=%d", +++ __entry->prog_id, +++ __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), +++ __entry->ifindex, __entry->to_ifindex, +++ __entry->err, +++ __entry->map_id, __entry->map_index) +++); +++ +++DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, +++ TP_PROTO(const struct net_device *dev, +++ const struct bpf_prog *xdp, +++ int to_ifindex, int err, +++ const struct bpf_map *map, u32 map_index), +++ TP_ARGS(dev, xdp, to_ifindex, err, map, map_index), +++ TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" +++ " map_id=%d map_index=%d", +++ __entry->prog_id, +++ __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), +++ __entry->ifindex, __entry->to_ifindex, +++ __entry->err, +++ __entry->map_id, __entry->map_index) +++); +++ +++#ifndef __DEVMAP_OBJ_TYPE +++#define __DEVMAP_OBJ_TYPE +++struct _bpf_dtab_netdev { +++ struct net_device *dev; +++}; +++#endif /* __DEVMAP_OBJ_TYPE */ +++ +++#define devmap_ifindex(fwd, map) \ +++ ((map->map_type == BPF_MAP_TYPE_DEVMAP || \ +++ map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) ? 
\ +++ ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0) +++ +++#define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ +++ trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \ +++ 0, map, idx) +++ +++#define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ +++ trace_xdp_redirect_map_err(dev, xdp, devmap_ifindex(fwd, map), \ +++ err, map, idx) +++ +++TRACE_EVENT(xdp_cpumap_kthread, +++ +++ TP_PROTO(int map_id, unsigned int processed, unsigned int drops, +++ int sched), +++ +++ TP_ARGS(map_id, processed, drops, sched), +++ +++ TP_STRUCT__entry( +++ __field(int, map_id) +++ __field(u32, act) +++ __field(int, cpu) +++ __field(unsigned int, drops) +++ __field(unsigned int, processed) +++ __field(int, sched) +++ ), +++ +++ TP_fast_assign( +++ __entry->map_id = map_id; +++ __entry->act = XDP_REDIRECT; +++ __entry->cpu = smp_processor_id(); +++ __entry->drops = drops; +++ __entry->processed = processed; +++ __entry->sched = sched; +++ ), +++ +++ TP_printk("kthread" +++ " cpu=%d map_id=%d action=%s" +++ " processed=%u drops=%u" +++ " sched=%d", +++ __entry->cpu, __entry->map_id, +++ __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), +++ __entry->processed, __entry->drops, +++ __entry->sched) +++); +++ +++TRACE_EVENT(xdp_cpumap_enqueue, +++ +++ TP_PROTO(int map_id, unsigned int processed, unsigned int drops, +++ int to_cpu), +++ +++ TP_ARGS(map_id, processed, drops, to_cpu), +++ +++ TP_STRUCT__entry( +++ __field(int, map_id) +++ __field(u32, act) +++ __field(int, cpu) +++ __field(unsigned int, drops) +++ __field(unsigned int, processed) +++ __field(int, to_cpu) +++ ), +++ +++ TP_fast_assign( +++ __entry->map_id = map_id; +++ __entry->act = XDP_REDIRECT; +++ __entry->cpu = smp_processor_id(); +++ __entry->drops = drops; +++ __entry->processed = processed; +++ __entry->to_cpu = to_cpu; +++ ), +++ +++ TP_printk("enqueue" +++ " cpu=%d map_id=%d action=%s" +++ " processed=%u drops=%u" +++ " to_cpu=%d", +++ __entry->cpu, __entry->map_id, +++ __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), +++ __entry->processed, __entry->drops, +++ __entry->to_cpu) +++); +++ +++TRACE_EVENT(xdp_devmap_xmit, +++ +++ TP_PROTO(const struct bpf_map *map, u32 map_index, +++ int sent, int drops, +++ const struct net_device *from_dev, +++ const struct net_device *to_dev, int err), +++ +++ TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err), +++ +++ TP_STRUCT__entry( +++ __field(int, map_id) +++ __field(u32, act) +++ __field(u32, map_index) +++ __field(int, drops) +++ __field(int, sent) +++ __field(int, from_ifindex) +++ __field(int, to_ifindex) +++ __field(int, err) +++ ), +++ +++ TP_fast_assign( +++ __entry->map_id = map->id; +++ __entry->act = XDP_REDIRECT; +++ __entry->map_index = map_index; +++ __entry->drops = drops; +++ __entry->sent = sent; +++ __entry->from_ifindex = from_dev->ifindex; +++ __entry->to_ifindex = to_dev->ifindex; +++ __entry->err = err; +++ ), +++ +++ TP_printk("ndo_xdp_xmit" +++ " map_id=%d map_index=%d action=%s" +++ " sent=%d drops=%d" +++ " from_ifindex=%d to_ifindex=%d err=%d", +++ __entry->map_id, __entry->map_index, +++ __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), +++ __entry->sent, __entry->drops, +++ __entry->from_ifindex, __entry->to_ifindex, __entry->err) +++); +++ +++/* Expect users already include , but not xdp_priv.h */ +++#include +++ +++#define __MEM_TYPE_MAP(FN) \ +++ FN(PAGE_SHARED) \ +++ FN(PAGE_ORDER0) \ +++ FN(PAGE_POOL) \ +++ FN(ZERO_COPY) +++ +++#define __MEM_TYPE_TP_FN(x) \ +++ TRACE_DEFINE_ENUM(MEM_TYPE_##x); +++#define __MEM_TYPE_SYM_FN(x) \ 
+++ { MEM_TYPE_##x, #x }, +++#define __MEM_TYPE_SYM_TAB \ +++ __MEM_TYPE_MAP(__MEM_TYPE_SYM_FN) { -1, 0 } +++__MEM_TYPE_MAP(__MEM_TYPE_TP_FN) +++ +++TRACE_EVENT(mem_disconnect, +++ +++ TP_PROTO(const struct xdp_mem_allocator *xa), +++ +++ TP_ARGS(xa), +++ +++ TP_STRUCT__entry( +++ __field(const struct xdp_mem_allocator *, xa) +++ __field(u32, mem_id) +++ __field(u32, mem_type) +++ __field(const void *, allocator) +++ ), +++ +++ TP_fast_assign( +++ __entry->xa = xa; +++ __entry->mem_id = xa->mem.id; +++ __entry->mem_type = xa->mem.type; +++ __entry->allocator = xa->allocator; +++ ), +++ +++ TP_printk("mem_id=%d mem_type=%s allocator=%p", +++ __entry->mem_id, +++ __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), +++ __entry->allocator +++ ) +++); +++ +++TRACE_EVENT(mem_connect, +++ +++ TP_PROTO(const struct xdp_mem_allocator *xa, +++ const struct xdp_rxq_info *rxq), +++ +++ TP_ARGS(xa, rxq), +++ +++ TP_STRUCT__entry( +++ __field(const struct xdp_mem_allocator *, xa) +++ __field(u32, mem_id) +++ __field(u32, mem_type) +++ __field(const void *, allocator) +++ __field(const struct xdp_rxq_info *, rxq) +++ __field(int, ifindex) +++ ), +++ +++ TP_fast_assign( +++ __entry->xa = xa; +++ __entry->mem_id = xa->mem.id; +++ __entry->mem_type = xa->mem.type; +++ __entry->allocator = xa->allocator; +++ __entry->rxq = rxq; +++ __entry->ifindex = rxq->dev->ifindex; +++ ), +++ +++ TP_printk("mem_id=%d mem_type=%s allocator=%p" +++ " ifindex=%d", +++ __entry->mem_id, +++ __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), +++ __entry->allocator, +++ __entry->ifindex +++ ) +++); +++ +++TRACE_EVENT(mem_return_failed, +++ +++ TP_PROTO(const struct xdp_mem_info *mem, +++ const struct page *page), +++ +++ TP_ARGS(mem, page), +++ +++ TP_STRUCT__entry( +++ __field(const struct page *, page) +++ __field(u32, mem_id) +++ __field(u32, mem_type) +++ ), +++ +++ TP_fast_assign( +++ __entry->page = page; +++ __entry->mem_id = mem->id; +++ __entry->mem_type = mem->type; +++ ), +++ +++ TP_printk("mem_id=%d mem_type=%s page=%p", +++ __entry->mem_id, +++ __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), +++ __entry->page +++ ) +++); +++ +++#endif /* _TRACE_XDP_H */ +++ +++#include ++--- /dev/null +++++ b/include/net/xdp_priv.h ++@@ -0,0 +1,20 @@ +++/* SPDX-License-Identifier: GPL-2.0 */ +++#ifndef __LINUX_NET_XDP_PRIV_H__ +++#define __LINUX_NET_XDP_PRIV_H__ +++ +++#include +++#include +++ +++/* Private to net/core/xdp.c, but used by trace/events/xdp.h */ +++struct xdp_mem_allocator { +++ struct xdp_mem_info mem; +++ union { +++ void *allocator; +++ struct page_pool *page_pool; +++ struct zero_copy_allocator *zc_alloc; +++ }; +++ struct rhash_head node; +++ struct rcu_head rcu; +++}; +++ +++#endif /* __LINUX_NET_XDP_PRIV_H__ */ ++--- /dev/null +++++ b/include/net/xdp.h ++@@ -0,0 +1,184 @@ +++/* SPDX-License-Identifier: GPL-2.0-only */ +++/* include/net/xdp.h +++ * +++ * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. +++ */ +++#ifndef __LINUX_NET_XDP_H__ +++#define __LINUX_NET_XDP_H__ +++ +++/** +++ * DOC: XDP RX-queue information +++ * +++ * The XDP RX-queue info (xdp_rxq_info) is associated with the driver +++ * level RX-ring queues. It is information that is specific to how +++ * the driver have configured a given RX-ring queue. +++ * +++ * Each xdp_buff frame received in the driver carry a (pointer) +++ * reference to this xdp_rxq_info structure. This provides the XDP +++ * data-path read-access to RX-info for both kernel and bpf-side +++ * (limited subset). 
+++ * +++ * For now, direct access is only safe while running in NAPI/softirq +++ * context. Contents is read-mostly and must not be updated during +++ * driver NAPI/softirq poll. +++ * +++ * The driver usage API is a register and unregister API. +++ * +++ * The struct is not directly tied to the XDP prog. A new XDP prog +++ * can be attached as long as it doesn't change the underlying +++ * RX-ring. If the RX-ring does change significantly, the NIC driver +++ * naturally need to stop the RX-ring before purging and reallocating +++ * memory. In that process the driver MUST call unregistor (which +++ * also apply for driver shutdown and unload). The register API is +++ * also mandatory during RX-ring setup. +++ */ +++ +++enum xdp_mem_type { +++ MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ +++ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ +++ MEM_TYPE_PAGE_POOL, +++ MEM_TYPE_ZERO_COPY, +++ MEM_TYPE_MAX, +++}; +++ +++/* XDP flags for ndo_xdp_xmit */ +++#define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ +++#define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH +++ +++struct xdp_mem_info { +++ u32 type; /* enum xdp_mem_type, but known size type */ +++ u32 id; +++}; +++ +++struct page_pool; +++ +++struct zero_copy_allocator { +++ void (*free)(struct zero_copy_allocator *zca, unsigned long handle); +++}; +++ +++struct xdp_rxq_info { +++ struct net_device *dev; +++ u32 queue_index; +++ u32 reg_state; +++ struct xdp_mem_info mem; +++} ____cacheline_aligned; /* perf critical, avoid false-sharing */ +++ +++struct xdp_buff { +++ void *data; +++ void *data_end; +++ void *data_meta; +++ void *data_hard_start; +++ unsigned long handle; +++ struct xdp_rxq_info *rxq; +++}; +++ +++struct xdp_frame { +++ void *data; +++ u16 len; +++ u16 headroom; +++ u16 metasize; +++ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, +++ * while mem info is valid on remote CPU. +++ */ +++ struct xdp_mem_info mem; +++ struct net_device *dev_rx; /* used by cpumap */ +++}; +++ +++/* Clear kernel pointers in xdp_frame */ +++static inline void xdp_scrub_frame(struct xdp_frame *frame) +++{ +++ frame->data = NULL; +++ frame->dev_rx = NULL; +++} +++ +++struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); +++ +++/* Convert xdp_buff to xdp_frame */ +++static inline +++struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) +++{ +++ struct xdp_frame *xdp_frame; +++ int metasize; +++ int headroom; +++ +++ if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) +++ return xdp_convert_zc_to_xdp_frame(xdp); +++ +++ /* Assure headroom is available for storing info */ +++ headroom = xdp->data - xdp->data_hard_start; +++ metasize = xdp->data - xdp->data_meta; +++ metasize = metasize > 0 ? metasize : 0; +++ if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) +++ return NULL; +++ +++ /* Store info in top of packet */ +++ xdp_frame = xdp->data_hard_start; +++ +++ xdp_frame->data = xdp->data; +++ xdp_frame->len = xdp->data_end - xdp->data; +++ xdp_frame->headroom = headroom - sizeof(*xdp_frame); +++ xdp_frame->metasize = metasize; +++ +++ /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ +++ xdp_frame->mem = xdp->rxq->mem; +++ +++ return xdp_frame; +++} +++ +++void xdp_return_frame(struct xdp_frame *xdpf); +++void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); +++void xdp_return_buff(struct xdp_buff *xdp); +++ +++/* When sending xdp_frame into the network stack, then there is no +++ * return point callback, which is needed to release e.g. 
DMA-mapping +++ * resources with page_pool. Thus, have explicit function to release +++ * frame resources. +++ */ +++void __xdp_release_frame(void *data, struct xdp_mem_info *mem); +++static inline void xdp_release_frame(struct xdp_frame *xdpf) +++{ +++ struct xdp_mem_info *mem = &xdpf->mem; +++ +++ /* Curr only page_pool needs this */ +++ if (mem->type == MEM_TYPE_PAGE_POOL) +++ __xdp_release_frame(xdpf->data, mem); +++} +++ +++int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, +++ struct net_device *dev, u32 queue_index); +++void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); +++void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); +++bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); +++int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, +++ enum xdp_mem_type type, void *allocator); +++void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); +++ +++/* Drivers not supporting XDP metadata can use this helper, which +++ * rejects any room expansion for metadata as a result. +++ */ +++static __always_inline void +++xdp_set_data_meta_invalid(struct xdp_buff *xdp) +++{ +++ xdp->data_meta = xdp->data + 1; +++} +++ +++static __always_inline bool +++xdp_data_meta_unsupported(const struct xdp_buff *xdp) +++{ +++ return unlikely(xdp->data_meta > xdp->data); +++} +++ +++struct xdp_attachment_info { +++ struct bpf_prog *prog; +++ u32 flags; +++}; +++ +++struct netdev_bpf; +++int xdp_attachment_query(struct xdp_attachment_info *info, +++ struct netdev_bpf *bpf); +++bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, +++ struct netdev_bpf *bpf); +++void xdp_attachment_setup(struct xdp_attachment_info *info, +++ struct netdev_bpf *bpf); +++ +++#endif /* __LINUX_NET_XDP_H__ */ ++--- a/include/linux/atomic.h +++++ b/include/linux/atomic.h ++@@ -437,6 +437,8 @@ static inline int atomic_add_unless(atom ++ return __atomic_add_unless(v, a, u) != u; ++ } ++ +++#define atomic_fetch_add_unless __atomic_add_unless +++ ++ /** ++ * atomic_inc_not_zero - increment unless the number is zero ++ * @v: pointer of type atomic_t ++--- a/include/linux/kernel.h +++++ b/include/linux/kernel.h ++@@ -45,6 +45,13 @@ ++ ++ #define STACK_MAGIC 0xdeadbeef ++ +++#define u64_to_user_ptr(x) ( \ +++ { \ +++ typecheck(u64, (x)); \ +++ (void __user *)(uintptr_t)(x); \ +++ } \ +++) +++ ++ #define REPEAT_BYTE(x) ((~0ul / 0xff) * (x)) ++ ++ #define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) ++--- /dev/null +++++ b/include/linux/tnum.h ++@@ -0,0 +1,89 @@ +++/* tnum: tracked (or tristate) numbers +++ * +++ * A tnum tracks knowledge about the bits of a value. Each bit can be either +++ * known (0 or 1), or unknown (x). Arithmetic operations on tnums will +++ * propagate the unknown bits such that the tnum result represents all the +++ * possible results for possible values of the operands. +++ */ +++ +++#ifndef _LINUX_TNUM_H +++#define _LINUX_TNUM_H +++ +++#include +++ +++struct tnum { +++ u64 value; +++ u64 mask; +++}; +++ +++/* Constructors */ +++/* Represent a known constant as a tnum. 
*/ +++struct tnum tnum_const(u64 value); +++/* A completely unknown value */ +++extern const struct tnum tnum_unknown; +++/* A value that's unknown except that @min <= value <= @max */ +++struct tnum tnum_range(u64 min, u64 max); +++ +++/* Arithmetic and logical ops */ +++/* Shift a tnum left (by a fixed shift) */ +++struct tnum tnum_lshift(struct tnum a, u8 shift); +++/* Shift (rsh) a tnum right (by a fixed shift) */ +++struct tnum tnum_rshift(struct tnum a, u8 shift); +++/* Shift (arsh) a tnum right (by a fixed min_shift) */ +++struct tnum tnum_arshift(struct tnum a, u8 min_shift, u8 insn_bitness); +++/* Add two tnums, return @a + @b */ +++struct tnum tnum_add(struct tnum a, struct tnum b); +++/* Subtract two tnums, return @a - @b */ +++struct tnum tnum_sub(struct tnum a, struct tnum b); +++/* Bitwise-AND, return @a & @b */ +++struct tnum tnum_and(struct tnum a, struct tnum b); +++/* Bitwise-OR, return @a | @b */ +++struct tnum tnum_or(struct tnum a, struct tnum b); +++/* Bitwise-XOR, return @a ^ @b */ +++struct tnum tnum_xor(struct tnum a, struct tnum b); +++/* Multiply two tnums, return @a * @b */ +++struct tnum tnum_mul(struct tnum a, struct tnum b); +++ +++/* Return a tnum representing numbers satisfying both @a and @b */ +++struct tnum tnum_intersect(struct tnum a, struct tnum b); +++ +++/* Return @a with all but the lowest @size bytes cleared */ +++struct tnum tnum_cast(struct tnum a, u8 size); +++ +++/* Returns true if @a is a known constant */ +++static inline bool tnum_is_const(struct tnum a) +++{ +++ return !a.mask; +++} +++ +++/* Returns true if @a == tnum_const(@b) */ +++static inline bool tnum_equals_const(struct tnum a, u64 b) +++{ +++ return tnum_is_const(a) && a.value == b; +++} +++ +++/* Returns true if @a is completely unknown */ +++static inline bool tnum_is_unknown(struct tnum a) +++{ +++ return !~a.mask; +++} +++ +++/* Returns true if @a is known to be a multiple of @size. +++ * @size must be a power of two. +++ */ +++bool tnum_is_aligned(struct tnum a, u64 size); +++ +++/* Returns true if @b represents a subset of @a. */ +++bool tnum_in(struct tnum a, struct tnum b); +++ +++/* Formatting functions. These have snprintf-like semantics: they will write +++ * up to @size bytes (including the terminating NUL byte), and return the number +++ * of bytes (excluding the terminating NUL) which would have been written had +++ * sufficient space been available. (Thus tnum_sbin always returns 64.) +++ */ +++/* Format a tnum as a pair of hex numbers (value; mask) */ +++int tnum_strn(char *str, size_t size, struct tnum a); +++/* Format a tnum as tristate binary expansion */ +++int tnum_sbin(char *str, size_t size, struct tnum a); +++ +++#endif /* _LINUX_TNUM_H */ ++--- a/include/linux/bitmap.h +++++ b/include/linux/bitmap.h ++@@ -326,6 +326,24 @@ static inline int bitmap_parse(const cha ++ return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits); ++ } ++ +++/** +++ * bitmap_from_u64 - Check and swap words within u64. +++ * @mask: source bitmap +++ * @dst: destination bitmap +++ * +++ * In 32-bit Big Endian kernel, when using ``(u32 *)(&val)[*]`` +++ * to read u64 mask, we will get the wrong word. +++ * That is ``(u32 *)(&val)[0]`` gets the upper 32 bits, +++ * but we expect the lower 32-bits of u64. 
+++ */ +++static inline void bitmap_from_u64(unsigned long *dst, u64 mask) +++{ +++ dst[0] = mask & ULONG_MAX; +++ +++ if (sizeof(mask) > sizeof(unsigned long)) +++ dst[1] = mask >> 32; +++} +++ ++ #endif /* __ASSEMBLY__ */ ++ ++ #endif /* __LINUX_BITMAP_H */ ++--- /dev/null +++++ b/include/linux/overflow.h ++@@ -0,0 +1,320 @@ +++/* SPDX-License-Identifier: GPL-2.0 OR MIT */ +++#ifndef __LINUX_OVERFLOW_H +++#define __LINUX_OVERFLOW_H +++ +++#include +++#include +++ +++/* +++ * In the fallback code below, we need to compute the minimum and +++ * maximum values representable in a given type. These macros may also +++ * be useful elsewhere, so we provide them outside the +++ * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. +++ * +++ * It would seem more obvious to do something like +++ * +++ * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) +++ * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) +++ * +++ * Unfortunately, the middle expressions, strictly speaking, have +++ * undefined behaviour, and at least some versions of gcc warn about +++ * the type_max expression (but not if -fsanitize=undefined is in +++ * effect; in that case, the warning is deferred to runtime...). +++ * +++ * The slightly excessive casting in type_min is to make sure the +++ * macros also produce sensible values for the exotic type _Bool. [The +++ * overflow checkers only almost work for _Bool, but that's +++ * a-feature-not-a-bug, since people shouldn't be doing arithmetic on +++ * _Bools. Besides, the gcc builtins don't allow _Bool* as third +++ * argument.] +++ * +++ * Idea stolen from +++ * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - +++ * credit to Christian Biere. +++ */ +++#define is_signed_type(type) (((type)(-1)) < (type)1) +++#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) +++#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) +++#define type_min(T) ((T)((T)-type_max(T)-(T)1)) +++ +++/* +++ * Avoids triggering -Wtype-limits compilation warning, +++ * while using unsigned data types to check a < 0. +++ */ +++#define is_non_negative(a) ((a) > 0 || (a) == 0) +++#define is_negative(a) (!(is_non_negative(a))) +++ +++#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW +++/* +++ * For simplicity and code hygiene, the fallback code below insists on +++ * a, b and *d having the same type (similar to the min() and max() +++ * macros), whereas gcc's type-generic overflow checkers accept +++ * different types. Hence we don't just make check_add_overflow an +++ * alias for __builtin_add_overflow, but add type checks similar to +++ * below. +++ */ +++#define check_add_overflow(a, b, d) ({ \ +++ typeof(a) __a = (a); \ +++ typeof(b) __b = (b); \ +++ typeof(d) __d = (d); \ +++ (void) (&__a == &__b); \ +++ (void) (&__a == __d); \ +++ __builtin_add_overflow(__a, __b, __d); \ +++}) +++ +++#define check_sub_overflow(a, b, d) ({ \ +++ typeof(a) __a = (a); \ +++ typeof(b) __b = (b); \ +++ typeof(d) __d = (d); \ +++ (void) (&__a == &__b); \ +++ (void) (&__a == __d); \ +++ __builtin_sub_overflow(__a, __b, __d); \ +++}) +++ +++#define check_mul_overflow(a, b, d) ({ \ +++ typeof(a) __a = (a); \ +++ typeof(b) __b = (b); \ +++ typeof(d) __d = (d); \ +++ (void) (&__a == &__b); \ +++ (void) (&__a == __d); \ +++ __builtin_mul_overflow(__a, __b, __d); \ +++}) +++ +++#else +++ +++ +++/* Checking for unsigned overflow is relatively easy without causing UB. 
*/ +++#define __unsigned_add_overflow(a, b, d) ({ \ +++ typeof(a) __a = (a); \ +++ typeof(b) __b = (b); \ +++ typeof(d) __d = (d); \ +++ (void) (&__a == &__b); \ +++ (void) (&__a == __d); \ +++ *__d = __a + __b; \ +++ *__d < __a; \ +++}) +++#define __unsigned_sub_overflow(a, b, d) ({ \ +++ typeof(a) __a = (a); \ +++ typeof(b) __b = (b); \ +++ typeof(d) __d = (d); \ +++ (void) (&__a == &__b); \ +++ (void) (&__a == __d); \ +++ *__d = __a - __b; \ +++ __a < __b; \ +++}) +++/* +++ * If one of a or b is a compile-time constant, this avoids a division. +++ */ +++#define __unsigned_mul_overflow(a, b, d) ({ \ +++ typeof(a) __a = (a); \ +++ typeof(b) __b = (b); \ +++ typeof(d) __d = (d); \ +++ (void) (&__a == &__b); \ +++ (void) (&__a == __d); \ +++ *__d = __a * __b; \ +++ __builtin_constant_p(__b) ? \ +++ __b > 0 && __a > type_max(typeof(__a)) / __b : \ +++ __a > 0 && __b > type_max(typeof(__b)) / __a; \ +++}) +++ +++/* +++ * For signed types, detecting overflow is much harder, especially if +++ * we want to avoid UB. But the interface of these macros is such that +++ * we must provide a result in *d, and in fact we must produce the +++ * result promised by gcc's builtins, which is simply the possibly +++ * wrapped-around value. Fortunately, we can just formally do the +++ * operations in the widest relevant unsigned type (u64) and then +++ * truncate the result - gcc is smart enough to generate the same code +++ * with and without the (u64) casts. +++ */ +++ +++/* +++ * Adding two signed integers can overflow only if they have the same +++ * sign, and overflow has happened iff the result has the opposite +++ * sign. +++ */ +++#define __signed_add_overflow(a, b, d) ({ \ +++ typeof(a) __a = (a); \ +++ typeof(b) __b = (b); \ +++ typeof(d) __d = (d); \ +++ (void) (&__a == &__b); \ +++ (void) (&__a == __d); \ +++ *__d = (u64)__a + (u64)__b; \ +++ (((~(__a ^ __b)) & (*__d ^ __a)) \ +++ & type_min(typeof(__a))) != 0; \ +++}) +++ +++/* +++ * Subtraction is similar, except that overflow can now happen only +++ * when the signs are opposite. In this case, overflow has happened if +++ * the result has the opposite sign of a. +++ */ +++#define __signed_sub_overflow(a, b, d) ({ \ +++ typeof(a) __a = (a); \ +++ typeof(b) __b = (b); \ +++ typeof(d) __d = (d); \ +++ (void) (&__a == &__b); \ +++ (void) (&__a == __d); \ +++ *__d = (u64)__a - (u64)__b; \ +++ ((((__a ^ __b)) & (*__d ^ __a)) \ +++ & type_min(typeof(__a))) != 0; \ +++}) +++ +++/* +++ * Signed multiplication is rather hard. gcc always follows C99, so +++ * division is truncated towards 0. This means that we can write the +++ * overflow check like this: +++ * +++ * (a > 0 && (b > MAX/a || b < MIN/a)) || +++ * (a < -1 && (b > MIN/a || b < MAX/a) || +++ * (a == -1 && b == MIN) +++ * +++ * The redundant casts of -1 are to silence an annoying -Wtype-limits +++ * (included in -Wextra) warning: When the type is u8 or u16, the +++ * __b_c_e in check_mul_overflow obviously selects +++ * __unsigned_mul_overflow, but unfortunately gcc still parses this +++ * code and warns about the limited range of __b. 
+++ */ +++ +++#define __signed_mul_overflow(a, b, d) ({ \ +++ typeof(a) __a = (a); \ +++ typeof(b) __b = (b); \ +++ typeof(d) __d = (d); \ +++ typeof(a) __tmax = type_max(typeof(a)); \ +++ typeof(a) __tmin = type_min(typeof(a)); \ +++ (void) (&__a == &__b); \ +++ (void) (&__a == __d); \ +++ *__d = (u64)__a * (u64)__b; \ +++ (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ +++ (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ +++ (__b == (typeof(__b))-1 && __a == __tmin); \ +++}) +++ +++ +++#define check_add_overflow(a, b, d) \ +++ __builtin_choose_expr(is_signed_type(typeof(a)), \ +++ __signed_add_overflow(a, b, d), \ +++ __unsigned_add_overflow(a, b, d)) +++ +++#define check_sub_overflow(a, b, d) \ +++ __builtin_choose_expr(is_signed_type(typeof(a)), \ +++ __signed_sub_overflow(a, b, d), \ +++ __unsigned_sub_overflow(a, b, d)) +++ +++#define check_mul_overflow(a, b, d) \ +++ __builtin_choose_expr(is_signed_type(typeof(a)), \ +++ __signed_mul_overflow(a, b, d), \ +++ __unsigned_mul_overflow(a, b, d)) +++ +++ +++#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ +++ +++/** check_shl_overflow() - Calculate a left-shifted value and check overflow +++ * +++ * @a: Value to be shifted +++ * @s: How many bits left to shift +++ * @d: Pointer to where to store the result +++ * +++ * Computes *@d = (@a << @s) +++ * +++ * Returns true if '*d' cannot hold the result or when 'a << s' doesn't +++ * make sense. Example conditions: +++ * - 'a << s' causes bits to be lost when stored in *d. +++ * - 's' is garbage (e.g. negative) or so large that the result of +++ * 'a << s' is guaranteed to be 0. +++ * - 'a' is negative. +++ * - 'a << s' sets the sign bit, if any, in '*d'. +++ * +++ * '*d' will hold the results of the attempted shift, but is not +++ * considered "safe for use" if false is returned. +++ */ +++#define check_shl_overflow(a, s, d) ({ \ +++ typeof(a) _a = a; \ +++ typeof(s) _s = s; \ +++ typeof(d) _d = d; \ +++ u64 _a_full = _a; \ +++ unsigned int _to_shift = \ +++ is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \ +++ *_d = (_a_full << _to_shift); \ +++ (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \ +++ (*_d >> _to_shift) != _a); \ +++}) +++ +++/** +++ * array_size() - Calculate size of 2-dimensional array. +++ * +++ * @a: dimension one +++ * @b: dimension two +++ * +++ * Calculates size of 2-dimensional array: @a * @b. +++ * +++ * Returns: number of bytes needed to represent the array or SIZE_MAX on +++ * overflow. +++ */ +++static inline __must_check size_t array_size(size_t a, size_t b) +++{ +++ size_t bytes; +++ +++ if (check_mul_overflow(a, b, &bytes)) +++ return SIZE_MAX; +++ +++ return bytes; +++} +++ +++/** +++ * array3_size() - Calculate size of 3-dimensional array. +++ * +++ * @a: dimension one +++ * @b: dimension two +++ * @c: dimension three +++ * +++ * Calculates size of 3-dimensional array: @a * @b * @c. +++ * +++ * Returns: number of bytes needed to represent the array or SIZE_MAX on +++ * overflow. +++ */ +++static inline __must_check size_t array3_size(size_t a, size_t b, size_t c) +++{ +++ size_t bytes; +++ +++ if (check_mul_overflow(a, b, &bytes)) +++ return SIZE_MAX; +++ if (check_mul_overflow(bytes, c, &bytes)) +++ return SIZE_MAX; +++ +++ return bytes; +++} +++ +++/* +++ * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for +++ * struct_size() below. 
+++ */ +++static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c) +++{ +++ size_t bytes; +++ +++ if (check_mul_overflow(a, b, &bytes)) +++ return SIZE_MAX; +++ if (check_add_overflow(bytes, c, &bytes)) +++ return SIZE_MAX; +++ +++ return bytes; +++} +++ +++/** +++ * struct_size() - Calculate size of structure with trailing array. +++ * @p: Pointer to the structure. +++ * @member: Name of the array member. +++ * @n: Number of elements in the array. +++ * +++ * Calculates size of memory needed for structure @p followed by an +++ * array of @n @member elements. +++ * +++ * Return: number of bytes needed or SIZE_MAX on overflow. +++ */ +++#define struct_size(p, member, n) \ +++ __ab_c_size(n, \ +++ sizeof(*(p)->member) + __must_be_array((p)->member),\ +++ sizeof(*(p))) +++ +++#endif /* __LINUX_OVERFLOW_H */ ++--- a/net/core/filter.c +++++ b/net/core/filter.c ++@@ -1,3 +1,4 @@ +++// SPDX-License-Identifier: GPL-2.0-or-later ++ /* ++ * Linux Socket Filter - Kernel level socket filtering ++ * ++@@ -12,11 +13,6 @@ ++ * Alexei Starovoitov ++ * Daniel Borkmann ++ * ++- * This program is free software; you can redistribute it and/or ++- * modify it under the terms of the GNU General Public License ++- * as published by the Free Software Foundation; either version ++- * 2 of the License, or (at your option) any later version. ++- * ++ * Andi Kleen - Fix a few bad bugs and races. ++ * Kris Katterjohn - Added many additional checks in bpf_check_classic() ++ */ ++@@ -26,11 +22,14 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ #include ++ #include ++ #include +++#include ++ #include +++#include ++ #include ++ #include ++ #include ++@@ -39,17 +38,32 @@ ++ #include ++ #include ++ #include ++-#include +++#include ++ #include +++#include ++ #include ++ #include ++ #include ++ #include ++ #include ++ #include ++-#include ++ #include ++ #include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include +++#include ++ ++ /** ++ * sk_filter_trim_cap - run a packet through a socket filter ++@@ -84,7 +98,12 @@ int sk_filter_trim_cap(struct sock *sk, ++ rcu_read_lock(); ++ filter = rcu_dereference(sk->sk_filter); ++ if (filter) { ++- unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); +++ struct sock *save_sk = skb->sk; +++ unsigned int pkt_len; +++ +++ skb->sk = sk; +++ pkt_len = bpf_prog_run_save_cb(filter->prog, skb); +++ skb->sk = save_sk; ++ err = pkt_len ? 
pskb_trim(skb, max(cap, pkt_len)) : -EPERM; ++ } ++ rcu_read_unlock(); ++@@ -93,14 +112,13 @@ int sk_filter_trim_cap(struct sock *sk, ++ } ++ EXPORT_SYMBOL(sk_filter_trim_cap); ++ ++-static u64 __skb_get_pay_offset(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +++BPF_CALL_1(bpf_skb_get_pay_offset, struct sk_buff *, skb) ++ { ++- return skb_get_poff((struct sk_buff *)(unsigned long) ctx); +++ return skb_get_poff(skb); ++ } ++ ++-static u64 __skb_get_nlattr(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +++BPF_CALL_3(bpf_skb_get_nlattr, struct sk_buff *, skb, u32, a, u32, x) ++ { ++- struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; ++ struct nlattr *nla; ++ ++ if (skb_is_nonlinear(skb)) ++@@ -119,9 +137,8 @@ static u64 __skb_get_nlattr(u64 ctx, u64 ++ return 0; ++ } ++ ++-static u64 __skb_get_nlattr_nest(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +++BPF_CALL_3(bpf_skb_get_nlattr_nest, struct sk_buff *, skb, u32, a, u32, x) ++ { ++- struct sk_buff *skb = (struct sk_buff *)(unsigned long) ctx; ++ struct nlattr *nla; ++ ++ if (skb_is_nonlinear(skb)) ++@@ -144,11 +161,98 @@ static u64 __skb_get_nlattr_nest(u64 ctx ++ return 0; ++ } ++ ++-static u64 __get_raw_cpu_id(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) +++BPF_CALL_4(bpf_skb_load_helper_8, const struct sk_buff *, skb, const void *, +++ data, int, headlen, int, offset) +++{ +++ u8 tmp, *ptr; +++ const int len = sizeof(tmp); +++ +++ if (offset >= 0) { +++ if (headlen - offset >= len) +++ return *(u8 *)(data + offset); +++ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) +++ return tmp; +++ } else { +++ ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); +++ if (likely(ptr)) +++ return *(u8 *)ptr; +++ } +++ +++ return -EFAULT; +++} +++ +++BPF_CALL_2(bpf_skb_load_helper_8_no_cache, const struct sk_buff *, skb, +++ int, offset) +++{ +++ return ____bpf_skb_load_helper_8(skb, skb->data, skb->len - skb->data_len, +++ offset); +++} +++ +++BPF_CALL_4(bpf_skb_load_helper_16, const struct sk_buff *, skb, const void *, +++ data, int, headlen, int, offset) +++{ +++ u16 tmp, *ptr; +++ const int len = sizeof(tmp); +++ +++ if (offset >= 0) { +++ if (headlen - offset >= len) +++ return get_unaligned_be16(data + offset); +++ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) +++ return be16_to_cpu(tmp); +++ } else { +++ ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); +++ if (likely(ptr)) +++ return get_unaligned_be16(ptr); +++ } +++ +++ return -EFAULT; +++} +++ +++BPF_CALL_2(bpf_skb_load_helper_16_no_cache, const struct sk_buff *, skb, +++ int, offset) +++{ +++ return ____bpf_skb_load_helper_16(skb, skb->data, skb->len - skb->data_len, +++ offset); +++} +++ +++BPF_CALL_4(bpf_skb_load_helper_32, const struct sk_buff *, skb, const void *, +++ data, int, headlen, int, offset) +++{ +++ u32 tmp, *ptr; +++ const int len = sizeof(tmp); +++ +++ if (likely(offset >= 0)) { +++ if (headlen - offset >= len) +++ return get_unaligned_be32(data + offset); +++ if (!skb_copy_bits(skb, offset, &tmp, sizeof(tmp))) +++ return be32_to_cpu(tmp); +++ } else { +++ ptr = bpf_internal_load_pointer_neg_helper(skb, offset, len); +++ if (likely(ptr)) +++ return get_unaligned_be32(ptr); +++ } +++ +++ return -EFAULT; +++} +++ +++BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, +++ int, offset) +++{ +++ return ____bpf_skb_load_helper_32(skb, skb->data, skb->len - skb->data_len, +++ offset); +++} +++ +++BPF_CALL_0(bpf_get_raw_cpu_id) ++ { ++ return raw_smp_processor_id(); ++ } ++ +++static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto 
= { +++ .func = bpf_get_raw_cpu_id, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++}; +++ ++ static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, ++ struct bpf_insn *insn_buf) ++ { ++@@ -178,22 +282,18 @@ static u32 convert_skb_access(int skb_fi ++ break; ++ ++ case SKF_AD_VLAN_TAG: ++- case SKF_AD_VLAN_TAG_PRESENT: ++ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_tci) != 2); ++- BUILD_BUG_ON(VLAN_TAG_PRESENT != 0x1000); ++ ++ /* dst_reg = *(u16 *) (src_reg + offsetof(vlan_tci)) */ ++ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ++ offsetof(struct sk_buff, vlan_tci)); ++- if (skb_field == SKF_AD_VLAN_TAG) { ++- *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, ++- ~VLAN_TAG_PRESENT); ++- } else { ++- /* dst_reg >>= 12 */ ++- *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 12); ++- /* dst_reg &= 1 */ +++ break; +++ case SKF_AD_VLAN_TAG_PRESENT: +++ *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_VLAN_PRESENT_OFFSET()); +++ if (PKT_VLAN_PRESENT_BIT) +++ *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, PKT_VLAN_PRESENT_BIT); +++ if (PKT_VLAN_PRESENT_BIT < 7) ++ *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, 1); ++- } ++ break; ++ } ++ ++@@ -226,9 +326,8 @@ static bool convert_bpf_extensions(struc ++ case SKF_AD_OFF + SKF_AD_HATYPE: ++ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); ++ BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, type) != 2); ++- BUILD_BUG_ON(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)) < 0); ++ ++- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), ++ BPF_REG_TMP, BPF_REG_CTX, ++ offsetof(struct sk_buff, dev)); ++ /* if (tmp != 0) goto pc + 1 */ ++@@ -295,16 +394,16 @@ static bool convert_bpf_extensions(struc ++ /* Emit call(arg1=CTX, arg2=A, arg3=X) */ ++ switch (fp->k) { ++ case SKF_AD_OFF + SKF_AD_PAY_OFFSET: ++- *insn = BPF_EMIT_CALL(__skb_get_pay_offset); +++ *insn = BPF_EMIT_CALL(bpf_skb_get_pay_offset); ++ break; ++ case SKF_AD_OFF + SKF_AD_NLATTR: ++- *insn = BPF_EMIT_CALL(__skb_get_nlattr); +++ *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr); ++ break; ++ case SKF_AD_OFF + SKF_AD_NLATTR_NEST: ++- *insn = BPF_EMIT_CALL(__skb_get_nlattr_nest); +++ *insn = BPF_EMIT_CALL(bpf_skb_get_nlattr_nest); ++ break; ++ case SKF_AD_OFF + SKF_AD_CPU: ++- *insn = BPF_EMIT_CALL(__get_raw_cpu_id); +++ *insn = BPF_EMIT_CALL(bpf_get_raw_cpu_id); ++ break; ++ case SKF_AD_OFF + SKF_AD_RANDOM: ++ *insn = BPF_EMIT_CALL(bpf_user_rnd_u32); ++@@ -331,35 +430,101 @@ static bool convert_bpf_extensions(struc ++ return true; ++ } ++ +++static bool convert_bpf_ld_abs(struct sock_filter *fp, struct bpf_insn **insnp) +++{ +++ const bool unaligned_ok = IS_BUILTIN(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS); +++ int size = bpf_size_to_bytes(BPF_SIZE(fp->code)); +++ bool endian = BPF_SIZE(fp->code) == BPF_H || +++ BPF_SIZE(fp->code) == BPF_W; +++ bool indirect = BPF_MODE(fp->code) == BPF_IND; +++ const int ip_align = NET_IP_ALIGN; +++ struct bpf_insn *insn = *insnp; +++ int offset = fp->k; +++ +++ if (!indirect && +++ ((unaligned_ok && offset >= 0) || +++ (!unaligned_ok && offset >= 0 && +++ offset + ip_align >= 0 && +++ offset + ip_align % size == 0))) { +++ bool ldx_off_ok = offset <= S16_MAX; +++ +++ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_H); +++ if (offset) +++ *insn++ = BPF_ALU64_IMM(BPF_SUB, BPF_REG_TMP, offset); +++ *insn++ = BPF_JMP_IMM(BPF_JSLT, BPF_REG_TMP, +++ size, 2 + endian + (!ldx_off_ok * 2)); +++ if (ldx_off_ok) { +++ *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, +++ 
BPF_REG_D, offset); +++ } else { +++ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_D); +++ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_TMP, offset); +++ *insn++ = BPF_LDX_MEM(BPF_SIZE(fp->code), BPF_REG_A, +++ BPF_REG_TMP, 0); +++ } +++ if (endian) +++ *insn++ = BPF_ENDIAN(BPF_FROM_BE, BPF_REG_A, size * 8); +++ *insn++ = BPF_JMP_A(8); +++ } +++ +++ *insn++ = BPF_MOV64_REG(BPF_REG_ARG1, BPF_REG_CTX); +++ *insn++ = BPF_MOV64_REG(BPF_REG_ARG2, BPF_REG_D); +++ *insn++ = BPF_MOV64_REG(BPF_REG_ARG3, BPF_REG_H); +++ if (!indirect) { +++ *insn++ = BPF_MOV64_IMM(BPF_REG_ARG4, offset); +++ } else { +++ *insn++ = BPF_MOV64_REG(BPF_REG_ARG4, BPF_REG_X); +++ if (fp->k) +++ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_ARG4, offset); +++ } +++ +++ switch (BPF_SIZE(fp->code)) { +++ case BPF_B: +++ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8); +++ break; +++ case BPF_H: +++ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16); +++ break; +++ case BPF_W: +++ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32); +++ break; +++ default: +++ return false; +++ } +++ +++ *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_A, 0, 2); +++ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); +++ *insn = BPF_EXIT_INSN(); +++ +++ *insnp = insn; +++ return true; +++} +++ ++ /** ++ * bpf_convert_filter - convert filter program ++ * @prog: the user passed filter program ++ * @len: the length of the user passed filter program ++- * @new_prog: buffer where converted program will be stored +++ * @new_prog: allocated 'struct bpf_prog' or NULL ++ * @new_len: pointer to store length of converted program +++ * @seen_ld_abs: bool whether we've seen ld_abs/ind ++ * ++- * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style. +++ * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' +++ * style extended BPF (eBPF). ++ * Conversion workflow: ++ * ++ * 1) First pass for calculating the new program length: ++- * bpf_convert_filter(old_prog, old_len, NULL, &new_len) +++ * bpf_convert_filter(old_prog, old_len, NULL, &new_len, &seen_ld_abs) ++ * ++ * 2) 2nd pass to remap in two passes: 1st pass finds new ++ * jump offsets, 2nd pass remapping: ++- * new_prog = kmalloc(sizeof(struct bpf_insn) * new_len); ++- * bpf_convert_filter(old_prog, old_len, new_prog, &new_len); ++- * ++- * User BPF's register A is mapped to our BPF register 6, user BPF ++- * register X is mapped to BPF register 7; frame pointer is always ++- * register 10; Context 'void *ctx' is stored in register 1, that is, ++- * for socket filters: ctx == 'struct sk_buff *', for seccomp: ++- * ctx == 'struct seccomp_data *'. 
+++ * bpf_convert_filter(old_prog, old_len, new_prog, &new_len, &seen_ld_abs) ++ */ ++ static int bpf_convert_filter(struct sock_filter *prog, int len, ++- struct bpf_insn *new_prog, int *new_len) +++ struct bpf_prog *new_prog, int *new_len, +++ bool *seen_ld_abs) ++ { ++- int new_flen = 0, pass = 0, target, i; ++- struct bpf_insn *new_insn; +++ int new_flen = 0, pass = 0, target, i, stack_off; +++ struct bpf_insn *new_insn, *first_insn = NULL; ++ struct sock_filter *fp; ++ int *addrs = NULL; ++ u8 bpf_src; ++@@ -371,6 +536,7 @@ static int bpf_convert_filter(struct soc ++ return -EINVAL; ++ ++ if (new_prog) { +++ first_insn = new_prog->insnsi; ++ addrs = kcalloc(len, sizeof(*addrs), ++ GFP_KERNEL | __GFP_NOWARN); ++ if (!addrs) ++@@ -378,19 +544,47 @@ static int bpf_convert_filter(struct soc ++ } ++ ++ do_pass: ++- new_insn = new_prog; +++ new_insn = first_insn; ++ fp = prog; ++ ++- if (new_insn) ++- *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); ++- new_insn++; +++ /* Classic BPF related prologue emission. */ +++ if (new_prog) { +++ /* Classic BPF expects A and X to be reset first. These need +++ * to be guaranteed to be the first two instructions. +++ */ +++ *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); +++ *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); +++ +++ /* All programs must keep CTX in callee saved BPF_REG_CTX. +++ * In eBPF case it's done by the compiler, here we need to +++ * do this ourself. Initial CTX is present in BPF_REG_ARG1. +++ */ +++ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1); +++ if (*seen_ld_abs) { +++ /* For packet access in classic BPF, cache skb->data +++ * in callee-saved BPF R8 and skb->len - skb->data_len +++ * (headlen) in BPF R9. Since classic BPF is read-only +++ * on CTX, we only need to cache it once. +++ */ +++ *new_insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), +++ BPF_REG_D, BPF_REG_CTX, +++ offsetof(struct sk_buff, data)); +++ *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_H, BPF_REG_CTX, +++ offsetof(struct sk_buff, len)); +++ *new_insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_TMP, BPF_REG_CTX, +++ offsetof(struct sk_buff, data_len)); +++ *new_insn++ = BPF_ALU32_REG(BPF_SUB, BPF_REG_H, BPF_REG_TMP); +++ } +++ } else { +++ new_insn += 3; +++ } ++ ++ for (i = 0; i < len; fp++, i++) { ++- struct bpf_insn tmp_insns[6] = { }; +++ struct bpf_insn tmp_insns[32] = { }; ++ struct bpf_insn *insn = tmp_insns; ++ ++ if (addrs) ++- addrs[i] = new_insn - new_prog; +++ addrs[i] = new_insn - first_insn; ++ ++ switch (fp->code) { ++ /* All arithmetic insns and skb loads map as-is. */ ++@@ -429,6 +623,22 @@ do_pass: ++ BPF_MODE(fp->code) == BPF_ABS && ++ convert_bpf_extensions(fp, &insn)) ++ break; +++ if (BPF_CLASS(fp->code) == BPF_LD && +++ convert_bpf_ld_abs(fp, &insn)) { +++ *seen_ld_abs = true; +++ break; +++ } +++ +++ if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || +++ fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { +++ *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); +++ /* Error with exception code on div/mod by 0. +++ * For cBPF programs, this was always return 0. +++ */ +++ *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); +++ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); +++ *insn++ = BPF_EXIT_INSN(); +++ } ++ ++ *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); ++ break; ++@@ -441,11 +651,18 @@ do_pass: ++ ++ #define BPF_EMIT_JMP \ ++ do { \ +++ const s32 off_min = S16_MIN, off_max = S16_MAX; \ +++ s32 off; \ +++ \ ++ if (target >= len || target < 0) \ ++ goto err; \ ++- insn->off = addrs ? 
addrs[target] - addrs[i] - 1 : 0; \ +++ off = addrs ? addrs[target] - addrs[i] - 1 : 0; \ ++ /* Adjust pc relative offset for 2nd or 3rd insn. */ \ ++- insn->off -= insn - tmp_insns; \ +++ off -= insn - tmp_insns; \ +++ /* Reject anything not fitting into insn->off. */ \ +++ if (off < off_min || off > off_max) \ +++ goto err; \ +++ insn->off = off; \ ++ } while (0) ++ ++ case BPF_JMP | BPF_JA: ++@@ -487,14 +704,27 @@ do_pass: ++ break; ++ } ++ ++- /* Convert JEQ into JNE when 'jump_true' is next insn. */ ++- if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { ++- insn->code = BPF_JMP | BPF_JNE | bpf_src; +++ /* Convert some jumps when 'jump_true' is next insn. */ +++ if (fp->jt == 0) { +++ switch (BPF_OP(fp->code)) { +++ case BPF_JEQ: +++ insn->code = BPF_JMP | BPF_JNE | bpf_src; +++ break; +++ case BPF_JGT: +++ insn->code = BPF_JMP | BPF_JLE | bpf_src; +++ break; +++ case BPF_JGE: +++ insn->code = BPF_JMP | BPF_JLT | bpf_src; +++ break; +++ default: +++ goto jmp_rest; +++ } +++ ++ target = i + fp->jf + 1; ++ BPF_EMIT_JMP; ++ break; ++ } ++- +++jmp_rest: ++ /* Other jumps are mapped into two insns: Jxx and JA. */ ++ target = i + fp->jt + 1; ++ insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; ++@@ -507,44 +737,64 @@ do_pass: ++ break; ++ ++ /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ ++- case BPF_LDX | BPF_MSH | BPF_B: ++- /* tmp = A */ ++- *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_A); +++ case BPF_LDX | BPF_MSH | BPF_B: { +++ struct sock_filter tmp = { +++ .code = BPF_LD | BPF_ABS | BPF_B, +++ .k = fp->k, +++ }; +++ +++ *seen_ld_abs = true; +++ +++ /* X = A */ +++ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); ++ /* A = BPF_R0 = *(u8 *) (skb->data + K) */ ++- *insn++ = BPF_LD_ABS(BPF_B, fp->k); +++ convert_bpf_ld_abs(&tmp, &insn); +++ insn++; ++ /* A &= 0xf */ ++ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, 0xf); ++ /* A <<= 2 */ ++ *insn++ = BPF_ALU32_IMM(BPF_LSH, BPF_REG_A, 2); +++ /* tmp = X */ +++ *insn++ = BPF_MOV64_REG(BPF_REG_TMP, BPF_REG_X); ++ /* X = A */ ++ *insn++ = BPF_MOV64_REG(BPF_REG_X, BPF_REG_A); ++ /* A = tmp */ ++ *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); ++ break; ++- ++- /* RET_K, RET_A are remaped into 2 insns. */ +++ } +++ /* RET_K is remaped into 2 insns. RET_A case doesn't need an +++ * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. +++ */ ++ case BPF_RET | BPF_A: ++ case BPF_RET | BPF_K: ++- *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ? ++- BPF_K : BPF_X, BPF_REG_0, ++- BPF_REG_A, fp->k); +++ if (BPF_RVAL(fp->code) == BPF_K) +++ *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0, +++ 0, fp->k); ++ *insn = BPF_EXIT_INSN(); ++ break; ++ ++ /* Store to stack. */ ++ case BPF_ST: ++ case BPF_STX: +++ stack_off = fp->k * 4 + 4; ++ *insn = BPF_STX_MEM(BPF_W, BPF_REG_FP, BPF_CLASS(fp->code) == ++ BPF_ST ? BPF_REG_A : BPF_REG_X, ++- -(BPF_MEMWORDS - fp->k) * 4); +++ -stack_off); +++ /* check_load_and_stores() verifies that classic BPF can +++ * load from stack only after write, so tracking +++ * stack_depth for ST|STX insns is enough +++ */ +++ if (new_prog && new_prog->aux->stack_depth < stack_off) +++ new_prog->aux->stack_depth = stack_off; ++ break; ++ ++ /* Load from stack. */ ++ case BPF_LD | BPF_MEM: ++ case BPF_LDX | BPF_MEM: +++ stack_off = fp->k * 4 + 4; ++ *insn = BPF_LDX_MEM(BPF_W, BPF_CLASS(fp->code) == BPF_LD ? ++ BPF_REG_A : BPF_REG_X, BPF_REG_FP, ++- -(BPF_MEMWORDS - fp->k) * 4); +++ -stack_off); ++ break; ++ ++ /* A = K or X = K */ ++@@ -592,13 +842,15 @@ do_pass: ++ ++ if (!new_prog) { ++ /* Only calculating new length. 
*/ ++- *new_len = new_insn - new_prog; +++ *new_len = new_insn - first_insn; +++ if (*seen_ld_abs) +++ *new_len += 4; /* Prologue bits. */ ++ return 0; ++ } ++ ++ pass++; ++- if (new_flen != new_insn - new_prog) { ++- new_flen = new_insn - new_prog; +++ if (new_flen != new_insn - first_insn) { +++ new_flen = new_insn - first_insn; ++ if (pass > 2) ++ goto err; ++ goto do_pass; ++@@ -738,6 +990,17 @@ static bool chk_code_allowed(u16 code_to ++ return codes[code_to_probe]; ++ } ++ +++static bool bpf_check_basics_ok(const struct sock_filter *filter, +++ unsigned int flen) +++{ +++ if (filter == NULL) +++ return false; +++ if (flen == 0 || flen > BPF_MAXINSNS) +++ return false; +++ +++ return true; +++} +++ ++ /** ++ * bpf_check_classic - verify socket filter code ++ * @filter: filter to verify ++@@ -758,9 +1021,6 @@ static int bpf_check_classic(const struc ++ bool anc_found; ++ int pc; ++ ++- if (flen == 0 || flen > BPF_MAXINSNS) ++- return -EINVAL; ++- ++ /* Check the filter code now */ ++ for (pc = 0; pc < flen; pc++) { ++ const struct sock_filter *ftest = &filter[pc]; ++@@ -901,7 +1161,7 @@ static void sk_filter_release_rcu(struct ++ */ ++ static void sk_filter_release(struct sk_filter *fp) ++ { ++- if (atomic_dec_and_test(&fp->refcnt)) +++ if (refcount_dec_and_test(&fp->refcnt)) ++ call_rcu(&fp->rcu, sk_filter_release_rcu); ++ } ++ ++@@ -916,25 +1176,37 @@ void sk_filter_uncharge(struct sock *sk, ++ /* try to charge the socket memory if there is space available ++ * return true on success ++ */ ++-bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) +++static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) ++ { ++ u32 filter_size = bpf_prog_size(fp->prog->len); ++ ++ /* same check as in sock_kmalloc() */ ++ if (filter_size <= sysctl_optmem_max && ++ atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { ++- atomic_inc(&fp->refcnt); ++ atomic_add(filter_size, &sk->sk_omem_alloc); ++ return true; ++ } ++ return false; ++ } ++ +++bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) +++{ +++ if (!refcount_inc_not_zero(&fp->refcnt)) +++ return false; +++ +++ if (!__sk_filter_charge(sk, fp)) { +++ sk_filter_release(fp); +++ return false; +++ } +++ return true; +++} +++ ++ static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) ++ { ++ struct sock_filter *old_prog; ++ struct bpf_prog *old_fp; ++ int err, new_len, old_len = fp->len; +++ bool seen_ld_abs = false; ++ ++ /* We are free to overwrite insns et al right here as it ++ * won't be used at this point in time anymore internally ++@@ -956,7 +1228,8 @@ static struct bpf_prog *bpf_migrate_filt ++ } ++ ++ /* 1st pass: calculate the new program length. */ ++- err = bpf_convert_filter(old_prog, old_len, NULL, &new_len); +++ err = bpf_convert_filter(old_prog, old_len, NULL, &new_len, +++ &seen_ld_abs); ++ if (err) ++ goto out_err_free; ++ ++@@ -975,7 +1248,8 @@ static struct bpf_prog *bpf_migrate_filt ++ fp->len = new_len; ++ ++ /* 2nd pass: remap sock_filter insns into bpf_insn insns. */ ++- err = bpf_convert_filter(old_prog, old_len, fp->insnsi, &new_len); +++ err = bpf_convert_filter(old_prog, old_len, fp, &new_len, +++ &seen_ld_abs); ++ if (err) ++ /* 2nd bpf_convert_filter() can fail only if it fails ++ * to allocate memory, remapping must succeed. 
Note, ++@@ -984,7 +1258,9 @@ static struct bpf_prog *bpf_migrate_filt ++ */ ++ goto out_err_free; ++ ++- bpf_prog_select_runtime(fp); +++ fp = bpf_prog_select_runtime(fp, &err); +++ if (err) +++ goto out_err_free; ++ ++ kfree(old_prog); ++ return fp; ++@@ -1051,7 +1327,7 @@ int bpf_prog_create(struct bpf_prog **pf ++ struct bpf_prog *fp; ++ ++ /* Make sure new filter is there and in the right amounts. */ ++- if (fprog->filter == NULL) +++ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) ++ return -EINVAL; ++ ++ fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); ++@@ -1098,7 +1374,7 @@ int bpf_prog_create_from_user(struct bpf ++ int err; ++ ++ /* Make sure new filter is there and in the right amounts. */ ++- if (fprog->filter == NULL) +++ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) ++ return -EINVAL; ++ ++ fp = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); ++@@ -1139,8 +1415,7 @@ void bpf_prog_destroy(struct bpf_prog *f ++ } ++ EXPORT_SYMBOL_GPL(bpf_prog_destroy); ++ ++-static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk, ++- bool locked) +++static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) ++ { ++ struct sk_filter *fp, *old_fp; ++ ++@@ -1149,53 +1424,44 @@ static int __sk_attach_prog(struct bpf_p ++ return -ENOMEM; ++ ++ fp->prog = prog; ++- atomic_set(&fp->refcnt, 0); ++ ++- if (!sk_filter_charge(sk, fp)) { +++ if (!__sk_filter_charge(sk, fp)) { ++ kfree(fp); ++ return -ENOMEM; ++ } +++ refcount_set(&fp->refcnt, 1); ++ ++- old_fp = rcu_dereference_protected(sk->sk_filter, locked); +++ old_fp = rcu_dereference_protected(sk->sk_filter, +++ lockdep_sock_is_held(sk)); ++ rcu_assign_pointer(sk->sk_filter, fp); +++ ++ if (old_fp) ++ sk_filter_uncharge(sk, old_fp); ++ ++ return 0; ++ } ++ ++-/** ++- * sk_attach_filter - attach a socket filter ++- * @fprog: the filter program ++- * @sk: the socket to use ++- * ++- * Attach the user's filter code. We first run some sanity checks on ++- * it to make sure it does not explode on us later. If an error ++- * occurs or there is insufficient memory for the filter a negative ++- * errno code is returned. On success the return is zero. ++- */ ++-int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk, ++- bool locked) +++static +++struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk) ++ { ++ unsigned int fsize = bpf_classic_proglen(fprog); ++- unsigned int bpf_fsize = bpf_prog_size(fprog->len); ++ struct bpf_prog *prog; ++ int err; ++ ++ if (sock_flag(sk, SOCK_FILTER_LOCKED)) ++- return -EPERM; +++ return ERR_PTR(-EPERM); ++ ++ /* Make sure new filter is there and in the right amounts. */ ++- if (fprog->filter == NULL) ++- return -EINVAL; +++ if (!bpf_check_basics_ok(fprog->filter, fprog->len)) +++ return ERR_PTR(-EINVAL); ++ ++- prog = bpf_prog_alloc(bpf_fsize, 0); +++ prog = bpf_prog_alloc(bpf_prog_size(fprog->len), 0); ++ if (!prog) ++- return -ENOMEM; +++ return ERR_PTR(-ENOMEM); ++ ++ if (copy_from_user(prog->insns, fprog->filter, fsize)) { ++ __bpf_prog_free(prog); ++- return -EFAULT; +++ return ERR_PTR(-EINVAL); ++ } ++ ++ prog->len = fprog->len; ++@@ -1203,17 +1469,34 @@ int __sk_attach_filter(struct sock_fprog ++ err = bpf_prog_store_orig_filter(prog, fprog); ++ if (err) { ++ __bpf_prog_free(prog); ++- return -ENOMEM; +++ return ERR_PTR(-ENOMEM); ++ } ++ ++ /* bpf_prepare_filter() already takes care of freeing ++ * memory in case something goes wrong. 
++ */ ++- prog = bpf_prepare_filter(prog, NULL); +++ return bpf_prepare_filter(prog, NULL); +++} +++ +++/** +++ * sk_attach_filter - attach a socket filter +++ * @fprog: the filter program +++ * @sk: the socket to use +++ * +++ * Attach the user's filter code. We first run some sanity checks on +++ * it to make sure it does not explode on us later. If an error +++ * occurs or there is insufficient memory for the filter a negative +++ * errno code is returned. On success the return is zero. +++ */ +++int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +++{ +++ struct bpf_prog *prog = __get_filter(fprog, sk); +++ int err; +++ ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++- err = __sk_attach_prog(prog, sk, locked); +++ err = __sk_attach_prog(prog, sk); ++ if (err < 0) { ++ __bpf_prog_release(prog); ++ return err; ++@@ -1221,31 +1504,25 @@ int __sk_attach_filter(struct sock_fprog ++ ++ return 0; ++ } ++-EXPORT_SYMBOL_GPL(__sk_attach_filter); +++EXPORT_SYMBOL_GPL(sk_attach_filter); ++ ++-int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +++static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk) ++ { ++- return __sk_attach_filter(fprog, sk, sock_owned_by_user(sk)); +++ if (sock_flag(sk, SOCK_FILTER_LOCKED)) +++ return ERR_PTR(-EPERM); +++ +++ return bpf_prog_get_type(ufd, BPF_PROG_TYPE_SOCKET_FILTER); ++ } ++ ++ int sk_attach_bpf(u32 ufd, struct sock *sk) ++ { ++- struct bpf_prog *prog; +++ struct bpf_prog *prog = __get_bpf(ufd, sk); ++ int err; ++ ++- if (sock_flag(sk, SOCK_FILTER_LOCKED)) ++- return -EPERM; ++- ++- prog = bpf_prog_get(ufd); ++ if (IS_ERR(prog)) ++ return PTR_ERR(prog); ++ ++- if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { ++- bpf_prog_put(prog); ++- return -EINVAL; ++- } ++- ++- err = __sk_attach_prog(prog, sk, sock_owned_by_user(sk)); +++ err = __sk_attach_prog(prog, sk); ++ if (err < 0) { ++ bpf_prog_put(prog); ++ return err; ++@@ -1254,79 +1531,201 @@ int sk_attach_bpf(u32 ufd, struct sock * ++ return 0; ++ } ++ ++-#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1) +++struct bpf_scratchpad { +++ union { +++ __be32 diff[MAX_BPF_STACK / sizeof(__be32)]; +++ u8 buff[MAX_BPF_STACK]; +++ }; +++}; +++ +++static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp); +++ +++static inline int __bpf_try_make_writable(struct sk_buff *skb, +++ unsigned int write_len) +++{ +++ return skb_ensure_writable(skb, write_len); +++} +++ +++static inline int bpf_try_make_writable(struct sk_buff *skb, +++ unsigned int write_len) +++{ +++ int err = __bpf_try_make_writable(skb, write_len); +++ +++ bpf_compute_data_pointers(skb); +++ return err; +++} +++ +++static int bpf_try_make_head_writable(struct sk_buff *skb) +++{ +++ return bpf_try_make_writable(skb, skb_headlen(skb)); +++} ++ ++-static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags) +++static inline void bpf_push_mac_rcsum(struct sk_buff *skb) +++{ +++ if (skb_at_tc_ingress(skb)) +++ skb_postpush_rcsum(skb, skb_mac_header(skb), skb->mac_len); +++} +++ +++static inline void bpf_pull_mac_rcsum(struct sk_buff *skb) +++{ +++ if (skb_at_tc_ingress(skb)) +++ skb_postpull_rcsum(skb, skb_mac_header(skb), skb->mac_len); +++} +++ +++BPF_CALL_5(bpf_skb_store_bytes, struct sk_buff *, skb, u32, offset, +++ const void *, from, u32, len, u64, flags) ++ { ++- struct sk_buff *skb = (struct sk_buff *) (long) r1; ++- int offset = (int) r2; ++- void *from = (void *) (long) r3; ++- unsigned int len = (unsigned int) r4; ++- char buf[16]; ++ void *ptr; ++ ++- /* bpf verifier guarantees that: ++- * 'from' pointer points 
to bpf program stack ++- * 'len' bytes of it were initialized ++- * 'len' > 0 ++- * 'skb' is a valid pointer to 'struct sk_buff' ++- * ++- * so check for invalid 'offset' and too large 'len' ++- */ ++- if (unlikely((u32) offset > 0xffff || len > sizeof(buf))) +++ if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH))) +++ return -EINVAL; +++ if (unlikely(offset > 0xffff)) ++ return -EFAULT; ++- if (unlikely(skb_try_make_writable(skb, offset + len))) +++ if (unlikely(bpf_try_make_writable(skb, offset + len))) ++ return -EFAULT; ++ ++- ptr = skb_header_pointer(skb, offset, len, buf); ++- if (unlikely(!ptr)) ++- return -EFAULT; ++- ++- if (BPF_RECOMPUTE_CSUM(flags)) ++- skb_postpull_rcsum(skb, ptr, len); +++ ptr = skb->data + offset; +++ if (flags & BPF_F_RECOMPUTE_CSUM) +++ __skb_postpull_rcsum(skb, ptr, len, offset); ++ ++ memcpy(ptr, from, len); ++ ++- if (ptr == buf) ++- /* skb_store_bits cannot return -EFAULT here */ ++- skb_store_bits(skb, offset, ptr, len); +++ if (flags & BPF_F_RECOMPUTE_CSUM) +++ __skb_postpush_rcsum(skb, ptr, len, offset); +++ if (flags & BPF_F_INVALIDATE_HASH) +++ skb_clear_hash(skb); ++ ++- if (BPF_RECOMPUTE_CSUM(flags) && skb->ip_summed == CHECKSUM_COMPLETE) ++- skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0)); ++ return 0; ++ } ++ ++-const struct bpf_func_proto bpf_skb_store_bytes_proto = { +++static const struct bpf_func_proto bpf_skb_store_bytes_proto = { ++ .func = bpf_skb_store_bytes, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_PTR_TO_CTX, ++ .arg2_type = ARG_ANYTHING, ++- .arg3_type = ARG_PTR_TO_STACK, ++- .arg4_type = ARG_CONST_STACK_SIZE, +++ .arg3_type = ARG_PTR_TO_MEM, +++ .arg4_type = ARG_CONST_SIZE, ++ .arg5_type = ARG_ANYTHING, ++ }; ++ ++-#define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f) ++-#define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10) +++BPF_CALL_4(bpf_skb_load_bytes, const struct sk_buff *, skb, u32, offset, +++ void *, to, u32, len) +++{ +++ void *ptr; +++ +++ if (unlikely(offset > 0xffff)) +++ goto err_clear; +++ +++ ptr = skb_header_pointer(skb, offset, len, to); +++ if (unlikely(!ptr)) +++ goto err_clear; +++ if (ptr != to) +++ memcpy(to, ptr, len); ++ ++-static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +++ return 0; +++err_clear: +++ memset(to, 0, len); +++ return -EFAULT; +++} +++ +++static const struct bpf_func_proto bpf_skb_load_bytes_proto = { +++ .func = bpf_skb_load_bytes, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_ANYTHING, +++ .arg3_type = ARG_PTR_TO_UNINIT_MEM, +++ .arg4_type = ARG_CONST_SIZE, +++}; +++ +++BPF_CALL_5(bpf_skb_load_bytes_relative, const struct sk_buff *, skb, +++ u32, offset, void *, to, u32, len, u32, start_header) ++ { ++- struct sk_buff *skb = (struct sk_buff *) (long) r1; ++- int offset = (int) r2; ++- __sum16 sum, *ptr; +++ u8 *end = skb_tail_pointer(skb); +++ u8 *start, *ptr; ++ ++- if (unlikely((u32) offset > 0xffff)) ++- return -EFAULT; +++ if (unlikely(offset > 0xffff)) +++ goto err_clear; ++ ++- if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) ++- return -EFAULT; +++ switch (start_header) { +++ case BPF_HDR_START_MAC: +++ if (unlikely(!skb_mac_header_was_set(skb))) +++ goto err_clear; +++ start = skb_mac_header(skb); +++ break; +++ case BPF_HDR_START_NET: +++ start = skb_network_header(skb); +++ break; +++ default: +++ goto err_clear; +++ } ++ ++- ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); ++- if (unlikely(!ptr)) +++ ptr = start + 
offset; +++ +++ if (likely(ptr + len <= end)) { +++ memcpy(to, ptr, len); +++ return 0; +++ } +++ +++err_clear: +++ memset(to, 0, len); +++ return -EFAULT; +++} +++ +++static const struct bpf_func_proto bpf_skb_load_bytes_relative_proto = { +++ .func = bpf_skb_load_bytes_relative, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_ANYTHING, +++ .arg3_type = ARG_PTR_TO_UNINIT_MEM, +++ .arg4_type = ARG_CONST_SIZE, +++ .arg5_type = ARG_ANYTHING, +++}; +++ +++BPF_CALL_2(bpf_skb_pull_data, struct sk_buff *, skb, u32, len) +++{ +++ /* Idea is the following: should the needed direct read/write +++ * test fail during runtime, we can pull in more data and redo +++ * again, since implicitly, we invalidate previous checks here. +++ * +++ * Or, since we know how much we need to make read/writeable, +++ * this can be done once at the program beginning for direct +++ * access case. By this we overcome limitations of only current +++ * headroom being accessible. +++ */ +++ return bpf_try_make_writable(skb, len ? : skb_headlen(skb)); +++} +++ +++static const struct bpf_func_proto bpf_skb_pull_data_proto = { +++ .func = bpf_skb_pull_data, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_ANYTHING, +++}; +++ +++BPF_CALL_5(bpf_l3_csum_replace, struct sk_buff *, skb, u32, offset, +++ u64, from, u64, to, u64, flags) +++{ +++ __sum16 *ptr; +++ +++ if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK))) +++ return -EINVAL; +++ if (unlikely(offset > 0xffff || offset & 1)) +++ return -EFAULT; +++ if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) ++ return -EFAULT; ++ ++- switch (BPF_HEADER_FIELD_SIZE(flags)) { +++ ptr = (__sum16 *)(skb->data + offset); +++ switch (flags & BPF_F_HDR_FIELD_MASK) { +++ case 0: +++ if (unlikely(from != 0)) +++ return -EINVAL; +++ +++ csum_replace_by_diff(ptr, to); +++ break; ++ case 2: ++ csum_replace2(ptr, from, to); ++ break; ++@@ -1337,14 +1736,10 @@ static u64 bpf_l3_csum_replace(u64 r1, u ++ return -EINVAL; ++ } ++ ++- if (ptr == &sum) ++- /* skb_store_bits guaranteed to not return -EFAULT here */ ++- skb_store_bits(skb, offset, ptr, sizeof(sum)); ++- ++ return 0; ++ } ++ ++-const struct bpf_func_proto bpf_l3_csum_replace_proto = { +++static const struct bpf_func_proto bpf_l3_csum_replace_proto = { ++ .func = bpf_l3_csum_replace, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++@@ -1355,23 +1750,33 @@ const struct bpf_func_proto bpf_l3_csum_ ++ .arg5_type = ARG_ANYTHING, ++ }; ++ ++-static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags) +++BPF_CALL_5(bpf_l4_csum_replace, struct sk_buff *, skb, u32, offset, +++ u64, from, u64, to, u64, flags) ++ { ++- struct sk_buff *skb = (struct sk_buff *) (long) r1; ++- bool is_pseudo = !!BPF_IS_PSEUDO_HEADER(flags); ++- int offset = (int) r2; ++- __sum16 sum, *ptr; +++ bool is_pseudo = flags & BPF_F_PSEUDO_HDR; +++ bool is_mmzero = flags & BPF_F_MARK_MANGLED_0; +++ bool do_mforce = flags & BPF_F_MARK_ENFORCE; +++ __sum16 *ptr; ++ ++- if (unlikely((u32) offset > 0xffff)) +++ if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_MARK_ENFORCE | +++ BPF_F_PSEUDO_HDR | BPF_F_HDR_FIELD_MASK))) +++ return -EINVAL; +++ if (unlikely(offset > 0xffff || offset & 1)) ++ return -EFAULT; ++- if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum)))) +++ if (unlikely(bpf_try_make_writable(skb, offset + sizeof(*ptr)))) ++ return -EFAULT; ++ ++- ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum); ++- if 
(unlikely(!ptr)) ++- return -EFAULT; +++ ptr = (__sum16 *)(skb->data + offset); +++ if (is_mmzero && !do_mforce && !*ptr) +++ return 0; ++ ++- switch (BPF_HEADER_FIELD_SIZE(flags)) { +++ switch (flags & BPF_F_HDR_FIELD_MASK) { +++ case 0: +++ if (unlikely(from != 0)) +++ return -EINVAL; +++ +++ inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo); +++ break; ++ case 2: ++ inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo); ++ break; ++@@ -1382,14 +1787,12 @@ static u64 bpf_l4_csum_replace(u64 r1, u ++ return -EINVAL; ++ } ++ ++- if (ptr == &sum) ++- /* skb_store_bits guaranteed to not return -EFAULT here */ ++- skb_store_bits(skb, offset, ptr, sizeof(sum)); ++- +++ if (is_mmzero && !*ptr) +++ *ptr = CSUM_MANGLED_0; ++ return 0; ++ } ++ ++-const struct bpf_func_proto bpf_l4_csum_replace_proto = { +++static const struct bpf_func_proto bpf_l4_csum_replace_proto = { ++ .func = bpf_l4_csum_replace, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++@@ -1400,30 +1803,172 @@ const struct bpf_func_proto bpf_l4_csum_ ++ .arg5_type = ARG_ANYTHING, ++ }; ++ ++-#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1) +++BPF_CALL_5(bpf_csum_diff, __be32 *, from, u32, from_size, +++ __be32 *, to, u32, to_size, __wsum, seed) +++{ +++ struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp); +++ u32 diff_size = from_size + to_size; +++ int i, j = 0; +++ +++ /* This is quite flexible, some examples: +++ * +++ * from_size == 0, to_size > 0, seed := csum --> pushing data +++ * from_size > 0, to_size == 0, seed := csum --> pulling data +++ * from_size > 0, to_size > 0, seed := 0 --> diffing data +++ * +++ * Even for diffing, from_size and to_size don't need to be equal. +++ */ +++ if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) || +++ diff_size > sizeof(sp->diff))) +++ return -EINVAL; +++ +++ for (i = 0; i < from_size / sizeof(__be32); i++, j++) +++ sp->diff[j] = ~from[i]; +++ for (i = 0; i < to_size / sizeof(__be32); i++, j++) +++ sp->diff[j] = to[i]; +++ +++ return csum_partial(sp->diff, diff_size, seed); +++} +++ +++static const struct bpf_func_proto bpf_csum_diff_proto = { +++ .func = bpf_csum_diff, +++ .gpl_only = false, +++ .pkt_access = true, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_MEM_OR_NULL, +++ .arg2_type = ARG_CONST_SIZE_OR_ZERO, +++ .arg3_type = ARG_PTR_TO_MEM_OR_NULL, +++ .arg4_type = ARG_CONST_SIZE_OR_ZERO, +++ .arg5_type = ARG_ANYTHING, +++}; +++ +++BPF_CALL_2(bpf_csum_update, struct sk_buff *, skb, __wsum, csum) +++{ +++ /* The interface is to be used in combination with bpf_csum_diff() +++ * for direct packet writes. csum rotation for alignment as well +++ * as emulating csum_sub() can be done from the eBPF program. 
+++ */ +++ if (skb->ip_summed == CHECKSUM_COMPLETE) +++ return (skb->csum = csum_add(skb->csum, csum)); +++ +++ return -ENOTSUPP; +++} +++ +++static const struct bpf_func_proto bpf_csum_update_proto = { +++ .func = bpf_csum_update, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_ANYTHING, +++}; +++ +++static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) +++{ +++ return dev_forward_skb(dev, skb); +++} +++ +++static inline int __bpf_rx_skb_no_mac(struct net_device *dev, +++ struct sk_buff *skb) +++{ +++ int ret = ____dev_forward_skb(dev, skb); +++ +++ if (likely(!ret)) { +++ skb->dev = dev; +++ ret = netif_rx(skb); +++ } +++ +++ return ret; +++} +++ +++static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) +++{ +++ int ret; +++ +++ skb->dev = dev; +++ skb->tstamp.tv64 = 0; +++ +++ ret = dev_queue_xmit(skb); +++ +++ return ret; +++} +++ +++static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, +++ u32 flags) +++{ +++ unsigned int mlen = skb_network_offset(skb); +++ +++ if (mlen) { +++ __skb_pull(skb, mlen); +++ +++ /* At ingress, the mac header has already been pulled once. +++ * At egress, skb_pospull_rcsum has to be done in case that +++ * the skb is originated from ingress (i.e. a forwarded skb) +++ * to ensure that rcsum starts at net header. +++ */ +++ if (!skb_at_tc_ingress(skb)) +++ skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); +++ } +++ skb_pop_mac_header(skb); +++ skb_reset_mac_len(skb); +++ return flags & BPF_F_INGRESS ? +++ __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); +++} ++ ++-static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5) +++static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, +++ u32 flags) +++{ +++ /* Verify that a link layer header is carried */ +++ if (unlikely(skb->mac_header >= skb->network_header)) { +++ kfree_skb(skb); +++ return -ERANGE; +++ } +++ +++ bpf_push_mac_rcsum(skb); +++ return flags & BPF_F_INGRESS ? +++ __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); +++} +++ +++static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, +++ u32 flags) +++{ +++ if (dev_is_mac_header_xmit(dev)) +++ return __bpf_redirect_common(skb, dev, flags); +++ else +++ return __bpf_redirect_no_mac(skb, dev, flags); +++} +++ +++BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) ++ { ++- struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2; ++ struct net_device *dev; +++ struct sk_buff *clone; +++ int ret; +++ +++ if (unlikely(flags & ~(BPF_F_INGRESS))) +++ return -EINVAL; ++ ++ dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); ++ if (unlikely(!dev)) ++ return -EINVAL; ++ ++- skb2 = skb_clone(skb, GFP_ATOMIC); ++- if (unlikely(!skb2)) +++ clone = skb_clone(skb, GFP_ATOMIC); +++ if (unlikely(!clone)) ++ return -ENOMEM; ++ ++- if (BPF_IS_REDIRECT_INGRESS(flags)) ++- return dev_forward_skb(dev, skb2); +++ /* For direct write, we need to keep the invariant that the skbs +++ * we're dealing with need to be uncloned. Should uncloning fail +++ * here, we need to free the just generated clone to unclone once +++ * again. 
+++ */ +++ ret = bpf_try_make_head_writable(skb); +++ if (unlikely(ret)) { +++ kfree_skb(clone); +++ return -ENOMEM; +++ } ++ ++- skb2->dev = dev; ++- skb_sender_cpu_clear(skb2); ++- return dev_queue_xmit(skb2); +++ return __bpf_redirect(clone, dev, flags); ++ } ++ ++-const struct bpf_func_proto bpf_clone_redirect_proto = { +++static const struct bpf_func_proto bpf_clone_redirect_proto = { ++ .func = bpf_clone_redirect, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++@@ -1432,42 +1977,38 @@ const struct bpf_func_proto bpf_clone_re ++ .arg3_type = ARG_ANYTHING, ++ }; ++ ++-struct redirect_info { ++- u32 ifindex; ++- u32 flags; ++-}; +++DEFINE_PER_CPU(struct bpf_redirect_info, bpf_redirect_info); +++EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info); ++ ++-static DEFINE_PER_CPU(struct redirect_info, redirect_info); ++-static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5) +++BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) ++ { ++- struct redirect_info *ri = this_cpu_ptr(&redirect_info); +++ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); +++ +++ if (unlikely(flags & ~(BPF_F_INGRESS))) +++ return TC_ACT_SHOT; ++ ++- ri->ifindex = ifindex; ++ ri->flags = flags; +++ ri->tgt_index = ifindex; +++ ++ return TC_ACT_REDIRECT; ++ } ++ ++ int skb_do_redirect(struct sk_buff *skb) ++ { ++- struct redirect_info *ri = this_cpu_ptr(&redirect_info); +++ struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); ++ struct net_device *dev; ++ ++- dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex); ++- ri->ifindex = 0; +++ dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index); +++ ri->tgt_index = 0; ++ if (unlikely(!dev)) { ++ kfree_skb(skb); ++ return -EINVAL; ++ } ++ ++- if (BPF_IS_REDIRECT_INGRESS(ri->flags)) ++- return dev_forward_skb(dev, skb); ++- ++- skb->dev = dev; ++- skb_sender_cpu_clear(skb); ++- return dev_queue_xmit(skb); +++ return __bpf_redirect(skb, dev, ri->flags); ++ } ++ ++-const struct bpf_func_proto bpf_redirect_proto = { +++static const struct bpf_func_proto bpf_redirect_proto = { ++ .func = bpf_redirect, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++@@ -1475,50 +2016,75 @@ const struct bpf_func_proto bpf_redirect ++ .arg2_type = ARG_ANYTHING, ++ }; ++ ++-static u64 bpf_get_cgroup_classid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +++BPF_CALL_1(bpf_get_hash_recalc, struct sk_buff *, skb) ++ { ++- return task_get_classid((struct sk_buff *) (unsigned long) r1); +++ /* If skb_clear_hash() was called due to mangling, we can +++ * trigger SW recalculation here. Later access to hash +++ * can then use the inline skb->hash via context directly +++ * instead of calling this helper again. +++ */ +++ return skb_get_hash(skb); ++ } ++ ++-static const struct bpf_func_proto bpf_get_cgroup_classid_proto = { ++- .func = bpf_get_cgroup_classid, ++- .gpl_only = false, ++- .ret_type = RET_INTEGER, ++- .arg1_type = ARG_PTR_TO_CTX, +++static const struct bpf_func_proto bpf_get_hash_recalc_proto = { +++ .func = bpf_get_hash_recalc, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, ++ }; ++ ++-static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +++BPF_CALL_1(bpf_set_hash_invalid, struct sk_buff *, skb) ++ { ++-#ifdef CONFIG_IP_ROUTE_CLASSID ++- const struct dst_entry *dst; +++ /* After all direct packet write, this can be used once for +++ * triggering a lazy recalc on next skb_get_hash() invocation. 
+++ */ +++ skb_clear_hash(skb); +++ return 0; +++} ++ ++- dst = skb_dst((struct sk_buff *) (unsigned long) r1); ++- if (dst) ++- return dst->tclassid; ++-#endif +++static const struct bpf_func_proto bpf_set_hash_invalid_proto = { +++ .func = bpf_set_hash_invalid, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++}; +++ +++BPF_CALL_2(bpf_set_hash, struct sk_buff *, skb, u32, hash) +++{ +++ /* Set user specified hash as L4(+), so that it gets returned +++ * on skb_get_hash() call unless BPF prog later on triggers a +++ * skb_clear_hash(). +++ */ +++ __skb_set_sw_hash(skb, hash, true); ++ return 0; ++ } ++ ++-static const struct bpf_func_proto bpf_get_route_realm_proto = { ++- .func = bpf_get_route_realm, ++- .gpl_only = false, ++- .ret_type = RET_INTEGER, ++- .arg1_type = ARG_PTR_TO_CTX, +++static const struct bpf_func_proto bpf_set_hash_proto = { +++ .func = bpf_set_hash, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_ANYTHING, ++ }; ++ ++-static u64 bpf_skb_vlan_push(u64 r1, u64 r2, u64 vlan_tci, u64 r4, u64 r5) +++BPF_CALL_3(bpf_skb_vlan_push, struct sk_buff *, skb, __be16, vlan_proto, +++ u16, vlan_tci) ++ { ++- struct sk_buff *skb = (struct sk_buff *) (long) r1; ++- __be16 vlan_proto = (__force __be16) r2; +++ int ret; ++ ++ if (unlikely(vlan_proto != htons(ETH_P_8021Q) && ++ vlan_proto != htons(ETH_P_8021AD))) ++ vlan_proto = htons(ETH_P_8021Q); ++ ++- return skb_vlan_push(skb, vlan_proto, vlan_tci); +++ bpf_push_mac_rcsum(skb); +++ ret = skb_vlan_push(skb, vlan_proto, vlan_tci); +++ bpf_pull_mac_rcsum(skb); +++ +++ bpf_compute_data_pointers(skb); +++ return ret; ++ } ++ ++-const struct bpf_func_proto bpf_skb_vlan_push_proto = { +++static const struct bpf_func_proto bpf_skb_vlan_push_proto = { ++ .func = bpf_skb_vlan_push, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++@@ -1526,116 +2092,401 @@ const struct bpf_func_proto bpf_skb_vlan ++ .arg2_type = ARG_ANYTHING, ++ .arg3_type = ARG_ANYTHING, ++ }; ++-EXPORT_SYMBOL_GPL(bpf_skb_vlan_push_proto); ++ ++-static u64 bpf_skb_vlan_pop(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +++BPF_CALL_1(bpf_skb_vlan_pop, struct sk_buff *, skb) ++ { ++- struct sk_buff *skb = (struct sk_buff *) (long) r1; +++ int ret; +++ +++ bpf_push_mac_rcsum(skb); +++ ret = skb_vlan_pop(skb); +++ bpf_pull_mac_rcsum(skb); ++ ++- return skb_vlan_pop(skb); +++ bpf_compute_data_pointers(skb); +++ return ret; ++ } ++ ++-const struct bpf_func_proto bpf_skb_vlan_pop_proto = { +++static const struct bpf_func_proto bpf_skb_vlan_pop_proto = { ++ .func = bpf_skb_vlan_pop, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_PTR_TO_CTX, ++ }; ++-EXPORT_SYMBOL_GPL(bpf_skb_vlan_pop_proto); ++ ++-bool bpf_helper_changes_skb_data(void *func) +++BPF_CALL_2(bpf_skb_change_type, struct sk_buff *, skb, u32, pkt_type) ++ { ++- if (func == bpf_skb_vlan_push) ++- return true; ++- if (func == bpf_skb_vlan_pop) ++- return true; ++- if (func == bpf_skb_store_bytes) ++- return true; ++- if (func == bpf_l3_csum_replace) ++- return true; ++- if (func == bpf_l4_csum_replace) ++- return true; +++ /* We only allow a restricted subset to be changed for now. 
*/ +++ if (unlikely(!skb_pkt_type_ok(skb->pkt_type) || +++ !skb_pkt_type_ok(pkt_type))) +++ return -EINVAL; ++ ++- return false; +++ skb->pkt_type = pkt_type; +++ return 0; +++} +++ +++static const struct bpf_func_proto bpf_skb_change_type_proto = { +++ .func = bpf_skb_change_type, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_ANYTHING, +++}; +++ +++#define BPF_F_ADJ_ROOM_ENCAP_L3_MASK (BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 | \ +++ BPF_F_ADJ_ROOM_ENCAP_L3_IPV6) +++ +++#define BPF_F_ADJ_ROOM_MASK (BPF_F_ADJ_ROOM_FIXED_GSO | \ +++ BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ +++ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ +++ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ +++ BPF_F_ADJ_ROOM_ENCAP_L2( \ +++ BPF_ADJ_ROOM_ENCAP_L2_MASK)) +++ +++#define BPF_SKB_MAX_LEN SKB_MAX_ALLOC +++ +++static u32 __bpf_skb_min_len(const struct sk_buff *skb) +++{ +++ u32 min_len = skb_network_offset(skb); +++ +++ if (skb_transport_header_was_set(skb)) +++ min_len = skb_transport_offset(skb); +++ if (skb->ip_summed == CHECKSUM_PARTIAL) +++ min_len = skb_checksum_start_offset(skb) + +++ skb->csum_offset + sizeof(__sum16); +++ return min_len; ++ } ++ ++-static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +++static int bpf_skb_grow_rcsum(struct sk_buff *skb, unsigned int new_len) ++ { ++- struct sk_buff *skb = (struct sk_buff *) (long) r1; ++- struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2; ++- struct ip_tunnel_info *info = skb_tunnel_info(skb); +++ unsigned int old_len = skb->len; +++ int ret; ++ ++- if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info)) +++ ret = __skb_grow_rcsum(skb, new_len); +++ if (!ret) +++ memset(skb->data + old_len, 0, new_len - old_len); +++ return ret; +++} +++ +++static int bpf_skb_trim_rcsum(struct sk_buff *skb, unsigned int new_len) +++{ +++ return __skb_trim_rcsum(skb, new_len); +++} +++ +++static inline int __bpf_skb_change_tail(struct sk_buff *skb, u32 new_len, +++ u64 flags) +++{ +++ u32 max_len = BPF_SKB_MAX_LEN; +++ u32 min_len = __bpf_skb_min_len(skb); +++ int ret; +++ +++ if (unlikely(flags || new_len > max_len || new_len < min_len)) ++ return -EINVAL; ++- if (ip_tunnel_info_af(info) != AF_INET) +++ if (skb->encapsulation) +++ return -ENOTSUPP; +++ +++ /* The basic idea of this helper is that it's performing the +++ * needed work to either grow or trim an skb, and eBPF program +++ * rewrites the rest via helpers like bpf_skb_store_bytes(), +++ * bpf_lX_csum_replace() and others rather than passing a raw +++ * buffer here. This one is a slow path helper and intended +++ * for replies with control messages. +++ * +++ * Like in bpf_skb_change_proto(), we want to keep this rather +++ * minimal and without protocol specifics so that we are able +++ * to separate concerns as in bpf_skb_store_bytes() should only +++ * be the one responsible for writing buffers. +++ * +++ * It's really expected to be a slow path operation here for +++ * control message replies, so we're implicitly linearizing, +++ * uncloning and drop offloads from the skb by this. 
+++ */ +++ ret = __bpf_try_make_writable(skb, skb->len); +++ if (!ret) { +++ if (new_len > skb->len) +++ ret = bpf_skb_grow_rcsum(skb, new_len); +++ else if (new_len < skb->len) +++ ret = bpf_skb_trim_rcsum(skb, new_len); +++ if (!ret && skb_is_gso(skb)) +++ skb_gso_reset(skb); +++ } +++ return ret; +++} +++ +++BPF_CALL_3(bpf_skb_change_tail, struct sk_buff *, skb, u32, new_len, +++ u64, flags) +++{ +++ int ret = __bpf_skb_change_tail(skb, new_len, flags); +++ +++ bpf_compute_data_pointers(skb); +++ return ret; +++} +++ +++static const struct bpf_func_proto bpf_skb_change_tail_proto = { +++ .func = bpf_skb_change_tail, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_ANYTHING, +++ .arg3_type = ARG_ANYTHING, +++}; +++ +++static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, +++ u64 flags) +++{ +++ u32 max_len = BPF_SKB_MAX_LEN; +++ u32 new_len = skb->len + head_room; +++ int ret; +++ +++ if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || +++ new_len < skb->len)) ++ return -EINVAL; ++ ++- to->tunnel_id = be64_to_cpu(info->key.tun_id); ++- to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src); +++ ret = skb_cow(skb, head_room); +++ if (likely(!ret)) { +++ /* Idea for this helper is that we currently only +++ * allow to expand on mac header. This means that +++ * skb->protocol network header, etc, stay as is. +++ * Compared to bpf_skb_change_tail(), we're more +++ * flexible due to not needing to linearize or +++ * reset GSO. Intention for this helper is to be +++ * used by an L3 skb that needs to push mac header +++ * for redirection into L2 device. +++ */ +++ __skb_push(skb, head_room); +++ memset(skb->data, 0, head_room); +++ skb_reset_mac_header(skb); +++ skb_reset_mac_len(skb); +++ } ++ ++- return 0; +++ return ret; ++ } ++ ++-const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = { ++- .func = bpf_skb_get_tunnel_key, +++BPF_CALL_3(bpf_skb_change_head, struct sk_buff *, skb, u32, head_room, +++ u64, flags) +++{ +++ int ret = __bpf_skb_change_head(skb, head_room, flags); +++ +++ bpf_compute_data_pointers(skb); +++ return ret; +++} +++ +++static const struct bpf_func_proto bpf_skb_change_head_proto = { +++ .func = bpf_skb_change_head, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_PTR_TO_CTX, ++- .arg2_type = ARG_PTR_TO_STACK, ++- .arg3_type = ARG_CONST_STACK_SIZE, ++- .arg4_type = ARG_ANYTHING, +++ .arg2_type = ARG_ANYTHING, +++ .arg3_type = ARG_ANYTHING, ++ }; ++ ++-static struct metadata_dst __percpu *md_dst; +++void bpf_clear_redirect_map(struct bpf_map *map) +++{ +++ struct bpf_redirect_info *ri; +++ int cpu; +++ +++ for_each_possible_cpu(cpu) { +++ ri = per_cpu_ptr(&bpf_redirect_info, cpu); +++ /* Avoid polluting remote cacheline due to writes if +++ * not needed. Once we pass this test, we need the +++ * cmpxchg() to make sure it hasn't been changed in +++ * the meantime by remote CPU. 
+++ */ +++ if (unlikely(READ_ONCE(ri->map) == map)) +++ cmpxchg(&ri->map, map, NULL); +++ } +++} +++ +++static unsigned long bpf_skb_copy(void *dst_buff, const void *skb, +++ unsigned long off, unsigned long len) +++{ +++ void *ptr = skb_header_pointer(skb, off, len, dst_buff); +++ +++ if (unlikely(!ptr)) +++ return len; +++ if (ptr != dst_buff) +++ memcpy(dst_buff, ptr, len); +++ +++ return 0; +++} ++ ++-static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5) +++BPF_CALL_5(bpf_skb_event_output, struct sk_buff *, skb, struct bpf_map *, map, +++ u64, flags, void *, meta, u64, meta_size) ++ { ++- struct sk_buff *skb = (struct sk_buff *) (long) r1; ++- struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2; ++- struct metadata_dst *md = this_cpu_ptr(md_dst); ++- struct ip_tunnel_info *info; +++ u64 skb_size = (flags & BPF_F_CTXLEN_MASK) >> 32; ++ ++- if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags)) +++ if (unlikely(flags & ~(BPF_F_CTXLEN_MASK | BPF_F_INDEX_MASK))) ++ return -EINVAL; +++ if (unlikely(skb_size > skb->len)) +++ return -EFAULT; ++ ++- skb_dst_drop(skb); ++- dst_hold((struct dst_entry *) md); ++- skb_dst_set(skb, (struct dst_entry *) md); +++ return bpf_event_output(map, flags, meta, meta_size, skb, skb_size, +++ bpf_skb_copy); +++} ++ ++- info = &md->u.tun_info; ++- info->mode = IP_TUNNEL_INFO_TX; ++- info->key.tun_flags = TUNNEL_KEY; ++- info->key.tun_id = cpu_to_be64(from->tunnel_id); ++- info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); +++static const struct bpf_func_proto bpf_skb_event_output_proto = { +++ .func = bpf_skb_event_output, +++ .gpl_only = true, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_CONST_MAP_PTR, +++ .arg3_type = ARG_ANYTHING, +++ .arg4_type = ARG_PTR_TO_MEM, +++ .arg5_type = ARG_CONST_SIZE_OR_ZERO, +++}; +++ +++ +++const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; +++EXPORT_SYMBOL_GPL(ipv6_bpf_stub); +++ +++#ifdef CONFIG_XFRM +++BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, +++ struct bpf_xfrm_state *, to, u32, size, u64, flags) +++{ +++ const struct sec_path *sp = skb_sec_path(skb); +++ const struct xfrm_state *x; +++ +++ if (!sp || unlikely(index >= sp->len || flags)) +++ goto err_clear; +++ +++ x = sp->xvec[index]; +++ +++ if (unlikely(size != sizeof(struct bpf_xfrm_state))) +++ goto err_clear; +++ +++ to->reqid = x->props.reqid; +++ to->spi = x->id.spi; +++ to->family = x->props.family; +++ to->ext = 0; +++ +++ if (to->family == AF_INET6) { +++ memcpy(to->remote_ipv6, x->props.saddr.a6, +++ sizeof(to->remote_ipv6)); +++ } else { +++ to->remote_ipv4 = x->props.saddr.a4; +++ memset(&to->remote_ipv6[1], 0, sizeof(__u32) * 3); +++ } ++ ++ return 0; +++err_clear: +++ memset(to, 0, size); +++ return -EINVAL; ++ } ++ ++-const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = { ++- .func = bpf_skb_set_tunnel_key, +++static const struct bpf_func_proto bpf_skb_get_xfrm_state_proto = { +++ .func = bpf_skb_get_xfrm_state, ++ .gpl_only = false, ++ .ret_type = RET_INTEGER, ++ .arg1_type = ARG_PTR_TO_CTX, ++- .arg2_type = ARG_PTR_TO_STACK, ++- .arg3_type = ARG_CONST_STACK_SIZE, ++- .arg4_type = ARG_ANYTHING, +++ .arg2_type = ARG_ANYTHING, +++ .arg3_type = ARG_PTR_TO_UNINIT_MEM, +++ .arg4_type = ARG_CONST_SIZE, +++ .arg5_type = ARG_ANYTHING, ++ }; +++#endif +++ ++ ++-static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void) +++#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) +++static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, 
u32 len, +++ bool ingress) ++ { ++- if (!md_dst) { ++- /* race is not possible, since it's called from ++- * verifier that is holding verifier mutex ++- */ ++- md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL); ++- if (!md_dst) ++- return NULL; +++ return bpf_lwt_push_ip_encap(skb, hdr, len, ingress); +++} +++#endif +++ +++BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, +++ u32, len) +++{ +++ switch (type) { +++#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) +++ case BPF_LWT_ENCAP_IP: +++ return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); +++#endif +++ default: +++ return -EINVAL; ++ } ++- return &bpf_skb_set_tunnel_key_proto; +++} +++ +++BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, +++ void *, hdr, u32, len) +++{ +++ switch (type) { +++#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) +++ case BPF_LWT_ENCAP_IP: +++ return bpf_push_ip_encap(skb, hdr, len, false /* egress */); +++#endif +++ default: +++ return -EINVAL; +++ } +++} +++ +++static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { +++ .func = bpf_lwt_in_push_encap, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_ANYTHING, +++ .arg3_type = ARG_PTR_TO_MEM, +++ .arg4_type = ARG_CONST_SIZE +++}; +++ +++static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { +++ .func = bpf_lwt_xmit_push_encap, +++ .gpl_only = false, +++ .ret_type = RET_INTEGER, +++ .arg1_type = ARG_PTR_TO_CTX, +++ .arg2_type = ARG_ANYTHING, +++ .arg3_type = ARG_PTR_TO_MEM, +++ .arg4_type = ARG_CONST_SIZE +++}; +++ +++bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; +++} +++ +++u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, u32 *target_size) +++{ +++ return 0; +++} +++ +++BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) +++{ +++ if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) +++ return (unsigned long)sk; +++ +++ return (unsigned long)NULL; +++} +++ +++const struct bpf_func_proto bpf_tcp_sock_proto = { +++ .func = bpf_tcp_sock, +++ .gpl_only = false, +++ .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, +++ .arg1_type = ARG_PTR_TO_SOCK_COMMON, +++}; +++ +++bool bpf_helper_changes_pkt_data(void *func) +++{ +++ if (func == bpf_skb_vlan_push || +++ func == bpf_skb_vlan_pop || +++ func == bpf_skb_store_bytes || +++ func == bpf_skb_change_head || +++ func == bpf_skb_change_tail || +++ func == bpf_skb_pull_data || +++ func == bpf_clone_redirect || +++ func == bpf_l3_csum_replace || +++ func == bpf_l4_csum_replace || +++ func == bpf_lwt_in_push_encap || +++ func == bpf_lwt_xmit_push_encap) +++ return true; +++ +++ return false; ++ } ++ ++ static const struct bpf_func_proto * ++-sk_filter_func_proto(enum bpf_func_id func_id) +++bpf_base_func_proto(enum bpf_func_id func_id) ++ { ++ switch (func_id) { ++ case BPF_FUNC_map_lookup_elem: ++@@ -1644,283 +2495,1168 @@ sk_filter_func_proto(enum bpf_func_id fu ++ return &bpf_map_update_elem_proto; ++ case BPF_FUNC_map_delete_elem: ++ return &bpf_map_delete_elem_proto; +++ case BPF_FUNC_map_push_elem: +++ return &bpf_map_push_elem_proto; +++ case BPF_FUNC_map_pop_elem: +++ return &bpf_map_pop_elem_proto; +++ case BPF_FUNC_map_peek_elem: +++ return &bpf_map_peek_elem_proto; ++ case BPF_FUNC_get_prandom_u32: ++ return &bpf_get_prandom_u32_proto; ++ case BPF_FUNC_get_smp_processor_id: ++- return &bpf_get_smp_processor_id_proto; 
+++ return &bpf_get_raw_smp_processor_id_proto; +++ case BPF_FUNC_get_numa_node_id: +++ return &bpf_get_numa_node_id_proto; ++ case BPF_FUNC_tail_call: ++ return &bpf_tail_call_proto; ++ case BPF_FUNC_ktime_get_ns: ++ return &bpf_ktime_get_ns_proto; +++ default: +++ break; +++ } +++ +++ if (!capable(CAP_SYS_ADMIN)) +++ return NULL; +++ +++ switch (func_id) { +++ case BPF_FUNC_spin_lock: +++ return &bpf_spin_lock_proto; +++ case BPF_FUNC_spin_unlock: +++ return &bpf_spin_unlock_proto; ++ case BPF_FUNC_trace_printk: ++- if (capable(CAP_SYS_ADMIN)) ++- return bpf_get_trace_printk_proto(); +++ return bpf_get_trace_printk_proto(); ++ default: ++ return NULL; ++ } ++ } ++ ++ static const struct bpf_func_proto * ++-tc_cls_act_func_proto(enum bpf_func_id func_id) +++sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ return bpf_base_func_proto(func_id); +++} +++ +++static const struct bpf_func_proto * +++sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ return bpf_base_func_proto(func_id); +++} +++ +++const struct bpf_func_proto bpf_sk_storage_get_proto __weak; +++const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; +++ +++static const struct bpf_func_proto * +++tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ++ { ++ switch (func_id) { ++ case BPF_FUNC_skb_store_bytes: ++ return &bpf_skb_store_bytes_proto; +++ case BPF_FUNC_skb_load_bytes: +++ return &bpf_skb_load_bytes_proto; +++ case BPF_FUNC_skb_load_bytes_relative: +++ return &bpf_skb_load_bytes_relative_proto; +++ case BPF_FUNC_skb_pull_data: +++ return &bpf_skb_pull_data_proto; +++ case BPF_FUNC_csum_diff: +++ return &bpf_csum_diff_proto; +++ case BPF_FUNC_csum_update: +++ return &bpf_csum_update_proto; ++ case BPF_FUNC_l3_csum_replace: ++ return &bpf_l3_csum_replace_proto; ++ case BPF_FUNC_l4_csum_replace: ++ return &bpf_l4_csum_replace_proto; ++ case BPF_FUNC_clone_redirect: ++ return &bpf_clone_redirect_proto; ++- case BPF_FUNC_get_cgroup_classid: ++- return &bpf_get_cgroup_classid_proto; ++ case BPF_FUNC_skb_vlan_push: ++ return &bpf_skb_vlan_push_proto; ++ case BPF_FUNC_skb_vlan_pop: ++ return &bpf_skb_vlan_pop_proto; ++- case BPF_FUNC_skb_get_tunnel_key: ++- return &bpf_skb_get_tunnel_key_proto; ++- case BPF_FUNC_skb_set_tunnel_key: ++- return bpf_get_skb_set_tunnel_key_proto(); +++ case BPF_FUNC_skb_change_type: +++ return &bpf_skb_change_type_proto; +++ case BPF_FUNC_skb_change_tail: +++ return &bpf_skb_change_tail_proto; +++ case BPF_FUNC_redirect: +++ return &bpf_redirect_proto; +++ case BPF_FUNC_get_hash_recalc: +++ return &bpf_get_hash_recalc_proto; +++ case BPF_FUNC_set_hash_invalid: +++ return &bpf_set_hash_invalid_proto; +++ case BPF_FUNC_set_hash: +++ return &bpf_set_hash_proto; +++ case BPF_FUNC_perf_event_output: +++ return &bpf_skb_event_output_proto; +++ case BPF_FUNC_get_smp_processor_id: +++ return &bpf_get_smp_processor_id_proto; +++#ifdef CONFIG_XFRM +++ case BPF_FUNC_skb_get_xfrm_state: +++ return &bpf_skb_get_xfrm_state_proto; +++#endif +++ default: +++ return bpf_base_func_proto(func_id); +++ } +++} +++ +++static const struct bpf_func_proto * +++xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ switch (func_id) { +++ default: +++ return bpf_base_func_proto(func_id); +++ } +++} +++ +++const struct bpf_func_proto bpf_sock_map_update_proto __weak; +++const struct bpf_func_proto bpf_sock_hash_update_proto __weak; +++ +++static const struct bpf_func_proto * 
+++sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ switch (func_id) { +++ default: +++ return bpf_base_func_proto(func_id); +++ } +++} +++ +++const struct bpf_func_proto bpf_msg_redirect_map_proto __weak; +++const struct bpf_func_proto bpf_msg_redirect_hash_proto __weak; +++ +++static const struct bpf_func_proto * +++sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ switch (func_id) { +++ default: +++ return bpf_base_func_proto(func_id); +++ } +++} +++ +++const struct bpf_func_proto bpf_sk_redirect_map_proto __weak; +++const struct bpf_func_proto bpf_sk_redirect_hash_proto __weak; +++ +++static const struct bpf_func_proto * +++sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ return bpf_base_func_proto(func_id); +++} +++ +++static const struct bpf_func_proto * +++flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ switch (func_id) { +++ default: +++ return bpf_base_func_proto(func_id); +++ } +++} +++ +++static const struct bpf_func_proto * +++lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ switch (func_id) { +++ case BPF_FUNC_skb_load_bytes: +++ return &bpf_skb_load_bytes_proto; +++ case BPF_FUNC_skb_pull_data: +++ return &bpf_skb_pull_data_proto; +++ case BPF_FUNC_csum_diff: +++ return &bpf_csum_diff_proto; +++ case BPF_FUNC_get_hash_recalc: +++ return &bpf_get_hash_recalc_proto; +++ case BPF_FUNC_perf_event_output: +++ return &bpf_skb_event_output_proto; +++ case BPF_FUNC_get_smp_processor_id: +++ return &bpf_get_smp_processor_id_proto; +++ default: +++ return bpf_base_func_proto(func_id); +++ } +++} +++ +++static const struct bpf_func_proto * +++lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ switch (func_id) { +++ case BPF_FUNC_lwt_push_encap: +++ return &bpf_lwt_in_push_encap_proto; +++ default: +++ return lwt_out_func_proto(func_id, prog); +++ } +++} +++ +++static const struct bpf_func_proto * +++lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +++{ +++ switch (func_id) { ++ case BPF_FUNC_redirect: ++ return &bpf_redirect_proto; ++- case BPF_FUNC_get_route_realm: ++- return &bpf_get_route_realm_proto; +++ case BPF_FUNC_clone_redirect: +++ return &bpf_clone_redirect_proto; +++ case BPF_FUNC_skb_change_tail: +++ return &bpf_skb_change_tail_proto; +++ case BPF_FUNC_skb_change_head: +++ return &bpf_skb_change_head_proto; +++ case BPF_FUNC_skb_store_bytes: +++ return &bpf_skb_store_bytes_proto; +++ case BPF_FUNC_csum_update: +++ return &bpf_csum_update_proto; +++ case BPF_FUNC_l3_csum_replace: +++ return &bpf_l3_csum_replace_proto; +++ case BPF_FUNC_l4_csum_replace: +++ return &bpf_l4_csum_replace_proto; +++ case BPF_FUNC_set_hash_invalid: +++ return &bpf_set_hash_invalid_proto; +++ case BPF_FUNC_lwt_push_encap: +++ return &bpf_lwt_xmit_push_encap_proto; ++ default: ++- return sk_filter_func_proto(func_id); +++ return lwt_out_func_proto(func_id, prog); ++ } ++ } ++ ++-static bool __is_valid_access(int off, int size, enum bpf_access_type type) +++static const struct bpf_func_proto * +++lwt_seg6local_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ++ { ++- /* check bounds */ +++ switch (func_id) { +++ default: +++ return lwt_out_func_proto(func_id, prog); +++ } +++} +++ +++static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ +++ const int 
size_default = sizeof(__u32); +++ ++ if (off < 0 || off >= sizeof(struct __sk_buff)) ++ return false; ++ ++- /* disallow misaligned access */ +++ /* The verifier guarantees that size > 0. */ ++ if (off % size != 0) ++ return false; ++ ++- /* all __sk_buff fields are __u32 */ ++- if (size != 4) +++ switch (off) { +++ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): +++ if (off + size > offsetofend(struct __sk_buff, cb[4])) +++ return false; +++ break; +++ case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): +++ case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): +++ case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): +++ case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): +++ case bpf_ctx_range(struct __sk_buff, data): +++ case bpf_ctx_range(struct __sk_buff, data_meta): +++ case bpf_ctx_range(struct __sk_buff, data_end): +++ if (size != size_default) +++ return false; +++ break; +++ case bpf_ctx_range_ptr(struct __sk_buff, flow_keys): ++ return false; +++ case bpf_ctx_range(struct __sk_buff, tstamp): +++ if (size != sizeof(__u64)) +++ return false; +++ break; +++ case offsetof(struct __sk_buff, sk): +++ if (type == BPF_WRITE || size != sizeof(__u64)) +++ return false; +++ info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; +++ break; +++ default: +++ /* Only narrow read access allowed for now. */ +++ if (type == BPF_WRITE) { +++ if (size != size_default) +++ return false; +++ } else { +++ bpf_ctx_record_field_size(info, size_default); +++ if (!bpf_ctx_narrow_access_ok(off, size, size_default)) +++ return false; +++ } +++ } ++ ++ return true; ++ } ++ ++ static bool sk_filter_is_valid_access(int off, int size, ++- enum bpf_access_type type) +++ enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) ++ { ++- if (off == offsetof(struct __sk_buff, tc_classid)) +++ return false; +++} +++ +++static bool lwt_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ +++ switch (off) { +++ case bpf_ctx_range(struct __sk_buff, tc_classid): +++ case bpf_ctx_range_till(struct __sk_buff, family, local_port): +++ case bpf_ctx_range(struct __sk_buff, data_meta): +++ case bpf_ctx_range(struct __sk_buff, tstamp): +++ case bpf_ctx_range(struct __sk_buff, wire_len): ++ return false; +++ } ++ ++ if (type == BPF_WRITE) { ++ switch (off) { ++- case offsetof(struct __sk_buff, cb[0]) ... 
++- offsetof(struct __sk_buff, cb[4]): +++ case bpf_ctx_range(struct __sk_buff, mark): +++ case bpf_ctx_range(struct __sk_buff, priority): +++ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): ++ break; ++ default: ++ return false; ++ } ++ } ++ ++- return __is_valid_access(off, size, type); +++ switch (off) { +++ case bpf_ctx_range(struct __sk_buff, data): +++ info->reg_type = PTR_TO_PACKET; +++ break; +++ case bpf_ctx_range(struct __sk_buff, data_end): +++ info->reg_type = PTR_TO_PACKET_END; +++ break; +++ } +++ +++ return bpf_skb_is_valid_access(off, size, type, prog, info); ++ } ++ ++-static bool tc_cls_act_is_valid_access(int off, int size, ++- enum bpf_access_type type) +++ +++bool bpf_sock_common_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ struct bpf_insn_access_aux *info) +++{ +++ switch (off) { +++ case bpf_ctx_range_till(struct bpf_sock, type, priority): +++ return false; +++ default: +++ return bpf_sock_is_valid_access(off, size, type, info); +++ } +++} +++ +++bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; +++} +++ +++static bool sock_filter_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; +++} +++ +++static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, +++ const struct bpf_prog *prog) +++{ +++ /* Neither direct read nor direct write requires any preliminary +++ * action. +++ */ +++ return 0; +++} +++ +++static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, +++ const struct bpf_prog *prog, int drop_verdict) +++{ +++ struct bpf_insn *insn = insn_buf; +++ +++ if (!direct_write) +++ return 0; +++ +++ /* if (!skb->cloned) +++ * goto start; +++ * +++ * (Fast-path, otherwise approximation that we might be +++ * a clone, do the rest in helper.) +++ */ +++ *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); +++ *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); +++ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); +++ +++ /* ret = bpf_skb_pull_data(skb, 0); */ +++ *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); +++ *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); +++ *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, +++ BPF_FUNC_skb_pull_data); +++ /* if (!ret) +++ * goto restore; +++ * return TC_ACT_SHOT; +++ */ +++ *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); +++ *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, drop_verdict); +++ *insn++ = BPF_EXIT_INSN(); +++ +++ /* restore: */ +++ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); +++ /* start: */ +++ *insn++ = prog->insnsi[0]; +++ +++ return insn - insn_buf; +++} +++ +++static int bpf_gen_ld_abs(const struct bpf_insn *orig, +++ struct bpf_insn *insn_buf) +++{ +++ bool indirect = BPF_MODE(orig->code) == BPF_IND; +++ struct bpf_insn *insn = insn_buf; +++ +++ if (!indirect) { +++ *insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm); +++ } else { +++ *insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg); +++ if (orig->imm) +++ *insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm); +++ } +++ /* We're guaranteed here that CTX is in R6. 
*/ +++ *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX); +++ +++ switch (BPF_SIZE(orig->code)) { +++ case BPF_B: +++ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_8_no_cache); +++ break; +++ case BPF_H: +++ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_16_no_cache); +++ break; +++ case BPF_W: +++ *insn++ = BPF_EMIT_CALL(bpf_skb_load_helper_32_no_cache); +++ break; +++ } +++ +++ *insn++ = BPF_JMP_IMM(BPF_JSGE, BPF_REG_0, 0, 2); +++ *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); +++ *insn++ = BPF_EXIT_INSN(); +++ +++ return insn - insn_buf; +++} +++ +++static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, +++ const struct bpf_prog *prog) ++ { ++- if (off == offsetof(struct __sk_buff, tc_classid)) ++- return type == BPF_WRITE ? true : false; +++ return bpf_unclone_prologue(insn_buf, direct_write, prog, TC_ACT_SHOT); +++} ++ +++static bool tc_cls_act_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ ++ if (type == BPF_WRITE) { ++ switch (off) { ++- case offsetof(struct __sk_buff, mark): ++- case offsetof(struct __sk_buff, tc_index): ++- case offsetof(struct __sk_buff, priority): ++- case offsetof(struct __sk_buff, cb[0]) ... ++- offsetof(struct __sk_buff, cb[4]): +++ case bpf_ctx_range(struct __sk_buff, mark): +++ case bpf_ctx_range(struct __sk_buff, tc_index): +++ case bpf_ctx_range(struct __sk_buff, priority): +++ case bpf_ctx_range(struct __sk_buff, tc_classid): +++ case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): +++ case bpf_ctx_range(struct __sk_buff, tstamp): +++ case bpf_ctx_range(struct __sk_buff, queue_mapping): ++ break; ++ default: ++ return false; ++ } ++ } ++- return __is_valid_access(off, size, type); +++ +++ switch (off) { +++ case bpf_ctx_range(struct __sk_buff, data): +++ info->reg_type = PTR_TO_PACKET; +++ break; +++ case bpf_ctx_range(struct __sk_buff, data_meta): +++ info->reg_type = PTR_TO_PACKET_META; +++ break; +++ case bpf_ctx_range(struct __sk_buff, data_end): +++ info->reg_type = PTR_TO_PACKET_END; +++ break; +++ case bpf_ctx_range_till(struct __sk_buff, family, local_port): +++ return false; +++ } +++ +++ return bpf_skb_is_valid_access(off, size, type, prog, info); +++} +++ +++static bool xdp_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; +++} +++ +++void bpf_warn_invalid_xdp_action(u32 act) +++{ +++} +++EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +++ +++static bool sock_addr_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; +++} +++ +++static bool sock_ops_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; +++} +++ +++static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, +++ const struct bpf_prog *prog) +++{ +++ return bpf_unclone_prologue(insn_buf, direct_write, prog, SK_DROP); +++} +++ +++static bool sk_skb_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; +++} +++ +++static bool sk_msg_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; +++} +++ +++static bool 
flow_dissector_is_valid_access(int off, int size, +++ enum bpf_access_type type, +++ const struct bpf_prog *prog, +++ struct bpf_insn_access_aux *info) +++{ +++ return false; ++ } ++ ++-static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg, ++- int src_reg, int ctx_off, ++- struct bpf_insn *insn_buf, ++- struct bpf_prog *prog) +++static u32 flow_dissector_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, +++ u32 *target_size) +++ +++{ +++ return 0; +++} +++ +++static u32 bpf_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, u32 *target_size) ++ { ++ struct bpf_insn *insn = insn_buf; +++ int off; ++ ++- switch (ctx_off) { +++ switch (si->off) { ++ case offsetof(struct __sk_buff, len): ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); ++- ++- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ++- offsetof(struct sk_buff, len)); +++ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, len, 4, +++ target_size)); ++ break; ++ ++ case offsetof(struct __sk_buff, protocol): ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); ++- ++- *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ++- offsetof(struct sk_buff, protocol)); +++ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, protocol, 2, +++ target_size)); ++ break; ++ ++ case offsetof(struct __sk_buff, vlan_proto): ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, vlan_proto) != 2); ++- ++- *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ++- offsetof(struct sk_buff, vlan_proto)); +++ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, vlan_proto, 2, +++ target_size)); ++ break; ++ ++ case offsetof(struct __sk_buff, priority): ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, priority) != 4); ++- ++ if (type == BPF_WRITE) ++- *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ++- offsetof(struct sk_buff, priority)); +++ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, priority, 4, +++ target_size)); ++ else ++- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ++- offsetof(struct sk_buff, priority)); +++ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, priority, 4, +++ target_size)); ++ break; ++ ++ case offsetof(struct __sk_buff, ingress_ifindex): ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, skb_iif) != 4); ++- ++- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ++- offsetof(struct sk_buff, skb_iif)); +++ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, skb_iif, 4, +++ target_size)); ++ break; ++ ++ case offsetof(struct __sk_buff, ifindex): ++- BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); ++- ++- *insn++ = BPF_LDX_MEM(bytes_to_bpf_size(FIELD_SIZEOF(struct sk_buff, dev)), ++- dst_reg, src_reg, +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), +++ si->dst_reg, si->src_reg, ++ offsetof(struct sk_buff, dev)); ++- *insn++ = BPF_JMP_IMM(BPF_JEQ, dst_reg, 0, 1); ++- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, dst_reg, ++- offsetof(struct net_device, ifindex)); +++ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1); +++ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +++ bpf_target_off(struct net_device, ifindex, 4, +++ target_size)); ++ break; ++ ++ case offsetof(struct __sk_buff, hash): ++- 
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, hash) != 4); ++- ++- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ++- offsetof(struct sk_buff, hash)); +++ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, hash, 4, +++ target_size)); ++ break; ++ ++ case offsetof(struct __sk_buff, mark): ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); ++- ++ if (type == BPF_WRITE) ++- *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ++- offsetof(struct sk_buff, mark)); +++ *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, mark, 4, +++ target_size)); ++ else ++- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ++- offsetof(struct sk_buff, mark)); +++ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, mark, 4, +++ target_size)); ++ break; ++ ++ case offsetof(struct __sk_buff, pkt_type): ++- return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); +++ *target_size = 1; +++ *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, +++ PKT_TYPE_OFFSET()); +++ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, PKT_TYPE_MAX); +++#ifdef __BIG_ENDIAN_BITFIELD +++ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, 5); +++#endif +++ break; ++ ++ case offsetof(struct __sk_buff, queue_mapping): ++- return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); +++ if (type == BPF_WRITE) { +++ *insn++ = BPF_JMP_IMM(BPF_JGE, si->src_reg, USHRT_MAX, 1); +++ *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, +++ queue_mapping, +++ 2, target_size)); +++ } else { +++ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, +++ queue_mapping, +++ 2, target_size)); +++ } +++ break; ++ ++ case offsetof(struct __sk_buff, vlan_present): ++- return convert_skb_access(SKF_AD_VLAN_TAG_PRESENT, ++- dst_reg, src_reg, insn); +++ *target_size = 1; +++ *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->src_reg, +++ PKT_VLAN_PRESENT_OFFSET()); +++ if (PKT_VLAN_PRESENT_BIT) +++ *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, PKT_VLAN_PRESENT_BIT); +++ if (PKT_VLAN_PRESENT_BIT < 7) +++ *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, 1); +++ break; ++ ++ case offsetof(struct __sk_buff, vlan_tci): ++- return convert_skb_access(SKF_AD_VLAN_TAG, ++- dst_reg, src_reg, insn); +++ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, vlan_tci, 2, +++ target_size)); +++ break; ++ ++ case offsetof(struct __sk_buff, cb[0]) ... 
++- offsetof(struct __sk_buff, cb[4]): +++ offsetofend(struct __sk_buff, cb[4]) - 1: ++ BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, data) < 20); +++ BUILD_BUG_ON((offsetof(struct sk_buff, cb) + +++ offsetof(struct qdisc_skb_cb, data)) % +++ sizeof(__u64)); ++ ++ prog->cb_access = 1; ++- ctx_off -= offsetof(struct __sk_buff, cb[0]); ++- ctx_off += offsetof(struct sk_buff, cb); ++- ctx_off += offsetof(struct qdisc_skb_cb, data); +++ off = si->off; +++ off -= offsetof(struct __sk_buff, cb[0]); +++ off += offsetof(struct sk_buff, cb); +++ off += offsetof(struct qdisc_skb_cb, data); ++ if (type == BPF_WRITE) ++- *insn++ = BPF_STX_MEM(BPF_W, dst_reg, src_reg, ctx_off); +++ *insn++ = BPF_STX_MEM(BPF_SIZE(si->code), si->dst_reg, +++ si->src_reg, off); ++ else ++- *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, ctx_off); +++ *insn++ = BPF_LDX_MEM(BPF_SIZE(si->code), si->dst_reg, +++ si->src_reg, off); ++ break; ++ ++ case offsetof(struct __sk_buff, tc_classid): ++- ctx_off -= offsetof(struct __sk_buff, tc_classid); ++- ctx_off += offsetof(struct sk_buff, cb); ++- ctx_off += offsetof(struct qdisc_skb_cb, tc_classid); ++- WARN_ON(type != BPF_WRITE); ++- *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off); +++ BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, tc_classid) != 2); +++ +++ off = si->off; +++ off -= offsetof(struct __sk_buff, tc_classid); +++ off += offsetof(struct sk_buff, cb); +++ off += offsetof(struct qdisc_skb_cb, tc_classid); +++ *target_size = 2; +++ if (type == BPF_WRITE) +++ *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, +++ si->src_reg, off); +++ else +++ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, +++ si->src_reg, off); +++ break; +++ +++ case offsetof(struct __sk_buff, data): +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), +++ si->dst_reg, si->src_reg, +++ offsetof(struct sk_buff, data)); +++ break; +++ +++ case offsetof(struct __sk_buff, data_meta): +++ off = si->off; +++ off -= offsetof(struct __sk_buff, data_meta); +++ off += offsetof(struct sk_buff, cb); +++ off += offsetof(struct bpf_skb_data_end, data_meta); +++ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, +++ si->src_reg, off); +++ break; +++ +++ case offsetof(struct __sk_buff, data_end): +++ off = si->off; +++ off -= offsetof(struct __sk_buff, data_end); +++ off += offsetof(struct sk_buff, cb); +++ off += offsetof(struct bpf_skb_data_end, data_end); +++ *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, +++ si->src_reg, off); ++ break; ++ ++ case offsetof(struct __sk_buff, tc_index): ++ #ifdef CONFIG_NET_SCHED ++- BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tc_index) != 2); ++- ++ if (type == BPF_WRITE) ++- *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ++- offsetof(struct sk_buff, tc_index)); +++ *insn++ = BPF_STX_MEM(BPF_H, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, tc_index, 2, +++ target_size)); ++ else ++- *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ++- offsetof(struct sk_buff, tc_index)); ++- break; +++ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, tc_index, 2, +++ target_size)); ++ #else +++ *target_size = 2; ++ if (type == BPF_WRITE) ++- *insn++ = BPF_MOV64_REG(dst_reg, dst_reg); +++ *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); ++ else ++- *insn++ = BPF_MOV64_IMM(dst_reg, 0); +++ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +++#endif +++ break; +++ +++ case offsetof(struct __sk_buff, napi_id): +++ *target_size = 4; +++ *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +++ break; +++ case offsetof(struct __sk_buff, family): +++ 
BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); +++ +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), +++ si->dst_reg, si->src_reg, +++ offsetof(struct sk_buff, sk)); +++ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, +++ bpf_target_off(struct sock_common, +++ skc_family, +++ 2, target_size)); +++ break; +++ case offsetof(struct __sk_buff, remote_ip4): +++ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); +++ +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), +++ si->dst_reg, si->src_reg, +++ offsetof(struct sk_buff, sk)); +++ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +++ bpf_target_off(struct sock_common, +++ skc_daddr, +++ 4, target_size)); +++ break; +++ case offsetof(struct __sk_buff, local_ip4): +++ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, +++ skc_rcv_saddr) != 4); +++ +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), +++ si->dst_reg, si->src_reg, +++ offsetof(struct sk_buff, sk)); +++ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +++ bpf_target_off(struct sock_common, +++ skc_rcv_saddr, +++ 4, target_size)); +++ break; +++ case offsetof(struct __sk_buff, remote_ip6[0]) ... +++ offsetof(struct __sk_buff, remote_ip6[3]): +++#if IS_ENABLED(CONFIG_IPV6) +++ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, +++ skc_v6_daddr.s6_addr32[0]) != 4); +++ +++ off = si->off; +++ off -= offsetof(struct __sk_buff, remote_ip6[0]); +++ +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), +++ si->dst_reg, si->src_reg, +++ offsetof(struct sk_buff, sk)); +++ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +++ offsetof(struct sock_common, +++ skc_v6_daddr.s6_addr32[0]) + +++ off); +++#else +++ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +++#endif +++ break; +++ case offsetof(struct __sk_buff, local_ip6[0]) ... 
+++ offsetof(struct __sk_buff, local_ip6[3]): +++#if IS_ENABLED(CONFIG_IPV6) +++ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, +++ skc_v6_rcv_saddr.s6_addr32[0]) != 4); +++ +++ off = si->off; +++ off -= offsetof(struct __sk_buff, local_ip6[0]); +++ +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), +++ si->dst_reg, si->src_reg, +++ offsetof(struct sk_buff, sk)); +++ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +++ offsetof(struct sock_common, +++ skc_v6_rcv_saddr.s6_addr32[0]) + +++ off); +++#else +++ *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +++#endif +++ break; +++ +++ case offsetof(struct __sk_buff, remote_port): +++ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); +++ +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), +++ si->dst_reg, si->src_reg, +++ offsetof(struct sk_buff, sk)); +++ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, +++ bpf_target_off(struct sock_common, +++ skc_dport, +++ 2, target_size)); +++#ifndef __BIG_ENDIAN_BITFIELD +++ *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +++#endif ++ break; +++ +++ case offsetof(struct __sk_buff, local_port): +++ BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); +++ +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), +++ si->dst_reg, si->src_reg, +++ offsetof(struct sk_buff, sk)); +++ *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, +++ bpf_target_off(struct sock_common, +++ skc_num, 2, target_size)); +++ break; +++ +++ case offsetof(struct __sk_buff, tstamp): +++ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, tstamp) != 8); +++ +++ if (type == BPF_WRITE) +++ *insn++ = BPF_STX_MEM(BPF_DW, +++ si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, +++ tstamp, 8, +++ target_size)); +++ else +++ *insn++ = BPF_LDX_MEM(BPF_DW, +++ si->dst_reg, si->src_reg, +++ bpf_target_off(struct sk_buff, +++ tstamp, 8, +++ target_size)); +++ break; +++ +++ case offsetof(struct __sk_buff, gso_segs): +++ /* si->dst_reg = skb_shinfo(SKB); */ +++#ifdef NET_SKBUFF_DATA_USES_OFFSET +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), +++ BPF_REG_AX, si->src_reg, +++ offsetof(struct sk_buff, end)); +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), +++ si->dst_reg, si->src_reg, +++ offsetof(struct sk_buff, head)); +++ *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); +++#else +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), +++ si->dst_reg, si->src_reg, +++ offsetof(struct sk_buff, end)); ++ #endif +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), +++ si->dst_reg, si->dst_reg, +++ bpf_target_off(struct skb_shared_info, +++ gso_segs, 2, +++ target_size)); +++ break; +++ case offsetof(struct __sk_buff, wire_len): +++ BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); +++ +++ off = si->off; +++ off -= offsetof(struct __sk_buff, wire_len); +++ off += offsetof(struct sk_buff, cb); +++ off += offsetof(struct qdisc_skb_cb, pkt_len); +++ *target_size = 4; +++ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); +++ break; +++ +++ case offsetof(struct __sk_buff, sk): +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), +++ si->dst_reg, si->src_reg, +++ offsetof(struct sk_buff, sk)); +++ break; ++ } ++ ++ return insn - insn_buf; ++ } ++ ++-static const struct bpf_verifier_ops sk_filter_ops = { ++- .get_func_proto = sk_filter_func_proto, ++- .is_valid_access = sk_filter_is_valid_access, ++- .convert_ctx_access = bpf_net_convert_ctx_access, +++u32 
bpf_sock_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, u32 *target_size) +++{ +++ return 0; +++} +++ +++static u32 tc_cls_act_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, u32 *target_size) +++{ +++ struct bpf_insn *insn = insn_buf; +++ +++ switch (si->off) { +++ case offsetof(struct __sk_buff, ifindex): +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, dev), +++ si->dst_reg, si->src_reg, +++ offsetof(struct sk_buff, dev)); +++ *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, +++ bpf_target_off(struct net_device, ifindex, 4, +++ target_size)); +++ break; +++ default: +++ return bpf_convert_ctx_access(type, si, insn_buf, prog, +++ target_size); +++ } +++ +++ return insn - insn_buf; +++} +++ +++static u32 xdp_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, u32 *target_size) +++{ +++ return 0; +++} +++ +++/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of +++ * context Structure, F is Field in context structure that contains a pointer +++ * to Nested Structure of type NS that has the field NF. +++ * +++ * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make +++ * sure that SIZE is not greater than actual size of S.F.NF. +++ * +++ * If offset OFF is provided, the load happens from that offset relative to +++ * offset of NF. +++ */ +++#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ +++ do { \ +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ +++ si->src_reg, offsetof(S, F)); \ +++ *insn++ = BPF_LDX_MEM( \ +++ SIZE, si->dst_reg, si->dst_reg, \ +++ bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ +++ target_size) \ +++ + OFF); \ +++ } while (0) +++ +++#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ +++ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ +++ BPF_FIELD_SIZEOF(NS, NF), 0) +++ +++/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to +++ * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. +++ * +++ * In addition it uses Temporary Field TF (member of struct S) as the 3rd +++ * "register" since two registers available in convert_ctx_access are not +++ * enough: we can't override neither SRC, since it contains value to store, nor +++ * DST since it contains pointer to context that may be used by later +++ * instructions. But we need a temporary place to save pointer to nested +++ * structure whose field we want to store to. 
+++ */ +++#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \ +++ do { \ +++ int tmp_reg = BPF_REG_9; \ +++ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ +++ --tmp_reg; \ +++ if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ +++ --tmp_reg; \ +++ *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ +++ offsetof(S, TF)); \ +++ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ +++ si->dst_reg, offsetof(S, F)); \ +++ *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg, \ +++ bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ +++ target_size) \ +++ + OFF); \ +++ *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ +++ offsetof(S, TF)); \ +++ } while (0) +++ +++#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ +++ TF) \ +++ do { \ +++ if (type == BPF_WRITE) { \ +++ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \ +++ OFF, TF); \ +++ } else { \ +++ SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ +++ S, NS, F, NF, SIZE, OFF); \ +++ } \ +++ } while (0) +++ +++#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ +++ SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ +++ S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) +++ +++static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, u32 *target_size) +++{ +++ return 0; +++} +++ +++static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, +++ u32 *target_size) +++{ +++ return 0; +++} +++ +++static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, u32 *target_size) +++{ +++ return 0; +++} +++ +++static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, +++ const struct bpf_insn *si, +++ struct bpf_insn *insn_buf, +++ struct bpf_prog *prog, u32 *target_size) +++{ +++ return 0; +++} +++ +++const struct bpf_verifier_ops sk_filter_verifier_ops = { +++ .is_valid_access = sk_filter_is_valid_access, +++ .convert_ctx_access = bpf_convert_ctx_access, +++ .gen_ld_abs = bpf_gen_ld_abs, ++ }; ++ ++-static const struct bpf_verifier_ops tc_cls_act_ops = { ++- .get_func_proto = tc_cls_act_func_proto, ++- .is_valid_access = tc_cls_act_is_valid_access, ++- .convert_ctx_access = bpf_net_convert_ctx_access, +++const struct bpf_prog_ops sk_filter_prog_ops = { ++ }; ++ ++-static struct bpf_prog_type_list sk_filter_type __read_mostly = { ++- .ops = &sk_filter_ops, ++- .type = BPF_PROG_TYPE_SOCKET_FILTER, +++const struct bpf_verifier_ops tc_cls_act_verifier_ops = { +++ .get_func_proto = tc_cls_act_func_proto, +++ .is_valid_access = tc_cls_act_is_valid_access, +++ .convert_ctx_access = tc_cls_act_convert_ctx_access, +++ .gen_prologue = tc_cls_act_prologue, +++ .gen_ld_abs = bpf_gen_ld_abs, ++ }; ++ ++-static struct bpf_prog_type_list sched_cls_type __read_mostly = { ++- .ops = &tc_cls_act_ops, ++- .type = BPF_PROG_TYPE_SCHED_CLS, +++const struct bpf_prog_ops tc_cls_act_prog_ops = { ++ }; ++ ++-static struct bpf_prog_type_list sched_act_type __read_mostly = { ++- .ops = &tc_cls_act_ops, ++- .type = BPF_PROG_TYPE_SCHED_ACT, +++const struct bpf_verifier_ops xdp_verifier_ops = { +++ .get_func_proto = xdp_func_proto, +++ .is_valid_access = xdp_is_valid_access, +++ .convert_ctx_access = xdp_convert_ctx_access, +++ .gen_prologue = bpf_noop_prologue, ++ }; ++ ++-static int __init register_sk_filter_ops(void) ++-{ ++- 
bpf_register_prog_type(&sk_filter_type); ++- bpf_register_prog_type(&sched_cls_type); ++- bpf_register_prog_type(&sched_act_type); +++const struct bpf_verifier_ops lwt_in_verifier_ops = { +++ .get_func_proto = lwt_in_func_proto, +++ .is_valid_access = lwt_is_valid_access, +++ .convert_ctx_access = bpf_convert_ctx_access, +++}; ++ ++- return 0; ++-} ++-late_initcall(register_sk_filter_ops); +++const struct bpf_prog_ops lwt_in_prog_ops = { +++}; +++ +++const struct bpf_verifier_ops lwt_out_verifier_ops = { +++ .get_func_proto = lwt_out_func_proto, +++ .is_valid_access = lwt_is_valid_access, +++ .convert_ctx_access = bpf_convert_ctx_access, +++}; +++ +++const struct bpf_prog_ops lwt_out_prog_ops = { +++}; +++ +++const struct bpf_verifier_ops lwt_xmit_verifier_ops = { +++ .get_func_proto = lwt_xmit_func_proto, +++ .is_valid_access = lwt_is_valid_access, +++ .convert_ctx_access = bpf_convert_ctx_access, +++ .gen_prologue = tc_cls_act_prologue, +++}; +++ +++const struct bpf_prog_ops lwt_xmit_prog_ops = { +++}; +++ +++const struct bpf_verifier_ops lwt_seg6local_verifier_ops = { +++ .get_func_proto = lwt_seg6local_func_proto, +++ .is_valid_access = lwt_is_valid_access, +++ .convert_ctx_access = bpf_convert_ctx_access, +++}; +++ +++const struct bpf_prog_ops lwt_seg6local_prog_ops = { +++}; +++ +++const struct bpf_verifier_ops cg_sock_verifier_ops = { +++ .get_func_proto = sock_filter_func_proto, +++ .is_valid_access = sock_filter_is_valid_access, +++ .convert_ctx_access = bpf_sock_convert_ctx_access, +++}; +++ +++const struct bpf_prog_ops cg_sock_prog_ops = { +++}; +++ +++const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { +++ .get_func_proto = sock_addr_func_proto, +++ .is_valid_access = sock_addr_is_valid_access, +++ .convert_ctx_access = sock_addr_convert_ctx_access, +++}; ++ ++-int __sk_detach_filter(struct sock *sk, bool locked) +++const struct bpf_prog_ops cg_sock_addr_prog_ops = { +++}; +++ +++const struct bpf_verifier_ops sock_ops_verifier_ops = { +++ .get_func_proto = sock_ops_func_proto, +++ .is_valid_access = sock_ops_is_valid_access, +++ .convert_ctx_access = sock_ops_convert_ctx_access, +++}; +++ +++const struct bpf_prog_ops sock_ops_prog_ops = { +++}; +++ +++const struct bpf_verifier_ops sk_skb_verifier_ops = { +++ .get_func_proto = sk_skb_func_proto, +++ .is_valid_access = sk_skb_is_valid_access, +++ .convert_ctx_access = sk_skb_convert_ctx_access, +++ .gen_prologue = sk_skb_prologue, +++}; +++ +++const struct bpf_prog_ops sk_skb_prog_ops = { +++}; +++ +++const struct bpf_verifier_ops sk_msg_verifier_ops = { +++ .get_func_proto = sk_msg_func_proto, +++ .is_valid_access = sk_msg_is_valid_access, +++ .convert_ctx_access = sk_msg_convert_ctx_access, +++ .gen_prologue = bpf_noop_prologue, +++}; +++ +++const struct bpf_prog_ops sk_msg_prog_ops = { +++}; +++ +++const struct bpf_verifier_ops flow_dissector_verifier_ops = { +++ .get_func_proto = flow_dissector_func_proto, +++ .is_valid_access = flow_dissector_is_valid_access, +++ .convert_ctx_access = flow_dissector_convert_ctx_access, +++}; +++ +++const struct bpf_prog_ops flow_dissector_prog_ops = { +++}; +++ +++int sk_detach_filter(struct sock *sk) ++ { ++ int ret = -ENOENT; ++ struct sk_filter *filter; ++@@ -1928,7 +3664,8 @@ int __sk_detach_filter(struct sock *sk, ++ if (sock_flag(sk, SOCK_FILTER_LOCKED)) ++ return -EPERM; ++ ++- filter = rcu_dereference_protected(sk->sk_filter, locked); +++ filter = rcu_dereference_protected(sk->sk_filter, +++ lockdep_sock_is_held(sk)); ++ if (filter) { ++ RCU_INIT_POINTER(sk->sk_filter, NULL); 
++ sk_filter_uncharge(sk, filter); ++@@ -1937,12 +3674,7 @@ int __sk_detach_filter(struct sock *sk, ++ ++ return ret; ++ } ++-EXPORT_SYMBOL_GPL(__sk_detach_filter); ++- ++-int sk_detach_filter(struct sock *sk) ++-{ ++- return __sk_detach_filter(sk, sock_owned_by_user(sk)); ++-} +++EXPORT_SYMBOL_GPL(sk_detach_filter); ++ ++ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, ++ unsigned int len) ++@@ -1953,7 +3685,7 @@ int sk_get_filter(struct sock *sk, struc ++ ++ lock_sock(sk); ++ filter = rcu_dereference_protected(sk->sk_filter, ++- sock_owned_by_user(sk)); +++ lockdep_sock_is_held(sk)); ++ if (!filter) ++ goto out; ++ ++@@ -1987,3 +3719,5 @@ out: ++ release_sock(sk); ++ return ret; ++ } +++ +++ ++--- a/include/asm-generic/barrier.h +++++ b/include/asm-generic/barrier.h ++@@ -119,5 +119,29 @@ do { \ ++ ___p1; \ ++ }) ++ +++/** +++ * smp_cond_load_relaxed() - (Spin) wait for cond with no ordering guarantees +++ * @ptr: pointer to the variable to wait on +++ * @cond: boolean expression to wait for +++ * +++ * Equivalent to using READ_ONCE() on the condition variable. +++ * +++ * Due to C lacking lambda expressions we load the value of *ptr into a +++ * pre-named variable @VAL to be used in @cond. +++ */ +++#ifndef smp_cond_load_relaxed +++#define smp_cond_load_relaxed(ptr, cond_expr) ({ \ +++ typeof(ptr) __PTR = (ptr); \ +++ typeof(*ptr) VAL; \ +++ for (;;) { \ +++ VAL = READ_ONCE(*__PTR); \ +++ if (cond_expr) \ +++ break; \ +++ cpu_relax(); \ +++ } \ +++ VAL; \ +++}) +++#endif +++ ++ #endif /* !__ASSEMBLY__ */ ++ #endif /* __ASM_GENERIC_BARRIER_H */ ++--- a/arch/arm/include/asm/barrier.h +++++ b/arch/arm/include/asm/barrier.h ++@@ -94,4 +94,6 @@ do { \ ++ #define smp_mb__after_atomic() smp_mb() ++ ++ #endif /* !__ASSEMBLY__ */ +++ +++#include ++ #endif /* __ASM_BARRIER_H */ ++--- a/include/linux/list_nulls.h +++++ b/include/linux/list_nulls.h ++@@ -1,3 +1,4 @@ +++/* SPDX-License-Identifier: GPL-2.0 */ ++ #ifndef _LINUX_LIST_NULLS_H ++ #define _LINUX_LIST_NULLS_H ++ ++@@ -29,6 +30,11 @@ struct hlist_nulls_node { ++ ((ptr)->first = (struct hlist_nulls_node *) NULLS_MARKER(nulls)) ++ ++ #define hlist_nulls_entry(ptr, type, member) container_of(ptr,type,member) +++ +++#define hlist_nulls_entry_safe(ptr, type, member) \ +++ ({ typeof(ptr) ____ptr = (ptr); \ +++ !is_a_nulls(____ptr) ? 
hlist_nulls_entry(____ptr, type, member) : NULL; \ +++ }) ++ /** ++ * ptr_is_a_nulls - Test if a ptr is a nulls ++ * @ptr: ptr to be tested ++@@ -57,7 +63,7 @@ static inline int hlist_nulls_unhashed(c ++ ++ static inline int hlist_nulls_empty(const struct hlist_nulls_head *h) ++ { ++- return is_a_nulls(h->first); +++ return is_a_nulls(READ_ONCE(h->first)); ++ } ++ ++ static inline void hlist_nulls_add_head(struct hlist_nulls_node *n, ++@@ -66,10 +72,10 @@ static inline void hlist_nulls_add_head( ++ struct hlist_nulls_node *first = h->first; ++ ++ n->next = first; ++- n->pprev = &h->first; +++ WRITE_ONCE(n->pprev, &h->first); ++ h->first = n; ++ if (!is_a_nulls(first)) ++- first->pprev = &n->next; +++ WRITE_ONCE(first->pprev, &n->next); ++ } ++ ++ static inline void __hlist_nulls_del(struct hlist_nulls_node *n) ++@@ -79,13 +85,13 @@ static inline void __hlist_nulls_del(str ++ ++ WRITE_ONCE(*pprev, next); ++ if (!is_a_nulls(next)) ++- next->pprev = pprev; +++ WRITE_ONCE(next->pprev, pprev); ++ } ++ ++ static inline void hlist_nulls_del(struct hlist_nulls_node *n) ++ { ++ __hlist_nulls_del(n); ++- n->pprev = LIST_POISON2; +++ WRITE_ONCE(n->pprev, LIST_POISON2); ++ } ++ ++ /** ++--- a/include/linux/rculist_nulls.h +++++ b/include/linux/rculist_nulls.h ++@@ -1,3 +1,4 @@ +++/* SPDX-License-Identifier: GPL-2.0 */ ++ #ifndef _LINUX_RCULIST_NULLS_H ++ #define _LINUX_RCULIST_NULLS_H ++ ++@@ -33,7 +34,7 @@ static inline void hlist_nulls_del_init_ ++ { ++ if (!hlist_nulls_unhashed(n)) { ++ __hlist_nulls_del(n); ++- n->pprev = NULL; +++ WRITE_ONCE(n->pprev, NULL); ++ } ++ } ++ ++@@ -65,7 +66,7 @@ static inline void hlist_nulls_del_init_ ++ static inline void hlist_nulls_del_rcu(struct hlist_nulls_node *n) ++ { ++ __hlist_nulls_del(n); ++- n->pprev = LIST_POISON2; +++ WRITE_ONCE(n->pprev, LIST_POISON2); ++ } ++ ++ /** ++@@ -93,11 +94,49 @@ static inline void hlist_nulls_add_head_ ++ struct hlist_nulls_node *first = h->first; ++ ++ n->next = first; ++- n->pprev = &h->first; +++ WRITE_ONCE(n->pprev, &h->first); ++ rcu_assign_pointer(hlist_nulls_first_rcu(h), n); ++ if (!is_a_nulls(first)) ++- first->pprev = &n->next; +++ WRITE_ONCE(first->pprev, &n->next); ++ } +++ +++/** +++ * hlist_nulls_add_tail_rcu +++ * @n: the element to add to the hash list. +++ * @h: the list to add to. +++ * +++ * Description: +++ * Adds the specified element to the specified hlist_nulls, +++ * while permitting racing traversals. +++ * +++ * The caller must take whatever precautions are necessary +++ * (such as holding appropriate locks) to avoid racing +++ * with another list-mutation primitive, such as hlist_nulls_add_head_rcu() +++ * or hlist_nulls_del_rcu(), running on this same list. +++ * However, it is perfectly legal to run concurrently with +++ * the _rcu list-traversal primitives, such as +++ * hlist_nulls_for_each_entry_rcu(), used to prevent memory-consistency +++ * problems on Alpha CPUs. Regardless of the type of CPU, the +++ * list-traversal primitive must be guarded by rcu_read_lock(). +++ */ +++static inline void hlist_nulls_add_tail_rcu(struct hlist_nulls_node *n, +++ struct hlist_nulls_head *h) +++{ +++ struct hlist_nulls_node *i, *last = NULL; +++ +++ /* Note: write side code, so rcu accessors are not needed. 
*/ +++ for (i = h->first; !is_a_nulls(i); i = i->next) +++ last = i; +++ +++ if (last) { +++ n->next = last->next; +++ n->pprev = &last->next; +++ rcu_assign_pointer(hlist_next_rcu(last), n); +++ } else { +++ hlist_nulls_add_head_rcu(n, h); +++ } +++} +++ ++ /** ++ * hlist_nulls_for_each_entry_rcu - iterate over rcu list of given type ++ * @tpos: the type * to use as a loop cursor. ++@@ -107,7 +146,7 @@ static inline void hlist_nulls_add_head_ ++ * ++ * The barrier() is needed to make sure compiler doesn't cache first element [1], ++ * as this loop can be restarted [2] ++- * [1] Documentation/atomic_ops.txt around line 114 +++ * [1] Documentation/core-api/atomic_ops.rst around line 114 ++ * [2] Documentation/RCU/rculist_nulls.txt around line 146 ++ */ ++ #define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ ++@@ -117,5 +156,19 @@ static inline void hlist_nulls_add_head_ ++ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ ++ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos))) ++ +++/** +++ * hlist_nulls_for_each_entry_safe - +++ * iterate over list of given type safe against removal of list entry +++ * @tpos: the type * to use as a loop cursor. +++ * @pos: the &struct hlist_nulls_node to use as a loop cursor. +++ * @head: the head for your list. +++ * @member: the name of the hlist_nulls_node within the struct. +++ */ +++#define hlist_nulls_for_each_entry_safe(tpos, pos, head, member) \ +++ for (({barrier();}), \ +++ pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \ +++ (!is_a_nulls(pos)) && \ +++ ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); \ +++ pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)); 1; });) ++ #endif ++ #endif ++--- a/drivers/net/tun.c +++++ b/drivers/net/tun.c ++@@ -627,8 +627,9 @@ static int tun_attach(struct tun_struct ++ ++ /* Re-attach the filter to persist device */ ++ if (!skip_filter && (tun->filter_attached == true)) { ++- err = __sk_attach_filter(&tun->fprog, tfile->socket.sk, ++- lockdep_rtnl_is_held()); +++ lock_sock(tfile->socket.sk); +++ err = sk_attach_filter(&tun->fprog, tfile->socket.sk); +++ release_sock(tfile->socket.sk); ++ if (!err) ++ goto out; ++ } ++@@ -1835,7 +1836,9 @@ static void tun_detach_filter(struct tun ++ ++ for (i = 0; i < n; i++) { ++ tfile = rtnl_dereference(tun->tfiles[i]); ++- __sk_detach_filter(tfile->socket.sk, lockdep_rtnl_is_held()); +++ lock_sock(tfile->socket.sk); +++ sk_detach_filter(tfile->socket.sk); +++ release_sock(tfile->socket.sk); ++ } ++ ++ tun->filter_attached = false; ++@@ -1848,8 +1851,9 @@ static int tun_attach_filter(struct tun_ ++ ++ for (i = 0; i < tun->numqueues; i++) { ++ tfile = rtnl_dereference(tun->tfiles[i]); ++- ret = __sk_attach_filter(&tun->fprog, tfile->socket.sk, ++- lockdep_rtnl_is_held()); +++ lock_sock(tfile->socket.sk); +++ ret = sk_attach_filter(&tun->fprog, tfile->socket.sk); +++ release_sock(tfile->socket.sk); ++ if (ret) { ++ tun_detach_filter(tun, i); ++ return ret; ++--- a/include/linux/list.h +++++ b/include/linux/list.h ++@@ -1,3 +1,4 @@ +++/* SPDX-License-Identifier: GPL-2.0 */ ++ #ifndef _LINUX_LIST_H ++ #define _LINUX_LIST_H ++ ++@@ -24,31 +25,46 @@ ++ ++ static inline void INIT_LIST_HEAD(struct list_head *list) ++ { ++- list->next = list; +++ WRITE_ONCE(list->next, list); ++ list->prev = list; ++ } ++ +++#ifdef CONFIG_DEBUG_LIST +++extern bool __list_add_valid(struct list_head *new, +++ struct list_head *prev, +++ struct list_head *next); +++extern bool __list_del_entry_valid(struct list_head *entry); +++#else +++static inline bool 
__list_add_valid(struct list_head *new, +++ struct list_head *prev, +++ struct list_head *next) +++{ +++ return true; +++} +++static inline bool __list_del_entry_valid(struct list_head *entry) +++{ +++ return true; +++} +++#endif +++ ++ /* ++ * Insert a new entry between two known consecutive entries. ++ * ++ * This is only for internal list manipulation where we know ++ * the prev/next entries already! ++ */ ++-#ifndef CONFIG_DEBUG_LIST ++ static inline void __list_add(struct list_head *new, ++ struct list_head *prev, ++ struct list_head *next) ++ { +++ if (!__list_add_valid(new, prev, next)) +++ return; +++ ++ next->prev = new; ++ new->next = next; ++ new->prev = prev; ++- prev->next = new; +++ WRITE_ONCE(prev->next, new); ++ } ++-#else ++-extern void __list_add(struct list_head *new, ++- struct list_head *prev, ++- struct list_head *next); ++-#endif ++ ++ /** ++ * list_add - add a new entry ++@@ -90,28 +106,40 @@ static inline void __list_del(struct lis ++ WRITE_ONCE(prev->next, next); ++ } ++ +++/* +++ * Delete a list entry and clear the 'prev' pointer. +++ * +++ * This is a special-purpose list clearing method used in the networking code +++ * for lists allocated as per-cpu, where we don't want to incur the extra +++ * WRITE_ONCE() overhead of a regular list_del_init(). The code that uses this +++ * needs to check the node 'prev' pointer instead of calling list_empty(). +++ */ +++static inline void __list_del_clearprev(struct list_head *entry) +++{ +++ __list_del(entry->prev, entry->next); +++ entry->prev = NULL; +++} +++ ++ /** ++ * list_del - deletes entry from list. ++ * @entry: the element to delete from the list. ++ * Note: list_empty() on entry does not return true after this, the entry is ++ * in an undefined state. ++ */ ++-#ifndef CONFIG_DEBUG_LIST ++ static inline void __list_del_entry(struct list_head *entry) ++ { +++ if (!__list_del_entry_valid(entry)) +++ return; +++ ++ __list_del(entry->prev, entry->next); ++ } ++ ++ static inline void list_del(struct list_head *entry) ++ { ++- __list_del(entry->prev, entry->next); +++ __list_del_entry(entry); ++ entry->next = LIST_POISON1; ++ entry->prev = LIST_POISON2; ++ } ++-#else ++-extern void __list_del_entry(struct list_head *entry); ++-extern void list_del(struct list_head *entry); ++-#endif ++ ++ /** ++ * list_replace - replace old entry by new one ++@@ -137,6 +165,23 @@ static inline void list_replace_init(str ++ } ++ ++ /** +++ * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position +++ * @entry1: the location to place entry2 +++ * @entry2: the location to place entry1 +++ */ +++static inline void list_swap(struct list_head *entry1, +++ struct list_head *entry2) +++{ +++ struct list_head *pos = entry2->prev; +++ +++ list_del(entry2); +++ list_replace(entry1, entry2); +++ if (pos == entry1) +++ pos = entry2; +++ list_add(entry1, pos); +++} +++ +++/** ++ * list_del_init - deletes entry from list and reinitialize it. ++ * @entry: the element to delete from the list. ++ */ ++@@ -170,6 +215,40 @@ static inline void list_move_tail(struct ++ } ++ ++ /** +++ * list_bulk_move_tail - move a subsection of a list to its tail +++ * @head: the head that will follow our entry +++ * @first: first entry to move +++ * @last: last entry to move, can be the same as first +++ * +++ * Move all entries between @first and including @last before @head. +++ * All three entries must belong to the same linked list. 
+++ */ +++static inline void list_bulk_move_tail(struct list_head *head, +++ struct list_head *first, +++ struct list_head *last) +++{ +++ first->prev->next = last->next; +++ last->next->prev = first->prev; +++ +++ head->prev->next = first; +++ first->prev = head->prev; +++ +++ last->next = head; +++ head->prev = last; +++} +++ +++/** +++ * list_is_first -- tests whether @list is the first entry in list @head +++ * @list: the entry to test +++ * @head: the head of the list +++ */ +++static inline int list_is_first(const struct list_head *list, +++ const struct list_head *head) +++{ +++ return list->prev == head; +++} +++ +++/** ++ * list_is_last - tests whether @list is the last entry in list @head ++ * @list: the entry to test ++ * @head: the head of the list ++@@ -186,7 +265,7 @@ static inline int list_is_last(const str ++ */ ++ static inline int list_empty(const struct list_head *head) ++ { ++- return head->next == head; +++ return READ_ONCE(head->next) == head; ++ } ++ ++ /** ++@@ -223,6 +302,24 @@ static inline void list_rotate_left(stru ++ } ++ ++ /** +++ * list_rotate_to_front() - Rotate list to specific item. +++ * @list: The desired new front of the list. +++ * @head: The head of the list. +++ * +++ * Rotates list so that @list becomes the new front of the list. +++ */ +++static inline void list_rotate_to_front(struct list_head *list, +++ struct list_head *head) +++{ +++ /* +++ * Deletes the list head from the list denoted by @head and +++ * places it as the tail of @list, this effectively rotates the +++ * list so that @list is at the front. +++ */ +++ list_move_tail(head, list); +++} +++ +++/** ++ * list_is_singular - tests whether a list has just one entry. ++ * @head: the list to test. ++ */ ++@@ -271,6 +368,36 @@ static inline void list_cut_position(str ++ __list_cut_position(list, head, entry); ++ } ++ +++/** +++ * list_cut_before - cut a list into two, before given entry +++ * @list: a new list to add all removed entries +++ * @head: a list with entries +++ * @entry: an entry within head, could be the head itself +++ * +++ * This helper moves the initial part of @head, up to but +++ * excluding @entry, from @head to @list. You should pass +++ * in @entry an element you know is on @head. @list should +++ * be an empty list or a list you do not care about losing +++ * its data. +++ * If @entry == @head, all entries on @head are moved to +++ * @list. +++ */ +++static inline void list_cut_before(struct list_head *list, +++ struct list_head *head, +++ struct list_head *entry) +++{ +++ if (head->next == entry) { +++ INIT_LIST_HEAD(list); +++ return; +++ } +++ list->next = head->next; +++ list->next->prev = list; +++ list->prev = entry->prev; +++ list->prev->next = list; +++ head->next = entry; +++ entry->prev = head; +++} +++ ++ static inline void __list_splice(const struct list_head *list, ++ struct list_head *prev, ++ struct list_head *next) ++@@ -381,8 +508,11 @@ static inline void list_splice_tail_init ++ * ++ * Note that if the list is empty, it returns NULL. ++ */ ++-#define list_first_entry_or_null(ptr, type, member) \ ++- (!list_empty(ptr) ? list_first_entry(ptr, type, member) : NULL) +++#define list_first_entry_or_null(ptr, type, member) ({ \ +++ struct list_head *head__ = (ptr); \ +++ struct list_head *pos__ = READ_ONCE(head__->next); \ +++ pos__ != head__ ? 
list_entry(pos__, type, member) : NULL; \ +++}) ++ ++ /** ++ * list_next_entry - get the next element in list ++@@ -511,6 +641,19 @@ static inline void list_splice_tail_init ++ pos = list_next_entry(pos, member)) ++ ++ /** +++ * list_for_each_entry_from_reverse - iterate backwards over list of given type +++ * from the current point +++ * @pos: the type * to use as a loop cursor. +++ * @head: the head for your list. +++ * @member: the name of the list_head within the struct. +++ * +++ * Iterate backwards over list of given type, continuing from current position. +++ */ +++#define list_for_each_entry_from_reverse(pos, head, member) \ +++ for (; &pos->member != (head); \ +++ pos = list_prev_entry(pos, member)) +++ +++/** ++ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry ++ * @pos: the type * to use as a loop cursor. ++ * @n: another type * to use as temporary storage ++@@ -608,7 +751,7 @@ static inline int hlist_unhashed(const s ++ ++ static inline int hlist_empty(const struct hlist_head *h) ++ { ++- return !h->first; +++ return !READ_ONCE(h->first); ++ } ++ ++ static inline void __hlist_del(struct hlist_node *n) ++@@ -642,7 +785,7 @@ static inline void hlist_add_head(struct ++ n->next = first; ++ if (first) ++ first->pprev = &n->next; ++- h->first = n; +++ WRITE_ONCE(h->first, n); ++ n->pprev = &h->first; ++ } ++ ++@@ -653,7 +796,7 @@ static inline void hlist_add_before(stru ++ n->pprev = next->pprev; ++ n->next = next; ++ next->pprev = &n->next; ++- *(n->pprev) = n; +++ WRITE_ONCE(*(n->pprev), n); ++ } ++ ++ static inline void hlist_add_behind(struct hlist_node *n, ++@@ -679,6 +822,16 @@ static inline bool hlist_fake(struct hli ++ } ++ ++ /* +++ * Check whether the node is the only node of the head without +++ * accessing head: +++ */ +++static inline bool +++hlist_is_singular_node(struct hlist_node *n, struct hlist_head *h) +++{ +++ return !n->next && n->pprev == &h->first; +++} +++ +++/* ++ * Move a list from one list head to another. Fixup the pprev ++ * reference of the first entry if it exists. ++ */ ++--- /dev/null +++++ b/include/linux/ptr_ring.h ++@@ -0,0 +1,673 @@ +++/* SPDX-License-Identifier: GPL-2.0-or-later */ +++/* +++ * Definitions for the 'struct ptr_ring' datastructure. +++ * +++ * Author: +++ * Michael S. Tsirkin +++ * +++ * Copyright (C) 2016 Red Hat, Inc. +++ * +++ * This is a limited-size FIFO maintaining pointers in FIFO order, with +++ * one CPU producing entries and another consuming entries from a FIFO. +++ * +++ * This implementation tries to minimize cache-contention when there is a +++ * single producer and a single consumer CPU. +++ */ +++ +++#ifndef _LINUX_PTR_RING_H +++#define _LINUX_PTR_RING_H 1 +++ +++#ifdef __KERNEL__ +++#include +++#include +++#include +++#include +++#include +++#include +++#endif +++ +++struct ptr_ring { +++ int producer ____cacheline_aligned_in_smp; +++ spinlock_t producer_lock; +++ int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */ +++ int consumer_tail; /* next entry to invalidate */ +++ spinlock_t consumer_lock; +++ /* Shared consumer/producer data */ +++ /* Read-only by both the producer and the consumer */ +++ int size ____cacheline_aligned_in_smp; /* max entries in queue */ +++ int batch; /* number of entries to consume in a batch */ +++ void **queue; +++}; +++ +++/* Note: callers invoking this in a loop must use a compiler barrier, +++ * for example cpu_relax(). 
+++ * +++ * NB: this is unlike __ptr_ring_empty in that callers must hold producer_lock: +++ * see e.g. ptr_ring_full. +++ */ +++static inline bool __ptr_ring_full(struct ptr_ring *r) +++{ +++ return r->queue[r->producer]; +++} +++ +++static inline bool ptr_ring_full(struct ptr_ring *r) +++{ +++ bool ret; +++ +++ spin_lock(&r->producer_lock); +++ ret = __ptr_ring_full(r); +++ spin_unlock(&r->producer_lock); +++ +++ return ret; +++} +++ +++static inline bool ptr_ring_full_irq(struct ptr_ring *r) +++{ +++ bool ret; +++ +++ spin_lock_irq(&r->producer_lock); +++ ret = __ptr_ring_full(r); +++ spin_unlock_irq(&r->producer_lock); +++ +++ return ret; +++} +++ +++static inline bool ptr_ring_full_any(struct ptr_ring *r) +++{ +++ unsigned long flags; +++ bool ret; +++ +++ spin_lock_irqsave(&r->producer_lock, flags); +++ ret = __ptr_ring_full(r); +++ spin_unlock_irqrestore(&r->producer_lock, flags); +++ +++ return ret; +++} +++ +++static inline bool ptr_ring_full_bh(struct ptr_ring *r) +++{ +++ bool ret; +++ +++ spin_lock_bh(&r->producer_lock); +++ ret = __ptr_ring_full(r); +++ spin_unlock_bh(&r->producer_lock); +++ +++ return ret; +++} +++ +++/* Note: callers invoking this in a loop must use a compiler barrier, +++ * for example cpu_relax(). Callers must hold producer_lock. +++ * Callers are responsible for making sure pointer that is being queued +++ * points to a valid data. +++ */ +++static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) +++{ +++ if (unlikely(!r->size) || r->queue[r->producer]) +++ return -ENOSPC; +++ +++ /* Make sure the pointer we are storing points to a valid data. */ +++ /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */ +++ smp_wmb(); +++ +++ WRITE_ONCE(r->queue[r->producer++], ptr); +++ if (unlikely(r->producer >= r->size)) +++ r->producer = 0; +++ return 0; +++} +++ +++/* +++ * Note: resize (below) nests producer lock within consumer lock, so if you +++ * consume in interrupt or BH context, you must disable interrupts/BH when +++ * calling this. +++ */ +++static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr) +++{ +++ int ret; +++ +++ spin_lock(&r->producer_lock); +++ ret = __ptr_ring_produce(r, ptr); +++ spin_unlock(&r->producer_lock); +++ +++ return ret; +++} +++ +++static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr) +++{ +++ int ret; +++ +++ spin_lock_irq(&r->producer_lock); +++ ret = __ptr_ring_produce(r, ptr); +++ spin_unlock_irq(&r->producer_lock); +++ +++ return ret; +++} +++ +++static inline int ptr_ring_produce_any(struct ptr_ring *r, void *ptr) +++{ +++ unsigned long flags; +++ int ret; +++ +++ spin_lock_irqsave(&r->producer_lock, flags); +++ ret = __ptr_ring_produce(r, ptr); +++ spin_unlock_irqrestore(&r->producer_lock, flags); +++ +++ return ret; +++} +++ +++static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) +++{ +++ int ret; +++ +++ spin_lock_bh(&r->producer_lock); +++ ret = __ptr_ring_produce(r, ptr); +++ spin_unlock_bh(&r->producer_lock); +++ +++ return ret; +++} +++ +++static inline void *__ptr_ring_peek(struct ptr_ring *r) +++{ +++ if (likely(r->size)) +++ return READ_ONCE(r->queue[r->consumer_head]); +++ return NULL; +++} +++ +++/* +++ * Test ring empty status without taking any locks. +++ * +++ * NB: This is only safe to call if ring is never resized. +++ * +++ * However, if some other CPU consumes ring entries at the same time, the value +++ * returned is not guaranteed to be correct. 
+++ * +++ * In this case - to avoid incorrectly detecting the ring +++ * as empty - the CPU consuming the ring entries is responsible +++ * for either consuming all ring entries until the ring is empty, +++ * or synchronizing with some other CPU and causing it to +++ * re-test __ptr_ring_empty and/or consume the ring enteries +++ * after the synchronization point. +++ * +++ * Note: callers invoking this in a loop must use a compiler barrier, +++ * for example cpu_relax(). +++ */ +++static inline bool __ptr_ring_empty(struct ptr_ring *r) +++{ +++ if (likely(r->size)) +++ return !r->queue[READ_ONCE(r->consumer_head)]; +++ return true; +++} +++ +++static inline bool ptr_ring_empty(struct ptr_ring *r) +++{ +++ bool ret; +++ +++ spin_lock(&r->consumer_lock); +++ ret = __ptr_ring_empty(r); +++ spin_unlock(&r->consumer_lock); +++ +++ return ret; +++} +++ +++static inline bool ptr_ring_empty_irq(struct ptr_ring *r) +++{ +++ bool ret; +++ +++ spin_lock_irq(&r->consumer_lock); +++ ret = __ptr_ring_empty(r); +++ spin_unlock_irq(&r->consumer_lock); +++ +++ return ret; +++} +++ +++static inline bool ptr_ring_empty_any(struct ptr_ring *r) +++{ +++ unsigned long flags; +++ bool ret; +++ +++ spin_lock_irqsave(&r->consumer_lock, flags); +++ ret = __ptr_ring_empty(r); +++ spin_unlock_irqrestore(&r->consumer_lock, flags); +++ +++ return ret; +++} +++ +++static inline bool ptr_ring_empty_bh(struct ptr_ring *r) +++{ +++ bool ret; +++ +++ spin_lock_bh(&r->consumer_lock); +++ ret = __ptr_ring_empty(r); +++ spin_unlock_bh(&r->consumer_lock); +++ +++ return ret; +++} +++ +++/* Must only be called after __ptr_ring_peek returned !NULL */ +++static inline void __ptr_ring_discard_one(struct ptr_ring *r) +++{ +++ /* Fundamentally, what we want to do is update consumer +++ * index and zero out the entry so producer can reuse it. +++ * Doing it naively at each consume would be as simple as: +++ * consumer = r->consumer; +++ * r->queue[consumer++] = NULL; +++ * if (unlikely(consumer >= r->size)) +++ * consumer = 0; +++ * r->consumer = consumer; +++ * but that is suboptimal when the ring is full as producer is writing +++ * out new entries in the same cache line. Defer these updates until a +++ * batch of entries has been consumed. +++ */ +++ /* Note: we must keep consumer_head valid at all times for __ptr_ring_empty +++ * to work correctly. +++ */ +++ int consumer_head = r->consumer_head; +++ int head = consumer_head++; +++ +++ /* Once we have processed enough entries invalidate them in +++ * the ring all at once so producer can reuse their space in the ring. +++ * We also do this when we reach end of the ring - not mandatory +++ * but helps keep the implementation simple. +++ */ +++ if (unlikely(consumer_head - r->consumer_tail >= r->batch || +++ consumer_head >= r->size)) { +++ /* Zero out entries in the reverse order: this way we touch the +++ * cache line that producer might currently be reading the last; +++ * producer won't make progress and touch other cache lines +++ * besides the first one until we write out all entries. 
+++ */ +++ while (likely(head >= r->consumer_tail)) +++ r->queue[head--] = NULL; +++ r->consumer_tail = consumer_head; +++ } +++ if (unlikely(consumer_head >= r->size)) { +++ consumer_head = 0; +++ r->consumer_tail = 0; +++ } +++ /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ +++ WRITE_ONCE(r->consumer_head, consumer_head); +++} +++ +++static inline void *__ptr_ring_consume(struct ptr_ring *r) +++{ +++ void *ptr; +++ +++ /* The READ_ONCE in __ptr_ring_peek guarantees that anyone +++ * accessing data through the pointer is up to date. Pairs +++ * with smp_wmb in __ptr_ring_produce. +++ */ +++ ptr = __ptr_ring_peek(r); +++ if (ptr) +++ __ptr_ring_discard_one(r); +++ +++ return ptr; +++} +++ +++static inline int __ptr_ring_consume_batched(struct ptr_ring *r, +++ void **array, int n) +++{ +++ void *ptr; +++ int i; +++ +++ for (i = 0; i < n; i++) { +++ ptr = __ptr_ring_consume(r); +++ if (!ptr) +++ break; +++ array[i] = ptr; +++ } +++ +++ return i; +++} +++ +++/* +++ * Note: resize (below) nests producer lock within consumer lock, so if you +++ * call this in interrupt or BH context, you must disable interrupts/BH when +++ * producing. +++ */ +++static inline void *ptr_ring_consume(struct ptr_ring *r) +++{ +++ void *ptr; +++ +++ spin_lock(&r->consumer_lock); +++ ptr = __ptr_ring_consume(r); +++ spin_unlock(&r->consumer_lock); +++ +++ return ptr; +++} +++ +++static inline void *ptr_ring_consume_irq(struct ptr_ring *r) +++{ +++ void *ptr; +++ +++ spin_lock_irq(&r->consumer_lock); +++ ptr = __ptr_ring_consume(r); +++ spin_unlock_irq(&r->consumer_lock); +++ +++ return ptr; +++} +++ +++static inline void *ptr_ring_consume_any(struct ptr_ring *r) +++{ +++ unsigned long flags; +++ void *ptr; +++ +++ spin_lock_irqsave(&r->consumer_lock, flags); +++ ptr = __ptr_ring_consume(r); +++ spin_unlock_irqrestore(&r->consumer_lock, flags); +++ +++ return ptr; +++} +++ +++static inline void *ptr_ring_consume_bh(struct ptr_ring *r) +++{ +++ void *ptr; +++ +++ spin_lock_bh(&r->consumer_lock); +++ ptr = __ptr_ring_consume(r); +++ spin_unlock_bh(&r->consumer_lock); +++ +++ return ptr; +++} +++ +++static inline int ptr_ring_consume_batched(struct ptr_ring *r, +++ void **array, int n) +++{ +++ int ret; +++ +++ spin_lock(&r->consumer_lock); +++ ret = __ptr_ring_consume_batched(r, array, n); +++ spin_unlock(&r->consumer_lock); +++ +++ return ret; +++} +++ +++static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r, +++ void **array, int n) +++{ +++ int ret; +++ +++ spin_lock_irq(&r->consumer_lock); +++ ret = __ptr_ring_consume_batched(r, array, n); +++ spin_unlock_irq(&r->consumer_lock); +++ +++ return ret; +++} +++ +++static inline int ptr_ring_consume_batched_any(struct ptr_ring *r, +++ void **array, int n) +++{ +++ unsigned long flags; +++ int ret; +++ +++ spin_lock_irqsave(&r->consumer_lock, flags); +++ ret = __ptr_ring_consume_batched(r, array, n); +++ spin_unlock_irqrestore(&r->consumer_lock, flags); +++ +++ return ret; +++} +++ +++static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, +++ void **array, int n) +++{ +++ int ret; +++ +++ spin_lock_bh(&r->consumer_lock); +++ ret = __ptr_ring_consume_batched(r, array, n); +++ spin_unlock_bh(&r->consumer_lock); +++ +++ return ret; +++} +++ +++/* Cast to structure type and call a function without discarding from FIFO. +++ * Function must return a value. +++ * Callers must take consumer_lock. 
+++ */ +++#define __PTR_RING_PEEK_CALL(r, f) ((f)(__ptr_ring_peek(r))) +++ +++#define PTR_RING_PEEK_CALL(r, f) ({ \ +++ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ +++ \ +++ spin_lock(&(r)->consumer_lock); \ +++ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ +++ spin_unlock(&(r)->consumer_lock); \ +++ __PTR_RING_PEEK_CALL_v; \ +++}) +++ +++#define PTR_RING_PEEK_CALL_IRQ(r, f) ({ \ +++ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ +++ \ +++ spin_lock_irq(&(r)->consumer_lock); \ +++ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ +++ spin_unlock_irq(&(r)->consumer_lock); \ +++ __PTR_RING_PEEK_CALL_v; \ +++}) +++ +++#define PTR_RING_PEEK_CALL_BH(r, f) ({ \ +++ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ +++ \ +++ spin_lock_bh(&(r)->consumer_lock); \ +++ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ +++ spin_unlock_bh(&(r)->consumer_lock); \ +++ __PTR_RING_PEEK_CALL_v; \ +++}) +++ +++#define PTR_RING_PEEK_CALL_ANY(r, f) ({ \ +++ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ +++ unsigned long __PTR_RING_PEEK_CALL_f;\ +++ \ +++ spin_lock_irqsave(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ +++ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ +++ spin_unlock_irqrestore(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ +++ __PTR_RING_PEEK_CALL_v; \ +++}) +++ +++/* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See +++ * documentation for vmalloc for which of them are legal. +++ */ +++static inline void **__ptr_ring_init_queue_alloc(unsigned int size, gfp_t gfp) +++{ +++ if (size > KMALLOC_MAX_SIZE / sizeof(void *)) +++ return NULL; +++ return kmalloc_array(size, sizeof(void *), gfp | __GFP_ZERO); +++} +++ +++static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) +++{ +++ r->size = size; +++ r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue)); +++ /* We need to set batch at least to 1 to make logic +++ * in __ptr_ring_discard_one work correctly. +++ * Batching too much (because ring is small) would cause a lot of +++ * burstiness. Needs tuning, for now disable batching. +++ */ +++ if (r->batch > r->size / 2 || !r->batch) +++ r->batch = 1; +++} +++ +++static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp) +++{ +++ r->queue = __ptr_ring_init_queue_alloc(size, gfp); +++ if (!r->queue) +++ return -ENOMEM; +++ +++ __ptr_ring_set_size(r, size); +++ r->producer = r->consumer_head = r->consumer_tail = 0; +++ spin_lock_init(&r->producer_lock); +++ spin_lock_init(&r->consumer_lock); +++ +++ return 0; +++} +++ +++/* +++ * Return entries into ring. Destroy entries that don't fit. +++ * +++ * Note: this is expected to be a rare slow path operation. +++ * +++ * Note: producer lock is nested within consumer lock, so if you +++ * resize you must make sure all uses nest correctly. +++ * In particular if you consume ring in interrupt or BH context, you must +++ * disable interrupts/BH when doing so. +++ */ +++static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n, +++ void (*destroy)(void *)) +++{ +++ unsigned long flags; +++ int head; +++ +++ spin_lock_irqsave(&r->consumer_lock, flags); +++ spin_lock(&r->producer_lock); +++ +++ if (!r->size) +++ goto done; +++ +++ /* +++ * Clean out buffered entries (for simplicity). This way following code +++ * can test entries for NULL and if not assume they are valid. 
+++ */ +++ head = r->consumer_head - 1; +++ while (likely(head >= r->consumer_tail)) +++ r->queue[head--] = NULL; +++ r->consumer_tail = r->consumer_head; +++ +++ /* +++ * Go over entries in batch, start moving head back and copy entries. +++ * Stop when we run into previously unconsumed entries. +++ */ +++ while (n) { +++ head = r->consumer_head - 1; +++ if (head < 0) +++ head = r->size - 1; +++ if (r->queue[head]) { +++ /* This batch entry will have to be destroyed. */ +++ goto done; +++ } +++ r->queue[head] = batch[--n]; +++ r->consumer_tail = head; +++ /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ +++ WRITE_ONCE(r->consumer_head, head); +++ } +++ +++done: +++ /* Destroy all entries left in the batch. */ +++ while (n) +++ destroy(batch[--n]); +++ spin_unlock(&r->producer_lock); +++ spin_unlock_irqrestore(&r->consumer_lock, flags); +++} +++ +++static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, +++ int size, gfp_t gfp, +++ void (*destroy)(void *)) +++{ +++ int producer = 0; +++ void **old; +++ void *ptr; +++ +++ while ((ptr = __ptr_ring_consume(r))) +++ if (producer < size) +++ queue[producer++] = ptr; +++ else if (destroy) +++ destroy(ptr); +++ +++ if (producer >= size) +++ producer = 0; +++ __ptr_ring_set_size(r, size); +++ r->producer = producer; +++ r->consumer_head = 0; +++ r->consumer_tail = 0; +++ old = r->queue; +++ r->queue = queue; +++ +++ return old; +++} +++ +++/* +++ * Note: producer lock is nested within consumer lock, so if you +++ * resize you must make sure all uses nest correctly. +++ * In particular if you consume ring in interrupt or BH context, you must +++ * disable interrupts/BH when doing so. +++ */ +++static inline int ptr_ring_resize(struct ptr_ring *r, int size, gfp_t gfp, +++ void (*destroy)(void *)) +++{ +++ unsigned long flags; +++ void **queue = __ptr_ring_init_queue_alloc(size, gfp); +++ void **old; +++ +++ if (!queue) +++ return -ENOMEM; +++ +++ spin_lock_irqsave(&(r)->consumer_lock, flags); +++ spin_lock(&(r)->producer_lock); +++ +++ old = __ptr_ring_swap_queue(r, queue, size, gfp, destroy); +++ +++ spin_unlock(&(r)->producer_lock); +++ spin_unlock_irqrestore(&(r)->consumer_lock, flags); +++ +++ kvfree(old); +++ +++ return 0; +++} +++ +++/* +++ * Note: producer lock is nested within consumer lock, so if you +++ * resize you must make sure all uses nest correctly. +++ * In particular if you consume ring in interrupt or BH context, you must +++ * disable interrupts/BH when doing so. 
+++ */ +++static inline int ptr_ring_resize_multiple(struct ptr_ring **rings, +++ unsigned int nrings, +++ int size, +++ gfp_t gfp, void (*destroy)(void *)) +++{ +++ unsigned long flags; +++ void ***queues; +++ int i; +++ +++ queues = kmalloc_array(nrings, sizeof(*queues), gfp); +++ if (!queues) +++ goto noqueues; +++ +++ for (i = 0; i < nrings; ++i) { +++ queues[i] = __ptr_ring_init_queue_alloc(size, gfp); +++ if (!queues[i]) +++ goto nomem; +++ } +++ +++ for (i = 0; i < nrings; ++i) { +++ spin_lock_irqsave(&(rings[i])->consumer_lock, flags); +++ spin_lock(&(rings[i])->producer_lock); +++ queues[i] = __ptr_ring_swap_queue(rings[i], queues[i], +++ size, gfp, destroy); +++ spin_unlock(&(rings[i])->producer_lock); +++ spin_unlock_irqrestore(&(rings[i])->consumer_lock, flags); +++ } +++ +++ for (i = 0; i < nrings; ++i) +++ kvfree(queues[i]); +++ +++ kfree(queues); +++ +++ return 0; +++ +++nomem: +++ while (--i >= 0) +++ kvfree(queues[i]); +++ +++ kfree(queues); +++ +++noqueues: +++ return -ENOMEM; +++} +++ +++static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) +++{ +++ void *ptr; +++ +++ if (destroy) +++ while ((ptr = ptr_ring_consume(r))) +++ destroy(ptr); +++ kvfree(r->queue); +++} +++ +++#endif /* _LINUX_PTR_RING_H */ ++--- a/include/linux/skbuff.h +++++ b/include/linux/skbuff.h ++@@ -37,6 +37,7 @@ ++ #include ++ #include ++ #include +++#include ++ #include ++ ++ /* A. Checksumming of received packets by device. ++@@ -592,13 +593,23 @@ struct sk_buff { ++ */ ++ kmemcheck_bitfield_begin(flags1); ++ __u16 queue_mapping; +++ +++/* if you move cloned around you also must adapt those constants */ +++#ifdef __BIG_ENDIAN_BITFIELD +++#define CLONED_MASK (1 << 7) +++#else +++#define CLONED_MASK 1 +++#endif +++#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset) +++ +++ __u8 __cloned_offset[0]; ++ __u8 cloned:1, ++ nohdr:1, ++ fclone:2, ++ peeked:1, ++ head_frag:1, ++- xmit_more:1; ++- /* one bit hole */ +++ xmit_more:1, +++ __unused:1; /* one bit hole */ ++ kmemcheck_bitfield_end(flags1); ++ ++ /* fields enclosed in headers_start/headers_end are copied ++@@ -639,6 +650,14 @@ struct sk_buff { ++ __u8 csum_level:2; ++ __u8 csum_bad:1; ++ +++#ifdef __BIG_ENDIAN_BITFIELD +++#define PKT_VLAN_PRESENT_BIT 7 +++#else +++#define PKT_VLAN_PRESENT_BIT 0 +++#endif +++#define PKT_VLAN_PRESENT_OFFSET() offsetof(struct sk_buff, __pkt_vlan_present_offset) +++ __u8 __pkt_vlan_present_offset[0]; +++ __u8 vlan_present:1; ++ #ifdef CONFIG_IPV6_NDISC_NODETYPE ++ __u8 ndisc_nodetype:2; ++ #endif ++@@ -647,7 +666,7 @@ struct sk_buff { ++ __u8 remcsum_offload:1; ++ __u8 gro_skip:1; ++ __u8 fast_forwarded:1; ++- /* 1 or 3 bit hole */ +++ /* 0 or 2 bit hole */ ++ ++ #ifdef CONFIG_NET_SCHED ++ __u16 tc_index; /* traffic control index */ ++@@ -805,6 +824,15 @@ static inline struct rtable *skb_rtable( ++ return (struct rtable *)skb_dst(skb); ++ } ++ +++/* For mangling skb->pkt_type from user space side from applications +++ * such as nft, tc, etc, we only allow a conservative subset of +++ * possible pkt_types to be set. 
+++*/ +++static inline bool skb_pkt_type_ok(u32 ptype) +++{ +++ return ptype <= PACKET_OTHERHOST; +++} +++ ++ void kfree_skb(struct sk_buff *skb); ++ void kfree_skb_list(struct sk_buff *segs); ++ void skb_tx_error(struct sk_buff *skb); ++@@ -2127,6 +2155,11 @@ static inline unsigned char *skb_mac_hea ++ return skb->head + skb->mac_header; ++ } ++ +++static inline u32 skb_mac_header_len(const struct sk_buff *skb) +++{ +++ return skb->network_header - skb->mac_header; +++} +++ ++ static inline int skb_mac_header_was_set(const struct sk_buff *skb) ++ { ++ return skb->mac_header != (typeof(skb->mac_header))~0U; ++@@ -2256,7 +2289,7 @@ static inline int pskb_network_may_pull( ++ ++ int ___pskb_trim(struct sk_buff *skb, unsigned int len); ++ ++-static inline void __skb_trim(struct sk_buff *skb, unsigned int len) +++static inline void __skb_set_length(struct sk_buff *skb, unsigned int len) ++ { ++ if (unlikely(skb_is_nonlinear(skb))) { ++ WARN_ON(1); ++@@ -2266,6 +2299,11 @@ static inline void __skb_trim(struct sk_ ++ skb_set_tail_pointer(skb, len); ++ } ++ +++static inline void __skb_trim(struct sk_buff *skb, unsigned int len) +++{ +++ __skb_set_length(skb, len); +++} +++ ++ void skb_trim(struct sk_buff *skb, unsigned int len); ++ ++ static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) ++@@ -2318,6 +2356,20 @@ static inline struct sk_buff *skb_reduce ++ return skb; ++ } ++ +++static inline int __skb_grow(struct sk_buff *skb, unsigned int len) +++{ +++ unsigned int diff = len - skb->len; +++ +++ if (skb_tailroom(skb) < diff) { +++ int ret = pskb_expand_head(skb, 0, diff - skb_tailroom(skb), +++ GFP_ATOMIC); +++ if (ret) +++ return ret; +++ } +++ __skb_set_length(skb, len); +++ return 0; +++} +++ ++ /** ++ * skb_orphan - orphan a buffer ++ * @skb: buffer to orphan ++@@ -2818,6 +2870,18 @@ static inline int skb_linearize_cow(stru ++ __skb_linearize(skb) : 0; ++ } ++ +++static __always_inline void +++__skb_postpull_rcsum(struct sk_buff *skb, const void *start, unsigned int len, +++ unsigned int off) +++{ +++ if (skb->ip_summed == CHECKSUM_COMPLETE) +++ skb->csum = csum_block_sub(skb->csum, +++ csum_partial(start, len, 0), off); +++ else if (skb->ip_summed == CHECKSUM_PARTIAL && +++ skb_checksum_start_offset(skb) < 0) +++ skb->ip_summed = CHECKSUM_NONE; +++} +++ ++ /** ++ * skb_postpull_rcsum - update checksum for received skb after pull ++ * @skb: buffer to update ++@@ -2828,36 +2892,38 @@ static inline int skb_linearize_cow(stru ++ * update the CHECKSUM_COMPLETE checksum, or set ip_summed to ++ * CHECKSUM_NONE so that it can be recomputed from scratch. 
++ */ ++- ++ static inline void skb_postpull_rcsum(struct sk_buff *skb, ++ const void *start, unsigned int len) ++ { ++- if (skb->ip_summed == CHECKSUM_COMPLETE) ++- skb->csum = csum_sub(skb->csum, csum_partial(start, len, 0)); ++- else if (skb->ip_summed == CHECKSUM_PARTIAL && ++- skb_checksum_start_offset(skb) < 0) ++- skb->ip_summed = CHECKSUM_NONE; +++ __skb_postpull_rcsum(skb, start, len, 0); ++ } ++ ++-unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); +++static __always_inline void +++__skb_postpush_rcsum(struct sk_buff *skb, const void *start, unsigned int len, +++ unsigned int off) +++{ +++ if (skb->ip_summed == CHECKSUM_COMPLETE) +++ skb->csum = csum_block_add(skb->csum, +++ csum_partial(start, len, 0), off); +++} ++ +++/** +++ * skb_postpush_rcsum - update checksum for received skb after push +++ * @skb: buffer to update +++ * @start: start of data after push +++ * @len: length of data pushed +++ * +++ * After doing a push on a received packet, you need to call this to +++ * update the CHECKSUM_COMPLETE checksum. +++ */ ++ static inline void skb_postpush_rcsum(struct sk_buff *skb, ++ const void *start, unsigned int len) ++ { ++- /* For performing the reverse operation to skb_postpull_rcsum(), ++- * we can instead of ... ++- * ++- * skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); ++- * ++- * ... just use this equivalent version here to save a few ++- * instructions. Feeding csum of 0 in csum_partial() and later ++- * on adding skb->csum is equivalent to feed skb->csum in the ++- * first place. ++- */ ++- if (skb->ip_summed == CHECKSUM_COMPLETE) ++- skb->csum = csum_partial(start, len, skb->csum); +++ __skb_postpush_rcsum(skb, start, len, 0); ++ } ++ +++unsigned char *skb_pull_rcsum(struct sk_buff *skb, unsigned int len); +++ ++ /** ++ * skb_push_rcsum - push skb and update receive checksum ++ * @skb: buffer to update ++@@ -2901,6 +2967,21 @@ static inline int pskb_trim_rcsum(struct ++ #define skb_rb_next(skb) rb_to_skb(rb_next(&(skb)->rbnode)) ++ #define skb_rb_prev(skb) rb_to_skb(rb_prev(&(skb)->rbnode)) ++ +++static inline int __skb_trim_rcsum(struct sk_buff *skb, unsigned int len) +++{ +++ if (skb->ip_summed == CHECKSUM_COMPLETE) +++ skb->ip_summed = CHECKSUM_NONE; +++ __skb_trim(skb, len); +++ return 0; +++} +++ +++static inline int __skb_grow_rcsum(struct sk_buff *skb, unsigned int len) +++{ +++ if (skb->ip_summed == CHECKSUM_COMPLETE) +++ skb->ip_summed = CHECKSUM_NONE; +++ return __skb_grow(skb, len); +++} +++ ++ #define skb_queue_walk(queue, skb) \ ++ for (skb = (queue)->next; \ ++ skb != (struct sk_buff *)(queue); \ ++@@ -3662,6 +3743,13 @@ static inline bool skb_is_gso_v6(const s ++ return skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6; ++ } ++ +++static inline void skb_gso_reset(struct sk_buff *skb) +++{ +++ skb_shinfo(skb)->gso_size = 0; +++ skb_shinfo(skb)->gso_segs = 0; +++ skb_shinfo(skb)->gso_type = 0; +++} +++ ++ void __skb_warn_lro_forwarding(const struct sk_buff *skb); ++ ++ static inline bool skb_warn_if_lro(const struct sk_buff *skb) ++--- a/include/linux/if_arp.h +++++ b/include/linux/if_arp.h ++@@ -44,4 +44,21 @@ static inline int arp_hdr_len(struct net ++ return sizeof(struct arphdr) + (dev->addr_len + sizeof(u32)) * 2; ++ } ++ } +++ +++static inline bool dev_is_mac_header_xmit(const struct net_device *dev) +++{ +++ switch (dev->type) { +++ case ARPHRD_TUNNEL: +++ case ARPHRD_TUNNEL6: +++ case ARPHRD_SIT: +++ case ARPHRD_IPGRE: +++ case ARPHRD_VOID: +++ case ARPHRD_NONE: +++ case ARPHRD_RAWIP: +++ return false; +++ default: +++ 
return true; +++ } +++} +++ ++ #endif /* _LINUX_IF_ARP_H */ ++--- a/include/linux/if_vlan.h +++++ b/include/linux/if_vlan.h ++@@ -66,7 +66,6 @@ static inline struct vlan_ethhdr *vlan_e ++ #define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ ++ #define VLAN_PRIO_SHIFT 13 ++ #define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator */ ++-#define VLAN_TAG_PRESENT VLAN_CFI_MASK ++ #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ ++ #define VLAN_N_VID 4096 ++ ++@@ -78,8 +77,8 @@ static inline bool is_vlan_dev(struct ne ++ return dev->priv_flags & IFF_802_1Q_VLAN; ++ } ++ ++-#define skb_vlan_tag_present(__skb) ((__skb)->vlan_tci & VLAN_TAG_PRESENT) ++-#define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci & ~VLAN_TAG_PRESENT) +++#define skb_vlan_tag_present(__skb) ((__skb)->vlan_present) +++#define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) ++ #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) ++ ++ /** ++@@ -376,6 +375,31 @@ static inline struct sk_buff *vlan_inser ++ return skb; ++ } ++ +++/** +++ * __vlan_hwaccel_clear_tag - clear hardware accelerated VLAN info +++ * @skb: skbuff to clear +++ * +++ * Clears the VLAN information from @skb +++ */ +++static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) +++{ +++ skb->vlan_present = 0; +++} +++ +++/** +++ * __vlan_hwaccel_copy_tag - copy hardware accelerated VLAN info from another skb +++ * @dst: skbuff to copy to +++ * @src: skbuff to copy from +++ * +++ * Copies VLAN information from @src to @dst (for branchless code) +++ */ +++static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) +++{ +++ dst->vlan_present = src->vlan_present; +++ dst->vlan_proto = src->vlan_proto; +++ dst->vlan_tci = src->vlan_tci; +++} +++ ++ /* ++ * __vlan_hwaccel_push_inside - pushes vlan tag to the payload ++ * @skb: skbuff to tag ++@@ -390,7 +414,7 @@ static inline struct sk_buff *__vlan_hwa ++ skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, ++ skb_vlan_tag_get(skb)); ++ if (likely(skb)) ++- skb->vlan_tci = 0; +++ __vlan_hwaccel_clear_tag(skb); ++ return skb; ++ } ++ /* ++@@ -422,7 +446,8 @@ static inline void __vlan_hwaccel_put_ta ++ __be16 vlan_proto, u16 vlan_tci) ++ { ++ skb->vlan_proto = vlan_proto; ++- skb->vlan_tci = VLAN_TAG_PRESENT | vlan_tci; +++ skb->vlan_tci = vlan_tci; +++ skb->vlan_present = 1; ++ } ++ ++ /** ++--- a/include/net/checksum.h +++++ b/include/net/checksum.h ++@@ -120,6 +120,11 @@ static inline __wsum csum_partial_ext(co ++ ++ #define CSUM_MANGLED_0 ((__force __sum16)0xffff) ++ +++static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) +++{ +++ *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); +++} +++ ++ static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) ++ { ++ __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); ++--- a/lib/test_bpf.c +++++ b/lib/test_bpf.c ++@@ -38,6 +38,7 @@ ++ #define SKB_HASH 0x1234aaab ++ #define SKB_QUEUE_MAP 123 ++ #define SKB_VLAN_TCI 0xffff +++#define SKB_VLAN_PRESENT 1 ++ #define SKB_DEV_IFINDEX 577 ++ #define SKB_DEV_TYPE 588 ++ ++@@ -691,8 +692,8 @@ static struct bpf_test tests[] = { ++ CLASSIC, ++ { }, ++ { ++- { 1, SKB_VLAN_TCI & ~VLAN_TAG_PRESENT }, ++- { 10, SKB_VLAN_TCI & ~VLAN_TAG_PRESENT } +++ { 1, SKB_VLAN_TCI }, +++ { 10, SKB_VLAN_TCI } ++ }, ++ }, ++ { ++@@ -705,8 +706,8 @@ static struct bpf_test tests[] = { ++ CLASSIC, ++ { }, ++ { ++- { 1, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) }, ++- { 10, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) } +++ { 1, SKB_VLAN_PRESENT }, +++ { 10, SKB_VLAN_PRESENT 
} ++ }, ++ }, ++ { ++@@ -4432,8 +4433,8 @@ static struct bpf_test tests[] = { ++ CLASSIC, ++ { }, ++ { ++- { 1, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) }, ++- { 10, !!(SKB_VLAN_TCI & VLAN_TAG_PRESENT) } +++ { 1, SKB_VLAN_PRESENT }, +++ { 10, SKB_VLAN_PRESENT } ++ }, ++ .fill_helper = bpf_fill_maxinsns6, ++ }, ++@@ -5144,6 +5145,7 @@ static struct sk_buff *populate_skb(char ++ skb->hash = SKB_HASH; ++ skb->queue_mapping = SKB_QUEUE_MAP; ++ skb->vlan_tci = SKB_VLAN_TCI; +++ skb->vlan_present = SKB_VLAN_PRESENT; ++ skb->dev = &dev; ++ skb->dev->ifindex = SKB_DEV_IFINDEX; ++ skb->dev->type = SKB_DEV_TYPE; ++--- a/include/linux/netdevice.h +++++ b/include/linux/netdevice.h ++@@ -3171,6 +3171,21 @@ int __dev_forward_skb(struct net_device ++ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); ++ bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb); ++ +++static __always_inline int ____dev_forward_skb(struct net_device *dev, +++ struct sk_buff *skb) +++{ +++ if (skb_orphan_frags(skb, GFP_ATOMIC) || +++ unlikely(!is_skb_forwardable(dev, skb))) { +++ atomic_long_inc(&dev->rx_dropped); +++ kfree_skb(skb); +++ return NET_RX_DROP; +++ } +++ +++ skb_scrub_packet(skb, true); +++ skb->priority = 0; +++ return 0; +++} +++ ++ extern int netdev_budget; ++ ++ /* Called by rtnetlink.c:rtnl_unlock() */ ++--- a/net/openvswitch/actions.c +++++ b/net/openvswitch/actions.c ++@@ -246,7 +246,7 @@ static int push_vlan(struct sk_buff *skb ++ else ++ key->eth.tci = vlan->vlan_tci; ++ return skb_vlan_push(skb, vlan->vlan_tpid, ++- ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); +++ ntohs(vlan->vlan_tci) & ~VLAN_CFI_MASK); ++ } ++ ++ /* 'src' is already properly masked. */ ++--- a/net/openvswitch/flow.c +++++ b/net/openvswitch/flow.c ++@@ -318,7 +318,7 @@ static int parse_vlan(struct sk_buff *sk ++ return -ENOMEM; ++ ++ qp = (struct qtag_prefix *) skb->data; ++- key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT); +++ key->eth.tci = qp->tci | htons(VLAN_CFI_MASK); ++ __skb_pull(skb, sizeof(struct qtag_prefix)); ++ ++ return 0; ++--- a/net/openvswitch/flow.h +++++ b/net/openvswitch/flow.h ++@@ -69,7 +69,7 @@ struct sw_flow_key { ++ struct { ++ u8 src[ETH_ALEN]; /* Ethernet source address. */ ++ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ ++- __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ +++ __be16 tci; /* 0 if no VLAN, VLAN_CFI_MASK set otherwise. */ ++ __be16 type; /* Ethernet frame type. 
*/ ++ } eth; ++ union { ++--- a/net/openvswitch/flow_netlink.c +++++ b/net/openvswitch/flow_netlink.c ++@@ -925,11 +925,11 @@ static int ovs_key_from_nlattrs(struct n ++ __be16 tci; ++ ++ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); ++- if (!(tci & htons(VLAN_TAG_PRESENT))) { +++ if (!(tci & htons(VLAN_CFI_MASK))) { ++ if (is_mask) ++- OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_TAG_PRESENT bit."); +++ OVS_NLERR(log, "VLAN TCI mask does not have exact match for VLAN_CFI_MASK bit."); ++ else ++- OVS_NLERR(log, "VLAN TCI does not have VLAN_TAG_PRESENT bit set."); +++ OVS_NLERR(log, "VLAN TCI does not have VLAN_CFI_MASK bit set."); ++ ++ return -EINVAL; ++ } ++@@ -1209,7 +1209,7 @@ int ovs_nla_get_match(struct net *net, s ++ key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); ++ encap_valid = true; ++ ++- if (tci & htons(VLAN_TAG_PRESENT)) { +++ if (tci & htons(VLAN_CFI_MASK)) { ++ err = parse_flow_nlattrs(encap, a, &key_attrs, log); ++ if (err) ++ return err; ++@@ -1297,7 +1297,7 @@ int ovs_nla_get_match(struct net *net, s ++ if (a[OVS_KEY_ATTR_VLAN]) ++ tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); ++ ++- if (!(tci & htons(VLAN_TAG_PRESENT))) { +++ if (!(tci & htons(VLAN_CFI_MASK))) { ++ OVS_NLERR(log, "VLAN tag present bit must have an exact match (tci_mask=%x).", ++ ntohs(tci)); ++ err = -EINVAL; ++@@ -2272,7 +2272,7 @@ static int __ovs_nla_copy_actions(struct ++ vlan = nla_data(a); ++ if (vlan->vlan_tpid != htons(ETH_P_8021Q)) ++ return -EINVAL; ++- if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) +++ if (!(vlan->vlan_tci & htons(VLAN_CFI_MASK))) ++ return -EINVAL; ++ vlan_tci = vlan->vlan_tci; ++ break; ++@@ -2288,7 +2288,7 @@ static int __ovs_nla_copy_actions(struct ++ /* Prohibit push MPLS other than to a white list ++ * for packets that have a known tag order. ++ */ ++- if (vlan_tci & htons(VLAN_TAG_PRESENT) || +++ if (vlan_tci & htons(VLAN_CFI_MASK) || ++ (eth_type != htons(ETH_P_IP) && ++ eth_type != htons(ETH_P_IPV6) && ++ eth_type != htons(ETH_P_ARP) && ++@@ -2300,7 +2300,7 @@ static int __ovs_nla_copy_actions(struct ++ } ++ ++ case OVS_ACTION_ATTR_POP_MPLS: ++- if (vlan_tci & htons(VLAN_TAG_PRESENT) || +++ if (vlan_tci & htons(VLAN_CFI_MASK) || ++ !eth_p_mpls(eth_type)) ++ return -EINVAL; ++ ++--- a/net/sched/act_bpf.c +++++ b/net/sched/act_bpf.c ++@@ -220,7 +220,7 @@ static int tcf_bpf_init_from_efd(struct ++ ++ bpf_fd = nla_get_u32(tb[TCA_ACT_BPF_FD]); ++ ++- fp = bpf_prog_get(bpf_fd); +++ fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_ACT, false); ++ if (IS_ERR(fp)) ++ return PTR_ERR(fp); ++ ++--- a/net/sched/cls_bpf.c +++++ b/net/sched/cls_bpf.c ++@@ -267,7 +267,7 @@ static int cls_bpf_prog_from_efd(struct ++ ++ bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); ++ ++- fp = bpf_prog_get(bpf_fd); +++ fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, false); ++ if (IS_ERR(fp)) ++ return PTR_ERR(fp); ++ diff --git a/toolchain/kernel-headers/Makefile b/toolchain/kernel-headers/Makefile index c33f26d46d..06236b5a47 100644 --- a/toolchain/kernel-headers/Makefile