diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 1e5760d567ae..6aa2dc836db1 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -201,6 +201,8 @@ enum aarch64_insn_size_type {
 enum aarch64_insn_ldst_type {
 	AARCH64_INSN_LDST_LOAD_REG_OFFSET,
 	AARCH64_INSN_LDST_STORE_REG_OFFSET,
+	AARCH64_INSN_LDST_LOAD_IMM_OFFSET,
+	AARCH64_INSN_LDST_STORE_IMM_OFFSET,
 	AARCH64_INSN_LDST_LOAD_PAIR_PRE_INDEX,
 	AARCH64_INSN_LDST_STORE_PAIR_PRE_INDEX,
 	AARCH64_INSN_LDST_LOAD_PAIR_POST_INDEX,
@@ -335,6 +337,7 @@ __AARCH64_INSN_FUNCS(load_pre,	0x3FE00C00, 0x38400C00)
 __AARCH64_INSN_FUNCS(store_post,	0x3FE00C00, 0x38000400)
 __AARCH64_INSN_FUNCS(load_post,	0x3FE00C00, 0x38400400)
 __AARCH64_INSN_FUNCS(str_reg,	0x3FE0EC00, 0x38206800)
+__AARCH64_INSN_FUNCS(str_imm,	0x3FC00000, 0x39000000)
 __AARCH64_INSN_FUNCS(ldadd,	0x3F20FC00, 0x38200000)
 __AARCH64_INSN_FUNCS(ldclr,	0x3F20FC00, 0x38201000)
 __AARCH64_INSN_FUNCS(ldeor,	0x3F20FC00, 0x38202000)
@@ -342,6 +345,7 @@ __AARCH64_INSN_FUNCS(ldset,	0x3F20FC00, 0x38203000)
 __AARCH64_INSN_FUNCS(swp,	0x3F20FC00, 0x38208000)
 __AARCH64_INSN_FUNCS(cas,	0x3FA07C00, 0x08A07C00)
 __AARCH64_INSN_FUNCS(ldr_reg,	0x3FE0EC00, 0x38606800)
+__AARCH64_INSN_FUNCS(ldr_imm,	0x3FC00000, 0x39400000)
 __AARCH64_INSN_FUNCS(ldr_lit,	0xBF000000, 0x18000000)
 __AARCH64_INSN_FUNCS(ldrsw_lit,	0xFF000000, 0x98000000)
 __AARCH64_INSN_FUNCS(exclusive,	0x3F800000, 0x08000000)
@@ -501,6 +505,11 @@ u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg,
 				    enum aarch64_insn_register offset,
 				    enum aarch64_insn_size_type size,
 				    enum aarch64_insn_ldst_type type);
+u32 aarch64_insn_gen_load_store_imm(enum aarch64_insn_register reg,
+				    enum aarch64_insn_register base,
+				    unsigned int imm,
+				    enum aarch64_insn_size_type size,
+				    enum aarch64_insn_ldst_type type);
 u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1,
 				     enum aarch64_insn_register reg2,
 				     enum aarch64_insn_register base,
diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c
index 5e90887deec4..695d7368fadc 100644
--- a/arch/arm64/lib/insn.c
+++ b/arch/arm64/lib/insn.c
@@ -299,29 +299,24 @@ static u32 aarch64_insn_encode_register(enum aarch64_insn_register_type type,
 	return insn;
 }
 
+static const u32 aarch64_insn_ldst_size[] = {
+	[AARCH64_INSN_SIZE_8] = 0,
+	[AARCH64_INSN_SIZE_16] = 1,
+	[AARCH64_INSN_SIZE_32] = 2,
+	[AARCH64_INSN_SIZE_64] = 3,
+};
+
 static u32 aarch64_insn_encode_ldst_size(enum aarch64_insn_size_type type,
 					 u32 insn)
 {
 	u32 size;
 
-	switch (type) {
-	case AARCH64_INSN_SIZE_8:
-		size = 0;
-		break;
-	case AARCH64_INSN_SIZE_16:
-		size = 1;
-		break;
-	case AARCH64_INSN_SIZE_32:
-		size = 2;
-		break;
-	case AARCH64_INSN_SIZE_64:
-		size = 3;
-		break;
-	default:
+	if (type < AARCH64_INSN_SIZE_8 || type > AARCH64_INSN_SIZE_64) {
 		pr_err("%s: unknown size encoding %d\n", __func__, type);
 		return AARCH64_BREAK_FAULT;
 	}
 
+	size = aarch64_insn_ldst_size[type];
 	insn &= ~GENMASK(31, 30);
 	insn |= size << 30;
 
@@ -504,6 +499,50 @@ u32 aarch64_insn_gen_load_store_reg(enum aarch64_insn_register reg,
 					    offset);
 }
 
+u32 aarch64_insn_gen_load_store_imm(enum aarch64_insn_register reg,
+				    enum aarch64_insn_register base,
+				    unsigned int imm,
+				    enum aarch64_insn_size_type size,
+				    enum aarch64_insn_ldst_type type)
+{
+	u32 insn;
+	u32 shift;
+
+	if (size < AARCH64_INSN_SIZE_8 || size > AARCH64_INSN_SIZE_64) {
+		pr_err("%s: unknown size encoding %d\n", __func__, size);
+		return AARCH64_BREAK_FAULT;
+	}
+
+	shift = aarch64_insn_ldst_size[size];
+	if (imm & ~(BIT(12 + shift) - BIT(shift))) {
+		pr_err("%s: 
invalid imm: %d\n", __func__, imm); + return AARCH64_BREAK_FAULT; + } + + imm >>= shift; + + switch (type) { + case AARCH64_INSN_LDST_LOAD_IMM_OFFSET: + insn = aarch64_insn_get_ldr_imm_value(); + break; + case AARCH64_INSN_LDST_STORE_IMM_OFFSET: + insn = aarch64_insn_get_str_imm_value(); + break; + default: + pr_err("%s: unknown load/store encoding %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } + + insn = aarch64_insn_encode_ldst_size(size, insn); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RT, insn, reg); + + insn = aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RN, insn, + base); + + return aarch64_insn_encode_immediate(AARCH64_INSN_IMM_12, insn, imm); +} + u32 aarch64_insn_gen_load_store_pair(enum aarch64_insn_register reg1, enum aarch64_insn_register reg2, enum aarch64_insn_register base, diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h index dd59b5ad8fe4..194c95ccc1cf 100644 --- a/arch/arm64/net/bpf_jit.h +++ b/arch/arm64/net/bpf_jit.h @@ -66,6 +66,20 @@ #define A64_STR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, STORE) #define A64_LDR64(Xt, Xn, Xm) A64_LS_REG(Xt, Xn, Xm, 64, LOAD) +/* Load/store register (immediate offset) */ +#define A64_LS_IMM(Rt, Rn, imm, size, type) \ + aarch64_insn_gen_load_store_imm(Rt, Rn, imm, \ + AARCH64_INSN_SIZE_##size, \ + AARCH64_INSN_LDST_##type##_IMM_OFFSET) +#define A64_STRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, STORE) +#define A64_LDRBI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 8, LOAD) +#define A64_STRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, STORE) +#define A64_LDRHI(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 16, LOAD) +#define A64_STR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, STORE) +#define A64_LDR32I(Wt, Xn, imm) A64_LS_IMM(Wt, Xn, imm, 32, LOAD) +#define A64_STR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, STORE) +#define A64_LDR64I(Xt, Xn, imm) A64_LS_IMM(Xt, Xn, imm, 64, LOAD) + /* Load/store register pair */ #define A64_LS_PAIR(Rt, Rt2, Rn, offset, ls, type) \ aarch64_insn_gen_load_store_pair(Rt, Rt2, Rn, offset, \ @@ -249,6 +263,9 @@ /* HINTs */ #define A64_HINT(x) aarch64_insn_gen_hint(x) +#define A64_PACIASP A64_HINT(AARCH64_INSN_HINT_PACIASP) +#define A64_AUTIASP A64_HINT(AARCH64_INSN_HINT_AUTIASP) + /* BTI */ #define A64_BTI_C A64_HINT(AARCH64_INSN_HINT_BTIC) #define A64_BTI_J A64_HINT(AARCH64_INSN_HINT_BTIJ) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index fcc675aa1670..8ab4035dea27 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -26,6 +26,7 @@ #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) #define TCALL_CNT (MAX_BPF_JIT_REG + 2) #define TMP_REG_3 (MAX_BPF_JIT_REG + 3) +#define FP_BOTTOM (MAX_BPF_JIT_REG + 4) #define check_imm(bits, imm) do { \ if ((((imm) > 0) && ((imm) >> (bits))) || \ @@ -63,6 +64,7 @@ static const int bpf2a64[] = { [TCALL_CNT] = A64_R(26), /* temporary register for blinding constants */ [BPF_REG_AX] = A64_R(9), + [FP_BOTTOM] = A64_R(27), }; struct jit_ctx { @@ -73,6 +75,7 @@ struct jit_ctx { int exentry_idx; __le32 *image; u32 stack_size; + int fpb_offset; }; static inline void emit(const u32 insn, struct jit_ctx *ctx) @@ -191,11 +194,53 @@ static bool is_addsub_imm(u32 imm) return !(imm & ~0xfff) || !(imm & ~0xfff000); } +/* + * There are 3 types of AArch64 LDR/STR (immediate) instruction: + * Post-index, Pre-index, Unsigned offset. + * + * For BPF ldr/str, the "unsigned offset" type is sufficient. 
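 *
 * For example (an illustrative encoding, not one this comment's code
 * emits verbatim): LDR X0, [X1, #32] is an 8-byte access, so the
 * offset is scaled by 8 and encoded as imm12 = 32 / 8 = 4, giving the
 * instruction word 0xF9401020.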
+ * + * "Unsigned offset" type LDR(immediate) format: + * + * 3 2 1 0 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |x x|1 1 1 0 0 1 0 1| imm12 | Rn | Rt | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * scale + * + * "Unsigned offset" type STR(immediate) format: + * 3 2 1 0 + * 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 9 8 7 6 5 4 3 2 1 0 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |x x|1 1 1 0 0 1 0 0| imm12 | Rn | Rt | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * scale + * + * The offset is calculated from imm12 and scale in the following way: + * + * offset = (u64)imm12 << scale + */ +static bool is_lsi_offset(int offset, int scale) +{ + if (offset < 0) + return false; + + if (offset > (0xFFF << scale)) + return false; + + if (offset & ((1 << scale) - 1)) + return false; + + return true; +} + /* Tail call offset to jump into */ -#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) -#define PROLOGUE_OFFSET 8 +#if IS_ENABLED(CONFIG_ARM64_BTI_KERNEL) || \ + IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL) +#define PROLOGUE_OFFSET 9 #else -#define PROLOGUE_OFFSET 7 +#define PROLOGUE_OFFSET 8 #endif static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) @@ -207,6 +252,7 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) const u8 r9 = bpf2a64[BPF_REG_9]; const u8 fp = bpf2a64[BPF_REG_FP]; const u8 tcc = bpf2a64[TCALL_CNT]; + const u8 fpb = bpf2a64[FP_BOTTOM]; const int idx0 = ctx->idx; int cur_offset; @@ -233,8 +279,11 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) * */ + /* Sign lr */ + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL)) + emit(A64_PACIASP, ctx); /* BTI landing pad */ - if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) + else if (IS_ENABLED(CONFIG_ARM64_BTI_KERNEL)) emit(A64_BTI_C, ctx); /* Save FP and LR registers to stay align with ARM64 AAPCS */ @@ -245,6 +294,7 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) emit(A64_PUSH(r6, r7, A64_SP), ctx); emit(A64_PUSH(r8, r9, A64_SP), ctx); emit(A64_PUSH(fp, tcc, A64_SP), ctx); + emit(A64_PUSH(fpb, A64_R(28), A64_SP), ctx); /* Set up BPF prog stack base register */ emit(A64_MOV(1, fp, A64_SP), ctx); @@ -265,6 +315,8 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) emit(A64_BTI_J, ctx); } + emit(A64_SUB_I(1, fpb, fp, ctx->fpb_offset), ctx); + /* Stack must be multiples of 16B */ ctx->stack_size = round_up(prog->aux->stack_depth, 16); @@ -512,10 +564,13 @@ static void build_epilogue(struct jit_ctx *ctx) const u8 r8 = bpf2a64[BPF_REG_8]; const u8 r9 = bpf2a64[BPF_REG_9]; const u8 fp = bpf2a64[BPF_REG_FP]; + const u8 fpb = bpf2a64[FP_BOTTOM]; /* We're done with BPF stack */ emit(A64_ADD_I(1, A64_SP, A64_SP, ctx->stack_size), ctx); + /* Restore x27 and x28 */ + emit(A64_POP(fpb, A64_R(28), A64_SP), ctx); /* Restore fs (x25) and x26 */ emit(A64_POP(fp, A64_R(26), A64_SP), ctx); @@ -529,6 +584,10 @@ static void build_epilogue(struct jit_ctx *ctx) /* Set return value */ emit(A64_MOV(1, A64_R(0), r0), ctx); + /* Authenticate lr */ + if (IS_ENABLED(CONFIG_ARM64_PTR_AUTH_KERNEL)) + emit(A64_AUTIASP, ctx); + emit(A64_RET(A64_LR), ctx); } @@ -609,6 +668,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, const u8 src = bpf2a64[insn->src_reg]; const u8 tmp = bpf2a64[TMP_REG_1]; const u8 tmp2 = bpf2a64[TMP_REG_2]; + const u8 fp = bpf2a64[BPF_REG_FP]; + const u8 fpb = 
bpf2a64[FP_BOTTOM]; const s16 off = insn->off; const s32 imm = insn->imm; const int i = insn - ctx->prog->insnsi; @@ -617,6 +678,9 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, u8 jmp_cond; s32 jmp_offset; u32 a64_insn; + u8 src_adj; + u8 dst_adj; + int off_adj; int ret; switch (code) { @@ -971,19 +1035,45 @@ emit_cond_jmp: case BPF_LDX | BPF_PROBE_MEM | BPF_W: case BPF_LDX | BPF_PROBE_MEM | BPF_H: case BPF_LDX | BPF_PROBE_MEM | BPF_B: - emit_a64_mov_i(1, tmp, off, ctx); + if (ctx->fpb_offset > 0 && src == fp) { + src_adj = fpb; + off_adj = off + ctx->fpb_offset; + } else { + src_adj = src; + off_adj = off; + } switch (BPF_SIZE(code)) { case BPF_W: - emit(A64_LDR32(dst, src, tmp), ctx); + if (is_lsi_offset(off_adj, 2)) { + emit(A64_LDR32I(dst, src_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_LDR32(dst, src, tmp), ctx); + } break; case BPF_H: - emit(A64_LDRH(dst, src, tmp), ctx); + if (is_lsi_offset(off_adj, 1)) { + emit(A64_LDRHI(dst, src_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_LDRH(dst, src, tmp), ctx); + } break; case BPF_B: - emit(A64_LDRB(dst, src, tmp), ctx); + if (is_lsi_offset(off_adj, 0)) { + emit(A64_LDRBI(dst, src_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_LDRB(dst, src, tmp), ctx); + } break; case BPF_DW: - emit(A64_LDR64(dst, src, tmp), ctx); + if (is_lsi_offset(off_adj, 3)) { + emit(A64_LDR64I(dst, src_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_LDR64(dst, src, tmp), ctx); + } break; } @@ -1010,21 +1100,47 @@ emit_cond_jmp: case BPF_ST | BPF_MEM | BPF_H: case BPF_ST | BPF_MEM | BPF_B: case BPF_ST | BPF_MEM | BPF_DW: + if (ctx->fpb_offset > 0 && dst == fp) { + dst_adj = fpb; + off_adj = off + ctx->fpb_offset; + } else { + dst_adj = dst; + off_adj = off; + } /* Load imm to a register then store it */ - emit_a64_mov_i(1, tmp2, off, ctx); emit_a64_mov_i(1, tmp, imm, ctx); switch (BPF_SIZE(code)) { case BPF_W: - emit(A64_STR32(tmp, dst, tmp2), ctx); + if (is_lsi_offset(off_adj, 2)) { + emit(A64_STR32I(tmp, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp2, off, ctx); + emit(A64_STR32(tmp, dst, tmp2), ctx); + } break; case BPF_H: - emit(A64_STRH(tmp, dst, tmp2), ctx); + if (is_lsi_offset(off_adj, 1)) { + emit(A64_STRHI(tmp, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp2, off, ctx); + emit(A64_STRH(tmp, dst, tmp2), ctx); + } break; case BPF_B: - emit(A64_STRB(tmp, dst, tmp2), ctx); + if (is_lsi_offset(off_adj, 0)) { + emit(A64_STRBI(tmp, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp2, off, ctx); + emit(A64_STRB(tmp, dst, tmp2), ctx); + } break; case BPF_DW: - emit(A64_STR64(tmp, dst, tmp2), ctx); + if (is_lsi_offset(off_adj, 3)) { + emit(A64_STR64I(tmp, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp2, off, ctx); + emit(A64_STR64(tmp, dst, tmp2), ctx); + } break; } break; @@ -1034,19 +1150,45 @@ emit_cond_jmp: case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_B: case BPF_STX | BPF_MEM | BPF_DW: - emit_a64_mov_i(1, tmp, off, ctx); + if (ctx->fpb_offset > 0 && dst == fp) { + dst_adj = fpb; + off_adj = off + ctx->fpb_offset; + } else { + dst_adj = dst; + off_adj = off; + } switch (BPF_SIZE(code)) { case BPF_W: - emit(A64_STR32(src, dst, tmp), ctx); + if (is_lsi_offset(off_adj, 2)) { + emit(A64_STR32I(src, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_STR32(src, dst, tmp), ctx); + } break; case BPF_H: - 
emit(A64_STRH(src, dst, tmp), ctx); + if (is_lsi_offset(off_adj, 1)) { + emit(A64_STRHI(src, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_STRH(src, dst, tmp), ctx); + } break; case BPF_B: - emit(A64_STRB(src, dst, tmp), ctx); + if (is_lsi_offset(off_adj, 0)) { + emit(A64_STRBI(src, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_STRB(src, dst, tmp), ctx); + } break; case BPF_DW: - emit(A64_STR64(src, dst, tmp), ctx); + if (is_lsi_offset(off_adj, 3)) { + emit(A64_STR64I(src, dst_adj, off_adj), ctx); + } else { + emit_a64_mov_i(1, tmp, off, ctx); + emit(A64_STR64(src, dst, tmp), ctx); + } break; } break; @@ -1069,6 +1211,79 @@ emit_cond_jmp: return 0; } +/* + * Return 0 if FP may change at runtime, otherwise find the minimum negative + * offset to FP, converts it to positive number, and align down to 8 bytes. + */ +static int find_fpb_offset(struct bpf_prog *prog) +{ + int i; + int offset = 0; + + for (i = 0; i < prog->len; i++) { + const struct bpf_insn *insn = &prog->insnsi[i]; + const u8 class = BPF_CLASS(insn->code); + const u8 mode = BPF_MODE(insn->code); + const u8 src = insn->src_reg; + const u8 dst = insn->dst_reg; + const s32 imm = insn->imm; + const s16 off = insn->off; + + switch (class) { + case BPF_STX: + case BPF_ST: + /* fp holds atomic operation result */ + if (class == BPF_STX && mode == BPF_ATOMIC && + ((imm == BPF_XCHG || + imm == (BPF_FETCH | BPF_ADD) || + imm == (BPF_FETCH | BPF_AND) || + imm == (BPF_FETCH | BPF_XOR) || + imm == (BPF_FETCH | BPF_OR)) && + src == BPF_REG_FP)) + return 0; + + if (mode == BPF_MEM && dst == BPF_REG_FP && + off < offset) + offset = insn->off; + break; + + case BPF_JMP32: + case BPF_JMP: + break; + + case BPF_LDX: + case BPF_LD: + /* fp holds load result */ + if (dst == BPF_REG_FP) + return 0; + + if (class == BPF_LDX && mode == BPF_MEM && + src == BPF_REG_FP && off < offset) + offset = off; + break; + + case BPF_ALU: + case BPF_ALU64: + default: + /* fp holds ALU result */ + if (dst == BPF_REG_FP) + return 0; + } + } + + if (offset < 0) { + /* + * safely be converted to a positive 'int', since insn->off + * is 's16' + */ + offset = -offset; + /* align down to 8 bytes */ + offset = ALIGN_DOWN(offset, 8); + } + + return offset; +} + static int build_body(struct jit_ctx *ctx, bool extra_pass) { const struct bpf_prog *prog = ctx->prog; @@ -1190,6 +1405,8 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) goto out_off; } + ctx.fpb_offset = find_fpb_offset(prog); + /* * 1. Initial fake pass to compute ctx->idx and ctx->offset. * diff --git a/include/uapi/linux/btf.h b/include/uapi/linux/btf.h index b0d8fea1951d..a9162a6c0284 100644 --- a/include/uapi/linux/btf.h +++ b/include/uapi/linux/btf.h @@ -33,8 +33,8 @@ struct btf_type { /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused - * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-30: unused + * bits 24-28: kind (e.g. 
int, ptr, array...etc) + * bits 29-30: unused * bit 31: kind_flag, currently used by * struct, union and fwd */ diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 110029ede71e..dea920b3b840 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -330,35 +330,34 @@ static void cache_btf_id(struct bpf_iter_target_info *tinfo, bool bpf_iter_prog_supported(struct bpf_prog *prog) { const char *attach_fname = prog->aux->attach_func_name; + struct bpf_iter_target_info *tinfo = NULL, *iter; u32 prog_btf_id = prog->aux->attach_btf_id; const char *prefix = BPF_ITER_FUNC_PREFIX; - struct bpf_iter_target_info *tinfo; int prefix_len = strlen(prefix); - bool supported = false; if (strncmp(attach_fname, prefix, prefix_len)) return false; mutex_lock(&targets_mutex); - list_for_each_entry(tinfo, &targets, list) { - if (tinfo->btf_id && tinfo->btf_id == prog_btf_id) { - supported = true; + list_for_each_entry(iter, &targets, list) { + if (iter->btf_id && iter->btf_id == prog_btf_id) { + tinfo = iter; break; } - if (!strcmp(attach_fname + prefix_len, tinfo->reg_info->target)) { - cache_btf_id(tinfo, prog); - supported = true; + if (!strcmp(attach_fname + prefix_len, iter->reg_info->target)) { + cache_btf_id(iter, prog); + tinfo = iter; break; } } mutex_unlock(&targets_mutex); - if (supported) { + if (tinfo) { prog->aux->ctx_arg_info_size = tinfo->reg_info->ctx_arg_info_size; prog->aux->ctx_arg_info = tinfo->reg_info->ctx_arg_info; } - return supported; + return tinfo != NULL; } const struct bpf_func_proto * @@ -499,12 +498,11 @@ bool bpf_link_is_iter(struct bpf_link *link) int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, struct bpf_prog *prog) { + struct bpf_iter_target_info *tinfo = NULL, *iter; struct bpf_link_primer link_primer; - struct bpf_iter_target_info *tinfo; union bpf_iter_link_info linfo; struct bpf_iter_link *link; u32 prog_btf_id, linfo_len; - bool existed = false; bpfptr_t ulinfo; int err; @@ -530,14 +528,14 @@ int bpf_iter_link_attach(const union bpf_attr *attr, bpfptr_t uattr, prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); - list_for_each_entry(tinfo, &targets, list) { - if (tinfo->btf_id == prog_btf_id) { - existed = true; + list_for_each_entry(iter, &targets, list) { + if (iter->btf_id == prog_btf_id) { + tinfo = iter; break; } } mutex_unlock(&targets_mutex); - if (!existed) + if (!tinfo) return -ENOENT; link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 34725bfa1e97..1dd5266fbebb 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -100,13 +100,11 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); - cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); - smap->map.value_size = value_size; smap->n_buckets = n_buckets; err = get_callchain_buffers(sysctl_perf_event_max_stack); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d175b70067b3..9c1a02b82ecd 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4861,6 +4861,11 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot 
write into %s\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } return check_mem_region_access(env, regno, reg->off, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: @@ -4871,13 +4876,23 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MEM: + if (type_is_rdonly_mem(reg->type)) { + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); + return -EACCES; + } + } return check_mem_region_access(env, regno, reg->off, access_size, reg->mem_size, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) + if (meta && meta->raw_mode) { + verbose(env, "R%d cannot write into %s\n", regno, + reg_type_str(env, reg->type)); return -EACCES; + } max_access = &env->prog->aux->max_rdonly_access; } else { @@ -4919,8 +4934,7 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, * out. Only upper bounds can be learned because retval is an * int type and negative retvals are allowed. */ - if (meta) - meta->msize_max_value = reg->umax_value; + meta->msize_max_value = reg->umax_value; /* The register is SCALAR_VALUE; the access check * happens using its boundaries. @@ -4963,24 +4977,33 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, u32 mem_size) { + bool may_be_null = type_may_be_null(reg->type); + struct bpf_reg_state saved_reg; + struct bpf_call_arg_meta meta; + int err; + if (register_is_null(reg)) return 0; - if (type_may_be_null(reg->type)) { - /* Assuming that the register contains a value check if the memory - * access is safe. Temporarily save and restore the register's state as - * the conversion shouldn't be visible to a caller. - */ - const struct bpf_reg_state saved_reg = *reg; - int rv; - + memset(&meta, 0, sizeof(meta)); + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. 
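	 *
	 * The access is then checked twice: once with raw_mode clear, for
	 * helpers that only read the memory, and once more with raw_mode
	 * set, so that helpers that may write into it are caught as well.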
+ */ + if (may_be_null) { + saved_reg = *reg; mark_ptr_not_null_reg(reg); - rv = check_helper_mem_access(env, regno, mem_size, true, NULL); - *reg = saved_reg; - return rv; } - return check_helper_mem_access(env, regno, mem_size, true, NULL); + err = check_helper_mem_access(env, regno, mem_size, true, &meta); + /* Check access for BPF_WRITE */ + meta.raw_mode = true; + err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta); + + if (may_be_null) + *reg = saved_reg; + + return err; } int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, @@ -4989,16 +5012,22 @@ int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state struct bpf_reg_state *mem_reg = &cur_regs(env)[regno - 1]; bool may_be_null = type_may_be_null(mem_reg->type); struct bpf_reg_state saved_reg; + struct bpf_call_arg_meta meta; int err; WARN_ON_ONCE(regno < BPF_REG_2 || regno > BPF_REG_5); + memset(&meta, 0, sizeof(meta)); + if (may_be_null) { saved_reg = *mem_reg; mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, true, NULL); + err = check_mem_size_reg(env, reg, regno, true, &meta); + /* Check access for BPF_WRITE */ + meta.raw_mode = true; + err = err ?: check_mem_size_reg(env, reg, regno, true, &meta); if (may_be_null) *mem_reg = saved_reg; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d8553f46caa2..b26f3da943de 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2254,15 +2254,13 @@ static void bpf_kprobe_multi_cookie_swap(void *a, void *b, int size, const void const struct bpf_kprobe_multi_link *link = priv; unsigned long *addr_a = a, *addr_b = b; u64 *cookie_a, *cookie_b; - unsigned long tmp1; - u64 tmp2; cookie_a = link->cookies + (addr_a - link->addrs); cookie_b = link->cookies + (addr_b - link->addrs); /* swap addr_a/addr_b and cookie_a/cookie_b values */ - tmp1 = *addr_a; *addr_a = *addr_b; *addr_b = tmp1; - tmp2 = *cookie_a; *cookie_a = *cookie_b; *cookie_b = tmp2; + swap(*addr_a, *addr_b); + swap(*cookie_a, *cookie_b); } static int __bpf_kprobe_multi_cookie_cmp(const void *a, const void *b) diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 0c5cb2d6436a..2a7836e115b4 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -53,6 +53,7 @@ #define FLAG_EXPECTED_FAIL BIT(1) #define FLAG_SKB_FRAG BIT(2) #define FLAG_VERIFIER_ZEXT BIT(3) +#define FLAG_LARGE_MEM BIT(4) enum { CLASSIC = BIT(6), /* Old BPF instructions only. 
*/ @@ -7838,7 +7839,7 @@ static struct bpf_test tests[] = { }, /* BPF_LDX_MEM B/H/W/DW */ { - "BPF_LDX_MEM | BPF_B", + "BPF_LDX_MEM | BPF_B, base", .u.insns_int = { BPF_LD_IMM64(R1, 0x0102030405060708ULL), BPF_LD_IMM64(R2, 0x0000000000000008ULL), @@ -7878,7 +7879,56 @@ static struct bpf_test tests[] = { .stack_depth = 8, }, { - "BPF_LDX_MEM | BPF_H", + "BPF_LDX_MEM | BPF_B, negative offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000000088ULL), + BPF_ALU64_IMM(BPF_ADD, R1, 512), + BPF_STX_MEM(BPF_B, R1, R2, -256), + BPF_LDX_MEM(BPF_B, R0, R1, -256), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_B, small positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000000088ULL), + BPF_STX_MEM(BPF_B, R1, R2, 256), + BPF_LDX_MEM(BPF_B, R0, R1, 256), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_B, large positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000000088ULL), + BPF_STX_MEM(BPF_B, R1, R2, 4096), + BPF_LDX_MEM(BPF_B, R0, R1, 4096), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 4096 + 16, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_H, base", .u.insns_int = { BPF_LD_IMM64(R1, 0x0102030405060708ULL), BPF_LD_IMM64(R2, 0x0000000000000708ULL), @@ -7918,7 +7968,72 @@ static struct bpf_test tests[] = { .stack_depth = 8, }, { - "BPF_LDX_MEM | BPF_W", + "BPF_LDX_MEM | BPF_H, negative offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000008788ULL), + BPF_ALU64_IMM(BPF_ADD, R1, 512), + BPF_STX_MEM(BPF_H, R1, R2, -256), + BPF_LDX_MEM(BPF_H, R0, R1, -256), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_H, small positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000008788ULL), + BPF_STX_MEM(BPF_H, R1, R2, 256), + BPF_LDX_MEM(BPF_H, R0, R1, 256), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_H, large positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000008788ULL), + BPF_STX_MEM(BPF_H, R1, R2, 8192), + BPF_LDX_MEM(BPF_H, R0, R1, 8192), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 8192 + 16, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_H, unaligned positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000000008788ULL), + BPF_STX_MEM(BPF_H, R1, R2, 13), + BPF_LDX_MEM(BPF_H, R0, R1, 13), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 32, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_W, base", .u.insns_int = { BPF_LD_IMM64(R1, 0x0102030405060708ULL), 
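			/* R2 = value the 32-bit load below should return */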
BPF_LD_IMM64(R2, 0x0000000005060708ULL), @@ -7957,6 +8072,162 @@ static struct bpf_test tests[] = { { { 0, 0 } }, .stack_depth = 8, }, + { + "BPF_LDX_MEM | BPF_W, negative offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000085868788ULL), + BPF_ALU64_IMM(BPF_ADD, R1, 512), + BPF_STX_MEM(BPF_W, R1, R2, -256), + BPF_LDX_MEM(BPF_W, R0, R1, -256), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_W, small positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000085868788ULL), + BPF_STX_MEM(BPF_W, R1, R2, 256), + BPF_LDX_MEM(BPF_W, R0, R1, 256), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_W, large positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000085868788ULL), + BPF_STX_MEM(BPF_W, R1, R2, 16384), + BPF_LDX_MEM(BPF_W, R0, R1, 16384), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 16384 + 16, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_W, unaligned positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_LD_IMM64(R3, 0x0000000085868788ULL), + BPF_STX_MEM(BPF_W, R1, R2, 13), + BPF_LDX_MEM(BPF_W, R0, R1, 13), + BPF_JMP_REG(BPF_JNE, R0, R3, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 32, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_DW, base", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x0102030405060708ULL), + BPF_STX_MEM(BPF_DW, R10, R1, -8), + BPF_LDX_MEM(BPF_DW, R0, R10, -8), + BPF_JMP_REG(BPF_JNE, R0, R1, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0 } }, + .stack_depth = 8, + }, + { + "BPF_LDX_MEM | BPF_DW, MSB set", + .u.insns_int = { + BPF_LD_IMM64(R1, 0x8182838485868788ULL), + BPF_STX_MEM(BPF_DW, R10, R1, -8), + BPF_LDX_MEM(BPF_DW, R0, R10, -8), + BPF_JMP_REG(BPF_JNE, R0, R1, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0 } }, + .stack_depth = 8, + }, + { + "BPF_LDX_MEM | BPF_DW, negative offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_ALU64_IMM(BPF_ADD, R1, 512), + BPF_STX_MEM(BPF_DW, R1, R2, -256), + BPF_LDX_MEM(BPF_DW, R0, R1, -256), + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | BPF_DW, small positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_STX_MEM(BPF_DW, R1, R2, 256), + BPF_LDX_MEM(BPF_DW, R0, R1, 256), + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 512, 0 } }, + .stack_depth = 8, + }, + { + "BPF_LDX_MEM | BPF_DW, large positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_STX_MEM(BPF_DW, R1, R2, 32760), + BPF_LDX_MEM(BPF_DW, R0, R1, 32760), + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 32768, 0 } }, + .stack_depth = 0, + }, + { + "BPF_LDX_MEM | 
BPF_DW, unaligned positive offset", + .u.insns_int = { + BPF_LD_IMM64(R2, 0x8182838485868788ULL), + BPF_STX_MEM(BPF_DW, R1, R2, 13), + BPF_LDX_MEM(BPF_DW, R0, R1, 13), + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL | FLAG_LARGE_MEM, + { }, + { { 32, 0 } }, + .stack_depth = 0, + }, /* BPF_STX_MEM B/H/W/DW */ { "BPF_STX_MEM | BPF_B", @@ -14094,6 +14365,9 @@ static void *generate_test_data(struct bpf_test *test, int sub) if (test->aux & FLAG_NO_DATA) return NULL; + if (test->aux & FLAG_LARGE_MEM) + return kmalloc(test->test[sub].data_size, GFP_KERNEL); + /* Test case expects an skb, so populate one. Various * subtests generate skbs of different sizes based on * the same data. @@ -14137,7 +14411,10 @@ static void release_test_data(const struct bpf_test *test, void *data) if (test->aux & FLAG_NO_DATA) return; - kfree_skb(data); + if (test->aux & FLAG_LARGE_MEM) + kfree(data); + else + kfree_skb(data); } static int filter_length(int which) @@ -14673,6 +14950,36 @@ static struct tail_call_test tail_call_tests[] = { }, .result = 10, }, + { + "Tail call load/store leaf", + .insns = { + BPF_ALU64_IMM(BPF_MOV, R1, 1), + BPF_ALU64_IMM(BPF_MOV, R2, 2), + BPF_ALU64_REG(BPF_MOV, R3, BPF_REG_FP), + BPF_STX_MEM(BPF_DW, R3, R1, -8), + BPF_STX_MEM(BPF_DW, R3, R2, -16), + BPF_LDX_MEM(BPF_DW, R0, BPF_REG_FP, -8), + BPF_JMP_REG(BPF_JNE, R0, R1, 3), + BPF_LDX_MEM(BPF_DW, R0, BPF_REG_FP, -16), + BPF_JMP_REG(BPF_JNE, R0, R2, 1), + BPF_ALU64_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + .result = 0, + .stack_depth = 32, + }, + { + "Tail call load/store", + .insns = { + BPF_ALU64_IMM(BPF_MOV, R0, 3), + BPF_STX_MEM(BPF_DW, BPF_REG_FP, R0, -8), + TAIL_CALL(-1), + BPF_ALU64_IMM(BPF_MOV, R0, -1), + BPF_EXIT_INSN(), + }, + .result = 0, + .stack_depth = 16, + }, { "Tail call error path, max count reached", .insns = { diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index fe98673dd5ac..bc4d5cd63a94 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -38,6 +38,7 @@ * @l4proto - Layer 4 protocol * Values: * IPPROTO_TCP, IPPROTO_UDP + * @dir: - connection tracking tuple direction. 
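+ *	  Values:
+ *	    IP_CT_DIR_ORIGINAL, IP_CT_DIR_REPLY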
* @reserved - Reserved member, will be reused for more options in future * Values: * 0 @@ -46,7 +47,8 @@ struct bpf_ct_opts { s32 netns_id; s32 error; u8 l4proto; - u8 reserved[3]; + u8 dir; + u8 reserved[2]; }; enum { @@ -56,10 +58,11 @@ enum { static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, struct bpf_sock_tuple *bpf_tuple, u32 tuple_len, u8 protonum, - s32 netns_id) + s32 netns_id, u8 *dir) { struct nf_conntrack_tuple_hash *hash; struct nf_conntrack_tuple tuple; + struct nf_conn *ct; if (unlikely(protonum != IPPROTO_TCP && protonum != IPPROTO_UDP)) return ERR_PTR(-EPROTO); @@ -99,7 +102,12 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, put_net(net); if (!hash) return ERR_PTR(-ENOENT); - return nf_ct_tuplehash_to_ctrack(hash); + + ct = nf_ct_tuplehash_to_ctrack(hash); + if (dir) + *dir = NF_CT_DIRECTION(hash); + + return ct; } __diag_push(); @@ -135,13 +143,13 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, if (!opts) return NULL; if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] || - opts->reserved[2] || opts__sz != NF_BPF_CT_OPTS_SZ) { + opts__sz != NF_BPF_CT_OPTS_SZ) { opts->error = -EINVAL; return NULL; } caller_net = dev_net(ctx->rxq->dev); nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts->l4proto, - opts->netns_id); + opts->netns_id, &opts->dir); if (IS_ERR(nfct)) { opts->error = PTR_ERR(nfct); return NULL; @@ -178,13 +186,13 @@ bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, if (!opts) return NULL; if (!bpf_tuple || opts->reserved[0] || opts->reserved[1] || - opts->reserved[2] || opts__sz != NF_BPF_CT_OPTS_SZ) { + opts__sz != NF_BPF_CT_OPTS_SZ) { opts->error = -EINVAL; return NULL; } caller_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk); nfct = __bpf_nf_ct_lookup(caller_net, bpf_tuple, tuple__sz, opts->l4proto, - opts->netns_id); + opts->netns_id, &opts->dir); if (IS_ERR(nfct)) { opts->error = PTR_ERR(nfct); return NULL; diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 38638845db9d..8fff5ad3444b 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -96,7 +96,6 @@ test_cgrp2_sock2-objs := test_cgrp2_sock2.o xdp1-objs := xdp1_user.o # reuse xdp1 source intentionally xdp2-objs := xdp1_user.o -xdp_router_ipv4-objs := xdp_router_ipv4_user.o test_current_task_under_cgroup-objs := $(CGROUP_HELPERS) \ test_current_task_under_cgroup_user.o trace_event-objs := trace_event_user.o $(TRACE_HELPERS) @@ -124,6 +123,7 @@ xdp_redirect_cpu-objs := xdp_redirect_cpu_user.o $(XDP_SAMPLE) xdp_redirect_map-objs := xdp_redirect_map_user.o $(XDP_SAMPLE) xdp_redirect-objs := xdp_redirect_user.o $(XDP_SAMPLE) xdp_monitor-objs := xdp_monitor_user.o $(XDP_SAMPLE) +xdp_router_ipv4-objs := xdp_router_ipv4_user.o $(XDP_SAMPLE) # Tell kbuild to always build the programs always-y := $(tprogs-y) @@ -153,7 +153,6 @@ always-y += parse_varlen.o parse_simple.o parse_ldabs.o always-y += test_cgrp2_tc_kern.o always-y += xdp1_kern.o always-y += xdp2_kern.o -always-y += xdp_router_ipv4_kern.o always-y += test_current_task_under_cgroup_kern.o always-y += trace_event_kern.o always-y += sampleip_kern.o @@ -220,6 +219,7 @@ TPROGLDLIBS_xdp_redirect += -lm TPROGLDLIBS_xdp_redirect_cpu += -lm TPROGLDLIBS_xdp_redirect_map += -lm TPROGLDLIBS_xdp_redirect_map_multi += -lm +TPROGLDLIBS_xdp_router_ipv4 += -lm -pthread TPROGLDLIBS_tracex4 += -lrt TPROGLDLIBS_trace_output += -lrt TPROGLDLIBS_map_perf_test += -lrt @@ -342,6 +342,7 @@ $(obj)/xdp_redirect_map_multi_user.o: 
$(obj)/xdp_redirect_map_multi.skel.h $(obj)/xdp_redirect_map_user.o: $(obj)/xdp_redirect_map.skel.h $(obj)/xdp_redirect_user.o: $(obj)/xdp_redirect.skel.h $(obj)/xdp_monitor_user.o: $(obj)/xdp_monitor.skel.h +$(obj)/xdp_router_ipv4_user.o: $(obj)/xdp_router_ipv4.skel.h $(obj)/tracex5_kern.o: $(obj)/syscall_nrs.h $(obj)/hbm_out_kern.o: $(src)/hbm.h $(src)/hbm_kern.h @@ -399,6 +400,7 @@ $(obj)/xdp_redirect_map_multi.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_redirect_map.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_redirect.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/xdp_monitor.bpf.o: $(obj)/xdp_sample.bpf.o +$(obj)/xdp_router_ipv4.bpf.o: $(obj)/xdp_sample.bpf.o $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/xdp_sample_shared.h @echo " CLANG-BPF " $@ @@ -409,7 +411,8 @@ $(obj)/%.bpf.o: $(src)/%.bpf.c $(obj)/vmlinux.h $(src)/xdp_sample.bpf.h $(src)/x -c $(filter %.bpf.c,$^) -o $@ LINKED_SKELS := xdp_redirect_cpu.skel.h xdp_redirect_map_multi.skel.h \ - xdp_redirect_map.skel.h xdp_redirect.skel.h xdp_monitor.skel.h + xdp_redirect_map.skel.h xdp_redirect.skel.h xdp_monitor.skel.h \ + xdp_router_ipv4.skel.h clean-files += $(LINKED_SKELS) xdp_redirect_cpu.skel.h-deps := xdp_redirect_cpu.bpf.o xdp_sample.bpf.o @@ -417,6 +420,7 @@ xdp_redirect_map_multi.skel.h-deps := xdp_redirect_map_multi.bpf.o xdp_sample.bp xdp_redirect_map.skel.h-deps := xdp_redirect_map.bpf.o xdp_sample.bpf.o xdp_redirect.skel.h-deps := xdp_redirect.bpf.o xdp_sample.bpf.o xdp_monitor.skel.h-deps := xdp_monitor.bpf.o xdp_sample.bpf.o +xdp_router_ipv4.skel.h-deps := xdp_router_ipv4.bpf.o xdp_sample.bpf.o LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.bpf.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) diff --git a/samples/bpf/syscall_tp_user.c b/samples/bpf/syscall_tp_user.c index a0ebf1833ed3..c55383068384 100644 --- a/samples/bpf/syscall_tp_user.c +++ b/samples/bpf/syscall_tp_user.c @@ -36,6 +36,9 @@ static void verify_map(int map_id) fprintf(stderr, "failed: map #%d returns value 0\n", map_id); return; } + + printf("verify map:%d val: %d\n", map_id, val); + val = 0; if (bpf_map_update_elem(map_id, &key, &val, BPF_ANY) != 0) { fprintf(stderr, "map_update failed: %s\n", strerror(errno)); diff --git a/samples/bpf/xdp_router_ipv4.bpf.c b/samples/bpf/xdp_router_ipv4.bpf.c new file mode 100644 index 000000000000..248119ca7938 --- /dev/null +++ b/samples/bpf/xdp_router_ipv4.bpf.c @@ -0,0 +1,180 @@ +/* Copyright (C) 2017 Cavium, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License + * as published by the Free Software Foundation. 
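+ *
+ * Sample XDP IPv4 router: packets are forwarded using an exact-match
+ * table, an LPM route table and an ARP table that user space keeps in
+ * sync with the kernel's own tables over netlink.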
+ */ + +#include "vmlinux.h" +#include "xdp_sample.bpf.h" +#include "xdp_sample_shared.h" + +#define ETH_ALEN 6 +#define ETH_P_8021Q 0x8100 +#define ETH_P_8021AD 0x88A8 + +struct trie_value { + __u8 prefix[4]; + __be64 value; + int ifindex; + int metric; + __be32 gw; +}; + +/* Key for lpm_trie */ +union key_4 { + u32 b32[2]; + u8 b8[8]; +}; + +struct arp_entry { + __be64 mac; + __be32 dst; +}; + +struct direct_map { + struct arp_entry arp; + int ifindex; + __be64 mac; +}; + +/* Map for trie implementation */ +struct { + __uint(type, BPF_MAP_TYPE_LPM_TRIE); + __uint(key_size, 8); + __uint(value_size, sizeof(struct trie_value)); + __uint(max_entries, 50); + __uint(map_flags, BPF_F_NO_PREALLOC); +} lpm_map SEC(".maps"); + +/* Map for ARP table */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __be32); + __type(value, __be64); + __uint(max_entries, 50); +} arp_table SEC(".maps"); + +/* Map to keep the exact match entries in the route table */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __type(key, __be32); + __type(value, struct direct_map); + __uint(max_entries, 50); +} exact_match SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_DEVMAP); + __uint(key_size, sizeof(int)); + __uint(value_size, sizeof(int)); + __uint(max_entries, 100); +} tx_port SEC(".maps"); + +SEC("xdp") +int xdp_router_ipv4_prog(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + struct ethhdr *eth = data; + u64 nh_off = sizeof(*eth); + struct datarec *rec; + __be16 h_proto; + u32 key = 0; + + rec = bpf_map_lookup_elem(&rx_cnt, &key); + if (rec) + NO_TEAR_INC(rec->processed); + + if (data + nh_off > data_end) + goto drop; + + h_proto = eth->h_proto; + if (h_proto == bpf_htons(ETH_P_8021Q) || + h_proto == bpf_htons(ETH_P_8021AD)) { + struct vlan_hdr *vhdr; + + vhdr = data + nh_off; + nh_off += sizeof(struct vlan_hdr); + if (data + nh_off > data_end) + goto drop; + + h_proto = vhdr->h_vlan_encapsulated_proto; + } + + switch (bpf_ntohs(h_proto)) { + case ETH_P_ARP: + if (rec) + NO_TEAR_INC(rec->xdp_pass); + return XDP_PASS; + case ETH_P_IP: { + struct iphdr *iph = data + nh_off; + struct direct_map *direct_entry; + __be64 *dest_mac, *src_mac; + int forward_to; + + if (iph + 1 > data_end) + goto drop; + + direct_entry = bpf_map_lookup_elem(&exact_match, &iph->daddr); + + /* Check for exact match, this would give a faster lookup */ + if (direct_entry && direct_entry->mac && + direct_entry->arp.mac) { + src_mac = &direct_entry->mac; + dest_mac = &direct_entry->arp.mac; + forward_to = direct_entry->ifindex; + } else { + struct trie_value *prefix_value; + union key_4 key4; + + /* Look up in the trie for lpm */ + key4.b32[0] = 32; + key4.b8[4] = iph->daddr & 0xff; + key4.b8[5] = (iph->daddr >> 8) & 0xff; + key4.b8[6] = (iph->daddr >> 16) & 0xff; + key4.b8[7] = (iph->daddr >> 24) & 0xff; + + prefix_value = bpf_map_lookup_elem(&lpm_map, &key4); + if (!prefix_value) + goto drop; + + forward_to = prefix_value->ifindex; + src_mac = &prefix_value->value; + if (!src_mac) + goto drop; + + dest_mac = bpf_map_lookup_elem(&arp_table, &iph->daddr); + if (!dest_mac) { + if (!prefix_value->gw) + goto drop; + + dest_mac = bpf_map_lookup_elem(&arp_table, + &prefix_value->gw); + } + } + + if (src_mac && dest_mac) { + int ret; + + __builtin_memcpy(eth->h_dest, dest_mac, ETH_ALEN); + __builtin_memcpy(eth->h_source, src_mac, ETH_ALEN); + + ret = bpf_redirect_map(&tx_port, forward_to, 0); + if (ret == XDP_REDIRECT) { + if (rec) + NO_TEAR_INC(rec->xdp_redirect); + return ret; 
+ } + } + } + default: + break; + } +drop: + if (rec) + NO_TEAR_INC(rec->xdp_drop); + + return XDP_DROP; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_router_ipv4_kern.c b/samples/bpf/xdp_router_ipv4_kern.c deleted file mode 100644 index b37ca2b13063..000000000000 --- a/samples/bpf/xdp_router_ipv4_kern.c +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (C) 2017 Cavium, Inc. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License - * as published by the Free Software Foundation. - */ -#define KBUILD_MODNAME "foo" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct trie_value { - __u8 prefix[4]; - __be64 value; - int ifindex; - int metric; - __be32 gw; -}; - -/* Key for lpm_trie*/ -union key_4 { - u32 b32[2]; - u8 b8[8]; -}; - -struct arp_entry { - __be64 mac; - __be32 dst; -}; - -struct direct_map { - struct arp_entry arp; - int ifindex; - __be64 mac; -}; - -/* Map for trie implementation*/ -struct { - __uint(type, BPF_MAP_TYPE_LPM_TRIE); - __uint(key_size, 8); - __uint(value_size, sizeof(struct trie_value)); - __uint(max_entries, 50); - __uint(map_flags, BPF_F_NO_PREALLOC); -} lpm_map SEC(".maps"); - -/* Map for counter*/ -struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __type(key, u32); - __type(value, u64); - __uint(max_entries, 256); -} rxcnt SEC(".maps"); - -/* Map for ARP table*/ -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, __be32); - __type(value, __be64); - __uint(max_entries, 50); -} arp_table SEC(".maps"); - -/* Map to keep the exact match entries in the route table*/ -struct { - __uint(type, BPF_MAP_TYPE_HASH); - __type(key, __be32); - __type(value, struct direct_map); - __uint(max_entries, 50); -} exact_match SEC(".maps"); - -struct { - __uint(type, BPF_MAP_TYPE_DEVMAP); - __uint(key_size, sizeof(int)); - __uint(value_size, sizeof(int)); - __uint(max_entries, 100); -} tx_port SEC(".maps"); - -/* Function to set source and destination mac of the packet */ -static inline void set_src_dst_mac(void *data, void *src, void *dst) -{ - unsigned short *source = src; - unsigned short *dest = dst; - unsigned short *p = data; - - __builtin_memcpy(p, dest, 6); - __builtin_memcpy(p + 3, source, 6); -} - -/* Parse IPV4 packet to get SRC, DST IP and protocol */ -static inline int parse_ipv4(void *data, u64 nh_off, void *data_end, - __be32 *src, __be32 *dest) -{ - struct iphdr *iph = data + nh_off; - - if (iph + 1 > data_end) - return 0; - *src = iph->saddr; - *dest = iph->daddr; - return iph->protocol; -} - -SEC("xdp_router_ipv4") -int xdp_router_ipv4_prog(struct xdp_md *ctx) -{ - void *data_end = (void *)(long)ctx->data_end; - __be64 *dest_mac = NULL, *src_mac = NULL; - void *data = (void *)(long)ctx->data; - struct trie_value *prefix_value; - int rc = XDP_DROP, forward_to; - struct ethhdr *eth = data; - union key_4 key4; - long *value; - u16 h_proto; - u32 ipproto; - u64 nh_off; - - nh_off = sizeof(*eth); - if (data + nh_off > data_end) - return rc; - - h_proto = eth->h_proto; - - if (h_proto == htons(ETH_P_8021Q) || h_proto == htons(ETH_P_8021AD)) { - struct vlan_hdr *vhdr; - - vhdr = data + nh_off; - nh_off += sizeof(struct vlan_hdr); - if (data + nh_off > data_end) - return rc; - h_proto = vhdr->h_vlan_encapsulated_proto; - } - if (h_proto == htons(ETH_P_ARP)) { - return XDP_PASS; - } else if (h_proto == htons(ETH_P_IP)) { - struct direct_map *direct_entry; - __be32 src_ip = 0, dest_ip = 0; - - 
ipproto = parse_ipv4(data, nh_off, data_end, &src_ip, &dest_ip); - direct_entry = bpf_map_lookup_elem(&exact_match, &dest_ip); - /* Check for exact match, this would give a faster lookup*/ - if (direct_entry && direct_entry->mac && direct_entry->arp.mac) { - src_mac = &direct_entry->mac; - dest_mac = &direct_entry->arp.mac; - forward_to = direct_entry->ifindex; - } else { - /* Look up in the trie for lpm*/ - key4.b32[0] = 32; - key4.b8[4] = dest_ip & 0xff; - key4.b8[5] = (dest_ip >> 8) & 0xff; - key4.b8[6] = (dest_ip >> 16) & 0xff; - key4.b8[7] = (dest_ip >> 24) & 0xff; - prefix_value = bpf_map_lookup_elem(&lpm_map, &key4); - if (!prefix_value) - return XDP_DROP; - src_mac = &prefix_value->value; - if (!src_mac) - return XDP_DROP; - dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); - if (!dest_mac) { - if (!prefix_value->gw) - return XDP_DROP; - dest_ip = prefix_value->gw; - dest_mac = bpf_map_lookup_elem(&arp_table, &dest_ip); - } - forward_to = prefix_value->ifindex; - } - } else { - ipproto = 0; - } - if (src_mac && dest_mac) { - set_src_dst_mac(data, src_mac, dest_mac); - value = bpf_map_lookup_elem(&rxcnt, &ipproto); - if (value) - *value += 1; - return bpf_redirect_map(&tx_port, forward_to, 0); - } - return rc; -} - -char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/xdp_router_ipv4_user.c b/samples/bpf/xdp_router_ipv4_user.c index 6dae87d83e1c..f32bbd5c32bf 100644 --- a/samples/bpf/xdp_router_ipv4_user.c +++ b/samples/bpf/xdp_router_ipv4_user.c @@ -24,70 +24,40 @@ #include #include #include +#include +#include +#include "xdp_sample_user.h" +#include "xdp_router_ipv4.skel.h" -int sock, sock_arp, flags = XDP_FLAGS_UPDATE_IF_NOEXIST; -static int total_ifindex; -static int *ifindex_list; -static __u32 *prog_id_list; -char buf[8192]; +static const char *__doc__ = +"XDP IPv4 router implementation\n" +"Usage: xdp_router_ipv4 ... 
\n"; + +static char buf[8192]; static int lpm_map_fd; -static int rxcnt_map_fd; static int arp_table_map_fd; static int exact_match_map_fd; static int tx_port_map_fd; +static bool routes_thread_exit; +static int interval = 5; + +static int mask = SAMPLE_RX_CNT | SAMPLE_REDIRECT_ERR_MAP_CNT | + SAMPLE_DEVMAP_XMIT_CNT_MULTI | SAMPLE_EXCEPTION_CNT; + +DEFINE_SAMPLE_INIT(xdp_router_ipv4); + +static const struct option long_options[] = { + { "help", no_argument, NULL, 'h' }, + { "skb-mode", no_argument, NULL, 'S' }, + { "force", no_argument, NULL, 'F' }, + { "interval", required_argument, NULL, 'i' }, + { "verbose", no_argument, NULL, 'v' }, + { "stats", no_argument, NULL, 's' }, + {} +}; + static int get_route_table(int rtm_family); -static void int_exit(int sig) -{ - __u32 prog_id = 0; - int i = 0; - - for (i = 0; i < total_ifindex; i++) { - if (bpf_xdp_query_id(ifindex_list[i], flags, &prog_id)) { - printf("bpf_xdp_query_id on iface %d failed\n", - ifindex_list[i]); - exit(1); - } - if (prog_id_list[i] == prog_id) - bpf_xdp_detach(ifindex_list[i], flags, NULL); - else if (!prog_id) - printf("couldn't find a prog id on iface %d\n", - ifindex_list[i]); - else - printf("program on iface %d changed, not removing\n", - ifindex_list[i]); - prog_id = 0; - } - exit(0); -} - -static void close_and_exit(int sig) -{ - close(sock); - close(sock_arp); - - int_exit(0); -} - -/* Get the mac address of the interface given interface name */ -static __be64 getmac(char *iface) -{ - struct ifreq ifr; - __be64 mac = 0; - int fd, i; - - fd = socket(AF_INET, SOCK_DGRAM, 0); - ifr.ifr_addr.sa_family = AF_INET; - strncpy(ifr.ifr_name, iface, IFNAMSIZ - 1); - if (ioctl(fd, SIOCGIFHWADDR, &ifr) < 0) { - printf("ioctl failed leaving....\n"); - return -1; - } - for (i = 0; i < 6 ; i++) - *((__u8 *)&mac + i) = (__u8)ifr.ifr_hwaddr.sa_data[i]; - close(fd); - return mac; -} static int recv_msg(struct sockaddr_nl sock_addr, int sock) { @@ -130,7 +100,6 @@ static void read_route(struct nlmsghdr *nh, int nll) int i; struct route_table { int dst_len, iface, metric; - char *iface_name; __be32 dst, gw; __be64 mac; } route; @@ -145,17 +114,7 @@ static void read_route(struct nlmsghdr *nh, int nll) __be64 mac; } direct_entry; - if (nh->nlmsg_type == RTM_DELROUTE) - printf("DELETING Route entry\n"); - else if (nh->nlmsg_type == RTM_GETROUTE) - printf("READING Route entry\n"); - else if (nh->nlmsg_type == RTM_NEWROUTE) - printf("NEW Route entry\n"); - else - printf("%d\n", nh->nlmsg_type); - memset(&route, 0, sizeof(route)); - printf("Destination Gateway Genmask Metric Iface\n"); for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { rt_msg = (struct rtmsg *)NLMSG_DATA(nh); rtm_family = rt_msg->rtm_family; @@ -192,11 +151,7 @@ static void read_route(struct nlmsghdr *nh, int nll) route.gw = atoi(gws); route.iface = atoi(ifs); route.metric = atoi(metrics); - route.iface_name = alloca(sizeof(char *) * IFNAMSIZ); - route.iface_name = if_indextoname(route.iface, route.iface_name); - route.mac = getmac(route.iface_name); - if (route.mac == -1) - int_exit(0); + assert(get_mac_addr(route.iface, &route.mac) == 0); assert(bpf_map_update_elem(tx_port_map_fd, &route.iface, &route.iface, 0) == 0); if (rtm_family == AF_INET) { @@ -207,7 +162,6 @@ static void read_route(struct nlmsghdr *nh, int nll) int metric; __be32 gw; } *prefix_value; - struct in_addr dst_addr, gw_addr, mask_addr; prefix_key = alloca(sizeof(*prefix_key) + 3); prefix_value = alloca(sizeof(*prefix_value)); @@ -235,17 +189,6 @@ static void read_route(struct nlmsghdr *nh, int nll) 
for (i = 0; i < 4; i++) prefix_key->data[i] = (route.dst >> i * 8) & 0xff; - dst_addr.s_addr = route.dst; - printf("%-16s", inet_ntoa(dst_addr)); - - gw_addr.s_addr = route.gw; - printf("%-16s", inet_ntoa(gw_addr)); - - mask_addr.s_addr = htonl(~(0xffffffffU >> route.dst_len)); - printf("%-16s%-7d%s\n", inet_ntoa(mask_addr), - route.metric, - route.iface_name); - if (bpf_map_lookup_elem(lpm_map_fd, prefix_key, prefix_value) < 0) { for (i = 0; i < 4; i++) @@ -261,13 +204,6 @@ static void read_route(struct nlmsghdr *nh, int nll) ) == 0); } else { if (nh->nlmsg_type == RTM_DELROUTE) { - printf("deleting entry\n"); - printf("prefix key=%d.%d.%d.%d/%d", - prefix_key->data[0], - prefix_key->data[1], - prefix_key->data[2], - prefix_key->data[3], - prefix_key->prefixlen); assert(bpf_map_delete_elem(lpm_map_fd, prefix_key ) == 0); @@ -331,14 +267,14 @@ static int get_route_table(int rtm_family) sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + return -errno; } memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + ret = -errno; goto cleanup; } memset(&req, 0, sizeof(req)); @@ -357,15 +293,15 @@ static int get_route_table(int rtm_family) msg.msg_iovlen = 1; ret = sendmsg(sock, &msg, 0); if (ret < 0) { - printf("send to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "send to netlink: %s\n", strerror(errno)); + ret = -errno; goto cleanup; } memset(buf, 0, sizeof(buf)); nll = recv_msg(sa, sock); if (nll < 0) { - printf("recv from netlink: %s\n", strerror(nll)); - ret = -1; + fprintf(stderr, "recv from netlink: %s\n", strerror(nll)); + ret = nll; goto cleanup; } nh = (struct nlmsghdr *)buf; @@ -395,14 +331,7 @@ static void read_arp(struct nlmsghdr *nh, int nll) __be64 mac; } direct_entry; - if (nh->nlmsg_type == RTM_GETNEIGH) - printf("READING arp entry\n"); - printf("Address HwAddress\n"); for (; NLMSG_OK(nh, nll); nh = NLMSG_NEXT(nh, nll)) { - struct in_addr dst_addr; - char mac_str[18]; - int len = 0, i; - rt_msg = (struct ndmsg *)NLMSG_DATA(nh); rt_attr = (struct rtattr *)RTM_RTA(rt_msg); ndm_family = rt_msg->ndm_family; @@ -424,13 +353,6 @@ static void read_arp(struct nlmsghdr *nh, int nll) arp_entry.dst = atoi(dsts); arp_entry.mac = atol(mac); - dst_addr.s_addr = arp_entry.dst; - for (i = 0; i < 6; i++) - len += snprintf(mac_str + len, 18 - len, "%02llx%s", - ((arp_entry.mac >> i * 8) & 0xff), - i < 5 ? 
":" : ""); - printf("%-16s%s\n", inet_ntoa(dst_addr), mac_str); - if (ndm_family == AF_INET) { if (bpf_map_lookup_elem(exact_match_map_fd, &arp_entry.dst, @@ -481,14 +403,14 @@ static int get_arp_table(int rtm_family) sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + return -errno; } memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + ret = -errno; goto cleanup; } memset(&req, 0, sizeof(req)); @@ -506,15 +428,15 @@ static int get_arp_table(int rtm_family) msg.msg_iovlen = 1; ret = sendmsg(sock, &msg, 0); if (ret < 0) { - printf("send to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "send to netlink: %s\n", strerror(errno)); + ret = -errno; goto cleanup; } memset(buf, 0, sizeof(buf)); nll = recv_msg(sa, sock); if (nll < 0) { - printf("recv from netlink: %s\n", strerror(nll)); - ret = -1; + fprintf(stderr, "recv from netlink: %s\n", strerror(nll)); + ret = nll; goto cleanup; } nh = (struct nlmsghdr *)buf; @@ -527,24 +449,17 @@ cleanup: /* Function to keep track and update changes in route and arp table * Give regular statistics of packets forwarded */ -static int monitor_route(void) +static void *monitor_routes_thread(void *arg) { - unsigned int nr_cpus = bpf_num_possible_cpus(); - const unsigned int nr_keys = 256; struct pollfd fds_route, fds_arp; - __u64 prev[nr_keys][nr_cpus]; struct sockaddr_nl la, lr; - __u64 values[nr_cpus]; + int sock, sock_arp, nll; struct nlmsghdr *nh; - int nll, ret = 0; - int interval = 5; - __u32 key; - int i; sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + return NULL; } fcntl(sock, F_SETFL, O_NONBLOCK); @@ -552,17 +467,19 @@ static int monitor_route(void) lr.nl_family = AF_NETLINK; lr.nl_groups = RTMGRP_IPV6_ROUTE | RTMGRP_IPV4_ROUTE | RTMGRP_NOTIFY; if (bind(sock, (struct sockaddr *)&lr, sizeof(lr)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - ret = -1; - goto cleanup; + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); + close(sock); + return NULL; } + fds_route.fd = sock; fds_route.events = POLL_IN; sock_arp = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); if (sock_arp < 0) { - printf("open netlink socket: %s\n", strerror(errno)); - return -1; + fprintf(stderr, "open netlink socket: %s\n", strerror(errno)); + close(sock); + return NULL; } fcntl(sock_arp, F_SETFL, O_NONBLOCK); @@ -570,51 +487,44 @@ static int monitor_route(void) la.nl_family = AF_NETLINK; la.nl_groups = RTMGRP_NEIGH | RTMGRP_NOTIFY; if (bind(sock_arp, (struct sockaddr *)&la, sizeof(la)) < 0) { - printf("bind to netlink: %s\n", strerror(errno)); - ret = -1; + fprintf(stderr, "bind netlink socket: %s\n", strerror(errno)); goto cleanup; } + fds_arp.fd = sock_arp; fds_arp.events = POLL_IN; - memset(prev, 0, sizeof(prev)); - do { - signal(SIGINT, close_and_exit); - signal(SIGTERM, close_and_exit); + /* dump route and arp tables */ + if (get_arp_table(AF_INET) < 0) { + fprintf(stderr, "Failed reading arp table\n"); + goto cleanup; + } - sleep(interval); - for (key = 0; key < nr_keys; key++) { - __u64 sum = 0; - - assert(bpf_map_lookup_elem(rxcnt_map_fd, - &key, 
values) == 0); - for (i = 0; i < nr_cpus; i++) - sum += (values[i] - prev[key][i]); - if (sum) - printf("proto %u: %10llu pkt/s\n", - key, sum / interval); - memcpy(prev[key], values, sizeof(values)); - } + if (get_route_table(AF_INET) < 0) { + fprintf(stderr, "Failed reading route table\n"); + goto cleanup; + } + while (!routes_thread_exit) { memset(buf, 0, sizeof(buf)); if (poll(&fds_route, 1, 3) == POLL_IN) { nll = recv_msg(lr, sock); if (nll < 0) { - printf("recv from netlink: %s\n", strerror(nll)); - ret = -1; + fprintf(stderr, "recv from netlink: %s\n", + strerror(nll)); goto cleanup; } nh = (struct nlmsghdr *)buf; - printf("Routing table updated.\n"); read_route(nh, nll); } + memset(buf, 0, sizeof(buf)); if (poll(&fds_arp, 1, 3) == POLL_IN) { nll = recv_msg(la, sock_arp); if (nll < 0) { - printf("recv from netlink: %s\n", strerror(nll)); - ret = -1; + fprintf(stderr, "recv from netlink: %s\n", + strerror(nll)); goto cleanup; } @@ -622,132 +532,169 @@ static int monitor_route(void) read_arp(nh, nll); } - } while (1); + sleep(interval); + } + cleanup: + close(sock_arp); close(sock); - return ret; + return NULL; } -static void usage(const char *prog) +static void usage(char *argv[], const struct option *long_options, + const char *doc, int mask, bool error, + struct bpf_object *obj) { - fprintf(stderr, - "%s: %s [OPTS] interface name list\n\n" - "OPTS:\n" - " -S use skb-mode\n" - " -F force loading prog\n", - __func__, prog); + sample_usage(argv, long_options, doc, mask, error); } -int main(int ac, char **argv) +int main(int argc, char **argv) { - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - const char *optstr = "SF"; - struct bpf_program *prog; - struct bpf_object *obj; - char filename[256]; - char **ifname_list; - int prog_fd, opt; - int err, i = 1; + bool error = true, generic = false, force = false; + int opt, ret = EXIT_FAIL_BPF; + struct xdp_router_ipv4 *skel; + int i, total_ifindex = argc - 1; + char **ifname_list = argv + 1; + pthread_t routes_thread; + int longindex = 0; - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (libbpf_set_strict_mode(LIBBPF_STRICT_ALL) < 0) { + fprintf(stderr, "Failed to set libbpf strict mode: %s\n", + strerror(errno)); + goto end; + } - total_ifindex = ac - 1; - ifname_list = (argv + 1); + skel = xdp_router_ipv4__open(); + if (!skel) { + fprintf(stderr, "Failed to xdp_router_ipv4__open: %s\n", + strerror(errno)); + goto end; + } - while ((opt = getopt(ac, argv, optstr)) != -1) { + ret = sample_init_pre_load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to sample_init_pre_load: %s\n", + strerror(-ret)); + ret = EXIT_FAIL_BPF; + goto end_destroy; + } + + ret = xdp_router_ipv4__load(skel); + if (ret < 0) { + fprintf(stderr, "Failed to xdp_router_ipv4__load: %s\n", + strerror(errno)); + goto end_destroy; + } + + ret = sample_init(skel, mask); + if (ret < 0) { + fprintf(stderr, "Failed to initialize sample: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + + while ((opt = getopt_long(argc, argv, "si:SFvh", + long_options, &longindex)) != -1) { switch (opt) { + case 's': + mask |= SAMPLE_REDIRECT_MAP_CNT; + total_ifindex--; + ifname_list++; + break; + case 'i': + interval = strtoul(optarg, NULL, 0); + total_ifindex -= 2; + ifname_list += 2; + break; case 'S': - flags |= XDP_FLAGS_SKB_MODE; + generic = true; total_ifindex--; ifname_list++; break; case 'F': - flags &= ~XDP_FLAGS_UPDATE_IF_NOEXIST; + force = true; total_ifindex--; ifname_list++; break; + case 'v': + sample_switch_mode(); + 
total_ifindex--;
+			ifname_list++;
+			break;
+		case 'h':
+			error = false;
 		default:
-			usage(basename(argv[0]));
-			return 1;
+			usage(argv, long_options, __doc__, mask, error, skel->obj);
+			goto end_destroy;
 		}
 	}
 
-	if (!(flags & XDP_FLAGS_SKB_MODE))
-		flags |= XDP_FLAGS_DRV_MODE;
-
-	if (optind == ac) {
-		usage(basename(argv[0]));
-		return 1;
+	ret = EXIT_FAIL_OPTION;
+	if (optind == argc) {
+		usage(argv, long_options, __doc__, mask, true, skel->obj);
+		goto end_destroy;
 	}
 
-	obj = bpf_object__open_file(filename, NULL);
-	if (libbpf_get_error(obj))
-		return 1;
-
-	prog = bpf_object__next_program(obj, NULL);
-	bpf_program__set_type(prog, BPF_PROG_TYPE_XDP);
-
-	printf("\n******************loading bpf file*********************\n");
-	err = bpf_object__load(obj);
-	if (err) {
-		printf("bpf_object__load(): %s\n", strerror(errno));
-		return 1;
+	lpm_map_fd = bpf_map__fd(skel->maps.lpm_map);
+	if (lpm_map_fd < 0) {
+		fprintf(stderr, "Failed loading lpm_map %s\n",
+			strerror(-lpm_map_fd));
+		goto end_destroy;
 	}
-	prog_fd = bpf_program__fd(prog);
-
-	lpm_map_fd = bpf_object__find_map_fd_by_name(obj, "lpm_map");
-	rxcnt_map_fd = bpf_object__find_map_fd_by_name(obj, "rxcnt");
-	arp_table_map_fd = bpf_object__find_map_fd_by_name(obj, "arp_table");
-	exact_match_map_fd = bpf_object__find_map_fd_by_name(obj,
-							     "exact_match");
-	tx_port_map_fd = bpf_object__find_map_fd_by_name(obj, "tx_port");
-	if (lpm_map_fd < 0 || rxcnt_map_fd < 0 || arp_table_map_fd < 0 ||
-	    exact_match_map_fd < 0 || tx_port_map_fd < 0) {
-		printf("bpf_object__find_map_fd_by_name failed\n");
-		return 1;
+	arp_table_map_fd = bpf_map__fd(skel->maps.arp_table);
+	if (arp_table_map_fd < 0) {
+		fprintf(stderr, "Failed loading arp_table_map_fd %s\n",
+			strerror(-arp_table_map_fd));
+		goto end_destroy;
+	}
+	exact_match_map_fd = bpf_map__fd(skel->maps.exact_match);
+	if (exact_match_map_fd < 0) {
+		fprintf(stderr, "Failed loading exact_match_map_fd %s\n",
+			strerror(-exact_match_map_fd));
+		goto end_destroy;
+	}
+	tx_port_map_fd = bpf_map__fd(skel->maps.tx_port);
+	if (tx_port_map_fd < 0) {
+		fprintf(stderr, "Failed loading tx_port_map_fd %s\n",
+			strerror(-tx_port_map_fd));
+		goto end_destroy;
 	}
 
-	ifindex_list = (int *)calloc(total_ifindex, sizeof(int *));
+	ret = EXIT_FAIL_XDP;
 	for (i = 0; i < total_ifindex; i++) {
-		ifindex_list[i] = if_nametoindex(ifname_list[i]);
-		if (!ifindex_list[i]) {
-			printf("Couldn't translate interface name: %s",
-			       strerror(errno));
-			return 1;
-		}
-	}
-	prog_id_list = (__u32 *)calloc(total_ifindex, sizeof(__u32 *));
-	for (i = 0; i < total_ifindex; i++) {
-		if (bpf_xdp_attach(ifindex_list[i], prog_fd, flags, NULL) < 0) {
-			printf("link set xdp fd failed\n");
-			int recovery_index = i;
+		int index = if_nametoindex(ifname_list[i]);
 
-			for (i = 0; i < recovery_index; i++)
-				bpf_xdp_detach(ifindex_list[i], flags, NULL);
-
-			return 1;
+		if (!index) {
+			fprintf(stderr, "Interface %s not found: %s\n",
+				ifname_list[i], strerror(errno));
+			goto end_destroy;
 		}
-		err = bpf_obj_get_info_by_fd(prog_fd, &info, &info_len);
-		if (err) {
-			printf("can't get prog info - %s\n", strerror(errno));
-			return err;
-		}
-		prog_id_list[i] = info.id;
-		memset(&info, 0, sizeof(info));
-		printf("Attached to %d\n", ifindex_list[i]);
-	}
-	signal(SIGINT, int_exit);
-	signal(SIGTERM, int_exit);
-
-	printf("\n*******************ROUTE TABLE*************************\n");
-	get_route_table(AF_INET);
-	printf("\n*******************ARP TABLE***************************\n");
-	get_arp_table(AF_INET);
-	if (monitor_route() < 0) {
-		printf("Error in 
receiving route update"); - return 1; + if (sample_install_xdp(skel->progs.xdp_router_ipv4_prog, + index, generic, force) < 0) + goto end_destroy; } - return 0; + ret = pthread_create(&routes_thread, NULL, monitor_routes_thread, NULL); + if (ret) { + fprintf(stderr, "Failed creating routes_thread: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_destroy; + } + + ret = sample_run(interval, NULL, NULL); + routes_thread_exit = true; + + if (ret < 0) { + fprintf(stderr, "Failed during sample run: %s\n", strerror(-ret)); + ret = EXIT_FAIL; + goto end_thread_wait; + } + ret = EXIT_OK; + +end_thread_wait: + pthread_join(routes_thread, NULL); +end_destroy: + xdp_router_ipv4__destroy(skel); +end: + sample_exit(ret); } diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 290998c82de1..f041c4a6a1f2 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -567,7 +567,7 @@ probe_prog_type(enum bpf_prog_type prog_type, bool *supported_types, res = probe_prog_type_ifindex(prog_type, ifindex); } else { - res = libbpf_probe_bpf_prog_type(prog_type, NULL); + res = libbpf_probe_bpf_prog_type(prog_type, NULL) > 0; } #ifdef USE_LIBCAP diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 97dec81950e5..8fb0116f9136 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -20,6 +20,9 @@ static const char * const link_type_name[] = { [BPF_LINK_TYPE_CGROUP] = "cgroup", [BPF_LINK_TYPE_ITER] = "iter", [BPF_LINK_TYPE_NETNS] = "netns", + [BPF_LINK_TYPE_XDP] = "xdp", + [BPF_LINK_TYPE_PERF_EVENT] = "perf_event", + [BPF_LINK_TYPE_KPROBE_MULTI] = "kprobe_multi", }; static struct hashmap *link_table; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index bc4e05542c2b..8643b37d4e43 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -68,6 +68,7 @@ const char * const prog_type_name[] = { [BPF_PROG_TYPE_EXT] = "ext", [BPF_PROG_TYPE_LSM] = "lsm", [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", + [BPF_PROG_TYPE_SYSCALL] = "syscall", }; const size_t prog_type_name_size = ARRAY_SIZE(prog_type_name); diff --git a/tools/include/uapi/linux/btf.h b/tools/include/uapi/linux/btf.h index b0d8fea1951d..a9162a6c0284 100644 --- a/tools/include/uapi/linux/btf.h +++ b/tools/include/uapi/linux/btf.h @@ -33,8 +33,8 @@ struct btf_type { /* "info" bits arrangement * bits 0-15: vlen (e.g. # of struct's members) * bits 16-23: unused - * bits 24-27: kind (e.g. int, ptr, array...etc) - * bits 28-30: unused + * bits 24-28: kind (e.g. 
int, ptr, array...etc) + * bits 29-30: unused * bit 31: kind_flag, currently used by * struct, union and fwd */ diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build index 94f0a146bb7b..31a1a9015902 100644 --- a/tools/lib/bpf/Build +++ b/tools/lib/bpf/Build @@ -1,3 +1,4 @@ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ - btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o + btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o \ + usdt.o diff --git a/tools/lib/bpf/Makefile b/tools/lib/bpf/Makefile index 064c89e31560..64741c55b8e3 100644 --- a/tools/lib/bpf/Makefile +++ b/tools/lib/bpf/Makefile @@ -239,7 +239,7 @@ install_lib: all_cmd SRC_HDRS := bpf.h libbpf.h btf.h libbpf_common.h libbpf_legacy.h xsk.h \ bpf_helpers.h bpf_tracing.h bpf_endian.h bpf_core_read.h \ - skel_internal.h libbpf_version.h + skel_internal.h libbpf_version.h usdt.bpf.h GEN_HDRS := $(BPF_GENERATED) INSTALL_PFX := $(DESTDIR)$(prefix)/include/bpf diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 1383e26c5d1f..d124e9e533f0 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -2826,10 +2826,8 @@ struct btf_ext *btf_ext__new(const __u8 *data, __u32 size) if (err) goto done; - if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, core_relo_len)) { - err = -EINVAL; - goto done; - } + if (btf_ext->hdr->hdr_len < offsetofend(struct btf_ext_header, core_relo_len)) + goto done; /* skip core relos parsing */ err = btf_ext_setup_core_relos(btf_ext); if (err) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 809fe209cdcc..465b7c0996f1 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -483,6 +483,8 @@ struct elf_state { int st_ops_shndx; }; +struct usdt_manager; + struct bpf_object { char name[BPF_OBJ_NAME_LEN]; char license[64]; @@ -545,6 +547,8 @@ struct bpf_object { size_t fd_array_cap; size_t fd_array_cnt; + struct usdt_manager *usdt_man; + char path[]; }; @@ -1397,8 +1401,11 @@ static int find_elf_var_offset(const struct bpf_object *obj, const char *name, _ for (si = 0; si < symbols->d_size / sizeof(Elf64_Sym); si++) { Elf64_Sym *sym = elf_sym_by_idx(obj, si); - if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL || - ELF64_ST_TYPE(sym->st_info) != STT_OBJECT) + if (ELF64_ST_TYPE(sym->st_info) != STT_OBJECT) + continue; + + if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL && + ELF64_ST_BIND(sym->st_info) != STB_WEAK) continue; sname = elf_sym_str(obj, sym->st_name); @@ -4678,6 +4685,18 @@ static int probe_perf_link(void) return link_fd < 0 && err == -EBADF; } +static int probe_kern_bpf_cookie(void) +{ + struct bpf_insn insns[] = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie), + BPF_EXIT_INSN(), + }; + int ret, insn_cnt = ARRAY_SIZE(insns); + + ret = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", insns, insn_cnt, NULL); + return probe_fd(ret); +} + enum kern_feature_result { FEAT_UNKNOWN = 0, FEAT_SUPPORTED = 1, @@ -4740,6 +4759,9 @@ static struct kern_feature_desc { [FEAT_MEMCG_ACCOUNT] = { "memcg-based memory accounting", probe_memcg_account, }, + [FEAT_BPF_COOKIE] = { + "BPF cookie support", probe_kern_bpf_cookie, + }, }; bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) @@ -5665,10 +5687,17 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) insn_idx = rec->insn_off / BPF_INSN_SZ; prog = find_prog_by_sec_insn(obj, sec_idx, insn_idx); if (!prog) { - pr_warn("sec '%s': failed to 
find program at insn #%d for CO-RE offset relocation #%d\n", - sec_name, insn_idx, i); - err = -EINVAL; - goto out; + /* When __weak subprog is "overridden" by another instance + * of the subprog from a different object file, linker still + * appends all the .BTF.ext info that used to belong to that + * eliminated subprogram. + * This is similar to what x86-64 linker does for relocations. + * So just ignore such relocations just like we ignore + * subprog instructions when discovering subprograms. + */ + pr_debug("sec '%s': skipping CO-RE relocation #%d for insn #%d belonging to eliminated weak subprogram\n", + sec_name, i, insn_idx); + continue; } /* no need to apply CO-RE relocation if the program is * not going to be loaded @@ -8200,6 +8229,9 @@ void bpf_object__close(struct bpf_object *obj) if (obj->clear_priv) obj->clear_priv(obj, obj->priv); + usdt_manager_free(obj->usdt_man); + obj->usdt_man = NULL; + bpf_gen__free(obj->gen_loader); bpf_object__elf_finish(obj); bpf_object_unload(obj); @@ -8630,6 +8662,8 @@ int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log } static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link); @@ -8642,11 +8676,12 @@ static const struct bpf_sec_def section_defs[] = { SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE | SEC_SLOPPY_PFX), SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), SEC_DEF("kprobe/", KPROBE, 0, SEC_NONE, attach_kprobe), - SEC_DEF("uprobe/", KPROBE, 0, SEC_NONE), + SEC_DEF("uprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), SEC_DEF("kretprobe/", KPROBE, 0, SEC_NONE, attach_kprobe), - SEC_DEF("uretprobe/", KPROBE, 0, SEC_NONE), + SEC_DEF("uretprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), SEC_DEF("kprobe.multi/", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), SEC_DEF("kretprobe.multi/", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt), SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE | SEC_SLOPPY_PFX | SEC_DEPRECATED), SEC_DEF("action", SCHED_ACT, 0, SEC_NONE | SEC_SLOPPY_PFX), @@ -9692,14 +9727,6 @@ int bpf_prog_load_deprecated(const char *file, enum bpf_prog_type type, return bpf_prog_load_xattr2(&attr, pobj, prog_fd); } -struct bpf_link { - int (*detach)(struct bpf_link *link); - void (*dealloc)(struct bpf_link *link); - char *pin_path; /* NULL, if not pinned */ - int fd; /* hook FD, -1 if not applicable */ - bool disconnected; -}; - /* Replace link's underlying BPF program with the new one */ int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) { @@ -10517,6 +10544,273 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, return pfd; } +/* uprobes deal in relative offsets; subtract the base address associated with + * the mapped binary. See Documentation/trace/uprobetracer.rst for more + * details. 
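+ *
+ * E.g., for an address addr that falls inside an executable PT_LOAD segment,
+ * the file-relative offset works out to addr - seg_start + seg_offset
+ * (i.e., addr - p_vaddr + p_offset), which is what the segment walk below
+ * computes.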
+ */
+static long elf_find_relative_offset(const char *filename, Elf *elf, long addr)
+{
+	size_t n;
+	int i;
+
+	if (elf_getphdrnum(elf, &n)) {
+		pr_warn("elf: failed to find program headers for '%s': %s\n", filename,
+			elf_errmsg(-1));
+		return -ENOENT;
+	}
+
+	for (i = 0; i < n; i++) {
+		int seg_start, seg_end, seg_offset;
+		GElf_Phdr phdr;
+
+		if (!gelf_getphdr(elf, i, &phdr)) {
+			pr_warn("elf: failed to get program header %d from '%s': %s\n", i, filename,
+				elf_errmsg(-1));
+			return -ENOENT;
+		}
+		if (phdr.p_type != PT_LOAD || !(phdr.p_flags & PF_X))
+			continue;
+
+		seg_start = phdr.p_vaddr;
+		seg_end = seg_start + phdr.p_memsz;
+		seg_offset = phdr.p_offset;
+		if (addr >= seg_start && addr < seg_end)
+			return addr - seg_start + seg_offset;
+	}
+	pr_warn("elf: failed to find prog header containing 0x%lx in '%s'\n", addr, filename);
+	return -ENOENT;
+}
+
+/* Return next ELF section of sh_type after scn, or first of that type if scn is NULL. */
+static Elf_Scn *elf_find_next_scn_by_type(Elf *elf, int sh_type, Elf_Scn *scn)
+{
+	while ((scn = elf_nextscn(elf, scn)) != NULL) {
+		GElf_Shdr sh;
+
+		if (!gelf_getshdr(scn, &sh))
+			continue;
+		if (sh.sh_type == sh_type)
+			return scn;
+	}
+	return NULL;
+}
+
+/* Find offset of function name in object specified by path. "name" matches
+ * symbol name or name@@LIB for library functions.
+ */
+static long elf_find_func_offset(const char *binary_path, const char *name)
+{
+	int fd, i, sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB };
+	bool is_shared_lib, is_name_qualified;
+	char errmsg[STRERR_BUFSIZE];
+	long ret = -ENOENT;
+	size_t name_len;
+	GElf_Ehdr ehdr;
+	Elf *elf;
+
+	fd = open(binary_path, O_RDONLY | O_CLOEXEC);
+	if (fd < 0) {
+		ret = -errno;
+		pr_warn("failed to open %s: %s\n", binary_path,
+			libbpf_strerror_r(ret, errmsg, sizeof(errmsg)));
+		return ret;
+	}
+	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
+	if (!elf) {
+		pr_warn("elf: could not read elf from %s: %s\n", binary_path, elf_errmsg(-1));
+		close(fd);
+		return -LIBBPF_ERRNO__FORMAT;
+	}
+	if (!gelf_getehdr(elf, &ehdr)) {
+		pr_warn("elf: failed to get ehdr from %s: %s\n", binary_path, elf_errmsg(-1));
+		ret = -LIBBPF_ERRNO__FORMAT;
+		goto out;
+	}
+	/* for shared lib case, we do not need to calculate relative offset */
+	is_shared_lib = ehdr.e_type == ET_DYN;
+
+	name_len = strlen(name);
+	/* Does name specify "@@LIB"? */
+	is_name_qualified = strstr(name, "@@") != NULL;
+
+	/* Search SHT_DYNSYM, SHT_SYMTAB for symbol. This search order is used because if
+	 * a binary is stripped, it may only have SHT_DYNSYM, and a fully-statically
+	 * linked binary may not have SHT_DYNSYM, so absence of a section should not be
+	 * reported as a warning/error. 
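+	 *
+	 * E.g., an unqualified "malloc" should match both a plain "malloc" and
+	 * a versioned "malloc@@GLIBC_2.2.5" (the default symbol version on
+	 * x86-64 glibc), but must not match an unrelated "malloc2"; see the
+	 * name checks below.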
+	 */
+	for (i = 0; i < ARRAY_SIZE(sh_types); i++) {
+		size_t nr_syms, strtabidx, idx;
+		Elf_Data *symbols = NULL;
+		Elf_Scn *scn = NULL;
+		int last_bind = -1;
+		const char *sname;
+		GElf_Shdr sh;
+
+		scn = elf_find_next_scn_by_type(elf, sh_types[i], NULL);
+		if (!scn) {
+			pr_debug("elf: failed to find symbol table ELF sections in '%s'\n",
+				 binary_path);
+			continue;
+		}
+		if (!gelf_getshdr(scn, &sh))
+			continue;
+		strtabidx = sh.sh_link;
+		symbols = elf_getdata(scn, 0);
+		if (!symbols) {
+			pr_warn("elf: failed to get symbols for symtab section in '%s': %s\n",
+				binary_path, elf_errmsg(-1));
+			ret = -LIBBPF_ERRNO__FORMAT;
+			goto out;
+		}
+		nr_syms = symbols->d_size / sh.sh_entsize;
+
+		for (idx = 0; idx < nr_syms; idx++) {
+			int curr_bind;
+			GElf_Sym sym;
+
+			if (!gelf_getsym(symbols, idx, &sym))
+				continue;
+
+			if (GELF_ST_TYPE(sym.st_info) != STT_FUNC)
+				continue;
+
+			sname = elf_strptr(elf, strtabidx, sym.st_name);
+			if (!sname)
+				continue;
+
+			curr_bind = GELF_ST_BIND(sym.st_info);
+
+			/* User can specify func, func@@LIB or func@@LIB_VERSION. */
+			if (strncmp(sname, name, name_len) != 0)
+				continue;
+			/* ...but we don't want a search for "foo" to match "foo2" also, so any
+			 * additional characters in sname should be of the form "@@LIB".
+			 */
+			if (!is_name_qualified && sname[name_len] != '\0' && sname[name_len] != '@')
+				continue;
+
+			if (ret >= 0) {
+				/* handle multiple matches */
+				if (last_bind != STB_WEAK && curr_bind != STB_WEAK) {
+					/* Only accept one non-weak bind. */
+					pr_warn("elf: ambiguous match for '%s', '%s' in '%s'\n",
+						sname, name, binary_path);
+					ret = -LIBBPF_ERRNO__FORMAT;
+					goto out;
+				} else if (curr_bind == STB_WEAK) {
+					/* already have a non-weak bind, and
+					 * this is a weak bind, so ignore.
+					 */
+					continue;
+				}
+			}
+			ret = sym.st_value;
+			last_bind = curr_bind;
+		}
+		/* For binaries that are not shared libraries, we need relative offset */
+		if (ret > 0 && !is_shared_lib)
+			ret = elf_find_relative_offset(binary_path, elf, ret);
+		if (ret > 0)
+			break;
+	}
+
+	if (ret > 0) {
+		pr_debug("elf: symbol address match for '%s' in '%s': 0x%lx\n", name, binary_path,
+			 ret);
+	} else {
+		if (ret == 0) {
+			pr_warn("elf: '%s' is 0 in symtab for '%s': %s\n", name, binary_path,
+				is_shared_lib ? "should not be 0 in a shared library" :
+						"try using shared library path instead");
+			ret = -ENOENT;
+		} else {
+			pr_warn("elf: failed to find symbol '%s' in '%s'\n", name, binary_path);
+		}
+	}
+out:
+	elf_end(elf);
+	close(fd);
+	return ret;
+}
+
+static const char *arch_specific_lib_paths(void)
+{
+	/*
+	 * Based on https://packages.debian.org/sid/libc6.
+	 *
+	 * Assume that the traced program is built for the same architecture
+	 * as libbpf, which should cover the vast majority of cases. 
+ */ +#if defined(__x86_64__) + return "/lib/x86_64-linux-gnu"; +#elif defined(__i386__) + return "/lib/i386-linux-gnu"; +#elif defined(__s390x__) + return "/lib/s390x-linux-gnu"; +#elif defined(__s390__) + return "/lib/s390-linux-gnu"; +#elif defined(__arm__) && defined(__SOFTFP__) + return "/lib/arm-linux-gnueabi"; +#elif defined(__arm__) && !defined(__SOFTFP__) + return "/lib/arm-linux-gnueabihf"; +#elif defined(__aarch64__) + return "/lib/aarch64-linux-gnu"; +#elif defined(__mips__) && defined(__MIPSEL__) && _MIPS_SZLONG == 64 + return "/lib/mips64el-linux-gnuabi64"; +#elif defined(__mips__) && defined(__MIPSEL__) && _MIPS_SZLONG == 32 + return "/lib/mipsel-linux-gnu"; +#elif defined(__powerpc64__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + return "/lib/powerpc64le-linux-gnu"; +#elif defined(__sparc__) && defined(__arch64__) + return "/lib/sparc64-linux-gnu"; +#elif defined(__riscv) && __riscv_xlen == 64 + return "/lib/riscv64-linux-gnu"; +#else + return NULL; +#endif +} + +/* Get full path to program/shared library. */ +static int resolve_full_path(const char *file, char *result, size_t result_sz) +{ + const char *search_paths[3] = {}; + int i; + + if (str_has_sfx(file, ".so") || strstr(file, ".so.")) { + search_paths[0] = getenv("LD_LIBRARY_PATH"); + search_paths[1] = "/usr/lib64:/usr/lib"; + search_paths[2] = arch_specific_lib_paths(); + } else { + search_paths[0] = getenv("PATH"); + search_paths[1] = "/usr/bin:/usr/sbin"; + } + + for (i = 0; i < ARRAY_SIZE(search_paths); i++) { + const char *s; + + if (!search_paths[i]) + continue; + for (s = search_paths[i]; s != NULL; s = strchr(s, ':')) { + char *next_path; + int seg_len; + + if (s[0] == ':') + s++; + next_path = strchr(s, ':'); + seg_len = next_path ? next_path - s : strlen(s); + if (!seg_len) + continue; + snprintf(result, result_sz, "%.*s/%s", seg_len, s, file); + /* ensure it is an executable file/link */ + if (access(result, R_OK | X_OK) < 0) + continue; + pr_debug("resolved '%s' to '%s'\n", file, result); + return 0; + } + } + return -ENOENT; +} + LIBBPF_API struct bpf_link * bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, const char *binary_path, size_t func_offset, @@ -10524,10 +10818,12 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, { DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); char errmsg[STRERR_BUFSIZE], *legacy_probe = NULL; + char full_binary_path[PATH_MAX]; struct bpf_link *link; size_t ref_ctr_off; int pfd, err; bool retprobe, legacy; + const char *func_name; if (!OPTS_VALID(opts, bpf_uprobe_opts)) return libbpf_err_ptr(-EINVAL); @@ -10536,12 +10832,37 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, ref_ctr_off = OPTS_GET(opts, ref_ctr_offset, 0); pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + if (binary_path && !strchr(binary_path, '/')) { + err = resolve_full_path(binary_path, full_binary_path, + sizeof(full_binary_path)); + if (err) { + pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", + prog->name, binary_path, err); + return libbpf_err_ptr(err); + } + binary_path = full_binary_path; + } + func_name = OPTS_GET(opts, func_name, NULL); + if (func_name) { + long sym_off; + + if (!binary_path) { + pr_warn("prog '%s': name-based attach requires binary_path\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + sym_off = elf_find_func_offset(binary_path, func_name); + if (sym_off < 0) + return libbpf_err_ptr(sym_off); + func_offset += sym_off; + } + legacy = determine_uprobe_perf_type() < 0; 
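 	/* If the "uprobe" PMU type can't be determined, the kernel lacks
 	 * perf_event_open() support for uprobes, so fall back to the legacy
 	 * tracefs-based probe interface below.
 	 */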
if (!legacy) {
 		pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path,
 					    func_offset, pid, ref_ctr_off);
 	} else {
-		char probe_name[512];
+		char probe_name[PATH_MAX + 64];
 
 		if (ref_ctr_off)
 			return libbpf_err_ptr(-EINVAL);
@@ -10589,6 +10910,60 @@ err_out:
 
 }
 
+/* Format of u[ret]probe section definition supporting auto-attach:
+ * u[ret]probe/binary:function[+offset]
+ *
+ * binary can be an absolute/relative path or a filename; the latter is resolved to a
+ * full binary path via bpf_program__attach_uprobe_opts.
+ *
+ * Specifying uprobe+ ensures we carry out strict matching; either "uprobe" must be
+ * specified (and auto-attach is not possible) or the above format is specified for
+ * auto-attach.
+ */
+static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link)
+{
+	DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts);
+	char *probe_type = NULL, *binary_path = NULL, *func_name = NULL;
+	int n, ret = -EINVAL;
+	long offset = 0;
+
+	*link = NULL;
+
+	n = sscanf(prog->sec_name, "%m[^/]/%m[^:]:%m[a-zA-Z0-9_.]+%li",
+		   &probe_type, &binary_path, &func_name, &offset);
+	switch (n) {
+	case 1:
+		/* handle SEC("u[ret]probe") - format is valid, but auto-attach is impossible. */
+		ret = 0;
+		break;
+	case 2:
+		pr_warn("prog '%s': section '%s' missing ':function[+offset]' specification\n",
+			prog->name, prog->sec_name);
+		break;
+	case 3:
+	case 4:
+		opts.retprobe = strcmp(probe_type, "uretprobe") == 0;
+		if (opts.retprobe && offset != 0) {
+			pr_warn("prog '%s': uretprobes do not support offset specification\n",
+				prog->name);
+			break;
+		}
+		opts.func_name = func_name;
+		*link = bpf_program__attach_uprobe_opts(prog, -1, binary_path, offset, &opts);
+		ret = libbpf_get_error(*link);
+		break;
+	default:
+		pr_warn("prog '%s': invalid format of section definition '%s'\n", prog->name,
+			prog->sec_name);
+		break;
+	}
+	free(probe_type);
+	free(binary_path);
+	free(func_name);
+
+	return ret;
+}
+
 struct bpf_link *bpf_program__attach_uprobe(const struct bpf_program *prog,
 					    bool retprobe, pid_t pid,
 					    const char *binary_path,
@@ -10599,6 +10974,85 @@ struct bpf_link *bpf_program__attach_uprobe(const struct bpf_program *prog,
 	return bpf_program__attach_uprobe_opts(prog, pid, binary_path, func_offset, &opts);
 }
 
+struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog,
+					  pid_t pid, const char *binary_path,
+					  const char *usdt_provider, const char *usdt_name,
+					  const struct bpf_usdt_opts *opts)
+{
+	char resolved_path[512];
+	struct bpf_object *obj = prog->obj;
+	struct bpf_link *link;
+	long usdt_cookie;
+	int err;
+
+	if (!OPTS_VALID(opts, bpf_usdt_opts))
+		return libbpf_err_ptr(-EINVAL);
+
+	if (bpf_program__fd(prog) < 0) {
+		pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n",
+			prog->name);
+		return libbpf_err_ptr(-EINVAL);
+	}
+
+	if (!strchr(binary_path, '/')) {
+		err = resolve_full_path(binary_path, resolved_path, sizeof(resolved_path));
+		if (err) {
+			pr_warn("prog '%s': failed to resolve full path for '%s': %d\n",
+				prog->name, binary_path, err);
+			return libbpf_err_ptr(err);
+		}
+		binary_path = resolved_path;
+	}
+
+	/* USDT manager is instantiated lazily on first USDT attach. It will
+	 * be destroyed together with BPF object in bpf_object__close(). 
+	 */
+	if (IS_ERR(obj->usdt_man))
+		return libbpf_ptr(obj->usdt_man);
+	if (!obj->usdt_man) {
+		obj->usdt_man = usdt_manager_new(obj);
+		if (IS_ERR(obj->usdt_man))
+			return libbpf_ptr(obj->usdt_man);
+	}
+
+	usdt_cookie = OPTS_GET(opts, usdt_cookie, 0);
+	link = usdt_manager_attach_usdt(obj->usdt_man, prog, pid, binary_path,
+					usdt_provider, usdt_name, usdt_cookie);
+	err = libbpf_get_error(link);
+	if (err)
+		return libbpf_err_ptr(err);
+	return link;
+}
+
+static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link)
+{
+	char *path = NULL, *provider = NULL, *name = NULL;
+	const char *sec_name;
+	int n, err;
+
+	sec_name = bpf_program__section_name(prog);
+	if (strcmp(sec_name, "usdt") == 0) {
+		/* no auto-attach for just SEC("usdt") */
+		*link = NULL;
+		return 0;
+	}
+
+	n = sscanf(sec_name, "usdt/%m[^:]:%m[^:]:%m[^:]", &path, &provider, &name);
+	if (n != 3) {
+		pr_warn("invalid section '%s', expected SEC(\"usdt/<path>:<provider>:<name>\")\n",
+			sec_name);
+		err = -EINVAL;
+	} else {
+		*link = bpf_program__attach_usdt(prog, -1 /* any process */, path,
+						 provider, name, NULL);
+		err = libbpf_get_error(*link);
+	}
+	free(path);
+	free(provider);
+	free(name);
+	return err;
+}
+
 static int determine_tracepoint_id(const char *tp_category,
 				   const char *tp_name)
 {
diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h
index 05dde85e19a6..63d66f1adf1a 100644
--- a/tools/lib/bpf/libbpf.h
+++ b/tools/lib/bpf/libbpf.h
@@ -459,9 +459,17 @@ struct bpf_uprobe_opts {
 	__u64 bpf_cookie;
 	/* uprobe is return probe, invoked at function return time */
 	bool retprobe;
+	/* Function name to attach to. Could be an unqualified ("abc") or library-qualified
+	 * "abc@LIBXYZ" name. To specify function entry, func_name should be set while
+	 * func_offset argument to bpf_program__attach_uprobe_opts() should be 0. To trace an
+	 * offset within a function, specify func_name and use func_offset argument to specify
+	 * offset within the function. Shared library functions must specify the shared library
+	 * binary_path.
+	 */
+	const char *func_name;
 	size_t :0;
 };
-#define bpf_uprobe_opts__last_field retprobe
+#define bpf_uprobe_opts__last_field func_name
 
 /**
  * @brief **bpf_program__attach_uprobe()** attaches a BPF program
@@ -503,6 +511,37 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid,
 				const char *binary_path, size_t func_offset,
 				const struct bpf_uprobe_opts *opts);
 
+struct bpf_usdt_opts {
+	/* size of this struct, for forward/backward compatibility */
+	size_t sz;
+	/* custom user-provided value accessible through bpf_usdt_cookie() */
+	__u64 usdt_cookie;
+	size_t :0;
+};
+#define bpf_usdt_opts__last_field usdt_cookie
+
+/**
+ * @brief **bpf_program__attach_usdt()** is just like
+ * bpf_program__attach_uprobe_opts() except it covers USDT (User-space
+ * Statically Defined Tracepoint) attachment, instead of attaching to
+ * user-space function entry or exit. 
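+ *
+ * A minimal usage sketch (the skeleton, binary path, provider and probe
+ * names below are illustrative placeholders):
+ *
+ *	DECLARE_LIBBPF_OPTS(bpf_usdt_opts, opts, .usdt_cookie = 42);
+ *	struct bpf_link *link;
+ *
+ *	link = bpf_program__attach_usdt(skel->progs.my_usdt_prog, -1,
+ *					"/usr/bin/my_app", "my_provider",
+ *					"my_probe", &opts);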
+ *
+ * @param prog BPF program to attach
+ * @param pid Process ID to attach the uprobe to, 0 for self (own process),
+ * -1 for all processes
+ * @param binary_path Path to binary that contains provided USDT probe
+ * @param usdt_provider USDT provider name
+ * @param usdt_name USDT probe name
+ * @param opts Options for altering program attachment
+ * @return Reference to the newly created BPF link; or NULL is returned on error,
+ * error code is stored in errno
+ */
+LIBBPF_API struct bpf_link *
+bpf_program__attach_usdt(const struct bpf_program *prog,
+			 pid_t pid, const char *binary_path,
+			 const char *usdt_provider, const char *usdt_name,
+			 const struct bpf_usdt_opts *opts);
+
 struct bpf_tracepoint_opts {
 	/* size of this struct, for forward/backward compatiblity */
 	size_t sz;
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index dd35ee58bfaa..82f6d62176dd 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -444,6 +444,7 @@ LIBBPF_0.8.0 {
 	global:
 		bpf_object__destroy_subskeleton;
 		bpf_object__open_subskeleton;
+		bpf_program__attach_usdt;
 		libbpf_register_prog_handler;
 		libbpf_unregister_prog_handler;
 		bpf_program__attach_kprobe_multi_opts;
diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h
index b6247dc7f8eb..080272421f6c 100644
--- a/tools/lib/bpf/libbpf_internal.h
+++ b/tools/lib/bpf/libbpf_internal.h
@@ -103,6 +103,17 @@
 #define str_has_pfx(str, pfx) \
 	(strncmp(str, pfx, __builtin_constant_p(pfx) ? sizeof(pfx) - 1 : strlen(pfx)) == 0)
 
+/* suffix check */
+static inline bool str_has_sfx(const char *str, const char *sfx)
+{
+	size_t str_len = strlen(str);
+	size_t sfx_len = strlen(sfx);
+
+	if (sfx_len <= str_len)
+		return strcmp(str + str_len - sfx_len, sfx) == 0;
+	return false;
+}
+
 /* Symbol versioning is different between static and shared library.
  * Properly versioned symbols are needed for shared library, but
  * only the symbol of the new version is needed for static library.
@@ -148,6 +159,15 @@ do {				\
 #ifndef __has_builtin
 #define __has_builtin(x) 0
 #endif
+
+struct bpf_link {
+	int (*detach)(struct bpf_link *link);
+	void (*dealloc)(struct bpf_link *link);
+	char *pin_path;		/* NULL, if not pinned */
+	int fd;			/* hook FD, -1 if not applicable */
+	bool disconnected;
+};
+
 /*
  * Re-implement glibc's reallocarray() for libbpf internal-only use.
  * reallocarray(), unfortunately, is not available in all versions of glibc,
@@ -329,6 +349,8 @@ enum kern_feature_id {
 	FEAT_BTF_TYPE_TAG,
 	/* memcg-based accounting for BPF maps and progs */
 	FEAT_MEMCG_ACCOUNT,
+	/* BPF cookie (bpf_get_attach_cookie() BPF helper) support */
+	FEAT_BPF_COOKIE,
 	__FEAT_CNT,
 };
 
@@ -543,4 +565,12 @@ int bpf_core_add_cands(struct bpf_core_cand *local_cand,
 		       struct bpf_core_cand_list *cands);
 void bpf_core_free_cands(struct bpf_core_cand_list *cands);
 
+struct usdt_manager *usdt_manager_new(struct bpf_object *obj);
+void usdt_manager_free(struct usdt_manager *man);
+struct bpf_link * usdt_manager_attach_usdt(struct usdt_manager *man,
+					   const struct bpf_program *prog,
+					   pid_t pid, const char *path,
+					   const char *usdt_provider, const char *usdt_name,
+					   long usdt_cookie);
+
 #endif /* __LIBBPF_LIBBPF_INTERNAL_H */
diff --git a/tools/lib/bpf/usdt.bpf.h b/tools/lib/bpf/usdt.bpf.h
new file mode 100644
index 000000000000..4181fddb3687
--- /dev/null
+++ b/tools/lib/bpf/usdt.bpf.h
@@ -0,0 +1,259 @@
+/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
*/
+#ifndef __USDT_BPF_H__
+#define __USDT_BPF_H__
+
+#include <linux/errno.h>
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <bpf/bpf_core_read.h>
+
+/* Below types and maps are internal implementation details of libbpf's USDT
+ * support and are subject to change. Also, bpf_usdt_xxx() API helpers should
+ * be considered an unstable API as well and might be adjusted based on user
+ * feedback from using libbpf's USDT support in production.
+ */
+
+/* User can override BPF_USDT_MAX_SPEC_CNT to change default size of internal
+ * map that keeps track of USDT argument specifications. This might be
+ * necessary if there are a lot of USDT attachments.
+ */
+#ifndef BPF_USDT_MAX_SPEC_CNT
+#define BPF_USDT_MAX_SPEC_CNT 256
+#endif
+/* User can override BPF_USDT_MAX_IP_CNT to change default size of internal
+ * map that keeps track of IP (memory address) mapping to USDT argument
+ * specification.
+ * Note, if kernel supports BPF cookies, this map is not used and could be
+ * resized all the way to 1 to save a bit of memory.
+ */
+#ifndef BPF_USDT_MAX_IP_CNT
+#define BPF_USDT_MAX_IP_CNT (4 * BPF_USDT_MAX_SPEC_CNT)
+#endif
+/* We use BPF CO-RE to detect support for BPF cookie from BPF side. This is
+ * the only dependency on CO-RE, so if it's undesirable, user can override
+ * BPF_USDT_HAS_BPF_COOKIE to specify whether BPF cookie is supported or not.
+ */
+#ifndef BPF_USDT_HAS_BPF_COOKIE
+#define BPF_USDT_HAS_BPF_COOKIE \
+	bpf_core_enum_value_exists(enum bpf_func_id___usdt, BPF_FUNC_get_attach_cookie___usdt)
+#endif
+
+enum __bpf_usdt_arg_type {
+	BPF_USDT_ARG_CONST,
+	BPF_USDT_ARG_REG,
+	BPF_USDT_ARG_REG_DEREF,
+};
+
+struct __bpf_usdt_arg_spec {
+	/* u64 scalar interpreted depending on arg_type, see below */
+	__u64 val_off;
+	/* arg location case, see bpf_usdt_arg() for details */
+	enum __bpf_usdt_arg_type arg_type;
+	/* offset of referenced register within struct pt_regs */
+	short reg_off;
+	/* whether arg should be interpreted as signed value */
+	bool arg_signed;
+	/* number of bits that need to be cleared and, optionally,
+	 * sign-extended to cast arguments that are 1, 2, or 4 bytes
+	 * long into final 8-byte u64/s64 value returned to user
+	 */
+	char arg_bitshift;
+};
+
+/* should match USDT_MAX_ARG_CNT in usdt.c exactly */
+#define BPF_USDT_MAX_ARG_CNT 12
+struct __bpf_usdt_spec {
+	struct __bpf_usdt_arg_spec args[BPF_USDT_MAX_ARG_CNT];
+	__u64 usdt_cookie;
+	short arg_cnt;
+};
+
+struct {
+	__uint(type, BPF_MAP_TYPE_ARRAY);
+	__uint(max_entries, BPF_USDT_MAX_SPEC_CNT);
+	__type(key, int);
+	__type(value, struct __bpf_usdt_spec);
+} __bpf_usdt_specs SEC(".maps") __weak;
+
+struct {
+	__uint(type, BPF_MAP_TYPE_HASH);
+	__uint(max_entries, BPF_USDT_MAX_IP_CNT);
+	__type(key, long);
+	__type(value, __u32);
+} __bpf_usdt_ip_to_spec_id SEC(".maps") __weak;
+
+/* don't rely on user's BPF code to have latest definition of bpf_func_id */
+enum bpf_func_id___usdt {
+	BPF_FUNC_get_attach_cookie___usdt = 0xBAD, /* value doesn't matter */
+};
+
+static __always_inline
+int __bpf_usdt_spec_id(struct pt_regs *ctx)
+{
+	if (!BPF_USDT_HAS_BPF_COOKIE) {
+		long ip = PT_REGS_IP(ctx);
+		int *spec_id_ptr;
+
+		spec_id_ptr = bpf_map_lookup_elem(&__bpf_usdt_ip_to_spec_id, &ip);
+		return spec_id_ptr ? *spec_id_ptr : -ESRCH;
+	}
+
+	return bpf_get_attach_cookie(ctx);
+}
+
+/* Return number of USDT arguments defined for currently traced USDT. 
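+ * E.g., a handler can guard against an unexpected probe signature with:
+ *
+ *	if (bpf_usdt_arg_cnt(ctx) != 3)
+ *		return 0;
+ *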
+ */
+__weak __hidden
+int bpf_usdt_arg_cnt(struct pt_regs *ctx)
+{
+	struct __bpf_usdt_spec *spec;
+	int spec_id;
+
+	spec_id = __bpf_usdt_spec_id(ctx);
+	if (spec_id < 0)
+		return -ESRCH;
+
+	spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id);
+	if (!spec)
+		return -ESRCH;
+
+	return spec->arg_cnt;
+}
+
+/* Fetch USDT argument #*arg_num* (zero-indexed) and put its value into *res.
+ * Returns 0 on success; negative error, otherwise.
+ * On error *res is guaranteed to be set to zero.
+ */
+__weak __hidden
+int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res)
+{
+	struct __bpf_usdt_spec *spec;
+	struct __bpf_usdt_arg_spec *arg_spec;
+	unsigned long val;
+	int err, spec_id;
+
+	*res = 0;
+
+	spec_id = __bpf_usdt_spec_id(ctx);
+	if (spec_id < 0)
+		return -ESRCH;
+
+	spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id);
+	if (!spec)
+		return -ESRCH;
+
+	if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt)
+		return -ENOENT;
+
+	arg_spec = &spec->args[arg_num];
+	switch (arg_spec->arg_type) {
+	case BPF_USDT_ARG_CONST:
+		/* Arg is just a constant ("-4@$-9" in USDT arg spec).
+		 * value is recorded in arg_spec->val_off directly.
+		 */
+		val = arg_spec->val_off;
+		break;
+	case BPF_USDT_ARG_REG:
+		/* Arg is in a register (e.g., "8@%rax" in USDT arg spec),
+		 * so we read the contents of that register directly from
+		 * struct pt_regs. To keep things simple user-space parts
+		 * record offsetof(struct pt_regs, <regname>) in arg_spec->reg_off.
+		 */
+		err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off);
+		if (err)
+			return err;
+		break;
+	case BPF_USDT_ARG_REG_DEREF:
+		/* Arg is in memory addressed by register, plus some offset
+		 * (e.g., "-4@-1204(%rbp)" in USDT arg spec). Register is
+		 * identified like with BPF_USDT_ARG_REG case, and the offset
+		 * is in arg_spec->val_off. We first fetch register contents
+		 * from pt_regs, then do another user-space probe read to
+		 * fetch argument value itself.
+		 */
+		err = bpf_probe_read_kernel(&val, sizeof(val), (void *)ctx + arg_spec->reg_off);
+		if (err)
+			return err;
+		err = bpf_probe_read_user(&val, sizeof(val), (void *)val + arg_spec->val_off);
+		if (err)
+			return err;
+#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+		val >>= arg_spec->arg_bitshift;
+#endif
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	/* cast arg from 1, 2, or 4 bytes to final 8 byte size clearing
+	 * necessary upper arg_bitshift bits, with sign extension if argument
+	 * is signed
+	 */
+	val <<= arg_spec->arg_bitshift;
+	if (arg_spec->arg_signed)
+		val = ((long)val) >> arg_spec->arg_bitshift;
+	else
+		val = val >> arg_spec->arg_bitshift;
+	*res = val;
+	return 0;
+}
+
+/* Retrieve user-specified cookie value provided during attach as
+ * bpf_usdt_opts.usdt_cookie. This serves the same purpose as BPF cookie
+ * returned by bpf_get_attach_cookie(). Libbpf's support for USDT is itself
+ * utilizing BPF cookies internally, so user can't use BPF cookie directly
+ * for USDT programs and has to use bpf_usdt_cookie() API instead. 
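+ *
+ * E.g., `long cookie = bpf_usdt_cookie(ctx);` inside a USDT handler returns
+ * the value passed via bpf_usdt_opts.usdt_cookie at attach time, or 0 if
+ * none was provided.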
+ */
+__weak __hidden
+long bpf_usdt_cookie(struct pt_regs *ctx)
+{
+	struct __bpf_usdt_spec *spec;
+	int spec_id;
+
+	spec_id = __bpf_usdt_spec_id(ctx);
+	if (spec_id < 0)
+		return 0;
+
+	spec = bpf_map_lookup_elem(&__bpf_usdt_specs, &spec_id);
+	if (!spec)
+		return 0;
+
+	return spec->usdt_cookie;
+}
+
+/* we rely on ___bpf_apply() and ___bpf_narg() macros already defined in bpf_tracing.h */
+#define ___bpf_usdt_args0() ctx
+#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); (void *)_x; })
+#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); (void *)_x; })
+#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); (void *)_x; })
+#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); (void *)_x; })
+#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); (void *)_x; })
+#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); (void *)_x; })
+#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); (void *)_x; })
+#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); (void *)_x; })
+#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); (void *)_x; })
+#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); (void *)_x; })
+#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); (void *)_x; })
+#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); (void *)_x; })
+#define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args)
+
+/*
+ * BPF_USDT serves the same purpose for USDT handlers as BPF_PROG for
+ * tp_btf/fentry/fexit BPF programs and BPF_KPROBE for kprobes.
+ * Original struct pt_regs * context is preserved as 'ctx' argument.
+ */
+#define BPF_USDT(name, args...) \
+name(struct pt_regs *ctx); \
+static __attribute__((always_inline)) typeof(name(0)) \
+____##name(struct pt_regs *ctx, ##args); \
+typeof(name(0)) name(struct pt_regs *ctx) \
+{ \
+	_Pragma("GCC diagnostic push") \
+	_Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
+	return ____##name(___bpf_usdt_args(args)); \
+	_Pragma("GCC diagnostic pop") \
+} \
+static __attribute__((always_inline)) typeof(name(0)) \
+____##name(struct pt_regs *ctx, ##args)
+
+#endif /* __USDT_BPF_H__ */
diff --git a/tools/lib/bpf/usdt.c b/tools/lib/bpf/usdt.c
new file mode 100644
index 000000000000..acf2d99a9e77
--- /dev/null
+++ b/tools/lib/bpf/usdt.c
@@ -0,0 +1,1335 @@
+// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <libelf.h>
+#include <gelf.h>
+#include <unistd.h>
+#include <linux/ptrace.h>
+#include <linux/kernel.h>
+
+#include "bpf.h"
+#include "libbpf.h"
+#include "libbpf_common.h"
+#include "libbpf_internal.h"
+#include "hashmap.h"
+
+/* libbpf's USDT support consists of BPF-side state/code and user-space
+ * state/code working together in concert. BPF-side parts are defined in
+ * usdt.bpf.h header library. User-space state is encapsulated by struct
+ * usdt_manager and all the supporting code centered around usdt_manager. 
+ *
+ * usdt.bpf.h defines two BPF maps that usdt_manager expects: USDT spec map
+ * and IP-to-spec-ID map, which is an auxiliary map necessary for kernels that
+ * don't support BPF cookie (see below). These two maps are implicitly
+ * embedded into user's end BPF object file when user's code includes
+ * usdt.bpf.h. This means that libbpf doesn't do anything special to create
+ * these USDT support maps. They are created by normal libbpf logic of
+ * instantiating BPF maps when opening and loading BPF object.
+ *
+ * As such, libbpf is basically unaware of the need to do anything
+ * USDT-related until the very first call to bpf_program__attach_usdt(), which
+ * can be called by user explicitly or happen automatically during skeleton
+ * attach (or, equivalently, through generic bpf_program__attach() call). At
+ * this point, libbpf will instantiate and initialize struct usdt_manager and
+ * store it in bpf_object. USDT manager is a per-BPF object construct, as each
+ * independent BPF object might or might not have USDT programs, and thus all
+ * the expected USDT-related state. There is no coordination between two
+ * bpf_objects in terms of USDT attachment; they are oblivious of each other's
+ * existence, and libbpf simply deals with each bpf_object's USDT state
+ * independently.
+ *
+ * Quick crash course on USDTs.
+ *
+ * From user-space application's point of view, USDT is essentially just
+ * a slightly special function call that normally has zero overhead, unless it
+ * is being traced by some external entity (e.g., a BPF-based tool). Here's how
+ * a typical application can trigger a USDT probe:
+ *
+ *   #include <sys/sdt.h>  // provided by systemtap-sdt-devel package
+ *   // folly also provides similar functionality in folly/tracing/StaticTracepoint.h
+ *
+ *   STAP_PROBE3(my_usdt_provider, my_usdt_probe_name, 123, x, &y);
+ *
+ * USDT is identified by its <provider>:<name> pair of names. Each
+ * individual USDT has a fixed number of arguments (3 in the above example)
+ * and specifies values of each argument as if it was a function call.
+ *
+ * USDT call is actually not a function call, but is instead replaced by
+ * a single NOP instruction (thus zero overhead, effectively). But in addition
+ * to that, those USDT macros generate special SHT_NOTE ELF records in
+ * .note.stapsdt ELF section. Here's an example USDT definition as emitted by
+ * `readelf -n <binary>`:
+ *
+ *   stapsdt              0x00000089       NT_STAPSDT (SystemTap probe descriptors)
+ *     Provider: test
+ *     Name: usdt12
+ *     Location: 0x0000000000549df3, Base: 0x00000000008effa4, Semaphore: 0x0000000000a4606e
+ *     Arguments: -4@-1204(%rbp) -4@%edi -8@-1216(%rbp) -8@%r8 -4@$5 -8@%r9 8@%rdx 8@%r10 -4@$-9 -2@%cx -2@%ax -1@%sil
+ *
+ * In this case we have USDT test:usdt12 with 12 arguments.
+ *
+ * Location and base are offsets used to calculate absolute IP address of that
+ * NOP instruction that kernel can replace with an interrupt instruction to
+ * trigger instrumentation code (BPF program for all that we care about).
+ *
+ * Semaphore above is an optional feature. It records an address of a 2-byte
+ * refcount variable (normally in '.probes' ELF section) used for signaling if
+ * there is anything that is attached to USDT. This is useful for user
+ * applications if, for example, they need to prepare some arguments that are
+ * passed only to USDTs and preparation is expensive. By checking if USDT is
+ * "activated", an application can avoid paying those costs unnecessarily. 
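+ * (For instance, headers generated by systemtap-sdt's dtrace tool expose
+ * <provider>_<probe>_ENABLED() macros that test exactly this semaphore.)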
+ * A recent enough kernel has built-in support for automatically managing this
+ * refcount, which libbpf expects and relies on. If USDT is defined without
+ * an associated semaphore, this value will be zero. See selftests for semaphore
+ * examples.
+ *
+ * Arguments is the most interesting part. This USDT specification string
+ * provides information about all the USDT arguments and their locations. The
+ * part before the @ sign defines the byte size of the argument (1, 2, 4, or 8)
+ * and whether the argument is signed or unsigned (negative size means signed).
+ * The part after the @ sign is an assembly-like definition of argument location
+ * (see [0] for more details). Technically, the assembler can produce some pretty
+ * advanced definitions, but libbpf currently supports the three most common
+ * cases:
+ *   1) immediate constant, see 5th and 9th args above (-4@$5 and -4@$-9);
+ *   2) register value, e.g., 8@%rdx, which means "unsigned 8-byte integer
+ *      whose value is in register %rdx";
+ *   3) memory dereference addressed by register, e.g., -4@-1204(%rbp), which
+ *      specifies signed 32-bit integer stored at offset -1204 bytes from
+ *      memory address stored in %rbp.
+ *
+ *   [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
+ *
+ * During attachment, libbpf parses all the relevant USDT specifications and
+ * prepares `struct usdt_spec` (USDT spec), which is then provided to BPF-side
+ * code through the spec map. This allows BPF applications to quickly fetch the
+ * actual value at runtime using simple BPF-side code.
+ *
+ * With basics out of the way, let's go over less immediately obvious aspects
+ * of supporting USDTs.
+ *
+ * First, there is no special USDT BPF program type. It is actually just
+ * a uprobe BPF program (which, for the kernel, at least currently, is just
+ * a kprobe program, so BPF_PROG_TYPE_KPROBE program type), with the only
+ * difference that a uprobe is usually attached at the function entry, while
+ * USDT will normally be somewhere inside the function. But it should always
+ * be pointing to a NOP instruction, which makes such uprobes the fastest
+ * uprobe kind.
+ *
+ * Second, it's important to realize that such STAP_PROBEn(provider, name, ...)
+ * macro invocations can end up being inlined many, many times, depending on
+ * specifics of each individual user application. So a single conceptual USDT
+ * (identified by its provider:name pair of identifiers) is, generally speaking,
+ * multiple uprobe locations (USDT call sites) in different places in user
+ * application. Further, again due to inlining, each USDT call site might end
+ * up having the same argument #N located in a different place. In one call
+ * site it could be a constant, in another it will end up in a register, and in
+ * yet another it could be some other register or even somewhere on the stack.
+ *
+ * As such, "attaching to USDT" means (in the general case) attaching the same
+ * uprobe BPF program to multiple target locations in user application, each
+ * potentially having a completely different USDT spec associated with it.
+ * To wire all this up together, libbpf allocates a unique integer spec ID for
+ * each unique USDT spec. Spec IDs are allocated as sequential small integers
+ * so that they can be used as keys in an array BPF map (for performance reasons).
+ * Spec ID allocation and accounting is a big part of what usdt_manager is
+ * about. This state has to be maintained per-BPF object and coordinated
+ * across different USDT attachments within the same BPF object. 
+ *
+ * Spec ID is the key in the spec BPF map, value is the actual USDT spec laid
+ * out as struct usdt_spec. Each invocation of BPF program at runtime needs to
+ * know its associated spec ID. It gets it either through BPF cookie, which
+ * libbpf sets to the spec ID during attach time, or, if kernel is too old to
+ * support BPF cookie, through the IP-to-spec-ID map that libbpf maintains in
+ * such case. The latter means that some modes of operation can't be supported
+ * without BPF cookie. Such a mode is attaching to a shared library
+ * "generically", without specifying target process. In such case, it's
+ * impossible to calculate absolute IP addresses for the IP-to-spec-ID map, and
+ * thus such mode is not supported without BPF cookie support.
+ *
+ * Note that libbpf is using BPF cookie functionality for its own internal
+ * needs, so the user can't rely on the BPF cookie feature directly. To that
+ * end, libbpf provides conceptually equivalent USDT cookie support. It's
+ * still a u64 user-provided value that can be associated with a USDT
+ * attachment. Note that this will be the same value for all USDT call sites
+ * within the same single *logical* USDT attachment. This makes sense because,
+ * to the user, attaching to a USDT means a single BPF program triggered for
+ * a singular USDT probe. The fact that this is done at multiple actual
+ * locations is a mostly hidden implementation detail. This USDT cookie value
+ * can be fetched with the bpf_usdt_cookie(ctx) API provided by usdt.bpf.h.
+ *
+ * Lastly, while a single USDT can have tons of USDT call sites, it doesn't
+ * necessarily have that many different USDT specs. It very well might be
+ * that 1000 USDT call sites only need 5 different USDT specs, because all the
+ * arguments are typically contained in a small set of registers or stack
+ * locations. As such, it's wasteful to allocate as many USDT spec IDs as
+ * there are USDT call sites. So libbpf tries to be frugal and performs
+ * on-the-fly deduplication during a single USDT attachment to only allocate
+ * the minimal required amount of unique USDT specs (and thus spec IDs). This
+ * is trivially achieved by using the USDT spec string (Arguments string from
+ * USDT note) as a lookup key in a hashmap. USDT spec string uniquely defines
+ * everything about how to fetch USDT arguments, so two USDT call sites
+ * sharing a USDT spec string can safely share the same USDT spec and spec ID.
+ * Note, this spec string deduplication is happening only during the same USDT
+ * attachment, so each USDT spec shares the same USDT cookie value. This is
+ * not generally true for other USDT attachments within the same BPF object,
+ * as even if USDT spec string is the same, USDT cookie value can be
+ * different. It was deemed excessive to try to deduplicate across independent
+ * USDT attachments by taking into account USDT spec string *and* USDT cookie
+ * value, which would have complicated spec ID accounting significantly for
+ * little gain. 
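+ *
+ * Putting the BPF-side pieces together, a typical USDT handler might look
+ * like this sketch (binary path, provider and probe names are illustrative
+ * placeholders):
+ *
+ *	SEC("usdt/./my_app:my_usdt_provider:my_usdt_probe_name")
+ *	int BPF_USDT(handle_my_probe, int arg1, long arg2)
+ *	{
+ *		long cnt = bpf_usdt_arg_cnt(ctx);
+ *		long cookie = bpf_usdt_cookie(ctx);
+ *
+ *		bpf_printk("args=%ld cookie=%ld arg1=%d", cnt, cookie, arg1);
+ *		return 0;
+ *	}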
+ */ + +#define USDT_BASE_SEC ".stapsdt.base" +#define USDT_SEMA_SEC ".probes" +#define USDT_NOTE_SEC ".note.stapsdt" +#define USDT_NOTE_TYPE 3 +#define USDT_NOTE_NAME "stapsdt" + +/* should match exactly enum __bpf_usdt_arg_type from usdt.bpf.h */ +enum usdt_arg_type { + USDT_ARG_CONST, + USDT_ARG_REG, + USDT_ARG_REG_DEREF, +}; + +/* should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h */ +struct usdt_arg_spec { + __u64 val_off; + enum usdt_arg_type arg_type; + short reg_off; + bool arg_signed; + char arg_bitshift; +}; + +/* should match BPF_USDT_MAX_ARG_CNT in usdt.bpf.h */ +#define USDT_MAX_ARG_CNT 12 + +/* should match struct __bpf_usdt_spec from usdt.bpf.h */ +struct usdt_spec { + struct usdt_arg_spec args[USDT_MAX_ARG_CNT]; + __u64 usdt_cookie; + short arg_cnt; +}; + +struct usdt_note { + const char *provider; + const char *name; + /* USDT args specification string, e.g.: + * "-4@%esi -4@-24(%rbp) -4@%ecx 2@%ax 8@%rdx" + */ + const char *args; + long loc_addr; + long base_addr; + long sema_addr; +}; + +struct usdt_target { + long abs_ip; + long rel_ip; + long sema_off; + struct usdt_spec spec; + const char *spec_str; +}; + +struct usdt_manager { + struct bpf_map *specs_map; + struct bpf_map *ip_to_spec_id_map; + + int *free_spec_ids; + size_t free_spec_cnt; + size_t next_free_spec_id; + + bool has_bpf_cookie; + bool has_sema_refcnt; +}; + +struct usdt_manager *usdt_manager_new(struct bpf_object *obj) +{ + static const char *ref_ctr_sysfs_path = "/sys/bus/event_source/devices/uprobe/format/ref_ctr_offset"; + struct usdt_manager *man; + struct bpf_map *specs_map, *ip_to_spec_id_map; + + specs_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_specs"); + ip_to_spec_id_map = bpf_object__find_map_by_name(obj, "__bpf_usdt_ip_to_spec_id"); + if (!specs_map || !ip_to_spec_id_map) { + pr_warn("usdt: failed to find USDT support BPF maps, did you forget to include bpf/usdt.bpf.h?\n"); + return ERR_PTR(-ESRCH); + } + + man = calloc(1, sizeof(*man)); + if (!man) + return ERR_PTR(-ENOMEM); + + man->specs_map = specs_map; + man->ip_to_spec_id_map = ip_to_spec_id_map; + + /* Detect if BPF cookie is supported for kprobes. + * We don't need IP-to-ID mapping if we can use BPF cookies. + * Added in: 7adfc6c9b315 ("bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value") + */ + man->has_bpf_cookie = kernel_supports(obj, FEAT_BPF_COOKIE); + + /* Detect kernel support for automatic refcounting of USDT semaphore. + * If this is not supported, USDTs with semaphores will not be supported. 
+ * Added in: a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") + */ + man->has_sema_refcnt = access(ref_ctr_sysfs_path, F_OK) == 0; + + return man; +} + +void usdt_manager_free(struct usdt_manager *man) +{ + if (IS_ERR_OR_NULL(man)) + return; + + free(man->free_spec_ids); + free(man); +} + +static int sanity_check_usdt_elf(Elf *elf, const char *path) +{ + GElf_Ehdr ehdr; + int endianness; + + if (elf_kind(elf) != ELF_K_ELF) { + pr_warn("usdt: unrecognized ELF kind %d for '%s'\n", elf_kind(elf), path); + return -EBADF; + } + + switch (gelf_getclass(elf)) { + case ELFCLASS64: + if (sizeof(void *) != 8) { + pr_warn("usdt: attaching to 64-bit ELF binary '%s' is not supported\n", path); + return -EBADF; + } + break; + case ELFCLASS32: + if (sizeof(void *) != 4) { + pr_warn("usdt: attaching to 32-bit ELF binary '%s' is not supported\n", path); + return -EBADF; + } + break; + default: + pr_warn("usdt: unsupported ELF class for '%s'\n", path); + return -EBADF; + } + + if (!gelf_getehdr(elf, &ehdr)) + return -EINVAL; + + if (ehdr.e_type != ET_EXEC && ehdr.e_type != ET_DYN) { + pr_warn("usdt: unsupported type of ELF binary '%s' (%d), only ET_EXEC and ET_DYN are supported\n", + path, ehdr.e_type); + return -EBADF; + } + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + endianness = ELFDATA2LSB; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + endianness = ELFDATA2MSB; +#else +# error "Unrecognized __BYTE_ORDER__" +#endif + if (endianness != ehdr.e_ident[EI_DATA]) { + pr_warn("usdt: ELF endianness mismatch for '%s'\n", path); + return -EBADF; + } + + return 0; +} + +static int find_elf_sec_by_name(Elf *elf, const char *sec_name, GElf_Shdr *shdr, Elf_Scn **scn) +{ + Elf_Scn *sec = NULL; + size_t shstrndx; + + if (elf_getshdrstrndx(elf, &shstrndx)) + return -EINVAL; + + /* check if ELF is corrupted and avoid calling elf_strptr if yes */ + if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) + return -EINVAL; + + while ((sec = elf_nextscn(elf, sec)) != NULL) { + char *name; + + if (!gelf_getshdr(sec, shdr)) + return -EINVAL; + + name = elf_strptr(elf, shstrndx, shdr->sh_name); + if (name && strcmp(sec_name, name) == 0) { + *scn = sec; + return 0; + } + } + + return -ENOENT; +} + +struct elf_seg { + long start; + long end; + long offset; + bool is_exec; +}; + +static int cmp_elf_segs(const void *_a, const void *_b) +{ + const struct elf_seg *a = _a; + const struct elf_seg *b = _b; + + return a->start < b->start ? 
-1 : 1;
+}
+
+static int parse_elf_segs(Elf *elf, const char *path, struct elf_seg **segs, size_t *seg_cnt)
+{
+	GElf_Phdr phdr;
+	size_t n;
+	int i, err;
+	struct elf_seg *seg;
+	void *tmp;
+
+	*seg_cnt = 0;
+
+	if (elf_getphdrnum(elf, &n)) {
+		err = -errno;
+		return err;
+	}
+
+	for (i = 0; i < n; i++) {
+		if (!gelf_getphdr(elf, i, &phdr)) {
+			err = -errno;
+			return err;
+		}
+
+		pr_debug("usdt: discovered PHDR #%d in '%s': vaddr 0x%lx memsz 0x%lx offset 0x%lx type 0x%lx flags 0x%lx\n",
+			 i, path, (long)phdr.p_vaddr, (long)phdr.p_memsz, (long)phdr.p_offset,
+			 (long)phdr.p_type, (long)phdr.p_flags);
+		if (phdr.p_type != PT_LOAD)
+			continue;
+
+		tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs));
+		if (!tmp)
+			return -ENOMEM;
+
+		*segs = tmp;
+		seg = *segs + *seg_cnt;
+		(*seg_cnt)++;
+
+		seg->start = phdr.p_vaddr;
+		seg->end = phdr.p_vaddr + phdr.p_memsz;
+		seg->offset = phdr.p_offset;
+		seg->is_exec = phdr.p_flags & PF_X;
+	}
+
+	if (*seg_cnt == 0) {
+		pr_warn("usdt: failed to find PT_LOAD program headers in '%s'\n", path);
+		return -ESRCH;
+	}
+
+	qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs);
+	return 0;
+}
+
+static int parse_lib_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt)
+{
+	char path[PATH_MAX], line[PATH_MAX], mode[16];
+	size_t seg_start, seg_end, seg_off;
+	struct elf_seg *seg;
+	int tmp_pid, i, err;
+	FILE *f;
+
+	*seg_cnt = 0;
+
+	/* Handle containerized binaries only accessible from
+	 * /proc/<pid>/root/<path>. They will be reported as just /<path> in
+	 * /proc/<pid>/maps.
+	 */
+	if (sscanf(lib_path, "/proc/%d/root%s", &tmp_pid, path) == 2 && pid == tmp_pid)
+		goto proceed;
+
+	if (!realpath(lib_path, path)) {
+		pr_warn("usdt: failed to get absolute path of '%s' (err %d), using path as is...\n",
+			lib_path, -errno);
+		libbpf_strlcpy(path, lib_path, sizeof(path));
+	}
+
+proceed:
+	sprintf(line, "/proc/%d/maps", pid);
+	f = fopen(line, "r");
+	if (!f) {
+		err = -errno;
+		pr_warn("usdt: failed to open '%s' to get base addr of '%s': %d\n",
+			line, lib_path, err);
+		return err;
+	}
+
+	/* We need to handle lines with no path at the end:
+	 *
+	 * 7f5c6f5d1000-7f5c6f5d3000 rw-p 001c7000 08:04 21238613      /usr/lib64/libc-2.17.so
+	 * 7f5c6f5d3000-7f5c6f5d8000 rw-p 00000000 00:00 0
+	 * 7f5c6f5d8000-7f5c6f5d9000 r-xp 00000000 103:01 362990598    /data/users/andriin/linux/tools/bpf/usdt/libhello_usdt.so
+	 */
+	while (fscanf(f, "%zx-%zx %s %zx %*s %*d%[^\n]\n",
+		      &seg_start, &seg_end, mode, &seg_off, line) == 5) {
+		void *tmp;
+
+		/* to handle the no-path case (see above) we need to capture
+		 * the line without skipping any whitespace.
+		 * So we strip leading whitespace manually here
+		 */
+		i = 0;
+		while (isblank(line[i]))
+			i++;
+		if (strcmp(line + i, path) != 0)
+			continue;
+
+		pr_debug("usdt: discovered segment for lib '%s': addrs %zx-%zx mode %s offset %zx\n",
+			 path, seg_start, seg_end, mode, seg_off);
+
+		/* ignore non-executable sections for shared libs */
+		if (mode[2] != 'x')
+			continue;
+
+		tmp = libbpf_reallocarray(*segs, *seg_cnt + 1, sizeof(**segs));
+		if (!tmp) {
+			err = -ENOMEM;
+			goto err_out;
+		}
+
+		*segs = tmp;
+		seg = *segs + *seg_cnt;
+		*seg_cnt += 1;
+
+		seg->start = seg_start;
+		seg->end = seg_end;
+		seg->offset = seg_off;
+		seg->is_exec = true;
+	}
+
+	if (*seg_cnt == 0) {
+		pr_warn("usdt: failed to find '%s' (resolved to '%s') within PID %d memory mappings\n",
+			lib_path, path, pid);
+		err = -ESRCH;
+		goto err_out;
+	}
+
+	qsort(*segs, *seg_cnt, sizeof(**segs), cmp_elf_segs);
+	err = 0;
+err_out:
+	fclose(f);
+	return err;
+}
+
+static struct elf_seg *find_elf_seg(struct elf_seg *segs, size_t seg_cnt, long addr, bool relative)
+{
+	struct elf_seg *seg;
+	int i;
+
+	if (relative) {
+		/* for shared libraries, address is a relative offset and thus
+		 * should fall within the logical offset-based range of
+		 * [offset_start, offset_end)
+		 */
+		for (i = 0, seg = segs; i < seg_cnt; i++, seg++) {
+			if (seg->offset <= addr && addr < seg->offset + (seg->end - seg->start))
+				return seg;
+		}
+	} else {
+		/* for binaries, address is absolute and thus should be within
+		 * the absolute address range of [seg_start, seg_end)
+		 */
+		for (i = 0, seg = segs; i < seg_cnt; i++, seg++) {
+			if (seg->start <= addr && addr < seg->end)
+				return seg;
+		}
+	}
+
+	return NULL;
+}
+
+static int parse_usdt_note(Elf *elf, const char *path, long base_addr,
+			   GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off,
+			   struct usdt_note *usdt_note);
+
+static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, long usdt_cookie);
+
+static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char *path, pid_t pid,
+				const char *usdt_provider, const char *usdt_name, long usdt_cookie,
+				struct usdt_target **out_targets, size_t *out_target_cnt)
+{
+	size_t off, name_off, desc_off, seg_cnt = 0, lib_seg_cnt = 0, target_cnt = 0;
+	struct elf_seg *segs = NULL, *lib_segs = NULL;
+	struct usdt_target *targets = NULL, *target;
+	long base_addr = 0;
+	Elf_Scn *notes_scn, *base_scn;
+	GElf_Shdr base_shdr, notes_shdr;
+	GElf_Ehdr ehdr;
+	GElf_Nhdr nhdr;
+	Elf_Data *data;
+	int err;
+
+	*out_targets = NULL;
+	*out_target_cnt = 0;
+
+	err = find_elf_sec_by_name(elf, USDT_NOTE_SEC, &notes_shdr, &notes_scn);
+	if (err) {
+		pr_warn("usdt: no USDT notes section (%s) found in '%s'\n", USDT_NOTE_SEC, path);
+		return err;
+	}
+
+	if (notes_shdr.sh_type != SHT_NOTE || !gelf_getehdr(elf, &ehdr)) {
+		pr_warn("usdt: invalid USDT notes section (%s) in '%s'\n", USDT_NOTE_SEC, path);
+		return -EINVAL;
+	}
+
+	err = parse_elf_segs(elf, path, &segs, &seg_cnt);
+	if (err) {
+		pr_warn("usdt: failed to process ELF program segments for '%s': %d\n", path, err);
+		goto err_out;
+	}
+
+	/* .stapsdt.base ELF section is optional, but is used for prelink
+	 * offset compensation (see a big comment further below)
+	 */
+	if (find_elf_sec_by_name(elf, USDT_BASE_SEC, &base_shdr, &base_scn) == 0)
+		base_addr = base_shdr.sh_addr;
+
+	data = elf_getdata(notes_scn, 0);
+	off = 0;
+	while ((off = gelf_getnote(data, off, &nhdr, &name_off, &desc_off)) > 0) {
+		long usdt_abs_ip, usdt_rel_ip, usdt_sema_off = 0;
+		struct usdt_note note;
+		struct elf_seg *seg = NULL;
+		void *tmp;
+
+		err = parse_usdt_note(elf, path, base_addr, &nhdr,
+				      data->d_buf, name_off, desc_off, &note);
+		if (err)
+			goto err_out;
+
+		if (strcmp(note.provider, usdt_provider) != 0 || strcmp(note.name, usdt_name) != 0)
+			continue;
+
+		/* We need to compensate "prelink effect". See [0] for details,
+		 * relevant parts quoted here:
+		 *
+		 *   Each SDT probe also expands into a non-allocated ELF note. You can
+		 *   find this by looking at SHT_NOTE sections and decoding the format;
+		 *   see below for details. Because the note is non-allocated, it means
+		 *   there is no runtime cost, and also preserved in both stripped files
+		 *   and .debug files.
+		 *
+		 *   However, this means that prelink won't adjust the note's contents
+		 *   for address offsets. Instead, this is done via the .stapsdt.base
+		 *   section. This is a special section that is added to the text. We
+		 *   will only ever have one of these sections in a final link and it
+		 *   will only ever be one byte long. Nothing about this section itself
+		 *   matters, we just use it as a marker to detect prelink address
+		 *   adjustments.
+		 *
+		 *   Each probe note records the link-time address of the .stapsdt.base
+		 *   section alongside the probe PC address. The decoder compares the
+		 *   base address stored in the note with the .stapsdt.base section's
+		 *   sh_addr. Initially these are the same, but the section header will
+		 *   be adjusted by prelink. So the decoder applies the difference to
+		 *   the probe PC address to get the correct prelinked PC address; the
+		 *   same adjustment is applied to the semaphore address, if any.
+		 *
+		 *   [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
+		 */
+		usdt_rel_ip = usdt_abs_ip = note.loc_addr;
+		if (base_addr) {
+			usdt_abs_ip += base_addr - note.base_addr;
+			usdt_rel_ip += base_addr - note.base_addr;
+		}
+
+		if (ehdr.e_type == ET_EXEC) {
+			/* When attaching uprobes (which is what USDTs
+			 * basically are) the kernel expects a relative IP to
+			 * be specified, so if we are attaching to an
+			 * executable ELF binary (i.e., not a shared library),
+			 * we need to calculate the proper relative IP based
+			 * on the ELF's load address
+			 */
+			seg = find_elf_seg(segs, seg_cnt, usdt_abs_ip, false /* relative */);
+			if (!seg) {
+				err = -ESRCH;
+				pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n",
+					usdt_provider, usdt_name, path, usdt_abs_ip);
+				goto err_out;
+			}
+			if (!seg->is_exec) {
+				err = -ESRCH;
+				pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n",
+					path, seg->start, seg->end, usdt_provider, usdt_name,
+					usdt_abs_ip);
+				goto err_out;
+			}
+
+			usdt_rel_ip = usdt_abs_ip - (seg->start - seg->offset);
+		} else if (!man->has_bpf_cookie) { /* ehdr.e_type == ET_DYN */
+			/* If we don't have BPF cookie support but need to
+			 * attach to a shared library, we'll need to know and
+			 * record absolute addresses of attach points due to
+			 * the need to look up USDT spec by absolute IP of
+			 * triggered uprobe. Doing this resolution is only
+			 * possible when we have a specific PID of the process
+			 * that's using specified shared library. BPF cookie
+			 * removes the absolute address limitation as we don't
+			 * need to do this lookup (we just use BPF cookie as
+			 * an index of USDT spec), so for newer kernels with
+			 * BPF cookie support libbpf supports USDT attachment
+			 * to shared libraries with no PID filter.
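+			 *
+			 * In other words, with BPF cookie the handler can
+			 * recover its spec ID directly from the cookie value
+			 * set at attach time, while without it the usdt.bpf.h
+			 * helpers must look the spec ID up in the
+			 * __bpf_usdt_ip_to_spec_id map keyed by the uprobe's
+			 * absolute IP, which is only computable when a
+			 * concrete PID's memory mappings are known.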
+ */ + if (pid < 0) { + pr_warn("usdt: attaching to shared libraries without specific PID is not supported on current kernel\n"); + err = -ENOTSUP; + goto err_out; + } + + /* lib_segs are lazily initialized only if necessary */ + if (lib_seg_cnt == 0) { + err = parse_lib_segs(pid, path, &lib_segs, &lib_seg_cnt); + if (err) { + pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %d\n", + pid, path, err); + goto err_out; + } + } + + seg = find_elf_seg(lib_segs, lib_seg_cnt, usdt_rel_ip, true /* relative */); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find shared lib memory segment for '%s:%s' in '%s' at relative IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_rel_ip); + goto err_out; + } + + usdt_abs_ip = seg->start + (usdt_rel_ip - seg->offset); + } + + pr_debug("usdt: probe for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved abs_ip 0x%lx rel_ip 0x%lx) args '%s' in segment [0x%lx, 0x%lx) at offset 0x%lx\n", + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", path, + note.loc_addr, note.base_addr, usdt_abs_ip, usdt_rel_ip, note.args, + seg ? seg->start : 0, seg ? seg->end : 0, seg ? seg->offset : 0); + + /* Adjust semaphore address to be a relative offset */ + if (note.sema_addr) { + if (!man->has_sema_refcnt) { + pr_warn("usdt: kernel doesn't support USDT semaphore refcounting for '%s:%s' in '%s'\n", + usdt_provider, usdt_name, path); + err = -ENOTSUP; + goto err_out; + } + + seg = find_elf_seg(segs, seg_cnt, note.sema_addr, false /* relative */); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF loadable segment with semaphore of '%s:%s' in '%s' at 0x%lx\n", + usdt_provider, usdt_name, path, note.sema_addr); + goto err_out; + } + if (seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx] for semaphore of '%s:%s' at 0x%lx is executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + note.sema_addr); + goto err_out; + } + + usdt_sema_off = note.sema_addr - (seg->start - seg->offset); + + pr_debug("usdt: sema for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved 0x%lx) in segment [0x%lx, 0x%lx] at offset 0x%lx\n", + usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? 
"exec" : "lib ", + path, note.sema_addr, note.base_addr, usdt_sema_off, + seg->start, seg->end, seg->offset); + } + + /* Record adjusted addresses and offsets and parse USDT spec */ + tmp = libbpf_reallocarray(targets, target_cnt + 1, sizeof(*targets)); + if (!tmp) { + err = -ENOMEM; + goto err_out; + } + targets = tmp; + + target = &targets[target_cnt]; + memset(target, 0, sizeof(*target)); + + target->abs_ip = usdt_abs_ip; + target->rel_ip = usdt_rel_ip; + target->sema_off = usdt_sema_off; + + /* notes->args references strings from Elf itself, so they can + * be referenced safely until elf_end() call + */ + target->spec_str = note.args; + + err = parse_usdt_spec(&target->spec, ¬e, usdt_cookie); + if (err) + goto err_out; + + target_cnt++; + } + + *out_targets = targets; + *out_target_cnt = target_cnt; + err = target_cnt; + +err_out: + free(segs); + free(lib_segs); + if (err < 0) + free(targets); + return err; +} + +struct bpf_link_usdt { + struct bpf_link link; + + struct usdt_manager *usdt_man; + + size_t spec_cnt; + int *spec_ids; + + size_t uprobe_cnt; + struct { + long abs_ip; + struct bpf_link *link; + } *uprobes; +}; + +static int bpf_link_usdt_detach(struct bpf_link *link) +{ + struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); + struct usdt_manager *man = usdt_link->usdt_man; + int i; + + for (i = 0; i < usdt_link->uprobe_cnt; i++) { + /* detach underlying uprobe link */ + bpf_link__destroy(usdt_link->uprobes[i].link); + /* there is no need to update specs map because it will be + * unconditionally overwritten on subsequent USDT attaches, + * but if BPF cookies are not used we need to remove entry + * from ip_to_spec_id map, otherwise we'll run into false + * conflicting IP errors + */ + if (!man->has_bpf_cookie) { + /* not much we can do about errors here */ + (void)bpf_map_delete_elem(bpf_map__fd(man->ip_to_spec_id_map), + &usdt_link->uprobes[i].abs_ip); + } + } + + /* try to return the list of previously used spec IDs to usdt_manager + * for future reuse for subsequent USDT attaches + */ + if (!man->free_spec_ids) { + /* if there were no free spec IDs yet, just transfer our IDs */ + man->free_spec_ids = usdt_link->spec_ids; + man->free_spec_cnt = usdt_link->spec_cnt; + usdt_link->spec_ids = NULL; + } else { + /* otherwise concat IDs */ + size_t new_cnt = man->free_spec_cnt + usdt_link->spec_cnt; + int *new_free_ids; + + new_free_ids = libbpf_reallocarray(man->free_spec_ids, new_cnt, + sizeof(*new_free_ids)); + /* If we couldn't resize free_spec_ids, we'll just leak + * a bunch of free IDs; this is very unlikely to happen and if + * system is so exhausted on memory, it's the least of user's + * concerns, probably. + * So just do our best here to return those IDs to usdt_manager. 
+ */ + if (new_free_ids) { + memcpy(new_free_ids + man->free_spec_cnt, usdt_link->spec_ids, + usdt_link->spec_cnt * sizeof(*usdt_link->spec_ids)); + man->free_spec_ids = new_free_ids; + man->free_spec_cnt = new_cnt; + } + } + + return 0; +} + +static void bpf_link_usdt_dealloc(struct bpf_link *link) +{ + struct bpf_link_usdt *usdt_link = container_of(link, struct bpf_link_usdt, link); + + free(usdt_link->spec_ids); + free(usdt_link->uprobes); + free(usdt_link); +} + +static size_t specs_hash_fn(const void *key, void *ctx) +{ + const char *s = key; + + return str_hash(s); +} + +static bool specs_equal_fn(const void *key1, const void *key2, void *ctx) +{ + const char *s1 = key1; + const char *s2 = key2; + + return strcmp(s1, s2) == 0; +} + +static int allocate_spec_id(struct usdt_manager *man, struct hashmap *specs_hash, + struct bpf_link_usdt *link, struct usdt_target *target, + int *spec_id, bool *is_new) +{ + void *tmp; + int err; + + /* check if we already allocated spec ID for this spec string */ + if (hashmap__find(specs_hash, target->spec_str, &tmp)) { + *spec_id = (long)tmp; + *is_new = false; + return 0; + } + + /* otherwise it's a new ID that needs to be set up in specs map and + * returned back to usdt_manager when USDT link is detached + */ + tmp = libbpf_reallocarray(link->spec_ids, link->spec_cnt + 1, sizeof(*link->spec_ids)); + if (!tmp) + return -ENOMEM; + link->spec_ids = tmp; + + /* get next free spec ID, giving preference to free list, if not empty */ + if (man->free_spec_cnt) { + *spec_id = man->free_spec_ids[man->free_spec_cnt - 1]; + + /* cache spec ID for current spec string for future lookups */ + err = hashmap__add(specs_hash, target->spec_str, (void *)(long)*spec_id); + if (err) + return err; + + man->free_spec_cnt--; + } else { + /* don't allocate spec ID bigger than what fits in specs map */ + if (man->next_free_spec_id >= bpf_map__max_entries(man->specs_map)) + return -E2BIG; + + *spec_id = man->next_free_spec_id; + + /* cache spec ID for current spec string for future lookups */ + err = hashmap__add(specs_hash, target->spec_str, (void *)(long)*spec_id); + if (err) + return err; + + man->next_free_spec_id++; + } + + /* remember new spec ID in the link for later return back to free list on detach */ + link->spec_ids[link->spec_cnt] = *spec_id; + link->spec_cnt++; + *is_new = true; + return 0; +} + +struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct bpf_program *prog, + pid_t pid, const char *path, + const char *usdt_provider, const char *usdt_name, + long usdt_cookie) +{ + int i, fd, err, spec_map_fd, ip_map_fd; + LIBBPF_OPTS(bpf_uprobe_opts, opts); + struct hashmap *specs_hash = NULL; + struct bpf_link_usdt *link = NULL; + struct usdt_target *targets = NULL; + size_t target_cnt; + Elf *elf; + + spec_map_fd = bpf_map__fd(man->specs_map); + ip_map_fd = bpf_map__fd(man->ip_to_spec_id_map); + + /* TODO: perform path resolution similar to uprobe's */ + fd = open(path, O_RDONLY); + if (fd < 0) { + err = -errno; + pr_warn("usdt: failed to open ELF binary '%s': %d\n", path, err); + return libbpf_err_ptr(err); + } + + elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!elf) { + err = -EBADF; + pr_warn("usdt: failed to parse ELF binary '%s': %s\n", path, elf_errmsg(-1)); + goto err_out; + } + + err = sanity_check_usdt_elf(elf, path); + if (err) + goto err_out; + + /* normalize PID filter */ + if (pid < 0) + pid = -1; + else if (pid == 0) + pid = getpid(); + + /* discover USDT in given binary, optionally limiting + * activations to a given PID, 
if pid > 0 + */ + err = collect_usdt_targets(man, elf, path, pid, usdt_provider, usdt_name, + usdt_cookie, &targets, &target_cnt); + if (err <= 0) { + err = (err == 0) ? -ENOENT : err; + goto err_out; + } + + specs_hash = hashmap__new(specs_hash_fn, specs_equal_fn, NULL); + if (IS_ERR(specs_hash)) { + err = PTR_ERR(specs_hash); + goto err_out; + } + + link = calloc(1, sizeof(*link)); + if (!link) { + err = -ENOMEM; + goto err_out; + } + + link->usdt_man = man; + link->link.detach = &bpf_link_usdt_detach; + link->link.dealloc = &bpf_link_usdt_dealloc; + + link->uprobes = calloc(target_cnt, sizeof(*link->uprobes)); + if (!link->uprobes) { + err = -ENOMEM; + goto err_out; + } + + for (i = 0; i < target_cnt; i++) { + struct usdt_target *target = &targets[i]; + struct bpf_link *uprobe_link; + bool is_new; + int spec_id; + + /* Spec ID can be either reused or newly allocated. If it is + * newly allocated, we'll need to fill out spec map, otherwise + * entire spec should be valid and can be just used by a new + * uprobe. We reuse spec when USDT arg spec is identical. We + * also never share specs between two different USDT + * attachments ("links"), so all the reused specs already + * share USDT cookie value implicitly. + */ + err = allocate_spec_id(man, specs_hash, link, target, &spec_id, &is_new); + if (err) + goto err_out; + + if (is_new && bpf_map_update_elem(spec_map_fd, &spec_id, &target->spec, BPF_ANY)) { + err = -errno; + pr_warn("usdt: failed to set USDT spec #%d for '%s:%s' in '%s': %d\n", + spec_id, usdt_provider, usdt_name, path, err); + goto err_out; + } + if (!man->has_bpf_cookie && + bpf_map_update_elem(ip_map_fd, &target->abs_ip, &spec_id, BPF_NOEXIST)) { + err = -errno; + if (err == -EEXIST) { + pr_warn("usdt: IP collision detected for spec #%d for '%s:%s' in '%s'\n", + spec_id, usdt_provider, usdt_name, path); + } else { + pr_warn("usdt: failed to map IP 0x%lx to spec #%d for '%s:%s' in '%s': %d\n", + target->abs_ip, spec_id, usdt_provider, usdt_name, + path, err); + } + goto err_out; + } + + opts.ref_ctr_offset = target->sema_off; + opts.bpf_cookie = man->has_bpf_cookie ? spec_id : 0; + uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path, + target->rel_ip, &opts); + err = libbpf_get_error(uprobe_link); + if (err) { + pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n", + i, usdt_provider, usdt_name, path, err); + goto err_out; + } + + link->uprobes[i].link = uprobe_link; + link->uprobes[i].abs_ip = target->abs_ip; + link->uprobe_cnt++; + } + + free(targets); + hashmap__free(specs_hash); + elf_end(elf); + close(fd); + + return &link->link; + +err_out: + if (link) + bpf_link__destroy(&link->link); + free(targets); + hashmap__free(specs_hash); + if (elf) + elf_end(elf); + close(fd); + return libbpf_err_ptr(err); +} + +/* Parse out USDT ELF note from '.note.stapsdt' section. + * Logic inspired by perf's code. 
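+ *
+ * For reference, the note's description (its contents) is laid out as three
+ * pointer-sized addresses followed by three zero-terminated strings. A sketch
+ * of the 64-bit layout parsed below (not a struct actually used by the code):
+ *
+ *   struct stapsdt_desc {
+ *           __u64 loc_addr;  /* probe location address */
+ *           __u64 base_addr; /* link-time .stapsdt.base address */
+ *           __u64 sema_addr; /* semaphore address, zero if absent */
+ *           char strs[];     /* "provider\0name\0args\0" */
+ *   };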
+ */ +static int parse_usdt_note(Elf *elf, const char *path, long base_addr, + GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, + struct usdt_note *note) +{ + const char *provider, *name, *args; + long addrs[3]; + size_t len; + + /* sanity check USDT note name and type first */ + if (strncmp(data + name_off, USDT_NOTE_NAME, nhdr->n_namesz) != 0) + return -EINVAL; + if (nhdr->n_type != USDT_NOTE_TYPE) + return -EINVAL; + + /* sanity check USDT note contents ("description" in ELF terminology) */ + len = nhdr->n_descsz; + data = data + desc_off; + + /* +3 is the very minimum required to store three empty strings */ + if (len < sizeof(addrs) + 3) + return -EINVAL; + + /* get location, base, and semaphore addrs */ + memcpy(&addrs, data, sizeof(addrs)); + + /* parse string fields: provider, name, args */ + provider = data + sizeof(addrs); + + name = (const char *)memchr(provider, '\0', data + len - provider); + if (!name) /* non-zero-terminated provider */ + return -EINVAL; + name++; + if (name >= data + len || *name == '\0') /* missing or empty name */ + return -EINVAL; + + args = memchr(name, '\0', data + len - name); + if (!args) /* non-zero-terminated name */ + return -EINVAL; + ++args; + if (args >= data + len) /* missing arguments spec */ + return -EINVAL; + + note->provider = provider; + note->name = name; + if (*args == '\0' || *args == ':') + note->args = ""; + else + note->args = args; + note->loc_addr = addrs[0]; + note->base_addr = addrs[1]; + note->sema_addr = addrs[2]; + + return 0; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg); + +static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, long usdt_cookie) +{ + const char *s; + int len; + + spec->usdt_cookie = usdt_cookie; + spec->arg_cnt = 0; + + s = note->args; + while (s[0]) { + if (spec->arg_cnt >= USDT_MAX_ARG_CNT) { + pr_warn("usdt: too many USDT arguments (> %d) for '%s:%s' with args spec '%s'\n", + USDT_MAX_ARG_CNT, note->provider, note->name, note->args); + return -E2BIG; + } + + len = parse_usdt_arg(s, spec->arg_cnt, &spec->args[spec->arg_cnt]); + if (len < 0) + return len; + + s += len; + spec->arg_cnt++; + } + + return 0; +} + +/* Architecture-specific logic for parsing USDT argument location specs */ + +#if defined(__x86_64__) || defined(__i386__) + +static int calc_pt_regs_off(const char *reg_name) +{ + static struct { + const char *names[4]; + size_t pt_regs_off; + } reg_map[] = { +#ifdef __x86_64__ +#define reg_off(reg64, reg32) offsetof(struct pt_regs, reg64) +#else +#define reg_off(reg64, reg32) offsetof(struct pt_regs, reg32) +#endif + { {"rip", "eip", "", ""}, reg_off(rip, eip) }, + { {"rax", "eax", "ax", "al"}, reg_off(rax, eax) }, + { {"rbx", "ebx", "bx", "bl"}, reg_off(rbx, ebx) }, + { {"rcx", "ecx", "cx", "cl"}, reg_off(rcx, ecx) }, + { {"rdx", "edx", "dx", "dl"}, reg_off(rdx, edx) }, + { {"rsi", "esi", "si", "sil"}, reg_off(rsi, esi) }, + { {"rdi", "edi", "di", "dil"}, reg_off(rdi, edi) }, + { {"rbp", "ebp", "bp", "bpl"}, reg_off(rbp, ebp) }, + { {"rsp", "esp", "sp", "spl"}, reg_off(rsp, esp) }, +#undef reg_off +#ifdef __x86_64__ + { {"r8", "r8d", "r8w", "r8b"}, offsetof(struct pt_regs, r8) }, + { {"r9", "r9d", "r9w", "r9b"}, offsetof(struct pt_regs, r9) }, + { {"r10", "r10d", "r10w", "r10b"}, offsetof(struct pt_regs, r10) }, + { {"r11", "r11d", "r11w", "r11b"}, offsetof(struct pt_regs, r11) }, + { {"r12", "r12d", "r12w", "r12b"}, offsetof(struct pt_regs, r12) }, + { {"r13", "r13d", "r13w", "r13b"}, offsetof(struct 
pt_regs, r13) },
+	{ {"r14", "r14d", "r14w", "r14b"}, offsetof(struct pt_regs, r14) },
+	{ {"r15", "r15d", "r15w", "r15b"}, offsetof(struct pt_regs, r15) },
+#endif
+	};
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(reg_map); i++) {
+		for (j = 0; j < ARRAY_SIZE(reg_map[i].names); j++) {
+			if (strcmp(reg_name, reg_map[i].names[j]) == 0)
+				return reg_map[i].pt_regs_off;
+		}
+	}
+
+	pr_warn("usdt: unrecognized register '%s'\n", reg_name);
+	return -ENOENT;
+}
+
+static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg)
+{
+	char *reg_name = NULL;
+	int arg_sz, len, reg_off;
+	long off;
+
+	if (sscanf(arg_str, " %d @ %ld ( %%%m[^)] ) %n", &arg_sz, &off, &reg_name, &len) == 3) {
+		/* Memory dereference case, e.g., -4@-20(%rbp) */
+		arg->arg_type = USDT_ARG_REG_DEREF;
+		arg->val_off = off;
+		reg_off = calc_pt_regs_off(reg_name);
+		free(reg_name);
+		if (reg_off < 0)
+			return reg_off;
+		arg->reg_off = reg_off;
+	} else if (sscanf(arg_str, " %d @ %%%ms %n", &arg_sz, &reg_name, &len) == 2) {
+		/* Register read case, e.g., -4@%eax */
+		arg->arg_type = USDT_ARG_REG;
+		arg->val_off = 0;
+
+		reg_off = calc_pt_regs_off(reg_name);
+		free(reg_name);
+		if (reg_off < 0)
+			return reg_off;
+		arg->reg_off = reg_off;
+	} else if (sscanf(arg_str, " %d @ $%ld %n", &arg_sz, &off, &len) == 2) {
+		/* Constant value case, e.g., 4@$71 */
+		arg->arg_type = USDT_ARG_CONST;
+		arg->val_off = off;
+		arg->reg_off = 0;
+	} else {
+		pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str);
+		return -EINVAL;
+	}
+
+	arg->arg_signed = arg_sz < 0;
+	if (arg_sz < 0)
+		arg_sz = -arg_sz;
+
+	switch (arg_sz) {
+	case 1: case 2: case 4: case 8:
+		arg->arg_bitshift = 64 - arg_sz * 8;
+		break;
+	default:
+		pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n",
+			arg_num, arg_str, arg_sz);
+		return -EINVAL;
+	}
+
+	return len;
+}
+
+#elif defined(__s390x__)
+
+/* Do not support __s390__ for now, since user_pt_regs is broken with -m31. */
+
+static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg)
+{
+	unsigned int reg;
+	int arg_sz, len;
+	long off;
+
+	if (sscanf(arg_str, " %d @ %ld ( %%r%u ) %n", &arg_sz, &off, &reg, &len) == 3) {
+		/* Memory dereference case, e.g., -2@-28(%r15) */
+		arg->arg_type = USDT_ARG_REG_DEREF;
+		arg->val_off = off;
+		if (reg > 15) {
+			pr_warn("usdt: unrecognized register '%%r%u'\n", reg);
+			return -EINVAL;
+		}
+		arg->reg_off = offsetof(user_pt_regs, gprs[reg]);
+	} else if (sscanf(arg_str, " %d @ %%r%u %n", &arg_sz, &reg, &len) == 2) {
+		/* Register read case, e.g., -8@%r0 */
+		arg->arg_type = USDT_ARG_REG;
+		arg->val_off = 0;
+		if (reg > 15) {
+			pr_warn("usdt: unrecognized register '%%r%u'\n", reg);
+			return -EINVAL;
+		}
+		arg->reg_off = offsetof(user_pt_regs, gprs[reg]);
+	} else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) {
+		/* Constant value case, e.g., 4@71 */
+		arg->arg_type = USDT_ARG_CONST;
+		arg->val_off = off;
+		arg->reg_off = 0;
+	} else {
+		pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str);
+		return -EINVAL;
+	}
+
+	arg->arg_signed = arg_sz < 0;
+	if (arg_sz < 0)
+		arg_sz = -arg_sz;
+
+	switch (arg_sz) {
+	case 1: case 2: case 4: case 8:
+		arg->arg_bitshift = 64 - arg_sz * 8;
+		break;
+	default:
+		pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n",
+			arg_num, arg_str, arg_sz);
+		return -EINVAL;
+	}
+
+	return len;
+}
+
+#else
+
+static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg)
+{
+	pr_warn("usdt: libbpf doesn't support USDTs on current architecture\n");
+	return -ENOTSUP;
+}
+
+#endif
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 3820608faf57..bafdc5373a13 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -168,9 +168,15 @@ $(OUTPUT)/%:%.c
 	$(call msg,BINARY,,$@)
 	$(Q)$(LINK.c) $^ $(LDLIBS) -o $@
 
-$(OUTPUT)/urandom_read: urandom_read.c
+$(OUTPUT)/liburandom_read.so: urandom_read_lib1.c urandom_read_lib2.c
+	$(call msg,LIB,,$@)
+	$(Q)$(CC) $(CFLAGS) -fPIC $(LDFLAGS) $^ $(LDLIBS) --shared -o $@
+
+$(OUTPUT)/urandom_read: urandom_read.c urandom_read_aux.c $(OUTPUT)/liburandom_read.so
 	$(call msg,BINARY,,$@)
-	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $< $(LDLIBS) -Wl,--build-id=sha1 -o $@
+	$(Q)$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.c,$^) \
+	     liburandom_read.so $(LDLIBS) \
+	     -Wl,-rpath=. -Wl,--build-id=sha1 -o $@
 
 $(OUTPUT)/bpf_testmod.ko: $(VMLINUX_BTF) $(wildcard bpf_testmod/Makefile bpf_testmod/*.[ch])
 	$(call msg,MOD,,$@)
@@ -328,12 +334,8 @@ SKEL_BLACKLIST := btf__% test_pinning_invalid.c test_sk_assign.c
 
 LINKED_SKELS := test_static_linked.skel.h linked_funcs.skel.h		\
 		linked_vars.skel.h linked_maps.skel.h 			\
-		test_subskeleton.skel.h test_subskeleton_lib.skel.h
-
-# In the subskeleton case, we want the test_subskeleton_lib.subskel.h file
-# but that's created as a side-effect of the skel.h generation.
-test_subskeleton.skel.h-deps := test_subskeleton_lib2.o test_subskeleton_lib.o test_subskeleton.o -test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.o test_subskeleton_lib.o + test_subskeleton.skel.h test_subskeleton_lib.skel.h \ + test_usdt.skel.h LSKELS := kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \ test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c \ @@ -346,6 +348,11 @@ test_static_linked.skel.h-deps := test_static_linked1.o test_static_linked2.o linked_funcs.skel.h-deps := linked_funcs1.o linked_funcs2.o linked_vars.skel.h-deps := linked_vars1.o linked_vars2.o linked_maps.skel.h-deps := linked_maps1.o linked_maps2.o +# In the subskeleton case, we want the test_subskeleton_lib.subskel.h file +# but that's created as a side-effect of the skel.h generation. +test_subskeleton.skel.h-deps := test_subskeleton_lib2.o test_subskeleton_lib.o test_subskeleton.o +test_subskeleton_lib.skel.h-deps := test_subskeleton_lib2.o test_subskeleton_lib.o +test_usdt.skel.h-deps := test_usdt.o test_usdt_multispec.o LINKED_BPF_SRCS := $(patsubst %.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps))) @@ -400,6 +407,7 @@ $(TRUNNER_BPF_OBJS): $(TRUNNER_OUTPUT)/%.o: \ $(TRUNNER_BPF_PROGS_DIR)/*.h \ $$(INCLUDE_DIR)/vmlinux.h \ $(wildcard $(BPFDIR)/bpf_*.h) \ + $(wildcard $(BPFDIR)/*.bpf.h) \ | $(TRUNNER_OUTPUT) $$(BPFOBJ) $$(call $(TRUNNER_BPF_BUILD_RULE),$$<,$$@, \ $(TRUNNER_BPF_CFLAGS)) @@ -491,6 +499,7 @@ TRUNNER_EXTRA_SOURCES := test_progs.c cgroup_helpers.c trace_helpers.c \ btf_helpers.c flow_dissector_load.h \ cap_helpers.c TRUNNER_EXTRA_FILES := $(OUTPUT)/urandom_read $(OUTPUT)/bpf_testmod.ko \ + $(OUTPUT)/liburandom_read.so \ ima_setup.sh \ $(wildcard progs/btf_dump_test_case_*.c) TRUNNER_BPF_BUILD_RULE := CLANG_BPF_BUILD_RULE diff --git a/tools/testing/selftests/bpf/prog_tests/attach_probe.c b/tools/testing/selftests/bpf/prog_tests/attach_probe.c index d48f6e533e1e..c0c6d410751d 100644 --- a/tools/testing/selftests/bpf/prog_tests/attach_probe.c +++ b/tools/testing/selftests/bpf/prog_tests/attach_probe.c @@ -11,15 +11,22 @@ static void trigger_func(void) asm volatile (""); } +/* attach point for byname uprobe */ +static void trigger_func2(void) +{ + asm volatile (""); +} + void test_attach_probe(void) { DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, uprobe_opts); - int duration = 0; struct bpf_link *kprobe_link, *kretprobe_link; struct bpf_link *uprobe_link, *uretprobe_link; struct test_attach_probe* skel; ssize_t uprobe_offset, ref_ctr_offset; + struct bpf_link *uprobe_err_link; bool legacy; + char *mem; /* Check if new-style kprobe/uprobe API is supported. 
* Kernels that support new FD-based kprobe and uprobe BPF attachment @@ -43,9 +50,9 @@ void test_attach_probe(void) return; skel = test_attach_probe__open_and_load(); - if (CHECK(!skel, "skel_open", "failed to open skeleton\n")) + if (!ASSERT_OK_PTR(skel, "skel_open")) return; - if (CHECK(!skel->bss, "check_bss", ".bss wasn't mmap()-ed\n")) + if (!ASSERT_OK_PTR(skel->bss, "check_bss")) goto cleanup; kprobe_link = bpf_program__attach_kprobe(skel->progs.handle_kprobe, @@ -90,25 +97,73 @@ void test_attach_probe(void) goto cleanup; skel->links.handle_uretprobe = uretprobe_link; + /* verify auto-attach fails for old-style uprobe definition */ + uprobe_err_link = bpf_program__attach(skel->progs.handle_uprobe_byname); + if (!ASSERT_EQ(libbpf_get_error(uprobe_err_link), -EOPNOTSUPP, + "auto-attach should fail for old-style name")) + goto cleanup; + + uprobe_opts.func_name = "trigger_func2"; + uprobe_opts.retprobe = false; + uprobe_opts.ref_ctr_offset = 0; + skel->links.handle_uprobe_byname = + bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe_byname, + 0 /* this pid */, + "/proc/self/exe", + 0, &uprobe_opts); + if (!ASSERT_OK_PTR(skel->links.handle_uprobe_byname, "attach_uprobe_byname")) + goto cleanup; + + /* verify auto-attach works */ + skel->links.handle_uretprobe_byname = + bpf_program__attach(skel->progs.handle_uretprobe_byname); + if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname, "attach_uretprobe_byname")) + goto cleanup; + + /* test attach by name for a library function, using the library + * as the binary argument. libc.so.6 will be resolved via dlopen()/dlinfo(). + */ + uprobe_opts.func_name = "malloc"; + uprobe_opts.retprobe = false; + skel->links.handle_uprobe_byname2 = + bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe_byname2, + 0 /* this pid */, + "libc.so.6", + 0, &uprobe_opts); + if (!ASSERT_OK_PTR(skel->links.handle_uprobe_byname2, "attach_uprobe_byname2")) + goto cleanup; + + uprobe_opts.func_name = "free"; + uprobe_opts.retprobe = true; + skel->links.handle_uretprobe_byname2 = + bpf_program__attach_uprobe_opts(skel->progs.handle_uretprobe_byname2, + -1 /* any pid */, + "libc.so.6", + 0, &uprobe_opts); + if (!ASSERT_OK_PTR(skel->links.handle_uretprobe_byname2, "attach_uretprobe_byname2")) + goto cleanup; + /* trigger & validate kprobe && kretprobe */ usleep(1); - if (CHECK(skel->bss->kprobe_res != 1, "check_kprobe_res", - "wrong kprobe res: %d\n", skel->bss->kprobe_res)) - goto cleanup; - if (CHECK(skel->bss->kretprobe_res != 2, "check_kretprobe_res", - "wrong kretprobe res: %d\n", skel->bss->kretprobe_res)) - goto cleanup; + /* trigger & validate shared library u[ret]probes attached by name */ + mem = malloc(1); + free(mem); /* trigger & validate uprobe & uretprobe */ trigger_func(); - if (CHECK(skel->bss->uprobe_res != 3, "check_uprobe_res", - "wrong uprobe res: %d\n", skel->bss->uprobe_res)) - goto cleanup; - if (CHECK(skel->bss->uretprobe_res != 4, "check_uretprobe_res", - "wrong uretprobe res: %d\n", skel->bss->uretprobe_res)) - goto cleanup; + /* trigger & validate uprobe attached by name */ + trigger_func2(); + + ASSERT_EQ(skel->bss->kprobe_res, 1, "check_kprobe_res"); + ASSERT_EQ(skel->bss->kretprobe_res, 2, "check_kretprobe_res"); + ASSERT_EQ(skel->bss->uprobe_res, 3, "check_uprobe_res"); + ASSERT_EQ(skel->bss->uretprobe_res, 4, "check_uretprobe_res"); + ASSERT_EQ(skel->bss->uprobe_byname_res, 5, "check_uprobe_byname_res"); + ASSERT_EQ(skel->bss->uretprobe_byname_res, 6, "check_uretprobe_byname_res"); + ASSERT_EQ(skel->bss->uprobe_byname2_res, 7, 
"check_uprobe_byname2_res"); + ASSERT_EQ(skel->bss->uretprobe_byname2_res, 8, "check_uretprobe_byname2_res"); cleanup: test_attach_probe__destroy(skel); diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c index 044df13ee069..754e80937e5d 100644 --- a/tools/testing/selftests/bpf/prog_tests/for_each.c +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -4,6 +4,7 @@ #include #include "for_each_hash_map_elem.skel.h" #include "for_each_array_map_elem.skel.h" +#include "for_each_map_elem_write_key.skel.h" static unsigned int duration; @@ -129,10 +130,21 @@ out: for_each_array_map_elem__destroy(skel); } +static void test_write_map_key(void) +{ + struct for_each_map_elem_write_key *skel; + + skel = for_each_map_elem_write_key__open_and_load(); + if (!ASSERT_ERR_PTR(skel, "for_each_map_elem_write_key__open_and_load")) + for_each_map_elem_write_key__destroy(skel); +} + void test_for_each(void) { if (test__start_subtest("hash_map")) test_hash_map(); if (test__start_subtest("array_map")) test_array_map(); + if (test__start_subtest("write_map_key")) + test_write_map_key(); } diff --git a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c index f6933b06daf8..1d7a2f1e0731 100644 --- a/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c +++ b/tools/testing/selftests/bpf/prog_tests/ksyms_btf.c @@ -138,12 +138,16 @@ cleanup: test_ksyms_weak_lskel__destroy(skel); } -static void test_write_check(void) +static void test_write_check(bool test_handler1) { struct test_ksyms_btf_write_check *skel; - skel = test_ksyms_btf_write_check__open_and_load(); - ASSERT_ERR_PTR(skel, "unexpected load of a prog writing to ksym memory\n"); + skel = test_ksyms_btf_write_check__open(); + if (!ASSERT_OK_PTR(skel, "test_ksyms_btf_write_check__open")) + return; + bpf_program__set_autoload(test_handler1 ? 
skel->progs.handler2 : skel->progs.handler1, false); + ASSERT_ERR(test_ksyms_btf_write_check__load(skel), + "unexpected load of a prog writing to ksym memory\n"); test_ksyms_btf_write_check__destroy(skel); } @@ -179,6 +183,9 @@ void test_ksyms_btf(void) if (test__start_subtest("weak_ksyms_lskel")) test_weak_syms_lskel(); - if (test__start_subtest("write_check")) - test_write_check(); + if (test__start_subtest("write_check1")) + test_write_check(true); + + if (test__start_subtest("write_check2")) + test_write_check(false); } diff --git a/tools/testing/selftests/bpf/prog_tests/netcnt.c b/tools/testing/selftests/bpf/prog_tests/netcnt.c index 954964f0ac3d..d3915c58d0e1 100644 --- a/tools/testing/selftests/bpf/prog_tests/netcnt.c +++ b/tools/testing/selftests/bpf/prog_tests/netcnt.c @@ -25,7 +25,7 @@ void serial_test_netcnt(void) if (!ASSERT_OK_PTR(skel, "netcnt_prog__open_and_load")) return; - nproc = get_nprocs_conf(); + nproc = bpf_num_possible_cpus(); percpu_netcnt = malloc(sizeof(*percpu_netcnt) * nproc); if (!ASSERT_OK_PTR(percpu_netcnt, "malloc(percpu_netcnt)")) goto err; diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c index 509e21d5cb9d..b90ee47d3111 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c @@ -81,6 +81,7 @@ void test_test_global_funcs(void) { "test_global_func14.o", "reference type('FWD S') size cannot be determined" }, { "test_global_func15.o", "At program exit the register R0 has value" }, { "test_global_func16.o", "invalid indirect read from stack" }, + { "test_global_func17.o", "Caller passes invalid args into func#1" }, }; libbpf_print_fn_t old_print_fn = NULL; int err, i, duration = 0; diff --git a/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c b/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c new file mode 100644 index 000000000000..d6003dc8cc99 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/uprobe_autoattach.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022, Oracle and/or its affiliates. 
*/
+
+#include <test_progs.h>
+#include "test_uprobe_autoattach.skel.h"
+
+/* uprobe attach point */
+static noinline int autoattach_trigger_func(int arg)
+{
+	asm volatile ("");
+	return arg + 1;
+}
+
+void test_uprobe_autoattach(void)
+{
+	struct test_uprobe_autoattach *skel;
+	int trigger_val = 100, trigger_ret;
+	size_t malloc_sz = 1;
+	char *mem;
+
+	skel = test_uprobe_autoattach__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+
+	if (!ASSERT_OK(test_uprobe_autoattach__attach(skel), "skel_attach"))
+		goto cleanup;
+
+	skel->bss->test_pid = getpid();
+
+	/* trigger & validate uprobe & uretprobe */
+	trigger_ret = autoattach_trigger_func(trigger_val);
+
+	skel->bss->test_pid = getpid();
+
+	/* trigger & validate shared library u[ret]probes attached by name */
+	mem = malloc(malloc_sz);
+	free(mem);
+
+	ASSERT_EQ(skel->bss->uprobe_byname_parm1, trigger_val, "check_uprobe_byname_parm1");
+	ASSERT_EQ(skel->bss->uprobe_byname_ran, 1, "check_uprobe_byname_ran");
+	ASSERT_EQ(skel->bss->uretprobe_byname_rc, trigger_ret, "check_uretprobe_byname_rc");
+	ASSERT_EQ(skel->bss->uretprobe_byname_ran, 2, "check_uretprobe_byname_ran");
+	ASSERT_EQ(skel->bss->uprobe_byname2_parm1, malloc_sz, "check_uprobe_byname2_parm1");
+	ASSERT_EQ(skel->bss->uprobe_byname2_ran, 3, "check_uprobe_byname2_ran");
+	ASSERT_EQ(skel->bss->uretprobe_byname2_rc, mem, "check_uretprobe_byname2_rc");
+	ASSERT_EQ(skel->bss->uretprobe_byname2_ran, 4, "check_uretprobe_byname2_ran");
+cleanup:
+	test_uprobe_autoattach__destroy(skel);
+}
diff --git a/tools/testing/selftests/bpf/prog_tests/usdt.c b/tools/testing/selftests/bpf/prog_tests/usdt.c
new file mode 100644
index 000000000000..a71f51bdc08d
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/usdt.c
@@ -0,0 +1,421 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
+#include <test_progs.h>
+
+#define _SDT_HAS_SEMAPHORES 1
+#include "../sdt.h"
+
+#include "test_usdt.skel.h"
+#include "test_urandom_usdt.skel.h"
+
+int lets_test_this(int);
+
+static volatile int idx = 2;
+static volatile __u64 bla = 0xFEDCBA9876543210ULL;
+static volatile short nums[] = {-1, -2, -3, };
+
+static volatile struct {
+	int x;
+	signed char y;
+} t1 = { 1, -127 };
+
+#define SEC(name) __attribute__((section(name), used))
+
+unsigned short test_usdt0_semaphore SEC(".probes");
+unsigned short test_usdt3_semaphore SEC(".probes");
+unsigned short test_usdt12_semaphore SEC(".probes");
+
+static void __always_inline trigger_func(int x) {
+	long y = 42;
+
+	if (test_usdt0_semaphore)
+		STAP_PROBE(test, usdt0);
+	if (test_usdt3_semaphore)
+		STAP_PROBE3(test, usdt3, x, y, &bla);
+	if (test_usdt12_semaphore) {
+		STAP_PROBE12(test, usdt12,
+			     x, x + 1, y, x + y, 5,
+			     y / 7, bla, &bla, -9, nums[x],
+			     nums[idx], t1.y);
+	}
+}
+
+static void subtest_basic_usdt(void)
+{
+	LIBBPF_OPTS(bpf_usdt_opts, opts);
+	struct test_usdt *skel;
+	struct test_usdt__bss *bss;
+	int err;
+
+	skel = test_usdt__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+
+	bss = skel->bss;
+	bss->my_pid = getpid();
+
+	err = test_usdt__attach(skel);
+	if (!ASSERT_OK(err, "skel_attach"))
+		goto cleanup;
+
+	/* usdt0 won't be auto-attached */
+	opts.usdt_cookie = 0xcafedeadbeeffeed;
+	skel->links.usdt0 = bpf_program__attach_usdt(skel->progs.usdt0,
+						     0 /*self*/, "/proc/self/exe",
+						     "test", "usdt0", &opts);
+	if (!ASSERT_OK_PTR(skel->links.usdt0, "usdt0_link"))
+		goto cleanup;
+
+	trigger_func(1);
+
+	ASSERT_EQ(bss->usdt0_called, 1, "usdt0_called");
+	ASSERT_EQ(bss->usdt3_called, 1, "usdt3_called");
+	ASSERT_EQ(bss->usdt12_called, 1, "usdt12_called");
+
+	ASSERT_EQ(bss->usdt0_cookie, 0xcafedeadbeeffeed, "usdt0_cookie");
+	ASSERT_EQ(bss->usdt0_arg_cnt, 0, "usdt0_arg_cnt");
+	ASSERT_EQ(bss->usdt0_arg_ret, -ENOENT, "usdt0_arg_ret");
+
+	/* auto-attached usdt3 gets default zero cookie value */
+	ASSERT_EQ(bss->usdt3_cookie, 0, "usdt3_cookie");
+	ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt");
+
+	ASSERT_EQ(bss->usdt3_arg_rets[0], 0, "usdt3_arg1_ret");
+	ASSERT_EQ(bss->usdt3_arg_rets[1], 0, "usdt3_arg2_ret");
+	ASSERT_EQ(bss->usdt3_arg_rets[2], 0, "usdt3_arg3_ret");
+	ASSERT_EQ(bss->usdt3_args[0], 1, "usdt3_arg1");
+	ASSERT_EQ(bss->usdt3_args[1], 42, "usdt3_arg2");
+	ASSERT_EQ(bss->usdt3_args[2], (uintptr_t)&bla, "usdt3_arg3");
+
+	/* auto-attached usdt12 gets default zero cookie value */
+	ASSERT_EQ(bss->usdt12_cookie, 0, "usdt12_cookie");
+	ASSERT_EQ(bss->usdt12_arg_cnt, 12, "usdt12_arg_cnt");
+
+	ASSERT_EQ(bss->usdt12_args[0], 1, "usdt12_arg1");
+	ASSERT_EQ(bss->usdt12_args[1], 1 + 1, "usdt12_arg2");
+	ASSERT_EQ(bss->usdt12_args[2], 42, "usdt12_arg3");
+	ASSERT_EQ(bss->usdt12_args[3], 42 + 1, "usdt12_arg4");
+	ASSERT_EQ(bss->usdt12_args[4], 5, "usdt12_arg5");
+	ASSERT_EQ(bss->usdt12_args[5], 42 / 7, "usdt12_arg6");
+	ASSERT_EQ(bss->usdt12_args[6], bla, "usdt12_arg7");
+	ASSERT_EQ(bss->usdt12_args[7], (uintptr_t)&bla, "usdt12_arg8");
+	ASSERT_EQ(bss->usdt12_args[8], -9, "usdt12_arg9");
+	ASSERT_EQ(bss->usdt12_args[9], nums[1], "usdt12_arg10");
+	ASSERT_EQ(bss->usdt12_args[10], nums[idx], "usdt12_arg11");
+	ASSERT_EQ(bss->usdt12_args[11], t1.y, "usdt12_arg12");
+
+	/* trigger_func() is marked __always_inline, so USDT invocations will be
+	 * inlined in two different places, meaning that each USDT will have
+	 * at least 2 different places to be attached to.
+	 * This verifies that bpf_program__attach_usdt() handles this properly
+	 * and attaches to all possible places of USDT invocation.
+	 */
+	trigger_func(2);
+
+	ASSERT_EQ(bss->usdt0_called, 2, "usdt0_called");
+	ASSERT_EQ(bss->usdt3_called, 2, "usdt3_called");
+	ASSERT_EQ(bss->usdt12_called, 2, "usdt12_called");
+
+	/* only check values that depend on trigger_func()'s input value */
+	ASSERT_EQ(bss->usdt3_args[0], 2, "usdt3_arg1");
+
+	ASSERT_EQ(bss->usdt12_args[0], 2, "usdt12_arg1");
+	ASSERT_EQ(bss->usdt12_args[1], 2 + 1, "usdt12_arg2");
+	ASSERT_EQ(bss->usdt12_args[3], 42 + 2, "usdt12_arg4");
+	ASSERT_EQ(bss->usdt12_args[9], nums[2], "usdt12_arg10");
+
+	/* detach and re-attach usdt3 */
+	bpf_link__destroy(skel->links.usdt3);
+
+	opts.usdt_cookie = 0xBADC00C51E;
+	skel->links.usdt3 = bpf_program__attach_usdt(skel->progs.usdt3, -1 /* any pid */,
+						     "/proc/self/exe", "test", "usdt3", &opts);
+	if (!ASSERT_OK_PTR(skel->links.usdt3, "usdt3_reattach"))
+		goto cleanup;
+
+	trigger_func(3);
+
+	ASSERT_EQ(bss->usdt3_called, 3, "usdt3_called");
+	/* this time usdt3 has custom cookie */
+	ASSERT_EQ(bss->usdt3_cookie, 0xBADC00C51E, "usdt3_cookie");
+	ASSERT_EQ(bss->usdt3_arg_cnt, 3, "usdt3_arg_cnt");
+
+	ASSERT_EQ(bss->usdt3_arg_rets[0], 0, "usdt3_arg1_ret");
+	ASSERT_EQ(bss->usdt3_arg_rets[1], 0, "usdt3_arg2_ret");
+	ASSERT_EQ(bss->usdt3_arg_rets[2], 0, "usdt3_arg3_ret");
+	ASSERT_EQ(bss->usdt3_args[0], 3, "usdt3_arg1");
+	ASSERT_EQ(bss->usdt3_args[1], 42, "usdt3_arg2");
+	ASSERT_EQ(bss->usdt3_args[2], (uintptr_t)&bla, "usdt3_arg3");
+
+cleanup:
+	test_usdt__destroy(skel);
+}
+
+unsigned short test_usdt_100_semaphore SEC(".probes");
+unsigned short test_usdt_300_semaphore SEC(".probes");
+unsigned short test_usdt_400_semaphore SEC(".probes");
+
+#define R10(F, X)  F(X+0); F(X+1);F(X+2); F(X+3); F(X+4);	\
+		   F(X+5); F(X+6); F(X+7); F(X+8); F(X+9);
+#define R100(F, X) R10(F,X+ 0);R10(F,X+10);R10(F,X+20);R10(F,X+30);R10(F,X+40); \
+		   R10(F,X+50);R10(F,X+60);R10(F,X+70);R10(F,X+80);R10(F,X+90);
+
+/* carefully control that we get exactly 100 inlines by forcing inlining */
+static void __always_inline f100(int x)
+{
+	STAP_PROBE1(test, usdt_100, x);
+}
+
+__weak void trigger_100_usdts(void)
+{
+	R100(f100, 0);
+}
+
+/* we shouldn't be able to attach to test:usdt_300 USDT as we don't have as
+ * many slots for specs. It's important that each STAP_PROBE1() invocation
+ * (after unrolling) gets a different arg spec due to the compiler inlining x
+ * as a constant
+ */
+static void __always_inline f300(int x)
+{
+	STAP_PROBE1(test, usdt_300, x);
+}
+
+__weak void trigger_300_usdts(void)
+{
+	R100(f300, 0);
+	R100(f300, 100);
+	R100(f300, 200);
+}
+
+static void __always_inline f400(int x __attribute__((unused)))
+{
+	static int y;
+
+	STAP_PROBE1(test, usdt_400, y++);
+}
+
+/* this time we have 400 different USDT call sites, but they have uniform
+ * argument location, so libbpf's spec string deduplication logic should keep
+ * the spec count very small and so we should be able to attach to all 400
+ * call sites
+ */
+__weak void trigger_400_usdts(void)
+{
+	R100(f400, 0);
+	R100(f400, 100);
+	R100(f400, 200);
+	R100(f400, 300);
+}
+
+static void subtest_multispec_usdt(void)
+{
+	LIBBPF_OPTS(bpf_usdt_opts, opts);
+	struct test_usdt *skel;
+	struct test_usdt__bss *bss;
+	int err, i;
+
+	skel = test_usdt__open_and_load();
+	if (!ASSERT_OK_PTR(skel, "skel_open"))
+		return;
+
+	bss = skel->bss;
+	bss->my_pid = getpid();
+
+	err = test_usdt__attach(skel);
+	if (!ASSERT_OK(err, "skel_attach"))
+		goto cleanup;
+
+	/* usdt_100 is auto-attached and there are 100 inlined call sites,
+	 * let's validate that all of them are properly attached to and
+	 * handled from BPF side
+	 */
+	trigger_100_usdts();
+
+	ASSERT_EQ(bss->usdt_100_called, 100, "usdt_100_called");
+	ASSERT_EQ(bss->usdt_100_sum, 99 * 100 / 2, "usdt_100_sum");
+
+	/* Stress test free spec ID tracking. By default libbpf allows up to
+	 * 256 specs to be used, so if we don't return free spec IDs back
+	 * after a few detachments and re-attachments we should run out of
+	 * available spec IDs.
+	 */
+	for (i = 0; i < 2; i++) {
+		bpf_link__destroy(skel->links.usdt_100);
+
+		skel->links.usdt_100 = bpf_program__attach_usdt(skel->progs.usdt_100, -1,
+								"/proc/self/exe",
+								"test", "usdt_100", NULL);
+		if (!ASSERT_OK_PTR(skel->links.usdt_100, "usdt_100_reattach"))
+			goto cleanup;
+
+		bss->usdt_100_sum = 0;
+		trigger_100_usdts();
+
+		ASSERT_EQ(bss->usdt_100_called, (i + 1) * 100 + 100, "usdt_100_called");
+		ASSERT_EQ(bss->usdt_100_sum, 99 * 100 / 2, "usdt_100_sum");
+	}
+
+	/* Now let's step it up and try to attach USDT that requires more than
+	 * 256 attach points with different specs for each.
+	 * Note that we need trigger_300_usdts() only to actually have 300
+	 * USDT call sites, we are not going to actually trace them.
+	 */
+	trigger_300_usdts();
+
+	/* we'll reuse usdt_100 BPF program for usdt_300 test */
+	bpf_link__destroy(skel->links.usdt_100);
+	skel->links.usdt_100 = bpf_program__attach_usdt(skel->progs.usdt_100, -1, "/proc/self/exe",
+							"test", "usdt_300", NULL);
+	err = -errno;
+	if (!ASSERT_ERR_PTR(skel->links.usdt_100, "usdt_300_bad_attach"))
+		goto cleanup;
+	ASSERT_EQ(err, -E2BIG, "usdt_300_attach_err");
+
+	/* let's check that there are no "dangling" BPF programs attached due
+	 * to partial success of the above test:usdt_300 attachment
+	 */
+	bss->usdt_100_called = 0;
+	bss->usdt_100_sum = 0;
+
+	f300(777); /* this is 301st instance of usdt_300 */
+
+	ASSERT_EQ(bss->usdt_100_called, 0, "usdt_301_called");
+	ASSERT_EQ(bss->usdt_100_sum, 0, "usdt_301_sum");
+
+	/* This time we have USDT with 400 inlined invocations, but arg specs
+	 * should be the same across all sites, so libbpf will only need to
+	 * use one spec and thus we'll be able to attach 400 uprobes
+	 * successfully.
+	 *
+	 * Again, we are reusing usdt_100 BPF program.
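+	 * (Each of the 400 inlined f400() copies reads the same static
+	 * variable, so every call site records an identical Arguments string
+	 * in its USDT note and they all deduplicate to a single spec ID.)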
+ */ + skel->links.usdt_100 = bpf_program__attach_usdt(skel->progs.usdt_100, -1, + "/proc/self/exe", + "test", "usdt_400", NULL); + if (!ASSERT_OK_PTR(skel->links.usdt_100, "usdt_400_attach")) + goto cleanup; + + trigger_400_usdts(); + + ASSERT_EQ(bss->usdt_100_called, 400, "usdt_400_called"); + ASSERT_EQ(bss->usdt_100_sum, 399 * 400 / 2, "usdt_400_sum"); + +cleanup: + test_usdt__destroy(skel); +} + +static FILE *urand_spawn(int *pid) +{ + FILE *f; + + /* urandom_read's stdout is wired into f */ + f = popen("./urandom_read 1 report-pid", "r"); + if (!f) + return NULL; + + if (fscanf(f, "%d", pid) != 1) { + pclose(f); + return NULL; + } + + return f; +} + +static int urand_trigger(FILE **urand_pipe) +{ + int exit_code; + + /* pclose() waits for child process to exit and returns their exit code */ + exit_code = pclose(*urand_pipe); + *urand_pipe = NULL; + + return exit_code; +} + +static void subtest_urandom_usdt(bool auto_attach) +{ + struct test_urandom_usdt *skel; + struct test_urandom_usdt__bss *bss; + struct bpf_link *l; + FILE *urand_pipe = NULL; + int err, urand_pid = 0; + + skel = test_urandom_usdt__open_and_load(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return; + + urand_pipe = urand_spawn(&urand_pid); + if (!ASSERT_OK_PTR(urand_pipe, "urand_spawn")) + goto cleanup; + + bss = skel->bss; + bss->urand_pid = urand_pid; + + if (auto_attach) { + err = test_urandom_usdt__attach(skel); + if (!ASSERT_OK(err, "skel_auto_attach")) + goto cleanup; + } else { + l = bpf_program__attach_usdt(skel->progs.urand_read_without_sema, + urand_pid, "./urandom_read", + "urand", "read_without_sema", NULL); + if (!ASSERT_OK_PTR(l, "urand_without_sema_attach")) + goto cleanup; + skel->links.urand_read_without_sema = l; + + l = bpf_program__attach_usdt(skel->progs.urand_read_with_sema, + urand_pid, "./urandom_read", + "urand", "read_with_sema", NULL); + if (!ASSERT_OK_PTR(l, "urand_with_sema_attach")) + goto cleanup; + skel->links.urand_read_with_sema = l; + + l = bpf_program__attach_usdt(skel->progs.urandlib_read_without_sema, + urand_pid, "./liburandom_read.so", + "urandlib", "read_without_sema", NULL); + if (!ASSERT_OK_PTR(l, "urandlib_without_sema_attach")) + goto cleanup; + skel->links.urandlib_read_without_sema = l; + + l = bpf_program__attach_usdt(skel->progs.urandlib_read_with_sema, + urand_pid, "./liburandom_read.so", + "urandlib", "read_with_sema", NULL); + if (!ASSERT_OK_PTR(l, "urandlib_with_sema_attach")) + goto cleanup; + skel->links.urandlib_read_with_sema = l; + + } + + /* trigger urandom_read USDTs */ + ASSERT_OK(urand_trigger(&urand_pipe), "urand_exit_code"); + + ASSERT_EQ(bss->urand_read_without_sema_call_cnt, 1, "urand_wo_sema_cnt"); + ASSERT_EQ(bss->urand_read_without_sema_buf_sz_sum, 256, "urand_wo_sema_sum"); + + ASSERT_EQ(bss->urand_read_with_sema_call_cnt, 1, "urand_w_sema_cnt"); + ASSERT_EQ(bss->urand_read_with_sema_buf_sz_sum, 256, "urand_w_sema_sum"); + + ASSERT_EQ(bss->urandlib_read_without_sema_call_cnt, 1, "urandlib_wo_sema_cnt"); + ASSERT_EQ(bss->urandlib_read_without_sema_buf_sz_sum, 256, "urandlib_wo_sema_sum"); + + ASSERT_EQ(bss->urandlib_read_with_sema_call_cnt, 1, "urandlib_w_sema_cnt"); + ASSERT_EQ(bss->urandlib_read_with_sema_buf_sz_sum, 256, "urandlib_w_sema_sum"); + +cleanup: + if (urand_pipe) + pclose(urand_pipe); + test_urandom_usdt__destroy(skel); +} + +void test_usdt(void) +{ + if (test__start_subtest("basic")) + subtest_basic_usdt(); + if (test__start_subtest("multispec")) + subtest_multispec_usdt(); + if (test__start_subtest("urand_auto_attach")) + 
+void test_usdt(void) +{ + if (test__start_subtest("basic")) + subtest_basic_usdt(); + if (test__start_subtest("multispec")) + subtest_multispec_usdt(); + if (test__start_subtest("urand_auto_attach")) + subtest_urandom_usdt(true /* auto_attach */); + if (test__start_subtest("urand_pid_attach")) + subtest_urandom_usdt(false /* auto_attach */); +} diff --git a/tools/testing/selftests/bpf/progs/for_each_map_elem_write_key.c b/tools/testing/selftests/bpf/progs/for_each_map_elem_write_key.c new file mode 100644 index 000000000000..8e545865ea33 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_map_elem_write_key.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} array_map SEC(".maps"); + +static __u64 +check_array_elem(struct bpf_map *map, __u32 *key, __u64 *val, + void *data) +{ + bpf_get_current_comm(key, sizeof(*key)); + return 0; +} + +SEC("raw_tp/sys_enter") +int test_map_key_write(const void *ctx) +{ + bpf_for_each_map_elem(&array_map, check_array_elem, NULL, 0); + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/linked_funcs1.c b/tools/testing/selftests/bpf/progs/linked_funcs1.c index b964ec1390c2..963b393c37e8 100644 --- a/tools/testing/selftests/bpf/progs/linked_funcs1.c +++ b/tools/testing/selftests/bpf/progs/linked_funcs1.c @@ -4,6 +4,7 @@ #include "vmlinux.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> /* weak and shared between two files */ const volatile int my_tid __weak; @@ -44,6 +45,13 @@ void set_output_ctx1(__u64 *ctx) /* this weak instance should win because it's the first one */ __weak int set_output_weak(int x) { + static volatile int whatever; + + /* make sure we use CO-RE relocations in a weak function, this used to + * cause problems for BPF static linker + */ + whatever = bpf_core_type_size(struct task_struct); + output_weak1 = x; return x; } diff --git a/tools/testing/selftests/bpf/progs/linked_funcs2.c b/tools/testing/selftests/bpf/progs/linked_funcs2.c index 575e958e60b7..db195872f4eb 100644 --- a/tools/testing/selftests/bpf/progs/linked_funcs2.c +++ b/tools/testing/selftests/bpf/progs/linked_funcs2.c @@ -4,6 +4,7 @@ #include "vmlinux.h" #include <bpf/bpf_helpers.h> #include <bpf/bpf_tracing.h> +#include <bpf/bpf_core_read.h> /* weak and shared between both files */ const volatile int my_tid __weak; @@ -44,6 +45,13 @@ void set_output_ctx2(__u64 *ctx) /* this weak instance should lose, because it will be processed second */ __weak int set_output_weak(int x) { + static volatile int whatever; + + /* make sure we use CO-RE relocations in a weak function, this used to + * cause problems for BPF static linker + */ + whatever = 2 * bpf_core_type_size(struct task_struct); + output_weak2 = x; return 2 * x; } diff --git a/tools/testing/selftests/bpf/progs/perf_event_stackmap.c b/tools/testing/selftests/bpf/progs/perf_event_stackmap.c index b3fcb5274ee0..f793280a3238 100644 --- a/tools/testing/selftests/bpf/progs/perf_event_stackmap.c +++ b/tools/testing/selftests/bpf/progs/perf_event_stackmap.c @@ -35,10 +35,10 @@ int oncpu(void *ctx) long val; val = bpf_get_stackid(ctx, &stackmap, 0); - if (val > 0) + if (val >= 0) stackid_kernel = 2; val = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK); - if (val > 0) + if (val >= 0) stackid_user = 2; trace = bpf_map_lookup_elem(&stackdata_map, &key);
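The perf_event_stackmap fix deserves a note: bpf_get_stackid() returns a non-negative stack id on success, and id 0 is a perfectly valid slot (typically the first stack inserted into the map), so `val > 0` silently misclassified that one bucket. The usual pattern, sketched against the same stackmap as above:

	long id;

	id = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
	if (id < 0)
		return 0;	/* a negative value is an error, e.g. -EFAULT */
	/* any id >= 0, including 0, is a valid key into stackmap */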
diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h index 4896fdf816f7..92331053dba3 100644 --- a/tools/testing/selftests/bpf/progs/profiler.inc.h +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -826,8 +826,9 @@ out: SEC("kprobe/vfs_link") int BPF_KPROBE(kprobe__vfs_link, - struct dentry* old_dentry, struct inode* dir, - struct dentry* new_dentry, struct inode** delegated_inode) + struct dentry* old_dentry, struct user_namespace *mnt_userns, + struct inode* dir, struct dentry* new_dentry, + struct inode** delegated_inode) { struct bpf_func_stats_ctx stats_ctx; bpf_stats_enter(&stats_ctx, profiler_bpf_vfs_link); diff --git a/tools/testing/selftests/bpf/progs/test_attach_probe.c b/tools/testing/selftests/bpf/progs/test_attach_probe.c index 8056a4c6d918..af994d16bb10 100644 --- a/tools/testing/selftests/bpf/progs/test_attach_probe.c +++ b/tools/testing/selftests/bpf/progs/test_attach_probe.c @@ -10,6 +10,10 @@ int kprobe_res = 0; int kretprobe_res = 0; int uprobe_res = 0; int uretprobe_res = 0; +int uprobe_byname_res = 0; +int uretprobe_byname_res = 0; +int uprobe_byname2_res = 0; +int uretprobe_byname2_res = 0; SEC("kprobe/sys_nanosleep") int handle_kprobe(struct pt_regs *ctx) @@ -25,18 +29,51 @@ int BPF_KRETPROBE(handle_kretprobe) return 0; } -SEC("uprobe/trigger_func") +SEC("uprobe") int handle_uprobe(struct pt_regs *ctx) { uprobe_res = 3; return 0; } -SEC("uretprobe/trigger_func") +SEC("uretprobe") int handle_uretprobe(struct pt_regs *ctx) { uretprobe_res = 4; return 0; } +SEC("uprobe") +int handle_uprobe_byname(struct pt_regs *ctx) +{ + uprobe_byname_res = 5; + return 0; +} + +/* use auto-attach format for section definition. */ +SEC("uretprobe//proc/self/exe:trigger_func2") +int handle_uretprobe_byname(struct pt_regs *ctx) +{ + uretprobe_byname_res = 6; + return 0; +} + +SEC("uprobe") +int handle_uprobe_byname2(struct pt_regs *ctx) +{ + unsigned int size = PT_REGS_PARM1(ctx); + + /* verify malloc size */ + if (size == 1) + uprobe_byname2_res = 7; + return 0; +} + +SEC("uretprobe") +int handle_uretprobe_byname2(struct pt_regs *ctx) +{ + uretprobe_byname2_res = 8; + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c index 2d3a7710e2ce..0e2222968918 100644 --- a/tools/testing/selftests/bpf/progs/test_bpf_cookie.c +++ b/tools/testing/selftests/bpf/progs/test_bpf_cookie.c @@ -37,14 +37,14 @@ int handle_kretprobe(struct pt_regs *ctx) return 0; } -SEC("uprobe/trigger_func") +SEC("uprobe") int handle_uprobe(struct pt_regs *ctx) { update(ctx, &uprobe_res); return 0; } -SEC("uretprobe/trigger_func") +SEC("uretprobe") int handle_uretprobe(struct pt_regs *ctx) { update(ctx, &uretprobe_res); diff --git a/tools/testing/selftests/bpf/progs/test_global_func17.c b/tools/testing/selftests/bpf/progs/test_global_func17.c new file mode 100644 index 000000000000..2b8b9b8ba018 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func17.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <vmlinux.h> +#include <bpf/bpf_helpers.h> + +__noinline int foo(int *p) +{ + return p ? (*p = 42) : 0; +} + +const volatile int i; + +SEC("tc") +int test_cls(struct __sk_buff *skb) +{ + return foo((int *)&i); +}
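A quick orientation on the section-name changes in test_attach_probe.c above: a bare SEC("uprobe")/SEC("uretprobe") program no longer encodes an attach target and must be attached manually, while the SEC("uretprobe//proc/self/exe:trigger_func2") form carries enough information for skeleton auto-attach. For the bare variant, the name-based attach API is the natural counterpart; a sketch using the program and target names from the test above:

	LIBBPF_OPTS(bpf_uprobe_opts, opts,
		.func_name = "trigger_func",	/* resolved to an offset by libbpf */
		.retprobe = false);
	struct bpf_link *link;

	link = bpf_program__attach_uprobe_opts(skel->progs.handle_uprobe,
					       0 /* self pid */, "/proc/self/exe",
					       0 /* func_offset; func_name is used instead */,
					       &opts);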
*/ SEC("raw_tp/sys_enter") -int handler(const void *ctx) +int handler1(const void *ctx) { int *active; __u32 cpu; @@ -26,4 +26,20 @@ int handler(const void *ctx) return 0; } +__noinline int write_active(int *p) +{ + return p ? (*p = 42) : 0; +} + +SEC("raw_tp/sys_enter") +int handler2(const void *ctx) +{ + int *active; + __u32 cpu; + + active = bpf_this_cpu_ptr(&bpf_prog_active); + write_active(active); + return 0; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c index 19e4d2071c60..c8bc0c6947aa 100644 --- a/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_l4lb_noinline.c @@ -218,7 +218,7 @@ static __noinline bool get_packet_dst(struct real_definition **real, if (hash != 0x358459b7 /* jhash of ipv4 packet */ && hash != 0x2f4bc6bb /* jhash of ipv6 packet */) - return 0; + return false; real_pos = bpf_map_lookup_elem(&ch_rings, &key); if (!real_pos) diff --git a/tools/testing/selftests/bpf/progs/test_sk_assign.c b/tools/testing/selftests/bpf/progs/test_sk_assign.c index 02f79356d5eb..98c6493d9b91 100644 --- a/tools/testing/selftests/bpf/progs/test_sk_assign.c +++ b/tools/testing/selftests/bpf/progs/test_sk_assign.c @@ -89,7 +89,6 @@ get_tuple(struct __sk_buff *skb, bool *ipv4, bool *tcp) static inline int handle_udp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { - struct bpf_sock_tuple ln = {0}; struct bpf_sock *sk; const int zero = 0; size_t tuple_len; @@ -121,7 +120,6 @@ assign: static inline int handle_tcp(struct __sk_buff *skb, struct bpf_sock_tuple *tuple, bool ipv4) { - struct bpf_sock_tuple ln = {0}; struct bpf_sock *sk; const int zero = 0; size_t tuple_len; @@ -161,7 +159,7 @@ assign: SEC("tc") int bpf_sk_assign_test(struct __sk_buff *skb) { - struct bpf_sock_tuple *tuple, ln = {0}; + struct bpf_sock_tuple *tuple; bool ipv4 = false; bool tcp = false; int tuple_len; diff --git a/tools/testing/selftests/bpf/progs/test_task_pt_regs.c b/tools/testing/selftests/bpf/progs/test_task_pt_regs.c index e6cb09259408..1926facba122 100644 --- a/tools/testing/selftests/bpf/progs/test_task_pt_regs.c +++ b/tools/testing/selftests/bpf/progs/test_task_pt_regs.c @@ -14,7 +14,7 @@ char current_regs[PT_REGS_SIZE] = {}; char ctx_regs[PT_REGS_SIZE] = {}; int uprobe_res = 0; -SEC("uprobe/trigger_func") +SEC("uprobe") int handle_uprobe(struct pt_regs *ctx) { struct task_struct *current; diff --git a/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c b/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c new file mode 100644 index 000000000000..ab75522e2eeb --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_uprobe_autoattach.c @@ -0,0 +1,73 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022, Oracle and/or its affiliates. */ + +#include "vmlinux.h" + +#include +#include +#include + +int uprobe_byname_parm1 = 0; +int uprobe_byname_ran = 0; +int uretprobe_byname_rc = 0; +int uretprobe_byname_ran = 0; +size_t uprobe_byname2_parm1 = 0; +int uprobe_byname2_ran = 0; +char *uretprobe_byname2_rc = NULL; +int uretprobe_byname2_ran = 0; + +int test_pid; + +/* This program cannot auto-attach, but that should not stop other + * programs from attaching. 
+ */ +SEC("uprobe") +int handle_uprobe_noautoattach(struct pt_regs *ctx) +{ + return 0; +} + +SEC("uprobe//proc/self/exe:autoattach_trigger_func") +int handle_uprobe_byname(struct pt_regs *ctx) +{ + uprobe_byname_parm1 = PT_REGS_PARM1_CORE(ctx); + uprobe_byname_ran = 1; + return 0; +} + +SEC("uretprobe//proc/self/exe:autoattach_trigger_func") +int handle_uretprobe_byname(struct pt_regs *ctx) +{ + uretprobe_byname_rc = PT_REGS_RC_CORE(ctx); + uretprobe_byname_ran = 2; + return 0; +} + + +SEC("uprobe/libc.so.6:malloc") +int handle_uprobe_byname2(struct pt_regs *ctx) +{ + int pid = bpf_get_current_pid_tgid() >> 32; + + /* ignore irrelevant invocations */ + if (test_pid != pid) + return 0; + uprobe_byname2_parm1 = PT_REGS_PARM1_CORE(ctx); + uprobe_byname2_ran = 3; + return 0; +} + +SEC("uretprobe/libc.so.6:malloc") +int handle_uretprobe_byname2(struct pt_regs *ctx) +{ + int pid = bpf_get_current_pid_tgid() >> 32; + + /* ignore irrelevant invocations */ + if (test_pid != pid) + return 0; + uretprobe_byname2_rc = (char *)PT_REGS_RC_CORE(ctx); + uretprobe_byname2_ran = 4; + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_urandom_usdt.c b/tools/testing/selftests/bpf/progs/test_urandom_usdt.c new file mode 100644 index 000000000000..3539b02bd5f7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_urandom_usdt.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include +#include + +int urand_pid; + +int urand_read_without_sema_call_cnt; +int urand_read_without_sema_buf_sz_sum; + +SEC("usdt/./urandom_read:urand:read_without_sema") +int BPF_USDT(urand_read_without_sema, int iter_num, int iter_cnt, int buf_sz) +{ + if (urand_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&urand_read_without_sema_call_cnt, 1); + __sync_fetch_and_add(&urand_read_without_sema_buf_sz_sum, buf_sz); + + return 0; +} + +int urand_read_with_sema_call_cnt; +int urand_read_with_sema_buf_sz_sum; + +SEC("usdt/./urandom_read:urand:read_with_sema") +int BPF_USDT(urand_read_with_sema, int iter_num, int iter_cnt, int buf_sz) +{ + if (urand_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&urand_read_with_sema_call_cnt, 1); + __sync_fetch_and_add(&urand_read_with_sema_buf_sz_sum, buf_sz); + + return 0; +} + +int urandlib_read_without_sema_call_cnt; +int urandlib_read_without_sema_buf_sz_sum; + +SEC("usdt/./liburandom_read.so:urandlib:read_without_sema") +int BPF_USDT(urandlib_read_without_sema, int iter_num, int iter_cnt, int buf_sz) +{ + if (urand_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&urandlib_read_without_sema_call_cnt, 1); + __sync_fetch_and_add(&urandlib_read_without_sema_buf_sz_sum, buf_sz); + + return 0; +} + +int urandlib_read_with_sema_call_cnt; +int urandlib_read_with_sema_buf_sz_sum; + +SEC("usdt/./liburandom_read.so:urandlib:read_with_sema") +int BPF_USDT(urandlib_read_with_sema, int iter_num, int iter_cnt, int buf_sz) +{ + if (urand_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&urandlib_read_with_sema_call_cnt, 1); + __sync_fetch_and_add(&urandlib_read_with_sema_buf_sz_sum, buf_sz); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_usdt.c b/tools/testing/selftests/bpf/progs/test_usdt.c new file mode 100644 index 000000000000..505aab9a5234 --- /dev/null +++ 
diff --git a/tools/testing/selftests/bpf/progs/test_usdt.c b/tools/testing/selftests/bpf/progs/test_usdt.c new file mode 100644 index 000000000000..505aab9a5234 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_usdt.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ + +#include "vmlinux.h" +#include <bpf/bpf_helpers.h> +#include <bpf/usdt.bpf.h> + +int my_pid; + +int usdt0_called; +u64 usdt0_cookie; +int usdt0_arg_cnt; +int usdt0_arg_ret; + +SEC("usdt") +int usdt0(struct pt_regs *ctx) +{ + long tmp; + + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&usdt0_called, 1); + + usdt0_cookie = bpf_usdt_cookie(ctx); + usdt0_arg_cnt = bpf_usdt_arg_cnt(ctx); + /* should return -ENOENT for any arg_num */ + usdt0_arg_ret = bpf_usdt_arg(ctx, bpf_get_prandom_u32(), &tmp); + return 0; +} + +int usdt3_called; +u64 usdt3_cookie; +int usdt3_arg_cnt; +int usdt3_arg_rets[3]; +u64 usdt3_args[3]; + +SEC("usdt//proc/self/exe:test:usdt3") +int usdt3(struct pt_regs *ctx) +{ + long tmp; + + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&usdt3_called, 1); + + usdt3_cookie = bpf_usdt_cookie(ctx); + usdt3_arg_cnt = bpf_usdt_arg_cnt(ctx); + + usdt3_arg_rets[0] = bpf_usdt_arg(ctx, 0, &tmp); + usdt3_args[0] = (int)tmp; + + usdt3_arg_rets[1] = bpf_usdt_arg(ctx, 1, &tmp); + usdt3_args[1] = (long)tmp; + + usdt3_arg_rets[2] = bpf_usdt_arg(ctx, 2, &tmp); + usdt3_args[2] = (uintptr_t)tmp; + + return 0; +} + +int usdt12_called; +u64 usdt12_cookie; +int usdt12_arg_cnt; +u64 usdt12_args[12]; + +SEC("usdt//proc/self/exe:test:usdt12") +int BPF_USDT(usdt12, int a1, int a2, long a3, long a4, unsigned a5, + long a6, __u64 a7, uintptr_t a8, int a9, short a10, + short a11, signed char a12) +{ + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&usdt12_called, 1); + + usdt12_cookie = bpf_usdt_cookie(ctx); + usdt12_arg_cnt = bpf_usdt_arg_cnt(ctx); + + usdt12_args[0] = a1; + usdt12_args[1] = a2; + usdt12_args[2] = a3; + usdt12_args[3] = a4; + usdt12_args[4] = a5; + usdt12_args[5] = a6; + usdt12_args[6] = a7; + usdt12_args[7] = a8; + usdt12_args[8] = a9; + usdt12_args[9] = a10; + usdt12_args[10] = a11; + usdt12_args[11] = a12; + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_usdt_multispec.c b/tools/testing/selftests/bpf/progs/test_usdt_multispec.c new file mode 100644 index 000000000000..aa6de32b50d1 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_usdt_multispec.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
*/ + +#include "vmlinux.h" +#include +#include + +/* this file is linked together with test_usdt.c to validate that usdt.bpf.h + * can be included in multiple .bpf.c files forming single final BPF object + * file + */ + +extern int my_pid; + +int usdt_100_called; +int usdt_100_sum; + +SEC("usdt//proc/self/exe:test:usdt_100") +int BPF_USDT(usdt_100, int x) +{ + long tmp; + + if (my_pid != (bpf_get_current_pid_tgid() >> 32)) + return 0; + + __sync_fetch_and_add(&usdt_100_called, 1); + __sync_fetch_and_add(&usdt_100_sum, x); + + return 0; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c index 596c4e71bf3a..125d872d7981 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_noinline.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_noinline.c @@ -564,22 +564,22 @@ static bool get_packet_dst(struct real_definition **real, hash = get_packet_hash(pckt, hash_16bytes); if (hash != 0x358459b7 /* jhash of ipv4 packet */ && hash != 0x2f4bc6bb /* jhash of ipv6 packet */) - return 0; + return false; key = 2 * vip_info->vip_num + hash % 2; real_pos = bpf_map_lookup_elem(&ch_rings, &key); if (!real_pos) - return 0; + return false; key = *real_pos; *real = bpf_map_lookup_elem(&reals, &key); if (!(*real)) - return 0; + return false; if (!(vip_info->flags & (1 << 1))) { __u32 conn_rate_key = 512 + 2; struct lb_stats *conn_rate_stats = bpf_map_lookup_elem(&stats, &conn_rate_key); if (!conn_rate_stats) - return 1; + return true; cur_time = bpf_ktime_get_ns(); if ((cur_time - conn_rate_stats->v2) >> 32 > 0xffFFFF) { conn_rate_stats->v1 = 1; @@ -587,14 +587,14 @@ static bool get_packet_dst(struct real_definition **real, } else { conn_rate_stats->v1 += 1; if (conn_rate_stats->v1 >= 1) - return 1; + return true; } if (pckt->flow.proto == IPPROTO_UDP) new_dst_lru.atime = cur_time; new_dst_lru.pos = key; bpf_map_update_elem(lru_map, &pckt->flow, &new_dst_lru, 0); } - return 1; + return true; } __attribute__ ((noinline)) diff --git a/tools/testing/selftests/bpf/progs/trigger_bench.c b/tools/testing/selftests/bpf/progs/trigger_bench.c index 2ab049b54d6c..694e7cec1823 100644 --- a/tools/testing/selftests/bpf/progs/trigger_bench.c +++ b/tools/testing/selftests/bpf/progs/trigger_bench.c @@ -54,7 +54,7 @@ int bench_trigger_fmodret(void *ctx) return -22; } -SEC("uprobe/self/uprobe_target") +SEC("uprobe") int bench_trigger_uprobe(void *ctx) { __sync_add_and_fetch(&hits, 1); diff --git a/tools/testing/selftests/bpf/sdt-config.h b/tools/testing/selftests/bpf/sdt-config.h new file mode 100644 index 000000000000..733045a52771 --- /dev/null +++ b/tools/testing/selftests/bpf/sdt-config.h @@ -0,0 +1,6 @@ +/* includes/sys/sdt-config.h. Generated from sdt-config.h.in by configure. + + This file just defines _SDT_ASM_SECTION_AUTOGROUP_SUPPORT to 0 or 1 to + indicate whether the assembler supports "?" in .pushsection directives. */ + +#define _SDT_ASM_SECTION_AUTOGROUP_SUPPORT 1 diff --git a/tools/testing/selftests/bpf/sdt.h b/tools/testing/selftests/bpf/sdt.h new file mode 100644 index 000000000000..ca0162b4dc57 --- /dev/null +++ b/tools/testing/selftests/bpf/sdt.h @@ -0,0 +1,513 @@ +/* - Systemtap static probe definition macros. 
+ + This file is dedicated to the public domain, pursuant to CC0 + (https://creativecommons.org/publicdomain/zero/1.0/) +*/ + +#ifndef _SYS_SDT_H +#define _SYS_SDT_H 1 + +/* + This file defines a family of macros + + STAP_PROBEn(op1, ..., opn) + + that emit a nop into the instruction stream, and some data into an auxiliary + note section. The data in the note section describes the operands, in terms + of size and location. Each location is encoded as assembler operand string. + Consumer tools such as gdb or systemtap insert breakpoints on top of + the nop, and decode the location operand-strings, like an assembler, + to find the values being passed. + + The operand strings are selected by the compiler for each operand. + They are constrained by gcc inline-assembler codes. The default is: + + #define STAP_SDT_ARG_CONSTRAINT nor + + This is a good default if the operands tend to be integral and + moderate in number (smaller than number of registers). In other + cases, the compiler may report "'asm' requires impossible reload" or + similar. In this case, consider simplifying the macro call (fewer + and simpler operands), reduce optimization, or override the default + constraints string via: + + #define STAP_SDT_ARG_CONSTRAINT g + #include <sys/sdt.h> + + See also: + https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation + https://gcc.gnu.org/onlinedocs/gcc/Constraints.html + */ + + + +#ifdef __ASSEMBLER__ +# define _SDT_PROBE(provider, name, n, arglist) \ + _SDT_ASM_BODY(provider, name, _SDT_ASM_SUBSTR_1, (_SDT_DEPAREN_##n arglist)) \ + _SDT_ASM_BASE +# define _SDT_ASM_1(x) x; +# define _SDT_ASM_2(a, b) a,b; +# define _SDT_ASM_3(a, b, c) a,b,c; +# define _SDT_ASM_5(a, b, c, d, e) a,b,c,d,e; +# define _SDT_ASM_STRING_1(x) .asciz #x; +# define _SDT_ASM_SUBSTR_1(x) .ascii #x; +# define _SDT_DEPAREN_0() /* empty */ +# define _SDT_DEPAREN_1(a) a +# define _SDT_DEPAREN_2(a,b) a b +# define _SDT_DEPAREN_3(a,b,c) a b c +# define _SDT_DEPAREN_4(a,b,c,d) a b c d +# define _SDT_DEPAREN_5(a,b,c,d,e) a b c d e +# define _SDT_DEPAREN_6(a,b,c,d,e,f) a b c d e f +# define _SDT_DEPAREN_7(a,b,c,d,e,f,g) a b c d e f g +# define _SDT_DEPAREN_8(a,b,c,d,e,f,g,h) a b c d e f g h +# define _SDT_DEPAREN_9(a,b,c,d,e,f,g,h,i) a b c d e f g h i +# define _SDT_DEPAREN_10(a,b,c,d,e,f,g,h,i,j) a b c d e f g h i j +# define _SDT_DEPAREN_11(a,b,c,d,e,f,g,h,i,j,k) a b c d e f g h i j k +# define _SDT_DEPAREN_12(a,b,c,d,e,f,g,h,i,j,k,l) a b c d e f g h i j k l +#else +#if defined _SDT_HAS_SEMAPHORES +#define _SDT_NOTE_SEMAPHORE_USE(provider, name) \ + __asm__ __volatile__ ("" :: "m" (provider##_##name##_semaphore)); +#else +#define _SDT_NOTE_SEMAPHORE_USE(provider, name) +#endif + +# define _SDT_PROBE(provider, name, n, arglist) \ + do { \ + _SDT_NOTE_SEMAPHORE_USE(provider, name); \ + __asm__ __volatile__ (_SDT_ASM_BODY(provider, name, _SDT_ASM_ARGS, (n)) \ + :: _SDT_ASM_OPERANDS_##n arglist); \ + __asm__ __volatile__ (_SDT_ASM_BASE); \ + } while (0) +# define _SDT_S(x) #x +# define _SDT_ASM_1(x) _SDT_S(x) "\n" +# define _SDT_ASM_2(a, b) _SDT_S(a) "," _SDT_S(b) "\n" +# define _SDT_ASM_3(a, b, c) _SDT_S(a) "," _SDT_S(b) "," \ + _SDT_S(c) "\n" +# define _SDT_ASM_5(a, b, c, d, e) _SDT_S(a) "," _SDT_S(b) "," \ + _SDT_S(c) "," _SDT_S(d) "," \ + _SDT_S(e) "\n" +# define _SDT_ASM_ARGS(n) _SDT_ASM_TEMPLATE_##n +# define _SDT_ASM_STRING_1(x) _SDT_ASM_1(.asciz #x) +# define _SDT_ASM_SUBSTR_1(x) _SDT_ASM_1(.ascii #x) + +# define _SDT_ARGFMT(no) _SDT_ASM_1(_SDT_SIGN %n[_SDT_S##no]) \ + _SDT_ASM_1(_SDT_SIZE %n[_SDT_S##no]) \ +
_SDT_ASM_1(_SDT_TYPE %n[_SDT_S##no]) \ + _SDT_ASM_SUBSTR(_SDT_ARGTMPL(_SDT_A##no)) + + +# ifndef STAP_SDT_ARG_CONSTRAINT +# if defined __powerpc__ +# define STAP_SDT_ARG_CONSTRAINT nZr +# elif defined __arm__ +# define STAP_SDT_ARG_CONSTRAINT g +# else +# define STAP_SDT_ARG_CONSTRAINT nor +# endif +# endif + +# define _SDT_STRINGIFY(x) #x +# define _SDT_ARG_CONSTRAINT_STRING(x) _SDT_STRINGIFY(x) +/* _SDT_S encodes the size and type as 0xSSTT which is decoded by the assembler + macros _SDT_SIZE and _SDT_TYPE */ +# define _SDT_ARG(n, x) \ + [_SDT_S##n] "n" ((_SDT_ARGSIGNED (x) ? (int)-1 : 1) * (-(((int) _SDT_ARGSIZE (x)) << 8) + (-(0x7f & __builtin_classify_type (x))))), \ + [_SDT_A##n] _SDT_ARG_CONSTRAINT_STRING (STAP_SDT_ARG_CONSTRAINT) (_SDT_ARGVAL (x)) +#endif +#define _SDT_ASM_STRING(x) _SDT_ASM_STRING_1(x) +#define _SDT_ASM_SUBSTR(x) _SDT_ASM_SUBSTR_1(x) + +#define _SDT_ARGARRAY(x) (__builtin_classify_type (x) == 14 \ + || __builtin_classify_type (x) == 5) + +#ifdef __cplusplus +# define _SDT_ARGSIGNED(x) (!_SDT_ARGARRAY (x) \ + && __sdt_type<__typeof (x)>::__sdt_signed) +# define _SDT_ARGSIZE(x) (_SDT_ARGARRAY (x) \ + ? sizeof (void *) : sizeof (x)) +# define _SDT_ARGVAL(x) (x) + +# include <cstddef> + +template<typename __sdt_T> +struct __sdt_type +{ + static const bool __sdt_signed = false; +}; + +#define __SDT_ALWAYS_SIGNED(T) \ +template<> struct __sdt_type<T> { static const bool __sdt_signed = true; }; +#define __SDT_COND_SIGNED(T,CT) \ +template<> struct __sdt_type<T> { static const bool __sdt_signed = ((CT)(-1) < 1); }; +__SDT_ALWAYS_SIGNED(signed char) +__SDT_ALWAYS_SIGNED(short) +__SDT_ALWAYS_SIGNED(int) +__SDT_ALWAYS_SIGNED(long) +__SDT_ALWAYS_SIGNED(long long) +__SDT_ALWAYS_SIGNED(volatile signed char) +__SDT_ALWAYS_SIGNED(volatile short) +__SDT_ALWAYS_SIGNED(volatile int) +__SDT_ALWAYS_SIGNED(volatile long) +__SDT_ALWAYS_SIGNED(volatile long long) +__SDT_ALWAYS_SIGNED(const signed char) +__SDT_ALWAYS_SIGNED(const short) +__SDT_ALWAYS_SIGNED(const int) +__SDT_ALWAYS_SIGNED(const long) +__SDT_ALWAYS_SIGNED(const long long) +__SDT_ALWAYS_SIGNED(const volatile signed char) +__SDT_ALWAYS_SIGNED(const volatile short) +__SDT_ALWAYS_SIGNED(const volatile int) +__SDT_ALWAYS_SIGNED(const volatile long) +__SDT_ALWAYS_SIGNED(const volatile long long) +__SDT_COND_SIGNED(char, char) +__SDT_COND_SIGNED(wchar_t, wchar_t) +__SDT_COND_SIGNED(volatile char, char) +__SDT_COND_SIGNED(volatile wchar_t, wchar_t) +__SDT_COND_SIGNED(const char, char) +__SDT_COND_SIGNED(const wchar_t, wchar_t) +__SDT_COND_SIGNED(const volatile char, char) +__SDT_COND_SIGNED(const volatile wchar_t, wchar_t) +#if defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) +/* __SDT_COND_SIGNED(char16_t) */ +/* __SDT_COND_SIGNED(char32_t) */ +#endif + +template<typename __sdt_E> +struct __sdt_type<__sdt_E[]> : public __sdt_type<__sdt_E *> {}; + +template<typename __sdt_E, size_t __sdt_N> +struct __sdt_type<__sdt_E[__sdt_N]> : public __sdt_type<__sdt_E *> {}; + +#elif !defined(__ASSEMBLER__) +__extension__ extern unsigned long long __sdt_unsp; +# define _SDT_ARGINTTYPE(x) \ + __typeof (__builtin_choose_expr (((__builtin_classify_type (x) \ + + 3) & -4) == 4, (x), 0U)) +# define _SDT_ARGSIGNED(x) \ + (!__extension__ \ + (__builtin_constant_p ((((unsigned long long) \ + (_SDT_ARGINTTYPE (x)) __sdt_unsp) \ + & ((unsigned long long)1 << (sizeof (unsigned long long) \ + * __CHAR_BIT__ - 1))) == 0) \ + || (_SDT_ARGINTTYPE (x)) -1 > (_SDT_ARGINTTYPE (x)) 0)) +# define _SDT_ARGSIZE(x) \ + (_SDT_ARGARRAY (x) ?
sizeof (void *) : sizeof (x)) +# define _SDT_ARGVAL(x) (x) +#endif + +#if defined __powerpc__ || defined __powerpc64__ +# define _SDT_ARGTMPL(id) %I[id]%[id] +#elif defined __i386__ +# define _SDT_ARGTMPL(id) %k[id] /* gcc.gnu.org/PR80115 sourceware.org/PR24541 */ +#else +# define _SDT_ARGTMPL(id) %[id] +#endif + +/* NB: gdb PR24541 highlighted an unspecified corner of the sdt.h + operand note format. + + The named register may be a longer or shorter (!) alias for the + storage where the value in question is found. For example, on + i386, 64-bit value may be put in register pairs, and the register + name stored would identify just one of them. Previously, gcc was + asked to emit the %w[id] (16-bit alias of some registers holding + operands), even when a wider 32-bit value was used. + + Bottom line: the byte-width given before the @ sign governs. If + there is a mismatch between that width and that of the named + register, then a sys/sdt.h note consumer may need to employ + architecture-specific heuristics to figure out where the compiler + has actually put the complete value. +*/ + +#ifdef __LP64__ +# define _SDT_ASM_ADDR .8byte +#else +# define _SDT_ASM_ADDR .4byte +#endif + +/* The ia64 and s390 nop instructions take an argument. */ +#if defined(__ia64__) || defined(__s390__) || defined(__s390x__) +#define _SDT_NOP nop 0 +#else +#define _SDT_NOP nop +#endif + +#define _SDT_NOTE_NAME "stapsdt" +#define _SDT_NOTE_TYPE 3 + +/* If the assembler supports the necessary feature, then we can play + nice with code in COMDAT sections, which comes up in C++ code. + Without that assembler support, some combinations of probe placements + in certain kinds of C++ code may produce link-time errors. */ +#include "sdt-config.h" +#if _SDT_ASM_SECTION_AUTOGROUP_SUPPORT +# define _SDT_ASM_AUTOGROUP "?" +#else +# define _SDT_ASM_AUTOGROUP "" +#endif + +#define _SDT_DEF_MACROS \ + _SDT_ASM_1(.altmacro) \ + _SDT_ASM_1(.macro _SDT_SIGN x) \ + _SDT_ASM_3(.pushsection .note.stapsdt,"","note") \ + _SDT_ASM_1(.iflt \\x) \ + _SDT_ASM_1(.ascii "-") \ + _SDT_ASM_1(.endif) \ + _SDT_ASM_1(.popsection) \ + _SDT_ASM_1(.endm) \ + _SDT_ASM_1(.macro _SDT_SIZE_ x) \ + _SDT_ASM_3(.pushsection .note.stapsdt,"","note") \ + _SDT_ASM_1(.ascii "\x") \ + _SDT_ASM_1(.popsection) \ + _SDT_ASM_1(.endm) \ + _SDT_ASM_1(.macro _SDT_SIZE x) \ + _SDT_ASM_1(_SDT_SIZE_ %%((-(-\\x*((-\\x>0)-(-\\x<0))))>>8)) \ + _SDT_ASM_1(.endm) \ + _SDT_ASM_1(.macro _SDT_TYPE_ x) \ + _SDT_ASM_3(.pushsection .note.stapsdt,"","note") \ + _SDT_ASM_2(.ifc 8,\\x) \ + _SDT_ASM_1(.ascii "f") \ + _SDT_ASM_1(.endif) \ + _SDT_ASM_1(.ascii "@") \ + _SDT_ASM_1(.popsection) \ + _SDT_ASM_1(.endm) \ + _SDT_ASM_1(.macro _SDT_TYPE x) \ + _SDT_ASM_1(_SDT_TYPE_ %%((\\x)&(0xff))) \ + _SDT_ASM_1(.endm) + +#define _SDT_UNDEF_MACROS \ + _SDT_ASM_1(.purgem _SDT_SIGN) \ + _SDT_ASM_1(.purgem _SDT_SIZE_) \ + _SDT_ASM_1(.purgem _SDT_SIZE) \ + _SDT_ASM_1(.purgem _SDT_TYPE_) \ + _SDT_ASM_1(.purgem _SDT_TYPE) + +#define _SDT_ASM_BODY(provider, name, pack_args, args, ...) 
\ + _SDT_DEF_MACROS \ + _SDT_ASM_1(990: _SDT_NOP) \ + _SDT_ASM_3( .pushsection .note.stapsdt,_SDT_ASM_AUTOGROUP,"note") \ + _SDT_ASM_1( .balign 4) \ + _SDT_ASM_3( .4byte 992f-991f, 994f-993f, _SDT_NOTE_TYPE) \ + _SDT_ASM_1(991: .asciz _SDT_NOTE_NAME) \ + _SDT_ASM_1(992: .balign 4) \ + _SDT_ASM_1(993: _SDT_ASM_ADDR 990b) \ + _SDT_ASM_1( _SDT_ASM_ADDR _.stapsdt.base) \ + _SDT_SEMAPHORE(provider,name) \ + _SDT_ASM_STRING(provider) \ + _SDT_ASM_STRING(name) \ + pack_args args \ + _SDT_ASM_SUBSTR(\x00) \ + _SDT_UNDEF_MACROS \ + _SDT_ASM_1(994: .balign 4) \ + _SDT_ASM_1( .popsection) + +#define _SDT_ASM_BASE \ + _SDT_ASM_1(.ifndef _.stapsdt.base) \ + _SDT_ASM_5( .pushsection .stapsdt.base,"aG","progbits", \ + .stapsdt.base,comdat) \ + _SDT_ASM_1( .weak _.stapsdt.base) \ + _SDT_ASM_1( .hidden _.stapsdt.base) \ + _SDT_ASM_1( _.stapsdt.base: .space 1) \ + _SDT_ASM_2( .size _.stapsdt.base, 1) \ + _SDT_ASM_1( .popsection) \ + _SDT_ASM_1(.endif) + +#if defined _SDT_HAS_SEMAPHORES +#define _SDT_SEMAPHORE(p,n) \ + _SDT_ASM_1( _SDT_ASM_ADDR p##_##n##_semaphore) +#else +#define _SDT_SEMAPHORE(p,n) _SDT_ASM_1( _SDT_ASM_ADDR 0) +#endif + +#define _SDT_ASM_BLANK _SDT_ASM_SUBSTR(\x20) +#define _SDT_ASM_TEMPLATE_0 /* no arguments */ +#define _SDT_ASM_TEMPLATE_1 _SDT_ARGFMT(1) +#define _SDT_ASM_TEMPLATE_2 _SDT_ASM_TEMPLATE_1 _SDT_ASM_BLANK _SDT_ARGFMT(2) +#define _SDT_ASM_TEMPLATE_3 _SDT_ASM_TEMPLATE_2 _SDT_ASM_BLANK _SDT_ARGFMT(3) +#define _SDT_ASM_TEMPLATE_4 _SDT_ASM_TEMPLATE_3 _SDT_ASM_BLANK _SDT_ARGFMT(4) +#define _SDT_ASM_TEMPLATE_5 _SDT_ASM_TEMPLATE_4 _SDT_ASM_BLANK _SDT_ARGFMT(5) +#define _SDT_ASM_TEMPLATE_6 _SDT_ASM_TEMPLATE_5 _SDT_ASM_BLANK _SDT_ARGFMT(6) +#define _SDT_ASM_TEMPLATE_7 _SDT_ASM_TEMPLATE_6 _SDT_ASM_BLANK _SDT_ARGFMT(7) +#define _SDT_ASM_TEMPLATE_8 _SDT_ASM_TEMPLATE_7 _SDT_ASM_BLANK _SDT_ARGFMT(8) +#define _SDT_ASM_TEMPLATE_9 _SDT_ASM_TEMPLATE_8 _SDT_ASM_BLANK _SDT_ARGFMT(9) +#define _SDT_ASM_TEMPLATE_10 _SDT_ASM_TEMPLATE_9 _SDT_ASM_BLANK _SDT_ARGFMT(10) +#define _SDT_ASM_TEMPLATE_11 _SDT_ASM_TEMPLATE_10 _SDT_ASM_BLANK _SDT_ARGFMT(11) +#define _SDT_ASM_TEMPLATE_12 _SDT_ASM_TEMPLATE_11 _SDT_ASM_BLANK _SDT_ARGFMT(12) +#define _SDT_ASM_OPERANDS_0() [__sdt_dummy] "g" (0) +#define _SDT_ASM_OPERANDS_1(arg1) _SDT_ARG(1, arg1) +#define _SDT_ASM_OPERANDS_2(arg1, arg2) \ + _SDT_ASM_OPERANDS_1(arg1), _SDT_ARG(2, arg2) +#define _SDT_ASM_OPERANDS_3(arg1, arg2, arg3) \ + _SDT_ASM_OPERANDS_2(arg1, arg2), _SDT_ARG(3, arg3) +#define _SDT_ASM_OPERANDS_4(arg1, arg2, arg3, arg4) \ + _SDT_ASM_OPERANDS_3(arg1, arg2, arg3), _SDT_ARG(4, arg4) +#define _SDT_ASM_OPERANDS_5(arg1, arg2, arg3, arg4, arg5) \ + _SDT_ASM_OPERANDS_4(arg1, arg2, arg3, arg4), _SDT_ARG(5, arg5) +#define _SDT_ASM_OPERANDS_6(arg1, arg2, arg3, arg4, arg5, arg6) \ + _SDT_ASM_OPERANDS_5(arg1, arg2, arg3, arg4, arg5), _SDT_ARG(6, arg6) +#define _SDT_ASM_OPERANDS_7(arg1, arg2, arg3, arg4, arg5, arg6, arg7) \ + _SDT_ASM_OPERANDS_6(arg1, arg2, arg3, arg4, arg5, arg6), _SDT_ARG(7, arg7) +#define _SDT_ASM_OPERANDS_8(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8) \ + _SDT_ASM_OPERANDS_7(arg1, arg2, arg3, arg4, arg5, arg6, arg7), \ + _SDT_ARG(8, arg8) +#define _SDT_ASM_OPERANDS_9(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9) \ + _SDT_ASM_OPERANDS_8(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8), \ + _SDT_ARG(9, arg9) +#define _SDT_ASM_OPERANDS_10(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10) \ + _SDT_ASM_OPERANDS_9(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9), \ + _SDT_ARG(10, arg10) +#define 
_SDT_ASM_OPERANDS_11(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11) \ + _SDT_ASM_OPERANDS_10(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10), \ + _SDT_ARG(11, arg11) +#define _SDT_ASM_OPERANDS_12(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12) \ + _SDT_ASM_OPERANDS_11(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11), \ + _SDT_ARG(12, arg12) + +/* These macros can be used in C, C++, or assembly code. + In assembly code the arguments should use normal assembly operand syntax. */ + +#define STAP_PROBE(provider, name) \ + _SDT_PROBE(provider, name, 0, ()) +#define STAP_PROBE1(provider, name, arg1) \ + _SDT_PROBE(provider, name, 1, (arg1)) +#define STAP_PROBE2(provider, name, arg1, arg2) \ + _SDT_PROBE(provider, name, 2, (arg1, arg2)) +#define STAP_PROBE3(provider, name, arg1, arg2, arg3) \ + _SDT_PROBE(provider, name, 3, (arg1, arg2, arg3)) +#define STAP_PROBE4(provider, name, arg1, arg2, arg3, arg4) \ + _SDT_PROBE(provider, name, 4, (arg1, arg2, arg3, arg4)) +#define STAP_PROBE5(provider, name, arg1, arg2, arg3, arg4, arg5) \ + _SDT_PROBE(provider, name, 5, (arg1, arg2, arg3, arg4, arg5)) +#define STAP_PROBE6(provider, name, arg1, arg2, arg3, arg4, arg5, arg6) \ + _SDT_PROBE(provider, name, 6, (arg1, arg2, arg3, arg4, arg5, arg6)) +#define STAP_PROBE7(provider, name, arg1, arg2, arg3, arg4, arg5, arg6, arg7) \ + _SDT_PROBE(provider, name, 7, (arg1, arg2, arg3, arg4, arg5, arg6, arg7)) +#define STAP_PROBE8(provider,name,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8) \ + _SDT_PROBE(provider, name, 8, (arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8)) +#define STAP_PROBE9(provider,name,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9)\ + _SDT_PROBE(provider, name, 9, (arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9)) +#define STAP_PROBE10(provider,name,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10) \ + _SDT_PROBE(provider, name, 10, \ + (arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10)) +#define STAP_PROBE11(provider,name,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11) \ + _SDT_PROBE(provider, name, 11, \ + (arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11)) +#define STAP_PROBE12(provider,name,arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12) \ + _SDT_PROBE(provider, name, 12, \ + (arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8,arg9,arg10,arg11,arg12)) + +/* This STAP_PROBEV macro can be used in variadic scenarios, where the + number of probe arguments is not known until compile time. Since + variadic macro support may vary with compiler options, you must + pre-#define SDT_USE_VARIADIC to enable this type of probe. + + The trick to count __VA_ARGS__ was inspired by this post by + Laurent Deniau <laurent.deniau@cern.ch>: + http://groups.google.com/group/comp.std.c/msg/346fc464319b1ee5 + + Note that our _SDT_NARG is called with an extra 0 arg that's not + counted, so we don't have to worry about the behavior of macros + called without any arguments. */ + +#define _SDT_NARG(...) __SDT_NARG(__VA_ARGS__, 12,11,10,9,8,7,6,5,4,3,2,1,0) +#define __SDT_NARG(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,_10,_11,_12, N, ...) N +#ifdef SDT_USE_VARIADIC +#define _SDT_PROBE_N(provider, name, N, ...) \ + _SDT_PROBE(provider, name, N, (__VA_ARGS__)) +#define STAP_PROBEV(provider, name, ...) \ + _SDT_PROBE_N(provider, name, _SDT_NARG(0, ##__VA_ARGS__), ##__VA_ARGS__) +#endif + +/* These macros are for use in asm statements. You must compile + with -std=gnu99 or -std=c99 to use the STAP_PROBE_ASM macro.
+ + The STAP_PROBE_ASM macro generates a quoted string to be used in the + template portion of the asm statement, concatenated with strings that + contain the actual assembly code around the probe site. + + For example: + + asm ("before\n" + STAP_PROBE_ASM(provider, fooprobe, %eax 4(%esi)) + "after"); + + emits the assembly code for "before\nafter", with a probe in between. + The probe arguments are the %eax register, and the value of the memory + word located 4 bytes past the address in the %esi register. Note that + because this is a simple asm, not a GNU C extended asm statement, these + % characters do not need to be doubled to generate literal %reg names. + + In a GNU C extended asm statement, the probe arguments can be specified + using the macro STAP_PROBE_ASM_TEMPLATE(n) for n arguments. The paired + macro STAP_PROBE_ASM_OPERANDS gives the C values of these probe arguments, + and appears in the input operand list of the asm statement. For example: + + asm ("someinsn %0,%1\n" // %0 is output operand, %1 is input operand + STAP_PROBE_ASM(provider, fooprobe, STAP_PROBE_ASM_TEMPLATE(3)) + "otherinsn %[namedarg]" + : "r" (outvar) + : "g" (some_value), [namedarg] "i" (1234), + STAP_PROBE_ASM_OPERANDS(3, some_value, some_ptr->field, 1234)); + + This is just like writing: + + STAP_PROBE3(provider, fooprobe, some_value, some_ptr->field, 1234); + + but the probe site is right between "someinsn" and "otherinsn". + + The probe arguments in STAP_PROBE_ASM can be given as assembly + operands instead, even inside a GNU C extended asm statement. + Note that these can use operand templates like %0 or %[name], + and likewise they must write %%reg for a literal operand of %reg. */ + +#define _SDT_ASM_BODY_1(p,n,...) _SDT_ASM_BODY(p,n,_SDT_ASM_SUBSTR,(__VA_ARGS__)) +#define _SDT_ASM_BODY_2(p,n,...) _SDT_ASM_BODY(p,n,/*_SDT_ASM_STRING */,__VA_ARGS__) +#define _SDT_ASM_BODY_N2(p,n,no,...) _SDT_ASM_BODY_ ## no(p,n,__VA_ARGS__) +#define _SDT_ASM_BODY_N1(p,n,no,...) _SDT_ASM_BODY_N2(p,n,no,__VA_ARGS__) +#define _SDT_ASM_BODY_N(p,n,...) _SDT_ASM_BODY_N1(p,n,_SDT_NARG(0, __VA_ARGS__),__VA_ARGS__) + +#if __STDC_VERSION__ >= 199901L +# define STAP_PROBE_ASM(provider, name, ...) \ + _SDT_ASM_BODY_N(provider, name, __VA_ARGS__) \ + _SDT_ASM_BASE +# define STAP_PROBE_ASM_OPERANDS(n, ...) _SDT_ASM_OPERANDS_##n(__VA_ARGS__) +#else +# define STAP_PROBE_ASM(provider, name, args) \ + _SDT_ASM_BODY(provider, name, /* _SDT_ASM_STRING */, (args)) \ + _SDT_ASM_BASE +#endif +#define STAP_PROBE_ASM_TEMPLATE(n) _SDT_ASM_TEMPLATE_##n,"use _SDT_ASM_TEMPLATE_" + + +/* DTrace compatible macro names.
*/ +#define DTRACE_PROBE(provider,probe) \ + STAP_PROBE(provider,probe) +#define DTRACE_PROBE1(provider,probe,parm1) \ + STAP_PROBE1(provider,probe,parm1) +#define DTRACE_PROBE2(provider,probe,parm1,parm2) \ + STAP_PROBE2(provider,probe,parm1,parm2) +#define DTRACE_PROBE3(provider,probe,parm1,parm2,parm3) \ + STAP_PROBE3(provider,probe,parm1,parm2,parm3) +#define DTRACE_PROBE4(provider,probe,parm1,parm2,parm3,parm4) \ + STAP_PROBE4(provider,probe,parm1,parm2,parm3,parm4) +#define DTRACE_PROBE5(provider,probe,parm1,parm2,parm3,parm4,parm5) \ + STAP_PROBE5(provider,probe,parm1,parm2,parm3,parm4,parm5) +#define DTRACE_PROBE6(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6) \ + STAP_PROBE6(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6) +#define DTRACE_PROBE7(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7) \ + STAP_PROBE7(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7) +#define DTRACE_PROBE8(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8) \ + STAP_PROBE8(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8) +#define DTRACE_PROBE9(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9) \ + STAP_PROBE9(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9) +#define DTRACE_PROBE10(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9,parm10) \ + STAP_PROBE10(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9,parm10) +#define DTRACE_PROBE11(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9,parm10,parm11) \ + STAP_PROBE11(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9,parm10,parm11) +#define DTRACE_PROBE12(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9,parm10,parm11,parm12) \ + STAP_PROBE12(provider,probe,parm1,parm2,parm3,parm4,parm5,parm6,parm7,parm8,parm9,parm10,parm11,parm12) + + +#endif /* sys/sdt.h */ diff --git a/tools/testing/selftests/bpf/test_bpftool_synctypes.py b/tools/testing/selftests/bpf/test_bpftool_synctypes.py index 6bf21e47882a..c0e7acd698ed 100755 --- a/tools/testing/selftests/bpf/test_bpftool_synctypes.py +++ b/tools/testing/selftests/bpf/test_bpftool_synctypes.py @@ -180,7 +180,7 @@ class FileExtractor(object): @enum_name: name of the enum to parse """ start_marker = re.compile(f'enum {enum_name} {{\n') - pattern = re.compile('^\s*(BPF_\w+),?$') + pattern = re.compile('^\s*(BPF_\w+),?(\s+/\*.*\*/)?$') end_marker = re.compile('^};') parser = BlockParser(self.reader) parser.search_block(start_marker) diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c index d6a1be4d8020..2ffa08198d1c 100644 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ b/tools/testing/selftests/bpf/test_cgroup_storage.c @@ -7,6 +7,7 @@ #include <sys/sysinfo.h> #include "bpf_rlimit.h" +#include "bpf_util.h" #include "cgroup_helpers.h" #include "testing_helpers.h" @@ -44,7 +45,7 @@ int main(int argc, char **argv) unsigned long long *percpu_value; int cpu, nproc; - nproc = get_nprocs_conf(); + nproc = bpf_num_possible_cpus(); percpu_value = malloc(sizeof(*percpu_value) * nproc); if (!percpu_value) { printf("Not enough memory for per-cpu area (%d cpus)\n", nproc);
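The get_nprocs_conf() → bpf_num_possible_cpus() switch above is more than cosmetic: per-CPU map values are exchanged with the kernel as an array with one slot per *possible* CPU, which can exceed the configured or online count on systems with offline or sparsely numbered CPUs. Sized any smaller, the lookup overflows the buffer; a sketch with a placeholder map fd and key:

	int nproc = bpf_num_possible_cpus();	/* from bpf_util.h */
	__u64 *values = calloc(nproc, sizeof(*values));
	__u32 key = 0;

	/* the kernel writes one value per possible CPU into the buffer */
	if (values && bpf_map_lookup_elem(map_fd, &key, values) == 0) {
		for (int cpu = 0; cpu < nproc; cpu++)
			printf("cpu%d: %llu\n", cpu, values[cpu]);
	}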
diff --git a/tools/testing/selftests/bpf/test_offload.py b/tools/testing/selftests/bpf/test_offload.py index edaffd43da83..6cd6ef9fc20b 100755 --- a/tools/testing/selftests/bpf/test_offload.py +++ b/tools/testing/selftests/bpf/test_offload.py @@ -184,7 +184,7 @@ def bpftool_map_list(expected=None, ns=""): _, maps = bpftool("map show", JSON=True, ns=ns, fail=True) # Remove the base maps - maps = [m for m in maps if m not in base_maps and m.get('name') not in base_map_names] + maps = [m for m in maps if m not in base_maps and m.get('name') and m.get('name') not in base_map_names] if expected is not None: if len(maps) != expected: fail(True, "%d BPF maps loaded, expected %d" % diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 2ecb73a65206..0a4b45d7b515 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -761,8 +761,10 @@ int cd_flavor_subdir(const char *exec_name) const char *flavor = strrchr(exec_name, '/'); if (!flavor) - return 0; - flavor++; + flavor = exec_name; + else + flavor++; + flavor = strrchr(flavor, '-'); if (!flavor) return 0; diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index 93c1ff705533..eec4c7385b14 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -332,6 +332,8 @@ int trigger_module_test_write(int write_sz); #define SYS_NANOSLEEP_KPROBE_NAME "__x64_sys_nanosleep" #elif defined(__s390x__) #define SYS_NANOSLEEP_KPROBE_NAME "__s390x_sys_nanosleep" +#elif defined(__aarch64__) +#define SYS_NANOSLEEP_KPROBE_NAME "__arm64_sys_nanosleep" #else #define SYS_NANOSLEEP_KPROBE_NAME "sys_nanosleep" #endif diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 795b6798ccee..87867f7a78c3 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -60,7 +60,7 @@ int parse_num_list(const char *s, bool **num_set, int *num_set_len) set[i] = true; } - if (!set) + if (!set || parsing_end) return -EINVAL; *num_set = set; diff --git a/tools/testing/selftests/bpf/trace_helpers.c b/tools/testing/selftests/bpf/trace_helpers.c index 3d6217e3aff7..9c4be2cdb21a 100644 --- a/tools/testing/selftests/bpf/trace_helpers.c +++ b/tools/testing/selftests/bpf/trace_helpers.c @@ -25,15 +25,12 @@ static int ksym_cmp(const void *p1, const void *p2) int load_kallsyms(void) { - FILE *f = fopen("/proc/kallsyms", "r"); + FILE *f; char func[256], buf[256]; char symbol; void *addr; int i = 0; - if (!f) - return -ENOENT; - /* * This is called/used from multiple places, * load symbols just once.
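These two trace_helpers.c hunks are one logical fix: with fopen() at the top of load_kallsyms(), every call after the first opened /proc/kallsyms and then hit the `if (sym_cnt) return 0;` early exit below, leaking one FILE * per call. Opening the file only after the already-loaded check, as the second hunk does, is the shape to aim for:

	if (sym_cnt)	/* symbols already loaded, nothing to open */
		return 0;

	f = fopen("/proc/kallsyms", "r");	/* open only when actually needed */
	if (!f)
		return -ENOENT;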
@@ -41,6 +38,10 @@ int load_kallsyms(void) if (sym_cnt) return 0; + f = fopen("/proc/kallsyms", "r"); + if (!f) + return -ENOENT; + while (fgets(buf, sizeof(buf), f)) { if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3) break; diff --git a/tools/testing/selftests/bpf/urandom_read.c b/tools/testing/selftests/bpf/urandom_read.c index db781052758d..e92644d0fa75 100644 --- a/tools/testing/selftests/bpf/urandom_read.c +++ b/tools/testing/selftests/bpf/urandom_read.c @@ -1,32 +1,85 @@ +#include <stdbool.h> #include <stdio.h> #include <unistd.h> +#include <errno.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #include <stdlib.h> +#include <signal.h> + +#define _SDT_HAS_SEMAPHORES 1 +#include "sdt.h" + +#define SEC(name) __attribute__((section(name), used)) #define BUF_SIZE 256 +/* defined in urandom_read_aux.c */ +void urand_read_without_sema(int iter_num, int iter_cnt, int read_sz); +/* these are coming from urandom_read_lib{1,2}.c */ +void urandlib_read_with_sema(int iter_num, int iter_cnt, int read_sz); +void urandlib_read_without_sema(int iter_num, int iter_cnt, int read_sz); + +unsigned short urand_read_with_sema_semaphore SEC(".probes"); + static __attribute__((noinline)) void urandom_read(int fd, int count) { - char buf[BUF_SIZE]; - int i; + char buf[BUF_SIZE]; + int i; - for (i = 0; i < count; ++i) - read(fd, buf, BUF_SIZE); + for (i = 0; i < count; ++i) { + read(fd, buf, BUF_SIZE); + + /* trigger USDTs defined in executable itself */ + urand_read_without_sema(i, count, BUF_SIZE); + STAP_PROBE3(urand, read_with_sema, i, count, BUF_SIZE); + + /* trigger USDTs defined in shared lib */ + urandlib_read_without_sema(i, count, BUF_SIZE); + urandlib_read_with_sema(i, count, BUF_SIZE); + } +} + +static volatile bool parent_ready; + +static void handle_sigpipe(int sig) +{ + parent_ready = true; } int main(int argc, char *argv[]) { int fd = open("/dev/urandom", O_RDONLY); int count = 4; + bool report_pid = false; if (fd < 0) return 1; - if (argc == 2) + if (argc >= 2) count = atoi(argv[1]); + if (argc >= 3) { + report_pid = true; + /* install a SIGPIPE handler to catch when the parent closes its + * end of the pipe (on the other side of our stdout) + */ + signal(SIGPIPE, handle_sigpipe); + } + + /* report PID and wait for parent process to send us "signal" by + * closing stdout + */ + if (report_pid) { + while (!parent_ready) { + fprintf(stdout, "%d\n", getpid()); + fflush(stdout); + } + /* at this point stdout is closed, parent process knows our + * PID and is ready to trace us + */ + } urandom_read(fd, count); diff --git a/tools/testing/selftests/bpf/urandom_read_aux.c b/tools/testing/selftests/bpf/urandom_read_aux.c new file mode 100644 index 000000000000..6132edcfea74 --- /dev/null +++ b/tools/testing/selftests/bpf/urandom_read_aux.c @@ -0,0 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include "sdt.h" + +void urand_read_without_sema(int iter_num, int iter_cnt, int read_sz) +{ + /* semaphore-less USDT */ + STAP_PROBE3(urand, read_without_sema, iter_num, iter_cnt, read_sz); +} diff --git a/tools/testing/selftests/bpf/urandom_read_lib1.c b/tools/testing/selftests/bpf/urandom_read_lib1.c new file mode 100644 index 000000000000..86186e24b740 --- /dev/null +++ b/tools/testing/selftests/bpf/urandom_read_lib1.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
*/ +#define _SDT_HAS_SEMAPHORES 1 +#include "sdt.h" + +#define SEC(name) __attribute__((section(name), used)) + +unsigned short urandlib_read_with_sema_semaphore SEC(".probes"); + +void urandlib_read_with_sema(int iter_num, int iter_cnt, int read_sz) +{ + STAP_PROBE3(urandlib, read_with_sema, iter_num, iter_cnt, read_sz); +} diff --git a/tools/testing/selftests/bpf/urandom_read_lib2.c b/tools/testing/selftests/bpf/urandom_read_lib2.c new file mode 100644 index 000000000000..9d401ad9838f --- /dev/null +++ b/tools/testing/selftests/bpf/urandom_read_lib2.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */ +#include "sdt.h" + +void urandlib_read_without_sema(int iter_num, int iter_cnt, int read_sz) +{ + STAP_PROBE3(urandlib, read_without_sema, iter_num, iter_cnt, read_sz); +}
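Taken together, the pieces above close the loop: sdt.h lets an application define probes, usdt.bpf.h lets a BPF program consume them, and libbpf auto-attaches based on the section name. A condensed end-to-end sketch with illustrative provider/probe names (myapp:request is made up for this example):

	/* app.c -- built into the traced binary */
	#include "sdt.h"

	void process_request(int id, int len)
	{
		/* compiles to a nop plus an ELF note; near-zero cost untraced */
		STAP_PROBE2(myapp, request, id, len);
	}

	/* probe.bpf.c -- auto-attaches via its section name */
	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/usdt.bpf.h>

	SEC("usdt/./myapp:myapp:request")
	int BPF_USDT(handle_request, int id, int len)
	{
		bpf_printk("request %d len %d", id, len);
		return 0;
	}

	char _license[] SEC("license") = "GPL";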