From: Leon Hwang <hffilwlqm@gmail.com>
To: bpf@vger.kernel.org
Cc: ast@kernel.org, daniel@iogearbox.net, andrii@kernel.org,
maciej.fijalkowski@intel.com, jakub@cloudflare.com,
pulehui@huawei.com, hffilwlqm@gmail.com,
kernel-patches-bot@fb.com
Subject: [PATCH bpf-next v4 3/5] bpf, x64: Fix tailcall hierarchy
Date: Thu, 9 May 2024 23:05:39 +0800 [thread overview]
Message-ID: <20240509150541.81799-4-hffilwlqm@gmail.com> (raw)
In-Reply-To: <20240509150541.81799-1-hffilwlqm@gmail.com>
This patch fixes a tailcall issue caused by abusing the tailcall in
bpf2bpf feature.
As we know, tail_call_cnt propagates by rax from caller to callee when
to call subprog in tailcall context. But, like the following example,
MAX_TAIL_CALL_CNT won't work because of missing tail_call_cnt
back-propagation from callee to caller.
\#include <linux/bpf.h>
\#include <bpf/bpf_helpers.h>
\#include "bpf_legacy.h"
struct {
__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
__uint(max_entries, 1);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(__u32));
} jmp_table SEC(".maps");
int count = 0;
static __noinline
int subprog_tail1(struct __sk_buff *skb)
{
bpf_tail_call_static(skb, &jmp_table, 0);
return 0;
}
static __noinline
int subprog_tail2(struct __sk_buff *skb)
{
bpf_tail_call_static(skb, &jmp_table, 0);
return 0;
}
SEC("tc")
int entry(struct __sk_buff *skb)
{
volatile int ret = 1;
count++;
subprog_tail1(skb);
subprog_tail2(skb);
return ret;
}
char __license[] SEC("license") = "GPL";
At run time, the tail_call_cnt in entry() will be propagated to
subprog_tail1() and subprog_tail2(). But, when the tail_call_cnt in
subprog_tail1() updates when bpf_tail_call_static(), the tail_call_cnt
in entry() won't be updated at the same time. As a result, in entry(),
when tail_call_cnt in entry() is less than MAX_TAIL_CALL_CNT and
subprog_tail1() returns because of MAX_TAIL_CALL_CNT limit,
bpf_tail_call_static() in suprog_tail2() is able to run because the
tail_call_cnt in subprog_tail2() propagated from entry() is less than
MAX_TAIL_CALL_CNT.
So, how many tailcalls are there for this case if no error happens?
From top-down view, does it look like hierarchy layer and layer?
With view, there will be 2+4+8+...+2^33 = 2^34 - 2 = 17,179,869,182
tailcalls for this case.
How about there are N subprog_tail() in entry()? There will be almost
N^34 tailcalls.
Then, in this patch, it resolves this case on x86_64.
In stead of propagating tail_call_cnt from caller to callee, it
propagate its pointer, tail_call_cnt_ptr, tcc_ptr for short.
However, where does it store tail_call_cnt?
It stores tail_call_cnt on the stack of bpf prog's caller by the way in
previous patch "bpf: Introduce bpf_jit_supports_tail_call_cnt_ptr()".
Then, in bpf prog's prologue, it loads tcc_ptr from bpf_tail_call_run_ctx,
and restores the original ctx from bpf_tail_call_run_ctx meanwhile.
Then, when a tailcall runs, it compares tail_call_cnt accessed by
tcc_ptr with MAX_TAIL_CALL_CNT and then increments tail_call_cnt at
tcc_ptr.
Furthermore, when trampoline is the caller of bpf prog, it is required
to prepare tail_call_cnt and tail call run ctx on the stack of the
trampoline.
Finally, enable bpf_jit_supports_tail_call_cnt_ptr() to use
bpf_tail_call_run_ctx in __bpf_prog_run().
Fixes: ebf7d1f508a7 ("bpf, x64: rework pro/epilogue and tailcall handling in JIT")
Fixes: e411901c0b77 ("bpf: allow for tailcalls in BPF subprograms for x64 JIT")
Signed-off-by: Leon Hwang <hffilwlqm@gmail.com>
---
arch/x86/net/bpf_jit_comp.c | 101 ++++++++++++++++++++++++------------
include/linux/bpf.h | 2 +
2 files changed, 70 insertions(+), 33 deletions(-)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index ff217cc35ce92..43dc628e66222 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -273,7 +273,7 @@ struct jit_context {
/* Number of bytes emit_patch() needs to generate instructions */
#define X86_PATCH_SIZE 5
/* Number of bytes that will be skipped on tailcall */
-#define X86_TAIL_CALL_OFFSET (11 + ENDBR_INSN_SIZE)
+#define X86_TAIL_CALL_OFFSET (16 + ENDBR_INSN_SIZE)
static void push_r12(u8 **pprog)
{
@@ -420,14 +420,17 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
*/
emit_nops(&prog, X86_PATCH_SIZE);
if (!ebpf_from_cbpf) {
- if (tail_call_reachable && !is_subprog)
- /* When it's the entry of the whole tailcall context,
- * zeroing rax means initialising tail_call_cnt.
- */
- EMIT2(0x31, 0xC0); /* xor eax, eax */
- else
+ if (tail_call_reachable && !is_subprog) {
+ /* Store tcc_ptr to rax. */
+ /* mov rax, qword ptr [rdi + 8] */
+ EMIT4(0x48, 0x8B, 0x47, 0x08);
+ /* Restore the original ctx. */
+ /* mov rdi, qword ptr [rdi] */
+ EMIT3(0x48, 0x8B, 0x3F);
+ } else {
/* Keep the same instruction layout. */
- EMIT2(0x66, 0x90); /* nop2 */
+ emit_nops(&prog, 7);
+ }
}
/* Exception callback receives FP as third parameter */
if (is_exception_cb) {
@@ -453,6 +456,7 @@ static void emit_prologue(u8 **pprog, u32 stack_depth, bool ebpf_from_cbpf,
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xEC, round_up(stack_depth, 8));
if (tail_call_reachable)
+ /* Here, rax is tail_call_cnt_ptr. */
EMIT1(0x50); /* push rax */
*pprog = prog;
}
@@ -589,13 +593,15 @@ static void emit_return(u8 **pprog, u8 *ip)
*pprog = prog;
}
+#define BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack) (-8 - round_up(stack, 8))
+
/*
* Generate the following code:
*
* ... bpf_tail_call(void *ctx, struct bpf_array *array, u64 index) ...
* if (index >= array->map.max_entries)
* goto out;
- * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
+ * if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT)
* goto out;
* prog = array->ptrs[index];
* if (prog == NULL)
@@ -608,7 +614,7 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
u32 stack_depth, u8 *ip,
struct jit_context *ctx)
{
- int tcc_off = -4 - round_up(stack_depth, 8);
+ int tcc_ptr_off = BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth);
u8 *prog = *pprog, *start = *pprog;
int offset;
@@ -630,16 +636,15 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
EMIT2(X86_JBE, offset); /* jbe out */
/*
- * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
+ * if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
- EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
- EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
+ EMIT3_off32(0x48, 0x8B, 0x85, tcc_ptr_off); /* mov rax, qword ptr [rbp - tcc_ptr_off] */
+ EMIT3(0x83, 0x38, MAX_TAIL_CALL_CNT); /* cmp dword ptr [rax], MAX_TAIL_CALL_CNT */
offset = ctx->tail_call_indirect_label - (prog + 2 - start);
EMIT2(X86_JAE, offset); /* jae out */
- EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
- EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
+ EMIT3(0x83, 0x00, 0x01); /* add dword ptr [rax], 1 */
/* prog = array->ptrs[index]; */
EMIT4_off32(0x48, 0x8B, 0x8C, 0xD6, /* mov rcx, [rsi + rdx * 8 + offsetof(...)] */
@@ -663,6 +668,7 @@ static void emit_bpf_tail_call_indirect(struct bpf_prog *bpf_prog,
pop_r12(&prog);
}
+ /* pop tail_call_cnt_ptr */
EMIT1(0x58); /* pop rax */
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, /* add rsp, sd */
@@ -691,21 +697,20 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
bool *callee_regs_used, u32 stack_depth,
struct jit_context *ctx)
{
- int tcc_off = -4 - round_up(stack_depth, 8);
+ int tcc_ptr_off = BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack_depth);
u8 *prog = *pprog, *start = *pprog;
int offset;
/*
- * if (tail_call_cnt++ >= MAX_TAIL_CALL_CNT)
+ * if ((*tcc_ptr)++ >= MAX_TAIL_CALL_CNT)
* goto out;
*/
- EMIT2_off32(0x8B, 0x85, tcc_off); /* mov eax, dword ptr [rbp - tcc_off] */
- EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */
+ EMIT3_off32(0x48, 0x8B, 0x85, tcc_ptr_off); /* mov rax, qword ptr [rbp - tcc_ptr_off] */
+ EMIT3(0x83, 0x38, MAX_TAIL_CALL_CNT); /* cmp dword ptr [rax], MAX_TAIL_CALL_CNT */
offset = ctx->tail_call_direct_label - (prog + 2 - start);
EMIT2(X86_JAE, offset); /* jae out */
- EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */
- EMIT2_off32(0x89, 0x85, tcc_off); /* mov dword ptr [rbp - tcc_off], eax */
+ EMIT3(0x83, 0x00, 0x01); /* add dword ptr [rax], 1 */
poke->tailcall_bypass = ip + (prog - start);
poke->adj_off = X86_TAIL_CALL_OFFSET;
@@ -724,6 +729,7 @@ static void emit_bpf_tail_call_direct(struct bpf_prog *bpf_prog,
pop_r12(&prog);
}
+ /* pop tail_call_cnt_ptr */
EMIT1(0x58); /* pop rax */
if (stack_depth)
EMIT3_off32(0x48, 0x81, 0xC4, round_up(stack_depth, 8));
@@ -1314,8 +1320,8 @@ static void emit_shiftx(u8 **pprog, u32 dst_reg, u8 src_reg, bool is64, u8 op)
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
/* mov rax, qword ptr [rbp - rounded_stack_depth - 8] */
-#define RESTORE_TAIL_CALL_CNT(stack) \
- EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8)
+#define LOAD_TAIL_CALL_CNT_PTR(stack) \
+ EMIT3_off32(0x48, 0x8B, 0x85, BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack))
static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image,
int oldproglen, struct jit_context *ctx, bool jmp_padding)
@@ -2045,7 +2051,7 @@ st: if (is_imm8(insn->off))
func = (u8 *) __bpf_call_base + imm32;
if (tail_call_reachable) {
- RESTORE_TAIL_CALL_CNT(bpf_prog->aux->stack_depth);
+ LOAD_TAIL_CALL_CNT_PTR(bpf_prog->aux->stack_depth);
ip += 7;
}
if (!imm32)
@@ -2555,11 +2561,17 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
int run_ctx_off, bool save_ret,
void *image, void *rw_image)
{
- u8 *prog = *pprog;
- u8 *jmp_insn;
+ int ctx_tail_call_run_ctx_off = -run_ctx_off + offsetof(struct bpf_tramp_run_ctx,
+ tail_call_run_ctx);
+ int ctx_tcc_ptr_off = ctx_tail_call_run_ctx_off + offsetof(struct bpf_tail_call_run_ctx,
+ tail_call_cnt_ptr);
+ int ctx_tail_call_cnt_off = -run_ctx_off + offsetof(struct bpf_tramp_run_ctx,
+ tail_call_cnt);
int ctx_cookie_off = offsetof(struct bpf_tramp_run_ctx, bpf_cookie);
struct bpf_prog *p = l->link.prog;
u64 cookie = l->cookie;
+ u8 *prog = *pprog;
+ u8 *jmp_insn;
/* mov rdi, cookie */
emit_mov_imm64(&prog, BPF_REG_1, (long) cookie >> 32, (u32) (long) cookie);
@@ -2604,6 +2616,23 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
emit_mov_imm64(&prog, BPF_REG_2,
(long) p->insnsi >> 32,
(u32) (long) p->insnsi);
+ if (p->aux->use_tail_call_run_ctx) {
+ /* Cache the original ctx */
+ /* mov qword ptr [rbp - ctx_tail_call_run_ctx_off], rdi */
+ EMIT3_off32(0x48, 0x89, 0xBD, ctx_tail_call_run_ctx_off);
+ /* Make rdi as tcc_ptr */
+ /* lea rdi, [rbp - ctx_tail_call_cnt_off] */
+ EMIT3_off32(0x48, 0x8D, 0xBD, ctx_tail_call_cnt_off);
+ /* Clear tail_call_cnt */
+ /* mov dword ptr [rdi], 0 */
+ EMIT2_off32(0xC7, 0x07, 0x00);
+ /* Cache tcc_ptr */
+ /* mov qword ptr [rbp - ctx_tcc_ptr_off], rdi */
+ EMIT3_off32(0x48, 0x89, 0xBD, ctx_tcc_ptr_off);
+ /* Update rdi as tail call run ctx */
+ /* lea rdi, [rbp - ctx_tail_call_run_ctx_off] */
+ EMIT3_off32(0x48, 0x8D, 0xBD, ctx_tail_call_run_ctx_off);
+ }
/* call JITed bpf program or interpreter */
if (emit_rsb_call(&prog, p->bpf_func, image + (prog - (u8 *)rw_image)))
return -EINVAL;
@@ -2840,7 +2869,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
* [ ... ]
* [ stack_arg2 ]
* RBP - arg_stack_off [ stack_arg1 ]
- * RSP [ tail_call_cnt ] BPF_TRAMP_F_TAIL_CALL_CTX
+ * RSP [ tail_call_cnt_ptr ] BPF_TRAMP_F_TAIL_CALL_CTX
*/
/* room for return value of orig_call or fentry prog */
@@ -2969,10 +2998,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
save_args(m, &prog, arg_stack_off, true);
if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
- /* Before calling the original function, restore the
- * tail_call_cnt from stack to rax.
+ /* Before calling the original function, load the
+ * tail_call_cnt_ptr from stack to rax.
*/
- RESTORE_TAIL_CALL_CNT(stack_size);
+ LOAD_TAIL_CALL_CNT_PTR(stack_size);
}
if (flags & BPF_TRAMP_F_ORIG_STACK) {
@@ -3031,10 +3060,10 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *rw_im
goto cleanup;
}
} else if (flags & BPF_TRAMP_F_TAIL_CALL_CTX) {
- /* Before running the original function, restore the
- * tail_call_cnt from stack to rax.
+ /* Before running the original function, load the
+ * tail_call_cnt_ptr from stack to rax.
*/
- RESTORE_TAIL_CALL_CNT(stack_size);
+ LOAD_TAIL_CALL_CNT_PTR(stack_size);
}
/* restore return value of orig_call or fentry prog back into RAX */
@@ -3432,6 +3461,12 @@ bool bpf_jit_supports_subprog_tailcalls(void)
return true;
}
+/* Indicate the JIT backend supports tail call count pointer in tailcall context. */
+bool bpf_jit_supports_tail_call_cnt_ptr(void)
+{
+ return true;
+}
+
bool bpf_jit_supports_percpu_insn(void)
{
return true;
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 95888700966f7..94f994204acea 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2057,6 +2057,8 @@ struct bpf_tramp_run_ctx {
struct bpf_run_ctx run_ctx;
u64 bpf_cookie;
struct bpf_run_ctx *saved_run_ctx;
+ struct bpf_tail_call_run_ctx tail_call_run_ctx;
+ u32 tail_call_cnt;
};
static inline struct bpf_run_ctx *bpf_set_run_ctx(struct bpf_run_ctx *new_ctx)
--
2.44.0
next prev parent reply other threads:[~2024-05-09 15:06 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-05-09 15:05 [PATCH bpf-next v4 0/5] bpf: Fix tailcall hierarchy Leon Hwang
2024-05-09 15:05 ` [PATCH bpf-next v4 1/5] bpf, verifier: Correct tail_call_reachable when no tailcall in subprog Leon Hwang
2024-05-09 15:05 ` [PATCH bpf-next v4 2/5] bpf: Introduce bpf_jit_supports_tail_call_cnt_ptr() Leon Hwang
2024-05-09 15:05 ` Leon Hwang [this message]
2024-05-16 15:28 ` [PATCH bpf-next v4 3/5] bpf, x64: Fix tailcall hierarchy Leon Hwang
2024-05-16 18:56 ` Zvi Effron
2024-05-17 15:05 ` Leon Hwang
2024-05-09 15:05 ` [PATCH bpf-next v4 4/5] bpf, arm64: " Leon Hwang
2024-05-09 15:05 ` [PATCH bpf-next v4 5/5] selftests/bpf: Add testcases for tailcall hierarchy fixing Leon Hwang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20240509150541.81799-4-hffilwlqm@gmail.com \
--to=hffilwlqm@gmail.com \
--cc=andrii@kernel.org \
--cc=ast@kernel.org \
--cc=bpf@vger.kernel.org \
--cc=daniel@iogearbox.net \
--cc=jakub@cloudflare.com \
--cc=kernel-patches-bot@fb.com \
--cc=maciej.fijalkowski@intel.com \
--cc=pulehui@huawei.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).