Hi, Sergey,

thanks for the patch! LGTM

Sergey

On 6/12/25 12:12, Sergey Kaplun wrote:
> From: Mike Pall <mike>
>
> Reported by Sergey Kaplun.
>
> (cherry picked from commit cd4af8ad80bb6430ad2e547f7af236268c9be7d9)
>
> When restoring a snapshot because of exiting by the check of the stack,
> the snapshot from the parent trace is used. For the correct error
> message, the snapshot map uses the next bytecode after the stored one.
> In case, when the PC in the snapshot is BC_RET* (the last bytecode in
> the prototype), there is no next value to be referenced, so this
> approach leads to the assertion failure in `lj_debug_framepc()`.
>
> This patch prevents this behaviour by checking the original bytecode and
> avoiding PC adjusting in the case of the last bytecode in the prototype.
>
> Sergey Kaplun:
> * added the description and the test for the problem
>
> Part of tarantool/tarantool#11278
> ---
>
> Related issues:
> *https://github.com/LuaJIT/LuaJIT/issues/1359
> *https://github.com/tarantool/tarantool/issues/11278
> Branch:https://github.com/tarantool/luajit/tree/skaplun/lj-1359-bad-pc-on-snap-restore-stackov
>
>   src/lj_bc.h                                   |  5 ++
>   src/lj_parse.c                                | 14 +----
>   src/lj_snap.c                                 |  6 ++-
>   ...59-bad-pc-on-snap-restore-stackov.test.lua | 51 +++++++++++++++++++
>   4 files changed, 61 insertions(+), 15 deletions(-)
>   create mode 100644 test/tarantool-tests/lj-1359-bad-pc-on-snap-restore-stackov.test.lua
>
> diff --git a/src/lj_bc.h b/src/lj_bc.h
> index 69a45f28..2196dbed 100644
> --- a/src/lj_bc.h
> +++ b/src/lj_bc.h
> @@ -259,6 +259,11 @@ static LJ_AINLINE int bc_isret(BCOp op)
>     return (op == BC_RETM || op == BC_RET || op == BC_RET0 || op == BC_RET1);
>   }
>   
> +static LJ_AINLINE int bc_isret_or_tail(BCOp op)
> +{
> +  return (op == BC_CALLMT || op == BC_CALLT || bc_isret(op));
> +}
> +
>   LJ_DATA const uint16_t lj_bc_mode[];
>   LJ_DATA const uint16_t lj_bc_ofs[];
>   
> diff --git a/src/lj_parse.c b/src/lj_parse.c
> index ec85ac9b..88108c01 100644
> --- a/src/lj_parse.c
> +++ b/src/lj_parse.c
> @@ -1519,23 +1519,11 @@ static void fs_fixup_var(LexState *ls, GCproto *pt, uint8_t *p, size_t ofsvar)
>   
>   #endif
>   
> -/* Check if bytecode op returns. */
> -static int bcopisret(BCOp op)
> -{
> -  switch (op) {
> -  case BC_CALLMT: case BC_CALLT:
> -  case BC_RETM: case BC_RET: case BC_RET0: case BC_RET1:
> -    return 1;
> -  default:
> -    return 0;
> -  }
> -}
> -
>   /* Fixup return instruction for prototype. */
>   static void fs_fixup_ret(FuncState *fs)
>   {
>     BCPos lastpc = fs->pc;
> -  if (lastpc <= fs->lasttarget || !bcopisret(bc_op(fs->bcbase[lastpc-1].ins))) {
> +  if (lastpc <= fs->lasttarget || !bc_isret_or_tail(bc_op(fs->bcbase[lastpc-1].ins))) {
>       if ((fs->bl->flags & FSCOPE_UPVAL))
>         bcemit_AJ(fs, BC_UCLO, 0, 0);
>       bcemit_AD(fs, BC_RET0, 0, 1);  /* Need final return. */
> diff --git a/src/lj_snap.c b/src/lj_snap.c
> index 4cfae579..3542a46c 100644
> --- a/src/lj_snap.c
> +++ b/src/lj_snap.c
> @@ -954,8 +954,10 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr)
>     const BCIns *pc = snap_pc(&map[nent]);
>     lua_State *L = J->L;
>   
> -  /* Set interpreter PC to the next PC to get correct error messages. */
> -  setcframe_pc(L->cframe, pc+1);
> +  /* Set interpreter PC to the next PC to get correct error messages.
> +  ** But not for returns or tail calls, since pc+1 may be out-of-range.
> +  */
> +  setcframe_pc(L->cframe, bc_isret_or_tail(bc_op(*pc)) ? pc : pc+1);
>     setcframe_pc(cframe_raw(cframe_prev(L->cframe)), pc);
>   
>     /* Make sure the stack is big enough for the slots from the snapshot. */
> diff --git a/test/tarantool-tests/lj-1359-bad-pc-on-snap-restore-stackov.test.lua b/test/tarantool-tests/lj-1359-bad-pc-on-snap-restore-stackov.test.lua
> new file mode 100644
> index 00000000..d09b5c42
> --- /dev/null
> +++ b/test/tarantool-tests/lj-1359-bad-pc-on-snap-restore-stackov.test.lua
> @@ -0,0 +1,51 @@
> +local tap = require('tap')
> +
> +-- The test file to demonstrate the LuaJIT misbehaviour during
> +-- stack overflow on snapshot restoration for the last bytecode in
> +-- the prototype.
> +-- See also:https://github.com/LuaJIT/LuaJIT/issues/1359.
> +
> +local test = tap.test('lj-1359-bad-pc-on-snap-restore-stackov'):skipcond({
> +  ['Test requires JIT enabled'] = not jit.status(),
> +})
> +
> +test:plan(2)
> +
> +jit.opt.start('hotloop=1')
> +
> +-- When restoring a snapshot because of exiting by the check of
> +-- the stack, the snapshot from the parent trace is used. For the
> +-- correct error message, the snapshot map uses the next bytecode
> +-- after the stored one. In case, when the PC in the snapshot is
> +-- BC_RET* (the last bytecode in the prototype), there is no next
> +-- value to be referenced, so this approach leads to the assertion
> +-- failure in lj_debug_framepc().
> +
> +local counter = 0
> +-- The second trace started from the side exit by the counter.
> +-- It ends by entering the first trace.
> +local function func_side_trace()
> +  if counter > 5 then
> +    -- This RET0 BC is the first recorded for the second trace.
> +    -- The stack check for the child trace uses the first snapshot
> +    -- from the parent trace.
> +    return
> +  end
> +  counter = counter + 1;
> +end
> +
> +-- The trace with up-recursion for the function that causes stack
> +-- overflow. It is recorded first and inlines func_side_trace.
> +local function stackov_f()
> +  local r = stackov_f(func_side_trace())
> +  return r
> +end
> +
> +local result, errmsg = pcall(stackov_f)
> +
> +-- There is no assertion failure if we are here.
> +-- Just sanity checks.
> +test:ok(not result, 'correct status')
> +test:like(errmsg, 'stack overflow', 'correct error message')
> +
> +test:done(true)