Hello, Sergey!

Thanks for the patch! See a comment below.

Sergey

On 6/10/25 13:28, Sergey Kaplun wrote:

From: Mike Pall <mike>

Reported by Junlong Li. Fixed by Peter Cawley.

(cherry picked from commit 86e7123bb1782a5f200ba5e83b8c4f3fbad4f7bc)

This patch is a follow-up to the previous commit, which leads to a dirty
read of the pseudo-valid PC set for the cframe on snapshot restoration.
To avoid these dirty reads, this patch sets the PC to the outer frame
as well before possible error throwing.

Sergey Kaplun:
* added the description and the test for the problem

Part of tarantool/tarantool#11278
---
 src/lj_snap.c                                 |  3 +-
 src/lj_trace.c                                |  4 +-
 ...-1196-stack-overflow-snap-restore.test.lua | 65 +++++++++++++++++++
 3 files changed, 68 insertions(+), 4 deletions(-)
 create mode 100644 test/tarantool-tests/lj-1196-stack-overflow-snap-restore.test.lua

diff --git a/src/lj_snap.c b/src/lj_snap.c
index 8d7bd868..4cfae579 100644
--- a/src/lj_snap.c
+++ b/src/lj_snap.c
@@ -955,7 +955,8 @@ const BCIns *lj_snap_restore(jit_State *J, void *exptr)
   lua_State *L = J->L;
 
   /* Set interpreter PC to the next PC to get correct error messages. */
-  setcframe_pc(cframe_raw(L->cframe), pc+1);
+  setcframe_pc(L->cframe, pc+1);
+  setcframe_pc(cframe_raw(cframe_prev(L->cframe)), pc);
 
   /* Make sure the stack is big enough for the slots from the snapshot. */
   if (LJ_UNLIKELY(L->base + snap->topslot >= tvref(L->maxstack))) {
diff --git a/src/lj_trace.c b/src/lj_trace.c
index 8a18d3cf..0d1d233a 100644
--- a/src/lj_trace.c
+++ b/src/lj_trace.c
@@ -909,10 +909,8 @@ int LJ_FASTCALL lj_trace_exit(jit_State *J, void *exptr)
   exd.J = J;
   exd.exptr = exptr;
   errcode = lj_vm_cpcall(L, NULL, &exd, trace_exit_cp);
-  if (errcode) {
-    setcframe_pc(cframe_raw(L->cframe), L);  /* Point to any valid memory. */
+  if (errcode)
     return -errcode;  /* Return negated error code. */
-  }
 
   if (exitcode) copyTV(L, L->top++, &exiterr);  /* Anchor the error object. */
 
diff --git a/test/tarantool-tests/lj-1196-stack-overflow-snap-restore.test.lua b/test/tarantool-tests/lj-1196-stack-overflow-snap-restore.test.lua
new file mode 100644
index 00000000..942d1f82
--- /dev/null
+++ b/test/tarantool-tests/lj-1196-stack-overflow-snap-restore.test.lua
@@ -0,0 +1,65 @@
+local tap = require('tap')
+
+-- Test file to demonstrate LuaJIT dirty reads after stack
+-- overflow during restoration from the snapshot.
+-- The test fails before the patch under Valgrind.

Please specify valgrind option that is required for reproducing the bug.

Cannot reproduce with command line below with reverted patch:

VALGRIND_OPTS="--leak-check=no --malloc-fill=0x00 --free-fill=0x00" ctest -V -R test/tarantool-tests/lj-1196-partial-snap-restore.test.lua -V

+--
+-- luacheck: push no max_comment_line_length
+--
+-- See also: https://github.com/LuaJIT/LuaJIT/issues/1196,
+-- https://www.freelists.org/post/luajit/Invalid-read-found-by-valgrind.
+--
+-- luacheck: pop
+
+local test = tap.test('lj-1196-stack-overflow-snap-restore')
+
+test:plan(4)
+
+-- XXX: This file has the same tests as the
+-- <test/LuaJIT-tests/lang/stackov.lua>, but without disabling the
+-- compilation for the given functions. Hence, the check here is
+-- less strict -- we just check that there are no dirty reads,
+-- uninitialized memory access, etc.
+
+local function recursive_f_noarg()
+  recursive_f_noarg()
+end
+
+local function recursive_one_arg(argument)
+  recursive_one_arg(argument)
+end
+
+local function recursive_f_vararg(...)
+  recursive_f_vararg(1, ...)
+end
+
+local function recursive_f_vararg_tail(...)
+  return recursive_f_vararg_tail(1, ...)
+end
+
+-- Use `coroutine.wrap()`, for independent stack sizes.
+-- The invalid read is done by the error handler
+-- `debug.traceback()`, since it observes the pseudo PC (`L`) and
+-- reads the memory by `L - 4` address before the patch.
+
+coroutine.wrap(function()
+  local status = xpcall(recursive_f_noarg, debug.traceback)
+  test:ok(not status, 'correct status, recursive no arguments')
+end)()
+
+coroutine.wrap(function()
+  local status = xpcall(recursive_one_arg, debug.traceback, 1)
+  test:ok(not status, 'correct status, recursive one argument')
+end)()
+
+coroutine.wrap(function()
+  local status = xpcall(recursive_f_vararg, debug.traceback, 1)
+  test:ok(not status, 'correct status, recursive vararg')
+end)()
+
+coroutine.wrap(function()
+  local status = xpcall(recursive_f_vararg_tail, debug.traceback, 1)
+  test:ok(not status, 'correct status, recursive vararg tail')
+end)()
+
+test:done(true)