Hi, Sergey, thanks for the patch! Please see my comments. On 22.04.2024 11:49, Sergey Kaplun wrote: > From: Mike Pall > > Thanks to Sergey Kaplun and Peter Cawley. > > (cherry picked from commit d06beb0480c5d1eb53b3343e78063950275aa281) > > This commit is a follow-up for the commit > 1b8216023d5a79814389f1c1affef27c15d9de27 ("Throw any errors before stack > changes in trace stitching."). The patch prepends failures for the > specific error to be thrown. Nevertheless, the error may be thrown due > to retrying trace recording in the case when table bump optimization > is enabled or when OOM is observed during reallocation of the snapshot > or IR buffers. > > This patch adds the corresponding protected frame and rethrows the error > after a fixup of the stack. > > This patch also tests the correctness of copying the error message to > the top of the stack to get a valid "abort" reason in the `jit.dump` > utility. > > Also, this patch fixes a non-ASCII space character in the comment for > . 
> > Sergey Kaplun: > * added the description and the test for the problem > > Part of tarantool/tarantool#9924 > --- > src/lj_ffrecord.c | 21 ++++++-- > test/tarantool-tests/CMakeLists.txt | 1 + > .../lj-1166-error-stitch-oom-ir-buff.test.lua | 46 ++++++++++++++++ > ...j-1166-error-stitch-oom-snap-buff.test.lua | 54 +++++++++++++++++++ > .../lj-1166-error-stitch-table-bump.test.lua | 38 +++++++++++++ > .../lj-1166-error-stitch/CMakeLists.txt | 1 + > .../lj-1166-error-stitch/mockalloc.c | 51 ++++++++++++++++++ > .../lj-720-errors-before-stitch.test.lua | 40 +++++++++++++- > 8 files changed, 245 insertions(+), 7 deletions(-) > create mode 100644 test/tarantool-tests/lj-1166-error-stitch-oom-ir-buff.test.lua > create mode 100644 test/tarantool-tests/lj-1166-error-stitch-oom-snap-buff.test.lua > create mode 100644 test/tarantool-tests/lj-1166-error-stitch-table-bump.test.lua > create mode 100644 test/tarantool-tests/lj-1166-error-stitch/CMakeLists.txt > create mode 100644 test/tarantool-tests/lj-1166-error-stitch/mockalloc.c > > diff --git a/src/lj_ffrecord.c b/src/lj_ffrecord.c > index e3ed80fb..ff14e9e4 100644 > --- a/src/lj_ffrecord.c > +++ b/src/lj_ffrecord.c > @@ -96,6 +96,14 @@ static ptrdiff_t results_wanted(jit_State *J) > return -1; > } > > +static TValue *rec_stop_stitch_cp(lua_State *L, lua_CFunction dummy, void *ud) > +{ > + jit_State *J = (jit_State *)ud; > + lj_record_stop(J, LJ_TRLINK_STITCH, 0); > + UNUSED(L); UNUSED(dummy); > + return NULL; > +} > + > /* Trace stitching: add continuation below frame to start a new trace. */ > static void recff_stitch(jit_State *J) > { > @@ -106,10 +114,7 @@ static void recff_stitch(jit_State *J) > TValue *nframe = base + 1 + LJ_FR2; > const BCIns *pc = frame_pc(base-1); > TValue *pframe = frame_prevl(base-1); > - > - /* Check for this now. Throwing in lj_record_stop messes up the stack. 
*/ > - if (J->cur.nsnap >= (MSize)J->param[JIT_P_maxsnap]) > - lj_trace_err(J, LJ_TRERR_SNAPOV); > + int errcode; > > /* Move func + args up in Lua stack and insert continuation. */ > memmove(&base[1], &base[-1-LJ_FR2], sizeof(TValue)*nslot); > @@ -134,13 +139,19 @@ static void recff_stitch(jit_State *J) > J->baseslot += 2 + LJ_FR2; > J->framedepth++; > > - lj_record_stop(J, LJ_TRLINK_STITCH, 0); > + errcode = lj_vm_cpcall(L, NULL, J, rec_stop_stitch_cp); > > /* Undo Lua stack changes. */ > memmove(&base[-1-LJ_FR2], &base[1], sizeof(TValue)*nslot); > setframe_pc(base-1, pc); > L->base -= 2 + LJ_FR2; > L->top -= 2 + LJ_FR2; > + > + if (errcode) { > + if (errcode == LUA_ERRRUN) > + copyTV(L, L->top-1, L->top + (1 + LJ_FR2)); > + lj_err_throw(L, errcode); /* Propagate errors. */ > + } > } > > /* Fallback handler for fast functions that are not recorded (yet). */ > diff --git a/test/tarantool-tests/CMakeLists.txt b/test/tarantool-tests/CMakeLists.txt > index 56660932..d7c96078 100644 > --- a/test/tarantool-tests/CMakeLists.txt > +++ b/test/tarantool-tests/CMakeLists.txt > @@ -39,6 +39,7 @@ add_subdirectory(lj-802-panic-at-mcode-protfail) > add_subdirectory(lj-flush-on-trace) > add_subdirectory(lj-1004-oom-error-frame) > add_subdirectory(lj-1066-fix-cur_L-after-coroutine-resume) > +add_subdirectory(lj-1166-error-stitch) > > # The part of the memory profiler toolchain is located in tools > # directory, jit, profiler, and bytecode toolchains are located > diff --git a/test/tarantool-tests/lj-1166-error-stitch-oom-ir-buff.test.lua b/test/tarantool-tests/lj-1166-error-stitch-oom-ir-buff.test.lua > new file mode 100644 > index 00000000..e3a5397d > --- /dev/null > +++ b/test/tarantool-tests/lj-1166-error-stitch-oom-ir-buff.test.lua > @@ -0,0 +1,46 @@ > +local tap = require('tap') > + > +-- Test file to demonstrate unbalanced Lua stack after instruction > +-- recording due to throwing an error at recording of a stitched > +-- function. 
> +-- See also:https://github.com/LuaJIT/LuaJIT/issues/1166. > + > +local test = tap.test('lj-1166-error-stitch-oom-snap-buff'):skipcond({ Should the name in tap.test match the test file name? Currently it does not. > + ['Test requires JIT enabled'] = not jit.status(), > + ['Disabled on *BSD due to #4819'] = jit.os == 'BSD', > +}) > + > +test:plan(1) > + > +local mockalloc = require('mockalloc') > + > +local function create_chunk(n_slots) I would add a comment like this: --- a/test/tarantool-tests/lj-1166-error-stitch-oom-ir-buff.test.lua +++ b/test/tarantool-tests/lj-1166-error-stitch-oom-ir-buff.test.lua @@ -14,6 +14,18 @@ test:plan(1)  local mockalloc = require('mockalloc') +-- Generate a Lua chunk like below: +-- local s1 +-- local s2 +-- ... +-- local sN +-- for i = 1, 2 do +--   s1 = i + 1 +--   s2 = i + 2 +--   ... +--   sN = i + N +--   math.modf(1) +-- end  local function create_chunk(n_slots)    local chunk = ''    for i = 1, n_slots do > + local chunk = '' > + for i = 1, n_slots do > + chunk = chunk .. ('local s%d\n'):format(i) > + end > + chunk = chunk .. 'for i = 1, 2 do\n' > + -- Generate additional IR instructions. > + for i = 1, n_slots do > + chunk = chunk .. (' s%d = i + %d\n'):format(i, i) > + end > + -- `math.modf()` recording is NYI. > + chunk = chunk .. ' math.modf(1)\n' > + chunk = chunk .. 'end\n' > + return chunk > +end > + > +-- XXX: amount of slots is empirical. 
> +local tracef = assert(loadstring(create_chunk(175))) > + > +jit.opt.start('hotloop=1', '-loop', '-fold') > + > +mockalloc.mock() > + > +tracef() > + > +mockalloc.unmock() > + > +test:ok(true, 'stack is balanced') > + > +test:done(true) > diff --git a/test/tarantool-tests/lj-1166-error-stitch-oom-snap-buff.test.lua b/test/tarantool-tests/lj-1166-error-stitch-oom-snap-buff.test.lua > new file mode 100644 > index 00000000..8d671f8d > --- /dev/null > +++ b/test/tarantool-tests/lj-1166-error-stitch-oom-snap-buff.test.lua > @@ -0,0 +1,54 @@ > +local tap = require('tap') > + > +-- Test file to demonstrate unbalanced Lua stack after instruction > +-- recording due to throwing an error at recording of a stitched > +-- function. > +-- See also:https://github.com/LuaJIT/LuaJIT/issues/1166. > + > +local test = tap.test('lj-1166-error-stitch-oom-snap-buff'):skipcond({ > + ['Test requires JIT enabled'] = not jit.status(), > + ['Disabled on *BSD due to #4819'] = jit.os == 'BSD', > +}) > + > +test:plan(1) > + > +local mockalloc = require('mockalloc') > + > +local function create_chunk(n_conds) The same as above: please add a comment with an example of the generated Lua chunk. > + local chunk = '' > + chunk = chunk .. 'for i = 1, 2 do\n' > + -- Each condition adds additional snapshot. > + for i = 1, n_conds do > + chunk = chunk .. (' if i < %d then end\n'):format(i + n_conds) > + end > + -- `math.modf()` recording is NYI. > + chunk = chunk .. ' math.modf(1)\n' > + chunk = chunk .. 'end\n' > + return chunk > +end > + > +-- XXX: Need to compile the cycle in the `create_chunk()` to > +-- preallocate the snapshot buffer. > +jit.opt.start('hotloop=1', '-loop', '-fold') > + > +-- XXX: Amount of slots is empirical. > +local tracef = assert(loadstring(create_chunk(6))) > + > +-- XXX: Remove previous trace. > +jit.off() > +jit.flush() > + > +-- XXX: Update hotcounts to avoid hash collisions. 
> +jit.opt.start('hotloop=1') > + > +jit.on() > + > +mockalloc.mock() > + > +tracef() > + > +mockalloc.unmock() > + > +test:ok(true, 'stack is balanced') > + > +test:done(true) > diff --git a/test/tarantool-tests/lj-1166-error-stitch-table-bump.test.lua b/test/tarantool-tests/lj-1166-error-stitch-table-bump.test.lua > new file mode 100644 > index 00000000..f2453bbe > --- /dev/null > +++ b/test/tarantool-tests/lj-1166-error-stitch-table-bump.test.lua This test does not fail after reverting the patch. > @@ -0,0 +1,38 @@ > +local tap = require('tap') > + > +-- Test file to demonstrate unbalanced Lua stack after instruction > +-- recording due to throwing an error at recording of a stitched > +-- function. The test fails with LUAJIT_ENABLE_TABLE_BUMP enabled. > +-- See also: > +-- *https://github.com/LuaJIT/LuaJIT/issues/606, > +-- *https://github.com/LuaJIT/LuaJIT/issues/1166. > + > +local test = tap.test('lj-1166-error-stitch-table-bump'):skipcond({ > + ['Test requires JIT enabled'] = not jit.status(), > +}) > + > +test:plan(1) > + > +-- `math.modf` recording is NYI. > +-- Local `modf` simplifies `jit.dump()` output. > +local modf = math.modf > + > +jit.opt.start('hotloop=1') > + > +-- luacheck: no unused > +local t > +-- There is no need to run the trace itself. Just check the > +-- correctness of a recording. > +for i = 1, 2 do > + t = {} > + -- Cause table rehashing to trigger table bump optimization. > + t[i] = i > + -- Forcify stitch. This will throw an error at the end of > + -- recording, since trace recording should be retried after > + -- bytecode updating. 
> + modf(1) > +end > + > +test:ok(true, 'stack is balanced') > + > +test:done(true) > diff --git a/test/tarantool-tests/lj-1166-error-stitch/CMakeLists.txt b/test/tarantool-tests/lj-1166-error-stitch/CMakeLists.txt > new file mode 100644 > index 00000000..1ebf253b > --- /dev/null > +++ b/test/tarantool-tests/lj-1166-error-stitch/CMakeLists.txt > @@ -0,0 +1 @@ > +BuildTestCLib(mockalloc mockalloc.c) > diff --git a/test/tarantool-tests/lj-1166-error-stitch/mockalloc.c b/test/tarantool-tests/lj-1166-error-stitch/mockalloc.c > new file mode 100644 > index 00000000..d6d3492e > --- /dev/null > +++ b/test/tarantool-tests/lj-1166-error-stitch/mockalloc.c > @@ -0,0 +1,51 @@ > +#include "lua.h" > +#include "lauxlib.h" > + > +#undef NDEBUG > +#include <assert.h> > + > +static lua_Alloc old_allocf = NULL; > +static void *old_alloc_state = NULL; > + > +/* Function to be used instead of the default allocator. */ > +static void *mock_allocf(void *ud, void *ptr, size_t osize, size_t nsize) > +{ > + assert(old_allocf != NULL); > + /* > + * Check the specific reallocation related to the IR > + * buffer or the snapshot buffer. > + */ > + if (osize * 2 == nsize) > + return NULL; > + return old_allocf(ud, ptr, osize, nsize); > +} > + > +static int mock(lua_State *L) It is actually not a test mock. According to the definition [1], a test mock imitates the behavior of a real object. Your memory allocator behaves like a real allocator, but in some cases it returns NULL instead of a memory address. What if we rename "mock" to "allocator with fault injection"? 1. 
https://www.martinfowler.com/articles/mocksArentStubs.html > +{ > + assert(old_allocf == NULL); > + old_allocf = lua_getallocf(L, &old_alloc_state); > + lua_setallocf(L, mock_allocf, old_alloc_state); > + return 0; > +} > + > +static int unmock(lua_State *L) > +{ > + assert(old_allocf != NULL); > + assert(old_allocf != mock_allocf); > + lua_setallocf(L, old_allocf, old_alloc_state); > + old_allocf = NULL; > + old_alloc_state = NULL; > + return 0; > +} > + > +static const struct luaL_Reg mockalloc[] = { > + {"mock", mock}, > + {"unmock", unmock}, > + {NULL, NULL} > +}; > + > +LUA_API int luaopen_mockalloc(lua_State *L) > +{ > + luaL_register(L, "mockalloc", mockalloc); > + return 1; > +} > diff --git a/test/tarantool-tests/lj-720-errors-before-stitch.test.lua b/test/tarantool-tests/lj-720-errors-before-stitch.test.lua > index d750b721..6e8f70c2 100644 > --- a/test/tarantool-tests/lj-720-errors-before-stitch.test.lua > +++ b/test/tarantool-tests/lj-720-errors-before-stitch.test.lua > @@ -1,13 +1,27 @@ > local tap = require('tap') > local test = tap.test('lj-720-errors-before-stitch'):skipcond({ > ['Test requires JIT enabled'] = not jit.status(), > + ['Disabled on *BSD due to #4819'] = jit.os == 'BSD', > }) > -test:plan(1) > > --- `math.modf` recording is NYI. > +local jparse = require('utils').jit.parse > + > +-- `math.modf` recording is NYI. > -- Local `modf` simplifies `jit.dump()` output. > local modf = math.modf > + > +-- XXX: Avoid other traces compilation due to hotcount collisions > +-- for predictable results. > +jit.off() > +jit.flush() > + > +test:plan(2) > + > +-- We only need the abort reason in the test. > +jparse.start('t') > + > jit.opt.start('hotloop=1', 'maxsnap=1') > +jit.on() > > -- The loop has only two iterations: the first to detect its > -- hotness and the second to record it. 
The snapshot limit is > @@ -17,5 +31,27 @@ for _ = 1, 2 do > modf(1.2) > end > > +local _, aborted_traces = jparse.finish() > + > +jit.off() > + > test:ok(true, 'stack is balanced') > + > +-- Tarantool may compile traces on the startup. These traces > +-- already exceed the maximum snapshot amount we set after they > +-- are compiled. Hence, there is no need to reallocate the > +-- snapshot buffer, so the check for the snap size is not > +-- triggered. > +test:skipcond({ > + -- luacheck: no global > + ['Impossible to predict the number of snapshots for Tarantool'] = _TARANTOOL, > +}) > + > +assert(aborted_traces and aborted_traces[1], 'aborted trace is persisted') > + > +-- We tried to compile only one trace. > +local reason = aborted_traces[1][1].abort_reason > + > +test:like(reason, 'too many snapshots', 'abort reason is correct') > + > test:done(true)