Hi, Sergey!

Thanks for the patch! LGTM.

On 10.01.2025 16:07, Sergey Kaplun wrote:
> From: Mike Pall <mike>
>
> Reported by Peter Cawley.
>
> (cherry picked from commit 644723649ea04cb23b72c814b88b72a29e4afed4)
>
> Load fusing optimization doesn't take into account the presence of
> `IR_NEWREF`, which may cause rehashing and deallocation of the array
> part of the table. This may lead to incorrect loads if the fusing
> optimization occurs across this IR, leading to inconsistent behaviour
> between the JIT and the VM.
>
> This patch adds the corresponding check by refactoring the
> `noconflict()` function -- it now accepts a `check` bitmask as the
> last argument. The first bit enables the `IR_NEWREF` check, the
> second enables the check for multiple references to the given
> instruction. Unfortunately, this commit drops the `table.clear()`
> check introduced two patches earlier. Thus, the corresponding test
> fails again. This will be fixed in the next commit.
>
> Sergey Kaplun:
> * added the description and the test for the problem
>
> Part of tarantool/tarantool#10709
> ---
>   src/lj_asm_x86.h                              | 17 +++---
>   .../lj-1117-fuse-across-newref.test.lua       | 52 +++++++++++++++++++
>   2 files changed, 61 insertions(+), 8 deletions(-)
>   create mode 100644 test/tarantool-tests/lj-1117-fuse-across-newref.test.lua
>
> diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h
> index cba7ba80..d77087d6 100644
> --- a/src/lj_asm_x86.h
> +++ b/src/lj_asm_x86.h
> @@ -109,7 +109,7 @@ static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
>   /* Check if there's no conflicting instruction between curins and ref.
>   ** Also avoid fusing loads if there are multiple references.
>   */
> -static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload)
> +static int noconflict(ASMState *as, IRRef ref, IROp conflict, int check)
>   {
>     IRIns *ir = as->ir;
>     IRRef i = as->curins;
> @@ -118,7 +118,9 @@ static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload)
>     while (--i > ref) {
>       if (ir[i].o == conflict)
>         return 0;  /* Conflict found. */
> -    else if (!noload && (ir[i].op1 == ref || ir[i].op2 == ref))
> +    else if ((check & 1) && ir[i].o == IR_NEWREF)
> +      return 0;
> +    else if ((check & 2) && (ir[i].op1 == ref || ir[i].op2 == ref))
>         return 0;
>     }
>     return 1;  /* Ok, no conflict. */
> @@ -134,7 +136,7 @@ static IRRef asm_fuseabase(ASMState *as, IRRef ref)
>       lj_assertA(irb->op2 == IRFL_TAB_ARRAY, "expected FLOAD TAB_ARRAY");
>       /* We can avoid the FLOAD of t->array for colocated arrays. */
>       if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE &&
> -	!neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 1)) {
> +	!neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 0)) {
>         as->mrm.ofs = (int32_t)sizeof(GCtab);  /* Ofs to colocated array. */
>         return irb->op1;  /* Table obj. */
>       }
> @@ -448,7 +450,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
>       RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR;
>       if (ir->o == IR_SLOAD) {
>         if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) &&
> -	  noconflict(as, ref, IR_RETF, 0) &&
> +	  noconflict(as, ref, IR_RETF, 2) &&
>   	  !(LJ_GC64 && irt_isaddr(ir->t))) {
>   	as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow);
>   	as->mrm.ofs = 8*((int32_t)ir->op1-1-LJ_FR2) +
> @@ -459,13 +461,12 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
>       } else if (ir->o == IR_FLOAD) {
>         /* Generic fusion is only ok for 32 bit operand (but see asm_comp). */
>         if ((irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t)) &&
> -	  noconflict(as, ref, IR_FSTORE, 0)) {
> +	  noconflict(as, ref, IR_FSTORE, 2)) {
>   	asm_fusefref(as, ir, xallow);
>   	return RID_MRM;
>         }
>       } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) {
> -      if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0) &&
> -	  noconflict(as, ref, IR_CALLS, 1) &&  /* Don't cross table.clear. */
> +      if (noconflict(as, ref, ir->o + IRDELTA_L2S, 2+(ir->o != IR_ULOAD)) &&
>   	  !(LJ_GC64 && irt_isaddr(ir->t))) {
>   	asm_fuseahuref(as, ir->op1, xallow);
>   	return RID_MRM;
> @@ -475,7 +476,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
>         ** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
>         */
>         if ((!irt_typerange(ir->t, IRT_I8, IRT_U16)) &&
> -	  noconflict(as, ref, IR_XSTORE, 0)) {
> +	  noconflict(as, ref, IR_XSTORE, 2)) {
>   	asm_fusexref(as, ir->op1, xallow);
>   	return RID_MRM;
>         }
> diff --git a/test/tarantool-tests/lj-1117-fuse-across-newref.test.lua b/test/tarantool-tests/lj-1117-fuse-across-newref.test.lua
> new file mode 100644
> index 00000000..4b8772bf
> --- /dev/null
> +++ b/test/tarantool-tests/lj-1117-fuse-across-newref.test.lua
> @@ -0,0 +1,52 @@
> +local tap = require('tap')
> +-- Test file to demonstrate LuaJIT's incorrect fusion across
> +-- `IR_NEWREF`.
> +-- See also: https://github.com/LuaJIT/LuaJIT/issues/1117.
> +local test = tap.test('lj-1117-fuse-across-newref'):skipcond({
> +  ['Test requires JIT enabled'] = not jit.status(),
> +})
> +
> +local ffi = require('ffi')
> +
> +test:plan(1)
> +
> +-- Table with content.
> +local tab = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 42}
> +-- Use the alias to trick the code flow analysis.
> +local tab_alias = tab
> +local result_tab = {}
> +
> +-- Need to start recording trace at the 16th iteration to avoid
> +-- rehashing of the `tab` and `result_tab` before the `if`
> +-- condition below on the 32nd iteration. Also, the inner loop
> +-- isn't recorded this way. After rehashing in the NEWREF, the
> +-- fusion will use the wrong address, which leads to the dirty
> +-- reads visible (always, not flaky) under Valgrind with the
> +-- `--free-fill` option set.
> +jit.opt.start('hotloop=16')
> +
> +-- The amount of iterations required for the rehashing of the
> +-- table.
> +for i = 1, 33 do
> +  -- ALOAD to be fused.
> +  local value = tab[16]
> +  -- NEWREF instruction.
> +  tab_alias[{}] = 100
> +  -- Need this CONV cast to trigger load fusion. See `asm_comp()`
> +  -- for the details. Before the patch, this fusion takes the
> +  -- incorrect address of the already deallocated array part of
> +  -- the table, which leads to the incorrect result.
> +  result_tab[i] = ffi.cast('int64_t', value)
> +  if i == 32 then
> +    -- Clear the array part.
> +    for j = 1, 15 do
> +      tab[j] = nil
> +    end
> +    -- Next rehash of the `tab`/`tab_alias` will dealloc the array
> +    -- part.
> +  end
> +end
> +
> +test:samevalues(result_tab, 'no fusion across NEWREF')
> +
> +test:done(true)