Hi, Sergey!

Thanks for the patch! LGTM.

On 10.01.2025 16:07, Sergey Kaplun wrote:
From: Mike Pall <mike>

Reported by Peter Cawley.

(cherry picked from commit 644723649ea04cb23b72c814b88b72a29e4afed4)

The load fusing optimization doesn't take into account the presence of
`IR_NEWREF`, which may cause rehashing and deallocation of the array
part of the table. This may lead to incorrect memory accesses (e.g.
reads from the already deallocated array part) if the fusing
optimization occurs across this IR, leading to inconsistent behaviour
between the JIT and the VM.

This patch adds the corresponding check by refactoring the
`noconflict()` function -- it now accepts a `check` bit mask as the
last argument. The first bit enables the `IR_NEWREF` check, the
second enables the check for multiple references to the given
instruction.
Unfortunately, this commit drops the check for `table.clear()`
introduced in the patch before the previous one. Thus, the
corresponding test fails again. This will be fixed in the next commit.

Sergey Kaplun:
* added the description and the test for the problem

Part of tarantool/tarantool#10709
---
 src/lj_asm_x86.h                              | 17 +++---
 .../lj-1117-fuse-across-newref.test.lua       | 52 +++++++++++++++++++
 2 files changed, 61 insertions(+), 8 deletions(-)
 create mode 100644 test/tarantool-tests/lj-1117-fuse-across-newref.test.lua

diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h
index cba7ba80..d77087d6 100644
--- a/src/lj_asm_x86.h
+++ b/src/lj_asm_x86.h
@@ -109,7 +109,7 @@ static int asm_isk32(ASMState *as, IRRef ref, int32_t *k)
 /* Check if there's no conflicting instruction between curins and ref.
 ** Also avoid fusing loads if there are multiple references.
 */
-static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload)
+static int noconflict(ASMState *as, IRRef ref, IROp conflict, int check)
 {
   IRIns *ir = as->ir;
   IRRef i = as->curins;
@@ -118,7 +118,9 @@ static int noconflict(ASMState *as, IRRef ref, IROp conflict, int noload)
   while (--i > ref) {
     if (ir[i].o == conflict)
       return 0;  /* Conflict found. */
-    else if (!noload && (ir[i].op1 == ref || ir[i].op2 == ref))
+    else if ((check & 1) && ir[i].o == IR_NEWREF)
+      return 0;
+    else if ((check & 2) && (ir[i].op1 == ref || ir[i].op2 == ref))
       return 0;
   }
   return 1;  /* Ok, no conflict. */
@@ -134,7 +136,7 @@ static IRRef asm_fuseabase(ASMState *as, IRRef ref)
     lj_assertA(irb->op2 == IRFL_TAB_ARRAY, "expected FLOAD TAB_ARRAY");
     /* We can avoid the FLOAD of t->array for colocated arrays. */
     if (ira->o == IR_TNEW && ira->op1 <= LJ_MAX_COLOSIZE &&
-	!neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 1)) {
+	!neverfuse(as) && noconflict(as, irb->op1, IR_NEWREF, 0)) {
       as->mrm.ofs = (int32_t)sizeof(GCtab);  /* Ofs to colocated array. */
       return irb->op1;  /* Table obj. */
     }
@@ -448,7 +450,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
     RegSet xallow = (allow & RSET_GPR) ? allow : RSET_GPR;
     if (ir->o == IR_SLOAD) {
       if (!(ir->op2 & (IRSLOAD_PARENT|IRSLOAD_CONVERT)) &&
-	  noconflict(as, ref, IR_RETF, 0) &&
+	  noconflict(as, ref, IR_RETF, 2) &&
 	  !(LJ_GC64 && irt_isaddr(ir->t))) {
 	as->mrm.base = (uint8_t)ra_alloc1(as, REF_BASE, xallow);
 	as->mrm.ofs = 8*((int32_t)ir->op1-1-LJ_FR2) +
@@ -459,13 +461,12 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
     } else if (ir->o == IR_FLOAD) {
       /* Generic fusion is only ok for 32 bit operand (but see asm_comp). */
       if ((irt_isint(ir->t) || irt_isu32(ir->t) || irt_isaddr(ir->t)) &&
-	  noconflict(as, ref, IR_FSTORE, 0)) {
+	  noconflict(as, ref, IR_FSTORE, 2)) {
 	asm_fusefref(as, ir, xallow);
 	return RID_MRM;
       }
     } else if (ir->o == IR_ALOAD || ir->o == IR_HLOAD || ir->o == IR_ULOAD) {
-      if (noconflict(as, ref, ir->o + IRDELTA_L2S, 0) &&
-	  noconflict(as, ref, IR_CALLS, 1) &&  /* Don't cross table.clear. */
+      if (noconflict(as, ref, ir->o + IRDELTA_L2S, 2+(ir->o != IR_ULOAD)) &&
 	  !(LJ_GC64 && irt_isaddr(ir->t))) {
 	asm_fuseahuref(as, ir->op1, xallow);
 	return RID_MRM;
@@ -475,7 +476,7 @@ static Reg asm_fuseload(ASMState *as, IRRef ref, RegSet allow)
       ** Fusing unaligned memory operands is ok on x86 (except for SIMD types).
       */
       if ((!irt_typerange(ir->t, IRT_I8, IRT_U16)) &&
-	  noconflict(as, ref, IR_XSTORE, 0)) {
+	  noconflict(as, ref, IR_XSTORE, 2)) {
 	asm_fusexref(as, ir->op1, xallow);
 	return RID_MRM;
       }
diff --git a/test/tarantool-tests/lj-1117-fuse-across-newref.test.lua b/test/tarantool-tests/lj-1117-fuse-across-newref.test.lua
new file mode 100644
index 00000000..4b8772bf
--- /dev/null
+++ b/test/tarantool-tests/lj-1117-fuse-across-newref.test.lua
@@ -0,0 +1,52 @@
+local tap = require('tap')
+-- Test file to demonstrate LuaJIT's incorrect fusion across
+-- `IR_NEWREF`.
+-- See also: https://github.com/LuaJIT/LuaJIT/issues/1117.
+local test = tap.test('lj-1117-fuse-across-newref'):skipcond({
+  ['Test requires JIT enabled'] = not jit.status(),
+})
+
+local ffi = require('ffi')
+
+test:plan(1)
+
+-- Table with content.
+local tab = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 42}
+-- Use the alias to trick the code flow analysis.
+local tab_alias = tab
+local result_tab = {}
+
+-- Need to start recording the trace at the 16th iteration to
+-- avoid rehashing of `tab` and `result_tab` before the `if`
+-- condition below on the 32nd iteration. Also, the inner loop
+-- isn't recorded this way. After rehashing in the NEWREF, the
+-- fusion will use the wrong address, which leads to dirty reads
+-- visible (always, not flaky) under Valgrind with the
+-- `--free-fill` option set.
+jit.opt.start('hotloop=16')
+
+-- The number of iterations required for the rehashing of the
+-- table.
+for i = 1, 33 do
+  -- ALOAD to be fused.
+  local value = tab[16]
+  -- NEWREF instruction.
+  tab_alias[{}] = 100
+  -- Need this CONV cast to trigger load fusion. See `asm_comp()`
+  -- for the details. Before the patch, this fusion takes the
+  -- incorrect address of the already deallocated array part of
+  -- the table, which leads to the incorrect result.
+  result_tab[i] = ffi.cast('int64_t', value)
+  if i == 32 then
+    -- Clear the array part.
+    for j = 1, 15 do
+      tab[j] = nil
+    end
+    -- Next rehash of the `tab`/`tab_alias` will dealloc the array
+    -- part.
+  end
+end
+
+test:samevalues(result_tab, 'no fusion across NEWREF')
+
+test:done(true)