* [Tarantool-patches] [PATCH luajit] ARM64: Fix LDP/STP fusion (again).
@ 2025-08-27 9:17 Sergey Kaplun via Tarantool-patches
0 siblings, 0 replies; only message in thread
From: Sergey Kaplun via Tarantool-patches @ 2025-08-27 9:17 UTC (permalink / raw)
To: Sergey Bronnikov; +Cc: tarantool-patches
From: Mike Pall <mike>
Reported and analyzed by Zhongwei Yao. Fix by Peter Cawley.
(cherry picked from commit b8c6ccd50c61b7a2df5123ddc5a85ac7d089542b)
Assume we have stores/loads from the pointer with offset +488 and -16.
The lower bits of the offset are the same as for the offset (488 + 8).
This leads to the incorrect fusion of these instructions:
| str x20, [x21, 488]
| stur x20, [x21, -16]
to the following instruction:
| stp x20, x20, [x21, 488]
This patch prevents this fusion by more accurate offset comparison.
Sergey Kaplun:
* added the description and the test for the problem
Part of tarantool/tarantool#11691
---
Branch: https://github.com/tarantool/luajit/tree/skaplun/lj-1075-arm64-incorrect-ldp-stp-fusion
Related issues:
* https://github.com/tarantool/tarantool/issues/11691
* https://github.com/LuaJIT/LuaJIT/issues/1075
src/lj_emit_arm64.h | 17 ++-
...75-arm64-incorrect-ldp-stp-fusion.test.lua | 129 ++++++++++++++++++
2 files changed, 142 insertions(+), 4 deletions(-)
create mode 100644 test/tarantool-tests/lj-1075-arm64-incorrect-ldp-stp-fusion.test.lua
diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h
index 5c1bc372..9dd92c40 100644
--- a/src/lj_emit_arm64.h
+++ b/src/lj_emit_arm64.h
@@ -121,6 +121,17 @@ static int emit_checkofs(A64Ins ai, int64_t ofs)
}
}
+static LJ_AINLINE uint32_t emit_lso_pair_candidate(A64Ins ai, int ofs, int sc)
+{
+ if (ofs >= 0) {
+ return ai | A64F_U12(ofs>>sc); /* Subsequent lj_ror checks ofs. */
+ } else if (ofs >= -256) {
+ return (ai^A64I_LS_U) | A64F_S9(ofs & 0x1ff);
+ } else {
+ return A64F_D(31); /* Will mismatch prev. */
+ }
+}
+
static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
{
int ot = emit_checkofs(ai, ofs), sc = (ai >> 30) & 3;
@@ -132,11 +143,9 @@ static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
uint32_t prev = *as->mcp & ~A64F_D(31);
int ofsm = ofs - (1<<sc), ofsp = ofs + (1<<sc);
A64Ins aip;
- if (prev == (ai | A64F_N(rn) | A64F_U12(ofsm>>sc)) ||
- prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsm&0x1ff))) {
+ if (prev == emit_lso_pair_candidate(ai | A64F_N(rn), ofsm, sc)) {
aip = (A64F_A(rd) | A64F_D(*as->mcp & 31));
- } else if (prev == (ai | A64F_N(rn) | A64F_U12(ofsp>>sc)) ||
- prev == ((ai^A64I_LS_U) | A64F_N(rn) | A64F_S9(ofsp&0x1ff))) {
+ } else if (prev == emit_lso_pair_candidate(ai | A64F_N(rn), ofsp, sc)) {
aip = (A64F_D(rd) | A64F_A(*as->mcp & 31));
ofsm = ofs;
} else {
diff --git a/test/tarantool-tests/lj-1075-arm64-incorrect-ldp-stp-fusion.test.lua b/test/tarantool-tests/lj-1075-arm64-incorrect-ldp-stp-fusion.test.lua
new file mode 100644
index 00000000..c84c3b23
--- /dev/null
+++ b/test/tarantool-tests/lj-1075-arm64-incorrect-ldp-stp-fusion.test.lua
@@ -0,0 +1,129 @@
+local tap = require('tap')
+local ffi = require('ffi')
+
+-- This test demonstrates LuaJIT's incorrect emitting of LDP/STP
+-- instruction fused from LDR/STR with negative offset and
+-- positive offset with the same lower bits on arm64.
+-- See also https://github.com/LuaJIT/LuaJIT/pull/1075.
+local test = tap.test('lj-1075-arm64-incorrect-ldp-stp-fusion'):skipcond({
+ ['Test requires JIT enabled'] = not jit.status(),
+})
+
+test:plan(6)
+
+-- Amount of iterations to compile and run the invariant part of
+-- the trace.
+local N_ITERATIONS = 4
+
+local EXPECTED = 42
+
+-- 4 slots of redzone for int64_t load/store.
+local REDZONE = 4
+local MASK_IMM7 = 0x7f
+local BUFLEN = (MASK_IMM7 + REDZONE) * 4
+local buf = ffi.new('unsigned char [' .. BUFLEN .. ']', 0)
+
+local function clear_buf()
+ ffi.fill(buf, ffi.sizeof(buf), 0)
+end
+
+-- Initialize the buffer with simple values.
+local function init_buf()
+ -- Limit to fill the buffer. 0 in the top part helps
+ -- to detect the issue.
+ local LIMIT = BUFLEN - 12
+ for i = 0, LIMIT - 1 do
+ buf[i] = i
+ end
+ for i = LIMIT, BUFLEN - 1 do
+ buf[i] = 0
+ end
+end
+
+jit.opt.start('hotloop=2')
+
+-- Assume we have stores/loads from the pointer with offset
+-- +488 and -16. The lower 7 bits of the offset (-16) >> 2 are
+-- 1111100. These bits are the same as for the offset (488 + 8).
+-- Thus, before the patch, these two instructions:
+-- | str x20, [x21, #488]
+-- | stur x20, [x21, #-16]
+-- are incorrectly fused to the:
+-- | stp x20, x20, [x21, #488]
+
+-- Test stores.
+
+local start = ffi.cast('unsigned char *', buf)
+-- Use constants to allow optimization to take place.
+local base_ptr = start + 16
+for _ = 1, N_ITERATIONS do
+ -- Save the result only for the last iteration.
+ clear_buf()
+ -- These 2 accesses become `base_ptr + 488` and `base_ptr + 496`
+ -- on the trace before the patch.
+ ffi.cast('uint64_t *', base_ptr + 488)[0] = EXPECTED
+ ffi.cast('uint64_t *', base_ptr - 16)[0] = EXPECTED
+end
+
+test:is(buf[488 + 16], EXPECTED, 'correct store top value')
+test:is(buf[0], EXPECTED, 'correct store bottom value')
+
+-- Test loads.
+
+init_buf()
+
+local top, bottom
+for _ = 1, N_ITERATIONS do
+ -- These 2 accesses become `base_ptr + 488` and `base_ptr + 496`
+ -- on the trace before the patch.
+ top = ffi.cast('uint64_t *', base_ptr + 488)[0]
+ bottom = ffi.cast('uint64_t *', base_ptr - 16)[0]
+end
+
+test:is(top, 0xfffefdfcfbfaf9f8ULL, 'correct load top value')
+test:is(bottom, 0x706050403020100ULL, 'correct load bottom value')
+
+-- Another reproducer that is based on the snapshot restoring.
+-- Its advantage is avoiding FFI usage.
+
+-- Snapshot slots are restored in the reversed order.
+-- The recording order is the following (from the bottom of the
+-- trace to the top):
+-- - 0th (ofs == -16) -- `f64()` replaced the `tail64()` on the
+-- stack,
+-- - 63rd (ofs == 488) -- 1,
+-- - 64th (ofs == 496) -- 2.
+-- At recording, the instructions for the 0th and 63rd slots are
+-- merged like the following:
+-- | str x3, [x19, #496]
+-- | stp x2, x1, [x19, #488]
+-- The first store is dominated by the stp, so the restored value
+-- is incorrect.
+
+-- Function with 63 slots on the stack.
+local function f63()
+ -- 61 unused slots to avoid extra stores in between.
+ -- luacheck: no unused
+ local _, _, _, _, _, _, _, _, _, _
+ local _, _, _, _, _, _, _, _, _, _
+ local _, _, _, _, _, _, _, _, _, _
+ local _, _, _, _, _, _, _, _, _, _
+ local _, _, _, _, _, _, _, _, _, _
+ local _, _, _, _, _, _, _, _, _, _
+ local _
+ return 1, 2
+end
+
+local function tail63()
+ return f63()
+end
+
+-- Record the trace.
+tail63()
+tail63()
+-- Run the trace.
+local one, two = tail63()
+test:is(one, 1, 'correct 1st value on stack')
+test:is(two, 2, 'correct 2nd value on stack')
+
+test:done(true)
--
2.51.0
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2025-08-27 9:16 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-08-27 9:17 [Tarantool-patches] [PATCH luajit] ARM64: Fix LDP/STP fusion (again) Sergey Kaplun via Tarantool-patches
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox