[Tarantool-patches] [PATCH luajit] ARM64: Fix LDP/STP fusing for unaligned accesses.
Sergey Kaplun
skaplun at tarantool.org
Thu Jun 26 18:12:24 MSK 2025
From: Mike Pall <mike>
Thanks to Peter Cawley.
(cherry picked from commit 0fa2f1cbcf023ad0549f1428809e506fa2c78552)
The arm64 emitting of load/store operation works incorrectly in the
case when at least one offset of load/store to be fused into ldp/stp is
misaligned. In this case this misaligning is ignored, and instructions
are fused, which leads to loading/storing from/to at least one incorrect
address.
For example, the following instructions:
| stur w0, [x1, #17]
| stur w0, [x1, #21]
May be fused to the following:
| stp w0, w0, [x1, #16]
This patch prevents fusion in this case by testing the alignment with
the help of bitwise ROR by the alignment value. In case of misaligned
offset, the value overflows the 7-bit length mask in the check.
The negative immediate (7-bit width including sign bit) is limited by
the corresponding addition of `64 << sc` (it is harmless in the case of
positive values).
Sergey Kaplun:
* added the description and the test for the problem
Part of tarantool/tarantool#11278
---
Related issues:
* https://github.com/LuaJIT/LuaJIT/issues/1056
* https://github.com/tarantool/tarantool/issues/11278
Branch: https://github.com/tarantool/luajit/tree/skaplun/lj-1056-arm64-ldp-sdp-misaligned-fusing
src/lj_emit_arm64.h | 2 +-
...6-arm64-ldp-sdp-misaligned-fusing.test.lua | 98 +++++++++++++++++++
2 files changed, 99 insertions(+), 1 deletion(-)
create mode 100644 test/tarantool-tests/lj-1056-arm64-ldp-sdp-misaligned-fusing.test.lua
diff --git a/src/lj_emit_arm64.h b/src/lj_emit_arm64.h
index 30cd3505..5c1bc372 100644
--- a/src/lj_emit_arm64.h
+++ b/src/lj_emit_arm64.h
@@ -142,7 +142,7 @@ static void emit_lso(ASMState *as, A64Ins ai, Reg rd, Reg rn, int64_t ofs)
} else {
goto nopair;
}
- if (ofsm >= (int)((unsigned int)-64<<sc) && ofsm <= (63<<sc)) {
+ if (lj_ror((unsigned int)ofsm + (64u<<sc), sc) <= 127u) {
*as->mcp = aip | A64F_N(rn) | (((ofsm >> sc) & 0x7f) << 15) |
(ai ^ ((ai == A64I_LDRx || ai == A64I_STRx) ? 0x50000000 : 0x90000000));
return;
diff --git a/test/tarantool-tests/lj-1056-arm64-ldp-sdp-misaligned-fusing.test.lua b/test/tarantool-tests/lj-1056-arm64-ldp-sdp-misaligned-fusing.test.lua
new file mode 100644
index 00000000..5d03097e
--- /dev/null
+++ b/test/tarantool-tests/lj-1056-arm64-ldp-sdp-misaligned-fusing.test.lua
@@ -0,0 +1,98 @@
+local tap = require('tap')
+local ffi = require('ffi')
+
+-- This test demonstrates LuaJIT's incorrect emitting of LDP/STP
+-- instructions from LDUR/STUR instructions with misaligned offset
+-- on arm64.
+-- See also https://github.com/LuaJIT/LuaJIT/issue/1056.
+local test = tap.test('lj-1056-arm64-ldp-sdp-misaligned-fusing'):skipcond({
+ ['Test requires JIT enabled'] = not jit.status(),
+})
+
+test:plan(6)
+
+-- Amount of iterations to compile and run the invariant part of
+-- the trace.
+local N_ITERATIONS = 4
+
+local EXPECTED = 42
+
+-- 9 bytes to make unaligned 4-byte access like buf + 5.
+local BUFLEN = 9
+local buf = ffi.new('unsigned char [' .. BUFLEN .. ']', 0)
+
+local function clear_buf()
+ ffi.fill(buf, ffi.sizeof(buf), 0)
+end
+
+-- Initialize the buffer with simple values.
+local function init_buf()
+ for i = 0, BUFLEN - 1 do
+ buf[i] = i
+ end
+end
+
+local function test_buf_content(expected_bytes, msg)
+ local got_bytes = {}
+ assert(#expected_bytes == BUFLEN, 'mismatched size of buffer and table')
+ for i = 0, BUFLEN - 1 do
+ got_bytes[i + 1] = buf[i]
+ end
+ test:is_deeply(got_bytes, expected_bytes, msg)
+end
+
+jit.opt.start('hotloop=1')
+
+-- Test stores.
+
+for _ = 1, N_ITERATIONS do
+ local ptr = ffi.cast('unsigned char *', buf)
+ -- These 2 accesses become ptr + 0 and ptr + 4 on the trace
+ -- before the patch.
+ ffi.cast('int32_t *', ptr + 1)[0] = EXPECTED
+ ffi.cast('int32_t *', ptr + 5)[0] = EXPECTED
+end
+test_buf_content({0, EXPECTED, 0, 0, 0, EXPECTED, 0, 0, 0},
+ 'pair of misaligned stores')
+
+clear_buf()
+
+for _ = 1, N_ITERATIONS do
+ local ptr = ffi.cast('unsigned char *', buf)
+ -- The next access becomes ptr + 4 on the trace before the
+ -- patch.
+ ffi.cast('int32_t *', ptr + 5)[0] = EXPECTED
+ ffi.cast('int32_t *', ptr)[0] = EXPECTED
+end
+test_buf_content({EXPECTED, 0, 0, 0, 0, EXPECTED, 0, 0, 0},
+ 'aligned / misaligned stores')
+
+-- Test loads.
+
+local resl, resr = 0, 0
+
+init_buf()
+
+for _ = 1, N_ITERATIONS do
+ local ptr = ffi.cast('unsigned char *', buf)
+ -- These 2 accesses become ptr + 0 and ptr + 4 on the trace
+ -- before the patch.
+ resl = ffi.cast('int32_t *', ptr + 1)[0]
+ resr = ffi.cast('int32_t *', ptr + 5)[0]
+end
+
+test:is(resl, 0x4030201, 'pair of misaligned loads, left')
+test:is(resr, 0x8070605, 'pair of misaligned loads, right')
+
+for _ = 1, N_ITERATIONS do
+ local ptr = ffi.cast('unsigned char *', buf)
+ -- The next access becomes ptr + 4 on the trace before the
+ -- patch.
+ resr = ffi.cast('int32_t *', ptr + 5)[0]
+ resl = ffi.cast('int32_t *', ptr)[0]
+end
+
+test:is(resl, 0x3020100, 'aligned / misaligned load, aligned')
+test:is(resr, 0x8070605, 'aligned / misaligned load, misaligned')
+
+test:done(true)
--
2.49.0
More information about the Tarantool-patches
mailing list