Hi, Sergey, thanks for the patch! LGTM with two minor comments. Sergey On 5/30/26 19:04, Sergey Kaplun wrote: > From: Mike Pall > > Contributed by Peter Cawley. > > (cherry picked from commit 83954100dba9fc0cf5eeaf122f007df35ec9a604) > > This patch adds FFI support for passing small (< 8 bytes) parameters on > the stack on the OSX arm64 architecture. Also, it fixes the compilation > of FFI vararg functions, since before the patch arguments were passed in > registers instead of the stack [1] for them. JIT machinery now uses > `TREF_NIL` as a marker for the slot from which the variadic arguments > begin. > > [1]:https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms#Update-code-that-passes-arguments-to-variadic-functions > > Sergey Kaplun: > * added the description and the test for the problem > > Resolves tarantool/tarantool#6097 > Part of tarantool/tarantool#12480 > --- > src/lj_asm_arm64.h | 75 +++++++++++++++---- > src/lj_ccall.c | 11 ++- > src/lj_ccall.h | 6 ++ > src/lj_crecord.c | 27 +++++++ > test/tarantool-tests/ffi-ccall/CMakeLists.txt | 8 +- > test/tarantool-tests/ffi-ccall/libfficcall.c | 51 +++++++++++++ > .../gh-6097-arm64-osx-ffi-vararg.test.lua | 43 +++++++++++ > .../lj-205-arm64-osx-ffi-enum-arg.test.lua | 63 ++++++++++++++++ > .../lj-205-arm64-osx-ffi-small-arg.test.lua | 29 +++++++ > 9 files changed, 291 insertions(+), 22 deletions(-) > create mode 100644 test/tarantool-tests/gh-6097-arm64-osx-ffi-vararg.test.lua > create mode 100644 test/tarantool-tests/lj-205-arm64-osx-ffi-enum-arg.test.lua > create mode 100644 test/tarantool-tests/lj-205-arm64-osx-ffi-small-arg.test.lua > > diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h > index 313b4a96..f731ab05 100644 > --- a/src/lj_asm_arm64.h > +++ b/src/lj_asm_arm64.h > @@ -416,7 +416,7 @@ static int asm_fuseorshift(ASMState *as, IRIns *ir) > static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) > { > uint32_t n, nargs = CCI_XNARGS(ci); > - int32_t ofs 
= 0; > + int32_t spofs = 0, spalign = LJ_HASFFI && LJ_TARGET_OSX ? 0 : 7; > Reg gpr, fpr = REGARG_FIRSTFPR; > if ((void *)ci->func) > emit_call(as, (void *)ci->func); > @@ -435,8 +435,14 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) > fpr++; > } else { > Reg r = ra_alloc1(as, ref, RSET_FPR); > - emit_spstore(as, ir, r, ofs + ((LJ_BE && !irt_isnum(ir->t)) ? 4 : 0)); > - ofs += 8; > + int32_t al = spalign; > +#if LJ_HASFFI && LJ_TARGET_OSX > + al |= irt_isnum(ir->t) ? 7 : 3; > +#endif > + spofs = (spofs + al) & ~al; > + if (LJ_BE && al >= 7 && !irt_isnum(ir->t)) spofs += 4, al -= 4; > + emit_spstore(as, ir, r, spofs); > + spofs += al + 1; > } > } else { > if (gpr <= REGARG_LASTGPR) { > @@ -446,10 +452,27 @@ static void asm_gencall(ASMState *as, const CCallInfo *ci, IRRef *args) > gpr++; > } else { > Reg r = ra_alloc1(as, ref, RSET_GPR); > - emit_spstore(as, ir, r, ofs + ((LJ_BE && !irt_is64(ir->t)) ? 4 : 0)); > - ofs += 8; > + int32_t al = spalign; > +#if LJ_HASFFI && LJ_TARGET_OSX > + al |= irt_size(ir->t) - 1; > +#endif > + spofs = (spofs + al) & ~al; > + if (al >= 3) { > + if (LJ_BE && al >= 7 && !irt_is64(ir->t)) spofs += 4, al -= 4; > + emit_spstore(as, ir, r, spofs); > + } else { > + lj_assertA(al == 0 || al == 1, "size %d unexpected", al + 1); > + emit_lso(as, al ? A64I_STRH : A64I_STRB, r, RID_SP, spofs); > + } > + spofs += al + 1; > } > } > +#if LJ_HASFFI && LJ_TARGET_OSX > + } else { /* Marker for start of varargs. */ > + gpr = REGARG_LASTGPR+1; > + fpr = REGARG_LASTFPR+1; > + spalign = 7; > +#endif > } > } > } > @@ -1928,19 +1951,41 @@ static void asm_tail_prep(ASMState *as) > /* Ensure there are enough stack slots for call arguments. 
*/ > static Reg asm_setup_call_slots(ASMState *as, IRIns *ir, const CCallInfo *ci) > { > - IRRef args[CCI_NARGS_MAX*2]; > +#if LJ_HASFFI > uint32_t i, nargs = CCI_XNARGS(ci); > - int nslots = 0, ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR; > - asm_collectargs(as, ir, ci, args); > - for (i = 0; i < nargs; i++) { > - if (args[i] && irt_isfp(IR(args[i])->t)) { > - if (nfpr > 0) nfpr--; else nslots += 2; > - } else { > - if (ngpr > 0) ngpr--; else nslots += 2; > + if (nargs > (REGARG_NUMGPR < REGARG_NUMFPR ? REGARG_NUMGPR : REGARG_NUMFPR) || > + (LJ_TARGET_OSX && (ci->flags & CCI_VARARG))) { > + IRRef args[CCI_NARGS_MAX*2]; > + int ngpr = REGARG_NUMGPR, nfpr = REGARG_NUMFPR; > + int spofs = 0, spalign = LJ_TARGET_OSX ? 0 : 7, nslots; > + asm_collectargs(as, ir, ci, args); > + for (i = 0; i < nargs; i++) { > + int al = spalign; > + if (!args[i]) { > +#if LJ_TARGET_OSX > + /* Marker for start of varaargs. */ > + nfpr = 0; > + ngpr = 0; > + spalign = 7; > +#endif > + } else if (irt_isfp(IR(args[i])->t)) { > + if (nfpr > 0) { nfpr--; continue; } > +#if LJ_TARGET_OSX > + al |= irt_isnum(IR(args[i])->t) ? 7 : 3; > +#endif > + } else { > + if (ngpr > 0) { ngpr--; continue; } > +#if LJ_TARGET_OSX > + al |= irt_size(IR(args[i])->t) - 1; > +#endif > + } > + spofs = (spofs + 2*al+1) & ~al; /* Align and bump stack pointer. */ > } > + nslots = (spofs + 3) >> 2; > + if (nslots > as->evenspill) /* Leave room for args in stack slots. */ > + as->evenspill = nslots; > } > - if (nslots > as->evenspill) /* Leave room for args in stack slots. */ > - as->evenspill = nslots; > +#endif > return REGSP_HINT(RID_RET); > } > > diff --git a/src/lj_ccall.c b/src/lj_ccall.c > index 394255eb..b2705de5 100644 > --- a/src/lj_ccall.c > +++ b/src/lj_ccall.c > @@ -348,7 +348,6 @@ > goto done; \ > } else { \ > nfpr = CCALL_NARG_FPR; /* Prevent reordering. */ \ > - if (LJ_TARGET_OSX && d->size < 8) goto err_nyi; \ > } \ > } else { /* Try to pass argument in GPRs. 
*/ \ > if (!LJ_TARGET_OSX && (d->info & CTF_ALIGN) > CTALIGN_PTR) \ > @@ -359,7 +358,6 @@ > goto done; \ > } else { \ > ngpr = maxgpr; /* Prevent reordering. */ \ > - if (LJ_TARGET_OSX && d->size < 8) goto err_nyi; \ > } \ > } > > @@ -1027,7 +1025,7 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, > CCALL_HANDLE_STRUCTARG > } else if (ctype_iscomplex(d->info)) { > CCALL_HANDLE_COMPLEXARG > - } else { > + } else if (!(CCALL_PACK_STACKARG && ctype_isenum(d->info))) { > sz = CTSIZE_PTR; > } > n = (sz + CTSIZE_PTR-1) / CTSIZE_PTR; /* Number of GPRs or stack slots needed. */ > @@ -1037,12 +1035,12 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, > /* Otherwise pass argument on stack. */ > if (CCALL_ALIGN_STACKARG) { /* Align argument on stack. */ > MSize align = (1u << ctype_align(d->info)) - 1; > - if (rp) > + if (rp || (CCALL_PACK_STACKARG && isva && align < CTSIZE_PTR-1)) > align = CTSIZE_PTR-1; > nsp = (nsp + align) & ~align; > } > dp = ((uint8_t *)cc->stack) + nsp; > - nsp += n * CTSIZE_PTR; > + nsp += CCALL_PACK_STACKARG ? sz : n * CTSIZE_PTR; > if (nsp > CCALL_SIZE_STACK) { /* Too many arguments. */ > err_nyi: > lj_err_caller(L, LJ_ERR_FFI_NYICALL); > @@ -1057,7 +1055,8 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, > } > lj_cconv_ct_tv(cts, d, (uint8_t *)dp, o, CCF_ARG(narg)); > /* Extend passed integers to 32 bits at least. */ > - if (ctype_isinteger_or_bool(d->info) && d->size < 4) { > + if (ctype_isinteger_or_bool(d->info) && d->size < 4 && > + (!CCALL_PACK_STACKARG || !((uintptr_t)dp & 3))) { /* Assumes LJ_LE. */ > if (d->info & CTF_UNSIGNED) > *(uint32_t *)dp = d->size == 1 ? 
(uint32_t)*(uint8_t *)dp : > (uint32_t)*(uint16_t *)dp; > diff --git a/src/lj_ccall.h b/src/lj_ccall.h > index af7a8e84..10d93b65 100644 > --- a/src/lj_ccall.h > +++ b/src/lj_ccall.h > @@ -75,6 +75,9 @@ typedef union FPRArg { > #define CCALL_NARG_FPR 8 > #define CCALL_NRET_FPR 4 > #define CCALL_SPS_FREE 0 > +#if LJ_TARGET_OSX > +#define CCALL_PACK_STACKARG 1 > +#endif > > typedef intptr_t GPRArg; > typedef union FPRArg { > @@ -139,6 +142,9 @@ typedef union FPRArg { > #ifndef CCALL_ALIGN_STACKARG > #define CCALL_ALIGN_STACKARG 1 > #endif > +#ifndef CCALL_PACK_STACKARG > +#define CCALL_PACK_STACKARG 0 > +#endif > #ifndef CCALL_ALIGN_CALLSTATE > #define CCALL_ALIGN_CALLSTATE 8 > #endif > diff --git a/src/lj_crecord.c b/src/lj_crecord.c > index d486ee85..7d9421a6 100644 > --- a/src/lj_crecord.c > +++ b/src/lj_crecord.c > @@ -1122,6 +1122,12 @@ static TRef crec_call_args(jit_State *J, RecordFFData *rd, > ngpr = 1; > else if (ctype_cconv(info) == CTCC_FASTCALL) > ngpr = 2; > +#elif LJ_TARGET_ARM64 > +#if LJ_ABI_WIN > +#error "NYI: ARM64 Windows ABI calling conventions" > +#elif LJ_TARGET_OSX > + int ngpr = CCALL_NARG_GPR; > +#endif > #endif > > /* Skip initial attributes. */ > @@ -1147,6 +1153,14 @@ static TRef crec_call_args(jit_State *J, RecordFFData *rd, > } else { > if (!(info & CTF_VARARG)) > lj_trace_err(J, LJ_TRERR_NYICALL); /* Too many arguments. */ > +#if LJ_TARGET_ARM64 && LJ_TARGET_OSX > + if (ngpr >= 0) { > + ngpr = -1; > + args[n++] = TREF_NIL; /* Marker for start of varargs. */ > + if (n >= CCI_NARGS_MAX) > + lj_trace_err(J, LJ_TRERR_NYICALL); > + } > +#endif > did = lj_ccall_ctid_vararg(cts, o); /* Infer vararg type. 
*/ > } > d = ctype_raw(cts, did); > @@ -1155,6 +1169,15 @@ static TRef crec_call_args(jit_State *J, RecordFFData *rd, > lj_trace_err(J, LJ_TRERR_NYICALL); > tr = crec_ct_tv(J, d, 0, *base, o); > if (ctype_isinteger_or_bool(d->info)) { > +#if LJ_TARGET_ARM64 && LJ_TARGET_OSX > + if (!ngpr) { > + /* Fixed args passed on the stack use their unpromoted size. */ > + if (d->size != lj_ir_type_size[tref_type(tr)]) { > + lj_assertJ(d->size == 1 || d->size==2, "unexpected size %d", d->size); > + tr = emitconv(tr, d->size==1 ? IRT_U8 : IRT_U16, tref_type(tr), 0); > + } > + } else > +#endif > if (d->size < 4) { > if ((d->info & CTF_UNSIGNED)) > tr = emitconv(tr, IRT_INT, d->size==1 ? IRT_U8 : IRT_U16, 0); > @@ -1192,6 +1215,10 @@ static TRef crec_call_args(jit_State *J, RecordFFData *rd, > } > } > #endif > +#elif LJ_TARGET_ARM64 && LJ_TARGET_OSX > + if (!ctype_isfp(d->info) && ngpr) { > + ngpr--; > + } > #endif > args[n] = tr; > } > diff --git a/test/tarantool-tests/ffi-ccall/CMakeLists.txt b/test/tarantool-tests/ffi-ccall/CMakeLists.txt > index 8acd8fe4..27de07ac 100644 > --- a/test/tarantool-tests/ffi-ccall/CMakeLists.txt > +++ b/test/tarantool-tests/ffi-ccall/CMakeLists.txt > @@ -1 +1,7 @@ > -BuildTestCLib(libfficcall libfficcall.c ffi-ccall-arm64-fp-convention.test.lua) > +list(APPEND tests > + ffi-ccall-arm64-fp-convention.test.lua > + lj-205-arm64-osx-ffi-enum-arg.test.lua > + lj-205-arm64-osx-ffi-small-arg.test.lua > +) > + > +BuildTestCLib(libfficcall libfficcall.c "${tests}") > diff --git a/test/tarantool-tests/ffi-ccall/libfficcall.c b/test/tarantool-tests/ffi-ccall/libfficcall.c > index 6c23f7d1..fd2d4711 100644 > --- a/test/tarantool-tests/ffi-ccall/libfficcall.c > +++ b/test/tarantool-tests/ffi-ccall/libfficcall.c > @@ -1,3 +1,5 @@ > +#include > + > struct sz12_t { > float f1; > float f2; > @@ -26,3 +28,52 @@ struct sz12_t sum3sz12(struct sz12_t a, struct sz12_t b, struct sz12_t c) > res.f3 = a.f3 + b.f3 + c.f3; > return res; > } > + > 
+/****************************************************************/ > +/* Enums. */ > +/****************************************************************/ > + > +typedef enum { > + E1 = 1, > + E2 = 2, > + E3 = 3, > + E4 = 4, > + E5 = 5, > + E6 = 6, > + E7 = 7, > + E8 = 8, > + E9 = 9, > + E10 = 10, > + E11 = 11 > +} enum_t; > + > +int test_enum_reg(enum_t e1, enum_t e2, enum_t e3) > +{ > + return e1 + e2 + e3; > +} > + > +int test_enum_stack(enum_t e1, enum_t e2, enum_t e3, enum_t e4, enum_t e5, > + enum_t e6, enum_t e7, enum_t e8, enum_t e9, enum_t e10, > + enum_t e11) > +{ > + return e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11; > +} > + > +/****************************************************************/ > +/* Basic types (< 8 bytes). */ > +/****************************************************************/ > + > +uint8_t test_u8_stack(uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, > + uint8_t u5, uint8_t u6, uint8_t u7, uint8_t u8, > + uint8_t u9, uint8_t u10, uint8_t u11) > +{ > + return u1 + u2 + u3 + u4 + u5 + u6 + u7 + u8 + u9 + u10 + u11; > +} > + > +float test_float_stack(float f1, float f2, float f3, float f4, float f5, > + float f6, float f7, float f8, float f9, float f10, > + float f11) > +{ > + return f1 + f2 + f3 + f4 + f5 + f6 + f7 + f8 + f9 + f10 + f11; > +} > + This extra newline is not needed. > diff --git a/test/tarantool-tests/gh-6097-arm64-osx-ffi-vararg.test.lua b/test/tarantool-tests/gh-6097-arm64-osx-ffi-vararg.test.lua > new file mode 100644 > index 00000000..fc44d253 > --- /dev/null > +++ b/test/tarantool-tests/gh-6097-arm64-osx-ffi-vararg.test.lua > @@ -0,0 +1,43 @@ > +local tap = require('tap') > + > +-- The test file to demonstrate LuaJIT incorrect FFI vararg call > +-- on macOS M1. > +-- See also:https://github.com/tarantool/tarantool/issues/6097. 
> +local test = tap.test('gh-6097-arm64-osx-ffi-vararg'):skipcond({ > + ['Test requires JIT enabled'] = not jit.status(), > +}) > + > +test:plan(4) > + > +local ffi = require('ffi') > + > +ffi.cdef('int sprintf(char *str, const char *format, ...)') > + > +local EXPECTED = '1' > +local EXPECTED_LEN = #EXPECTED > + > +local str = ffi.new(string.format('char[256]')) > + > +jit.opt.start('hotloop=1') > + > +local results = {} > +for i = 1, 4 do > + local strlen = ffi.C.sprintf(str, '%d', 1LL) Honestly, I don't get why the resulting buffer is named "strlen". The same applies below. > + assert(strlen == EXPECTED_LEN, 'correct string length for result') > + results[i] = ffi.string(str) > +end > + > +test:is(results[1], EXPECTED, 'correct result of FFI vararg call for int') > +test:samevalues(results, 'consistent behaviour JIT and VM for vararg int arg') > + > +results = {} > +for i = 1, 4 do > + local strlen = ffi.C.sprintf(str, '%c', ffi.new('char', string.byte('1'))) > + assert(strlen == EXPECTED_LEN, 'correct string length for result') > + results[i] = ffi.string(str) > +end > + > +test:is(results[1], EXPECTED, 'correct result of FFI vararg call for char') > +test:samevalues(results, 'consistent behaviour JIT and VM for vararg char arg') > + > +test:done(true) > diff --git a/test/tarantool-tests/lj-205-arm64-osx-ffi-enum-arg.test.lua b/test/tarantool-tests/lj-205-arm64-osx-ffi-enum-arg.test.lua > new file mode 100644 > index 00000000..4ba4f69d > --- /dev/null > +++ b/test/tarantool-tests/lj-205-arm64-osx-ffi-enum-arg.test.lua > @@ -0,0 +1,63 @@ > +local ffi = require('ffi') > +local tap = require('tap') > + > +local ffi_ccall = ffi.load('libfficcall') > + > +-- The test file to check the FFI call for enum arguments. > +-- See also:https://github.com/LuaJIT/LuaJIT/issues/205. 
> +local test = tap.test('lj-205-arm64-osx-ffi-enum-arg'):skipcond({ > + ['Test requires JIT enabled'] = not jit.status(), > +}) > + > +test:plan(4) > + > +ffi.cdef[[ > + int sprintf(char *str, const char *format, ...); > + > + typedef enum { > + E1 = 1, > + E2 = 2, > + E3 = 3, > + E4 = 4, > + E5 = 5, > + E6 = 6, > + E7 = 7, > + E8 = 8, > + E9 = 9, > + E10 = 10, > + E11 = 11 > + } enum_t; > + > + int test_enum_reg(enum_t e1, enum_t e2, enum_t e3); > + > + int test_enum_stack(enum_t e1, enum_t e2, enum_t e3, enum_t e4, enum_t e5, > + enum_t e6, enum_t e7, enum_t e8, enum_t e9, enum_t e10, > + enum_t e11); > +]] > + > + > +local str = ffi.new(string.format('char[256]')) > + > +jit.opt.start('hotloop=1') > + > +local enum_t = ffi.typeof('enum_t') > + > +local results = {} > +for i = 1, 4 do > + local strlen = ffi.C.sprintf(str, '%d', enum_t(1)) > + assert(strlen == 1, 'correct string length for result') > + results[i] = ffi.string(str) > +end > + > +test:is(results[1], '1', 'correct result of FFI vararg call for enum') > +test:samevalues(results, 'consistent behaviour JIT and VM for vararg enum arg') > + > +test:is(ffi_ccall.test_enum_reg(enum_t(1), enum_t(2), enum_t(3)), 6, > + 'correct enum reg pass') > + > +test:is(ffi_ccall.test_enum_stack(enum_t(1), enum_t(2), enum_t(3), enum_t(4), > + enum_t(5), enum_t(6), enum_t(7), enum_t(8), > + enum_t(9), enum_t(10), enum_t(11)), > + 66, 'correct enum stack pass') > + > +test:done(true) > diff --git a/test/tarantool-tests/lj-205-arm64-osx-ffi-small-arg.test.lua b/test/tarantool-tests/lj-205-arm64-osx-ffi-small-arg.test.lua > new file mode 100644 > index 00000000..be60de93 > --- /dev/null > +++ b/test/tarantool-tests/lj-205-arm64-osx-ffi-small-arg.test.lua > @@ -0,0 +1,29 @@ > +local ffi = require('ffi') > +local tap = require('tap') > + > +local ffi_ccall = ffi.load('libfficcall') > + > +-- The test file to check the FFI call for small (<8 bytes) > +-- arguments give on stack. 
> +-- See also:https://github.com/LuaJIT/LuaJIT/issues/205. > +local test = tap.test('lj-205-arm64-osx-ffi-small-arg') > +test:plan(2) > + > +ffi.cdef[[ > + uint8_t test_u8_stack(uint8_t u1, uint8_t u2, uint8_t u3, uint8_t u4, > + uint8_t u5, uint8_t u6, uint8_t u7, uint8_t u8, > + uint8_t u9, uint8_t u10, uint8_t u11); > + > + float test_float_stack(float f1, float f2, float f3, float f4, float f5, > + float f6, float f7, float f8, float f9, float f10, > + float f11); > +]] > + > +test:is(ffi_ccall.test_u8_stack(1ULL, 2ULL, 3ULL, 4ULL, 5ULL, 6ULL, 7ULL, > + 8ULL, 9ULL, 10ULL, 11ULL), > + 66, 'correct uint8_t stack pass') > + > +test:is(ffi_ccall.test_float_stack(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), 66, > + 'correct float stack pass') > + > +test:done(true)