Thanks for the patch! LGTM On 5/30/26 19:04, Sergey Kaplun wrote: > From: Mike Pall > > (cherry picked from commit cf903edb30e0cbd620ebd4bac02d4e2b4410fd02) > > This patch refactors the FFI CCallState structure. The `nsp` field now > contains the number of bytes of the stack slot occupied. Also, it > effectively decreased the number of arguments for the callee to 31. > This patch is required for the next commit. > > Sergey Kaplun: > * added the description for the patch > > Part of tarantool/tarantool#12480 > --- > src/lj_ccall.c | 57 +++++++++++++++++++++++++--------------------- > src/lj_ccall.h | 7 +++--- > src/vm_arm.dasc | 8 +++---- > src/vm_arm64.dasc | 8 +++---- > src/vm_mips.dasc | 1 - > src/vm_mips64.dasc | 1 - > src/vm_ppc.dasc | 3 +-- > src/vm_x64.dasc | 8 +++---- > src/vm_x86.dasc | 22 +++++++++++------- > 9 files changed, 62 insertions(+), 53 deletions(-) > > diff --git a/src/lj_ccall.c b/src/lj_ccall.c > index c3b27572..394255eb 100644 > --- a/src/lj_ccall.c > +++ b/src/lj_ccall.c > @@ -20,12 +20,15 @@ > #if LJ_TARGET_X86 > /* -- x86 calling conventions --------------------------------------------- */ > > +#define CCALL_PUSH(arg) \ > + *(GPRArg *)((uint8_t *)cc->stack + nsp) = (GPRArg)(arg), nsp += CTSIZE_PTR > + > #if LJ_ABI_WIN > > #define CCALL_HANDLE_STRUCTRET \ > /* Return structs bigger than 8 by reference (on stack only). */ \ > cc->retref = (sz > 8); \ > - if (cc->retref) cc->stack[nsp++] = (GPRArg)dp; > + if (cc->retref) CCALL_PUSH(dp); > > #define CCALL_HANDLE_COMPLEXRET CCALL_HANDLE_STRUCTRET > > @@ -40,7 +43,7 @@ > if (ngpr < maxgpr) \ > cc->gpr[ngpr++] = (GPRArg)dp; \ > else \ > - cc->stack[nsp++] = (GPRArg)dp; \ > + CCALL_PUSH(dp); \ > } else { /* Struct with single FP field ends up in FPR. */ \ > cc->resx87 = ccall_classify_struct(cts, ctr); \ > } > @@ -56,7 +59,7 @@ > if (ngpr < maxgpr) \ > cc->gpr[ngpr++] = (GPRArg)dp; \ > else \ > - cc->stack[nsp++] = (GPRArg)dp; > + CCALL_PUSH(dp); > > #endif > > @@ -67,7 +70,7 @@ > if (ngpr < maxgpr) \ > cc->gpr[ngpr++] = (GPRArg)dp; \ > else \ > - cc->stack[nsp++] = (GPRArg)dp; \ > + CCALL_PUSH(dp); \ > } > > #endif > @@ -278,8 +281,8 @@ > if (ngpr < maxgpr) { \ > dp = &cc->gpr[ngpr]; \ > if (ngpr + n > maxgpr) { \ > - nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \ > - if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \ > + nsp += (ngpr + n - maxgpr) * CTSIZE_PTR; /* Assumes contiguous gpr/stack fields. */ \ > + if (nsp > CCALL_SIZE_STACK) goto err_nyi; /* Too many arguments. */ \ > ngpr = maxgpr; \ > } else { \ > ngpr += n; \ > @@ -471,8 +474,8 @@ > if (ngpr < maxgpr) { \ > dp = &cc->gpr[ngpr]; \ > if (ngpr + n > maxgpr) { \ > - nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \ > - if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \ > + nsp += (ngpr + n - maxgpr) * CTSIZE_PTR; /* Assumes contiguous gpr/stack fields. */ \ > + if (nsp > CCALL_SIZE_STACK) goto err_nyi; /* Too many arguments. */ \ > ngpr = maxgpr; \ > } else { \ > ngpr += n; \ > @@ -565,8 +568,8 @@ > if (ngpr < maxgpr) { \ > dp = &cc->gpr[ngpr]; \ > if (ngpr + n > maxgpr) { \ > - nsp += ngpr + n - maxgpr; /* Assumes contiguous gpr/stack fields. */ \ > - if (nsp > CCALL_MAXSTACK) goto err_nyi; /* Too many arguments. */ \ > + nsp += (ngpr + n - maxgpr) * CTSIZE_PTR; /* Assumes contiguous gpr/stack fields. */ \ > + if (nsp > CCALL_SIZE_STACK) goto err_nyi; /* Too many arguments. */ \ > ngpr = maxgpr; \ > } else { \ > ngpr += n; \ > @@ -698,10 +701,11 @@ static int ccall_struct_arg(CCallState *cc, CTState *cts, CType *d, int *rcl, > lj_cconv_ct_tv(cts, d, (uint8_t *)dp, o, CCF_ARG(narg)); > if (ccall_struct_reg(cc, cts, dp, rcl)) { > /* Register overflow? Pass on stack. */ > - MSize nsp = cc->nsp, n = rcl[1] ? 2 : 1; > - if (nsp + n > CCALL_MAXSTACK) return 1; /* Too many arguments. */ > - cc->nsp = nsp + n; > - memcpy(&cc->stack[nsp], dp, n*CTSIZE_PTR); > + MSize nsp = cc->nsp, sz = rcl[1] ? 2*CTSIZE_PTR : CTSIZE_PTR; > + if (nsp + sz > CCALL_SIZE_STACK) > + return 1; /* Too many arguments. */ > + cc->nsp = nsp + sz; > + memcpy((uint8_t *)cc->stack + nsp, dp, sz); > } > return 0; /* Ok. */ > } > @@ -1026,22 +1030,23 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, > } else { > sz = CTSIZE_PTR; > } > - sz = (sz + CTSIZE_PTR-1) & ~(CTSIZE_PTR-1); > - n = sz / CTSIZE_PTR; /* Number of GPRs or stack slots needed. */ > + n = (sz + CTSIZE_PTR-1) / CTSIZE_PTR; /* Number of GPRs or stack slots needed. */ > > CCALL_HANDLE_REGARG /* Handle register arguments. */ > > /* Otherwise pass argument on stack. */ > - if (CCALL_ALIGN_STACKARG && !rp && (d->info & CTF_ALIGN) > CTALIGN_PTR) { > - MSize align = (1u << ctype_align(d->info-CTALIGN_PTR)) -1; > - nsp = (nsp + align) & ~align; /* Align argument on stack. */ > + if (CCALL_ALIGN_STACKARG) { /* Align argument on stack. */ > + MSize align = (1u << ctype_align(d->info)) - 1; > + if (rp) > + align = CTSIZE_PTR-1; > + nsp = (nsp + align) & ~align; > } > - if (nsp + n > CCALL_MAXSTACK) { /* Too many arguments. */ > + dp = ((uint8_t *)cc->stack) + nsp; > + nsp += n * CTSIZE_PTR; > + if (nsp > CCALL_SIZE_STACK) { /* Too many arguments. */ > err_nyi: > lj_err_caller(L, LJ_ERR_FFI_NYICALL); > } > - dp = &cc->stack[nsp]; > - nsp += n; > isva = 0; > > done: > @@ -1103,10 +1108,10 @@ static int ccall_set_args(lua_State *L, CTState *cts, CType *ct, > #if LJ_TARGET_X64 || (LJ_TARGET_PPC && !LJ_ABI_SOFTFP) > cc->nfpr = nfpr; /* Required for vararg functions. */ > #endif > - cc->nsp = nsp; > - cc->spadj = (CCALL_SPS_FREE + CCALL_SPS_EXTRA)*CTSIZE_PTR; > - if (nsp > CCALL_SPS_FREE) > - cc->spadj += (((nsp-CCALL_SPS_FREE)*CTSIZE_PTR + 15u) & ~15u); > + cc->nsp = (nsp + CTSIZE_PTR-1) & ~(CTSIZE_PTR-1); > + cc->spadj = (CCALL_SPS_FREE + CCALL_SPS_EXTRA) * CTSIZE_PTR; > + if (cc->nsp > CCALL_SPS_FREE * CTSIZE_PTR) > + cc->spadj += (((cc->nsp - CCALL_SPS_FREE * CTSIZE_PTR) + 15u) & ~15u); > return gcsteps; > } > > diff --git a/src/lj_ccall.h b/src/lj_ccall.h > index 6efa48c7..af7a8e84 100644 > --- a/src/lj_ccall.h > +++ b/src/lj_ccall.h > @@ -152,14 +152,15 @@ typedef union FPRArg { > LJ_STATIC_ASSERT(CCALL_NUM_GPR <= CCALL_MAX_GPR); > LJ_STATIC_ASSERT(CCALL_NUM_FPR <= CCALL_MAX_FPR); > > -#define CCALL_MAXSTACK 32 > +#define CCALL_NUM_STACK 31 > +#define CCALL_SIZE_STACK (CCALL_NUM_STACK * CTSIZE_PTR) > > /* -- C call state -------------------------------------------------------- */ > > typedef LJ_ALIGN(CCALL_ALIGN_CALLSTATE) struct CCallState { > void (*func)(void); /* Pointer to called function. */ > uint32_t spadj; /* Stack pointer adjustment. */ > - uint8_t nsp; /* Number of stack slots. */ > + uint8_t nsp; /* Number of bytes on stack. */ > uint8_t retref; /* Return value by reference. */ > #if LJ_TARGET_X64 > uint8_t ngpr; /* Number of arguments in GPRs. */ > @@ -178,7 +179,7 @@ typedef LJ_ALIGN(CCALL_ALIGN_CALLSTATE) struct CCallState { > FPRArg fpr[CCALL_NUM_FPR]; /* Arguments/results in FPRs. */ > #endif > GPRArg gpr[CCALL_NUM_GPR]; /* Arguments/results in GPRs. */ > - GPRArg stack[CCALL_MAXSTACK]; /* Stack slots. */ > + GPRArg stack[CCALL_NUM_STACK]; /* Stack slots. */ > } CCallState; > > /* -- C call handling ----------------------------------------------------- */ > diff --git a/src/vm_arm.dasc b/src/vm_arm.dasc > index 628c1c24..7ed555f8 100644 > --- a/src/vm_arm.dasc > +++ b/src/vm_arm.dasc > @@ -2513,16 +2513,16 @@ static void build_subroutines(BuildCtx *ctx) > |.endif > | mov r11, sp > | sub sp, sp, CARG1 // Readjust stack. > - | subs CARG2, CARG2, #1 > + | subs CARG2, CARG2, #4 > |.if HFABI > | vldm RB, {d0-d7} > |.endif > | ldr RB, CCSTATE->func > | bmi >2 > |1: // Copy stack slots. > - | ldr CARG4, [CARG3, CARG2, lsl #2] > - | str CARG4, [sp, CARG2, lsl #2] > - | subs CARG2, CARG2, #1 > + | ldr CARG4, [CARG3, CARG2] > + | str CARG4, [sp, CARG2] > + | subs CARG2, CARG2, #4 > | bpl <1 > |2: > | ldrd CARG12, CCSTATE->gpr[0] > diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc > index c35eaf12..57131140 100644 > --- a/src/vm_arm64.dasc > +++ b/src/vm_arm64.dasc > @@ -2142,14 +2142,14 @@ static void build_subroutines(BuildCtx *ctx) > | ldr TMP0w,CCSTATE:x0->spadj > | ldrb TMP1w, CCSTATE->nsp > | add TMP2, CCSTATE, #offsetof(CCallState, stack) > - | subs TMP1, TMP1, #1 > + | subs TMP1, TMP1, #8 > | ldr TMP3, CCSTATE->func > | sub sp, sp, TMP0 > | bmi >2 > |1: // Copy stack slots > - | ldr TMP0, [TMP2, TMP1, lsl #3] > - | str TMP0, [sp, TMP1, lsl #3] > - | subs TMP1, TMP1, #1 > + | ldr TMP0, [TMP2, TMP1] > + | str TMP0, [sp, TMP1] > + | subs TMP1, TMP1, #8 > | bpl <1 > |2: > | ldp x0, x1, CCSTATE->gpr[0] > diff --git a/src/vm_mips.dasc b/src/vm_mips.dasc > index 52366b88..4db9308f 100644 > --- a/src/vm_mips.dasc > +++ b/src/vm_mips.dasc > @@ -2836,7 +2836,6 @@ static void build_subroutines(BuildCtx *ctx) > | move TMP2, sp > | subu sp, sp, TMP1 > | sw ra, -4(TMP2) > - | sll CARG2, CARG2, 2 > | sw r16, -8(TMP2) > | sw CCSTATE, -12(TMP2) > | move r16, TMP2 > diff --git a/src/vm_mips64.dasc b/src/vm_mips64.dasc > index c41b27f4..87e240f7 100644 > --- a/src/vm_mips64.dasc > +++ b/src/vm_mips64.dasc > @@ -2963,7 +2963,6 @@ static void build_subroutines(BuildCtx *ctx) > | move TMP2, sp > | dsubu sp, sp, TMP1 > | sd ra, -8(TMP2) > - | sll CARG2, CARG2, 3 > | sd r16, -16(TMP2) > | sd CCSTATE, -24(TMP2) > | move r16, TMP2 > diff --git a/src/vm_ppc.dasc b/src/vm_ppc.dasc > index edae7b98..23d6e316 100644 > --- a/src/vm_ppc.dasc > +++ b/src/vm_ppc.dasc > @@ -3275,14 +3275,13 @@ static void build_subroutines(BuildCtx *ctx) > | stw TMP0, 4(sp) > | cmpwi cr1, CARG3, 0 > | mr TMP2, sp > - | addic. CARG2, CARG2, -1 > + | addic. CARG2, CARG2, -4 > | stwux sp, sp, TMP1 > | crnot 4*cr1+eq, 4*cr1+eq // For vararg calls. > | stw r14, -4(TMP2) > | stw CCSTATE, -8(TMP2) > | mr r14, TMP2 > | la TMP1, CCSTATE->stack > - | slwi CARG2, CARG2, 2 > | blty >2 > | la TMP2, 8(sp) > |1: > diff --git a/src/vm_x64.dasc b/src/vm_x64.dasc > index 6ac88d70..7f0da677 100644 > --- a/src/vm_x64.dasc > +++ b/src/vm_x64.dasc > @@ -2773,12 +2773,12 @@ static void build_subroutines(BuildCtx *ctx) > | > | // Copy stack slots. > | movzx ecx, byte CCSTATE->nsp > - | sub ecx, 1 > + | sub ecx, 8 > | js >2 > |1: > - | mov rax, [CCSTATE+rcx*8+offsetof(CCallState, stack)] > - | mov [rsp+rcx*8+CCALL_SPS_EXTRA*8], rax > - | sub ecx, 1 > + | mov rax, [CCSTATE+rcx+offsetof(CCallState, stack)] > + | mov [rsp+rcx+CCALL_SPS_EXTRA*8], rax > + | sub ecx, 8 > | jns <1 > |2: > | > diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc > index d9234f3b..98546550 100644 > --- a/src/vm_x86.dasc > +++ b/src/vm_x86.dasc > @@ -3348,19 +3348,25 @@ static void build_subroutines(BuildCtx *ctx) > | > | // Copy stack slots. > | movzx ecx, byte CCSTATE->nsp > - | sub ecx, 1 > + |.if X64 > + | sub ecx, 8 > | js >2 > |1: > - |.if X64 > - | mov rax, [CCSTATE+rcx*8+offsetof(CCallState, stack)] > - | mov [rsp+rcx*8+CCALL_SPS_EXTRA*8], rax > + | mov rax, [CCSTATE+rcx+offsetof(CCallState, stack)] > + | mov [rsp+rcx+CCALL_SPS_EXTRA*8], rax > + | sub ecx, 8 > + | jns <1 > + |2: > |.else > - | mov eax, [CCSTATE+ecx*4+offsetof(CCallState, stack)] > - | mov [esp+ecx*4], eax > - |.endif > - | sub ecx, 1 > + | sub ecx, 4 > + | js >2 > + |1: > + | mov eax, [CCSTATE+ecx+offsetof(CCallState, stack)] > + | mov [esp+ecx], eax > + | sub ecx, 4 > | jns <1 > |2: > + |.endif > | > |.if X64 > | movzx eax, byte CCSTATE->nfpr