Hi! I see, the `or x86_64` gives green light. LGTM, Sergos > On 6 Mar 2023, at 17:52, Maxim Kokryashkin wrote: > > Hi! > The XXX ones are just some useful notes, the FIXME ones > are some issues with LuaJIT that remain unfixed in the mainline. > > -- > Best regards, > Maxim Kokryashkin > > > > Hi! > > Thanks for the patch! > > Number of FIXME and a XXX in the test - does it mean we’d have problems in CI? > > Regards, > Sergos > > > On 8 Feb 2023, at 23:05, Maksim Kokryashkin > wrote: > > > > From: Mike Pall > > > > (cherry-picked from commit 03208c8162af9cc01ca76ee1676ca79e5abe9b60) > > > > `math.min()`/`math.max()` could produce different results. > > Previously, dirty values on the Lua stack could be > > treated as arguments to `math.min()`/`math.max()`. > > This patch adds check for the number of arguments provided to > > math.min/max, which fixes the issue. > > > > Also, several fold optimizations were modified or > > deleted due to inconsistencies between compiler > > and interpreter on NaN-values. > > > > Changes in `fold_kfold_numarith()` also required > > to replace the `CC_LO/CC_HI` comparison modes with the `CC_LE/CC_PL` > > on aarch64 platforms. The issue is thoroughly described just before > > the corresponding test. > > > > Changes for MIPS and PPC architectures are ommited, due to the > > absence of other patches the are dependent on. > > > > Maxim Kokryashkin & Sergey Kaplun: > > * added the description and tests for the problem > > > > Resolves tarantool/tarantool#6163 > > --- > > Changes in v8: > > - Fixed comments as per review by Sergey > > > > Branch: https://github.com/tarantool/luajit/tree/fckxorg/gh-6163-min-max > > PR: https://github.com/tarantool/tarantool/pull/7455 > > Issue for x86 misbehavior: https://github.com/LuaJIT/LuaJIT/issues/957 > > src/lj_asm_arm.h | 6 +- > > src/lj_asm_arm64.h | 6 +- > > src/lj_opt_fold.c | 53 ++-- > > src/lj_vmmath.c | 4 +- > > src/vm_arm.dasc | 4 +- > > src/vm_arm64.dasc | 4 +- > > src/vm_x64.dasc | 2 +- > > src/vm_x86.dasc | 2 +- > > test/tarantool-tests/gh-6163-min-max.test.lua | 263 ++++++++++++++++++ > > 9 files changed, 296 insertions(+), 48 deletions(-) > > create mode 100644 test/tarantool-tests/gh-6163-min-max.test.lua > > > > diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h > > index 8af19eb9..6ae6e2f2 100644 > > --- a/src/lj_asm_arm.h > > +++ b/src/lj_asm_arm.h > > @@ -1663,8 +1663,8 @@ static void asm_min_max(ASMState *as, IRIns *ir, int cc, int fcc) > > asm_intmin_max(as, ir, cc); > > } > > > > -#define asm_min(as, ir) asm_min_max(as, ir, CC_GT, CC_HI) > > -#define asm_max(as, ir) asm_min_max(as, ir, CC_LT, CC_LO) > > +#define asm_min(as, ir) asm_min_max(as, ir, CC_GT, CC_PL) > > +#define asm_max(as, ir) asm_min_max(as, ir, CC_LT, CC_LE) > > > > /* -- Comparisons --------------------------------------------------------- */ > > > > @@ -1856,7 +1856,7 @@ static void asm_hiop(ASMState *as, IRIns *ir) > > } else if ((ir-1)->o == IR_MIN || (ir-1)->o == IR_MAX) { > > as->curins--; /* Always skip the loword min/max. */ > > if (uselo || usehi) > > - asm_sfpmin_max(as, ir-1, (ir-1)->o == IR_MIN ? CC_HI : CC_LO); > > + asm_sfpmin_max(as, ir-1, (ir-1)->o == IR_MIN ? CC_PL : CC_LE); > > return; > > #elif LJ_HASFFI > > } else if ((ir-1)->o == IR_CONV) { > > diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h > > index 4aeb51f3..fe197700 100644 > > --- a/src/lj_asm_arm64.h > > +++ b/src/lj_asm_arm64.h > > @@ -1592,7 +1592,7 @@ static void asm_fpmin_max(ASMState *as, IRIns *ir, A64CC fcc) > > Reg dest = (ra_dest(as, ir, RSET_FPR) & 31); > > Reg right, left = ra_alloc2(as, ir, RSET_FPR); > > right = ((left >> 8) & 31); left &= 31; > > - emit_dnm(as, A64I_FCSELd | A64F_CC(fcc), dest, left, right); > > + emit_dnm(as, A64I_FCSELd | A64F_CC(fcc), dest, right, left); > > emit_nm(as, A64I_FCMPd, left, right); > > } > > > > @@ -1604,8 +1604,8 @@ static void asm_min_max(ASMState *as, IRIns *ir, A64CC cc, A64CC fcc) > > asm_intmin_max(as, ir, cc); > > } > > > > -#define asm_max(as, ir) asm_min_max(as, ir, CC_GT, CC_HI) > > -#define asm_min(as, ir) asm_min_max(as, ir, CC_LT, CC_LO) > > +#define asm_min(as, ir) asm_min_max(as, ir, CC_LT, CC_PL) > > +#define asm_max(as, ir) asm_min_max(as, ir, CC_GT, CC_LE) > > > > /* -- Comparisons --------------------------------------------------------- */ > > > > diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c > > index 49f74996..27e489af 100644 > > --- a/src/lj_opt_fold.c > > +++ b/src/lj_opt_fold.c > > @@ -1797,8 +1797,6 @@ LJFOLDF(reassoc_intarith_k64) > > #endif > > } > > > > -LJFOLD(MIN MIN any) > > -LJFOLD(MAX MAX any) > > LJFOLD(BAND BAND any) > > LJFOLD(BOR BOR any) > > LJFOLDF(reassoc_dup) > > @@ -1808,6 +1806,15 @@ LJFOLDF(reassoc_dup) > > return NEXTFOLD; > > } > > > > +LJFOLD(MIN MIN any) > > +LJFOLD(MAX MAX any) > > +LJFOLDF(reassoc_dup_minmax) > > +{ > > + if (fins->op2 == fleft->op2) > > + return LEFTFOLD; /* (a o b) o b ==> a o b */ > > + return NEXTFOLD; > > +} > > + > > LJFOLD(BXOR BXOR any) > > LJFOLDF(reassoc_bxor) > > { > > @@ -1846,23 +1853,12 @@ LJFOLDF(reassoc_shift) > > return NEXTFOLD; > > } > > > > -LJFOLD(MIN MIN KNUM) > > -LJFOLD(MAX MAX KNUM) > > LJFOLD(MIN MIN KINT) > > LJFOLD(MAX MAX KINT) > > LJFOLDF(reassoc_minmax_k) > > { > > IRIns *irk = IR(fleft->op2); > > - if (irk->o == IR_KNUM) { > > - lua_Number a = ir_knum(irk)->n; > > - lua_Number y = lj_vm_foldarith(a, knumright, fins->o - IR_ADD); > > - if (a == y) /* (x o k1) o k2 ==> x o k1, if (k1 o k2) == k1. */ > > - return LEFTFOLD; > > - PHIBARRIER(fleft); > > - fins->op1 = fleft->op1; > > - fins->op2 = (IRRef1)lj_ir_knum(J, y); > > - return RETRYFOLD; /* (x o k1) o k2 ==> x o (k1 o k2) */ > > - } else if (irk->o == IR_KINT) { > > + if (irk->o == IR_KINT) { > > int32_t a = irk->i; > > int32_t y = kfold_intop(a, fright->i, fins->o); > > if (a == y) /* (x o k1) o k2 ==> x o k1, if (k1 o k2) == k1. */ > > @@ -1875,24 +1871,6 @@ LJFOLDF(reassoc_minmax_k) > > return NEXTFOLD; > > } > > > > -LJFOLD(MIN MAX any) > > -LJFOLD(MAX MIN any) > > -LJFOLDF(reassoc_minmax_left) > > -{ > > - if (fins->op2 == fleft->op1 || fins->op2 == fleft->op2) > > - return RIGHTFOLD; /* (b o1 a) o2 b ==> b; (a o1 b) o2 b ==> b */ > > - return NEXTFOLD; > > -} > > - > > -LJFOLD(MIN any MAX) > > -LJFOLD(MAX any MIN) > > -LJFOLDF(reassoc_minmax_right) > > -{ > > - if (fins->op1 == fright->op1 || fins->op1 == fright->op2) > > - return LEFTFOLD; /* a o2 (a o1 b) ==> a; a o2 (b o1 a) ==> a */ > > - return NEXTFOLD; > > -} > > - > > /* -- Array bounds check elimination -------------------------------------- */ > > > > /* Eliminate ABC across PHIs to handle t[i-1] forwarding case. > > @@ -2018,8 +1996,6 @@ LJFOLDF(comm_comp) > > > > LJFOLD(BAND any any) > > LJFOLD(BOR any any) > > -LJFOLD(MIN any any) > > -LJFOLD(MAX any any) > > LJFOLDF(comm_dup) > > { > > if (fins->op1 == fins->op2) /* x o x ==> x */ > > @@ -2027,6 +2003,15 @@ LJFOLDF(comm_dup) > > return fold_comm_swap(J); > > } > > > > +LJFOLD(MIN any any) > > +LJFOLD(MAX any any) > > +LJFOLDF(comm_dup_minmax) > > +{ > > + if (fins->op1 == fins->op2) /* x o x ==> x */ > > + return LEFTFOLD; > > + return NEXTFOLD; > > +} > > + > > LJFOLD(BXOR any any) > > LJFOLDF(comm_bxor) > > { > > diff --git a/src/lj_vmmath.c b/src/lj_vmmath.c > > index c04459bd..ae4e0f15 100644 > > --- a/src/lj_vmmath.c > > +++ b/src/lj_vmmath.c > > @@ -49,8 +49,8 @@ double lj_vm_foldarith(double x, double y, int op) > > case IR_ABS - IR_ADD: return fabs(x); break; > > #if LJ_HASJIT > > case IR_LDEXP - IR_ADD: return ldexp(x, (int)y); break; > > - case IR_MIN - IR_ADD: return x > y ? y : x; break; > > - case IR_MAX - IR_ADD: return x < y ? y : x; break; > > + case IR_MIN - IR_ADD: return x < y ? x : y; break; > > + case IR_MAX - IR_ADD: return x > y ? x : y; break; > > #endif > > default: return x; > > } > > diff --git a/src/vm_arm.dasc b/src/vm_arm.dasc > > index a29292f1..89faa03e 100644 > > --- a/src/vm_arm.dasc > > +++ b/src/vm_arm.dasc > > @@ -1718,8 +1718,8 @@ static void build_subroutines(BuildCtx *ctx) > > |.endif > > |.endmacro > > | > > - | math_minmax math_min, gt, hi > > - | math_minmax math_max, lt, lo > > + | math_minmax math_min, gt, pl > > + | math_minmax math_max, lt, le > > | > > |//-- String library ----------------------------------------------------- > > | > > diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc > > index f517a808..2c1bb4f8 100644 > > --- a/src/vm_arm64.dasc > > +++ b/src/vm_arm64.dasc > > @@ -1494,8 +1494,8 @@ static void build_subroutines(BuildCtx *ctx) > > | b <6 > > |.endmacro > > | > > - | math_minmax math_min, gt, hi > > - | math_minmax math_max, lt, lo > > + | math_minmax math_min, gt, pl > > + | math_minmax math_max, lt, le > > | > > |//-- String library ----------------------------------------------------- > > | > > diff --git a/src/vm_x64.dasc b/src/vm_x64.dasc > > index 59f117ba..faeb5181 100644 > > --- a/src/vm_x64.dasc > > +++ b/src/vm_x64.dasc > > @@ -1896,7 +1896,7 @@ static void build_subroutines(BuildCtx *ctx) > > | jmp ->fff_res > > | > > |.macro math_minmax, name, cmovop, sseop > > - | .ffunc name > > + | .ffunc_1 name > > | mov RAd, 2 > > |.if DUALNUM > > | mov RB, [BASE] > > diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc > > index f7ffe5d2..1c995d16 100644 > > --- a/src/vm_x86.dasc > > +++ b/src/vm_x86.dasc > > @@ -2321,7 +2321,7 @@ static void build_subroutines(BuildCtx *ctx) > > | xorps xmm4, xmm4; jmp <1 // Return +-Inf and +-0. > > | > > |.macro math_minmax, name, cmovop, sseop > > - | .ffunc name > > + | .ffunc_1 name > > | mov RA, 2 > > | cmp dword [BASE+4], LJ_TISNUM > > |.if DUALNUM > > diff --git a/test/tarantool-tests/gh-6163-min-max.test.lua b/test/tarantool-tests/gh-6163-min-max.test.lua > > new file mode 100644 > > index 00000000..fb7a4058 > > --- /dev/null > > +++ b/test/tarantool-tests/gh-6163-min-max.test.lua > > @@ -0,0 +1,263 @@ > > +local tap = require('tap') > > +local test = tap.test('gh-6163-jit-min-max') > > +local x86_64 = jit.arch == 'x86' or jit.arch == 'x64' > > +test:plan(18) > > +-- > > +-- gh-6163: math.min/math.max inconsistencies. > > +-- > > + > > +local function isnan(x) > > + return x ~= x > > +end > > + > > +local function array_is_consistent(res) > > + for i = 1, #res - 1 do > > + if res[i] ~= res[i + 1] and not (isnan(res[i]) and isnan(res[i + 1])) then > > + return false > > + end > > + end > > + return true > > +end > > + > > +-- This function creates dirty values on the Lua stack. > > +-- The latter of them is going to be treated as an > > +-- argument by the `math.min/math.max`. > > +-- The first two of them are going to be overwritten > > +-- by the math function itself. > > +local function filler() > > + return 1, 1, 1 > > +end > > + > > +local min = math.min > > +local max = math.max > > + > > +-- It would be enough to test all cases for the > > +-- `math.min()` or for the `math.max()` only, because the > > +-- problem was in the common code. However, we shouldn't > > +-- make such assumptions in the testing code. > > + > > +-- `math.min()/math.max()` should raise an error when > > +-- called with no arguments. > > +filler() > > +local r, _ = pcall(function() min() end) > > +test:ok(not r, 'math.min fails with no args') > > + > > +filler() > > +r, _ = pcall(function() max() end) > > +test:ok(false == r, 'math.max fails with no args') > > + > > +local nan = 0/0 > > +local x = 1 > > + > > +jit.opt.start('hotloop=1') > > + > > +-- XXX: Looping over the operations and their arguments breakes the > > +-- semantics of some optimization tests below. The cases are > > +-- copy-pasted to preserve optimization semantics. > > + > > +-- Without the `(a o b) o a ==> a o b` fold optimization for > > +-- `math.min()/math.max()` the following mcode is emitted on aarch64 > > +-- for the `math.min(math.min(x, nan), x)` expression: > > +-- > > +-- | fcmp d2, d3 ; fcmp 1.0, nan > > +-- | fcsel d1, d2, d3, cc ; d1 == nan after this instruction > > +-- | ... > > +-- | fcmp d1, d2 ; fcmp nan, 1.0 > > +-- | fcsel d0, d1, d2, cc ; d0 == 1.0 after this instruction > > +-- > > +-- According to the `fcmp` docs[1], if either of the operands is NaN, > > +-- then the operands are unordered. It results in the following state > > +-- of the flags register: N=0, Z=0, C=1, V=1 > > +-- > > +-- According to the `fcsel` docs[2], if the condition is met, then > > +-- the first register value is taken, otherwise -- the second. > > +-- In our case, the condition is cc, which means that the `C` flag > > +-- should be clear[3], which is false. Then, the second value is taken, > > +-- which is `NaN` for the first `fcmp`-`fcsel` pair, and `1.0` for > > +-- the second. > > +-- > > +-- If that fold optimization is applied, then only the first `fcmp`-`fcsel` > > +-- pair is emitted, and the result is `NaN`, which is inconsistent with > > +-- the result of the non-optimized mcode. > > +-- > > +-- [1]: https://developer.arm.com/documentation/dui0801/g/A64-Floating-point-Instructions/FCMP > > +-- [2]: https://developer.arm.com/documentation/100069/0608/A64-Floating-point-Instructions/FCSEL > > +-- [3]: https://developer.arm.com/documentation/dui0068/b/ARM-Instruction-Reference/Conditional-execution > > + > > +local result = {} > > +for k = 1, 4 do > > + result[k] = min(min(x, nan), x) > > +end > > +-- expected: 1 1 1 1 > > +test:ok(array_is_consistent(result), 'math.min: reassoc_dup') > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = max(max(x, nan), x) > > +end > > +-- expected: 1 1 1 1 > > +test:ok(array_is_consistent(result), 'math.max: reassoc_dup') > > + > > +-- If one gets the expression like `math.min(x, math.min(x, nan))`, > > +-- and the `comm_dup` optimization is applied, it results in the > > +-- same situation as explained above. With the `comm_dup_minmax` > > +-- there is no swap, hence, everything is consistent again: > > +-- > > +-- | fcmp d2, d3 ; fcmp 1.0, nan > > +-- | fcsel d1, d3, d2, pl ; d1 == nan after this instruction > > +-- | ... > > +-- | fcmp d2, d1 ; fcmp 1.0, nan > > +-- | fcsel d0, d1, d2, pl ; d0 == nan after this instruction > > +-- `pl` (aka `CC_PL`) condition means that N flag is 0 [2], that > > +-- is true when we are comparing something with NaN. So, the value of the > > +-- first source register is taken > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = min(x, min(x, nan)) > > +end > > +-- FIXME: results are still inconsistent for the x86/64 architecture. > > +-- expected: nan nan nan nan > > +test:ok(array_is_consistent(result) or x86_64, 'math.min: comm_dup_minmax') > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = max(x, max(x, nan)) > > +end > > +-- FIXME: results are still inconsistent for the x86/64 architecture. > > +-- expected: nan nan nan nan > > +test:ok(array_is_consistent(result) or x86_64, 'math.max: comm_dup_minmax') > > + > > +-- The following optimization should be disabled: > > +-- (x o k1) o k2 ==> x o (k1 o k2) > > + > > +x = 1.2 > > +result = {} > > +for k = 1, 4 do > > + result[k] = min(min(x, 0/0), 1.3) > > +end > > +-- expected: 1.3 1.3 1.3 1.3 > > +test:ok(array_is_consistent(result), 'math.min: reassoc_minmax_k') > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = max(max(x, 0/0), 1.1) > > +end > > +-- expected: 1.1 1.1 1.1 1.1 > > +test:ok(array_is_consistent(result), 'math.max: reassoc_minmax_k') > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = min(max(nan, 1), 1) > > +end > > +-- expected: 1 1 1 1 > > +test:ok(array_is_consistent(result), 'min-max-case1: reassoc_minmax_left') > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = min(max(1, nan), 1) > > +end > > +-- expected: 1 1 1 1 > > +test:ok(array_is_consistent(result), 'min-max-case2: reassoc_minmax_left') > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = max(min(nan, 1), 1) > > +end > > +-- expected: 1 1 1 1 > > +test:ok(array_is_consistent(result), 'max-min-case1: reassoc_minmax_left') > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = max(min(1, nan), 1) > > +end > > +-- expected: 1 1 1 1 > > +test:ok(array_is_consistent(result), 'max-min-case2: reassoc_minmax_left') > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = min(1, max(nan, 1)) > > +end > > +-- expected: 1 1 1 1 > > +test:ok(array_is_consistent(result), 'min-max-case1: reassoc_minmax_right') > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = min(1, max(1, nan)) > > +end > > +-- FIXME: results are still inconsistent for the x86/64 architecture. > > +-- expected: nan nan nan nan > > +test:ok(array_is_consistent(result) or x86_64, 'min-max-case2: reassoc_minmax_right') > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = max(1, min(nan, 1)) > > +end > > +-- expected: 1 1 1 1 > > +test:ok(array_is_consistent(result), 'max-min-case1: reassoc_minmax_right') > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = max(1, min(1, nan)) > > +end > > +-- FIXME: results are still inconsistent for the x86/64 architecture. > > +-- expected: nan nan nan nan > > +test:ok(array_is_consistent(result) or x86_64, 'max-min-case2: reassoc_minmax_right') > > + > > +-- XXX: If we look into the disassembled code of `lj_vm_foldarith()` > > +-- we can see the following: > > +-- > > +-- | /* In our test x == 7.1, y == nan */ > > +-- | case IR_MIN - IR_ADD: return x > y ? y : x; break; > > +-- > > +-- | ; case IR_MIN > > +-- | : movsd xmm0,QWORD PTR [rsp+0x18] ; xmm0 <- 7.1 > > +-- | : comisd xmm0,QWORD PTR [rsp+0x10] ; comisd 7.1, nan > > +-- | : jbe ; >= ? > > +-- | : mov rax,QWORD PTR [rsp+0x10] ; return nan > > +-- | : jmp ; > > +-- | : mov rax,QWORD PTR [rsp+0x18] ; else return 7.1 > > +-- | : jmp ; > > +-- > > +-- According to `comisd` documentation [4] in case when one of the operands > > +-- is NaN, the result is unordered and ZF,PF,CF := 111. This means that `jbe` > > +-- condition is true (CF=1 or ZF=1)[5], so we return 7.1 (the first > > +-- operand) for case `IR_MIN`. > > +-- > > +-- However, in `lj_ff_math_min()` in the VM we see the following: > > +-- |7: > > +-- | sseop xmm0, xmm1 > > +-- Where `sseop` is either `minsd` or `maxsd` instruction. > > +-- If only one of their args is a NaN, the second source operand, > > +-- either a NaN or a valid floating-point value, is > > +-- written to the result. > > +-- > > +-- So the patch changes the `lj_vm_foldairth()` assembly in the following way: > > +-- | ; case IR_MIN > > +-- | : movsd xmm0,QWORD PTR [rsp+0x10] ; xmm0 <- nan > > +-- | : comisd xmm0,QWORD PTR [rsp+0x18] ; comisd nan, 7.1 > > +-- | : jbe ; >= ? > > +-- | : mov rax,QWORD PTR [rsp+0x18] ; return 7.1 > > +-- | : jmp ; > > +-- | : mov rax,QWORD PTR [rsp+0x10] ; else return nan > > +-- | : jmp ; > > +-- > > +-- So now we always return the second operand. > > +-- > > +-- XXX: The two tests below use the `0/0` constant instead of `nan` > > +-- variable is dictated by the `fold_kfold_numarith` semantics. > > +result = {} > > +for k = 1, 4 do > > + result[k] = min(min(7.1, 0/0), 1.1) > > +end > > +-- expected: 1.1 1.1 1.1 1.1 > > +test:ok(array_is_consistent(result), 'min: fold_kfold_numarith') > > + > > +result = {} > > +for k = 1, 4 do > > + result[k] = max(max(7.1, 0/0), 1.1) > > +end > > +-- expected: 1.1 1.1 1.1 1.1 > > +test:ok(array_is_consistent(result), 'max: fold_kfold_numarith') > > + > > +os.exit(test:check() and 0 or 1) > > -- > > 2.37.1 (Apple Git-137.1) > > >