[Tarantool-patches] [PATCH luajit v8] Fix math.min()/math.max() inconsistencies.

Wed Feb 8 23:05:47 MSK 2023

From: Mike Pall <mike>

(cherry-picked from commit 03208c8162af9cc01ca76ee1676ca79e5abe9b60)

`math.min()`/`math.max()` could produce different results.
Previously, dirty values on the Lua stack could be
treated as arguments to `math.min()`/`math.max()`.
This patch adds check for the number of arguments provided to
math.min/max, which fixes the issue.

Also, several fold optimizations were modified or
deleted due to inconsistencies between compiler
and interpreter on NaN-values.

Changes in `fold_kfold_numarith()` also required
to replace the `CC_LO/CC_HI` comparison modes with the `CC_LE/CC_PL`
on aarch64 platforms. The issue is thoroughly described just before
the corresponding test.

Changes for MIPS and PPC architectures are ommited, due to the
absence of other patches the are dependent on.

Maxim Kokryashkin & Sergey Kaplun:
* added the description and tests for the problem

Resolves tarantool/tarantool#6163
---
Changes in v8:
- Fixed comments as per review by Sergey

Branch: https://github.com/tarantool/luajit/tree/fckxorg/gh-6163-min-max
PR: https://github.com/tarantool/tarantool/pull/7455
Issue for x86 misbehavior: https://github.com/LuaJIT/LuaJIT/issues/957
 src/lj_asm_arm.h                              |   6 +-
 src/lj_asm_arm64.h                            |   6 +-
 src/lj_opt_fold.c                             |  53 ++--
 src/lj_vmmath.c                               |   4 +-
 src/vm_arm.dasc                               |   4 +-
 src/vm_arm64.dasc                             |   4 +-
 src/vm_x64.dasc                               |   2 +-
 src/vm_x86.dasc                               |   2 +-
 test/tarantool-tests/gh-6163-min-max.test.lua | 263 ++++++++++++++++++
 9 files changed, 296 insertions(+), 48 deletions(-)
 create mode 100644 test/tarantool-tests/gh-6163-min-max.test.lua

diff --git a/src/lj_asm_arm.h b/src/lj_asm_arm.h
index 8af19eb9..6ae6e2f2 100644
--- a/src/lj_asm_arm.h
+++ b/src/lj_asm_arm.h
@@ -1663,8 +1663,8 @@ static void asm_min_max(ASMState *as, IRIns *ir, int cc, int fcc)
     asm_intmin_max(as, ir, cc);
 }

-#define asm_min(as, ir)		asm_min_max(as, ir, CC_GT, CC_HI)
-#define asm_max(as, ir)		asm_min_max(as, ir, CC_LT, CC_LO)
+#define asm_min(as, ir)		asm_min_max(as, ir, CC_GT, CC_PL)
+#define asm_max(as, ir)		asm_min_max(as, ir, CC_LT, CC_LE)

 /* -- Comparisons --------------------------------------------------------- */

@@ -1856,7 +1856,7 @@ static void asm_hiop(ASMState *as, IRIns *ir)
   } else if ((ir-1)->o == IR_MIN || (ir-1)->o == IR_MAX) {
     as->curins--;  /* Always skip the loword min/max. */
     if (uselo || usehi)
-      asm_sfpmin_max(as, ir-1, (ir-1)->o == IR_MIN ? CC_HI : CC_LO);
+      asm_sfpmin_max(as, ir-1, (ir-1)->o == IR_MIN ? CC_PL : CC_LE);
     return;
 #elif LJ_HASFFI
   } else if ((ir-1)->o == IR_CONV) {
diff --git a/src/lj_asm_arm64.h b/src/lj_asm_arm64.h
index 4aeb51f3..fe197700 100644
--- a/src/lj_asm_arm64.h
+++ b/src/lj_asm_arm64.h
@@ -1592,7 +1592,7 @@ static void asm_fpmin_max(ASMState *as, IRIns *ir, A64CC fcc)
   Reg dest = (ra_dest(as, ir, RSET_FPR) & 31);
   Reg right, left = ra_alloc2(as, ir, RSET_FPR);
   right = ((left >> 8) & 31); left &= 31;
-  emit_dnm(as, A64I_FCSELd | A64F_CC(fcc), dest, left, right);
+  emit_dnm(as, A64I_FCSELd | A64F_CC(fcc), dest, right, left);
   emit_nm(as, A64I_FCMPd, left, right);
 }

@@ -1604,8 +1604,8 @@ static void asm_min_max(ASMState *as, IRIns *ir, A64CC cc, A64CC fcc)
     asm_intmin_max(as, ir, cc);
 }

-#define asm_max(as, ir)		asm_min_max(as, ir, CC_GT, CC_HI)
-#define asm_min(as, ir)		asm_min_max(as, ir, CC_LT, CC_LO)
+#define asm_min(as, ir)		asm_min_max(as, ir, CC_LT, CC_PL)
+#define asm_max(as, ir)		asm_min_max(as, ir, CC_GT, CC_LE)

 /* -- Comparisons --------------------------------------------------------- */

diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c
index 49f74996..27e489af 100644
--- a/src/lj_opt_fold.c
+++ b/src/lj_opt_fold.c
@@ -1797,8 +1797,6 @@ LJFOLDF(reassoc_intarith_k64)
 #endif
 }

-LJFOLD(MIN MIN any)
-LJFOLD(MAX MAX any)
 LJFOLD(BAND BAND any)
 LJFOLD(BOR BOR any)
 LJFOLDF(reassoc_dup)
@@ -1808,6 +1806,15 @@ LJFOLDF(reassoc_dup)
   return NEXTFOLD;
 }

+LJFOLD(MIN MIN any)
+LJFOLD(MAX MAX any)
+LJFOLDF(reassoc_dup_minmax)
+{
+  if (fins->op2 == fleft->op2)
+    return LEFTFOLD;  /* (a o b) o b ==> a o b */
+  return NEXTFOLD;
+}
+
 LJFOLD(BXOR BXOR any)
 LJFOLDF(reassoc_bxor)
 {
@@ -1846,23 +1853,12 @@ LJFOLDF(reassoc_shift)
   return NEXTFOLD;
 }

-LJFOLD(MIN MIN KNUM)
-LJFOLD(MAX MAX KNUM)
 LJFOLD(MIN MIN KINT)
 LJFOLD(MAX MAX KINT)
 LJFOLDF(reassoc_minmax_k)
 {
   IRIns *irk = IR(fleft->op2);
-  if (irk->o == IR_KNUM) {
-    lua_Number a = ir_knum(irk)->n;
-    lua_Number y = lj_vm_foldarith(a, knumright, fins->o - IR_ADD);
-    if (a == y)  /* (x o k1) o k2 ==> x o k1, if (k1 o k2) == k1. */
-      return LEFTFOLD;
-    PHIBARRIER(fleft);
-    fins->op1 = fleft->op1;
-    fins->op2 = (IRRef1)lj_ir_knum(J, y);
-    return RETRYFOLD;  /* (x o k1) o k2 ==> x o (k1 o k2) */
-  } else if (irk->o == IR_KINT) {
+  if (irk->o == IR_KINT) {
     int32_t a = irk->i;
     int32_t y = kfold_intop(a, fright->i, fins->o);
     if (a == y)  /* (x o k1) o k2 ==> x o k1, if (k1 o k2) == k1. */
@@ -1875,24 +1871,6 @@ LJFOLDF(reassoc_minmax_k)
   return NEXTFOLD;
 }

-LJFOLD(MIN MAX any)
-LJFOLD(MAX MIN any)
-LJFOLDF(reassoc_minmax_left)
-{
-  if (fins->op2 == fleft->op1 || fins->op2 == fleft->op2)
-    return RIGHTFOLD;  /* (b o1 a) o2 b ==> b; (a o1 b) o2 b ==> b */
-  return NEXTFOLD;
-}
-
-LJFOLD(MIN any MAX)
-LJFOLD(MAX any MIN)
-LJFOLDF(reassoc_minmax_right)
-{
-  if (fins->op1 == fright->op1 || fins->op1 == fright->op2)
-    return LEFTFOLD;  /* a o2 (a o1 b) ==> a; a o2 (b o1 a) ==> a */
-  return NEXTFOLD;
-}
-
 /* -- Array bounds check elimination -------------------------------------- */

 /* Eliminate ABC across PHIs to handle t[i-1] forwarding case.
@@ -2018,8 +1996,6 @@ LJFOLDF(comm_comp)

 LJFOLD(BAND any any)
 LJFOLD(BOR any any)
-LJFOLD(MIN any any)
-LJFOLD(MAX any any)
 LJFOLDF(comm_dup)
 {
   if (fins->op1 == fins->op2)  /* x o x ==> x */
@@ -2027,6 +2003,15 @@ LJFOLDF(comm_dup)
   return fold_comm_swap(J);
 }

+LJFOLD(MIN any any)
+LJFOLD(MAX any any)
+LJFOLDF(comm_dup_minmax)
+{
+  if (fins->op1 == fins->op2)  /* x o x ==> x */
+    return LEFTFOLD;
+  return NEXTFOLD;
+}
+
 LJFOLD(BXOR any any)
 LJFOLDF(comm_bxor)
 {
diff --git a/src/lj_vmmath.c b/src/lj_vmmath.c
index c04459bd..ae4e0f15 100644
--- a/src/lj_vmmath.c
+++ b/src/lj_vmmath.c
@@ -49,8 +49,8 @@ double lj_vm_foldarith(double x, double y, int op)
   case IR_ABS - IR_ADD: return fabs(x); break;
 #if LJ_HASJIT
   case IR_LDEXP - IR_ADD: return ldexp(x, (int)y); break;
-  case IR_MIN - IR_ADD: return x > y ? y : x; break;
-  case IR_MAX - IR_ADD: return x < y ? y : x; break;
+  case IR_MIN - IR_ADD: return x < y ? x : y; break;
+  case IR_MAX - IR_ADD: return x > y ? x : y; break;
 #endif
   default: return x;
   }
diff --git a/src/vm_arm.dasc b/src/vm_arm.dasc
index a29292f1..89faa03e 100644
--- a/src/vm_arm.dasc
+++ b/src/vm_arm.dasc
@@ -1718,8 +1718,8 @@ static void build_subroutines(BuildCtx *ctx)
   |.endif
   |.endmacro
   |
-  |  math_minmax math_min, gt, hi
-  |  math_minmax math_max, lt, lo
+  |  math_minmax math_min, gt, pl
+  |  math_minmax math_max, lt, le
   |
   |//-- String library -----------------------------------------------------
   |
diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc
index f517a808..2c1bb4f8 100644
--- a/src/vm_arm64.dasc
+++ b/src/vm_arm64.dasc
@@ -1494,8 +1494,8 @@ static void build_subroutines(BuildCtx *ctx)
   |  b <6
   |.endmacro
   |
-  |  math_minmax math_min, gt, hi
-  |  math_minmax math_max, lt, lo
+  |  math_minmax math_min, gt, pl
+  |  math_minmax math_max, lt, le
   |
   |//-- String library -----------------------------------------------------
   |
diff --git a/src/vm_x64.dasc b/src/vm_x64.dasc
index 59f117ba..faeb5181 100644
--- a/src/vm_x64.dasc
+++ b/src/vm_x64.dasc
@@ -1896,7 +1896,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  jmp ->fff_res
   |
   |.macro math_minmax, name, cmovop, sseop
-  |  .ffunc name
+  |  .ffunc_1 name
   |  mov RAd, 2
   |.if DUALNUM
   |  mov RB, [BASE]
diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc
index f7ffe5d2..1c995d16 100644
--- a/src/vm_x86.dasc
+++ b/src/vm_x86.dasc
@@ -2321,7 +2321,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  xorps xmm4, xmm4; jmp <1			// Return +-Inf and +-0.
   |
   |.macro math_minmax, name, cmovop, sseop
-  |  .ffunc name
+  |  .ffunc_1 name
   |  mov RA, 2
   |  cmp dword [BASE+4], LJ_TISNUM
   |.if DUALNUM
diff --git a/test/tarantool-tests/gh-6163-min-max.test.lua b/test/tarantool-tests/gh-6163-min-max.test.lua
new file mode 100644
index 00000000..fb7a4058
--- /dev/null
+++ b/test/tarantool-tests/gh-6163-min-max.test.lua
@@ -0,0 +1,263 @@
+local tap = require('tap')
+local test = tap.test('gh-6163-jit-min-max')
+local x86_64 = jit.arch == 'x86' or jit.arch == 'x64'
+test:plan(18)
+--
+-- gh-6163: math.min/math.max inconsistencies.
+--
+
+local function isnan(x)
+    return x ~= x
+end
+
+local function array_is_consistent(res)
+  for i = 1, #res - 1 do
+    if res[i] ~= res[i + 1] and not (isnan(res[i]) and isnan(res[i + 1])) then
+      return false
+    end
+  end
+  return true
+end
+
+-- This function creates dirty values on the Lua stack.
+-- The latter of them is going to be treated as an
+-- argument by the `math.min/math.max`.
+-- The first two of them are going to be overwritten
+-- by the math function itself.
+local function filler()
+  return 1, 1, 1
+end
+
+local min = math.min
+local max = math.max
+
+-- It would be enough to test all cases for the
+-- `math.min()` or for the `math.max()` only, because the
+-- problem was in the common code. However, we shouldn't
+-- make such assumptions in the testing code.
+
+-- `math.min()/math.max()` should raise an error when
+-- called with no arguments.
+filler()
+local r, _ = pcall(function() min() end)
+test:ok(not r, 'math.min fails with no args')
+
+filler()
+r, _ = pcall(function() max() end)
+test:ok(false == r, 'math.max fails with no args')
+
+local nan = 0/0
+local x = 1
+
+jit.opt.start('hotloop=1')
+
+-- XXX: Looping over the operations and their arguments breakes the
+-- semantics of some optimization tests below. The cases are
+-- copy-pasted to preserve optimization semantics.
+
+-- Without the `(a o b) o a ==> a o b` fold optimization for
+-- `math.min()/math.max()` the following mcode is emitted on aarch64
+-- for the `math.min(math.min(x, nan), x)` expression:
+--
+-- | fcmp d2, d3 ; fcmp 1.0, nan
+-- | fcsel d1, d2, d3, cc ; d1 == nan after this instruction
+-- | ...
+-- | fcmp d1, d2 ; fcmp nan, 1.0
+-- | fcsel d0, d1, d2, cc ; d0 == 1.0 after this instruction
+--
+-- According to the `fcmp` docs[1], if either of the operands is NaN,
+-- then the operands are unordered. It results in the following state
+-- of the flags register: N=0, Z=0, C=1, V=1
+--
+-- According to the `fcsel` docs[2], if the condition is met, then
+-- the first register value is taken, otherwise -- the second.
+-- In our case, the condition is cc, which means that the `C` flag
+-- should be clear[3], which is false. Then, the second value is taken,
+-- which is `NaN` for the first `fcmp`-`fcsel` pair, and `1.0` for
+-- the second.
+--
+-- If that fold optimization is applied, then only the first `fcmp`-`fcsel`
+-- pair is emitted, and the result is `NaN`, which is inconsistent with
+-- the result of the non-optimized mcode.
+--
+-- [1]: https://developer.arm.com/documentation/dui0801/g/A64-Floating-point-Instructions/FCMP
+-- [2]: https://developer.arm.com/documentation/100069/0608/A64-Floating-point-Instructions/FCSEL
+-- [3]: https://developer.arm.com/documentation/dui0068/b/ARM-Instruction-Reference/Conditional-execution
+
+local result = {}
+for k = 1, 4 do
+    result[k] = min(min(x, nan), x)
+end
+-- expected: 1 1 1 1
+test:ok(array_is_consistent(result), 'math.min: reassoc_dup')
+
+result = {}
+for k = 1, 4 do
+    result[k] = max(max(x, nan), x)
+end
+-- expected: 1 1 1 1
+test:ok(array_is_consistent(result), 'math.max: reassoc_dup')
+
+-- If one gets the expression like `math.min(x, math.min(x, nan))`,
+-- and the `comm_dup` optimization is applied, it results in the
+-- same situation as explained above. With the `comm_dup_minmax`
+-- there is no swap, hence, everything is consistent again:
+--
+-- | fcmp d2, d3 ; fcmp 1.0, nan
+-- | fcsel d1, d3, d2, pl ; d1 == nan after this instruction
+-- | ...
+-- | fcmp d2, d1 ; fcmp 1.0, nan
+-- | fcsel d0, d1, d2, pl ; d0 == nan after this instruction
+-- `pl` (aka `CC_PL`) condition means that N flag is 0 [2], that
+-- is true when we are comparing something with NaN. So, the value of the
+-- first source register is taken
+
+result = {}
+for k = 1, 4 do
+    result[k] = min(x, min(x, nan))
+end
+-- FIXME: results are still inconsistent for the x86/64 architecture.
+-- expected: nan nan nan nan
+test:ok(array_is_consistent(result) or x86_64, 'math.min: comm_dup_minmax')
+
+result = {}
+for k = 1, 4 do
+    result[k] = max(x, max(x, nan))
+end
+-- FIXME: results are still inconsistent for the x86/64 architecture.
+-- expected: nan nan nan nan
+test:ok(array_is_consistent(result) or x86_64, 'math.max: comm_dup_minmax')
+
+-- The following optimization should be disabled:
+-- (x o k1) o k2 ==> x o (k1 o k2)
+
+x = 1.2
+result = {}
+for k = 1, 4 do
+    result[k] = min(min(x, 0/0), 1.3)
+end
+-- expected: 1.3 1.3 1.3 1.3
+test:ok(array_is_consistent(result), 'math.min: reassoc_minmax_k')
+
+result = {}
+for k = 1, 4 do
+    result[k] = max(max(x, 0/0), 1.1)
+end
+-- expected: 1.1 1.1 1.1 1.1
+test:ok(array_is_consistent(result), 'math.max: reassoc_minmax_k')
+
+result = {}
+for k = 1, 4 do
+  result[k] = min(max(nan, 1), 1)
+end
+-- expected: 1 1 1 1
+test:ok(array_is_consistent(result), 'min-max-case1: reassoc_minmax_left')
+
+result = {}
+for k = 1, 4 do
+  result[k] = min(max(1, nan), 1)
+end
+-- expected: 1 1 1 1
+test:ok(array_is_consistent(result), 'min-max-case2: reassoc_minmax_left')
+
+result = {}
+for k = 1, 4 do
+  result[k] = max(min(nan, 1), 1)
+end
+-- expected: 1 1 1 1
+test:ok(array_is_consistent(result), 'max-min-case1: reassoc_minmax_left')
+
+result = {}
+for k = 1, 4 do
+  result[k] = max(min(1, nan), 1)
+end
+-- expected: 1 1 1 1
+test:ok(array_is_consistent(result), 'max-min-case2: reassoc_minmax_left')
+
+result = {}
+for k = 1, 4 do
+  result[k] = min(1, max(nan, 1))
+end
+-- expected: 1 1 1 1
+test:ok(array_is_consistent(result), 'min-max-case1: reassoc_minmax_right')
+
+result = {}
+for k = 1, 4 do
+  result[k] = min(1, max(1, nan))
+end
+-- FIXME: results are still inconsistent for the x86/64 architecture.
+-- expected: nan nan nan nan
+test:ok(array_is_consistent(result) or x86_64, 'min-max-case2: reassoc_minmax_right')
+
+result = {}
+for k = 1, 4 do
+  result[k] = max(1, min(nan, 1))
+end
+-- expected: 1 1 1 1
+test:ok(array_is_consistent(result), 'max-min-case1: reassoc_minmax_right')
+
+result = {}
+for k = 1, 4 do
+  result[k] = max(1, min(1, nan))
+end
+-- FIXME: results are still inconsistent for the x86/64 architecture.
+-- expected: nan nan nan nan
+test:ok(array_is_consistent(result) or x86_64, 'max-min-case2: reassoc_minmax_right')
+
+-- XXX: If we look into the disassembled code of `lj_vm_foldarith()`
+-- we can see the following:
+--
+-- | /* In our test x == 7.1, y == nan */
+-- | case IR_MIN - IR_ADD: return x > y ? y : x; break;
+--
+-- | ; case IR_MIN
+-- | <lj_vm_foldarith+337>: movsd xmm0,QWORD PTR [rsp+0x18] ; xmm0 <- 7.1
+-- | <lj_vm_foldarith+343>: comisd xmm0,QWORD PTR [rsp+0x10] ; comisd 7.1, nan
+-- | <lj_vm_foldarith+349>: jbe <lj_vm_foldarith+358> ; >= ?
+-- | <lj_vm_foldarith+351>: mov rax,QWORD PTR [rsp+0x10] ; return nan
+-- | <lj_vm_foldarith+356>: jmp <lj_vm_foldarith+398> ;
+-- | <lj_vm_foldarith+358>: mov rax,QWORD PTR [rsp+0x18] ; else return 7.1
+-- | <lj_vm_foldarith+363>: jmp <lj_vm_foldarith+398> ;
+--
+-- According to `comisd` documentation [4] in case when one of the operands
+-- is NaN, the result is unordered and ZF,PF,CF := 111. This means that `jbe`
+-- condition is true (CF=1 or ZF=1)[5], so we return 7.1 (the first
+-- operand) for case `IR_MIN`.
+--
+-- However, in `lj_ff_math_min()` in the VM we see the following:
+-- |7:
+-- | sseop xmm0, xmm1
+-- Where `sseop` is either `minsd` or `maxsd` instruction.
+-- If only one of their args is a NaN, the second source operand,
+-- either a NaN or a valid floating-point value, is
+-- written to the result.
+--
+-- So the patch changes the `lj_vm_foldairth()` assembly in the following way:
+-- | ; case IR_MIN
+-- | <lj_vm_foldarith+337>: movsd xmm0,QWORD PTR [rsp+0x10] ; xmm0 <- nan
+-- | <lj_vm_foldarith+343>: comisd xmm0,QWORD PTR [rsp+0x18] ; comisd nan, 7.1
+-- | <lj_vm_foldarith+349>: jbe <lj_vm_foldarith+358> ; >= ?
+-- | <lj_vm_foldarith+351>: mov rax,QWORD PTR [rsp+0x18] ; return 7.1
+-- | <lj_vm_foldarith+356>: jmp <lj_vm_foldarith+398> ;
+-- | <lj_vm_foldarith+358>: mov rax,QWORD PTR [rsp+0x10] ; else return nan
+-- | <lj_vm_foldarith+363>: jmp <lj_vm_foldarith+398> ;
+--
+-- So now we always return the second operand.
+--
+-- XXX: The two tests below use the `0/0` constant instead of `nan`
+-- variable is dictated by the `fold_kfold_numarith` semantics.
+result = {}
+for k = 1, 4 do
+  result[k] = min(min(7.1, 0/0), 1.1)
+end
+-- expected: 1.1 1.1 1.1 1.1
+test:ok(array_is_consistent(result), 'min: fold_kfold_numarith')
+
+result = {}
+for k = 1, 4 do
+  result[k] = max(max(7.1, 0/0), 1.1)
+end
+-- expected: 1.1 1.1 1.1 1.1
+test:ok(array_is_consistent(result), 'max: fold_kfold_numarith')
+
+os.exit(test:check() and 0 or 1)
--
2.37.1 (Apple Git-137.1)