[Tarantool-patches] [PATCH luajit 4/5] Fix pow() optimization inconsistencies.

Tue Aug 15 12:36:30 MSK 2023

From: Mike Pall <mike>

(cherry-picked from commit 9512d5c1aced61e13e7be2d3208ec7ae3516b458)

This patch fixes inconsistent behaviour between JIT-compiled code and
the interpreter for the power operator in the following ways:
* Drop folding optimizations for base ^ 0.5 => sqrt(base), since
  pow(base, 0.5) isn't interchangeable with sqrt(base) and depends on
  the <math.h> implementation.
* Drop folding optimizations for 2 ^ int_pow => ldexp(1.0, int_pow), to
  avoid dependency on the <math.h> implementation.
* Now `asm_pow()` always assembles a call to an internal function:
  `lj_vm_pow()` when the exponent is a number, otherwise `lj_vm_powi()`,
  which is now shared by all CPU architectures. Using these internal
  functions instead of calling the toolchain-provided `pow()` directly
  guarantees consistency between interpreter and JIT results. Also, the
  custom `vm_powi_sse()` implementation for x86 and x64 is dropped.
* The `math_extern2` macro in the VM may now take a second argument,
  which is used as the target function to call. The first argument is
  still the name passed to the `.ffunc_nn`/`.ffunc_nnsse` macro.
* Narrowing for the power operation used to skip the range guard for a
  non-constant base IR. This led to an invalid result if the value on
  trace was out of range. Now the guard is emitted regardless of the
  base.

Be aware that the [220/502] lib/string/format/num.lua test [1] from the
LuaJIT-test suite fails after this commit.

[1]: https://www.exploringbinary.com/incorrect-floating-point-to-decimal-conversions/

Sergey Kaplun:
* added the description and the test for the problem

Part of tarantool/tarantool#8825
---
 src/lj_asm.c                                  |  7 +-
 src/lj_asm_x86.h                              | 13 ---
 src/lj_dispatch.h                             |  2 +-
 src/lj_ircall.h                               |  2 +-
 src/lj_opt_fold.c                             | 27 ------
 src/lj_opt_narrow.c                           | 12 +--
 src/lj_vm.h                                   |  7 +-
 src/lj_vmmath.c                               | 82 +++++++++--------
 src/vm_arm.dasc                               | 13 +--
 src/vm_arm64.dasc                             | 11 ++-
 src/vm_mips.dasc                              | 11 ++-
 src/vm_mips64.dasc                            | 11 ++-
 src/vm_ppc.dasc                               | 11 ++-
 src/vm_x64.dasc                               | 44 ++-------
 src/vm_x86.dasc                               | 46 ++--------
 .../lj-684-pow-inconsistencies.test.lua       | 89 +++++++++++++++++++
 .../lj-9-pow-inconsistencies.test.lua         |  2 +
 17 files changed, 195 insertions(+), 195 deletions(-)
 create mode 100644 test/tarantool-tests/lj-684-pow-inconsistencies.test.lua

diff --git a/src/lj_asm.c b/src/lj_asm.c
index d71fa8c8..65261d50 100644
--- a/src/lj_asm.c
+++ b/src/lj_asm.c
@@ -1650,7 +1650,6 @@ static void asm_loop(ASMState *as)
 #if !LJ_SOFTFP32
 #if !LJ_TARGET_X86ORX64
 #define asm_ldexp(as, ir)	asm_callid(as, ir, IRCALL_ldexp)
-#define asm_fppowi(as, ir)	asm_callid(as, ir, IRCALL_lj_vm_powi)
 #endif
 
 static void asm_pow(ASMState *as, IRIns *ir)
@@ -1661,10 +1660,8 @@ static void asm_pow(ASMState *as, IRIns *ir)
 					  IRCALL_lj_carith_powu64);
   else
 #endif
-  if (irt_isnum(IR(ir->op2)->t))
-    asm_callid(as, ir, IRCALL_pow);
-  else
-    asm_fppowi(as, ir);
+  asm_callid(as, ir, irt_isnum(IR(ir->op2)->t) ? IRCALL_lj_vm_pow :
+						 IRCALL_lj_vm_powi);
 }
 
 static void asm_div(ASMState *as, IRIns *ir)
diff --git a/src/lj_asm_x86.h b/src/lj_asm_x86.h
index 74f2d853..2b810c8d 100644
--- a/src/lj_asm_x86.h
+++ b/src/lj_asm_x86.h
@@ -2005,19 +2005,6 @@ static void asm_ldexp(ASMState *as, IRIns *ir)
   asm_x87load(as, ir->op2);
 }
 
-static void asm_fppowi(ASMState *as, IRIns *ir)
-{
-  /* The modified regs must match with the *.dasc implementation. */
-  RegSet drop = RSET_RANGE(RID_XMM0, RID_XMM1+1)|RID2RSET(RID_EAX);
-  if (ra_hasreg(ir->r))
-    rset_clear(drop, ir->r);  /* Dest reg handled below. */
-  ra_evictset(as, drop);
-  ra_destreg(as, ir, RID_XMM0);
-  emit_call(as, lj_vm_powi_sse);
-  ra_left(as, RID_XMM0, ir->op1);
-  ra_left(as, RID_EAX, ir->op2);
-}
-
 static int asm_swapops(ASMState *as, IRIns *ir)
 {
   IRIns *irl = IR(ir->op1);
diff --git a/src/lj_dispatch.h b/src/lj_dispatch.h
index b8bc2594..af870a75 100644
--- a/src/lj_dispatch.h
+++ b/src/lj_dispatch.h
@@ -44,7 +44,7 @@ extern double __divdf3(double a, double b);
 #define GOTDEF(_) \
   _(floor) _(ceil) _(trunc) _(log) _(log10) _(exp) _(sin) _(cos) _(tan) \
   _(asin) _(acos) _(atan) _(sinh) _(cosh) _(tanh) _(frexp) _(modf) _(atan2) \
-  _(pow) _(fmod) _(ldexp) _(lj_vm_modi) \
+  _(lj_vm_pow) _(fmod) _(ldexp) _(lj_vm_modi) \
   _(lj_dispatch_call) _(lj_dispatch_ins) _(lj_dispatch_stitch) \
   _(lj_dispatch_profile) _(lj_err_throw) \
   _(lj_ffh_coroutine_wrap_err) _(lj_func_closeuv) _(lj_func_newL_gc) \
diff --git a/src/lj_ircall.h b/src/lj_ircall.h
index af064a6f..ac0888a0 100644
--- a/src/lj_ircall.h
+++ b/src/lj_ircall.h
@@ -195,7 +195,7 @@ typedef struct CCallInfo {
   _(ANY,	log,			1,   N, NUM, XA_FP) \
   _(ANY,	lj_vm_log2,		1,   N, NUM, XA_FP) \
   _(ANY,	lj_vm_powi,		2,   N, NUM, XA_FP) \
-  _(ANY,	pow,			2,   N, NUM, XA2_FP) \
+  _(ANY,	lj_vm_pow,		2,   N, NUM, XA2_FP) \
   _(ANY,	atan2,			2,   N, NUM, XA2_FP) \
   _(ANY,	ldexp,			2,   N, NUM, XA_FP) \
   _(SOFTFP,	lj_vm_tobit,		1,   N, INT, XA_FP32) \
diff --git a/src/lj_opt_fold.c b/src/lj_opt_fold.c
index 0007107b..7d7cc9d1 100644
--- a/src/lj_opt_fold.c
+++ b/src/lj_opt_fold.c
@@ -1114,33 +1114,6 @@ LJFOLDF(simplify_numpow_xkint)
   return ref;
 }
 
-LJFOLD(POW any KNUM)
-LJFOLDF(simplify_numpow_xknum)
-{
-  if (knumright == 0.5)  /* x ^ 0.5 ==> sqrt(x) */
-    return emitir(IRTN(IR_FPMATH), fins->op1, IRFPM_SQRT);
-  return NEXTFOLD;
-}
-
-LJFOLD(POW KNUM any)
-LJFOLDF(simplify_numpow_kx)
-{
-  lua_Number n = knumleft;
-  if (n == 2.0 && irt_isint(fright->t)) {  /* 2.0 ^ i ==> ldexp(1.0, i) */
-#if LJ_TARGET_X86ORX64
-    /* Different IR_LDEXP calling convention on x86/x64 requires conversion. */
-    fins->o = IR_CONV;
-    fins->op1 = fins->op2;
-    fins->op2 = IRCONV_NUM_INT;
-    fins->op2 = (IRRef1)lj_opt_fold(J);
-#endif
-    fins->op1 = (IRRef1)lj_ir_knum_one(J);
-    fins->o = IR_LDEXP;
-    return RETRYFOLD;
-  }
-  return NEXTFOLD;
-}
-
 /* -- Simplify conversions ------------------------------------------------ */
 
 LJFOLD(CONV CONV IRCONV_NUM_INT)  /* _NUM */
diff --git a/src/lj_opt_narrow.c b/src/lj_opt_narrow.c
index 2cfb775b..d6601f4c 100644
--- a/src/lj_opt_narrow.c
+++ b/src/lj_opt_narrow.c
@@ -590,20 +590,14 @@ TRef lj_opt_narrow_pow(jit_State *J, TRef rb, TRef rc, TValue *vb, TValue *vc)
   rb = conv_str_tonum(J, rb, vb);
   rb = lj_ir_tonum(J, rb);  /* Left arg is always treated as an FP number. */
   rc = conv_str_tonum(J, rc, vc);
-  /* Narrowing must be unconditional to preserve (-x)^i semantics. */
   if (tvisint(vc) || numisint(numV(vc))) {
-    int checkrange = 0;
-    /* pow() is faster for bigger exponents. But do this only for (+k)^i. */
-    if (tref_isk(rb) && (int32_t)ir_knum(IR(tref_ref(rb)))->u32.hi >= 0) {
-      int32_t k = numberVint(vc);
-      if (!(k >= -65536 && k <= 65536)) goto force_pow_num;
-      checkrange = 1;
-    }
+    int32_t k = numberVint(vc);
+    if (!(k >= -65536 && k <= 65536)) goto force_pow_num;
     if (!tref_isinteger(rc)) {
       /* Guarded conversion to integer! */
       rc = emitir(IRTGI(IR_CONV), rc, IRCONV_INT_NUM|IRCONV_CHECK);
     }
-    if (checkrange && !tref_isk(rc)) {  /* Range guard: -65536 <= i <= 65536 */
+    if (!tref_isk(rc)) {  /* Range guard: -65536 <= i <= 65536 */
       TRef tmp = emitir(IRTI(IR_ADD), rc, lj_ir_kint(J, 65536));
       emitir(IRTGI(IR_ULE), tmp, lj_ir_kint(J, 2*65536));
     }
diff --git a/src/lj_vm.h b/src/lj_vm.h
index abaa7c52..f6f28a08 100644
--- a/src/lj_vm.h
+++ b/src/lj_vm.h
@@ -82,10 +82,6 @@ LJ_ASMF int32_t LJ_FASTCALL lj_vm_modi(int32_t, int32_t);
 LJ_ASMF void lj_vm_floor_sse(void);
 LJ_ASMF void lj_vm_ceil_sse(void);
 LJ_ASMF void lj_vm_trunc_sse(void);
-LJ_ASMF void lj_vm_powi_sse(void);
-#define lj_vm_powi	NULL
-#else
-LJ_ASMF double lj_vm_powi(double, int32_t);
 #endif
 #if LJ_TARGET_PPC || LJ_TARGET_ARM64
 #define lj_vm_trunc	trunc
@@ -100,6 +96,9 @@ LJ_ASMF int lj_vm_errno(void);
 #endif
 #endif
 
+LJ_ASMF double lj_vm_powi(double, int32_t);
+LJ_ASMF double lj_vm_pow(double, double);
+
 /* Continuations for metamethods. */
 LJ_ASMF void lj_cont_cat(void);  /* Continue with concatenation. */
 LJ_ASMF void lj_cont_ra(void);  /* Store result in RA from instruction. */
diff --git a/src/lj_vmmath.c b/src/lj_vmmath.c
index 14e66687..539f955b 100644
--- a/src/lj_vmmath.c
+++ b/src/lj_vmmath.c
@@ -30,11 +30,51 @@ LJ_FUNCA double lj_wrap_sinh(double x) { return sinh(x); }
 LJ_FUNCA double lj_wrap_cosh(double x) { return cosh(x); }
 LJ_FUNCA double lj_wrap_tanh(double x) { return tanh(x); }
 LJ_FUNCA double lj_wrap_atan2(double x, double y) { return atan2(x, y); }
-LJ_FUNCA double lj_wrap_pow(double x, double y) { return pow(x, y); }
 LJ_FUNCA double lj_wrap_fmod(double x, double y) { return fmod(x, y); }
 #endif
 
-/* -- Helper functions for generated machine code ------------------------- */
+/* -- Helper functions ---------------------------------------------------- */
+
+/* Unsigned x^k. */
+static double lj_vm_powui(double x, uint32_t k)
+{
+  double y;
+  lj_assertX(k != 0, "pow with zero exponent");
+  for (; (k & 1) == 0; k >>= 1) x *= x;
+  y = x;
+  if ((k >>= 1) != 0) {
+    for (;;) {
+      x *= x;
+      if (k == 1) break;
+      if (k & 1) y *= x;
+      k >>= 1;
+    }
+    y *= x;
+  }
+  return y;
+}
+
+/* Signed x^k. */
+double lj_vm_powi(double x, int32_t k)
+{
+  if (k > 1)
+    return lj_vm_powui(x, (uint32_t)k);
+  else if (k == 1)
+    return x;
+  else if (k == 0)
+    return 1.0;
+  else
+    return 1.0 / lj_vm_powui(x, (uint32_t)-k);
+}
+
+double lj_vm_pow(double x, double y)
+{
+  int32_t k = lj_num2int(y);
+  if ((k >= -65536 && k <= 65536) && y == (double)k)
+    return lj_vm_powi(x, k);
+  else
+    return pow(x, y);
+}
 
 double lj_vm_foldarith(double x, double y, int op)
 {
@@ -44,7 +84,7 @@ double lj_vm_foldarith(double x, double y, int op)
   case IR_MUL - IR_ADD: return x*y; break;
   case IR_DIV - IR_ADD: return x/y; break;
   case IR_MOD - IR_ADD: return x-lj_vm_floor(x/y)*y; break;
-  case IR_POW - IR_ADD: return pow(x, y); break;
+  case IR_POW - IR_ADD: return lj_vm_pow(x, y); break;
   case IR_NEG - IR_ADD: return -x; break;
   case IR_ABS - IR_ADD: return fabs(x); break;
 #if LJ_HASJIT
@@ -56,6 +96,8 @@ double lj_vm_foldarith(double x, double y, int op)
   }
 }
 
+/* -- Helper functions for generated machine code ------------------------- */
+
 #if (LJ_HASJIT && !(LJ_TARGET_ARM || LJ_TARGET_ARM64 || LJ_TARGET_PPC)) || LJ_TARGET_MIPS
 int32_t LJ_FASTCALL lj_vm_modi(int32_t a, int32_t b)
 {
@@ -80,40 +122,6 @@ double lj_vm_log2(double a)
 }
 #endif
 
-#if !LJ_TARGET_X86ORX64
-/* Unsigned x^k. */
-static double lj_vm_powui(double x, uint32_t k)
-{
-  double y;
-  lj_assertX(k != 0, "pow with zero exponent");
-  for (; (k & 1) == 0; k >>= 1) x *= x;
-  y = x;
-  if ((k >>= 1) != 0) {
-    for (;;) {
-      x *= x;
-      if (k == 1) break;
-      if (k & 1) y *= x;
-      k >>= 1;
-    }
-    y *= x;
-  }
-  return y;
-}
-
-/* Signed x^k. */
-double lj_vm_powi(double x, int32_t k)
-{
-  if (k > 1)
-    return lj_vm_powui(x, (uint32_t)k);
-  else if (k == 1)
-    return x;
-  else if (k == 0)
-    return 1.0;
-  else
-    return 1.0 / lj_vm_powui(x, (uint32_t)-k);
-}
-#endif
-
 /* Computes fpm(x) for extended math functions. */
 double lj_vm_foldfpm(double x, int fpm)
 {
diff --git a/src/vm_arm.dasc b/src/vm_arm.dasc
index 767d31f9..792f0363 100644
--- a/src/vm_arm.dasc
+++ b/src/vm_arm.dasc
@@ -1485,11 +1485,11 @@ static void build_subroutines(BuildCtx *ctx)
   |.endif
   |.endmacro
   |
-  |.macro math_extern2, func
+  |.macro math_extern2, name, func
   |.if HFABI
-  |  .ffunc_dd math_ .. func
+  |  .ffunc_dd math_ .. name
   |.else
-  |  .ffunc_nn math_ .. func
+  |  .ffunc_nn math_ .. name
   |.endif
   |  .IOS mov RA, BASE
   |  bl extern func
@@ -1500,6 +1500,9 @@ static void build_subroutines(BuildCtx *ctx)
   |  b ->fff_restv
   |.endif
   |.endmacro
+  |.macro math_extern2, func
+  |  math_extern2 func, func
+  |.endmacro
   |
   |.if FPU
   |  .ffunc_d math_sqrt
@@ -1545,7 +1548,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  math_extern sinh
   |  math_extern cosh
   |  math_extern tanh
-  |  math_extern2 pow
+  |  math_extern2 pow, lj_vm_pow
   |  math_extern2 atan2
   |  math_extern2 fmod
   |
@@ -3153,7 +3156,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     break;
   case BC_POW:
     |  // NYI: (partial) integer arithmetic.
-    |  ins_arithfp extern, extern pow
+    |  ins_arithfp extern, extern lj_vm_pow
     break;
 
   case BC_CAT:
diff --git a/src/vm_arm64.dasc b/src/vm_arm64.dasc
index de33bde4..fb267a76 100644
--- a/src/vm_arm64.dasc
+++ b/src/vm_arm64.dasc
@@ -1391,11 +1391,14 @@ static void build_subroutines(BuildCtx *ctx)
   |  b ->fff_resn
   |.endmacro
   |
-  |.macro math_extern2, func
-  |  .ffunc_nn math_ .. func
+  |.macro math_extern2, name, func
+  |  .ffunc_nn math_ .. name
   |  bl extern func
   |  b ->fff_resn
   |.endmacro
+  |.macro math_extern2, func
+  |  math_extern2 func, func
+  |.endmacro
   |
   |.ffunc_n math_sqrt
   |  fsqrt d0, d0
@@ -1424,7 +1427,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  math_extern sinh
   |  math_extern cosh
   |  math_extern tanh
-  |  math_extern2 pow
+  |  math_extern2 pow, lj_vm_pow
   |  math_extern2 atan2
   |  math_extern2 fmod
   |
@@ -2621,7 +2624,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  ins_arithload FARG1, FARG2
     |  ins_arithfallback ins_arithcheck_num
     |.if "fpins" == "fpow"
-    |  bl extern pow
+    |  bl extern lj_vm_pow
     |.else
     |  fpins FARG1, FARG1, FARG2
     |.endif
diff --git a/src/vm_mips.dasc b/src/vm_mips.dasc
index 32caabf7..5664f503 100644
--- a/src/vm_mips.dasc
+++ b/src/vm_mips.dasc
@@ -1631,14 +1631,17 @@ static void build_subroutines(BuildCtx *ctx)
   |.  nop
   |.endmacro
   |
-  |.macro math_extern2, func
-  |  .ffunc_nn math_ .. func
+  |.macro math_extern2, name, func
+  |  .ffunc_nn math_ .. name
   |.  load_got func
   |  call_extern
   |.  nop
   |  b ->fff_resn
   |.  nop
   |.endmacro
+  |.macro math_extern2, func
+  |  math_extern2 func, func
+  |.endmacro
   |
   |// TODO: Return integer type if result is integer (own sf implementation).
   |.macro math_round, func
@@ -1692,7 +1695,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  math_extern sinh
   |  math_extern cosh
   |  math_extern tanh
-  |  math_extern2 pow
+  |  math_extern2 pow, lj_vm_pow
   |  math_extern2 atan2
   |  math_extern2 fmod
   |
@@ -3585,7 +3588,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  sltiu AT, SFARG1HI, LJ_TISNUM
     |  sltiu TMP0, SFARG2HI, LJ_TISNUM
     |  and AT, AT, TMP0
-    |  load_got pow
+    |  load_got lj_vm_pow
     |  beqz AT, ->vmeta_arith
     |.  addu RA, BASE, RA
     |.if FPU
diff --git a/src/vm_mips64.dasc b/src/vm_mips64.dasc
index 44fba36c..249605d4 100644
--- a/src/vm_mips64.dasc
+++ b/src/vm_mips64.dasc
@@ -1669,14 +1669,17 @@ static void build_subroutines(BuildCtx *ctx)
   |.  nop
   |.endmacro
   |
-  |.macro math_extern2, func
-  |  .ffunc_nn math_ .. func
+  |.macro math_extern2, name, func
+  |  .ffunc_nn math_ .. name
   |.  load_got func
   |  call_extern
   |.  nop
   |  b ->fff_resn
   |.  nop
   |.endmacro
+  |.macro math_extern2, func
+  |  math_extern2 func, func
+  |.endmacro
   |
   |// TODO: Return integer type if result is integer (own sf implementation).
   |.macro math_round, func
@@ -1730,7 +1733,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  math_extern sinh
   |  math_extern cosh
   |  math_extern tanh
-  |  math_extern2 pow
+  |  math_extern2 pow, lj_vm_pow
   |  math_extern2 atan2
   |  math_extern2 fmod
   |
@@ -3823,7 +3826,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  sltiu TMP0, TMP0, LJ_TISNUM
     |   sltiu TMP1, TMP1, LJ_TISNUM
     |  and AT, TMP0, TMP1
-    |  load_got pow
+    |  load_got lj_vm_pow
     |  beqz AT, ->vmeta_arith
     |.  daddu RA, BASE, RA
     |.if FPU
diff --git a/src/vm_ppc.dasc b/src/vm_ppc.dasc
index 980ad897..94af63e6 100644
--- a/src/vm_ppc.dasc
+++ b/src/vm_ppc.dasc
@@ -2032,11 +2032,14 @@ static void build_subroutines(BuildCtx *ctx)
   |  b ->fff_resn
   |.endmacro
   |
-  |.macro math_extern2, func
-  |  .ffunc_nn math_ .. func
+  |.macro math_extern2, name, func
+  |  .ffunc_nn math_ .. name
   |  blex func
   |  b ->fff_resn
   |.endmacro
+  |.macro math_extern2, func
+  |  math_extern2 func, func
+  |.endmacro
   |
   |.macro math_round, func
   |  .ffunc_1 math_ .. func
@@ -2161,7 +2164,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  math_extern sinh
   |  math_extern cosh
   |  math_extern tanh
-  |  math_extern2 pow
+  |  math_extern2 pow, lj_vm_pow
   |  math_extern2 atan2
   |  math_extern2 fmod
   |
@@ -4154,7 +4157,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  checknum cr1, CARG3
     |  crand 4*cr0+lt, 4*cr0+lt, 4*cr1+lt
     |  bge ->vmeta_arith_vv
-    |  blex pow
+    |  blex lj_vm_pow
     |  ins_next1
     |.if FPU
     |  stfdx FARG1, BASE, RA
diff --git a/src/vm_x64.dasc b/src/vm_x64.dasc
index 7b04b928..acbe8dc2 100644
--- a/src/vm_x64.dasc
+++ b/src/vm_x64.dasc
@@ -1825,13 +1825,16 @@ static void build_subroutines(BuildCtx *ctx)
   |  jmp ->fff_resxmm0
   |.endmacro
   |
-  |.macro math_extern2, func
-  |  .ffunc_nn math_ .. func
+  |.macro math_extern2, name, func
+  |  .ffunc_nn math_ .. name
   |  mov RB, BASE
   |  call extern func
   |  mov BASE, RB
   |  jmp ->fff_resxmm0
   |.endmacro
+  |.macro math_extern2, func
+  |  math_extern2 func, func
+  |.endmacro
   |
   |  math_extern log10
   |  math_extern exp
@@ -1844,7 +1847,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  math_extern sinh
   |  math_extern cosh
   |  math_extern tanh
-  |  math_extern2 pow
+  |  math_extern2 pow, lj_vm_pow
   |  math_extern2 atan2
   |  math_extern2 fmod
   |
@@ -2649,41 +2652,6 @@ static void build_subroutines(BuildCtx *ctx)
   |  subsd xmm0, xmm1
   |  ret
   |
-  |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
-  |->vm_powi_sse:
-  |  cmp eax, 1; jle >6			// i<=1?
-  |  // Now 1 < (unsigned)i <= 0x80000000.
-  |1:  // Handle leading zeros.
-  |  test eax, 1; jnz >2
-  |  mulsd xmm0, xmm0
-  |  shr eax, 1
-  |  jmp <1
-  |2:
-  |  shr eax, 1; jz >5
-  |  movaps xmm1, xmm0
-  |3:  // Handle trailing bits.
-  |  mulsd xmm0, xmm0
-  |  shr eax, 1; jz >4
-  |  jnc <3
-  |  mulsd xmm1, xmm0
-  |  jmp <3
-  |4:
-  |  mulsd xmm0, xmm1
-  |5:
-  |  ret
-  |6:
-  |  je <5				// x^1 ==> x
-  |  jb >7				// x^0 ==> 1
-  |  neg eax
-  |  call <1
-  |  sseconst_1 xmm1, RD
-  |  divsd xmm1, xmm0
-  |  movaps xmm0, xmm1
-  |  ret
-  |7:
-  |  sseconst_1 xmm0, RD
-  |  ret
-  |
   |//-----------------------------------------------------------------------
   |//-- Miscellaneous functions --------------------------------------------
   |//-----------------------------------------------------------------------
diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc
index bd1e940e..bf30cce6 100644
--- a/src/vm_x86.dasc
+++ b/src/vm_x86.dasc
@@ -2240,8 +2240,8 @@ static void build_subroutines(BuildCtx *ctx)
   |  jmp ->fff_resfp
   |.endmacro
   |
-  |.macro math_extern2, func
-  |  .ffunc_nnsse math_ .. func
+  |.macro math_extern2, name, func
+  |  .ffunc_nnsse math_ .. name
   |.if not X64
   |  movsd FPARG1, xmm0
   |  movsd FPARG3, xmm1
@@ -2251,6 +2251,9 @@ static void build_subroutines(BuildCtx *ctx)
   |  mov BASE, RB
   |  jmp ->fff_resfp
   |.endmacro
+  |.macro math_extern2, func
+  |  math_extern2 func, func
+  |.endmacro
   |
   |  math_extern log10
   |  math_extern exp
@@ -2263,7 +2266,7 @@ static void build_subroutines(BuildCtx *ctx)
   |  math_extern sinh
   |  math_extern cosh
   |  math_extern tanh
-  |  math_extern2 pow
+  |  math_extern2 pow, lj_vm_pow
   |  math_extern2 atan2
   |  math_extern2 fmod
   |
@@ -3140,41 +3143,6 @@ static void build_subroutines(BuildCtx *ctx)
   |  subsd xmm0, xmm1
   |  ret
   |
-  |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
-  |->vm_powi_sse:
-  |  cmp eax, 1; jle >6			// i<=1?
-  |  // Now 1 < (unsigned)i <= 0x80000000.
-  |1:  // Handle leading zeros.
-  |  test eax, 1; jnz >2
-  |  mulsd xmm0, xmm0
-  |  shr eax, 1
-  |  jmp <1
-  |2:
-  |  shr eax, 1; jz >5
-  |  movaps xmm1, xmm0
-  |3:  // Handle trailing bits.
-  |  mulsd xmm0, xmm0
-  |  shr eax, 1; jz >4
-  |  jnc <3
-  |  mulsd xmm1, xmm0
-  |  jmp <3
-  |4:
-  |  mulsd xmm0, xmm1
-  |5:
-  |  ret
-  |6:
-  |  je <5				// x^1 ==> x
-  |  jb >7				// x^0 ==> 1
-  |  neg eax
-  |  call <1
-  |  sseconst_1 xmm1, RDa
-  |  divsd xmm1, xmm0
-  |  movaps xmm0, xmm1
-  |  ret
-  |7:
-  |  sseconst_1 xmm0, RDa
-  |  ret
-  |
   |//-----------------------------------------------------------------------
   |//-- Miscellaneous functions --------------------------------------------
   |//-----------------------------------------------------------------------
@@ -3976,7 +3944,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
     |  movsd FPARG1, xmm0
     |  movsd FPARG3, xmm1
     |.endif
-    |  call extern pow
+    |  call extern lj_vm_pow
     |  movzx RA, PC_RA
     |  mov BASE, RB
     |.if X64
diff --git a/test/tarantool-tests/lj-684-pow-inconsistencies.test.lua b/test/tarantool-tests/lj-684-pow-inconsistencies.test.lua
new file mode 100644
index 00000000..5129fc45
--- /dev/null
+++ b/test/tarantool-tests/lj-684-pow-inconsistencies.test.lua
@@ -0,0 +1,89 @@
+local tap = require('tap')
+-- Test to demonstrate the incorrect JIT behaviour for different
+-- power operation optimizations.
+-- See also:
+-- https://github.com/LuaJIT/LuaJIT/issues/684.
+local test = tap.test('lj-684-pow-inconsistencies'):skipcond({
+  ['Test requires JIT enabled'] = not jit.status(),
+})
+
+local tostring = tostring
+
+test:plan(4)
+
+jit.opt.start('hotloop=1')
+
+-- XXX: Prevent hotcount side effects.
+jit.off()
+jit.flush()
+
+local res = {}
+-- -0 ^ 0.5 = 0. Test sign with `tostring()`.
+-- XXX: use local variable to prevent folding via parser.
+-- XXX: use stack slot out of trace to prevent constant folding.
+local minus_zero = -0
+jit.on()
+for i = 1, 4 do
+  res[i] = tostring(minus_zero ^ 0.5)
+end
+
+-- XXX: Prevent hotcount side effects.
+jit.off()
+jit.flush()
+
+test:samevalues(res, ('consistent results for folding (-0) ^ 0.5'))
+
+jit.on()
+-- -inf ^ 0.5 = inf.
+res = {}
+local minus_inf = -math.huge
+jit.on()
+for i = 1, 4 do
+  res[i] = minus_inf ^ 0.5
+end
+
+-- XXX: Prevent hotcount side effects.
+jit.off()
+jit.flush()
+
+test:samevalues(res, ('consistent results for folding (-inf) ^ 0.5'))
+
+-- 2921 ^ 0.5 = 0x1.b05ec632536fap+5.
+res = {}
+-- XXX: use local variable to prevent folding via parser.
+-- XXX: use stack slot out of trace to prevent constant folding.
+local corner_case_05 = 2921
+jit.on()
+for i = 1, 4 do
+  res[i] = corner_case_05 ^ 0.5
+end
+
+-- XXX: Prevent hotcount side effects.
+jit.off()
+jit.flush()
+
+test:samevalues(res, ('consistent results for folding 2921 ^ 0.5'))
+
+-- Narrowing for non-constant base of power operation.
+local function pow(base, power)
+  return base ^ power
+end
+
+jit.on()
+
+-- Compile function first.
+pow(1, 2)
+pow(1, 2)
+
+-- Need some value near 1, to avoid infinite result.
+local base = 1.0000000001
+local power = 65536 * 3
+local resulting_value = pow(base, power)
+
+-- XXX: Prevent hotcount side effects.
+jit.off()
+jit.flush()
+
+test:is(resulting_value, base ^ power, 'guard for narrowing of power operation')
+
+test:done(true)
diff --git a/test/tarantool-tests/lj-9-pow-inconsistencies.test.lua b/test/tarantool-tests/lj-9-pow-inconsistencies.test.lua
index 21b3a0d9..1f7f65c5 100644
--- a/test/tarantool-tests/lj-9-pow-inconsistencies.test.lua
+++ b/test/tarantool-tests/lj-9-pow-inconsistencies.test.lua
@@ -16,6 +16,8 @@ local INTERESTING_VALUES = {
   -- x ^  inf = 0 (inf), if |x| < 1 (|x| > 1).
   -- x ^ -inf = inf (0), if |x| < 1 (|x| > 1).
   0.999999, 1.000001, -0.999999, -1.000001,
+  -- Test power of even numbers optimizations.
+  2, -2, 0.5, -0.5,
 }
 test:plan(1 + (#INTERESTING_VALUES) ^ 2)
 
-- 
2.41.0