[Tarantool-patches] [PATCH] fiber: abort trace recording on fiber yield

Igor Munkin imun at tarantool.org
Tue Mar 31 01:44:31 MSK 2020


Since Tarantool fibers doesn't respect Lua coroutine switch mechanism,
JIT machinery stays unnotified when one lua_State substitues another
one. As a result if trace recording hasn't been aborted prior to fiber
switch, the recording proceeds using the new lua_State and leads to
failure either on any compiler phase or on execution the compiled trace.

This changeset adds a mandatory on_yield trigger aborting trace
recording when fiber switches to another one.

Fixes #4491
Closes #1700

Signed-off-by: Igor Munkin <imun at tarantool.org>
---

This patch is based on the changes introduced within [1].

Branch: https://github.com/tarantool/tarantool/tree/imun/gh-1700-abort-recording-on-fiber-switch
Issue:
* https://github.com/tarantool/tarantool/issues/1700
* https://github.com/tarantool/tarantool/issues/4491

[1]: https://lists.tarantool.org/pipermail/tarantool-patches/2020-March/015212.html

 src/lua/fiber.c                               | 39 +++++++++++++-
 ...-4491-coio-wait-leads-to-segfault.test.lua | 53 +++++++++++++++++++
 2 files changed, 90 insertions(+), 2 deletions(-)
 create mode 100755 test/app-tap/gh-4491-coio-wait-leads-to-segfault.test.lua

diff --git a/src/lua/fiber.c b/src/lua/fiber.c
index 45bc03787..c79aa7c2b 100644
--- a/src/lua/fiber.c
+++ b/src/lua/fiber.c
@@ -38,6 +38,7 @@
 #include <lua.h>
 #include <lauxlib.h>
 #include <lualib.h>
+#include <luajit.h>
 
 void
 luaL_testcancel(struct lua_State *L)
@@ -454,6 +455,25 @@ lua_fiber_run_f(MAYBE_UNUSED va_list ap)
 	return result;
 }
 
+static int
+fiber_on_yield(struct trigger *trigger, void *event)
+{
+	(void) trigger;
+	(void) event;
+
+	/*
+	 * XXX: According to LuaJIT API reference luaJIT_setmode
+	 * function returns 0 on failure and 1 on success. Since
+	 * this call is aimed to abort the trace recording on
+	 * fiber_yield call and prevent already compiled traces
+	 * execution, the mode parameter is an invalid value (-1)
+	 * so JIT mode change is expected to fail and 0 is
+	 * returned. Otherwise non-zero return value signals
+	 * trigger_run routine about this trigger failure.
+	 */
+	return luaJIT_setmode(tarantool_L, 0, -1);
+}
+
 /**
  * Utility function for fiber.create and fiber.new
  */
@@ -467,10 +487,18 @@ fiber_create(struct lua_State *L)
 
 	struct fiber *f = fiber_new("lua", lua_fiber_run_f);
 	if (f == NULL) {
-		luaL_unref(L, LUA_REGISTRYINDEX, coro_ref);
-		luaT_error(L);
+		/* diagnostics is set in fiber_new. */
+		goto error;
 	}
 
+	struct trigger *t = malloc(sizeof(*t));
+	if (t == NULL) {
+		diag_set(OutOfMemory, sizeof(*t), "malloc", "t");
+		goto error;
+	}
+	trigger_create(t, fiber_on_yield, NULL, (trigger_f0) free);
+	trigger_add(&f->on_yield, t);
+
 	/* Move the arguments to the new coro */
 	lua_xmove(L, child_L, lua_gettop(L));
 	/* XXX: 'fiber' is leaked if this throws a Lua error. */
@@ -483,6 +511,13 @@ fiber_create(struct lua_State *L)
 	lua_pushinteger(child_L, coro_ref);
 	f->storage.lua.stack = child_L;
 	return f;
+
+error:
+	/* Release the anchored coroutine. */
+	luaL_unref(L, LUA_REGISTRYINDEX, coro_ref);
+	luaT_error(L);
+	unreachable();
+	return NULL;
 }
 
 /**
diff --git a/test/app-tap/gh-4491-coio-wait-leads-to-segfault.test.lua b/test/app-tap/gh-4491-coio-wait-leads-to-segfault.test.lua
new file mode 100755
index 000000000..0dd8dfbee
--- /dev/null
+++ b/test/app-tap/gh-4491-coio-wait-leads-to-segfault.test.lua
@@ -0,0 +1,53 @@
+#!/usr/bin/env tarantool
+
+local tap = require('tap')
+
+local test = tap.test('gh-4491-coio-wait-leads-to-segfault')
+
+-- Test file to demonstrate platform failure due to fiber switch
+-- while trace recording, details:
+--     https://github.com/tarantool/tarantool/issues/4491
+
+local fiber = require('fiber')
+local ffi = require('ffi')
+ffi.cdef('int coio_wait(int fd, int event, double timeout);')
+
+local cfg = {
+  hotloop = arg[1] or 1,
+  fibers = arg[1] or 2,
+  timeout = { put = 1, get = 1 },
+}
+
+test:plan(cfg.fibers + 1)
+
+local args = {
+  fd      = 1   , -- STDIN file descriptor
+  event   = 0x1 , -- COIO_READ event
+  timeout = 0.05, -- Timeout value
+}
+
+local function run(iterations, channel)
+  for _ = 1, iterations do
+    ffi.C.coio_wait(args.fd, args.event, args.timeout)
+  end
+  channel:put(true, cfg.timeout.put)
+end
+
+local channels = { }
+
+jit.opt.start('3', string.format('hotloop=%d', cfg.hotloop))
+
+for _ = 1, cfg.fibers do
+  channels[_] = fiber.channel(1)
+  fiber.new(run, cfg.hotloop + 1, channels[_])
+end
+
+-- Finalize the existing fibers
+for _ = 1, cfg.fibers do
+  test:ok(channels[_]:get(cfg.timeout.get),
+          string.format('fiber #%d successfully finished', _))
+end
+
+test:ok(true, 'trace is not recorded due to fiber switch underneath coio_wait')
+
+os.exit(test:check() and 0 or 1)
-- 
2.25.0



More information about the Tarantool-patches mailing list