Tarantool development patches archive
 help / color / mirror / Atom feed
* [tarantool-patches] [PATCH 1/1] app: exit gracefully when a main script throws an error
@ 2019-09-10 22:00 Vladislav Shpilevoy
  2019-10-24  0:21 ` [Tarantool-patches] " Alexander Turenko
  0 siblings, 1 reply; 4+ messages in thread
From: Vladislav Shpilevoy @ 2019-09-10 22:00 UTC (permalink / raw)
  To: tarantool-patches; +Cc: alexander.turenko

Code to run main script (passed via command line args, or
interactive console) has a footer where it notifies systemd,
logs a happened error, and panics.

Before the patch that code was unreachable in case of any
exception in a main script, because panic happened earlier. Now a
happened exception is correctly carried to the footer with proper
error processing.

A first and obvious solution was replace all panics with diag_set
and use fiber_join on the script runner fiber. But appeared, that
the fiber running a main script can't be joined. This is because
normally it exits via os.exit() which never returns and therefore
its caller never dies = can't be joined.

The patch solves this problem by passing main fiber diag to the
script runner by pointer, eliminating fiber_join necessity.

Closes #4382
---
Branch: https://github.com/tarantool/tarantool/tree/gerold103/gh-4382-init-script-fail-and-sysd
Issue: https://github.com/tarantool/tarantool/issues/4382

 src/lua/init.c                  | 64 ++++++++++++++++++++-------------
 src/lua/init.h                  |  6 +++-
 src/main.cc                     |  5 +--
 test/app-tap/fail_main.result   |  3 ++
 test/app-tap/fail_main.test.lua | 39 ++++++++++++++++++++
 5 files changed, 89 insertions(+), 28 deletions(-)
 create mode 100644 test/app-tap/fail_main.result
 create mode 100755 test/app-tap/fail_main.test.lua

diff --git a/src/lua/init.c b/src/lua/init.c
index 1641f3d82..097dd8495 100644
--- a/src/lua/init.c
+++ b/src/lua/init.c
@@ -534,19 +534,17 @@ tarantool_lua_slab_cache()
 /**
  * Push argument and call a function on the top of Lua stack
  */
-static void
+static int
 lua_main(lua_State *L, int argc, char **argv)
 {
 	assert(lua_isfunction(L, -1));
 	lua_checkstack(L, argc - 1);
 	for (int i = 1; i < argc; i++)
 		lua_pushstring(L, argv[i]);
-	if (luaT_call(L, lua_gettop(L) - 1, 0) != 0) {
-		struct error *e = diag_last_error(&fiber()->diag);
-		panic("%s", e->errmsg);
-	}
+	int rc = luaT_call(L, lua_gettop(L) - 1, 0);
 	/* clear the stack from return values. */
 	lua_settop(L, 0);
+	return rc;
 }
 
 /**
@@ -562,7 +560,13 @@ run_script_f(va_list ap)
 	char **optv = va_arg(ap, char **);
 	int argc = va_arg(ap, int);
 	char **argv = va_arg(ap, char **);
-	struct diag *diag = &fiber()->diag;
+	/*
+	 * An error is returned via an external diag. A caller
+	 * can't use fiber_join(), because the script can call
+	 * os.exit(). That call makes the script runner fiber
+	 * never really dead. It never returns from its function.
+	 */
+	struct diag *diag = va_arg(ap, struct diag *);
 
 	/*
 	 * Load libraries and execute chunks passed by -l and -e
@@ -577,10 +581,8 @@ run_script_f(va_list ap)
 			 */
 			lua_getglobal(L, "require");
 			lua_pushstring(L, optv[i + 1]);
-			if (luaT_call(L, 1, 1) != 0) {
-				struct error *e = diag_last_error(diag);
-				panic("%s", e->errmsg);
-			}
+			if (luaT_call(L, 1, 1) != 0)
+				goto error;
 			/* Non-standard: set name = require('name') */
 			lua_setglobal(L, optv[i + 1]);
 			lua_settop(L, 0);
@@ -590,13 +592,10 @@ run_script_f(va_list ap)
 			 * Execute chunk
 			 */
 			if (luaL_loadbuffer(L, optv[i + 1], strlen(optv[i + 1]),
-					    "=(command line)") != 0) {
-				panic("%s", lua_tostring(L, -1));
-			}
-			if (luaT_call(L, 0, 0) != 0) {
-				struct error *e = diag_last_error(diag);
-				panic("%s", e->errmsg);
-			}
+					    "=(command line)") != 0)
+				goto luajit_error;
+			if (luaT_call(L, 0, 0) != 0)
+				goto error;
 			lua_settop(L, 0);
 			break;
 		default:
@@ -614,13 +613,15 @@ run_script_f(va_list ap)
 	if (path && strcmp(path, "-") != 0 && access(path, F_OK) == 0) {
 		/* Execute script. */
 		if (luaL_loadfile(L, path) != 0)
-			panic("%s", lua_tostring(L, -1));
-		lua_main(L, argc, argv);
+			goto luajit_error;
+		if (lua_main(L, argc, argv) != 0)
+			goto error;
 	} else if (!isatty(STDIN_FILENO) || (path && strcmp(path, "-") == 0)) {
 		/* Execute stdin */
 		if (luaL_loadfile(L, NULL) != 0)
-			panic("%s", lua_tostring(L, -1));
-		lua_main(L, argc, argv);
+			goto luajit_error;
+		if (lua_main(L, argc, argv) != 0)
+			goto error;
 	} else {
 		interactive = true;
 	}
@@ -639,18 +640,25 @@ run_script_f(va_list ap)
 		lua_remove(L, -2); /* remove package.loaded.console */
 		lua_remove(L, -2); /* remove package.loaded */
 		start_loop = false;
-		lua_main(L, argc, argv);
+		if (lua_main(L, argc, argv) != 0)
+			goto error;
 	}
-
 	/*
 	 * Lua script finished. Stop the auxiliary event loop and
 	 * return control back to tarantool_lua_run_script.
 	 */
+end:
 	ev_break(loop(), EVBREAK_ALL);
 	return 0;
+
+luajit_error:
+	diag_set(LuajitError, lua_tostring(L, -1));
+error:
+	diag_move(diag_get(), diag);
+	goto end;
 }
 
-void
+int
 tarantool_lua_run_script(char *path, bool interactive,
 			 int optc, char **optv, int argc, char **argv)
 {
@@ -668,7 +676,7 @@ tarantool_lua_run_script(char *path, bool interactive,
 		panic("%s", diag_last_error(diag_get())->errmsg);
 	script_fiber->storage.lua.stack = tarantool_L;
 	fiber_start(script_fiber, tarantool_L, path, interactive,
-		    optc, optv, argc, argv);
+		    optc, optv, argc, argv, diag_get());
 
 	/*
 	 * Run an auxiliary event loop to re-schedule run_script fiber.
@@ -678,6 +686,12 @@ tarantool_lua_run_script(char *path, bool interactive,
 		ev_run(loop(), 0);
 	/* The fiber running the startup script has ended. */
 	script_fiber = NULL;
+	/*
+	 * Result can't be obtained via fiber_join - script fiber
+	 * never dies if os.exit() was called. This is why diag
+	 * is checked explicitly.
+	 */
+	return diag_is_empty(diag_get()) ? 0 : -1;
 }
 
 void
diff --git a/src/lua/init.h b/src/lua/init.h
index 1257ef510..507360738 100644
--- a/src/lua/init.h
+++ b/src/lua/init.h
@@ -65,8 +65,12 @@ tarantool_lua_free();
  * @param interactive force interactive mode
  * @param argc argc the number of command line arguments
  * @param argv argv command line arguments
+ *
+ * @retval 0 The script is successfully finished.
+ * @retval -1 Error during the script execution. Diagnostics area
+ *        error is set.
  */
-void
+int
 tarantool_lua_run_script(char *path, bool force_interactive,
 			 int optc, char **optv,
 			 int argc, char **argv);
diff --git a/src/main.cc b/src/main.cc
index b16cfa5fe..6ccb2a47f 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -845,8 +845,9 @@ main(int argc, char **argv)
 		 * is why script must run only after the server was fully
 		 * initialized.
 		 */
-		tarantool_lua_run_script(script, interactive, optc, optv,
-					 main_argc, main_argv);
+		if (tarantool_lua_run_script(script, interactive, optc, optv,
+					     main_argc, main_argv) != 0)
+			diag_raise();
 		/*
 		 * Start event loop after executing Lua script if signal_cb()
 		 * wasn't triggered and there is some new events. Initial value
diff --git a/test/app-tap/fail_main.result b/test/app-tap/fail_main.result
new file mode 100644
index 000000000..09d9f2b20
--- /dev/null
+++ b/test/app-tap/fail_main.result
@@ -0,0 +1,3 @@
+TAP version 13
+1..1
+ok - main script error is handled gracefully
diff --git a/test/app-tap/fail_main.test.lua b/test/app-tap/fail_main.test.lua
new file mode 100755
index 000000000..f8c45bf6f
--- /dev/null
+++ b/test/app-tap/fail_main.test.lua
@@ -0,0 +1,39 @@
+#!/usr/bin/env tarantool
+
+local tap = require('tap')
+local test = tap.test("fail_main")
+local fio = require('fio')
+local tarantool_bin = arg[-1]
+
+test:plan(1)
+
+function run_script(code)
+    local dir = fio.tempdir()
+    local script_path = fio.pathjoin(dir, 'script.lua')
+    local script = fio.open(script_path, {'O_CREAT', 'O_WRONLY', 'O_APPEND'},
+        tonumber('0777', 8))
+    script:write(code)
+    script:close()
+    local output_file = fio.pathjoin(fio.cwd(), 'out.txt')
+    local cmd = [[/bin/sh -c 'cd "%s" && "%s" ./script.lua 0> %s 2> %s']]
+    local code = os.execute(
+        string.format(cmd, dir, tarantool_bin, output_file, output_file)
+    )
+    fio.rmtree(dir)
+    local out_fd = fio.open(output_file, {'O_RDONLY'})
+    local output = out_fd:read(100000)
+    out_fd:close()
+    return code, output
+end
+
+--
+-- gh-4382: an error in main script should be handled gracefully,
+-- with proper logging.
+--
+local code, output = run_script("error('Error in the main script')")
+
+test:ok(output:match("fatal error, exiting the event loop"),
+        "main script error is handled gracefully")
+
+test:check()
+os.exit(0)
-- 
2.20.1 (Apple Git-117)

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [Tarantool-patches] [PATCH 1/1] app: exit gracefully when a main script throws an error
  2019-09-10 22:00 [tarantool-patches] [PATCH 1/1] app: exit gracefully when a main script throws an error Vladislav Shpilevoy
@ 2019-10-24  0:21 ` Alexander Turenko
  2019-10-24 19:53   ` [Tarantool-patches] [tarantool-patches] " Vladislav Shpilevoy
  0 siblings, 1 reply; 4+ messages in thread
From: Alexander Turenko @ 2019-10-24  0:21 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches, tarantool-patches

I know, the patch was already pushed, but I looked on it briefly anyway.

It is LGTM, however I have one question: see below.

WBR, Alexander Turenko.

> @@ -678,6 +686,12 @@ tarantool_lua_run_script(char *path, bool interactive,
>  		ev_run(loop(), 0);
>  	/* The fiber running the startup script has ended. */
>  	script_fiber = NULL;
> +	/*
> +	 * Result can't be obtained via fiber_join - script fiber
> +	 * never dies if os.exit() was called. This is why diag
> +	 * is checked explicitly.
> +	 */
> +	return diag_is_empty(diag_get()) ? 0 : -1;

This is the only part I'm a bit tentative: can the diagnostic area be
populated by a user somehow? I tried to do--

 | fio.open('non_existent', {'O_RDONLY'})
 | os.exit()

--in the main script and it seems that it uses some other diagnostic
area, because the fio error was not reported as the main script fail.

So, things seems to work good, but I didn't got how exactly.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [Tarantool-patches] [tarantool-patches] Re: [PATCH 1/1] app: exit gracefully when a main script throws an error
  2019-10-24  0:21 ` [Tarantool-patches] " Alexander Turenko
@ 2019-10-24 19:53   ` Vladislav Shpilevoy
  2019-10-24 23:04     ` Alexander Turenko
  0 siblings, 1 reply; 4+ messages in thread
From: Vladislav Shpilevoy @ 2019-10-24 19:53 UTC (permalink / raw)
  To: Alexander Turenko; +Cc: tarantool-patches, tarantool-patches

Hi!

On 24/10/2019 02:21, Alexander Turenko wrote:
> I know, the patch was already pushed, but I looked on it briefly anyway.
> 

If something is wrong, it may be fixed as a follow up.

> It is LGTM, however I have one question: see below.
> 
> WBR, Alexander Turenko.
> 
>> @@ -678,6 +686,12 @@ tarantool_lua_run_script(char *path, bool interactive,
>>  		ev_run(loop(), 0);
>>  	/* The fiber running the startup script has ended. */
>>  	script_fiber = NULL;
>> +	/*
>> +	 * Result can't be obtained via fiber_join - script fiber
>> +	 * never dies if os.exit() was called. This is why diag
>> +	 * is checked explicitly.
>> +	 */
>> +	return diag_is_empty(diag_get()) ? 0 : -1;
> 
> This is the only part I'm a bit tentative: can the diagnostic area be
> populated by a user somehow? I tried to do--
> 
>  | fio.open('non_existent', {'O_RDONLY'})
>  | os.exit()
> 
> --in the main script and it seems that it uses some other diagnostic
> area, because the fio error was not reported as the main script fail.

Should it have been reported?

Talking of why it is not now - well, yes, it uses another diagnostics
area. tarantool_lua_run_script() starts a special fiber to run a
script or a console. And the error is forwarded to the main fiber
only when the script fiber does it voluntary, in the end of
run_script_f().

In case of os.exit() the script fiber never finishes, and therefore
never moves its diag to the main fiber. Os.exit() breaks event loop
and freezes the current fiber. It makes ev_run() return in
tarantool_lua_run_script(). After that we see that the diag is empty,
because the script fiber never got to the point of filling it.

> 
> So, things seems to work good, but I didn't got how exactly.
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [Tarantool-patches] [tarantool-patches] Re: [PATCH 1/1] app: exit gracefully when a main script throws an error
  2019-10-24 19:53   ` [Tarantool-patches] [tarantool-patches] " Vladislav Shpilevoy
@ 2019-10-24 23:04     ` Alexander Turenko
  0 siblings, 0 replies; 4+ messages in thread
From: Alexander Turenko @ 2019-10-24 23:04 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches, tarantool-patches

> >> @@ -678,6 +686,12 @@ tarantool_lua_run_script(char *path, bool interactive,
> >>  		ev_run(loop(), 0);
> >>  	/* The fiber running the startup script has ended. */
> >>  	script_fiber = NULL;
> >> +	/*
> >> +	 * Result can't be obtained via fiber_join - script fiber
> >> +	 * never dies if os.exit() was called. This is why diag
> >> +	 * is checked explicitly.
> >> +	 */
> >> +	return diag_is_empty(diag_get()) ? 0 : -1;
> > 
> > This is the only part I'm a bit tentative: can the diagnostic area be
> > populated by a user somehow? I tried to do--
> > 
> >  | fio.open('non_existent', {'O_RDONLY'})
> >  | os.exit()
> > 
> > --in the main script and it seems that it uses some other diagnostic
> > area, because the fio error was not reported as the main script fail.
> 
> Should it have been reported?

No, current behaviour looks right for me.

> Talking of why it is not now - well, yes, it uses another diagnostics
> area. tarantool_lua_run_script() starts a special fiber to run a
> script or a console. And the error is forwarded to the main fiber
> only when the script fiber does it voluntary, in the end of
> run_script_f().
> 
> In case of os.exit() the script fiber never finishes, and therefore
> never moves its diag to the main fiber. Os.exit() breaks event loop
> and freezes the current fiber. It makes ev_run() return in
> tarantool_lua_run_script(). After that we see that the diag is empty,
> because the script fiber never got to the point of filling it.

Now I understood the logic, thanks!

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2019-10-24 23:04 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-09-10 22:00 [tarantool-patches] [PATCH 1/1] app: exit gracefully when a main script throws an error Vladislav Shpilevoy
2019-10-24  0:21 ` [Tarantool-patches] " Alexander Turenko
2019-10-24 19:53   ` [Tarantool-patches] [tarantool-patches] " Vladislav Shpilevoy
2019-10-24 23:04     ` Alexander Turenko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox