[Tarantool-patches] [PATCH v5 4/4] crash: report crash data to the feedback server

Cyrill Gorcunov gorcunov at gmail.com
Wed Dec 23 18:41:55 MSK 2020


We have a feedback server which gathers information about a running instance.
While general info is enough for now, we may lose precious information about
crashes (such as the call backtrace which caused the issue, the type of build, etc.).

In this commit we add support for sending this kind of information to the feedback
server. Internally we gather the reason of the failure, pack it into base64 form
and then run another Tarantool instance which sends it out.

A typical report might look like

 | {
 |   "crashdump": {
 |     "version": "1",
 |     "data": {
 |       "uname": {
 |         "sysname": "Linux",
 |         "release": "5.9.14-100.fc32.x86_64",
 |         "version": "#1 SMP Fri Dec 11 14:30:38 UTC 2020",
 |         "machine": "x86_64"
 |       },
 |       "build": {
 |         "version": "2.7.0-115-g360565efb",
 |         "cmake_type": "Linux-x86_64-Debug"
 |       },
 |       "signal": {
 |         "signo": 11,
 |         "si_code": 0,
 |         "si_addr": "0x3e800004838",
 |         "backtrace": "IzAgIDB4NjMwNzM0IGluIGNyYXNoX2NvbGxlY3...",
 |         "timestamp": "2020-12-23 14:42:10 MSK"
 |       }
 |     }
 |   }
 | }

The `backtrace` itself is encoded as base64 because it contains newline symbols
(and may contain some non-ASCII symbols as well).

There is no simple way to test this so I did it manually:
1) Run instance with

	box.cfg{log_level = 8, feedback_host="127.0.0.1:1500"}

2) Run listener shell as

	while true ; do nc -l -p 1500 -c 'echo -e "HTTP/1.1 200 OK\n\n $(date)"'; done

3) Send SIGSEGV

	kill -11 `pidof tarantool`

Once SIGSEGV is delivered, the crashinfo data is generated and sent out. For
debugging purposes this data is also printed to the terminal at debug log level.

Closes #5261

Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>

@TarantoolBot document
Title: Configuration update, allow to disable sending crash information

For better analysis of program crashes the information associated with
the crash such as

 - utsname (similar to `uname -a` output except the network name)
 - build information
 - reason for a crash
 - call backtrace

is sent to the feedback server. To disable it set `feedback_crashinfo`
to `false`.
---
 src/box/box.cc                  |  18 ++
 src/box/box.h                   |   1 +
 src/box/lua/cfg.cc              |   9 +
 src/box/lua/load_cfg.lua        |   6 +-
 src/lib/core/CMakeLists.txt     |   2 +-
 src/lib/core/crash.c            | 308 ++++++++++++++++++++++++++++++++
 src/lib/core/crash.h            |  18 ++
 src/main.cc                     |   1 +
 test/app-tap/init_script.result |   1 +
 test/box/admin.result           |   2 +
 test/box/cfg.result             |   4 +
 11 files changed, 368 insertions(+), 2 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index a8bc3471d..440bfa305 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -74,6 +74,7 @@
 #include "sql.h"
 #include "systemd.h"
 #include "call.h"
+#include "crash.h"
 #include "func.h"
 #include "sequence.h"
 #include "sql_stmt_cache.h"
@@ -1213,6 +1214,23 @@ box_set_prepared_stmt_cache_size(void)
 	return 0;
 }
 
+int
+box_set_crash(void)
+{
+	const char *addr = cfg_gets("feedback_host");
+	bool daemon_on = cfg_getb("feedback_enabled");
+	bool crashinfo_on = cfg_getb("feedback_crashinfo");
+
+	if (addr != NULL && strlen(addr) >= CRASH_FEEDBACK_HOST_MAX) {
+		diag_set(ClientError, ER_CFG, "feedback_host",
+			  "the address is too long");
+		return -1;
+	}
+	/* Crash reports are sent only when both options are on. */
+	crash_cfg(addr, daemon_on && crashinfo_on);
+	return 0;
+}
+
 /* }}} configuration bindings */
 
 /**
diff --git a/src/box/box.h b/src/box/box.h
index b47a220b7..69fa096a1 100644
--- a/src/box/box.h
+++ b/src/box/box.h
@@ -257,6 +257,7 @@ void box_set_replication_sync_timeout(void);
 void box_set_replication_skip_conflict(void);
 void box_set_replication_anon(void);
 void box_set_net_msg_max(void);
+int box_set_crash(void);
 
 int
 box_set_prepared_stmt_cache_size(void);
diff --git a/src/box/lua/cfg.cc b/src/box/lua/cfg.cc
index 42805e602..2d3ccbf0e 100644
--- a/src/box/lua/cfg.cc
+++ b/src/box/lua/cfg.cc
@@ -375,6 +375,14 @@ lbox_cfg_set_replication_skip_conflict(struct lua_State *L)
 	return 0;
 }
 
+static int
+lbox_cfg_set_crash(struct lua_State *L)
+{
+	if (box_set_crash() == 0)
+		return 0;
+	return luaT_error(L);
+}
+
 void
 box_lua_cfg_init(struct lua_State *L)
 {
@@ -411,6 +419,7 @@ box_lua_cfg_init(struct lua_State *L)
 		{"cfg_set_replication_anon", lbox_cfg_set_replication_anon},
 		{"cfg_set_net_msg_max", lbox_cfg_set_net_msg_max},
 		{"cfg_set_sql_cache_size", lbox_set_prepared_stmt_cache_size},
+		{"cfg_set_crash", lbox_cfg_set_crash},
 		{NULL, NULL}
 	};
 
diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua
index 770442052..7e41e0999 100644
--- a/src/box/lua/load_cfg.lua
+++ b/src/box/lua/load_cfg.lua
@@ -33,7 +33,8 @@ end
 
 local ifdef_feedback_set_params =
     private.feedback_daemon ~= nil and
-    private.feedback_daemon.set_feedback_params or nil
+    function(...) private.feedback_daemon.set_feedback_params(...)
+                  private.cfg_set_crash() end or nil
 
 -- all available options
 local default_cfg = {
@@ -99,6 +100,7 @@ local default_cfg = {
     replication_skip_conflict = false,
     replication_anon      = false,
     feedback_enabled      = true,
+    feedback_crashinfo    = true,
     feedback_host         = "https://feedback.tarantool.io",
     feedback_interval     = 3600,
     net_msg_max           = 768,
@@ -179,6 +181,7 @@ local template_cfg = {
     replication_skip_conflict = 'boolean',
     replication_anon      = 'boolean',
     feedback_enabled      = ifdef_feedback('boolean'),
+    feedback_crashinfo    = ifdef_feedback('boolean'),
     feedback_host         = ifdef_feedback('string'),
     feedback_interval     = ifdef_feedback('number'),
     net_msg_max           = 'number',
@@ -277,6 +280,7 @@ local dynamic_cfg = {
     checkpoint_wal_threshold = private.cfg_set_checkpoint_wal_threshold,
     worker_pool_threads     = private.cfg_set_worker_pool_threads,
     feedback_enabled        = ifdef_feedback_set_params,
+    feedback_crashinfo      = ifdef_feedback_set_params,
     feedback_host           = ifdef_feedback_set_params,
     feedback_interval       = ifdef_feedback_set_params,
     -- do nothing, affects new replicas, which query this value on start
diff --git a/src/lib/core/CMakeLists.txt b/src/lib/core/CMakeLists.txt
index 30cf0dd15..358e98ea7 100644
--- a/src/lib/core/CMakeLists.txt
+++ b/src/lib/core/CMakeLists.txt
@@ -40,7 +40,7 @@ endif()
 
 add_library(core STATIC ${core_sources})
 
-target_link_libraries(core salad small uri decNumber bit ${LIBEV_LIBRARIES}
+target_link_libraries(core salad small uri decNumber bit misc ${LIBEV_LIBRARIES}
                       ${LIBEIO_LIBRARIES} ${LIBCORO_LIBRARIES}
                       ${MSGPUCK_LIBRARIES} ${ICU_LIBRARIES})
 
diff --git a/src/lib/core/crash.c b/src/lib/core/crash.c
index 3929463f3..19d3d49ea 100644
--- a/src/lib/core/crash.c
+++ b/src/lib/core/crash.c
@@ -7,8 +7,13 @@
 #include <string.h>
 #include <signal.h>
 #include <stdint.h>
+#include <limits.h>
 #include <time.h>
+#include <sys/types.h>
+#include <sys/utsname.h>
 
+#include "third_party/base64.h"
+#include "small/static.h"
 #include "trivia/util.h"
 
 #include "backtrace.h"
@@ -16,9 +21,16 @@
 #include "say.h"
 
 #define pr_fmt(fmt)		"crash: " fmt
+#define pr_debug(fmt, ...)	say_debug(pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_info(fmt, ...)	say_info(pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_err(fmt, ...)	say_error(pr_fmt(fmt), ##__VA_ARGS__)
 #define pr_syserr(fmt, ...)	say_syserror(pr_fmt(fmt), ##__VA_ARGS__)
+#define pr_crit(fmt, ...)	fprintf(stderr, pr_fmt(fmt) "\n", ##__VA_ARGS__)
 #define pr_panic(fmt, ...)	panic(pr_fmt(fmt), ##__VA_ARGS__)
 
+/** Use strlcpy with destination as an array */
+#define strlcpy_a(dst, src) strlcpy(dst, src, sizeof(dst))
+
 #ifdef TARGET_OS_LINUX
 #ifndef __x86_64__
 # error "Non x86-64 architectures are not supported"
@@ -71,6 +83,10 @@ static struct crash_info {
 	 */
 	struct crash_greg greg;
 #endif
+	/**
+	 * Timestamp in nanoseconds (realtime).
+	 */
+	uint64_t timestamp_rt;
 	/**
 	 * Faulting address.
 	 */
@@ -92,6 +108,72 @@ static struct crash_info {
 #endif
 } crash_info;
 
+static char tarantool_path[PATH_MAX];
+static char feedback_host[CRASH_FEEDBACK_HOST_MAX];
+static bool send_crashinfo = false;
+
+static inline uint64_t
+timespec_to_ns(struct timespec *ts)
+{
+	return (uint64_t)ts->tv_nsec + (uint64_t)ts->tv_sec * 1000000000;
+}
+
+/** Format a realtime timestamp in nanoseconds as local time into buf. */
+static char *
+ns_to_localtime(uint64_t timestamp, char *buf, ssize_t len)
+{
+	time_t sec = timestamp / 1000000000;
+	struct tm tm;
+
+	/*
+	 * Same format as the say_x logger but with plain seconds.
+	 * strftime() returns 0 and leaves the buffer undefined when
+	 * the result does not fit, so hand back an empty string then.
+	 */
+	if (len <= 0)
+		return buf;
+	if (localtime_r(&sec, &tm) == NULL ||
+	    strftime(buf, len, "%F %T %Z", &tm) == 0)
+		buf[0] = '\0';
+	return buf;
+}
+
+void
+crash_init(const char *tarantool_bin)
+{
+	size_t need = strlcpy_a(tarantool_path, tarantool_bin);
+	if (need >= sizeof(tarantool_path))
+		pr_panic("executable path is trimmed");
+}
+
+void
+crash_cfg(const char *host, bool is_enabled)
+{
+	bool want_send = host != NULL && is_enabled;
+
+	if (!want_send && send_crashinfo) {
+		pr_debug("disable sending crashinfo feedback");
+		send_crashinfo = false;
+		feedback_host[0] = '\0';
+	}
+	if (!want_send)
+		return;
+
+	/*
+	 * Refresh the stored address only when it differs; the
+	 * caller must have rejected addresses that do not fit.
+	 */
+	if (strcmp(feedback_host, host) != 0) {
+		strlcpy_a(feedback_host, host);
+		assert(strlen(feedback_host) == strlen(host));
+	}
+
+	if (send_crashinfo)
+		return;
+	pr_debug("enable sending crashinfo feedback");
+	send_crashinfo = true;
+}
+
 /**
  * The routine is called inside crash signal handler so
  * be careful to not cause additional signals inside.
@@ -100,6 +182,12 @@ static struct crash_info *
 crash_collect(int signo, siginfo_t *siginfo, void *ucontext)
 {
 	struct crash_info *cinfo = &crash_info;
+	struct timespec ts;
+
+	if (clock_gettime(CLOCK_REALTIME, &ts) == 0)
+		cinfo->timestamp_rt = timespec_to_ns(&ts);
+	else
+		cinfo->timestamp_rt = 0;
 
 	cinfo->signo = signo;
 	cinfo->sicode = siginfo->si_code;
@@ -130,6 +218,224 @@ crash_collect(int signo, siginfo_t *siginfo, void *ucontext)
 	return cinfo;
 }
 
+/**
+ * Flag the environment as being in crashinfo handling so that a
+ * crash of the reporting child itself is not reported again.
+ * Returns 0 on success, -1 when the flag is already present or
+ * cannot be set.
+ */
+static int
+crash_mark_env_mode(void)
+{
+	static const char marker[] = "TT_CRASHINFO_MODE";
+
+	if (getenv(marker) != NULL) {
+		pr_crit("recursive failure detected");
+		return -1;
+	}
+	if (setenv(marker, "y", 0) == 0)
+		return 0;
+
+	pr_crit("unable to setup %s", marker);
+	return -1;
+}
+
+/**
+ * Zap symbols which would break the report: JSON reserved ones,
+ * control characters and the single quote which terminates the
+ * Lua string literal the report is embedded into.
+ */
+static char *
+json_zap(char *s)
+{
+	if (s == NULL)
+		return NULL;
+
+	/*
+	 * Actually we should escape them but for
+	 * now lets just zap without changing source
+	 * size. Walk up to the terminator instead of
+	 * calling strlen() on every iteration.
+	 */
+	for (char *c = s; *c != '\0'; c++) {
+		if ((unsigned char)*c < 0x20 || *c == '\"' ||
+		    *c == '\'' || *c == '\\')
+			*c = ' ';
+	}
+	return s;
+}
+
+/**
+ * Build a Lua one-liner that POSTs the crash report as JSON to
+ * feedback_host and run it in a separate Tarantool process.
+ */
+static void
+crash_report_feedback_daemon(struct crash_info *cinfo)
+{
+	if (crash_mark_env_mode() != 0)
+		return;
+
+	/*
+	 * Update to a new number if the format get changed.
+	 */
+	const int crashinfo_version = 1;
+
+	char *p = static_alloc(SMALL_STATIC_SIZE);
+	char *tail = &p[SMALL_STATIC_SIZE];
+	char *e = &p[SMALL_STATIC_SIZE];
+	char *head = p;
+
+	/*
+	 * Note that while we encode the report we
+	 * intensively use a tail of the allocated
+	 * buffer as a temporary store.
+	 */
+
+#define snprintf_safe(__end, __fmt, ...)			\
+	do {							\
+		size_t size = (char *)__end - p;		\
+		p += snprintf(p, size, __fmt, ##__VA_ARGS__);	\
+		if (p >= (char *)__end)				\
+			goto out;				\
+	} while (0)
+
+	/*
+	 * Lets reuse tail of the buffer as a temp space.
+	 */
+	struct utsname *uname_ptr = (void *)&tail[-sizeof(struct utsname)];
+	if (p >= (char *)uname_ptr)
+		goto out;
+
+	if (uname(uname_ptr) != 0) {
+		pr_syserr("uname call failed, ignore");
+		memset(uname_ptr, 0, sizeof(struct utsname));
+	}
+
+	/*
+	 * Start filling the script. The "data" key value is
+	 * filled as a separate code block for easier
+	 * modifications in future.
+	 */
+	snprintf_safe(uname_ptr,
+		      "require(\'http.client\').post(\'%s\',"
+		      "'{\"crashdump\":{\"version\":\"%d\","
+		      "\"data\":", feedback_host,
+		      crashinfo_version);
+
+	/* The "data" key value */
+	snprintf_safe(uname_ptr, "{");
+	snprintf_safe(uname_ptr, "\"uname\":{");
+	snprintf_safe(uname_ptr, "\"sysname\":\"%s\",",
+		      json_zap(uname_ptr->sysname));
+	/*
+	 * nodename might contain a sensitive information, skip.
+	 */
+	snprintf_safe(uname_ptr, "\"release\":\"%s\",",
+		      json_zap(uname_ptr->release));
+	snprintf_safe(uname_ptr, "\"version\":\"%s\",",
+		      json_zap(uname_ptr->version));
+	snprintf_safe(uname_ptr, "\"machine\":\"%s\"",
+		      json_zap(uname_ptr->machine));
+	snprintf_safe(uname_ptr, "},");
+
+	snprintf_safe(e, "\"build\":{");
+	snprintf_safe(e, "\"version\":\"%s\",", PACKAGE_VERSION);
+	snprintf_safe(e, "\"cmake_type\":\"%s\"", BUILD_INFO);
+	snprintf_safe(e, "},");
+
+	snprintf_safe(e, "\"signal\":{");
+	snprintf_safe(e, "\"signo\":%d,", cinfo->signo);
+	snprintf_safe(e, "\"si_code\":%d,", cinfo->sicode);
+	if (cinfo->signo == SIGSEGV) {
+		if (cinfo->sicode == SEGV_MAPERR) {
+			snprintf_safe(e, "\"si_code_str\":\"%s\",",
+				      "SEGV_MAPERR");
+		} else if (cinfo->sicode == SEGV_ACCERR) {
+			snprintf_safe(e, "\"si_code_str\":\"%s\",",
+				      "SEGV_ACCERR");
+		}
+		snprintf_safe(e, "\"si_addr\":\"0x%llx\",",
+			      (long long)cinfo->siaddr);
+	}
+
+#ifdef ENABLE_BACKTRACE
+	/*
+	 * The backtrace is base64 encoded since it has symbols
+	 * not suitable for json (newlines etc). One extra byte
+	 * is reserved so the terminator stays inside the buffer.
+	 */
+	size_t bt_len = strlen(cinfo->backtrace_buf);
+	size_t bt_elen = base64_bufsize(bt_len, BASE64_NOWRAP);
+	char *bt_base64 = tail - bt_elen - 1;
+	if (p >= bt_base64)
+		goto out;
+	bt_elen = base64_encode(cinfo->backtrace_buf, bt_len,
+				bt_base64, bt_elen, BASE64_NOWRAP);
+	bt_base64[bt_elen] = '\0';
+	snprintf_safe(bt_base64, "\"backtrace\":\"%s\",", bt_base64);
+#endif
+
+	/* 64 bytes should be enough for longest localtime */
+	const int ts_size = 64;
+	char *timestamp_rt_str = &tail[-ts_size];
+	if (p >= timestamp_rt_str)
+		goto out;
+	ns_to_localtime(cinfo->timestamp_rt, timestamp_rt_str, ts_size);
+	snprintf_safe(timestamp_rt_str, "\"timestamp\":\"%s\"",
+		      json_zap(timestamp_rt_str));
+	snprintf_safe(timestamp_rt_str, "}");
+	snprintf_safe(timestamp_rt_str, "}");
+
+	/*
+	 * Finalize the "data" key and the script.
+	 *
+	 * The timeout is choosen to be 1 second as
+	 * main feedback daemon uses.
+	 */
+	snprintf_safe(e, "}}',{timeout=1});os.exit(1);");
+
+	pr_debug("crashinfo script: %s", head);
+
+	char *exec_argv[4] = {
+		[0] = tarantool_path,
+		[1] = "-e",
+		[2] = head,
+		[3] = NULL,
+	};
+
+	/*
+	 * Can't use fork here because libev has own
+	 * at_fork helpers with mutex where we might
+	 * stuck (see popen code).
+	 */
+	pid_t pid = vfork();
+	if (pid == 0) {
+		/*
+		 * Environment is needed for recursive
+		 * crash protection. See crash_mark_env_mode
+		 * above.
+		 */
+		extern char **environ;
+		/*
+		 * The script must exit at the end but there
+		 * is no simple way to make sure from inside
+		 * of a signal crash handler. So just hope it
+		 * is running fine.
+		 */
+		execve(exec_argv[0], exec_argv, environ);
+		pr_crit("exec(%s,[%s,%s,%s]) failed",
+			exec_argv[0], exec_argv[0],
+			exec_argv[1], exec_argv[2]);
+		_exit(1);
+	} else if (pid < 0) {
+		pr_crit("unable to vfork (errno %d)", errno);
+	}
+
+	return;
+out:
+	pr_crit("unable to prepare a crash report");
+}
+
 /**
  * Report crash information to the stderr
  * (usually a current console).
@@ -236,6 +542,8 @@ crash_signal_cb(int signo, siginfo_t *siginfo, void *context)
 		in_cb = 1;
 		cinfo = crash_collect(signo, siginfo, context);
 		crash_report_stderr(cinfo);
+		if (send_crashinfo)
+			crash_report_feedback_daemon(cinfo);
 	} else {
 		/* Got a signal while running the handler. */
 		fprintf(stderr, "Fatal %d while backtracing", signo);
diff --git a/src/lib/core/crash.h b/src/lib/core/crash.h
index cd1db585e..195aef10b 100644
--- a/src/lib/core/crash.h
+++ b/src/lib/core/crash.h
@@ -9,6 +9,24 @@
 extern "C" {
 #endif /* defined(__cplusplus) */
 
+/**
+ * PATH_MAX is too big and 2K is recommended
+ * limit for web address.
+ */
+#define CRASH_FEEDBACK_HOST_MAX 2048
+
+/**
+ * Initialize crash subsystem.
+ */
+void
+crash_init(const char *tarantool_bin);
+
+/**
+ * Configure crash parameters.
+ */
+void
+crash_cfg(const char *host, bool is_enabled);
+
 /**
  * Initialize crash signal handlers.
  */
diff --git a/src/main.cc b/src/main.cc
index 391e0f878..2fce81bb3 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -687,6 +687,7 @@ main(int argc, char **argv)
 		title_set_script_name(argv[0]);
 	}
 
+	crash_init(tarantool_bin);
 	export_syms();
 
 	random_init();
diff --git a/test/app-tap/init_script.result b/test/app-tap/init_script.result
index 72aa67db2..16c5b01d2 100644
--- a/test/app-tap/init_script.result
+++ b/test/app-tap/init_script.result
@@ -10,6 +10,7 @@ checkpoint_wal_threshold:1e+18
 coredump:false
 election_mode:off
 election_timeout:5
+feedback_crashinfo:true
 feedback_enabled:true
 feedback_host:https://feedback.tarantool.io
 feedback_interval:3600
diff --git a/test/box/admin.result b/test/box/admin.result
index e05440f66..05debe673 100644
--- a/test/box/admin.result
+++ b/test/box/admin.result
@@ -41,6 +41,8 @@ cfg_filter(box.cfg)
     - off
   - - election_timeout
     - 5
+  - - feedback_crashinfo
+    - true
   - - feedback_enabled
     - true
   - - feedback_host
diff --git a/test/box/cfg.result b/test/box/cfg.result
index 10fef006c..22a720c2c 100644
--- a/test/box/cfg.result
+++ b/test/box/cfg.result
@@ -29,6 +29,8 @@ cfg_filter(box.cfg)
  |     - off
  |   - - election_timeout
  |     - 5
+ |   - - feedback_crashinfo
+ |     - true
  |   - - feedback_enabled
  |     - true
  |   - - feedback_host
@@ -142,6 +144,8 @@ cfg_filter(box.cfg)
  |     - off
  |   - - election_timeout
  |     - 5
+ |   - - feedback_crashinfo
+ |     - true
  |   - - feedback_enabled
  |     - true
  |   - - feedback_host
-- 
2.26.2



More information about the Tarantool-patches mailing list