[PATCH 2/2] Rate limit certain warning messages

Vladimir Davydov vdavydov.dev at gmail.com
Tue Dec 11 19:31:18 MSK 2018


There are a few warning messages that can easily flood the log, making
it more difficult to figure out what causes the problem. Those are
 - too long WAL write
 - waited for ... bytes of vinyl memory quota for too long
 - get/select(...) => ... took too long
 - readahead limit is reached
 - net_msg_max limit is reached

Actually, it's pointless to print each and every of them, because all
messages of the same kind are similar and don't convey any additional
information.

So this patch limits the rate at which those messages may be printed.
To achieve that, it introduces say_ratelimited() helper, which works
exactly like say() except it does nothing if too many messages of
the same kind have already been printed in the last few seconds.
The implementation is trivial - say_ratelimited() defines a static
ratelimit state variable at its call site (it's a macro) and checks it
before logging anything. If the ratelimit state says that an event may
be emitted, it will log the message, otherwise it will skip it and
eventually print the total number of skipped messages instead.

The rate limit is set to 10 messages per 5 seconds for each kind of
a warning message enumerated above.

Here's how it looks in the log:

2018-12-11 18:07:21.830 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:26.851 [30404] iproto iproto.cc:524 W> 9635 messages suppressed

Closes #2218
---
 src/box/iproto.cc          | 10 ++++++----
 src/box/txn.c              |  5 +++--
 src/box/vy_point_lookup.c  |  7 ++++---
 src/box/vy_quota.c         |  5 +++--
 src/box/vy_read_iterator.c |  9 +++++----
 src/say.h                  | 38 ++++++++++++++++++++++++++++++++++++++
 6 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/src/box/iproto.cc b/src/box/iproto.cc
index 03066dd5..00896628 100644
--- a/src/box/iproto.cc
+++ b/src/box/iproto.cc
@@ -539,8 +539,9 @@ iproto_connection_is_idle(struct iproto_connection *con)
 static inline void
 iproto_connection_stop_readahead_limit(struct iproto_connection *con)
 {
-	say_warn("stopping input on connection %s, readahead limit is reached",
-		 sio_socketname(con->input.fd));
+	say_warn_ratelimited("stopping input on connection %s, "
+			     "readahead limit is reached",
+			     sio_socketname(con->input.fd));
 	assert(rlist_empty(&con->in_stop_list));
 	ev_io_stop(con->loop, &con->input);
 }
@@ -550,8 +551,9 @@ iproto_connection_stop_msg_max_limit(struct iproto_connection *con)
 {
 	assert(rlist_empty(&con->in_stop_list));
 
-	say_warn("stopping input on connection %s, net_msg_max limit is reached",
-		 sio_socketname(con->input.fd));
+	say_warn_ratelimited("stopping input on connection %s, "
+			     "net_msg_max limit is reached",
+			     sio_socketname(con->input.fd));
 	ev_io_stop(con->loop, &con->input);
 	/*
 	 * Important to add to tail and fetch from head to ensure
diff --git a/src/box/txn.c b/src/box/txn.c
index e47853a8..7f4e85b4 100644
--- a/src/box/txn.c
+++ b/src/box/txn.c
@@ -296,8 +296,9 @@ txn_write_to_wal(struct txn *txn)
 		diag_set(ClientError, ER_WAL_IO);
 		diag_log();
 	} else if (stop - start > too_long_threshold) {
-		say_warn("too long WAL write: %d rows at LSN %lld: %.3f sec",
-			 txn->n_rows, res - txn->n_rows + 1, stop - start);
+		say_warn_ratelimited("too long WAL write: %d rows at "
+				     "LSN %lld: %.3f sec", txn->n_rows,
+				     res - txn->n_rows + 1, stop - start);
 	}
 	/*
 	 * Use vclock_sum() from WAL writer as transaction signature.
diff --git a/src/box/vy_point_lookup.c b/src/box/vy_point_lookup.c
index 7b704b84..ddbc2d46 100644
--- a/src/box/vy_point_lookup.c
+++ b/src/box/vy_point_lookup.c
@@ -287,9 +287,10 @@ done:
 	latency_collect(&lsm->stat.latency, latency);
 
 	if (latency > lsm->env->too_long_threshold) {
-		say_warn("%s: get(%s) => %s took too long: %.3f sec",
-			 vy_lsm_name(lsm), tuple_str(key),
-			 vy_stmt_str(*ret), latency);
+		say_warn_ratelimited("%s: get(%s) => %s "
+				     "took too long: %.3f sec",
+				     vy_lsm_name(lsm), tuple_str(key),
+				     vy_stmt_str(*ret), latency);
 	}
 	return 0;
 }
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 161ab4c7..07cd5856 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -241,8 +241,9 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
 
 	double wait_time = ev_monotonic_now(loop()) - wait_start;
 	if (wait_time > q->too_long_threshold) {
-		say_warn("waited for %zu bytes of vinyl memory quota "
-			 "for too long: %.3f sec", size, wait_time);
+		say_warn_ratelimited("waited for %zu bytes of vinyl memory "
+				     "quota for too long: %.3f sec", size,
+				     wait_time);
 	}
 
 	vy_quota_do_use(q, size);
diff --git a/src/box/vy_read_iterator.c b/src/box/vy_read_iterator.c
index 954fc0df..06860cbc 100644
--- a/src/box/vy_read_iterator.c
+++ b/src/box/vy_read_iterator.c
@@ -887,10 +887,11 @@ next_key:
 	latency_collect(&lsm->stat.latency, latency);
 
 	if (latency > lsm->env->too_long_threshold) {
-		say_warn("%s: select(%s, %s) => %s took too long: %.3f sec",
-			 vy_lsm_name(lsm), tuple_str(itr->key),
-			 iterator_type_strs[itr->iterator_type],
-			 vy_stmt_str(stmt), latency);
+		say_warn_ratelimited("%s: select(%s, %s) => %s "
+				     "took too long: %.3f sec",
+				     vy_lsm_name(lsm), tuple_str(itr->key),
+				     iterator_type_strs[itr->iterator_type],
+				     vy_stmt_str(stmt), latency);
 	}
 
 	*result = stmt;
diff --git a/src/say.h b/src/say.h
index 3cff02e9..26db9513 100644
--- a/src/say.h
+++ b/src/say.h
@@ -39,6 +39,7 @@
 #include <tarantool_ev.h>
 #include "small/rlist.h"
 #include "fiber_cond.h"
+#include "ratelimit.h"
 
 #if defined(__cplusplus)
 extern "C" {
@@ -327,6 +328,43 @@ CFORMAT(printf, 5, 0) extern sayfunc_t _say;
 #define panic(...)			panic_status(EXIT_FAILURE, __VA_ARGS__)
 #define panic_syserror(...)		({ say(S_FATAL, strerror(errno), __VA_ARGS__); exit(EXIT_FAILURE); })
 
+enum {
+	/* 10 messages per 5 seconds. */
+	SAY_RATELIMIT_INTERVAL = 5,
+	SAY_RATELIMIT_BURST = 10,
+};
+
+/**
+ * Wrapper around ratelimit_check() that prints a warning if some
+ * messages are suppressed. It uses ev_monotonic_now() as a time
+ * source.
+ */
+#define say_ratelimit_check(rl) ({					\
+	int suppressed = 0;						\
+	bool ret = ratelimit_check((rl), ev_monotonic_now(loop()),	\
+				   &suppressed);			\
+	if (suppressed > 0)						\
+		say_warn("%d messages suppressed", suppressed);		\
+	ret;								\
+})
+
+/**
+ * Same as say(), but rate limited. If this function is called more
+ * often than SAY_RATELIMIT_BURST times per SAY_RATELIMIT_INTERVAL
+ * seconds, extra messages are suppressed and a warning is printed
+ * to the log.
+ */
+#define say_ratelimited(level, error, format, ...) ({			\
+	static struct ratelimit rl =					\
+		RATELIMIT_INITIALIZER(SAY_RATELIMIT_INTERVAL,		\
+				      SAY_RATELIMIT_BURST);		\
+	if (say_ratelimit_check(&rl))					\
+		say(level, error, format, ##__VA_ARGS__);		\
+})
+
+#define say_warn_ratelimited(format, ...) \
+	say_ratelimited(S_WARN, NULL, format, ##__VA_ARGS__)
+
 /**
  * Format and print a message to Tarantool log file.
  *
-- 
2.11.0




More information about the Tarantool-patches mailing list