* [PATCH 1/2] Introduce basic rate limiter
2018-12-11 16:31 [PATCH 0/2] Rate limit certain warining messages Vladimir Davydov
@ 2018-12-11 16:31 ` Vladimir Davydov
2018-12-11 16:31 ` [PATCH 2/2] Rate limit certain warning messages Vladimir Davydov
2018-12-11 21:48 ` [PATCH 0/2] Rate limit certain warining messages Konstantin Osipov
2 siblings, 0 replies; 5+ messages in thread
From: Vladimir Davydov @ 2018-12-11 16:31 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
We will use it to limit the rate of log messages.
Needed for #2218
---
src/ratelimit.h | 104 +++++++++++++++++++++++++++++++++++++++++++++
test/unit/CMakeLists.txt | 2 +
test/unit/ratelimit.c | 78 ++++++++++++++++++++++++++++++++++
test/unit/ratelimit.result | 13 ++++++
4 files changed, 197 insertions(+)
create mode 100644 src/ratelimit.h
create mode 100644 test/unit/ratelimit.c
create mode 100644 test/unit/ratelimit.result
diff --git a/src/ratelimit.h b/src/ratelimit.h
new file mode 100644
index 00000000..2bf0bac7
--- /dev/null
+++ b/src/ratelimit.h
@@ -0,0 +1,104 @@
+#ifndef TARANTOOL_RATELIMIT_H_INCLUDED
+#define TARANTOOL_RATELIMIT_H_INCLUDED
+/*
+ * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the
+ * following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <stdbool.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/**
+ * Rate limit state.
+ */
+struct ratelimit {
+ /** Time interval used for rate limiting, in seconds. */
+ double interval;
+ /** Max number of events per interval. */
+ int burst;
+ /** Number of events emitted in the current interval. */
+ int emitted;
+ /** Number of events suppressed in the current interval. */
+ int suppressed;
+ /** Start time of the current interval. */
+ double start;
+};
+
+/**
+ * Rate limit state initializer.
+ */
+#define RATELIMIT_INITIALIZER(interval_init, burst_init) \
+ { (interval_init), (burst_init), 0, 0, 0 }
+
+/**
+ * Initialize a rate limit state.
+ */
+static inline void
+ratelimit_create(struct ratelimit *rl, double interval, int burst)
+{
+ rl->interval = interval;
+ rl->burst = burst;
+ rl->emitted = 0;
+ rl->suppressed = 0;
+ rl->start = 0;
+}
+
+/**
+ * Check if an event may be emitted. Returns true on success.
+ * @now is the current time.
+ *
+ * If the current interval is over, the total number of events
+ * suppressed in it is added to @suppressed.
+ */
+static inline bool
+ratelimit_check(struct ratelimit *rl, double now, int *suppressed)
+{
+ if (now > rl->start + rl->interval) {
+ /* Current interval is over, reset counters. */
+ *suppressed += rl->suppressed;
+ rl->emitted = 0;
+ rl->suppressed = 0;
+ rl->start = now;
+ }
+ if (rl->emitted < rl->burst) {
+ rl->emitted++;
+ return true;
+ } else {
+ rl->suppressed++;
+ return false;
+ }
+}
+
+#if defined(__cplusplus)
+} /* extern "C" */
+#endif
+
+#endif /* TARANTOOL_RATELIMIT_H_INCLUDED */
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 7cba5cbb..0025d361 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -136,6 +136,8 @@ add_executable(rmean.test rmean.cc)
target_link_libraries(rmean.test stat unit)
add_executable(histogram.test histogram.c)
target_link_libraries(histogram.test stat unit)
+add_executable(ratelimit.test ratelimit.c)
+target_link_libraries(ratelimit.test unit)
add_executable(say.test say.c)
target_link_libraries(say.test core unit)
diff --git a/test/unit/ratelimit.c b/test/unit/ratelimit.c
new file mode 100644
index 00000000..3dfbe485
--- /dev/null
+++ b/test/unit/ratelimit.c
@@ -0,0 +1,78 @@
+#include <stdlib.h>
+#include <time.h>
+
+#include "unit.h"
+#include "ratelimit.h"
+
+#define check(expected_emitted, expected_suppressed) do { \
+ is(emitted, expected_emitted, "emitted %d expected %d", \
+ emitted, expected_emitted); \
+ is(suppressed, expected_suppressed, "suppressed %d expected %d",\
+ suppressed, expected_suppressed); \
+} while (0)
+
+int
+main()
+{
+ header();
+ plan(10);
+
+ srand(time(NULL));
+ double now = rand();
+
+ int burst = 10;
+ double interval = 5;
+ int count, emitted, suppressed;
+ struct ratelimit rl = RATELIMIT_INITIALIZER(interval, burst);
+ now += interval;
+
+ count = burst;
+ emitted = suppressed = 0;
+ for (int i = 0; i < count; i++) {
+ if (ratelimit_check(&rl, now, &suppressed))
+ emitted++;
+ now += interval / count / 2;
+ }
+ check(count, 0);
+
+ emitted = suppressed = 0;
+ for (int i = 0; i < count; i++) {
+ if (ratelimit_check(&rl, now, &suppressed))
+ emitted++;
+ now += interval / count / 2;
+ }
+ check(0, 0);
+
+ now += 1;
+ emitted = suppressed = 0;
+ if (ratelimit_check(&rl, now, &suppressed))
+ emitted++;
+ check(1, count);
+
+ now += interval * 2;
+ emitted = suppressed = 0;
+ if (ratelimit_check(&rl, now, &suppressed))
+ emitted++;
+ check(1, 0);
+
+ interval = 5;
+ burst = 100;
+ ratelimit_create(&rl, interval, burst);
+
+ int interval_count = 10;
+ count = burst * interval_count * 4;
+ emitted = suppressed = 0;
+ for (int i = 0; i < count; i++) {
+ if (ratelimit_check(&rl, now, &suppressed))
+ emitted++;
+ now += interval_count * interval / count;
+ }
+ now += interval;
+ ratelimit_check(&rl, now, &suppressed);
+ check(interval_count * burst, count - interval_count * burst);
+
+ check_plan();
+ footer();
+
+ return 0;
+}
diff --git a/test/unit/ratelimit.result b/test/unit/ratelimit.result
new file mode 100644
index 00000000..8a755c64
--- /dev/null
+++ b/test/unit/ratelimit.result
@@ -0,0 +1,13 @@
+ *** main ***
+1..10
+ok 1 - emitted 10 expected 10
+ok 2 - suppressed 0 expected 0
+ok 3 - emitted 0 expected 0
+ok 4 - suppressed 0 expected 0
+ok 5 - emitted 1 expected 1
+ok 6 - suppressed 10 expected 10
+ok 7 - emitted 1 expected 1
+ok 8 - suppressed 0 expected 0
+ok 9 - emitted 1000 expected 1000
+ok 10 - suppressed 3000 expected 3000
+ *** main: done ***
--
2.11.0
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH 2/2] Rate limit certain warning messages
2018-12-11 16:31 [PATCH 0/2] Rate limit certain warining messages Vladimir Davydov
2018-12-11 16:31 ` [PATCH 1/2] Introduce basic rate limiter Vladimir Davydov
@ 2018-12-11 16:31 ` Vladimir Davydov
2018-12-11 21:48 ` [PATCH 0/2] Rate limit certain warining messages Konstantin Osipov
2 siblings, 0 replies; 5+ messages in thread
From: Vladimir Davydov @ 2018-12-11 16:31 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
There are a few warning messages that can easily flood the log, making
it more difficult to figure out what causes the problem. Those are
- too long WAL write
- waited for ... bytes of vinyl memory quota for too long
- get/select(...) => ... took too long
- readahead limit is reached
- net_msg_max limit is reached
Actually, it's pointless to print each and every of them, because all
messages of the same kind are similar and don't convey any additional
information.
So this patch limits the rate at which those messages may be printed.
To achieve that, it introduces say_ratelimited() helper, which works
exactly like say() except it does nothing if too many messages of
the same kind have already been printed in the last few seconds.
The implementation is trivial - say_ratelimited() defines a static
ratelimit state variable at its call site (it's a macro) and checks it
before logging anything. If the ratelimit state says that an event may
be emitted, it will log the message, otherwise it will skip it and
eventually print the total number of skipped messages instead.
The rate limit is set to 10 messages per 5 seconds for each kind of
a warning message enumerated above.
Here's how it looks in the log:
2018-12-11 18:07:21.830 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.831 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:21.832 [30404] iproto iproto.cc:524 W> stopping input on connection fd 15, aka 127.0.0.1:12345, peer of 127.0.0.1:59212, readahead limit is reached
2018-12-11 18:07:26.851 [30404] iproto iproto.cc:524 W> 9635 messages suppressed
Closes #2218
---
src/box/iproto.cc | 10 ++++++----
src/box/txn.c | 5 +++--
src/box/vy_point_lookup.c | 7 ++++---
src/box/vy_quota.c | 5 +++--
src/box/vy_read_iterator.c | 9 +++++----
src/say.h | 38 ++++++++++++++++++++++++++++++++++++++
6 files changed, 59 insertions(+), 15 deletions(-)
diff --git a/src/box/iproto.cc b/src/box/iproto.cc
index 03066dd5..00896628 100644
--- a/src/box/iproto.cc
+++ b/src/box/iproto.cc
@@ -539,8 +539,9 @@ iproto_connection_is_idle(struct iproto_connection *con)
static inline void
iproto_connection_stop_readahead_limit(struct iproto_connection *con)
{
- say_warn("stopping input on connection %s, readahead limit is reached",
- sio_socketname(con->input.fd));
+ say_warn_ratelimited("stopping input on connection %s, "
+ "readahead limit is reached",
+ sio_socketname(con->input.fd));
assert(rlist_empty(&con->in_stop_list));
ev_io_stop(con->loop, &con->input);
}
@@ -550,8 +551,9 @@ iproto_connection_stop_msg_max_limit(struct iproto_connection *con)
{
assert(rlist_empty(&con->in_stop_list));
- say_warn("stopping input on connection %s, net_msg_max limit is reached",
- sio_socketname(con->input.fd));
+ say_warn_ratelimited("stopping input on connection %s, "
+ "net_msg_max limit is reached",
+ sio_socketname(con->input.fd));
ev_io_stop(con->loop, &con->input);
/*
* Important to add to tail and fetch from head to ensure
diff --git a/src/box/txn.c b/src/box/txn.c
index e47853a8..7f4e85b4 100644
--- a/src/box/txn.c
+++ b/src/box/txn.c
@@ -296,8 +296,9 @@ txn_write_to_wal(struct txn *txn)
diag_set(ClientError, ER_WAL_IO);
diag_log();
} else if (stop - start > too_long_threshold) {
- say_warn("too long WAL write: %d rows at LSN %lld: %.3f sec",
- txn->n_rows, res - txn->n_rows + 1, stop - start);
+ say_warn_ratelimited("too long WAL write: %d rows at "
+ "LSN %lld: %.3f sec", txn->n_rows,
+ res - txn->n_rows + 1, stop - start);
}
/*
* Use vclock_sum() from WAL writer as transaction signature.
diff --git a/src/box/vy_point_lookup.c b/src/box/vy_point_lookup.c
index 7b704b84..ddbc2d46 100644
--- a/src/box/vy_point_lookup.c
+++ b/src/box/vy_point_lookup.c
@@ -287,9 +287,10 @@ done:
latency_collect(&lsm->stat.latency, latency);
if (latency > lsm->env->too_long_threshold) {
- say_warn("%s: get(%s) => %s took too long: %.3f sec",
- vy_lsm_name(lsm), tuple_str(key),
- vy_stmt_str(*ret), latency);
+ say_warn_ratelimited("%s: get(%s) => %s "
+ "took too long: %.3f sec",
+ vy_lsm_name(lsm), tuple_str(key),
+ vy_stmt_str(*ret), latency);
}
return 0;
}
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 161ab4c7..07cd5856 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -241,8 +241,9 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
double wait_time = ev_monotonic_now(loop()) - wait_start;
if (wait_time > q->too_long_threshold) {
- say_warn("waited for %zu bytes of vinyl memory quota "
- "for too long: %.3f sec", size, wait_time);
+ say_warn_ratelimited("waited for %zu bytes of vinyl memory "
+ "quota for too long: %.3f sec", size,
+ wait_time);
}
vy_quota_do_use(q, size);
diff --git a/src/box/vy_read_iterator.c b/src/box/vy_read_iterator.c
index 954fc0df..06860cbc 100644
--- a/src/box/vy_read_iterator.c
+++ b/src/box/vy_read_iterator.c
@@ -887,10 +887,11 @@ next_key:
latency_collect(&lsm->stat.latency, latency);
if (latency > lsm->env->too_long_threshold) {
- say_warn("%s: select(%s, %s) => %s took too long: %.3f sec",
- vy_lsm_name(lsm), tuple_str(itr->key),
- iterator_type_strs[itr->iterator_type],
- vy_stmt_str(stmt), latency);
+ say_warn_ratelimited("%s: select(%s, %s) => %s "
+ "took too long: %.3f sec",
+ vy_lsm_name(lsm), tuple_str(itr->key),
+ iterator_type_strs[itr->iterator_type],
+ vy_stmt_str(stmt), latency);
}
*result = stmt;
diff --git a/src/say.h b/src/say.h
index 3cff02e9..26db9513 100644
--- a/src/say.h
+++ b/src/say.h
@@ -39,6 +39,7 @@
#include <tarantool_ev.h>
#include "small/rlist.h"
#include "fiber_cond.h"
+#include "ratelimit.h"
#if defined(__cplusplus)
extern "C" {
@@ -327,6 +328,43 @@ CFORMAT(printf, 5, 0) extern sayfunc_t _say;
#define panic(...) panic_status(EXIT_FAILURE, __VA_ARGS__)
#define panic_syserror(...) ({ say(S_FATAL, strerror(errno), __VA_ARGS__); exit(EXIT_FAILURE); })
+enum {
+ /* 10 messages per 5 seconds. */
+ SAY_RATELIMIT_INTERVAL = 5,
+ SAY_RATELIMIT_BURST = 10,
+};
+
+/**
+ * Wrapper around ratelimit_check() that prints a warning if some
+ * messages are suppressed. It uses ev_monotonic_now() as a time
+ * source.
+ */
+#define say_ratelimit_check(rl) ({ \
+ int suppressed = 0; \
+ bool ret = ratelimit_check((rl), ev_monotonic_now(loop()), \
+ &suppressed); \
+ if (suppressed > 0) \
+ say_warn("%d messages suppressed", suppressed); \
+ ret; \
+})
+
+/**
+ * Same as say(), but rate limited. If this function is called more
+ * often than SAY_RATELIMIT_BURST times per SAY_RATELIMIT_INTERVAL
+ * seconds, extra messages are suppressed and a warning is printed
+ * to the log.
+ */
+#define say_ratelimited(level, error, format, ...) ({ \
+ static struct ratelimit rl = \
+ RATELIMIT_INITIALIZER(SAY_RATELIMIT_INTERVAL, \
+ SAY_RATELIMIT_BURST); \
+ if (say_ratelimit_check(&rl)) \
+ say(level, error, format, ##__VA_ARGS__); \
+})
+
+#define say_warn_ratelimited(format, ...) \
+ say_ratelimited(S_WARN, NULL, format, ##__VA_ARGS__)
+
/**
* Format and print a message to Tarantool log file.
*
--
2.11.0
^ permalink raw reply [flat|nested] 5+ messages in thread