[PATCH v3 4/4] vinyl: throttle tx to ensure compaction keeps up with dumps

Vladimir Davydov vdavydov.dev at gmail.com
Tue Feb 12 20:26:56 MSK 2019


Every byte of data written to a vinyl database eventually gets compacted
with data written to the database earlier. The ratio of the amount of
data actually written to disk to the amount of data written to the
database is called write amplification. Write amplification depends on
the LSM tree configuration and the workload parameters and varies over a
wide range, from 2-3 to 10-20 or even higher in some extreme cases. If
the database engine doesn't manage to write this extra data, the LSM
tree shape will get distorted, resulting in increased read and space
amplification, which in turn slows down reads and wastes disk space.
That's why it's so important to ensure the database engine has enough
compaction power.
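
To give a sense of scale (the numbers below are purely illustrative,
not taken from a benchmark): if an application writes 1 GB of data
while dumps and compaction together write 5 GB to disk, then

    write_amplification = data written to disk / data written to db
                        = 5 GB / 1 GB = 5

so the engine has to find disk bandwidth for 4 GB of compaction output
on top of the 1 GB of dump output.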

One way to ensure that is to increase the number of compaction threads
via the box.cfg.vinyl_write_threads configuration knob, but one can't
increase it beyond the capacity of the server running the instance. So
the database engine must throttle writes if it detects that compaction
threads are struggling to keep up. This patch implements a very simple
algorithm to achieve that: it keeps track of recently observed write
amplification and data compaction speed, uses them to calculate the max
transaction rate that the database engine can handle while steadily
maintaining the current level of write amplification, and sets the rate
limit to 0.75 of that value so as to give the engine enough room to
increase write amplification if needed.
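
To make this concrete, the calculation performed by
vy_regulator_update_rate_limit() below boils down to the following
sketch (the helper name and parameters are illustrative and not part
of the patch):

    /*
     * Illustrative sketch only; the real logic lives in
     * vy_regulator_update_rate_limit().
     */
    static double
    estimate_tx_rate_limit(double dump_input, double compaction_time,
                           int compaction_threads)
    {
            /*
             * dump_input / compaction_time is the write rate a single
             * compaction thread can keep up with; scale it by the
             * number of compaction threads available.
             */
            double max_rate = compaction_threads * dump_input /
                              compaction_time;
            /* Keep 25% of disk bandwidth in reserve for compaction. */
            return 0.75 * max_rate;
    }

In the patch, dump_input and compaction_time are accumulated over the
most recent dumps and periodically halved so that the estimate tracks
workload changes.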

The algorithm is obviously pessimistic: it undervalues the transaction
rate the database can handle after write amplification has steadied. But
this is compensated for by its simplicity and stability - there shouldn't
be any abrupt drops or peaks in RPS due to its decisions. Besides, it
adapts fairly quickly to an increase in write amplification as the
database fills up. If the algorithm turns out to be too cautious and
undervalues the limit, this is easy to fix by simply increasing the
number of compaction threads - the rate limit will scale proportionately
if the system is underloaded.

The current value of the rate limit set by the algorithm is reported by
box.stat.vinyl() under the regulator.rate_limit section.

Thanks to @kostja for the great comment explaining the logic behind the
rate limiting algorithm.

Closes #3721
---
 src/box/vinyl.c        |   6 +++
 src/box/vy_quota.c     |  12 +++++
 src/box/vy_quota.h     |   6 +++
 src/box/vy_regulator.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++--
 src/box/vy_regulator.h |  27 ++++++++++
 5 files changed, 191 insertions(+), 3 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 22fe135b..947ca355 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -274,6 +274,8 @@ vy_info_append_regulator(struct vy_env *env, struct info_handler *h)
 	info_append_int(h, "write_rate", r->write_rate);
 	info_append_int(h, "dump_bandwidth", r->dump_bandwidth);
 	info_append_int(h, "dump_watermark", r->dump_watermark);
+	info_append_int(h, "rate_limit", vy_quota_get_rate_limit(r->quota,
+							VY_QUOTA_CONSUMER_TX));
 	info_table_end(h); /* regulator */
 }
 
@@ -532,6 +534,7 @@ vinyl_engine_reset_stat(struct engine *engine)
 	memset(&xm->stat, 0, sizeof(xm->stat));
 
 	vy_scheduler_reset_stat(&env->scheduler);
+	vy_regulator_reset_stat(&env->regulator);
 }
 
 /** }}} Introspection */
@@ -2480,6 +2483,9 @@ vy_env_dump_complete_cb(struct vy_scheduler *scheduler,
 	 */
 	vy_regulator_dump_complete(&env->regulator, mem_dumped, dump_duration);
 	vy_quota_release(quota, mem_dumped);
+
+	vy_regulator_update_rate_limit(&env->regulator, &scheduler->stat,
+				       scheduler->compaction_pool.size);
 }
 
 static struct vy_squash_queue *
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 075bfef1..035040a3 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -254,6 +254,18 @@ vy_quota_set_rate_limit(struct vy_quota *q, enum vy_quota_resource_type type,
 	vy_rate_limit_set(&q->rate_limit[type], rate);
 }
 
+size_t
+vy_quota_get_rate_limit(struct vy_quota *q, enum vy_quota_consumer_type type)
+{
+	size_t rate = SIZE_MAX;
+	for (int i = 0; i < vy_quota_resource_type_MAX; i++) {
+		struct vy_rate_limit *rl = &q->rate_limit[i];
+		if (vy_rate_limit_is_applicable(type, i))
+			rate = MIN(rate, rl->rate);
+	}
+	return rate;
+}
+
 void
 vy_quota_force_use(struct vy_quota *q, enum vy_quota_consumer_type type,
 		   size_t size)
diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h
index d90922b2..7ff98cc1 100644
--- a/src/box/vy_quota.h
+++ b/src/box/vy_quota.h
@@ -255,6 +255,12 @@ vy_quota_set_rate_limit(struct vy_quota *q, enum vy_quota_resource_type type,
 			size_t rate);
 
 /**
+ * Return the rate limit applied to a consumer of the given type.
+ */
+size_t
+vy_quota_get_rate_limit(struct vy_quota *q, enum vy_quota_consumer_type type);
+
+/**
  * Consume @size bytes of memory. In contrast to vy_quota_use()
  * this function does not throttle the caller.
  */
diff --git a/src/box/vy_regulator.c b/src/box/vy_regulator.c
index e14b01aa..4288753f 100644
--- a/src/box/vy_regulator.c
+++ b/src/box/vy_regulator.c
@@ -34,6 +34,7 @@
 #include <stdbool.h>
 #include <stddef.h>
 #include <stdint.h>
+#include <string.h>
 #include <tarantool_ev.h>
 
 #include "fiber.h"
@@ -42,6 +43,7 @@
 #include "trivia/util.h"
 
 #include "vy_quota.h"
+#include "vy_stat.h"
 
 /**
  * Regulator timer period, in seconds.
@@ -73,6 +75,14 @@ static const size_t VY_DUMP_BANDWIDTH_DEFAULT = 10 * 1024 * 1024;
  */
 static const size_t VY_DUMP_SIZE_ACCT_MIN = 1024 * 1024;
 
+/**
+ * Number of dumps to take into account for rate limit calculation.
+ * Shouldn't be too small to avoid uneven RPS. Shouldn't be too big
+ * either - otherwise the rate limit will adapt too slowly to workload
+ * changes. 100 feels like a good choice.
+ */
+static const int VY_RECENT_DUMP_COUNT = 100;
+
 static void
 vy_regulator_trigger_dump(struct vy_regulator *regulator)
 {
@@ -182,6 +192,7 @@ vy_regulator_create(struct vy_regulator *regulator, struct vy_quota *quota,
 		100 * MB, 200 * MB, 300 * MB, 400 * MB, 500 * MB, 600 * MB,
 		700 * MB, 800 * MB, 900 * MB,
 	};
+	memset(regulator, 0, sizeof(*regulator));
 	regulator->dump_bandwidth_hist = histogram_new(dump_bandwidth_buckets,
 					lengthof(dump_bandwidth_buckets));
 	if (regulator->dump_bandwidth_hist == NULL)
@@ -192,11 +203,8 @@ vy_regulator_create(struct vy_regulator *regulator, struct vy_quota *quota,
 	ev_timer_init(&regulator->timer, vy_regulator_timer_cb, 0,
 		      VY_REGULATOR_TIMER_PERIOD);
 	regulator->timer.data = regulator;
-	regulator->write_rate = 0;
-	regulator->quota_used_last = 0;
 	regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
 	regulator->dump_watermark = SIZE_MAX;
-	regulator->dump_in_progress = false;
 }
 
 void
@@ -269,3 +277,132 @@ vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max)
 	vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
 				regulator->dump_bandwidth);
 }
+
+void
+vy_regulator_reset_stat(struct vy_regulator *regulator)
+{
+	memset(&regulator->sched_stat_last, 0,
+	       sizeof(regulator->sched_stat_last));
+}
+
+/*
+ * The goal of rate limiting is to ensure LSM trees stay close to
+ * their perfect shape, as defined by run_size_ratio. When dump rate
+ * is too high, we have to throttle database writes to ensure
+ * compaction can keep up with dumps. We can't deduce optimal dump
+ * bandwidth from LSM configuration, such as run_size_ratio or
+ * run_count_per_level, since different spaces or different indexes
+ * within a space can have different configuration settings. The
+ * workload can also vary significantly from space to space. So,
+ * when setting the limit, we have to consider dump and compaction
+ * activities of the database as a whole.
+ *
+ * To this end, we keep track of compaction bandwidth and write
+ * amplification of the entire database, across all LSM trees.
+ * The idea is simple: observe the current write amplification
+ * and compaction bandwidth, and set maximal write rate to a value
+ * somewhat below the implied limit, so as to make room for
+ * compaction to do more work if necessary.
+ *
+ * We use the following metrics to calculate the limit:
+ *  - dump_output - number of bytes dumped to disk over the last
+ *    observation period. The period itself is measured in dumps,
+ *    not seconds, and is defined by constant VY_RECENT_DUMP_COUNT.
+ *  - compaction_output - number of bytes produced by compaction
+ *    over the same period.
+ *  - compaction_rate - total compaction output, in bytes, divided
+ *    by the total time compaction threads spent doing compaction,
+ *    both measured over the same observation period. This gives an
+ *    estimate of the speed at which compaction can write output.
+ *    In the real world this speed is dependent on the disk write
+ *    throughput, number of dump threads, and actual dump rate, but
+ *    given the goal of rate limiting is providing compaction with
+ *    extra bandwidth, this metric is considered an accurate enough
+ *    approximation of the disk bandwidth available to compaction.
+ *
+ * We calculate the compaction rate with the following formula:
+ *
+ *                                            compaction_output
+ *     compaction_rate = compaction_threads * -----------------
+ *                                             compaction_time
+ *
+ * where compaction_threads represents the total number of available
+ * compaction threads and compaction_time is the total time, in
+ * seconds, spent by all threads doing compaction. You can look at
+ * the formula this way: compaction_output / compaction_time gives
+ * the average write speed of a single compaction thread, and by
+ * multiplying it by the number of compaction threads we get the
+ * compaction rate of the entire database.
+ *
+ * In an optimal system the dump rate must be proportional to the
+ * compaction rate and inversely proportional to write amplification:
+ *
+ *     dump_rate = compaction_rate / (write_amplification - 1)
+ *
+ * The latter can be obtained by dividing total output of compaction
+ * by total output of dumps over the observation period:
+ *
+ *                           dump_output + compaction_output
+ *     write_amplification = ------------------------------- =
+ *                                    dump_output
+ *
+ *                         = 1 + compaction_output / dump_output
+ *
+ * Putting this all together and taking into account data compaction
+ * during memory dump, we get for the max transaction rate:
+ *
+ *                           dump_input
+ *     tx_rate = dump_rate * ----------- =
+ *                           dump_output
+ *
+ *                                    compaction_output
+ *             = compaction_threads * ----------------- *
+ *                                     compaction_time
+ *
+ *                              dump_output      dump_input
+ *                         * ----------------- * ----------- =
+ *                           compaction_output   dump_output
+ *
+ *             = compaction_threads * dump_input / compaction_time
+ *
+ * We set the rate limit to 0.75 of the approximated optimal to
+ * leave the database engine enough room to use more disk
+ * bandwidth for compaction if necessary. As soon as compaction gets
+ * enough disk bandwidth to keep LSM trees in optimal shape,
+ * compaction speed becomes stable, as does write amplification.
+ */
+void
+vy_regulator_update_rate_limit(struct vy_regulator *regulator,
+			       const struct vy_scheduler_stat *stat,
+			       int compaction_threads)
+{
+	struct vy_scheduler_stat *last = &regulator->sched_stat_last;
+	struct vy_scheduler_stat *recent = &regulator->sched_stat_recent;
+
+	int32_t dump_count = stat->dump_count - last->dump_count;
+	int64_t dump_input = stat->dump_input - last->dump_input;
+	double compaction_time = stat->compaction_time - last->compaction_time;
+	*last = *stat;
+
+	if (dump_input < (ssize_t)VY_DUMP_SIZE_ACCT_MIN)
+		return;
+
+	recent->dump_count += dump_count;
+	recent->dump_input += dump_input;
+	recent->compaction_time += compaction_time;
+
+	double rate = 0.75 * compaction_threads * recent->dump_input /
+						  recent->compaction_time;
+	vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_DISK,
+				MIN(rate, SIZE_MAX));
+
+	/*
+	 * Periodically rotate statistics for quicker adaptation
+	 * to workload changes.
+	 */
+	if (recent->dump_count > VY_RECENT_DUMP_COUNT) {
+		recent->dump_count /= 2;
+		recent->dump_input /= 2;
+		recent->compaction_time /= 2;
+	}
+}
diff --git a/src/box/vy_regulator.h b/src/box/vy_regulator.h
index 0188da26..341f41df 100644
--- a/src/box/vy_regulator.h
+++ b/src/box/vy_regulator.h
@@ -35,6 +35,8 @@
 #include <stddef.h>
 #include <tarantool_ev.h>
 
+#include "vy_stat.h"
+
 #if defined(__cplusplus)
 extern "C" {
 #endif /* defined(__cplusplus) */
@@ -107,6 +109,16 @@ struct vy_regulator {
 	 * but vy_regulator_dump_complete() hasn't been called yet.
 	 */
 	bool dump_in_progress;
+	/**
+	 * Snapshot of scheduler statistics taken at the time of
+	 * the last rate limit update.
+	 */
+	struct vy_scheduler_stat sched_stat_last;
+	/**
+	 * Scheduler statistics for the most recent few dumps.
+	 * Used for calculating the rate limit.
+	 */
+	struct vy_scheduler_stat sched_stat_recent;
 };
 
 void
@@ -146,6 +158,21 @@ vy_regulator_dump_complete(struct vy_regulator *regulator,
 void
 vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max);
 
+/**
+ * Called when global statistics are reset by box.stat.reset().
+ */
+void
+vy_regulator_reset_stat(struct vy_regulator *regulator);
+
+/**
+ * Set transaction rate limit so as to ensure that compaction
+ * will keep up with dumps.
+ */
+void
+vy_regulator_update_rate_limit(struct vy_regulator *regulator,
+			       const struct vy_scheduler_stat *stat,
+			       int compaction_threads);
+
 #if defined(__cplusplus)
 } /* extern "C" */
 #endif /* defined(__cplusplus) */
-- 
2.11.0