[PATCH 18/18] vinyl: throttle tx rate if dump does not catch up

Vladimir Davydov vdavydov.dev at gmail.com
Thu Aug 16 19:12:12 MSK 2018


If the rate at which transactions are ready to write to the database is
greater than the dump bandwidth, memory will get depleted before the
last dump is complete and all newer transactions will have to wait until
the dump has been completed, which may take seconds or even minutes:

  2018-08-16 15:45:11.739 [30874] main/1100/main vy_quota.c:291 W> waited for 555 bytes of vinyl memory quota for too long: 15.750 sec

This patch set implements transaction throttling that is supposed to
help avoid unpredictably long stalls. Now, once a dump is started, the
transaction write rate is limited so that the hard limit cannot be
exceeded before the dump is complete. This is how it looks in the log:

  2018-08-16 16:01:09.412 [489] main/445/main I> dumping 134217901 bytes, expected rate 6.0 MB/s, ETA 21.3 s, recent write rate 10.5 MB/s
  2018-08-16 16:01:09.447 [489] main/103/vinyl.scheduler I> 513/0: dump started
  2018-08-16 16:01:09.447 [489] vinyl.writer.0/103/task I> writing `./513/0/00000000000000000004.run'
  2018-08-16 16:01:09.468 [489] main I> throttling enabled, max write rate 6.0 MB/s
  2018-08-16 16:01:30.004 [489] vinyl.writer.0/103/task I> writing `./513/0/00000000000000000004.index'
  2018-08-16 16:01:30.094 [489] main/103/vinyl.scheduler I> 513/0: dump completed
  2018-08-16 16:01:30.095 [489] main/103/vinyl.scheduler I> dumped 134216236 bytes in 20.7 s, rate 6.2 MB/s
  2018-08-16 16:01:30.167 [489] main I> throttling disabled

Closes #1862
---
 src/box/vy_quota.c           | 41 ++++++++++++++++++-
 src/box/vy_quota.h           | 13 ++++++
 test/vinyl/suite.ini         |  2 +-
 test/vinyl/throttle.result   | 95 ++++++++++++++++++++++++++++++++++++++++++++
 test/vinyl/throttle.test.lua | 47 ++++++++++++++++++++++
 5 files changed, 195 insertions(+), 3 deletions(-)
 create mode 100644 test/vinyl/throttle.result
 create mode 100644 test/vinyl/throttle.test.lua

diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 471a8bd0..85e18d4b 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -119,7 +119,7 @@ enum {
 static inline void
 vy_quota_signal(struct vy_quota *q)
 {
-	if (q->used < q->limit)
+	if (q->used < q->limit && q->use_curr < q->use_max)
 		fiber_cond_signal(&q->cond);
 }
 
@@ -133,6 +133,8 @@ vy_quota_check_watermark(struct vy_quota *q)
 	if (!q->dump_in_progress &&
 	    q->used >= q->watermark && q->quota_exceeded_cb(q)) {
 		q->dump_in_progress = true;
+		q->dump_size = q->used;
+		q->dump_start = ev_monotonic_now(loop());
 		say_info("dumping %zu bytes, expected rate %.1f MB/s, "
 			 "ETA %.1f s, recent write rate %.1f MB/s", q->used,
 			 (double)q->dump_bw / 1024 / 1024,
@@ -148,6 +150,7 @@ vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
 	(void)events;
 
 	struct vy_quota *q = timer->data;
+	double now = ev_monotonic_now(loop());
 
 	/*
 	 * Update the quota use rate with the new measurement.
@@ -159,6 +162,34 @@ vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
 	q->use_curr = 0;
 
 	/*
+	 * To avoid unpredictably long stalls, we must limit
+	 * the write rate when a dump is in progress so that
+	 * we don't hit the hard limit before the dump has
+	 * completed, i.e.
+	 *
+	 *   left_to_use    left_to_dump
+	 *   ----------- <= ------------
+	 *     use_rate       dump_rate
+	 */
+	if (q->dump_in_progress) {
+		size_t dumped = q->dump_bw * (now - q->dump_start);
+		size_t left_to_dump = (dumped < q->dump_size ?
+				       q->dump_size - dumped : 0);
+		size_t left_to_use = (q->used < q->limit ?
+				      q->limit - q->used : 0);
+		double max_use_rate = (left_to_use * q->dump_bw /
+				       (left_to_dump + 1));
+		if (q->use_max == SIZE_MAX)
+			say_info("throttling enabled, max write rate "
+				 "%.1f MB/s", max_use_rate / 1024 / 1024);
+		q->use_max = VY_QUOTA_UPDATE_INTERVAL * max_use_rate;
+	} else {
+		if (q->use_max < SIZE_MAX)
+			say_info("throttling disabled");
+		q->use_max = SIZE_MAX;
+	}
+
+	/*
 	 * Update the quota watermark and trigger memory dump
 	 * if the watermark is exceeded.
 	 *
@@ -172,6 +203,9 @@ vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
 	q->watermark = MIN(q->limit * VY_QUOTA_WATERMARK_MAX / 100,
 			   q->watermark);
 	vy_quota_check_watermark(q);
+
+	/* Wake up the next throttled fiber in the line. */
+	vy_quota_signal(q);
 }
 
 int
@@ -201,11 +235,14 @@ vy_quota_create(struct vy_quota *q, vy_quota_exceeded_f quota_exceeded_cb)
 	q->watermark = SIZE_MAX;
 	q->used = 0;
 	q->use_curr = 0;
+	q->use_max = SIZE_MAX;
 	q->use_rate = 0;
 	q->too_long_threshold = TIMEOUT_INFINITY;
 	q->dump_bw = VY_DEFAULT_DUMP_BANDWIDTH;
 	q->quota_exceeded_cb = quota_exceeded_cb;
 	q->dump_in_progress = false;
+	q->dump_size = 0;
+	q->dump_start = 0;
 	fiber_cond_create(&q->cond);
 	ev_timer_init(&q->timer, vy_quota_timer_cb, 0,
 		      VY_QUOTA_UPDATE_INTERVAL);
@@ -277,7 +314,7 @@ vy_quota_try_use(struct vy_quota *q, size_t size, double timeout)
 	double deadline = start_time + timeout;
 
 	q->used += size;
-	while (q->used > q->limit) {
+	while (q->used > q->limit || q->use_curr >= q->use_max) {
 		vy_quota_check_watermark(q);
 		q->used -= size;
 		int rc = fiber_cond_wait_deadline(&q->cond, deadline);
diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h
index cb681386..ef2fb6cb 100644
--- a/src/box/vy_quota.h
+++ b/src/box/vy_quota.h
@@ -86,6 +86,13 @@ struct vy_quota {
 	 * true, but vy_quota_dump() hasn't been called yet.
 	 */
 	bool dump_in_progress;
+	/**
+	 * Memory usage at the time when the last dump was started
+	 * (memory dump size).
+	 */
+	size_t dump_size;
+	/** Time when the last dump was started. */
+	double dump_start;
 	/** Timer for updating quota watermark. */
 	ev_timer timer;
 	/**
@@ -94,6 +101,12 @@ struct vy_quota {
 	 */
 	size_t use_curr;
 	/**
+	 * Maximal amount of quota that can be used between timer
+	 * callback invocations. It is set to such a value so that
+	 * the quota use rate never exceeds the dump bandwidth.
+	 */
+	size_t use_max;
+	/**
 	 * Quota use rate, in bytes per second.
 	 * Calculated as exponentially weighted
 	 * moving average of use_curr.
diff --git a/test/vinyl/suite.ini b/test/vinyl/suite.ini
index b9dae380..785bc63d 100644
--- a/test/vinyl/suite.ini
+++ b/test/vinyl/suite.ini
@@ -6,5 +6,5 @@ release_disabled = errinj.test.lua errinj_gc.test.lua errinj_vylog.test.lua part
 config = suite.cfg
 lua_libs = suite.lua stress.lua large.lua txn_proxy.lua ../box/lua/utils.lua
 use_unix_sockets = True
-long_run = stress.test.lua large.test.lua write_iterator_rand.test.lua dump_stress.test.lua select_consistency.test.lua
+long_run = stress.test.lua large.test.lua write_iterator_rand.test.lua dump_stress.test.lua select_consistency.test.lua throttle.test.lua
 is_parallel = False
diff --git a/test/vinyl/throttle.result b/test/vinyl/throttle.result
new file mode 100644
index 00000000..5634c40b
--- /dev/null
+++ b/test/vinyl/throttle.result
@@ -0,0 +1,95 @@
+test_run = require('test_run').new()
+---
+...
+test_run:cmd("create server test with script='vinyl/low_quota.lua'")
+---
+- true
+...
+test_run:cmd(string.format("start server test with args='%d'", 32 * 1024 * 1024))
+---
+- true
+...
+test_run:cmd('switch test')
+---
+- true
+...
+fiber = require('fiber')
+---
+...
+digest = require('digest')
+---
+...
+box.cfg{snap_io_rate_limit = 4}
+---
+...
+FIBER_COUNT = 5
+---
+...
+TUPLE_SIZE = 1000
+---
+...
+TX_TUPLE_COUNT = 10
+---
+...
+TX_SIZE = TUPLE_SIZE * TX_TUPLE_COUNT
+---
+...
+TX_COUNT = math.ceil(box.cfg.vinyl_memory / (TX_SIZE * FIBER_COUNT))
+---
+...
+s = box.schema.space.create('test', {engine = 'vinyl'})
+---
+...
+_ = s:create_index('primary', {parts = {1, 'unsigned', 2, 'unsigned', 3, 'unsigned'}})
+---
+...
+latency = 0
+---
+...
+c = fiber.channel(FIBER_COUNT)
+---
+...
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+for i = 1, FIBER_COUNT do
+    fiber.create(function()
+        for j = 1, TX_COUNT do
+            local t1 = fiber.time()
+            box.begin()
+            for k = 1, TX_TUPLE_COUNT do
+                s:replace{i, j, k, digest.urandom(TUPLE_SIZE)}
+            end
+            box.commit()
+            local t2 = fiber.time()
+            latency = math.max(latency, t2 - t1)
+        end
+        c:put(true)
+    end)
+end;
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
+for i = 1, FIBER_COUNT do c:get() end
+---
+...
+latency < 0.5 or latency
+---
+- true
+...
+test_run:cmd('switch default')
+---
+- true
+...
+test_run:cmd("stop server test")
+---
+- true
+...
+test_run:cmd("cleanup server test")
+---
+- true
+...
diff --git a/test/vinyl/throttle.test.lua b/test/vinyl/throttle.test.lua
new file mode 100644
index 00000000..4efbbec7
--- /dev/null
+++ b/test/vinyl/throttle.test.lua
@@ -0,0 +1,47 @@
+test_run = require('test_run').new()
+test_run:cmd("create server test with script='vinyl/low_quota.lua'")
+test_run:cmd(string.format("start server test with args='%d'", 32 * 1024 * 1024))
+test_run:cmd('switch test')
+
+fiber = require('fiber')
+digest = require('digest')
+
+box.cfg{snap_io_rate_limit = 4}
+
+FIBER_COUNT = 5
+TUPLE_SIZE = 1000
+TX_TUPLE_COUNT = 10
+TX_SIZE = TUPLE_SIZE * TX_TUPLE_COUNT
+TX_COUNT = math.ceil(box.cfg.vinyl_memory / (TX_SIZE * FIBER_COUNT))
+
+s = box.schema.space.create('test', {engine = 'vinyl'})
+_ = s:create_index('primary', {parts = {1, 'unsigned', 2, 'unsigned', 3, 'unsigned'}})
+
+latency = 0
+c = fiber.channel(FIBER_COUNT)
+
+test_run:cmd("setopt delimiter ';'")
+for i = 1, FIBER_COUNT do
+    fiber.create(function()
+        for j = 1, TX_COUNT do
+            local t1 = fiber.time()
+            box.begin()
+            for k = 1, TX_TUPLE_COUNT do
+                s:replace{i, j, k, digest.urandom(TUPLE_SIZE)}
+            end
+            box.commit()
+            local t2 = fiber.time()
+            latency = math.max(latency, t2 - t1)
+        end
+        c:put(true)
+    end)
+end;
+test_run:cmd("setopt delimiter ''");
+
+for i = 1, FIBER_COUNT do c:get() end
+
+latency < 0.5 or latency
+
+test_run:cmd('switch default')
+test_run:cmd("stop server test")
+test_run:cmd("cleanup server test")
-- 
2.11.0




More information about the Tarantool-patches mailing list