Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: tarantool-patches@freelists.org
Subject: [PATCH v2 6/8] vinyl: randomize range compaction to avoid IO load spikes
Date: Thu, 24 Jan 2019 20:12:42 +0300	[thread overview]
Message-ID: <1d11af88e61d4ddb86f3406202ad178c1c7672cf.1548349067.git.vdavydov.dev@gmail.com> (raw)
In-Reply-To: <cover.1548349067.git.vdavydov.dev@gmail.com>
In-Reply-To: <cover.1548349067.git.vdavydov.dev@gmail.com>

Since all ranges constituting an LSM tree have the same configuration,
they tend to get compacted at approximately the same time. This entails
IO load spikes, which, in turn, lead to deviation of the LSM tree from
the target shape and hence increased read amplification. To prevent this
from happening, this patch implements compaction randomization: with 10%
probability we defer compaction at each LSM tree level, i.e. if the
number of runs at a level exceeds the configured run_count_per_level,
the level will be compacted with 90%-probability, but with 10%
probability it won't - compaction will be deferred until another run
is added to the level.

Our simulations show that such a simple algorithm performs fairly well:
it randomizes compaction pace among ranges, spreading IO load evenly in
time, while the write amplification is increased by not more than 5-10%,
which seems to be a reasonable price for elimination of IO load spikes.

Closes #3944
---
 src/box/vy_range.c                 | 20 +++++++++++++++++++-
 src/box/vy_run.c                   |  1 +
 src/box/vy_run.h                   |  5 +++++
 test/vinyl/write_iterator.result   | 20 ++++++++++++++++++--
 test/vinyl/write_iterator.test.lua | 10 ++++++++--
 5 files changed, 51 insertions(+), 5 deletions(-)

diff --git a/src/box/vy_range.c b/src/box/vy_range.c
index 1a2a2c12..7b6ddb40 100644
--- a/src/box/vy_range.c
+++ b/src/box/vy_range.c
@@ -389,7 +389,25 @@ vy_range_update_compaction_priority(struct vy_range *range,
 			 * we find an appropriate level for it.
 			 */
 		}
-		if (level_run_count > opts->run_count_per_level) {
+		/*
+		 * Since all ranges constituting an LSM tree have
+		 * the same configuration, they tend to get compacted
+		 * simultaneously, leading to IO load spikes and, as
+		 * a result, distortion of the LSM tree shape and
+		 * increased read amplification. To prevent this from
+		 * happening, we constantly randomize compaction pace
+		 * among ranges by deferring compaction at each LSM
+		 * tree level with some fixed small probability.
+		 *
+		 * Note, we can't use rand() directly here, because
+		 * this function is called on every memory dump and
+		 * scans all LSM tree levels. Instead we use the
+		 * value of rand() from the slice creation time.
+		 */
+		uint32_t max_run_count = opts->run_count_per_level;
+		if (slice->seed < RAND_MAX / 10)
+			max_run_count++;
+		if (level_run_count > max_run_count) {
 			/*
 			 * The number of runs at the current level
 			 * exceeds the configured maximum. Arrange
diff --git a/src/box/vy_run.c b/src/box/vy_run.c
index cee90458..4c7e637c 100644
--- a/src/box/vy_run.c
+++ b/src/box/vy_run.c
@@ -377,6 +377,7 @@ vy_slice_new(int64_t id, struct vy_run *run, struct tuple *begin,
 	memset(slice, 0, sizeof(*slice));
 	slice->id = id;
 	slice->run = run;
+	slice->seed = rand();
 	vy_run_ref(run);
 	run->slice_count++;
 	if (begin != NULL)
diff --git a/src/box/vy_run.h b/src/box/vy_run.h
index 28fd6a50..18ca1729 100644
--- a/src/box/vy_run.h
+++ b/src/box/vy_run.h
@@ -185,6 +185,11 @@ struct vy_slice {
 	struct tuple *begin;
 	struct tuple *end;
 	/**
+	 * Random seed used for compaction randomization.
+	 * Lays in range [0, RAND_MAX].
+	 */
+	int seed;
+	/**
 	 * Number of async users of this slice. Slice must not
 	 * be removed until it hits 0. Used by the iterator to
 	 * prevent use-after-free after waiting for IO.
diff --git a/test/vinyl/write_iterator.result b/test/vinyl/write_iterator.result
index 88a1c287..39212572 100644
--- a/test/vinyl/write_iterator.result
+++ b/test/vinyl/write_iterator.result
@@ -755,7 +755,7 @@ sk = s:create_index('secondary', {run_count_per_level = 1, parts = {2, 'unsigned
 PAD1 = 100
 ---
 ...
-PAD2 = 10
+PAD2 = 15
 ---
 ...
 -- Create a big run to prevent major compaction.
@@ -766,6 +766,14 @@ box.snapshot()
 ---
 - ok
 ...
+-- Some padding to trigger minor compaction.
+for i = 1001, 1000 + PAD2 do s:replace{i, i} end
+---
+...
+box.snapshot()
+---
+- ok
+...
 -- Generate some INSERT statements and dump them to disk.
 _ = s:insert{1, 1} -- insert
 ---
@@ -887,7 +895,7 @@ sk = s:create_index('secondary', {run_count_per_level = 1, parts = {2, 'unsigned
 PAD1 = 100
 ---
 ...
-PAD2 = 10
+PAD2 = 15
 ---
 ...
 -- Create a big run to prevent major compaction.
@@ -922,6 +930,14 @@ box.snapshot()
 ---
 - ok
 ...
+-- Some padding to trigger minor compaction.
+for i = 1001, 1000 + PAD2 do s:replace{i, i} end
+---
+...
+box.snapshot()
+---
+- ok
+...
 -- Generate DELETE+INSERT statements and write them to disk.
 s:delete{1} s:insert{1, 100}
 ---
diff --git a/test/vinyl/write_iterator.test.lua b/test/vinyl/write_iterator.test.lua
index 069c7f69..d7871ffc 100644
--- a/test/vinyl/write_iterator.test.lua
+++ b/test/vinyl/write_iterator.test.lua
@@ -323,10 +323,13 @@ _ = s:on_replace(function() end)
 pk = s:create_index('primary', {run_count_per_level = 1})
 sk = s:create_index('secondary', {run_count_per_level = 1, parts = {2, 'unsigned'}})
 PAD1 = 100
-PAD2 = 10
+PAD2 = 15
 -- Create a big run to prevent major compaction.
 for i = 1001, 1000 + PAD1 do s:replace{i, i} end
 box.snapshot()
+-- Some padding to trigger minor compaction.
+for i = 1001, 1000 + PAD2 do s:replace{i, i} end
+box.snapshot()
 -- Generate some INSERT statements and dump them to disk.
 _ = s:insert{1, 1} -- insert
 _ = s:replace{2, 2} -- replace, no old tuple
@@ -373,7 +376,7 @@ s = box.schema.space.create('test', {engine = 'vinyl'})
 pk = s:create_index('primary', {run_count_per_level = 1})
 sk = s:create_index('secondary', {run_count_per_level = 1, parts = {2, 'unsigned'}})
 PAD1 = 100
-PAD2 = 10
+PAD2 = 15
 -- Create a big run to prevent major compaction.
 for i = 1001, 1000 + PAD1 do s:insert{i, i} end
 _ = s:insert{1, 1}
@@ -385,6 +388,9 @@ _ = s:insert{6, 6}
 _ = s:insert{7, 7}
 _ = s:insert{8, 8}
 box.snapshot()
+-- Some padding to trigger minor compaction.
+for i = 1001, 1000 + PAD2 do s:replace{i, i} end
+box.snapshot()
 -- Generate DELETE+INSERT statements and write them to disk.
 s:delete{1} s:insert{1, 100}
 box.begin() s:delete{2} s:insert{2, 200} box.commit()
-- 
2.11.0

  parent reply	other threads:[~2019-01-24 17:12 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-01-24 17:12 [PATCH v2 0/8] vinyl: compaction randomization and throttling Vladimir Davydov
2019-01-24 17:12 ` [PATCH v2 1/8] vinyl: use uncompressed run size for range split/coalesce/compaction Vladimir Davydov
2019-01-24 17:12 ` [PATCH v2 2/8] vinyl: fix compaction priority calculation Vladimir Davydov
2019-02-08 17:18   ` [tarantool-patches] " Konstantin Osipov
2019-02-11 14:42     ` Vladimir Davydov
2019-01-24 17:12 ` [PATCH v2 3/8] vinyl: rename lsm->range_heap to max_compaction_priority Vladimir Davydov
2019-02-08 17:19   ` [tarantool-patches] " Konstantin Osipov
2019-02-08 17:26     ` Vladimir Davydov
2019-01-24 17:12 ` [PATCH v2 4/8] vinyl: keep track of dumps per compaction for each LSM tree Vladimir Davydov
2019-02-08 17:42   ` Vladimir Davydov
2019-02-11 18:17     ` [tarantool-patches] " Konstantin Osipov
2019-02-11 18:17   ` Konstantin Osipov
2019-02-12 10:15     ` Vladimir Davydov
2019-01-24 17:12 ` [PATCH v2 5/8] vinyl: set range size automatically Vladimir Davydov
2019-02-11 18:21   ` [tarantool-patches] " Konstantin Osipov
2019-02-12 10:16     ` Vladimir Davydov
2019-01-24 17:12 ` Vladimir Davydov [this message]
2019-02-11 18:21   ` [tarantool-patches] Re: [PATCH v2 6/8] vinyl: randomize range compaction to avoid IO load spikes Konstantin Osipov
2019-02-12 10:16     ` Vladimir Davydov
2019-01-24 17:12 ` [PATCH v2 7/8] vinyl: introduce quota consumer types Vladimir Davydov
2019-02-12 15:48   ` Vladimir Davydov
2019-01-24 17:12 ` [PATCH v2 8/8] vinyl: throttle tx to ensure compaction keeps up with dumps Vladimir Davydov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1d11af88e61d4ddb86f3406202ad178c1c7672cf.1548349067.git.vdavydov.dev@gmail.com \
    --to=vdavydov.dev@gmail.com \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [PATCH v2 6/8] vinyl: randomize range compaction to avoid IO load spikes' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox