From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH v2 6/8] vinyl: randomize range compaction to avoid IO load spikes Date: Thu, 24 Jan 2019 20:12:42 +0300 Message-Id: <1d11af88e61d4ddb86f3406202ad178c1c7672cf.1548349067.git.vdavydov.dev@gmail.com> In-Reply-To: References: In-Reply-To: References: To: tarantool-patches@freelists.org List-ID: Since all ranges constituting an LSM tree have the same configuration, they tend to get compacted at approximately the same time. This entails IO load spikes, which, in turn, lead to deviation of the LSM tree from the target shape and hence increased read amplification. To prevent this from happening, this patch implements compaction randomization: with 10% probability we defer compaction at each LSM tree level, i.e. if the number of runs at a level exceeds the configured run_count_per_level, the level will be compacted with 90%-probability, but with 10% probability it won't - compaction will be deferred until another run is added to the level. Our simulations show that such a simple algorithm performs fairly well: it randomizes compaction pace among ranges, spreading IO load evenly in time, while the write amplification is increased by not more than 5-10%, which seems to be a reasonable price for elimination of IO load spikes. Closes #3944 --- src/box/vy_range.c | 20 +++++++++++++++++++- src/box/vy_run.c | 1 + src/box/vy_run.h | 5 +++++ test/vinyl/write_iterator.result | 20 ++++++++++++++++++-- test/vinyl/write_iterator.test.lua | 10 ++++++++-- 5 files changed, 51 insertions(+), 5 deletions(-) diff --git a/src/box/vy_range.c b/src/box/vy_range.c index 1a2a2c12..7b6ddb40 100644 --- a/src/box/vy_range.c +++ b/src/box/vy_range.c @@ -389,7 +389,25 @@ vy_range_update_compaction_priority(struct vy_range *range, * we find an appropriate level for it. 
*/ } - if (level_run_count > opts->run_count_per_level) { + /* + * Since all ranges constituting an LSM tree have + * the same configuration, they tend to get compacted + * simultaneously, leading to IO load spikes and, as + * a result, distortion of the LSM tree shape and + * increased read amplification. To prevent this from + * happening, we constantly randomize compaction pace + * among ranges by deferring compaction at each LSM + * tree level with some fixed small probability. + * + * Note, we can't use rand() directly here, because + * this function is called on every memory dump and + * scans all LSM tree levels. Instead we use the + * value of rand() from the slice creation time. + */ + uint32_t max_run_count = opts->run_count_per_level; + if (slice->seed < RAND_MAX / 10) + max_run_count++; + if (level_run_count > max_run_count) { /* * The number of runs at the current level * exceeds the configured maximum. Arrange diff --git a/src/box/vy_run.c b/src/box/vy_run.c index cee90458..4c7e637c 100644 --- a/src/box/vy_run.c +++ b/src/box/vy_run.c @@ -377,6 +377,7 @@ vy_slice_new(int64_t id, struct vy_run *run, struct tuple *begin, memset(slice, 0, sizeof(*slice)); slice->id = id; slice->run = run; + slice->seed = rand(); vy_run_ref(run); run->slice_count++; if (begin != NULL) diff --git a/src/box/vy_run.h b/src/box/vy_run.h index 28fd6a50..18ca1729 100644 --- a/src/box/vy_run.h +++ b/src/box/vy_run.h @@ -185,6 +185,11 @@ struct vy_slice { struct tuple *begin; struct tuple *end; /** + * Random seed used for compaction randomization. + * Lies in range [0, RAND_MAX]. + */ + int seed; + /** + * Number of async users of this slice. Slice must not + * be removed until it hits 0. Used by the iterator to + * prevent use-after-free after waiting for IO. 
diff --git a/test/vinyl/write_iterator.result b/test/vinyl/write_iterator.result index 88a1c287..39212572 100644 --- a/test/vinyl/write_iterator.result +++ b/test/vinyl/write_iterator.result @@ -755,7 +755,7 @@ sk = s:create_index('secondary', {run_count_per_level = 1, parts = {2, 'unsigned PAD1 = 100 --- ... -PAD2 = 10 +PAD2 = 15 --- ... -- Create a big run to prevent major compaction. @@ -766,6 +766,14 @@ box.snapshot() --- - ok ... +-- Some padding to trigger minor compaction. +for i = 1001, 1000 + PAD2 do s:replace{i, i} end +--- +... +box.snapshot() +--- +- ok +... -- Generate some INSERT statements and dump them to disk. _ = s:insert{1, 1} -- insert --- @@ -887,7 +895,7 @@ sk = s:create_index('secondary', {run_count_per_level = 1, parts = {2, 'unsigned PAD1 = 100 --- ... -PAD2 = 10 +PAD2 = 15 --- ... -- Create a big run to prevent major compaction. @@ -922,6 +930,14 @@ box.snapshot() --- - ok ... +-- Some padding to trigger minor compaction. +for i = 1001, 1000 + PAD2 do s:replace{i, i} end +--- +... +box.snapshot() +--- +- ok +... -- Generate DELETE+INSERT statements and write them to disk. s:delete{1} s:insert{1, 100} --- diff --git a/test/vinyl/write_iterator.test.lua b/test/vinyl/write_iterator.test.lua index 069c7f69..d7871ffc 100644 --- a/test/vinyl/write_iterator.test.lua +++ b/test/vinyl/write_iterator.test.lua @@ -323,10 +323,13 @@ _ = s:on_replace(function() end) pk = s:create_index('primary', {run_count_per_level = 1}) sk = s:create_index('secondary', {run_count_per_level = 1, parts = {2, 'unsigned'}}) PAD1 = 100 -PAD2 = 10 +PAD2 = 15 -- Create a big run to prevent major compaction. for i = 1001, 1000 + PAD1 do s:replace{i, i} end box.snapshot() +-- Some padding to trigger minor compaction. +for i = 1001, 1000 + PAD2 do s:replace{i, i} end +box.snapshot() -- Generate some INSERT statements and dump them to disk. 
_ = s:insert{1, 1} -- insert _ = s:replace{2, 2} -- replace, no old tuple @@ -373,7 +376,7 @@ s = box.schema.space.create('test', {engine = 'vinyl'}) pk = s:create_index('primary', {run_count_per_level = 1}) sk = s:create_index('secondary', {run_count_per_level = 1, parts = {2, 'unsigned'}}) PAD1 = 100 -PAD2 = 10 +PAD2 = 15 -- Create a big run to prevent major compaction. for i = 1001, 1000 + PAD1 do s:insert{i, i} end _ = s:insert{1, 1} @@ -385,6 +388,9 @@ _ = s:insert{6, 6} _ = s:insert{7, 7} _ = s:insert{8, 8} box.snapshot() +-- Some padding to trigger minor compaction. +for i = 1001, 1000 + PAD2 do s:replace{i, i} end +box.snapshot() -- Generate DELETE+INSERT statements and write them to disk. s:delete{1} s:insert{1, 100} box.begin() s:delete{2} s:insert{2, 200} box.commit() -- 2.11.0