From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH 3/3] vinyl: force major compaction if there are too many DELETEs Date: Sat, 20 Oct 2018 21:23:10 +0300 Message-Id: In-Reply-To: References: In-Reply-To: References: To: kostja@tarantool.org Cc: tarantool-patches@freelists.org List-ID: Even a perfectly shaped LSM tree can accumulate a huge number of DELETE statements over time in case indexed fields are frequently updated. This can significantly increase read and space amplification, especially for secondary indexes. One way to deal with it is to propagate read amplification back to the scheduler so that it can raise compaction priority accordingly. Although this would probably make sense, it wouldn't be enough, because it wouldn't deal with space amplification growth in case the workload is write-mostly. So this patch simply forces major compaction for a range if the number of DELETEs plus the statements they are supposed to purge exceeds a half of all statements in the range. This should prevent uncontrollable growth of the number of DELETEs and confine space and read amplification to sane values. Part of #3225 --- src/box/vy_range.c | 22 ++++++++++++++++++++ src/box/vy_run.c | 14 +++++++++++++ src/box/vy_run.h | 8 ++++++++ src/box/vy_stat.h | 6 ++++++ test/vinyl/compact.result | 40 +++++++++++++++++++++++++++++++++++++ test/vinyl/compact.test.lua | 18 +++++++++++++++++ test/vinyl/deferred_delete.result | 26 +++++++++++++++++++----- test/vinyl/deferred_delete.test.lua | 15 +++++++++++--- test/vinyl/update_optimize.result | 9 ++++++--- test/vinyl/update_optimize.test.lua | 5 +++-- 10 files changed, 150 insertions(+), 13 deletions(-) diff --git a/src/box/vy_range.c b/src/box/vy_range.c index af1227ea..63efcfc6 100644 --- a/src/box/vy_range.c +++ b/src/box/vy_range.c @@ -310,6 +310,8 @@ vy_range_update_compact_priority(struct vy_range *range, /* Total number of statements in checked runs. */ struct vy_disk_stmt_counter total_stmt_count; vy_disk_stmt_counter_reset(&total_stmt_count); + struct vy_stmt_stat total_stmt_stat; + vy_stmt_stat_reset(&total_stmt_stat); /* Total number of checked runs. */ uint32_t total_run_count = 0; /* Estimated size of a compacted run, if compaction is scheduled. */ @@ -326,6 +328,10 @@ vy_range_update_compact_priority(struct vy_range *range, struct vy_slice *slice; rlist_foreach_entry(slice, &range->slices, in_range) { + struct vy_stmt_stat stmt_stat; + vy_slice_stmt_stat(slice, &stmt_stat); + vy_stmt_stat_add(&total_stmt_stat, &stmt_stat); + uint64_t size = slice->count.bytes_compressed; /* * The size of the first level is defined by @@ -377,6 +383,22 @@ vy_range_update_compact_priority(struct vy_range *range, est_new_run_size = total_stmt_count.bytes_compressed; } } + + /* + * Even a perfectly shaped LSM tree can accumulate a huge + * number of DELETE statements over time in case indexed + * fields get updated frequently. To prevent read and space + * amplification from growing uncontrollably, we force + * major compaction if the total number of DELETEs plus the + * statements that they are supposed to purge exceeds a + * half of all statements, i.e. + * + * 2 * deletes > total / 2 + */ + if (total_stmt_stat.deletes > total_stmt_count.rows / 4) { + range->compact_priority = range->slice_count; + range->compact_queue = range->count; + } } /** diff --git a/src/box/vy_run.c b/src/box/vy_run.c index 5f2b980b..5c8b3700 100644 --- a/src/box/vy_run.c +++ b/src/box/vy_run.c @@ -476,6 +476,20 @@ vy_slice_cut(struct vy_slice *slice, int64_t id, struct tuple *begin, return 0; } +void +vy_slice_stmt_stat(struct vy_slice *slice, struct vy_stmt_stat *stat) +{ + struct vy_run_info *run_info = &slice->run->info; + struct vy_stmt_stat *run_stmt_stat = &run_info->stmt_stat; + uint32_t run_pages = run_info->page_count; + uint32_t slice_pages = slice->last_page_no - slice->first_page_no + 1; + + stat->inserts = run_stmt_stat->inserts * slice_pages / run_pages; + stat->replaces = run_stmt_stat->replaces * slice_pages / run_pages; + stat->deletes = run_stmt_stat->deletes * slice_pages / run_pages; + stat->upserts = run_stmt_stat->upserts * slice_pages / run_pages; +} + /** * Decode page information from xrow. * diff --git a/src/box/vy_run.h b/src/box/vy_run.h index 990daffa..90e83c22 100644 --- a/src/box/vy_run.h +++ b/src/box/vy_run.h @@ -492,6 +492,14 @@ vy_slice_cut(struct vy_slice *slice, int64_t id, struct tuple *begin, struct vy_slice **result); /** + * Estimate statement count of each type spanned by a slice. + * Basically, this function returns run_info::stmt_stat scaled + * down proportionately to the slice size. + */ +void +vy_slice_stmt_stat(struct vy_slice *slice, struct vy_stmt_stat *stat); + +/** * Open an iterator over on-disk run. * * Note, it is the caller's responsibility to make sure the slice diff --git a/src/box/vy_stat.h b/src/box/vy_stat.h index 9bb09174..04fea3c3 100644 --- a/src/box/vy_stat.h +++ b/src/box/vy_stat.h @@ -309,6 +309,12 @@ vy_disk_stmt_counter_sub(struct vy_disk_stmt_counter *c1, c1->pages -= c2->pages; } +static inline void +vy_stmt_stat_reset(struct vy_stmt_stat *stat) +{ + memset(stat, 0, sizeof(*stat)); +} + /** * Account a single statement of the given type in @stat. */ diff --git a/test/vinyl/compact.result b/test/vinyl/compact.result index a79292ee..51dca963 100644 --- a/test/vinyl/compact.result +++ b/test/vinyl/compact.result @@ -210,3 +210,43 @@ info() -- 4 ranges, 4 runs s:drop() --- ... +-- +-- gh-3225: major compaction is triggered if key fields are +-- frequently updated. +-- +s = box.schema.space.create('test', {engine = 'vinyl'}) +--- +... +i1 = s:create_index('primary', {run_count_per_level = 10}) +--- +... +i2 = s:create_index('secondary', {run_count_per_level = 10, parts = {2, 'unsigned'}}) +--- +... +for i = 1, 10 do s:replace{i, i} end +--- +... +box.snapshot() +--- +- ok +... +for i = 1, 10 do s:update(i, {{'+', 2, 10}}) end +--- +... +box.snapshot() +--- +- ok +... +while i2:stat().disk.compact.queue.rows > 0 do fiber.sleep(0.01) end +--- +... +i2:stat().disk.statement +--- +- inserts: 10 + replaces: 0 + upserts: 0 + deletes: 0 +... +s:drop() +--- +... diff --git a/test/vinyl/compact.test.lua b/test/vinyl/compact.test.lua index b9a1c109..f0c71640 100644 --- a/test/vinyl/compact.test.lua +++ b/test/vinyl/compact.test.lua @@ -95,3 +95,21 @@ compact() info() -- 4 ranges, 4 runs s:drop() + +-- +-- gh-3225: major compaction is triggered if key fields are +-- frequently updated. +-- +s = box.schema.space.create('test', {engine = 'vinyl'}) +i1 = s:create_index('primary', {run_count_per_level = 10}) +i2 = s:create_index('secondary', {run_count_per_level = 10, parts = {2, 'unsigned'}}) + +for i = 1, 10 do s:replace{i, i} end +box.snapshot() +for i = 1, 10 do s:update(i, {{'+', 2, 10}}) end +box.snapshot() + +while i2:stat().disk.compact.queue.rows > 0 do fiber.sleep(0.01) end +i2:stat().disk.statement + +s:drop() diff --git a/test/vinyl/deferred_delete.result b/test/vinyl/deferred_delete.result index 4a68c6c2..14408fc2 100644 --- a/test/vinyl/deferred_delete.result +++ b/test/vinyl/deferred_delete.result @@ -662,6 +662,12 @@ test_run:cmd("switch test") fiber = require('fiber') --- ... +var = box.schema.space.create('var') +--- +... +_ = var:create_index('primary', {parts = {1, 'string'}}) +--- +... s = box.schema.space.create('test', {engine = 'vinyl'}) --- ... @@ -711,11 +717,18 @@ sk:stat().disk.dump.count -- 1 --- - 1 ... -sk:stat().rows -- 120 old REPLACEs + 120 new REPLACEs + 120 deferred DELETEs +-- Excess of DELETEs triggers compaction. Wait for it to complete. +while sk:stat().disk.compact.queue.rows > 0 do fiber.sleep(0.01) end +--- +... +-- Remember the total number of rows before restart. +_ = var:put{'rows', sk:stat().rows} --- -- 360 ... test_run:cmd("restart server test with args='1048576'") +var = box.space.var +--- +... s = box.space.test --- ... @@ -725,16 +738,19 @@ pk = s.index.pk sk = s.index.sk --- ... --- Should be 360, the same amount of statements as before restart. +-- Should be the same amount of statements as before restart. -- If we applied all deferred DELETEs, including the dumped ones, -- then there would be more. -sk:stat().rows +sk:stat().rows == var:get('rows')[2] --- -- 360 +- true ... s:drop() --- ... +var:drop() +--- +... test_run:cmd("switch default") --- - true diff --git a/test/vinyl/deferred_delete.test.lua b/test/vinyl/deferred_delete.test.lua index e9e0e293..dfffe22f 100644 --- a/test/vinyl/deferred_delete.test.lua +++ b/test/vinyl/deferred_delete.test.lua @@ -249,6 +249,9 @@ test_run:cmd("switch test") fiber = require('fiber') +var = box.schema.space.create('var') +_ = var:create_index('primary', {parts = {1, 'string'}}) + s = box.schema.space.create('test', {engine = 'vinyl'}) pk = s:create_index('pk', {run_count_per_level = 10}) sk = s:create_index('sk', {run_count_per_level = 10, parts = {2, 'unsigned', 3, 'string'}, unique = false}) @@ -273,19 +276,25 @@ while pk:stat().disk.compact.count == 0 do fiber.sleep(0.001) end sk:stat().disk.dump.count -- 1 -sk:stat().rows -- 120 old REPLACEs + 120 new REPLACEs + 120 deferred DELETEs +-- Excess of DELETEs triggers compaction. Wait for it to complete. +while sk:stat().disk.compact.queue.rows > 0 do fiber.sleep(0.01) end + +-- Remember the total number of rows before restart. +_ = var:put{'rows', sk:stat().rows} test_run:cmd("restart server test with args='1048576'") +var = box.space.var s = box.space.test pk = s.index.pk sk = s.index.sk --- Should be 360, the same amount of statements as before restart. +-- Should be the same amount of statements as before restart. -- If we applied all deferred DELETEs, including the dumped ones, -- then there would be more. -sk:stat().rows +sk:stat().rows == var:get('rows')[2] s:drop() +var:drop() test_run:cmd("switch default") test_run:cmd("stop server test") diff --git a/test/vinyl/update_optimize.result b/test/vinyl/update_optimize.result index 40f34151..93aa5989 100644 --- a/test/vinyl/update_optimize.result +++ b/test/vinyl/update_optimize.result @@ -721,7 +721,7 @@ s = box.schema.space.create('test', {engine = 'vinyl'}) _ = s:create_index('pk') --- ... -_ = s:create_index('sk', {parts = {2, 'unsigned'}, run_count_per_level = 10}) +_ = s:create_index('sk', {parts = {2, 'unsigned'}}) --- ... s:insert{1, 10} @@ -743,9 +743,12 @@ box.snapshot() --- - ok ... -s.index.sk:stat().rows -- INSERT in the first run + DELETE the second run +while s.index.sk:stat().disk.compact.queue.rows > 0 do fiber.sleep(0.01) end --- -- 2 +... +s.index.sk:stat().rows -- 0 (INSERT + DELETE got purged by compaction) +--- +- 0 ... s:insert{1, 20} --- diff --git a/test/vinyl/update_optimize.test.lua b/test/vinyl/update_optimize.test.lua index a214dcae..a3eed38b 100644 --- a/test/vinyl/update_optimize.test.lua +++ b/test/vinyl/update_optimize.test.lua @@ -241,7 +241,7 @@ space:drop() -- s = box.schema.space.create('test', {engine = 'vinyl'}) _ = s:create_index('pk') -_ = s:create_index('sk', {parts = {2, 'unsigned'}, run_count_per_level = 10}) +_ = s:create_index('sk', {parts = {2, 'unsigned'}}) s:insert{1, 10} box.snapshot() @@ -250,7 +250,8 @@ s:update(1, {{'=', 2, 10}}) s:delete(1) box.snapshot() -s.index.sk:stat().rows -- INSERT in the first run + DELETE the second run +while s.index.sk:stat().disk.compact.queue.rows > 0 do fiber.sleep(0.01) end +s.index.sk:stat().rows -- 0 (INSERT + DELETE got purged by compaction) s:insert{1, 20} s.index.sk:select() -- 2.11.0