From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH 6/9] vinyl: set range size automatically Date: Mon, 21 Jan 2019 00:17:05 +0300 Message-Id: <6cd378d1640f87d46a6e40f1c51e4ae62a70c209.1548017258.git.vdavydov.dev@gmail.com> In-Reply-To: References: In-Reply-To: References: To: tarantool-patches@freelists.org List-ID: The key space of a vinyl index consists of multiple ranges that can be compacted independently. This design was initially invented to enable parallel compaction, so the range size is configured statically, by the range_size index option, which equals 1 GB by default. However, it turns out that ranges can also be useful for smoothing IO load: if we compact approximately the same number of ranges after each dump, we will avoid IO bursts, which is good, because IO bursts can distort the LSM tree shape, resulting in increased read amplification. To achieve that, we need to maintain at least as many ranges as the number of dumps it takes to trigger major compaction of a range. With the default range size, this condition will hold only if the index is huge (tens to hundreds of gigabytes). If the database isn't that big or consists of many small indexes, the range count will never even approach that number. So this patch makes the range size scale dynamically to satisfy that condition. The range size configuration options, both global and per index, aren't removed, though. The patch just changes the default value of box.cfg.vinyl_range_size to nil, which enables automatic range sizing for all new indexes created without passing range_size explicitly. All existing indexes will still use the range size stored in index options (we don't want to alter the behavior of an existing production setup). We are not planning to drop the range_size option altogether - it can still be useful for testing and performance analysis. The actual range size value is now reported in index.stat(). 
Needed for #3944 --- src/box/alter.cc | 8 +-- src/box/box.cc | 6 +-- src/box/index_def.c | 2 +- src/box/lua/load_cfg.lua | 2 +- src/box/lua/space.cc | 6 ++- src/box/vinyl.c | 1 + src/box/vy_lsm.c | 36 +++++++++++++- src/box/vy_lsm.h | 4 ++ src/box/vy_range.c | 10 ++-- src/box/vy_range.h | 10 ++-- test/app-tap/init_script.result | 21 ++++---- test/box-tap/cfg.test.lua | 3 +- test/box/admin.result | 2 - test/box/cfg.result | 4 -- test/vinyl/ddl.result | 5 -- test/vinyl/ddl.test.lua | 1 - test/vinyl/misc.result | 78 +++++++++++++++++++++++++++++ test/vinyl/misc.test.lua | 26 ++++++++++ test/vinyl/stat.result | 108 ++++++++++++++++++++-------------------- 19 files changed, 228 insertions(+), 105 deletions(-) diff --git a/src/box/alter.cc b/src/box/alter.cc index 0589c967..83953a88 100644 --- a/src/box/alter.cc +++ b/src/box/alter.cc @@ -187,12 +187,8 @@ index_opts_decode(struct index_opts *opts, const char *map, BOX_INDEX_FIELD_OPTS, "distance must be either "\ "'euclid' or 'manhattan'"); } - if (opts->range_size <= 0) { - tnt_raise(ClientError, ER_WRONG_INDEX_OPTIONS, - BOX_INDEX_FIELD_OPTS, - "range_size must be greater than 0"); - } - if (opts->page_size <= 0 || opts->page_size > opts->range_size) { + if (opts->page_size <= 0 || (opts->range_size > 0 && + opts->page_size > opts->range_size)) { tnt_raise(ClientError, ER_WRONG_INDEX_OPTIONS, BOX_INDEX_FIELD_OPTS, "page_size must be greater than 0 and " diff --git a/src/box/box.cc b/src/box/box.cc index 9f2fd6da..b045e465 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -592,11 +592,7 @@ box_check_vinyl_options(void) tnt_raise(ClientError, ER_CFG, "vinyl_write_threads", "must be greater than or equal to 2"); } - if (range_size <= 0) { - tnt_raise(ClientError, ER_CFG, "vinyl_range_size", - "must be greater than 0"); - } - if (page_size <= 0 || page_size > range_size) { + if (page_size <= 0 || (range_size > 0 && page_size > range_size)) { tnt_raise(ClientError, ER_CFG, "vinyl_page_size", "must be greater than 0 and 
less than " "or equal to vinyl_range_size"); diff --git a/src/box/index_def.c b/src/box/index_def.c index 2ba57ee9..c82bc01c 100644 --- a/src/box/index_def.c +++ b/src/box/index_def.c @@ -40,7 +40,7 @@ const struct index_opts index_opts_default = { /* .unique = */ true, /* .dimension = */ 2, /* .distance = */ RTREE_INDEX_DISTANCE_TYPE_EUCLID, - /* .range_size = */ 1073741824, + /* .range_size = */ 0, /* .page_size = */ 8192, /* .run_count_per_level = */ 2, /* .run_size_ratio = */ 3.5, diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua index 6dc4a2af..fc4e560d 100644 --- a/src/box/lua/load_cfg.lua +++ b/src/box/lua/load_cfg.lua @@ -41,7 +41,7 @@ local default_cfg = { vinyl_timeout = 60, vinyl_run_count_per_level = 2, vinyl_run_size_ratio = 3.5, - vinyl_range_size = 1024 * 1024 * 1024, + vinyl_range_size = nil, -- set automatically vinyl_page_size = 8 * 1024, vinyl_bloom_fpr = 0.05, log = nil, diff --git a/src/box/lua/space.cc b/src/box/lua/space.cc index 7cae436f..abebaa87 100644 --- a/src/box/lua/space.cc +++ b/src/box/lua/space.cc @@ -334,8 +334,10 @@ lbox_fillspace(struct lua_State *L, struct space *space, int i) lua_pushstring(L, "options"); lua_newtable(L); - lua_pushnumber(L, index_opts->range_size); - lua_setfield(L, -2, "range_size"); + if (index_opts->range_size > 0) { + lua_pushnumber(L, index_opts->range_size); + lua_setfield(L, -2, "range_size"); + } lua_pushnumber(L, index_opts->page_size); lua_setfield(L, -2, "page_size"); diff --git a/src/box/vinyl.c b/src/box/vinyl.c index dc4fc830..0936932b 100644 --- a/src/box/vinyl.c +++ b/src/box/vinyl.c @@ -458,6 +458,7 @@ vinyl_index_stat(struct index *index, struct info_handler *h) info_table_end(h); /* iterator */ info_table_end(h); /* txw */ + info_append_int(h, "range_size", vy_lsm_range_size(lsm)); info_append_int(h, "range_count", lsm->range_count); info_append_int(h, "run_count", lsm->run_count); info_append_int(h, "run_avg", lsm->run_count / lsm->range_count); diff --git a/src/box/vy_lsm.c 
b/src/box/vy_lsm.c index 6ec86c22..d52482c6 100644 --- a/src/box/vy_lsm.c +++ b/src/box/vy_lsm.c @@ -691,6 +691,37 @@ vy_lsm_dumps_per_compaction(struct vy_lsm *lsm) return range->dumps_per_compaction; } +int64_t +vy_lsm_range_size(struct vy_lsm *lsm) +{ + /* Use the configured range size if available. */ + if (lsm->opts.range_size > 0) + return lsm->opts.range_size; + /* + * It doesn't make much sense to create too small ranges. + * Limit the max number of ranges per index to 1000 and + * never create ranges smaller than 16 MB. + */ + enum { MIN_RANGE_SIZE = 16 * 1024 * 1024 }; + enum { MAX_RANGE_COUNT = 1000 }; + /* + * Ideally, we want to compact roughly the same amount of + * data after each dump so as to avoid IO bursts caused by + * simultaneous major compaction of a bunch of ranges, + * because such IO bursts can lead to a deviation of the + * LSM tree from the configured shape and, as a result, + * increased read amplification. To achieve that, we need + * to have at least as many ranges as the number of dumps + * it takes to trigger major compaction in a range. + */ + int range_count = vy_lsm_dumps_per_compaction(lsm); + range_count = MIN(range_count, MAX_RANGE_COUNT); + int64_t range_size = lsm->stat.disk.last_level_count.bytes / + (range_count + 1); + range_size = MAX(range_size, MIN_RANGE_SIZE); + return range_size; +} + void vy_lsm_add_run(struct vy_lsm *lsm, struct vy_run *run) { @@ -1055,7 +1086,8 @@ vy_lsm_split_range(struct vy_lsm *lsm, struct vy_range *range) struct tuple_format *key_format = lsm->env->key_format; const char *split_key_raw; - if (!vy_range_needs_split(range, &lsm->opts, &split_key_raw)) + if (!vy_range_needs_split(range, vy_lsm_range_size(lsm), + &split_key_raw)) return false; /* Split a range in two parts. 
*/ @@ -1163,7 +1195,7 @@ bool vy_lsm_coalesce_range(struct vy_lsm *lsm, struct vy_range *range) { struct vy_range *first, *last; - if (!vy_range_needs_coalesce(range, lsm->tree, &lsm->opts, + if (!vy_range_needs_coalesce(range, lsm->tree, vy_lsm_range_size(lsm), &first, &last)) return false; diff --git a/src/box/vy_lsm.h b/src/box/vy_lsm.h index 4df9d19a..d7cba109 100644 --- a/src/box/vy_lsm.h +++ b/src/box/vy_lsm.h @@ -444,6 +444,10 @@ vy_lsm_compaction_priority(struct vy_lsm *lsm); int vy_lsm_dumps_per_compaction(struct vy_lsm *lsm); +/** Return the target size of a range in an LSM tree. */ +int64_t +vy_lsm_range_size(struct vy_lsm *lsm); + /** Add a run to the list of runs of an LSM tree. */ void vy_lsm_add_run(struct vy_lsm *lsm, struct vy_run *run); diff --git a/src/box/vy_range.c b/src/box/vy_range.c index 7cb1b4ba..db4a7ab0 100644 --- a/src/box/vy_range.c +++ b/src/box/vy_range.c @@ -418,7 +418,7 @@ vy_range_update_dumps_per_compaction(struct vy_range *range) * 4/3 * range_size. */ bool -vy_range_needs_split(struct vy_range *range, const struct index_opts *opts, +vy_range_needs_split(struct vy_range *range, int64_t range_size, const char **p_split_key) { struct vy_slice *slice; @@ -432,7 +432,7 @@ vy_range_needs_split(struct vy_range *range, const struct index_opts *opts, slice = rlist_last_entry(&range->slices, struct vy_slice, in_range); /* The range is too small to be split. */ - if (slice->count.bytes < opts->range_size * 4 / 3) + if (slice->count.bytes < range_size * 4 / 3) return false; /* Find the median key in the oldest run (approximately). */ @@ -488,15 +488,15 @@ vy_range_needs_split(struct vy_range *range, const struct index_opts *opts, */ bool vy_range_needs_coalesce(struct vy_range *range, vy_range_tree_t *tree, - const struct index_opts *opts, - struct vy_range **p_first, struct vy_range **p_last) + int64_t range_size, struct vy_range **p_first, + struct vy_range **p_last) { struct vy_range *it; /* Size of the coalesced range. 
*/ uint64_t total_size = range->count.bytes; /* Coalesce ranges until total_size > max_size. */ - uint64_t max_size = opts->range_size / 2; + uint64_t max_size = range_size / 2; /* * We can't coalesce a range that was scheduled for dump diff --git a/src/box/vy_range.h b/src/box/vy_range.h index f19c2c6b..1df71dbf 100644 --- a/src/box/vy_range.h +++ b/src/box/vy_range.h @@ -280,13 +280,13 @@ vy_range_update_dumps_per_compaction(struct vy_range *range); * Check if a range needs to be split in two. * * @param range The range. - * @param opts Index options. + * @param range_size Target range size. * @param[out] p_split_key Key to split the range by. * * @retval true If the range needs to be split. */ bool -vy_range_needs_split(struct vy_range *range, const struct index_opts *opts, +vy_range_needs_split(struct vy_range *range, int64_t range_size, const char **p_split_key); /** @@ -295,7 +295,7 @@ vy_range_needs_split(struct vy_range *range, const struct index_opts *opts, * * @param range The range. * @param tree The range tree. - * @param opts Index options. + * @param range_size Target range size. * @param[out] p_first The first range in the tree to coalesce. * @param[out] p_last The last range in the tree to coalesce. 
* @@ -303,8 +303,8 @@ vy_range_needs_split(struct vy_range *range, const struct index_opts *opts, */ bool vy_range_needs_coalesce(struct vy_range *range, vy_range_tree_t *tree, - const struct index_opts *opts, - struct vy_range **p_first, struct vy_range **p_last); + int64_t range_size, struct vy_range **p_first, + struct vy_range **p_last); #if defined(__cplusplus) } /* extern "C" */ diff --git a/test/app-tap/init_script.result b/test/app-tap/init_script.result index 70a4b258..559ef521 100644 --- a/test/app-tap/init_script.result +++ b/test/app-tap/init_script.result @@ -39,17 +39,16 @@ box.cfg 34 vinyl_max_tuple_size:1048576 35 vinyl_memory:134217728 36 vinyl_page_size:8192 -37 vinyl_range_size:1073741824 -38 vinyl_read_threads:1 -39 vinyl_run_count_per_level:2 -40 vinyl_run_size_ratio:3.5 -41 vinyl_timeout:60 -42 vinyl_write_threads:4 -43 wal_dir:. -44 wal_dir_rescan_delay:2 -45 wal_max_size:268435456 -46 wal_mode:write -47 worker_pool_threads:4 +37 vinyl_read_threads:1 +38 vinyl_run_count_per_level:2 +39 vinyl_run_size_ratio:3.5 +40 vinyl_timeout:60 +41 vinyl_write_threads:4 +42 wal_dir:. 
+43 wal_dir_rescan_delay:2 +44 wal_max_size:268435456 +45 wal_mode:write +46 worker_pool_threads:4 -- -- Test insert from detached fiber -- diff --git a/test/box-tap/cfg.test.lua b/test/box-tap/cfg.test.lua index d8715e27..f791cc3f 100755 --- a/test/box-tap/cfg.test.lua +++ b/test/box-tap/cfg.test.lua @@ -6,7 +6,7 @@ local socket = require('socket') local fio = require('fio') local uuid = require('uuid') local msgpack = require('msgpack') -test:plan(103) +test:plan(102) -------------------------------------------------------------------------------- -- Invalid values @@ -45,7 +45,6 @@ invalid('log', ':test:') invalid('vinyl_memory', -1) invalid('vinyl_read_threads', 0) invalid('vinyl_write_threads', 1) -invalid('vinyl_range_size', 0) invalid('vinyl_page_size', 0) invalid('vinyl_run_count_per_level', 0) invalid('vinyl_run_size_ratio', 1) diff --git a/test/box/admin.result b/test/box/admin.result index 0b233889..e6fc1f30 100644 --- a/test/box/admin.result +++ b/test/box/admin.result @@ -98,8 +98,6 @@ cfg_filter(box.cfg) - 134217728 - - vinyl_page_size - 8192 - - - vinyl_range_size - - 1073741824 - - vinyl_read_threads - 1 - - vinyl_run_count_per_level diff --git a/test/box/cfg.result b/test/box/cfg.result index 68465669..7778f82a 100644 --- a/test/box/cfg.result +++ b/test/box/cfg.result @@ -86,8 +86,6 @@ cfg_filter(box.cfg) - 134217728 - - vinyl_page_size - 8192 - - - vinyl_range_size - - 1073741824 - - vinyl_read_threads - 1 - - vinyl_run_count_per_level @@ -187,8 +185,6 @@ cfg_filter(box.cfg) - 134217728 - - vinyl_page_size - 8192 - - - vinyl_range_size - - 1073741824 - - vinyl_read_threads - 1 - - vinyl_run_count_per_level diff --git a/test/vinyl/ddl.result b/test/vinyl/ddl.result index 68bb6b3a..864050b3 100644 --- a/test/vinyl/ddl.result +++ b/test/vinyl/ddl.result @@ -8,10 +8,6 @@ test_run = require('test_run').new() space = box.schema.space.create('test', {engine = 'vinyl' }) --- ... 
-space:create_index('pk', {range_size = 0}) ---- -- error: 'Wrong index options (field 4): range_size must be greater than 0' -... space:create_index('pk', {page_size = 0}) --- - error: 'Wrong index options (field 4): page_size must be greater than 0 and less @@ -586,7 +582,6 @@ box.space.test.index.pk run_count_per_level: 2 run_size_ratio: 3.5 bloom_fpr: 0.05 - range_size: 1073741824 name: pk type: TREE ... diff --git a/test/vinyl/ddl.test.lua b/test/vinyl/ddl.test.lua index 9b870f35..46189828 100644 --- a/test/vinyl/ddl.test.lua +++ b/test/vinyl/ddl.test.lua @@ -3,7 +3,6 @@ test_run = require('test_run').new() -- sanity checks space = box.schema.space.create('test', {engine = 'vinyl' }) -space:create_index('pk', {range_size = 0}) space:create_index('pk', {page_size = 0}) space:create_index('pk', {page_size = 8192, range_size = 4096}) space:create_index('pk', {run_count_per_level = 0}) diff --git a/test/vinyl/misc.result b/test/vinyl/misc.result index 59492f77..5f67271e 100644 --- a/test/vinyl/misc.result +++ b/test/vinyl/misc.result @@ -1,3 +1,6 @@ +test_run = require('test_run').new() +--- +... fiber = require('fiber') --- ... @@ -204,3 +207,78 @@ s:insert{1, 1, 2} -- error s:drop() --- ... +-- +-- gh-3944: automatic range size configuration. +-- +-- Passing range_size explicitly on index creation. +s = box.schema.space.create('test', {engine = 'vinyl'}) +--- +... +i = s:create_index('pk', {range_size = 0}) +--- +... +i.options.range_size -- nil +--- +- null +... +i:stat().range_size -- 16 MB +--- +- 16777216 +... +box.space._index:get{s.id, i.id}[5].range_size -- 0 +--- +- 0 +... +s:drop() +--- +... +-- Inheriting global settings. +test_run:cmd('create server test with script = "vinyl/stat.lua"') +--- +- true +... +test_run:cmd('start server test') +--- +- true +... +test_run:cmd('switch test') +--- +- true +... +box.cfg.vinyl_range_size -- nil +--- +- null +... +s = box.schema.space.create('test', {engine = 'vinyl'}) +--- +... 
+i = s:create_index('pk') +--- +... +i.options.range_size -- nil +--- +- null +... +i:stat().range_size -- 16 MB +--- +- 16777216 +... +box.space._index:get{s.id, i.id}[5].range_size -- nil +--- +- null +... +s:drop() +--- +... +test_run:cmd('switch default') +--- +- true +... +test_run:cmd('stop server test') +--- +- true +... +test_run:cmd('cleanup server test') +--- +- true +... diff --git a/test/vinyl/misc.test.lua b/test/vinyl/misc.test.lua index ba7403ec..1c3a9517 100644 --- a/test/vinyl/misc.test.lua +++ b/test/vinyl/misc.test.lua @@ -1,3 +1,4 @@ +test_run = require('test_run').new() fiber = require('fiber') -- @@ -88,3 +89,28 @@ _ = s:create_index('sk', {unique = true, parts = {2, 'unsigned'}}) s:insert{1, 1, 1} s:insert{1, 1, 2} -- error s:drop() + +-- +-- gh-3944: automatic range size configuration. +-- +-- Passing range_size explicitly on index creation. +s = box.schema.space.create('test', {engine = 'vinyl'}) +i = s:create_index('pk', {range_size = 0}) +i.options.range_size -- nil +i:stat().range_size -- 16 MB +box.space._index:get{s.id, i.id}[5].range_size -- 0 +s:drop() +-- Inheriting global settings. 
+test_run:cmd('create server test with script = "vinyl/stat.lua"') +test_run:cmd('start server test') +test_run:cmd('switch test') +box.cfg.vinyl_range_size -- nil +s = box.schema.space.create('test', {engine = 'vinyl'}) +i = s:create_index('pk') +i.options.range_size -- nil +i:stat().range_size -- 16 MB +box.space._index:get{s.id, i.id}[5].range_size -- nil +s:drop() +test_run:cmd('switch default') +test_run:cmd('stop server test') +test_run:cmd('cleanup server test') diff --git a/test/vinyl/stat.result b/test/vinyl/stat.result index b0b569ab..c1fad931 100644 --- a/test/vinyl/stat.result +++ b/test/vinyl/stat.result @@ -129,15 +129,10 @@ test_run:cmd("setopt delimiter ''"); -- initially stats are empty istat() --- -- dumps_per_compaction: 0 - rows: 0 - run_avg: 0 - bytes: 0 - upsert: +- upsert: squashed: 0 applied: 0 - lookup: 0 - run_count: 0 + bytes: 0 cache: invalidate: rows: 0 @@ -155,10 +150,7 @@ istat() get: rows: 0 bytes: 0 - range_count: 1 - put: - rows: 0 - bytes: 0 + run_histogram: '[0]:1' disk: last_level: bytes_compressed: 0 @@ -216,6 +208,12 @@ istat() pages: 0 bytes_compressed: 0 bytes: 0 + range_size: 16384 + rows: 0 + run_avg: 0 + dumps_per_compaction: 0 + lookup: 0 + range_count: 1 txw: bytes: 0 rows: 0 @@ -224,7 +222,10 @@ istat() get: rows: 0 bytes: 0 - run_histogram: '[0]:1' + run_count: 0 + put: + rows: 0 + bytes: 0 memory: bytes: 0 index_size: 0 @@ -295,7 +296,14 @@ wait(istat, st, 'disk.dump.count', 1) ... stat_diff(istat(), st) --- -- disk: +- put: + rows: 25 + bytes: 26525 + rows: 25 + run_avg: 1 + run_count: 1 + dumps_per_compaction: 1 + disk: last_level: bytes: 26049 pages: 7 @@ -319,14 +327,7 @@ stat_diff(istat(), st) pages: 7 bytes_compressed: bloom_size: 70 - rows: 25 - run_avg: 1 - run_count: 1 - dumps_per_compaction: 1 bytes: 26049 - put: - rows: 25 - bytes: 26525 ... -- put + dump + compaction st = istat() @@ -344,7 +345,12 @@ wait(istat, st, 'disk.compaction.count', 1) ... 
stat_diff(istat(), st) --- -- disk: +- put: + rows: 50 + bytes: 53050 + rows: 25 + bytes: 26042 + disk: last_level: bytes: 26042 pages: 6 @@ -379,11 +385,6 @@ stat_diff(istat(), st) pages: 13 bytes_compressed: rows: 50 - put: - rows: 50 - bytes: 53050 - rows: 25 - bytes: 26042 ... -- point lookup from disk + cache put st = istat() @@ -403,7 +404,6 @@ stat_diff(istat(), st) put: rows: 1 bytes: 1061 - lookup: 1 disk: iterator: read: @@ -415,6 +415,7 @@ stat_diff(istat(), st) get: rows: 1 bytes: 1061 + lookup: 1 memory: iterator: lookup: 1 @@ -654,6 +655,19 @@ stat_diff(istat(), st) put: rows: 51 bytes: 54111 + lookup: 1 + txw: + iterator: + lookup: 1 + get: + rows: 50 + bytes: 53050 + memory: + iterator: + lookup: 1 + get: + rows: 100 + bytes: 106100 disk: iterator: read: @@ -665,19 +679,6 @@ stat_diff(istat(), st) get: rows: 100 bytes: 106100 - txw: - iterator: - lookup: 1 - get: - rows: 50 - bytes: 53050 - memory: - iterator: - lookup: 1 - get: - rows: 100 - bytes: 106100 - lookup: 1 get: rows: 100 bytes: 106100 @@ -1000,15 +1001,10 @@ box.stat.reset() ... istat() --- -- dumps_per_compaction: 1 - rows: 306 - run_avg: 1 +- upsert: + squashed: 0 + applied: 0 bytes: 317731 - upsert: - squashed: 0 - applied: 0 - lookup: 0 - run_count: 2 cache: invalidate: rows: 0 @@ -1026,10 +1022,7 @@ istat() get: rows: 0 bytes: 0 - range_count: 2 - put: - rows: 0 - bytes: 0 + run_histogram: '[1]:2' disk: last_level: bytes_compressed: @@ -1087,6 +1080,12 @@ istat() pages: 25 bytes_compressed: bytes: 104300 + range_size: 16384 + rows: 306 + run_avg: 1 + dumps_per_compaction: 1 + lookup: 0 + range_count: 2 txw: bytes: 0 rows: 0 @@ -1095,7 +1094,10 @@ istat() get: rows: 0 bytes: 0 - run_histogram: '[1]:2' + run_count: 2 + put: + rows: 0 + bytes: 0 memory: bytes: 213431 index_size: 49152 -- 2.11.0