From: Vladimir Davydov <vdavydov.dev@gmail.com> To: kostja@tarantool.org Cc: tarantool-patches@freelists.org Subject: [PATCH] vinyl: account statements skipped on read Date: Fri, 12 Apr 2019 11:34:14 +0300 [thread overview] Message-ID: <b24bab12e22f3417f6b9f1c3e113b7972c401829.1555057966.git.vdavydov.dev@gmail.com> (raw) After we retrieve a statement from a secondary index, we always do a lookup in the primary index to get the full tuple corresponding to the found secondary key. It may turn out that the full tuple doesn't match the secondary key, which means the key was overwritten, but the DELETE statement hasn't been propagated yet (aka deferred DELETE). Currently, there's no way to figure out how often this happens as all tuples read from an LSM tree are accounted under 'get' counter. So this patch splits 'get' in two: 'get', which now accounts only tuples actually returned to the user, and 'skip', which accounts skipped tuples. --- https://github.com/tarantool/tarantool/commits/dv/vy-account-skipped-statements src/box/vinyl.c | 17 ++++++- src/box/vy_point_lookup.c | 5 -- src/box/vy_read_iterator.c | 7 --- src/box/vy_stat.h | 2 + test/vinyl/deferred_delete.result | 88 +++++++++++++++++++++++++++++++---- test/vinyl/deferred_delete.test.lua | 26 +++++++++-- test/vinyl/errinj.result | 6 +-- test/vinyl/errinj.test.lua | 6 +-- test/vinyl/errinj_ddl.result | 8 ++-- test/vinyl/errinj_ddl.test.lua | 8 ++-- test/vinyl/stat.result | 92 ++++++++++++++++++++----------------- 11 files changed, 183 insertions(+), 82 deletions(-) diff --git a/src/box/vinyl.c b/src/box/vinyl.c index b9bf721a..2e1e717e 100644 --- a/src/box/vinyl.c +++ b/src/box/vinyl.c @@ -382,6 +382,7 @@ vinyl_index_stat(struct index *index, struct info_handler *h) info_append_int(h, "lookup", stat->lookup); vy_info_append_stmt_counter(h, "get", &stat->get); + vy_info_append_stmt_counter(h, "skip", &stat->skip); vy_info_append_stmt_counter(h, "put", &stat->put); info_table_begin(h, "latency"); @@ -484,6 +485,7 @@ vinyl_index_reset_stat(struct index *index) stat->lookup = 0; latency_reset(&stat->latency); vy_stmt_counter_reset(&stat->get); + vy_stmt_counter_reset(&stat->skip); vy_stmt_counter_reset(&stat->put); memset(&stat->upsert, 0, sizeof(stat->upsert)); @@ -1344,6 +1346,8 @@ vy_get_by_secondary_tuple(struct vy_lsm *lsm, struct vy_tx *tx, } key.hint = vy_stmt_hint(key.stmt, lsm->pk->cmp_def); + lsm->pk->stat.lookup++; + if (vy_point_lookup(lsm->pk, tx, rv, key, result) != 0) { rc = -1; goto out; @@ -1364,7 +1368,10 @@ vy_get_by_secondary_tuple(struct vy_lsm *lsm, struct vy_tx *tx, * propagated to the secondary index yet. In this * case silently skip this tuple. */ + vy_stmt_counter_acct_tuple(&lsm->stat.skip, entry.stmt); if (result->stmt != NULL) { + vy_stmt_counter_acct_tuple(&lsm->pk->stat.skip, + result->stmt); tuple_unref(result->stmt); *result = vy_entry_none(); } @@ -1399,6 +1406,8 @@ vy_get_by_secondary_tuple(struct vy_lsm *lsm, struct vy_tx *tx, /* Inherit the hint from the secondary index entry. */ result->hint = entry.hint; + + vy_stmt_counter_acct_tuple(&lsm->pk->stat.get, result->stmt); out: tuple_unref(key.stmt); return rc; @@ -1435,6 +1444,8 @@ vy_get(struct vy_lsm *lsm, struct vy_tx *tx, key.stmt = key_stmt; key.hint = vy_stmt_hint(key.stmt, lsm->cmp_def); + lsm->stat.lookup++; + if (vy_stmt_is_full_key(key.stmt, lsm->cmp_def)) { /* * Use point lookup for a full key. @@ -1489,6 +1500,8 @@ out: vy_lsm_name(lsm), tuple_str(key.stmt), tuple_str(*result), latency); } + if (*result != NULL) + vy_stmt_counter_acct_tuple(&lsm->stat.get, *result); return 0; } @@ -3797,7 +3810,8 @@ vinyl_iterator_account_read(struct vinyl_iterator *it, double start_time, iterator_type_strs[type], tuple_str(result), latency); } - + if (result != NULL) + vy_stmt_counter_acct_tuple(&lsm->stat.get, result); } static int @@ -3947,6 +3961,7 @@ vinyl_index_create_iterator(struct index *base, enum iterator_type type, } it->tx = tx; + lsm->stat.lookup++; vy_read_iterator_open(&it->iterator, lsm, tx, type, it->key, (const struct vy_read_view **)&tx->read_view); return (struct iterator *)it; diff --git a/src/box/vy_point_lookup.c b/src/box/vy_point_lookup.c index becc0486..ecdcde7d 100644 --- a/src/box/vy_point_lookup.c +++ b/src/box/vy_point_lookup.c @@ -203,8 +203,6 @@ vy_point_lookup(struct vy_lsm *lsm, struct vy_tx *tx, *ret = vy_entry_none(); int rc = 0; - lsm->stat.lookup++; - /* History list */ struct vy_history history, mem_history, disk_history; vy_history_create(&history, &lsm->env->history_node_pool); @@ -294,9 +292,6 @@ done: if (rc != 0) return -1; - if (ret->stmt != NULL) - vy_stmt_counter_acct_tuple(&lsm->stat.get, ret->stmt); - return 0; } diff --git a/src/box/vy_read_iterator.c b/src/box/vy_read_iterator.c index b26750e9..9ea07709 100644 --- a/src/box/vy_read_iterator.c +++ b/src/box/vy_read_iterator.c @@ -857,11 +857,7 @@ vy_read_iterator_next(struct vy_read_iterator *itr, struct vy_entry *result) { assert(itr->tx == NULL || itr->tx->state == VINYL_TX_READY); - struct vy_lsm *lsm = itr->lsm; struct vy_entry entry; - - if (itr->last.stmt == NULL) - lsm->stat.lookup++; /* first iteration */ next_key: if (vy_read_iterator_advance(itr) != 0) return -1; @@ -893,9 +889,6 @@ next_key: vy_stmt_type(entry.stmt) == IPROTO_INSERT || vy_stmt_type(entry.stmt) == IPROTO_REPLACE); - if (entry.stmt != NULL) - vy_stmt_counter_acct_tuple(&lsm->stat.get, entry.stmt); - *result = entry; return 0; } diff --git a/src/box/vy_stat.h b/src/box/vy_stat.h index b7f50d93..a3021a0c 100644 --- a/src/box/vy_stat.h +++ b/src/box/vy_stat.h @@ -117,6 +117,8 @@ struct vy_lsm_stat { int64_t lookup; /** Number of statements read from this LSM tree. */ struct vy_stmt_counter get; + /** Number of statements skipped on read. */ + struct vy_stmt_counter skip; /** Number of statements written to this LSM tree. */ struct vy_stmt_counter put; /** Read latency. */ diff --git a/test/vinyl/deferred_delete.result b/test/vinyl/deferred_delete.result index 0a380ed5..422f2fe0 100644 --- a/test/vinyl/deferred_delete.result +++ b/test/vinyl/deferred_delete.result @@ -71,14 +71,26 @@ i1:select() - [8, 80, 800] - [10, 100, 1000] ... -i1:stat().get.rows -- 15 +i1:stat().get.rows -- 5 --- -- 15 +- 5 +... +i1:stat().skip.rows -- 10 +--- +- 10 ... pk:stat().lookup -- 15 --- - 15 ... +pk:stat().get.rows -- 5 +--- +- 5 +... +pk:stat().skip.rows -- 5 +--- +- 5 +... i2:select() --- - - [2, 20, 200] @@ -87,14 +99,26 @@ i2:select() - [8, 80, 800] - [10, 100, 1000] ... -i2:stat().get.rows -- 15 +i2:stat().get.rows -- 5 --- -- 15 +- 5 +... +i2:stat().skip.rows -- 10 +--- +- 10 ... pk:stat().lookup -- 30 --- - 30 ... +pk:stat().get.rows -- 10 +--- +- 10 +... +pk:stat().skip.rows -- 10 +--- +- 10 +... -- Overwritten/deleted tuples are not stored in the cache so calling -- SELECT for a second time does only 5 lookups. box.stat.reset() @@ -112,10 +136,22 @@ i1:stat().get.rows -- 5 --- - 5 ... +i1:stat().skip.rows -- 0 +--- +- 0 +... pk:stat().lookup -- 5 --- - 5 ... +pk:stat().get.rows -- 5 +--- +- 5 +... +pk:stat().skip.rows -- 0 +--- +- 0 +... i2:select() --- - - [2, 20, 200] @@ -128,10 +164,22 @@ i2:stat().get.rows -- 5 --- - 5 ... +i2:stat().skip.rows -- 0 +--- +- 0 +... pk:stat().lookup -- 10 --- - 10 ... +pk:stat().get.rows -- 10 +--- +- 10 +... +pk:stat().skip.rows -- 0 +--- +- 0 +... -- Cleanup the cache. vinyl_cache = box.cfg.vinyl_cache --- @@ -179,14 +227,26 @@ i1:select() - [8, 80, 800] - [10, 100, 1000] ... -i1:stat().get.rows -- 15 +i1:stat().get.rows -- 5 --- -- 15 +- 5 +... +i1:stat().skip.rows -- 10 +--- +- 10 ... pk:stat().lookup -- 15 --- - 15 ... +pk:stat().get.rows -- 5 +--- +- 5 +... +pk:stat().skip.rows -- 5 +--- +- 5 +... i2:select() --- - - [2, 20, 200] @@ -195,14 +255,26 @@ i2:select() - [8, 80, 800] - [10, 100, 1000] ... -i2:stat().get.rows -- 15 +i2:stat().get.rows -- 5 --- -- 15 +- 5 +... +i2:stat().skip.rows -- 10 +--- +- 10 ... pk:stat().lookup -- 30 --- - 30 ... +pk:stat().get.rows -- 10 +--- +- 10 +... +pk:stat().skip.rows -- 10 +--- +- 10 +... -- Check that deferred DELETEs are not lost after restart. test_run:cmd("restart server default") fiber = require('fiber') diff --git a/test/vinyl/deferred_delete.test.lua b/test/vinyl/deferred_delete.test.lua index 240429fc..6dfcbde5 100644 --- a/test/vinyl/deferred_delete.test.lua +++ b/test/vinyl/deferred_delete.test.lua @@ -33,21 +33,33 @@ i2:stat().rows -- ditto -- overwritten tuples in the primary index hence 15 lookups per SELECT -- in a secondary index. i1:select() -i1:stat().get.rows -- 15 +i1:stat().get.rows -- 5 +i1:stat().skip.rows -- 10 pk:stat().lookup -- 15 +pk:stat().get.rows -- 5 +pk:stat().skip.rows -- 5 i2:select() -i2:stat().get.rows -- 15 +i2:stat().get.rows -- 5 +i2:stat().skip.rows -- 10 pk:stat().lookup -- 30 +pk:stat().get.rows -- 10 +pk:stat().skip.rows -- 10 -- Overwritten/deleted tuples are not stored in the cache so calling -- SELECT for a second time does only 5 lookups. box.stat.reset() i1:select() i1:stat().get.rows -- 5 +i1:stat().skip.rows -- 0 pk:stat().lookup -- 5 +pk:stat().get.rows -- 5 +pk:stat().skip.rows -- 0 i2:select() i2:stat().get.rows -- 5 +i2:stat().skip.rows -- 0 pk:stat().lookup -- 10 +pk:stat().get.rows -- 10 +pk:stat().skip.rows -- 0 -- Cleanup the cache. vinyl_cache = box.cfg.vinyl_cache @@ -67,11 +79,17 @@ i2:stat().rows -- ditto -- the number of lookups. box.stat.reset() i1:select() -i1:stat().get.rows -- 15 +i1:stat().get.rows -- 5 +i1:stat().skip.rows -- 10 pk:stat().lookup -- 15 +pk:stat().get.rows -- 5 +pk:stat().skip.rows -- 5 i2:select() -i2:stat().get.rows -- 15 +i2:stat().get.rows -- 5 +i2:stat().skip.rows -- 10 pk:stat().lookup -- 30 +pk:stat().get.rows -- 10 +pk:stat().skip.rows -- 10 -- Check that deferred DELETEs are not lost after restart. test_run:cmd("restart server default") diff --git a/test/vinyl/errinj.result b/test/vinyl/errinj.result index f6d37769..6fc4d9e5 100644 --- a/test/vinyl/errinj.result +++ b/test/vinyl/errinj.result @@ -825,15 +825,15 @@ errinj.set("ERRINJ_VY_DELAY_PK_LOOKUP", true) _ = fiber.create(do_read) --- ... -test_run:wait_cond(function() return sk:stat().get.rows > 0 end, 60) +test_run:wait_cond(function() return sk:stat().disk.iterator.get.rows > 0 end, 60) --- - true ... -pk:stat().get.rows -- 0 +pk:stat().disk.iterator.get.rows -- 0 --- - 0 ... -sk:stat().get.rows -- 1 +sk:stat().disk.iterator.get.rows -- 1 --- - 1 ... diff --git a/test/vinyl/errinj.test.lua b/test/vinyl/errinj.test.lua index 4820ef97..b500b33c 100644 --- a/test/vinyl/errinj.test.lua +++ b/test/vinyl/errinj.test.lua @@ -297,9 +297,9 @@ ret = nil function do_read() ret = sk:select({2}, {iterator = 'GE'}) end errinj.set("ERRINJ_VY_DELAY_PK_LOOKUP", true) _ = fiber.create(do_read) -test_run:wait_cond(function() return sk:stat().get.rows > 0 end, 60) -pk:stat().get.rows -- 0 -sk:stat().get.rows -- 1 +test_run:wait_cond(function() return sk:stat().disk.iterator.get.rows > 0 end, 60) +pk:stat().disk.iterator.get.rows -- 0 +sk:stat().disk.iterator.get.rows -- 1 s:replace{2, 2} errinj.set("ERRINJ_VY_DELAY_PK_LOOKUP", false) test_run:wait_cond(function() return pk:stat().get.rows > 0 end, 60) diff --git a/test/vinyl/errinj_ddl.result b/test/vinyl/errinj_ddl.result index af5f73d4..45e116a1 100644 --- a/test/vinyl/errinj_ddl.result +++ b/test/vinyl/errinj_ddl.result @@ -454,10 +454,10 @@ test_run:cmd("setopt delimiter ''"); - true ... -- Wait until DDL starts scanning the altered space. -lookup = i1:stat().lookup +lookup = i1:stat().disk.iterator.lookup --- ... -wait_cond = function() return i1:stat().lookup > lookup end +wait_cond = function() return i1:stat().disk.iterator.lookup > lookup end --- ... c1 = async_replace(s1, {1}, wait_cond) @@ -501,10 +501,10 @@ s1:format{} --- ... -- Wait until DDL starts scanning the altered space. -lookup = i1:stat().lookup +lookup = i1:stat().disk.iterator.lookup --- ... -wait_cond = function() return i1:stat().lookup > lookup end +wait_cond = function() return i1:stat().disk.iterator.lookup > lookup end --- ... c1 = async_replace(s1, {2}, wait_cond) diff --git a/test/vinyl/errinj_ddl.test.lua b/test/vinyl/errinj_ddl.test.lua index 57af9c65..5870a0b0 100644 --- a/test/vinyl/errinj_ddl.test.lua +++ b/test/vinyl/errinj_ddl.test.lua @@ -204,8 +204,8 @@ end; test_run:cmd("setopt delimiter ''"); -- Wait until DDL starts scanning the altered space. -lookup = i1:stat().lookup -wait_cond = function() return i1:stat().lookup > lookup end +lookup = i1:stat().disk.iterator.lookup +wait_cond = function() return i1:stat().disk.iterator.lookup > lookup end c1 = async_replace(s1, {1}, wait_cond) c2 = async_replace(s2, {1}, wait_cond) @@ -222,8 +222,8 @@ s1:format() s1:format{} -- Wait until DDL starts scanning the altered space. -lookup = i1:stat().lookup -wait_cond = function() return i1:stat().lookup > lookup end +lookup = i1:stat().disk.iterator.lookup +wait_cond = function() return i1:stat().disk.iterator.lookup > lookup end c1 = async_replace(s1, {2}, wait_cond) c2 = async_replace(s2, {2}, wait_cond) diff --git a/test/vinyl/stat.result b/test/vinyl/stat.result index f49394f7..aa47a7be 100644 --- a/test/vinyl/stat.result +++ b/test/vinyl/stat.result @@ -129,10 +129,8 @@ test_run:cmd("setopt delimiter ''"); -- initially stats are empty istat() --- -- upsert: - squashed: 0 - applied: 0 - bytes: 0 +- range_count: 1 + lookup: 0 cache: invalidate: rows: 0 @@ -212,20 +210,25 @@ istat() rows: 0 run_avg: 0 dumps_per_compaction: 0 - lookup: 0 - range_count: 1 - txw: + upsert: + squashed: 0 + applied: 0 + bytes: 0 + put: + rows: 0 bytes: 0 + skip: rows: 0 - iterator: - lookup: 0 - get: - rows: 0 - bytes: 0 + bytes: 0 run_count: 0 - put: - rows: 0 + txw: bytes: 0 + rows: 0 + iterator: + lookup: 0 + get: + rows: 0 + bytes: 0 memory: bytes: 0 index_size: 0 @@ -348,7 +351,6 @@ stat_diff(istat(), st) - put: rows: 50 bytes: 53050 - rows: 25 bytes: 26042 disk: last_level: @@ -385,6 +387,7 @@ stat_diff(istat(), st) pages: 13 bytes_compressed: <bytes_compressed> rows: 50 + rows: 25 ... -- point lookup from disk + cache put st = istat() @@ -404,6 +407,7 @@ stat_diff(istat(), st) put: rows: 1 bytes: 1061 + lookup: 1 disk: iterator: read: @@ -415,7 +419,6 @@ stat_diff(istat(), st) get: rows: 1 bytes: 1061 - lookup: 1 memory: iterator: lookup: 1 @@ -655,19 +658,6 @@ stat_diff(istat(), st) put: rows: 51 bytes: 54111 - lookup: 1 - txw: - iterator: - lookup: 1 - get: - rows: 50 - bytes: 53050 - memory: - iterator: - lookup: 1 - get: - rows: 100 - bytes: 106100 disk: iterator: read: @@ -679,6 +669,19 @@ stat_diff(istat(), st) get: rows: 100 bytes: 106100 + txw: + iterator: + lookup: 1 + get: + rows: 50 + bytes: 53050 + memory: + iterator: + lookup: 1 + get: + rows: 100 + bytes: 106100 + lookup: 1 get: rows: 100 bytes: 106100 @@ -1001,10 +1004,8 @@ box.stat.reset() ... istat() --- -- upsert: - squashed: 0 - applied: 0 - bytes: 317731 +- range_count: 2 + lookup: 0 cache: invalidate: rows: 0 @@ -1084,20 +1085,25 @@ istat() rows: 306 run_avg: 1 dumps_per_compaction: 1 - lookup: 0 - range_count: 2 - txw: + upsert: + squashed: 0 + applied: 0 + bytes: 317731 + put: + rows: 0 bytes: 0 + skip: rows: 0 - iterator: - lookup: 0 - get: - rows: 0 - bytes: 0 + bytes: 0 run_count: 2 - put: - rows: 0 + txw: bytes: 0 + rows: 0 + iterator: + lookup: 0 + get: + rows: 0 + bytes: 0 memory: bytes: 213431 index_size: 49152 -- 2.11.0
next reply other threads:[~2019-04-12 8:34 UTC|newest] Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top 2019-04-12 8:34 Vladimir Davydov [this message] 2019-04-12 9:53 ` Vladimir Davydov
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=b24bab12e22f3417f6b9f1c3e113b7972c401829.1555057966.git.vdavydov.dev@gmail.com \ --to=vdavydov.dev@gmail.com \ --cc=kostja@tarantool.org \ --cc=tarantool-patches@freelists.org \ --subject='Re: [PATCH] vinyl: account statements skipped on read' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox