From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH 1/3] vinyl: fix secondary index divergence on update Date: Sat, 25 May 2019 00:53:40 +0300 Message-Id: <8e4175c3f3b857097ccfd264608b046b71635e91.1558733443.git.vdavydov.dev@gmail.com> In-Reply-To: References: In-Reply-To: References: To: tarantool-patches@freelists.org List-ID: If an UPDATE request doesn't touch key parts of a secondary index, we don't need to write it to the index memory level or dump it to disk, as this would only increase IO load. Historically, we use column mask set by the UPDATE operation to skip secondary indexes that are not affected by the operation on commit. However, there's a problem here: the column mask isn't precise - it may have a bit set even if the corresponding column doesn't really get updated, e.g. consider {'+', 2, 0}. Not taking this into account may result in appearance of phantom tuples on disk as the write iterator assumes that statements that have no effect aren't written to secondary indexes (this is needed to apply INSERT+DELETE "annihilation" optimization). We fixed that by clearing column mask bits in vy_tx_set in case we detect that the key isn't changed, for more details see #3607 and commit e72867cb9169 ("vinyl: fix appearance of phantom tuple in secondary index after update"). It was rather an ugly hack, but it worked. However, it turned out that apart from looking hackish this code has a nasty bug that may lead to tuples missing from secondary indexes. Consider the following example: s = box.schema.space.create('test', {engine = 'vinyl'}) s:create_index('pk') s:create_index('sk', {parts = {2, 'unsigned'}}) s:insert{1, 1, 1} box.begin() s:update(1, {{'=', 2, 2}}) s:update(1, {{'=', 3, 2}}) box.commit() The first update operation writes DELETE{1,1} and REPLACE{2,1} to the secondary index write set. The second update replaces REPLACE{2,1} with DELETE{2,1} and then with REPLACE{2,1}. When replacing DELETE{2,1} with REPLACE{2,1} in the write set, we assume that the update doesn't modify secondary index key parts and clear the column mask so as not to commit a pointless request, see vy_tx_set. As a result, we skip the first update too and get key {2,1} missing from the secondary index. Actually, it was a dumb idea to use column mask to skip statements in the first place, as there's a much easier way to filter out statements that have no effect for secondary indexes. The thing is every DELETE statement inserted into a secondary index write set acts as a "single DELETE", i.e. there's exactly one older statement it is supposed to purge. This is, because in contrast to the primary index we don't write DELETE statements blindly - we always look up the tuple overwritten in the primary index first. This means that REPLACE+DELETE for the same key is basically a no-op and can be safely skip. Moreover, DELETE+REPLACE can be treated as no-op, too, because secondary indexes don't store full tuples hence all REPLACE statements for the same key are equivalent. By marking such pair of statements as no-op in vy_tx_set, we guarantee that no-op statements don't make it to secondary index memory or disk levels. Closes #4242 --- src/box/vinyl.c | 7 ++-- src/box/vy_tx.c | 70 ++++++++++++-------------------------- src/box/vy_tx.h | 17 ++++----- test/engine/update.result | 30 ++++++++++++++++ test/engine/update.test.lua | 12 +++++++ test/vinyl/write_iterator.result | 2 +- test/vinyl/write_iterator.test.lua | 2 +- 7 files changed, 74 insertions(+), 66 deletions(-) diff --git a/src/box/vinyl.c b/src/box/vinyl.c index 9372e5f7..9e731d13 100644 --- a/src/box/vinyl.c +++ b/src/box/vinyl.c @@ -1870,7 +1870,7 @@ vy_perform_update(struct vy_env *env, struct vy_tx *tx, struct txn_stmt *stmt, vy_stmt_set_flags(stmt->new_tuple, VY_STMT_UPDATE); - if (vy_tx_set_with_colmask(tx, pk, stmt->new_tuple, column_mask) != 0) + if (vy_tx_set(tx, pk, stmt->new_tuple) != 0) return -1; if (space->index_count == 1) return 0; @@ -1884,10 +1884,9 @@ vy_perform_update(struct vy_env *env, struct vy_tx *tx, struct txn_stmt *stmt, struct vy_lsm *lsm = vy_lsm(space->index[i]); if (vy_is_committed(env, lsm)) continue; - if (vy_tx_set_with_colmask(tx, lsm, delete, column_mask) != 0) + if (vy_tx_set(tx, lsm, delete) != 0) goto error; - if (vy_tx_set_with_colmask(tx, lsm, stmt->new_tuple, - column_mask) != 0) + if (vy_tx_set(tx, lsm, stmt->new_tuple) != 0) goto error; } tuple_unref(delete); diff --git a/src/box/vy_tx.c b/src/box/vy_tx.c index b1dee0fa..aedd0610 100644 --- a/src/box/vy_tx.c +++ b/src/box/vy_tx.c @@ -51,7 +51,6 @@ #include "trigger.h" #include "trivia/util.h" #include "tuple.h" -#include "column_mask.h" #include "vy_cache.h" #include "vy_lsm.h" #include "vy_mem.h" @@ -212,8 +211,7 @@ tx_manager_destroy_read_view(struct tx_manager *xm, } static struct txv * -txv_new(struct vy_tx *tx, struct vy_lsm *lsm, - struct vy_entry entry, uint64_t column_mask) +txv_new(struct vy_tx *tx, struct vy_lsm *lsm, struct vy_entry entry) { struct tx_manager *xm = tx->xm; struct txv *v = mempool_alloc(&xm->txv_mempool); @@ -227,9 +225,9 @@ txv_new(struct vy_tx *tx, struct vy_lsm *lsm, v->entry = entry; tuple_ref(entry.stmt); v->region_stmt = NULL; - v->column_mask = column_mask; v->tx = tx; v->is_first_insert = false; + v->is_nop = false; v->is_overwritten = false; v->overwritten = NULL; xm->write_set_size += tuple_size(entry.stmt); @@ -631,8 +629,7 @@ vy_tx_handle_deferred_delete(struct vy_tx *tx, struct txv *v) struct vy_lsm *lsm = vy_lsm(space->index[i]); struct vy_entry entry; vy_stmt_foreach_entry(entry, delete_stmt, lsm->cmp_def) { - struct txv *delete_txv = txv_new(tx, lsm, entry, - UINT64_MAX); + struct txv *delete_txv = txv_new(tx, lsm, entry); if (delete_txv == NULL) { rc = -1; break; @@ -704,13 +701,7 @@ vy_tx_prepare(struct vy_tx *tx) assert(lsm->space_id == current_space_id); /* Do not save statements that was overwritten by the same tx */ - if (v->is_overwritten) - continue; - - /* Skip statements which don't change this secondary key. */ - if (lsm->index_id > 0 && - key_update_can_be_skipped(lsm->key_def->column_mask, - v->column_mask)) + if (v->is_overwritten || v->is_nop) continue; if (lsm->index_id > 0 && repsert == NULL && delete == NULL) { @@ -1038,8 +1029,7 @@ vy_tx_track_point(struct vy_tx *tx, struct vy_lsm *lsm, struct vy_entry entry) * are multiple entries of a single statement in a single index. */ static int -vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm, - struct vy_entry entry, uint64_t column_mask) +vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm, struct vy_entry entry) { assert(vy_stmt_type(entry.stmt) != 0); /** @@ -1071,7 +1061,7 @@ vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm, } /* Allocate a MVCC container. */ - struct txv *v = txv_new(tx, lsm, entry, column_mask); + struct txv *v = txv_new(tx, lsm, entry); if (applied.stmt != NULL) tuple_unref(applied.stmt); if (v == NULL) @@ -1089,38 +1079,21 @@ vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm, if (old == NULL && vy_stmt_type(entry.stmt) == IPROTO_INSERT) v->is_first_insert = true; - if (old != NULL) { + if (lsm->index_id > 0 && old != NULL && !old->is_nop) { /* - * Inherit the column mask of the overwritten statement - * so as not to skip both statements on commit. + * In a secondary index write set, DELETE statement purges + * exactly one older statement so REPLACE + DELETE is no-op. + * Moreover, DELETE + REPLACE can be treated as no-op, too, + * because secondary indexes don't store full tuples hence + * all REPLACE statements for the same key are equivalent. + * Therefore we can zap DELETE + REPLACE as there must be + * an older REPLACE for the same key stored somewhere in the + * index data. */ - v->column_mask |= old->column_mask; - } - - if (lsm->index_id > 0 && vy_stmt_type(entry.stmt) == IPROTO_REPLACE && - old != NULL && vy_stmt_type(old->entry.stmt) == IPROTO_DELETE) { - /* - * The column mask of an update operation may have a bit - * set even if the corresponding field doesn't actually - * get updated, because a column mask is generated - * only from update operations, before applying them. - * E.g. update(1, {{'+', 2, 0}}) doesn't modify the - * second field, but the column mask will say it does. - * - * To discard DELETE statements in the write iterator - * (see optimization #5), we turn a REPLACE into an - * INSERT in case the REPLACE was generated by an - * update that changed secondary key fields. So we - * can't tolerate inaccuracy in a column mask. - * - * So if the update didn't actually modify secondary - * key fields, i.e. DELETE and REPLACE generated by the - * update have the same key fields, we forcefully clear - * key bits in the column mask to ensure that no REPLACE - * statement will be written for this secondary key. - */ - if (v->column_mask != UINT64_MAX) - v->column_mask &= ~lsm->cmp_def->column_mask; + enum iproto_type type = vy_stmt_type(entry.stmt); + enum iproto_type old_type = vy_stmt_type(old->entry.stmt); + if ((type == IPROTO_DELETE) != (old_type == IPROTO_DELETE)) + v->is_nop = true; } v->overwritten = old; @@ -1133,12 +1106,11 @@ vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm, } int -vy_tx_set_with_colmask(struct vy_tx *tx, struct vy_lsm *lsm, - struct tuple *stmt, uint64_t column_mask) +vy_tx_set(struct vy_tx *tx, struct vy_lsm *lsm, struct tuple *stmt) { struct vy_entry entry; vy_stmt_foreach_entry(entry, stmt, lsm->cmp_def) { - if (vy_tx_set_entry(tx, lsm, entry, column_mask) != 0) + if (vy_tx_set_entry(tx, lsm, entry) != 0) return -1; } return 0; diff --git a/src/box/vy_tx.h b/src/box/vy_tx.h index 2712263b..9ee10755 100644 --- a/src/box/vy_tx.h +++ b/src/box/vy_tx.h @@ -87,8 +87,6 @@ struct txv { struct vy_entry entry; /** Statement allocated on vy_mem->allocator. */ struct tuple *region_stmt; - /** Mask of columns modified by this operation. */ - uint64_t column_mask; /** Next in the transaction log. */ struct stailq_entry next_in_log; /** Member the transaction write set. */ @@ -101,6 +99,11 @@ struct txv { */ bool is_first_insert; /** + * True if the statement has no effect and can be safely + * skipped on commit. + */ + bool is_nop; + /** * True if the txv was overwritten by another txv of * the same transaction. */ @@ -397,20 +400,12 @@ vy_tx_track_point(struct vy_tx *tx, struct vy_lsm *lsm, struct vy_entry entry); * @param tx Transaction. * @param lsm LSM tree the statement is for. * @param stmt Statement. - * @param column_mask Mask of fields modified by the statement. * * @retval 0 Success * @retval -1 Memory allocation error. */ int -vy_tx_set_with_colmask(struct vy_tx *tx, struct vy_lsm *lsm, - struct tuple *stmt, uint64_t column_mask); - -static inline int -vy_tx_set(struct vy_tx *tx, struct vy_lsm *lsm, struct tuple *stmt) -{ - return vy_tx_set_with_colmask(tx, lsm, stmt, UINT64_MAX); -} +vy_tx_set(struct vy_tx *tx, struct vy_lsm *lsm, struct tuple *stmt); /** * Iterator over the write set of a transaction. diff --git a/test/engine/update.result b/test/engine/update.result index b73037eb..69293e46 100644 --- a/test/engine/update.result +++ b/test/engine/update.result @@ -758,3 +758,33 @@ s:upsert({2, 666}, {{'=', 2, 666}}) s:drop() --- ... +-- +-- gh-4242 Tuple is missing from secondary index after update. +-- +s = box.schema.space.create('test', {engine = engine}) +--- +... +pk = s:create_index('pk') +--- +... +sk = s:create_index('sk', {parts = {2, 'unsigned'}}) +--- +... +s:insert{1, 1, 1} +--- +- [1, 1, 1] +... +box.begin() s:update(1, {{'=', 2, 2}}) s:update(1, {{'=', 3, 2}}) box.commit() +--- +... +pk:select() +--- +- - [1, 2, 2] +... +sk:select() +--- +- - [1, 2, 2] +... +s:drop() +--- +... diff --git a/test/engine/update.test.lua b/test/engine/update.test.lua index e2a2265f..51263f57 100644 --- a/test/engine/update.test.lua +++ b/test/engine/update.test.lua @@ -122,3 +122,15 @@ box.space.tst_sample:get(2).VAL -- invalid upsert s:upsert({2, 666}, {{'=', 2, 666}}) s:drop() + +-- +-- gh-4242 Tuple is missing from secondary index after update. +-- +s = box.schema.space.create('test', {engine = engine}) +pk = s:create_index('pk') +sk = s:create_index('sk', {parts = {2, 'unsigned'}}) +s:insert{1, 1, 1} +box.begin() s:update(1, {{'=', 2, 2}}) s:update(1, {{'=', 3, 2}}) box.commit() +pk:select() +sk:select() +s:drop() diff --git a/test/vinyl/write_iterator.result b/test/vinyl/write_iterator.result index 39212572..5c5200b8 100644 --- a/test/vinyl/write_iterator.result +++ b/test/vinyl/write_iterator.result @@ -767,7 +767,7 @@ box.snapshot() - ok ... -- Some padding to trigger minor compaction. -for i = 1001, 1000 + PAD2 do s:replace{i, i} end +for i = 1001, 1000 + PAD2 do s:delete{i} end --- ... box.snapshot() diff --git a/test/vinyl/write_iterator.test.lua b/test/vinyl/write_iterator.test.lua index d7871ffc..fa44961a 100644 --- a/test/vinyl/write_iterator.test.lua +++ b/test/vinyl/write_iterator.test.lua @@ -328,7 +328,7 @@ PAD2 = 15 for i = 1001, 1000 + PAD1 do s:replace{i, i} end box.snapshot() -- Some padding to trigger minor compaction. -for i = 1001, 1000 + PAD2 do s:replace{i, i} end +for i = 1001, 1000 + PAD2 do s:delete{i} end box.snapshot() -- Generate some INSERT statements and dump them to disk. _ = s:insert{1, 1} -- insert -- 2.11.0