[PATCH 1/3] vinyl: fix secondary index divergence on update

Vladimir Davydov vdavydov.dev at gmail.com
Sat May 25 00:53:40 MSK 2019


If an UPDATE request doesn't touch key parts of a secondary index, we
don't need to write it to the index memory level or dump it to disk, as
this would only increase IO load. Historically, we use column mask set
by the UPDATE operation to skip secondary indexes that are not affected
by the operation on commit. However, there's a problem here: the column
mask isn't precise - it may have a bit set even if the corresponding
column doesn't really get updated, e.g. consider {'+', 2, 0}. Not taking
this into account may result in appearance of phantom tuples on disk as
the write iterator assumes that statements that have no effect aren't
written to secondary indexes (this is needed to apply INSERT+DELETE
"annihilation" optimization). We fixed that by clearing column mask bits
in vy_tx_set in case we detect that the key isn't changed, for more
details see #3607 and commit e72867cb9169 ("vinyl: fix appearance of
phantom tuple in secondary index after update"). It was rather an ugly
hack, but it worked.

However, it turned out that apart from looking hackish this code has
a nasty bug that may lead to tuples missing from secondary indexes.
Consider the following example:

  s = box.schema.space.create('test', {engine = 'vinyl'})
  s:create_index('pk')
  s:create_index('sk', {parts = {2, 'unsigned'}})
  s:insert{1, 1, 1}

  box.begin()
  s:update(1, {{'=', 2, 2}})
  s:update(1, {{'=', 3, 2}})
  box.commit()

The first update operation writes DELETE{1,1} and REPLACE{2,1} to the
secondary index write set. The second update replaces REPLACE{2,1} with
DELETE{2,1} and then with REPLACE{2,1}. When replacing DELETE{2,1} with
REPLACE{2,1} in the write set, we assume that the update doesn't modify
secondary index key parts and clear the column mask so as not to commit
a pointless request, see vy_tx_set. As a result, we skip the first
update too and get key {2,1} missing from the secondary index.

Actually, it was a dumb idea to use column mask to skip statements in
the first place, as there's a much easier way to filter out statements
that have no effect for secondary indexes. The thing is every DELETE
statement inserted into a secondary index write set acts as a "single
DELETE", i.e. there's exactly one older statement it is supposed to
purge. This is, because in contrast to the primary index we don't write
DELETE statements blindly - we always look up the tuple overwritten in
the primary index first. This means that REPLACE+DELETE for the same key
is basically a no-op and can be safely skip. Moreover, DELETE+REPLACE
can be treated as no-op, too, because secondary indexes don't store full
tuples hence all REPLACE statements for the same key are equivalent.
By marking such pair of statements as no-op in vy_tx_set, we guarantee
that no-op statements don't make it to secondary index memory or disk
levels.

Closes #4242
---
 src/box/vinyl.c                    |  7 ++--
 src/box/vy_tx.c                    | 70 ++++++++++++--------------------------
 src/box/vy_tx.h                    | 17 ++++-----
 test/engine/update.result          | 30 ++++++++++++++++
 test/engine/update.test.lua        | 12 +++++++
 test/vinyl/write_iterator.result   |  2 +-
 test/vinyl/write_iterator.test.lua |  2 +-
 7 files changed, 74 insertions(+), 66 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 9372e5f7..9e731d13 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -1870,7 +1870,7 @@ vy_perform_update(struct vy_env *env, struct vy_tx *tx, struct txn_stmt *stmt,
 
 	vy_stmt_set_flags(stmt->new_tuple, VY_STMT_UPDATE);
 
-	if (vy_tx_set_with_colmask(tx, pk, stmt->new_tuple, column_mask) != 0)
+	if (vy_tx_set(tx, pk, stmt->new_tuple) != 0)
 		return -1;
 	if (space->index_count == 1)
 		return 0;
@@ -1884,10 +1884,9 @@ vy_perform_update(struct vy_env *env, struct vy_tx *tx, struct txn_stmt *stmt,
 		struct vy_lsm *lsm = vy_lsm(space->index[i]);
 		if (vy_is_committed(env, lsm))
 			continue;
-		if (vy_tx_set_with_colmask(tx, lsm, delete, column_mask) != 0)
+		if (vy_tx_set(tx, lsm, delete) != 0)
 			goto error;
-		if (vy_tx_set_with_colmask(tx, lsm, stmt->new_tuple,
-					column_mask) != 0)
+		if (vy_tx_set(tx, lsm, stmt->new_tuple) != 0)
 			goto error;
 	}
 	tuple_unref(delete);
diff --git a/src/box/vy_tx.c b/src/box/vy_tx.c
index b1dee0fa..aedd0610 100644
--- a/src/box/vy_tx.c
+++ b/src/box/vy_tx.c
@@ -51,7 +51,6 @@
 #include "trigger.h"
 #include "trivia/util.h"
 #include "tuple.h"
-#include "column_mask.h"
 #include "vy_cache.h"
 #include "vy_lsm.h"
 #include "vy_mem.h"
@@ -212,8 +211,7 @@ tx_manager_destroy_read_view(struct tx_manager *xm,
 }
 
 static struct txv *
-txv_new(struct vy_tx *tx, struct vy_lsm *lsm,
-	struct vy_entry entry, uint64_t column_mask)
+txv_new(struct vy_tx *tx, struct vy_lsm *lsm, struct vy_entry entry)
 {
 	struct tx_manager *xm = tx->xm;
 	struct txv *v = mempool_alloc(&xm->txv_mempool);
@@ -227,9 +225,9 @@ txv_new(struct vy_tx *tx, struct vy_lsm *lsm,
 	v->entry = entry;
 	tuple_ref(entry.stmt);
 	v->region_stmt = NULL;
-	v->column_mask = column_mask;
 	v->tx = tx;
 	v->is_first_insert = false;
+	v->is_nop = false;
 	v->is_overwritten = false;
 	v->overwritten = NULL;
 	xm->write_set_size += tuple_size(entry.stmt);
@@ -631,8 +629,7 @@ vy_tx_handle_deferred_delete(struct vy_tx *tx, struct txv *v)
 		struct vy_lsm *lsm = vy_lsm(space->index[i]);
 		struct vy_entry entry;
 		vy_stmt_foreach_entry(entry, delete_stmt, lsm->cmp_def) {
-			struct txv *delete_txv = txv_new(tx, lsm, entry,
-							 UINT64_MAX);
+			struct txv *delete_txv = txv_new(tx, lsm, entry);
 			if (delete_txv == NULL) {
 				rc = -1;
 				break;
@@ -704,13 +701,7 @@ vy_tx_prepare(struct vy_tx *tx)
 		assert(lsm->space_id == current_space_id);
 
 		/* Do not save statements that was overwritten by the same tx */
-		if (v->is_overwritten)
-			continue;
-
-		/* Skip statements which don't change this secondary key. */
-		if (lsm->index_id > 0 &&
-		    key_update_can_be_skipped(lsm->key_def->column_mask,
-					      v->column_mask))
+		if (v->is_overwritten || v->is_nop)
 			continue;
 
 		if (lsm->index_id > 0 && repsert == NULL && delete == NULL) {
@@ -1038,8 +1029,7 @@ vy_tx_track_point(struct vy_tx *tx, struct vy_lsm *lsm, struct vy_entry entry)
  * are multiple entries of a single statement in a single index.
  */
 static int
-vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm,
-		struct vy_entry entry, uint64_t column_mask)
+vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm, struct vy_entry entry)
 {
 	assert(vy_stmt_type(entry.stmt) != 0);
 	/**
@@ -1071,7 +1061,7 @@ vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm,
 	}
 
 	/* Allocate a MVCC container. */
-	struct txv *v = txv_new(tx, lsm, entry, column_mask);
+	struct txv *v = txv_new(tx, lsm, entry);
 	if (applied.stmt != NULL)
 		tuple_unref(applied.stmt);
 	if (v == NULL)
@@ -1089,38 +1079,21 @@ vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm,
 	if (old == NULL && vy_stmt_type(entry.stmt) == IPROTO_INSERT)
 		v->is_first_insert = true;
 
-	if (old != NULL) {
+	if (lsm->index_id > 0 && old != NULL && !old->is_nop) {
 		/*
-		 * Inherit the column mask of the overwritten statement
-		 * so as not to skip both statements on commit.
+		 * In a secondary index write set, DELETE statement purges
+		 * exactly one older statement so REPLACE + DELETE is no-op.
+		 * Moreover, DELETE + REPLACE can be treated as no-op, too,
+		 * because secondary indexes don't store full tuples hence
+		 * all REPLACE statements for the same key are equivalent.
+		 * Therefore we can zap DELETE + REPLACE as there must be
+		 * an older REPLACE for the same key stored somewhere in the
+		 * index data.
 		 */
-		v->column_mask |= old->column_mask;
-	}
-
-	if (lsm->index_id > 0 && vy_stmt_type(entry.stmt) == IPROTO_REPLACE &&
-	    old != NULL && vy_stmt_type(old->entry.stmt) == IPROTO_DELETE) {
-		/*
-		 * The column mask of an update operation may have a bit
-		 * set even if the corresponding field doesn't actually
-		 * get updated, because a column mask is generated
-		 * only from update operations, before applying them.
-		 * E.g. update(1, {{'+', 2, 0}}) doesn't modify the
-		 * second field, but the column mask will say it does.
-		 *
-		 * To discard DELETE statements in the write iterator
-		 * (see optimization #5), we turn a REPLACE into an
-		 * INSERT in case the REPLACE was generated by an
-		 * update that changed secondary key fields. So we
-		 * can't tolerate inaccuracy in a column mask.
-		 *
-		 * So if the update didn't actually modify secondary
-		 * key fields, i.e. DELETE and REPLACE generated by the
-		 * update have the same key fields, we forcefully clear
-		 * key bits in the column mask to ensure that no REPLACE
-		 * statement will be written for this secondary key.
-		 */
-		if (v->column_mask != UINT64_MAX)
-			v->column_mask &= ~lsm->cmp_def->column_mask;
+		enum iproto_type type = vy_stmt_type(entry.stmt);
+		enum iproto_type old_type = vy_stmt_type(old->entry.stmt);
+		if ((type == IPROTO_DELETE) != (old_type == IPROTO_DELETE))
+			v->is_nop = true;
 	}
 
 	v->overwritten = old;
@@ -1133,12 +1106,11 @@ vy_tx_set_entry(struct vy_tx *tx, struct vy_lsm *lsm,
 }
 
 int
-vy_tx_set_with_colmask(struct vy_tx *tx, struct vy_lsm *lsm,
-		       struct tuple *stmt, uint64_t column_mask)
+vy_tx_set(struct vy_tx *tx, struct vy_lsm *lsm, struct tuple *stmt)
 {
 	struct vy_entry entry;
 	vy_stmt_foreach_entry(entry, stmt, lsm->cmp_def) {
-		if (vy_tx_set_entry(tx, lsm, entry, column_mask) != 0)
+		if (vy_tx_set_entry(tx, lsm, entry) != 0)
 			return -1;
 	}
 	return 0;
diff --git a/src/box/vy_tx.h b/src/box/vy_tx.h
index 2712263b..9ee10755 100644
--- a/src/box/vy_tx.h
+++ b/src/box/vy_tx.h
@@ -87,8 +87,6 @@ struct txv {
 	struct vy_entry entry;
 	/** Statement allocated on vy_mem->allocator. */
 	struct tuple *region_stmt;
-	/** Mask of columns modified by this operation. */
-	uint64_t column_mask;
 	/** Next in the transaction log. */
 	struct stailq_entry next_in_log;
 	/** Member the transaction write set. */
@@ -101,6 +99,11 @@ struct txv {
 	 */
 	bool is_first_insert;
 	/**
+	 * True if the statement has no effect and can be safely
+	 * skipped on commit.
+	 */
+	bool is_nop;
+	/**
 	 * True if the txv was overwritten by another txv of
 	 * the same transaction.
 	 */
@@ -397,20 +400,12 @@ vy_tx_track_point(struct vy_tx *tx, struct vy_lsm *lsm, struct vy_entry entry);
  * @param tx           Transaction.
  * @param lsm          LSM tree the statement is for.
  * @param stmt         Statement.
- * @param column_mask  Mask of fields modified by the statement.
  *
  * @retval  0 Success
  * @retval -1 Memory allocation error.
  */
 int
-vy_tx_set_with_colmask(struct vy_tx *tx, struct vy_lsm *lsm,
-		       struct tuple *stmt, uint64_t column_mask);
-
-static inline int
-vy_tx_set(struct vy_tx *tx, struct vy_lsm *lsm, struct tuple *stmt)
-{
-	return vy_tx_set_with_colmask(tx, lsm, stmt, UINT64_MAX);
-}
+vy_tx_set(struct vy_tx *tx, struct vy_lsm *lsm, struct tuple *stmt);
 
 /**
  * Iterator over the write set of a transaction.
diff --git a/test/engine/update.result b/test/engine/update.result
index b73037eb..69293e46 100644
--- a/test/engine/update.result
+++ b/test/engine/update.result
@@ -758,3 +758,33 @@ s:upsert({2, 666}, {{'=', 2, 666}})
 s:drop()
 ---
 ...
+--
+-- gh-4242 Tuple is missing from secondary index after update.
+--
+s = box.schema.space.create('test', {engine = engine})
+---
+...
+pk = s:create_index('pk')
+---
+...
+sk = s:create_index('sk', {parts = {2, 'unsigned'}})
+---
+...
+s:insert{1, 1, 1}
+---
+- [1, 1, 1]
+...
+box.begin() s:update(1, {{'=', 2, 2}}) s:update(1, {{'=', 3, 2}}) box.commit()
+---
+...
+pk:select()
+---
+- - [1, 2, 2]
+...
+sk:select()
+---
+- - [1, 2, 2]
+...
+s:drop()
+---
+...
diff --git a/test/engine/update.test.lua b/test/engine/update.test.lua
index e2a2265f..51263f57 100644
--- a/test/engine/update.test.lua
+++ b/test/engine/update.test.lua
@@ -122,3 +122,15 @@ box.space.tst_sample:get(2).VAL
 -- invalid upsert
 s:upsert({2, 666}, {{'=', 2, 666}})
 s:drop()
+
+--
+-- gh-4242 Tuple is missing from secondary index after update.
+--
+s = box.schema.space.create('test', {engine = engine})
+pk = s:create_index('pk')
+sk = s:create_index('sk', {parts = {2, 'unsigned'}})
+s:insert{1, 1, 1}
+box.begin() s:update(1, {{'=', 2, 2}}) s:update(1, {{'=', 3, 2}}) box.commit()
+pk:select()
+sk:select()
+s:drop()
diff --git a/test/vinyl/write_iterator.result b/test/vinyl/write_iterator.result
index 39212572..5c5200b8 100644
--- a/test/vinyl/write_iterator.result
+++ b/test/vinyl/write_iterator.result
@@ -767,7 +767,7 @@ box.snapshot()
 - ok
 ...
 -- Some padding to trigger minor compaction.
-for i = 1001, 1000 + PAD2 do s:replace{i, i} end
+for i = 1001, 1000 + PAD2 do s:delete{i} end
 ---
 ...
 box.snapshot()
diff --git a/test/vinyl/write_iterator.test.lua b/test/vinyl/write_iterator.test.lua
index d7871ffc..fa44961a 100644
--- a/test/vinyl/write_iterator.test.lua
+++ b/test/vinyl/write_iterator.test.lua
@@ -328,7 +328,7 @@ PAD2 = 15
 for i = 1001, 1000 + PAD1 do s:replace{i, i} end
 box.snapshot()
 -- Some padding to trigger minor compaction.
-for i = 1001, 1000 + PAD2 do s:replace{i, i} end
+for i = 1001, 1000 + PAD2 do s:delete{i} end
 box.snapshot()
 -- Generate some INSERT statements and dump them to disk.
 _ = s:insert{1, 1} -- insert
-- 
2.11.0




More information about the Tarantool-patches mailing list