[PATCH] vinyl: abort rw transactions before DDL

Vladimir Davydov vdavydov.dev at gmail.com
Fri Aug 3 18:21:45 MSK 2018


When building a new index or checking a space format, we propagate
changes done to the space during the procedure with the aid of an
on_replace trigger. The problem is there may be transactions with a
non-empty write set when we install the trigger. Changes done by
those transactions will not be seen by the trigger and so they won't
make it to the new index, resulting in an inconsistency between the
primary and secondary indexes. To fix this issue, let's abort all
rw transactions after setting the trigger. Note, transactions that
reached WAL can't be aborted so we wait for them to complete.

Closes #3458
---
https://github.com/tarantool/tarantool/issues/3458
https://github.com/tarantool/tarantool/commits/dv/gh-3458-vy-abort-tx-before-ddl

 src/box/vinyl.c          | 36 +++++++++++++++++++++++++++---------
 src/box/vy_lsm.h         | 11 +++++++++++
 src/box/vy_tx.c          | 13 +++++++++++++
 src/box/vy_tx.h          | 13 +++++++++++++
 test/vinyl/ddl.result    |  1 -
 test/vinyl/ddl.test.lua  |  1 -
 test/vinyl/misc.result   |  5 +++++
 test/vinyl/misc.test.lua |  4 ++++
 8 files changed, 73 insertions(+), 11 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index a0d2078a..3283ae95 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -1014,6 +1014,24 @@ vinyl_space_prepare_alter(struct space *old_space, struct space *new_space)
 	return 0;
 }
 
+/**
+ * This function is called after installing on_replace trigger
+ * used for propagating changes done during DDL. It aborts all
+ * rw transactions that began before the trigger was installed
+ * so that DDL doesn't miss their working set.
+ */
+static void
+vy_abort_writers_for_ddl(struct vy_env *env)
+{
+	tx_manager_abort_writers(env->xm);
+	/*
+	 * Wait for prepared transactions to complete
+	 * (we can't abort them as they reached WAL).
+	 */
+	struct vclock unused;
+	wal_checkpoint(&unused, false);
+}
+
 /** Argument passed to vy_check_format_on_replace(). */
 struct vy_check_format_ctx {
 	/** Format to check new tuples against. */
@@ -1084,9 +1102,11 @@ vinyl_space_check_format(struct space *space, struct tuple_format *format)
 	trigger_create(&on_replace, vy_check_format_on_replace, &ctx, NULL);
 	trigger_add(&space->on_replace, &on_replace);
 
+	vy_abort_writers_for_ddl(env);
+
 	struct vy_read_iterator itr;
 	vy_read_iterator_open(&itr, pk, NULL, ITER_ALL, key,
-			      &env->xm->p_global_read_view);
+			      &env->xm->p_committed_read_view);
 	int rc;
 	int loops = 0;
 	struct tuple *tuple;
@@ -4169,10 +4189,8 @@ vinyl_space_build_index(struct space *src_space, struct index *new_index,
 {
 	struct vy_env *env = vy_env(src_space->engine);
 	struct vy_lsm *pk = vy_lsm(src_space->index[0]);
-	bool is_empty = (pk->stat.disk.count.rows == 0 &&
-			 pk->stat.memory.count.rows == 0);
 
-	if (new_index->def->iid == 0 && !is_empty) {
+	if (new_index->def->iid == 0 && !vy_lsm_is_empty(pk)) {
 		diag_set(ClientError, ER_UNSUPPORTED, "Vinyl",
 			 "rebuilding the primary index of a non-empty space");
 		return -1;
@@ -4189,9 +4207,6 @@ vinyl_space_build_index(struct space *src_space, struct index *new_index,
 	    env->status == VINYL_FINAL_RECOVERY_LOCAL)
 		return vy_build_recover(env, new_lsm, pk);
 
-	if (is_empty)
-		return 0;
-
 	/*
 	 * Iterate over all tuples stored in the space and insert
 	 * each of them into the new LSM tree. Since read iterator
@@ -4214,6 +4229,8 @@ vinyl_space_build_index(struct space *src_space, struct index *new_index,
 	trigger_create(&on_replace, vy_build_on_replace, &ctx, NULL);
 	trigger_add(&src_space->on_replace, &on_replace);
 
+	vy_abort_writers_for_ddl(env);
+
 	struct vy_read_iterator itr;
 	vy_read_iterator_open(&itr, pk, NULL, ITER_ALL, key,
 			      &env->xm->p_committed_read_view);
@@ -4264,9 +4281,10 @@ vinyl_space_build_index(struct space *src_space, struct index *new_index,
 
 	/*
 	 * Dump the new index upon build completion so that we don't
-	 * have to rebuild it on recovery.
+	 * have to rebuild it on recovery. No need to trigger dump if
+	 * the space happens to be empty.
 	 */
-	if (rc == 0)
+	if (rc == 0 && !vy_lsm_is_empty(new_lsm))
 		rc = vy_scheduler_dump(&env->scheduler);
 
 	if (rc == 0 && ctx.is_failed) {
diff --git a/src/box/vy_lsm.h b/src/box/vy_lsm.h
index f0b7ec9c..08d30530 100644
--- a/src/box/vy_lsm.h
+++ b/src/box/vy_lsm.h
@@ -311,6 +311,17 @@ void
 vy_lsm_delete(struct vy_lsm *lsm);
 
 /**
+ * Return true if the LSM tree has no statements, neither on disk
+ * nor in memory.
+ */
+static inline bool
+vy_lsm_is_empty(struct vy_lsm *lsm)
+{
+	return (lsm->stat.disk.count.rows == 0 &&
+		lsm->stat.memory.count.rows == 0);
+}
+
+/**
  * Increment the reference counter of an LSM tree.
  * An LSM tree cannot be deleted if its reference
  * counter is elevated.
diff --git a/src/box/vy_tx.c b/src/box/vy_tx.c
index 5c186b87..117db68c 100644
--- a/src/box/vy_tx.c
+++ b/src/box/vy_tx.c
@@ -106,6 +106,7 @@ tx_manager_new(void)
 		return NULL;
 	}
 
+	rlist_create(&xm->transactions);
 	rlist_create(&xm->read_views);
 	vy_global_read_view_create((struct vy_read_view *)&xm->global_read_view,
 				   INT64_MAX);
@@ -288,6 +289,7 @@ vy_tx_create(struct tx_manager *xm, struct vy_tx *tx)
 	vy_tx_read_set_new(&tx->read_set);
 	tx->psn = 0;
 	rlist_create(&tx->on_destroy);
+	rlist_add_entry(&xm->transactions, tx, in_transactions);
 }
 
 void
@@ -306,6 +308,7 @@ vy_tx_destroy(struct vy_tx *tx)
 	}
 
 	vy_tx_read_set_iter(&tx->read_set, NULL, vy_tx_read_set_free_cb, NULL);
+	rlist_del_entry(tx, in_transactions);
 }
 
 /** Return true if the transaction is read-only. */
@@ -856,6 +859,16 @@ vy_tx_set(struct vy_tx *tx, struct vy_lsm *lsm, struct tuple *stmt)
 }
 
 void
+tx_manager_abort_writers(struct tx_manager *xm)
+{
+	struct vy_tx *tx;
+	rlist_foreach_entry(tx, &xm->transactions, in_transactions) {
+		if (tx->state == VINYL_TX_READY && !vy_tx_is_ro(tx))
+			tx->state = VINYL_TX_ABORT;
+	}
+}
+
+void
 vy_txw_iterator_open(struct vy_txw_iterator *itr,
 		     struct vy_txw_iterator_stat *stat,
 		     struct vy_tx *tx, struct vy_lsm *lsm,
diff --git a/src/box/vy_tx.h b/src/box/vy_tx.h
index dcf6a739..0378aee9 100644
--- a/src/box/vy_tx.h
+++ b/src/box/vy_tx.h
@@ -133,6 +133,8 @@ write_set_search_key(write_set_t *tree, struct vy_lsm *lsm,
 
 /** Transaction object. */
 struct vy_tx {
+	/** Link in tx_manager::transactions. */
+	struct rlist in_transactions;
 	/** Transaction manager. */
 	struct tx_manager *xm;
 	/**
@@ -209,6 +211,10 @@ struct tx_manager {
 	 */
 	struct vy_tx *last_prepared_tx;
 	/**
+	 * List of open transactions, linked by vy_tx::in_transactions.
+	 */
+	struct rlist transactions;
+	/**
 	 * The list of TXs with a read view in order of vlsn.
 	 */
 	struct rlist read_views;
@@ -262,6 +268,13 @@ tx_manager_new(void);
 void
 tx_manager_delete(struct tx_manager *xm);
 
+/**
+ * Abort all transactions that started before this point of time
+ * and haven't reached WAL yet.
+ */
+void
+tx_manager_abort_writers(struct tx_manager *xm);
+
 /** Initialize a tx object. */
 void
 vy_tx_create(struct tx_manager *xm, struct vy_tx *tx);
diff --git a/test/vinyl/ddl.result b/test/vinyl/ddl.result
index 3e65e232..bb4fc984 100644
--- a/test/vinyl/ddl.result
+++ b/test/vinyl/ddl.result
@@ -656,7 +656,6 @@ last_val = 1000;
 ---
 ...
 function gen_load()
-    fiber.sleep(0.001)
     local s = box.space.test
     for i = 1, 200 do
         local op = math.random(4)
diff --git a/test/vinyl/ddl.test.lua b/test/vinyl/ddl.test.lua
index 45c5cf8e..f7bf03bb 100644
--- a/test/vinyl/ddl.test.lua
+++ b/test/vinyl/ddl.test.lua
@@ -253,7 +253,6 @@ box.commit();
 last_val = 1000;
 
 function gen_load()
-    fiber.sleep(0.001)
     local s = box.space.test
     for i = 1, 200 do
         local op = math.random(4)
diff --git a/test/vinyl/misc.result b/test/vinyl/misc.result
index 409df3e0..d2a7ca68 100644
--- a/test/vinyl/misc.result
+++ b/test/vinyl/misc.result
@@ -140,6 +140,11 @@ i6 = s:create_index('i6', {unique = true, parts = {4, 'unsigned', 6, 'unsigned',
 i7 = s:create_index('i7', {unique = true, parts = {6, 'unsigned'}})
 ---
 ...
+-- space.create_index() does a lookup in the primary index
+-- so reset the stats before calling space.insert().
+box.stat.reset()
+---
+...
 s:insert{1, 1, 1, 1, 1, 1}
 ---
 - [1, 1, 1, 1, 1, 1]
diff --git a/test/vinyl/misc.test.lua b/test/vinyl/misc.test.lua
index 3dbfdb54..9f61ca0a 100644
--- a/test/vinyl/misc.test.lua
+++ b/test/vinyl/misc.test.lua
@@ -62,6 +62,10 @@ i5 = s:create_index('i5', {unique = true, parts = {4, 'unsigned', 5, 'unsigned',
 i6 = s:create_index('i6', {unique = true, parts = {4, 'unsigned', 6, 'unsigned', 5, 'unsigned'}})
 i7 = s:create_index('i7', {unique = true, parts = {6, 'unsigned'}})
 
+-- space.create_index() does a lookup in the primary index
+-- so reset the stats before calling space.insert().
+box.stat.reset()
+
 s:insert{1, 1, 1, 1, 1, 1}
 
 i1:stat().lookup -- 1
-- 
2.11.0




More information about the Tarantool-patches mailing list