[Tarantool-patches] [PATCH v2 00/19] Sync replication

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Fri Jul 10 03:50:48 MSK 2020


Here is a pack of final fixes before the branch goes to master.
Lots of them, but I tried to explain them individually. There is no
point in making them separate commits, since they are all squashed
into the older commits anyway.

================================================================================
diff --git a/src/box/txn.c b/src/box/txn.c
index ffc2ac6a5..a2df23833 100644
--- a/src/box/txn.c
+++ b/src/box/txn.c
@@ -749,7 +749,8 @@ txn_commit_async(struct txn *txn)
 
 		if (txn_has_flag(txn, TXN_WAIT_ACK)) {
 			int64_t lsn = req->rows[txn->n_applier_rows - 1]->lsn;
-			txn_limbo_assign_lsn(&txn_limbo, limbo_entry, lsn);
+			txn_limbo_assign_remote_lsn(&txn_limbo, limbo_entry,
+						    lsn);
 		}
 
 		/*
@@ -836,7 +837,8 @@ txn_commit(struct txn *txn)
 	if (is_sync) {
 		if (txn_has_flag(txn, TXN_WAIT_ACK)) {
 			int64_t lsn = req->rows[req->n_rows - 1]->lsn;
-			txn_limbo_assign_lsn(&txn_limbo, limbo_entry, lsn);
+			txn_limbo_assign_local_lsn(&txn_limbo, limbo_entry,
+						   lsn);
 			/* Local WAL write is a first 'ACK'. */
 			txn_limbo_ack(&txn_limbo, txn_limbo.instance_id, lsn);
 		}
diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
index 71a47802a..e28432bfd 100644
--- a/src/box/txn_limbo.c
+++ b/src/box/txn_limbo.c
@@ -87,8 +87,7 @@ static inline void
 txn_limbo_remove(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
 {
 	assert(!rlist_empty(&entry->in_queue));
-	assert(rlist_first_entry(&limbo->queue, struct txn_limbo_entry,
-				 in_queue) == entry);
+	assert(txn_limbo_first_entry(limbo) == entry);
 	(void) limbo;
 	rlist_del_entry(entry, in_queue);
 }
@@ -97,8 +96,7 @@ static inline void
 txn_limbo_pop(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
 {
 	assert(!rlist_empty(&entry->in_queue));
-	assert(rlist_last_entry(&limbo->queue, struct txn_limbo_entry,
-				in_queue) == entry);
+	assert(txn_limbo_last_entry(limbo) == entry);
 	assert(entry->is_rollback);
 	(void) limbo;
 	rlist_del_entry(entry, in_queue);
@@ -119,10 +117,11 @@ txn_limbo_abort(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
 }
 
 void
-txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry,
-		     int64_t lsn)
+txn_limbo_assign_remote_lsn(struct txn_limbo *limbo,
+			    struct txn_limbo_entry *entry, int64_t lsn)
 {
 	assert(limbo->instance_id != REPLICA_ID_NIL);
+	assert(limbo->instance_id != instance_id);
 	assert(entry->lsn == -1);
 	assert(lsn > 0);
 	assert(txn_has_flag(entry->txn, TXN_WAIT_ACK));
@@ -130,27 +129,30 @@ txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry,
 	entry->lsn = lsn;
 }
 
-static bool
-txn_limbo_check_complete(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
+void
+txn_limbo_assign_local_lsn(struct txn_limbo *limbo,
+			   struct txn_limbo_entry *entry, int64_t lsn)
 {
-	if (txn_limbo_entry_is_complete(entry))
-		return true;
+	assert(limbo->instance_id != REPLICA_ID_NIL);
+	assert(limbo->instance_id == instance_id);
+	assert(entry->lsn == -1);
+	assert(lsn > 0);
+	assert(txn_has_flag(entry->txn, TXN_WAIT_ACK));
+	(void) limbo;
+	entry->lsn = lsn;
 	/*
-	 * Async transaction can't complete itself. It is always
-	 * completed by a previous sync transaction.
+	 * The entry just got its LSN after a WAL write. It could
+	 * happen that this LSN was already ACKed by some
+	 * replicas. Update the ACK counter to take them into
+	 * account.
 	 */
-	if (!txn_has_flag(entry->txn, TXN_WAIT_ACK))
-		return false;
 	struct vclock_iterator iter;
 	vclock_iterator_init(&iter, &limbo->vclock);
 	int ack_count = 0;
-	int64_t lsn = entry->lsn;
 	vclock_foreach(&iter, vc)
 		ack_count += vc.lsn >= lsn;
 	assert(ack_count >= entry->ack_count);
 	entry->ack_count = ack_count;
-	entry->is_commit = ack_count >= replication_synchro_quorum;
-	return entry->is_commit;
================================================================================

The changes above are motivated by a bug I found during stress
testing (by running the same test in parallel in test-run in 10-15
processes).

The bug was a crash happening in case a transaction was replicated
and ACKed earlier than WAL thread responded ok to TX thread. Then
CONFIRM wasn't written at all.

================================================================================
 }
 
 static int
@@ -161,7 +163,7 @@ txn_limbo_wait_complete(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
 {
 	struct txn *txn = entry->txn;
 	assert(entry->lsn > 0 || !txn_has_flag(entry->txn, TXN_WAIT_ACK));
-	if (txn_limbo_check_complete(limbo, entry))
+	if (txn_limbo_entry_is_complete(entry))
 		goto complete;
 
 	assert(!txn_has_flag(txn, TXN_IS_DONE));
@@ -226,7 +229,26 @@ complete:
 		diag_set(ClientError, ER_SYNC_ROLLBACK);
 		return -1;
 	}
-	txn_limbo_remove(limbo, entry);
+	/*
+	 * The entry might be not the first in the limbo. It
+	 * happens when there was a sync transaction and async
+	 * transaction. The sync and async went to WAL. After sync
+	 * WAL write is done, it may be already ACKed by the
+	 * needed replica count. Now it marks self as committed
+	 * and does the same for the next async txn. Then it
+	 * starts writing CONFIRM. During that the async
+	 * transaction finishes its WAL write, sees it is
+	 * committed and ends up here. Not being the first
+	 * transaction in the limbo.
+	 */
+	while (!rlist_empty(&entry->in_queue) &&
+	       txn_limbo_first_entry(limbo) != entry) {
+		bool cancellable = fiber_set_cancellable(false);
+		fiber_yield();
+		fiber_set_cancellable(cancellable);
+	}
+	if (!rlist_empty(&entry->in_queue))
+		txn_limbo_remove(limbo, entry);
 	txn_clear_flag(txn, TXN_WAIT_SYNC);
 	txn_clear_flag(txn, TXN_WAIT_ACK);
 	return 0;
@@ -257,7 +279,7 @@ txn_limbo_write_confirm_rollback(struct txn_limbo *limbo, int64_t lsn,
 		 * the last "safe" lsn is lsn - 1.
 		 */
 		res = xrow_encode_rollback(&row, &txn->region,
-					   limbo->instance_id, lsn - 1);
+					   limbo->instance_id, lsn);
================================================================================

I asked Sergey to do that, but he temporarily left. There wasn't
a bug, just an inconsistency. For CONFIRM we use an inclusive LSN - it commits
all <= LSN. But for ROLLBACK we used exclusive LSN - it rolls back all > LSN.
This is strange. So I made ROLLBACK LSN inclusive too. This is one step
towards https://github.com/tarantool/tarantool/issues/5151.

================================================================================
 	}
 	if (res == -1)
 		goto rollback;
@@ -342,7 +364,7 @@ txn_limbo_read_rollback(struct txn_limbo *limbo, int64_t lsn)
 	rlist_foreach_entry_reverse(e, &limbo->queue, in_queue) {
 		if (!txn_has_flag(e->txn, TXN_WAIT_ACK))
 			continue;
-		if (e->lsn <= lsn)
+		if (e->lsn < lsn)
 			break;
 		last_rollback = e;
 	}
@@ -542,7 +564,7 @@ txn_limbo_force_empty(struct txn_limbo *limbo, int64_t confirm_lsn)
 	}
 	if (rollback != NULL) {
 		txn_limbo_write_rollback(limbo, rollback->lsn);
-		txn_limbo_read_rollback(limbo, rollback->lsn - 1);
+		txn_limbo_read_rollback(limbo, rollback->lsn);
 	}
 }
 
diff --git a/src/box/txn_limbo.h b/src/box/txn_limbo.h
index 1ee416231..88614d4a6 100644
--- a/src/box/txn_limbo.h
+++ b/src/box/txn_limbo.h
@@ -158,13 +158,21 @@ void
 txn_limbo_abort(struct txn_limbo *limbo, struct txn_limbo_entry *entry);
 
 /**
- * Assign local LSN to the limbo entry. That happens when the
- * transaction is added to the limbo, writes to WAL, and gets an
- * LSN.
+ * Assign a remote LSN to a limbo entry. That happens when a
+ * remote transaction is added to the limbo and starts waiting for
+ * a confirm.
  */
 void
-txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry,
-		     int64_t lsn);
+txn_limbo_assign_remote_lsn(struct txn_limbo *limbo,
+			    struct txn_limbo_entry *entry, int64_t lsn);
+
+/**
+ * Assign a local LSN to a limbo entry. That happens when a local
+ * transaction is written to WAL.
+ */
+void
+txn_limbo_assign_local_lsn(struct txn_limbo *limbo,
+			   struct txn_limbo_entry *entry, int64_t lsn);
 
 /**
  * Ack all transactions up to the given LSN on behalf of the
diff --git a/src/box/xrow.h b/src/box/xrow.h
index 7e6a4aceb..b325213e6 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -246,7 +246,7 @@ xrow_decode_confirm(struct xrow_header *row, uint32_t *replica_id, int64_t *lsn)
  * @param row xrow header.
  * @param region Region to use to encode the rollback body.
  * @param replica_id master's instance id.
- * @param lsn lsn to rollback to.
+ * @param lsn lsn to rollback from, including it.
  * @retval -1  on error.
  * @retval 0 success.
  */
diff --git a/test/replication/qsync_basic.result b/test/replication/qsync_basic.result
index 6d1624798..6b55a0e5e 100644
--- a/test/replication/qsync_basic.result
+++ b/test/replication/qsync_basic.result
@@ -199,7 +199,7 @@ box.cfg{replication_timeout = 1000, replication_synchro_timeout = 1000}
 -- Commit something non-sync. So as applier writer fiber would
 -- flush the pending heartbeat and go to sleep with the new huge
 -- replication timeout.
-s = box.schema.create_space('test')
+s = box.schema.create_space('test', {engine = engine})
  | ---
  | ...
 pk = s:create_index('pk')
@@ -309,7 +309,7 @@ test_run:switch('default')
 box.cfg{replication_synchro_timeout = 1000, replication_synchro_quorum = 2}
  | ---
  | ...
-_ = box.schema.create_space('locallocal', {is_local = true})
+_ = box.schema.create_space('locallocal', {is_local = true, engine = engine})
  | ---
  | ...
 _ = _:create_index('pk')
@@ -551,6 +551,9 @@ test_run:switch('default')
  | ---
  | - true
  | ...
+box.cfg{replication_synchro_timeout = 1000}
================================================================================

The timeout from the previous testcase (< 1 second) was still in
effect here. That made the test flaky.

================================================================================
+ | ---
+ | ...
 ok, err = nil
  | ---
  | ...
diff --git a/test/replication/qsync_basic.test.lua b/test/replication/qsync_basic.test.lua
index 384b3593c..dcd1d6c76 100644
--- a/test/replication/qsync_basic.test.lua
+++ b/test/replication/qsync_basic.test.lua
@@ -83,7 +83,7 @@ box.cfg{replication_timeout = 1000, replication_synchro_timeout = 1000}
 -- Commit something non-sync. So as applier writer fiber would
 -- flush the pending heartbeat and go to sleep with the new huge
 -- replication timeout.
-s = box.schema.create_space('test')
+s = box.schema.create_space('test', {engine = engine})
 pk = s:create_index('pk')
 s:replace{1}
 -- Now commit something sync. It should return immediately even
@@ -123,7 +123,7 @@ box.space.sync:select{6}
 --
 test_run:switch('default')
 box.cfg{replication_synchro_timeout = 1000, replication_synchro_quorum = 2}
-_ = box.schema.create_space('locallocal', {is_local = true})
+_ = box.schema.create_space('locallocal', {is_local = true, engine = engine})
 _ = _:create_index('pk')
 -- Propagate local vclock to some insane value to ensure it won't
 -- affect anything.
@@ -217,6 +217,7 @@ box.space.sync:select{11}
 
 -- Test it is possible to early ACK a transaction with a new quorum.
 test_run:switch('default')
+box.cfg{replication_synchro_timeout = 1000}
 ok, err = nil
 f = fiber.create(function()                                                     \
     ok, err = pcall(box.space.sync.insert, box.space.sync, {12})                \
diff --git a/test/replication/qsync_snapshots.result b/test/replication/qsync_snapshots.result
index 61cb7164b..2a126087a 100644
--- a/test/replication/qsync_snapshots.result
+++ b/test/replication/qsync_snapshots.result
@@ -48,7 +48,7 @@ test_run:switch('default')
  | ---
  | - true
  | ...
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000}
================================================================================

The timeout was too small. The test assumed it doesn't fail, but 0.1
seconds is quite easy to exceed, especially when the tests run in
parallel. The same applies to some other fixes below.

================================================================================
  | ---
  | ...
 _ = box.schema.space.create('sync', {is_sync=true, engine=engine})
@@ -86,7 +86,7 @@ test_run:switch('default')
  | ---
  | - true
  | ...
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000}
  | ---
  | ...
 _ = box.schema.space.create('sync', {is_sync=true, engine=engine})
@@ -112,58 +112,9 @@ box.space.sync:select{} -- 1
  | ---
  | - - [1]
  | ...
-box.snapshot()
- | ---
- | - ok
- | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
--- Testcase cleanup.
-test_run:switch('default')
- | ---
- | - true
- | ...
-box.space.sync:drop()
+box.cfg{replication_synchro_timeout=1000}
  | ---
  | ...
-
--- [RFC, Snapshot generation] rolled back operations are not snapshotted.
--- Testcase setup.
-test_run:switch('default')
- | ---
- | - true
- | ...
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
- | ---
- | ...
-_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
- | ---
- | ...
-_ = box.space.sync:create_index('pk')
- | ---
- | ...
--- Testcase body.
-box.space.sync:insert{1}
- | ---
- | - [1]
- | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
-test_run:switch('default')
- | ---
- | - true
- | ...
-box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.1}
- | ---
- | ...
-box.space.sync:insert{2}
- | ---
- | - error: Quorum collection for a synchronous transaction is timed out
- | ...
 box.snapshot()
  | ---
  | - ok
@@ -172,14 +123,6 @@ box.space.sync:select{} -- 1
  | ---
  | - - [1]
  | ...
-test_run:switch('replica')
- | ---
- | - true
- | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
 -- Testcase cleanup.
 test_run:switch('default')
  | ---
@@ -191,11 +134,40 @@ box.space.sync:drop()
 
 -- [RFC, Snapshot generation] snapshot started on master, then rollback
 -- arrived, expected snapshot abort.
+-- The test is temporary blocked on 5146 due to a crash when local
+-- WAL write fails inside the WAL thread. Since this is the only
+-- way to cause rollback of the transaction used in a snapshot
+-- without triggering snapshot timeout.
+
+-- test_run:switch('default')
+-- box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+-- _ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+-- _ = box.space.sync:create_index('pk')
+-- -- Testcase body.
+-- box.space.sync:insert{1}
+-- box.space.sync:select{} -- 1
+-- test_run:switch('default')
+-- test_run:cmd("setopt delimiter ';'")
+-- _ = fiber.create(function()
+--     box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2}
+--     box.space.sync:insert{2}
+-- end);
+-- test_run:cmd("setopt delimiter ''");
+-- box.snapshot() -- abort
+-- box.space.sync:select{} -- 1
+-- test_run:switch('replica')
+-- box.space.sync:select{} -- 1
+-- -- Testcase cleanup.
+-- test_run:switch('default')
+-- box.space.sync:drop()
+
+-- [RFC, Snapshot generation] snapshot started on replica, then rollback
+-- arrived, expected snapshot abort.
 test_run:switch('default')
  | ---
  | - true
  | ...
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000}
  | ---
  | ...
 _ = box.schema.space.create('sync', {is_sync=true, engine=engine})
@@ -204,128 +176,85 @@ _ = box.schema.space.create('sync', {is_sync=true, engine=engine})
 _ = box.space.sync:create_index('pk')
  | ---
  | ...
+
 -- Testcase body.
-box.space.sync:insert{1}
- | ---
- | - [1]
- | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
 test_run:switch('default')
  | ---
  | - true
  | ...
-test_run:cmd("setopt delimiter ';'")
- | ---
- | - true
- | ...
-_ = fiber.create(function()
-    box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2}
-    box.space.sync:insert{2}
-end);
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM}
================================================================================

2 seconds was too long and was flaky. I made the test faster and more
stable by using event-oriented waiting instead of time-based timeouts.

================================================================================
  | ---
  | ...
-test_run:cmd("setopt delimiter ''");
+ok, err = nil
  | ---
- | - true
- | ...
-box.snapshot() -- abort
- | ---
- | - error: A rollback for a synchronous transaction is received
  | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
-test_run:switch('replica')
- | ---
- | - true
- | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
--- Testcase cleanup.
-test_run:switch('default')
- | ---
- | - true
- | ...
-box.space.sync:drop()
+f = fiber.create(function()                                                     \
+    ok, err = pcall(box.space.sync.insert, box.space.sync, {1})                 \
+end)
  | ---
  | ...
 
--- [RFC, Snapshot generation] snapshot started on replica, then rollback
--- arrived, expected snapshot abort.
-test_run:switch('default')
+test_run:switch('replica')
  | ---
  | - true
  | ...
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+fiber = require('fiber')
  | ---
  | ...
-_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+box.cfg{replication_synchro_timeout=1000}
  | ---
  | ...
-_ = box.space.sync:create_index('pk')
+ok, err = nil
  | ---
  | ...
--- Testcase body.
-box.space.sync:insert{1}
+f = fiber.create(function() ok, err = pcall(box.snapshot) end)
  | ---
- | - [1]
  | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
-test_run:switch('replica')
+
+test_run:switch('default')
  | ---
  | - true
  | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
-test_run:switch('default')
+box.cfg{replication_synchro_timeout=0.0001}
  | ---
- | - true
  | ...
-test_run:cmd("setopt delimiter ';'")
+test_run:wait_cond(function() return f:status() == 'dead' end)
  | ---
  | - true
  | ...
-_ = fiber.create(function()
-    box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2}
-    box.space.sync:insert{2}
-end);
+ok, err
  | ---
+ | - false
+ | - Quorum collection for a synchronous transaction is timed out
  | ...
-test_run:cmd("setopt delimiter ''");
+
+test_run:switch('replica')
  | ---
  | - true
  | ...
-test_run:switch('replica')
+test_run:wait_cond(function() return f:status() == 'dead' end)
  | ---
  | - true
  | ...
-box.snapshot() -- abort
+ok, err
  | ---
- | - error: A rollback for a synchronous transaction is received
+ | - false
+ | - A rollback for a synchronous transaction is received
  | ...
-box.space.sync:select{} -- 1
+box.space.sync:select{}
  | ---
- | - - [1]
+ | - []
  | ...
+
 test_run:switch('default')
  | ---
  | - true
  | ...
-box.space.sync:select{} -- 1
+box.space.sync:select{}
  | ---
- | - - [1]
+ | - []
  | ...
+
 -- Testcase cleanup.
 test_run:switch('default')
  | ---
diff --git a/test/replication/qsync_snapshots.test.lua b/test/replication/qsync_snapshots.test.lua
index b5990bce7..0db61da95 100644
--- a/test/replication/qsync_snapshots.test.lua
+++ b/test/replication/qsync_snapshots.test.lua
@@ -20,7 +20,7 @@ test_run:cmd('start server replica with wait=True, wait_load=True')
 -- expected success.
 -- Testcase setup.
 test_run:switch('default')
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000}
 _ = box.schema.space.create('sync', {is_sync=true, engine=engine})
 _ = box.space.sync:create_index('pk')
 -- Testcase body.
@@ -35,7 +35,7 @@ box.space.sync:drop()
 -- expected success.
 -- Testcase setup.
 test_run:switch('default')
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000}
 _ = box.schema.space.create('sync', {is_sync=true, engine=engine})
 _ = box.space.sync:create_index('pk')
 -- Testcase body.
@@ -43,79 +43,76 @@ box.space.sync:insert{1}
 box.space.sync:select{} -- 1
 test_run:switch('replica')
 box.space.sync:select{} -- 1
+box.cfg{replication_synchro_timeout=1000}
 box.snapshot()
 box.space.sync:select{} -- 1
 -- Testcase cleanup.
 test_run:switch('default')
 box.space.sync:drop()
 
--- [RFC, Snapshot generation] rolled back operations are not snapshotted.
--- Testcase setup.
-test_run:switch('default')
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
-_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
-_ = box.space.sync:create_index('pk')
--- Testcase body.
-box.space.sync:insert{1}
-box.space.sync:select{} -- 1
-test_run:switch('default')
-box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.1}
-box.space.sync:insert{2}
-box.snapshot()
-box.space.sync:select{} -- 1
-test_run:switch('replica')
-box.space.sync:select{} -- 1
--- Testcase cleanup.
-test_run:switch('default')
-box.space.sync:drop()
-
 -- [RFC, Snapshot generation] snapshot started on master, then rollback
 -- arrived, expected snapshot abort.
-test_run:switch('default')
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
-_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
-_ = box.space.sync:create_index('pk')
--- Testcase body.
-box.space.sync:insert{1}
-box.space.sync:select{} -- 1
-test_run:switch('default')
-test_run:cmd("setopt delimiter ';'")
-_ = fiber.create(function()
-    box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2}
-    box.space.sync:insert{2}
-end);
-test_run:cmd("setopt delimiter ''");
-box.snapshot() -- abort
-box.space.sync:select{} -- 1
-test_run:switch('replica')
-box.space.sync:select{} -- 1
--- Testcase cleanup.
-test_run:switch('default')
-box.space.sync:drop()
+-- The test is temporary blocked on 5146 due to a crash when local
+-- WAL write fails inside the WAL thread. Since this is the only
+-- way to cause rollback of the transaction used in a snapshot
+-- without triggering snapshot timeout.
+
+-- test_run:switch('default')
+-- box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+-- _ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+-- _ = box.space.sync:create_index('pk')
+-- -- Testcase body.
+-- box.space.sync:insert{1}
+-- box.space.sync:select{} -- 1
+-- test_run:switch('default')
+-- test_run:cmd("setopt delimiter ';'")
+-- _ = fiber.create(function()
+--     box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2}
+--     box.space.sync:insert{2}
+-- end);
+-- test_run:cmd("setopt delimiter ''");
+-- box.snapshot() -- abort
+-- box.space.sync:select{} -- 1
+-- test_run:switch('replica')
+-- box.space.sync:select{} -- 1
+-- -- Testcase cleanup.
+-- test_run:switch('default')
+-- box.space.sync:drop()
 
 -- [RFC, Snapshot generation] snapshot started on replica, then rollback
 -- arrived, expected snapshot abort.
 test_run:switch('default')
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000}
 _ = box.schema.space.create('sync', {is_sync=true, engine=engine})
 _ = box.space.sync:create_index('pk')
+
 -- Testcase body.
-box.space.sync:insert{1}
-box.space.sync:select{} -- 1
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM}
+ok, err = nil
+f = fiber.create(function()                                                     \
+    ok, err = pcall(box.space.sync.insert, box.space.sync, {1})                 \
+end)
+
 test_run:switch('replica')
-box.space.sync:select{} -- 1
+fiber = require('fiber')
+box.cfg{replication_synchro_timeout=1000}
+ok, err = nil
+f = fiber.create(function() ok, err = pcall(box.snapshot) end)
+
 test_run:switch('default')
-test_run:cmd("setopt delimiter ';'")
-_ = fiber.create(function()
-    box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2}
-    box.space.sync:insert{2}
-end);
-test_run:cmd("setopt delimiter ''");
+box.cfg{replication_synchro_timeout=0.0001}
+test_run:wait_cond(function() return f:status() == 'dead' end)
+ok, err
+
 test_run:switch('replica')
-box.snapshot() -- abort
-box.space.sync:select{} -- 1
+test_run:wait_cond(function() return f:status() == 'dead' end)
+ok, err
+box.space.sync:select{}
+
 test_run:switch('default')
-box.space.sync:select{} -- 1
+box.space.sync:select{}
+
 -- Testcase cleanup.
 test_run:switch('default')
 box.space.sync:drop()
diff --git a/test/unit/snap_quorum_delay.cc b/test/unit/snap_quorum_delay.cc
index 8d50cfb27..ad0563345 100644
--- a/test/unit/snap_quorum_delay.cc
+++ b/test/unit/snap_quorum_delay.cc
@@ -78,7 +78,7 @@ enum process_type {
  * (to push a transaction to the limbo and simulate confirm).
  */
 const int fake_lsn = 1;
-const int instace_id = 1;
+extern "C" int instance_id;
 const int relay_id = 2;
 
 int
@@ -109,7 +109,7 @@ txn_process_func(va_list ap)
 	 * and call txn_commit (or another) later.
 	 */
 	struct txn_limbo_entry *entry = txn_limbo_append(&txn_limbo,
-							 instace_id, txn);
+							 instance_id, txn);
 	/*
 	 * The trigger is used to verify that the transaction has been
 	 * completed.
@@ -130,7 +130,7 @@ txn_process_func(va_list ap)
 		unreachable();
 	}
 
-	txn_limbo_assign_lsn(&txn_limbo, entry, fake_lsn);
+	txn_limbo_assign_local_lsn(&txn_limbo, entry, fake_lsn);
 	txn_limbo_ack(&txn_limbo, txn_limbo.instance_id, fake_lsn);
 	txn_limbo_wait_complete(&txn_limbo, entry);
 
@@ -239,6 +239,7 @@ main(void)
 	fiber_init(fiber_c_invoke);
 	gc_init();
 	txn_limbo_init();
+	instance_id = 1;
 
 	struct fiber *main_fiber = fiber_new("main", test_snap_delay);
 	assert(main_fiber != NULL);


More information about the Tarantool-patches mailing list