From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org> To: tarantool-patches@dev.tarantool.org, sergepetrenko@tarantool.org Subject: Re: [Tarantool-patches] [PATCH v2 00/19] Sync replication Date: Fri, 10 Jul 2020 02:50:48 +0200 [thread overview] Message-ID: <b61eaafd-6659-5aca-4918-61ee6944cf36@tarantool.org> (raw) In-Reply-To: <cover.1593472477.git.v.shpilevoy@tarantool.org> Here is a pack of final fixes before the branch goes to master. Lots of them, but I tried to explain them individually. Not point in making them separate commits since anyway they are all squashed into the older commits. ================================================================================ diff --git a/src/box/txn.c b/src/box/txn.c index ffc2ac6a5..a2df23833 100644 --- a/src/box/txn.c +++ b/src/box/txn.c @@ -749,7 +749,8 @@ txn_commit_async(struct txn *txn) if (txn_has_flag(txn, TXN_WAIT_ACK)) { int64_t lsn = req->rows[txn->n_applier_rows - 1]->lsn; - txn_limbo_assign_lsn(&txn_limbo, limbo_entry, lsn); + txn_limbo_assign_remote_lsn(&txn_limbo, limbo_entry, + lsn); } /* @@ -836,7 +837,8 @@ txn_commit(struct txn *txn) if (is_sync) { if (txn_has_flag(txn, TXN_WAIT_ACK)) { int64_t lsn = req->rows[req->n_rows - 1]->lsn; - txn_limbo_assign_lsn(&txn_limbo, limbo_entry, lsn); + txn_limbo_assign_local_lsn(&txn_limbo, limbo_entry, + lsn); /* Local WAL write is a first 'ACK'. */ txn_limbo_ack(&txn_limbo, txn_limbo.instance_id, lsn); } diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c index 71a47802a..e28432bfd 100644 --- a/src/box/txn_limbo.c +++ b/src/box/txn_limbo.c @@ -87,8 +87,7 @@ static inline void txn_limbo_remove(struct txn_limbo *limbo, struct txn_limbo_entry *entry) { assert(!rlist_empty(&entry->in_queue)); - assert(rlist_first_entry(&limbo->queue, struct txn_limbo_entry, - in_queue) == entry); + assert(txn_limbo_first_entry(limbo) == entry); (void) limbo; rlist_del_entry(entry, in_queue); } @@ -97,8 +96,7 @@ static inline void txn_limbo_pop(struct txn_limbo *limbo, struct txn_limbo_entry *entry) { assert(!rlist_empty(&entry->in_queue)); - assert(rlist_last_entry(&limbo->queue, struct txn_limbo_entry, - in_queue) == entry); + assert(txn_limbo_last_entry(limbo) == entry); assert(entry->is_rollback); (void) limbo; rlist_del_entry(entry, in_queue); @@ -119,10 +117,11 @@ txn_limbo_abort(struct txn_limbo *limbo, struct txn_limbo_entry *entry) } void -txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry, - int64_t lsn) +txn_limbo_assign_remote_lsn(struct txn_limbo *limbo, + struct txn_limbo_entry *entry, int64_t lsn) { assert(limbo->instance_id != REPLICA_ID_NIL); + assert(limbo->instance_id != instance_id); assert(entry->lsn == -1); assert(lsn > 0); assert(txn_has_flag(entry->txn, TXN_WAIT_ACK)); @@ -130,27 +129,30 @@ txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry, entry->lsn = lsn; } -static bool -txn_limbo_check_complete(struct txn_limbo *limbo, struct txn_limbo_entry *entry) +void +txn_limbo_assign_local_lsn(struct txn_limbo *limbo, + struct txn_limbo_entry *entry, int64_t lsn) { - if (txn_limbo_entry_is_complete(entry)) - return true; + assert(limbo->instance_id != REPLICA_ID_NIL); + assert(limbo->instance_id == instance_id); + assert(entry->lsn == -1); + assert(lsn > 0); + assert(txn_has_flag(entry->txn, TXN_WAIT_ACK)); + (void) limbo; + entry->lsn = lsn; /* - * Async transaction can't complete itself. It is always - * completed by a previous sync transaction. + * The entry just got its LSN after a WAL write. 
It could + * happen that this LSN was already ACKed by some + * replicas. Update the ACK counter to take them into + * account. */ - if (!txn_has_flag(entry->txn, TXN_WAIT_ACK)) - return false; struct vclock_iterator iter; vclock_iterator_init(&iter, &limbo->vclock); int ack_count = 0; - int64_t lsn = entry->lsn; vclock_foreach(&iter, vc) ack_count += vc.lsn >= lsn; assert(ack_count >= entry->ack_count); entry->ack_count = ack_count; - entry->is_commit = ack_count >= replication_synchro_quorum; - return entry->is_commit; ================================================================================ The changes above are motivated by a bug I found during stress testing (by running the same test in parallel in test-run in 10-15 processes). The bug was a crash happening in case a transaction was replicated and ACKed earlier than WAL thread responded ok to TX thread. Then CONFIRM wasn't written at all. ================================================================================ } static int @@ -161,7 +163,7 @@ txn_limbo_wait_complete(struct txn_limbo *limbo, struct txn_limbo_entry *entry) { struct txn *txn = entry->txn; assert(entry->lsn > 0 || !txn_has_flag(entry->txn, TXN_WAIT_ACK)); - if (txn_limbo_check_complete(limbo, entry)) + if (txn_limbo_entry_is_complete(entry)) goto complete; assert(!txn_has_flag(txn, TXN_IS_DONE)); @@ -226,7 +229,26 @@ complete: diag_set(ClientError, ER_SYNC_ROLLBACK); return -1; } - txn_limbo_remove(limbo, entry); + /* + * The entry might be not the first in the limbo. It + * happens when there was a sync transaction and async + * transaction. The sync and async went to WAL. After sync + * WAL write is done, it may be already ACKed by the + * needed replica count. Now it marks self as committed + * and does the same for the next async txn. Then it + * starts writing CONFIRM. During that the async + * transaction finishes its WAL write, sees it is + * committed and ends up here. Not being the first + * transaction in the limbo. + */ + while (!rlist_empty(&entry->in_queue) && + txn_limbo_first_entry(limbo) != entry) { + bool cancellable = fiber_set_cancellable(false); + fiber_yield(); + fiber_set_cancellable(cancellable); + } + if (!rlist_empty(&entry->in_queue)) + txn_limbo_remove(limbo, entry); txn_clear_flag(txn, TXN_WAIT_SYNC); txn_clear_flag(txn, TXN_WAIT_ACK); return 0; @@ -257,7 +279,7 @@ txn_limbo_write_confirm_rollback(struct txn_limbo *limbo, int64_t lsn, * the last "safe" lsn is lsn - 1. */ res = xrow_encode_rollback(&row, &txn->region, - limbo->instance_id, lsn - 1); + limbo->instance_id, lsn); ================================================================================ I asked Sergey to do that, but he temporarily left. There wasn't no a bug, just inconsistency. For CONFIRM we use inclusive LSN - it commits all <= LSN. But for ROLLBACK we used exclusive LSN - it rolls back all > LSN. This is strange. So I made ROLLBACK LSN inclusive too. This is one step towards https://github.com/tarantool/tarantool/issues/5151. 
================================================================================ } if (res == -1) goto rollback; @@ -342,7 +364,7 @@ txn_limbo_read_rollback(struct txn_limbo *limbo, int64_t lsn) rlist_foreach_entry_reverse(e, &limbo->queue, in_queue) { if (!txn_has_flag(e->txn, TXN_WAIT_ACK)) continue; - if (e->lsn <= lsn) + if (e->lsn < lsn) break; last_rollback = e; } @@ -542,7 +564,7 @@ txn_limbo_force_empty(struct txn_limbo *limbo, int64_t confirm_lsn) } if (rollback != NULL) { txn_limbo_write_rollback(limbo, rollback->lsn); - txn_limbo_read_rollback(limbo, rollback->lsn - 1); + txn_limbo_read_rollback(limbo, rollback->lsn); } } diff --git a/src/box/txn_limbo.h b/src/box/txn_limbo.h index 1ee416231..88614d4a6 100644 --- a/src/box/txn_limbo.h +++ b/src/box/txn_limbo.h @@ -158,13 +158,21 @@ void txn_limbo_abort(struct txn_limbo *limbo, struct txn_limbo_entry *entry); /** - * Assign local LSN to the limbo entry. That happens when the - * transaction is added to the limbo, writes to WAL, and gets an - * LSN. + * Assign a remote LSN to a limbo entry. That happens when a + * remote transaction is added to the limbo and starts waiting for + * a confirm. */ void -txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry, - int64_t lsn); +txn_limbo_assign_remote_lsn(struct txn_limbo *limbo, + struct txn_limbo_entry *entry, int64_t lsn); + +/** + * Assign a local LSN to a limbo entry. That happens when a local + * transaction is written to WAL. + */ +void +txn_limbo_assign_local_lsn(struct txn_limbo *limbo, + struct txn_limbo_entry *entry, int64_t lsn); /** * Ack all transactions up to the given LSN on behalf of the diff --git a/src/box/xrow.h b/src/box/xrow.h index 7e6a4aceb..b325213e6 100644 --- a/src/box/xrow.h +++ b/src/box/xrow.h @@ -246,7 +246,7 @@ xrow_decode_confirm(struct xrow_header *row, uint32_t *replica_id, int64_t *lsn) * @param row xrow header. * @param region Region to use to encode the rollback body. * @param replica_id master's instance id. - * @param lsn lsn to rollback to. + * @param lsn lsn to rollback from, including it. * @retval -1 on error. * @retval 0 success. */ diff --git a/test/replication/qsync_basic.result b/test/replication/qsync_basic.result index 6d1624798..6b55a0e5e 100644 --- a/test/replication/qsync_basic.result +++ b/test/replication/qsync_basic.result @@ -199,7 +199,7 @@ box.cfg{replication_timeout = 1000, replication_synchro_timeout = 1000} -- Commit something non-sync. So as applier writer fiber would -- flush the pending heartbeat and go to sleep with the new huge -- replication timeout. -s = box.schema.create_space('test') +s = box.schema.create_space('test', {engine = engine}) | --- | ... pk = s:create_index('pk') @@ -309,7 +309,7 @@ test_run:switch('default') box.cfg{replication_synchro_timeout = 1000, replication_synchro_quorum = 2} | --- | ... -_ = box.schema.create_space('locallocal', {is_local = true}) +_ = box.schema.create_space('locallocal', {is_local = true, engine = engine}) | --- | ... _ = _:create_index('pk') @@ -551,6 +551,9 @@ test_run:switch('default') | --- | - true | ... +box.cfg{replication_synchro_timeout = 1000} ================================================================================ There was used timeout from the previous testcase, < 1 second. Was flaky. ================================================================================ + | --- + | ... ok, err = nil | --- | ... 
diff --git a/test/replication/qsync_basic.test.lua b/test/replication/qsync_basic.test.lua index 384b3593c..dcd1d6c76 100644 --- a/test/replication/qsync_basic.test.lua +++ b/test/replication/qsync_basic.test.lua @@ -83,7 +83,7 @@ box.cfg{replication_timeout = 1000, replication_synchro_timeout = 1000} -- Commit something non-sync. So as applier writer fiber would -- flush the pending heartbeat and go to sleep with the new huge -- replication timeout. -s = box.schema.create_space('test') +s = box.schema.create_space('test', {engine = engine}) pk = s:create_index('pk') s:replace{1} -- Now commit something sync. It should return immediately even @@ -123,7 +123,7 @@ box.space.sync:select{6} -- test_run:switch('default') box.cfg{replication_synchro_timeout = 1000, replication_synchro_quorum = 2} -_ = box.schema.create_space('locallocal', {is_local = true}) +_ = box.schema.create_space('locallocal', {is_local = true, engine = engine}) _ = _:create_index('pk') -- Propagate local vclock to some insane value to ensure it won't -- affect anything. @@ -217,6 +217,7 @@ box.space.sync:select{11} -- Test it is possible to early ACK a transaction with a new quorum. test_run:switch('default') +box.cfg{replication_synchro_timeout = 1000} ok, err = nil f = fiber.create(function() \ ok, err = pcall(box.space.sync.insert, box.space.sync, {12}) \ diff --git a/test/replication/qsync_snapshots.result b/test/replication/qsync_snapshots.result index 61cb7164b..2a126087a 100644 --- a/test/replication/qsync_snapshots.result +++ b/test/replication/qsync_snapshots.result @@ -48,7 +48,7 @@ test_run:switch('default') | --- | - true | ... -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000} ================================================================================ Too small timeout. The test assumed it doesn't fail, but 0.1 is quite easy to fail. Especially when runs in parallel. The same for some other fixes below. ================================================================================ | --- | ... _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) @@ -86,7 +86,7 @@ test_run:switch('default') | --- | - true | ... -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000} | --- | ... _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) @@ -112,58 +112,9 @@ box.space.sync:select{} -- 1 | --- | - - [1] | ... -box.snapshot() - | --- - | - ok - | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... --- Testcase cleanup. -test_run:switch('default') - | --- - | - true - | ... -box.space.sync:drop() +box.cfg{replication_synchro_timeout=1000} | --- | ... - --- [RFC, Snapshot generation] rolled back operations are not snapshotted. --- Testcase setup. -test_run:switch('default') - | --- - | - true - | ... -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} - | --- - | ... -_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) - | --- - | ... -_ = box.space.sync:create_index('pk') - | --- - | ... --- Testcase body. -box.space.sync:insert{1} - | --- - | - [1] - | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... -test_run:switch('default') - | --- - | - true - | ... -box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.1} - | --- - | ... 
-box.space.sync:insert{2} - | --- - | - error: Quorum collection for a synchronous transaction is timed out - | ... box.snapshot() | --- | - ok @@ -172,14 +123,6 @@ box.space.sync:select{} -- 1 | --- | - - [1] | ... -test_run:switch('replica') - | --- - | - true - | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... -- Testcase cleanup. test_run:switch('default') | --- @@ -191,11 +134,40 @@ box.space.sync:drop() -- [RFC, Snapshot generation] snapshot started on master, then rollback -- arrived, expected snapshot abort. +-- The test is temporary blocked on 5146 due to a crash when local +-- WAL write fails inside the WAL thread. Since this is the only +-- way to cause rollback of the transaction used in a snapshot +-- without triggering snapshot timeout. + +-- test_run:switch('default') +-- box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +-- _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +-- _ = box.space.sync:create_index('pk') +-- -- Testcase body. +-- box.space.sync:insert{1} +-- box.space.sync:select{} -- 1 +-- test_run:switch('default') +-- test_run:cmd("setopt delimiter ';'") +-- _ = fiber.create(function() +-- box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2} +-- box.space.sync:insert{2} +-- end); +-- test_run:cmd("setopt delimiter ''"); +-- box.snapshot() -- abort +-- box.space.sync:select{} -- 1 +-- test_run:switch('replica') +-- box.space.sync:select{} -- 1 +-- -- Testcase cleanup. +-- test_run:switch('default') +-- box.space.sync:drop() + +-- [RFC, Snapshot generation] snapshot started on replica, then rollback +-- arrived, expected snapshot abort. test_run:switch('default') | --- | - true | ... -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000} | --- | ... _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) @@ -204,128 +176,85 @@ _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) _ = box.space.sync:create_index('pk') | --- | ... + -- Testcase body. -box.space.sync:insert{1} - | --- - | - [1] - | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... test_run:switch('default') | --- | - true | ... -test_run:cmd("setopt delimiter ';'") - | --- - | - true - | ... -_ = fiber.create(function() - box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2} - box.space.sync:insert{2} -end); +box.cfg{replication_synchro_quorum=BROKEN_QUORUM} ================================================================================ 2 seconds was too long and was flaky. I made it faster and more stable using event-oriented instead of time oriented things. ================================================================================ | --- | ... -test_run:cmd("setopt delimiter ''"); +ok, err = nil | --- - | - true - | ... -box.snapshot() -- abort - | --- - | - error: A rollback for a synchronous transaction is received | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... -test_run:switch('replica') - | --- - | - true - | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... --- Testcase cleanup. -test_run:switch('default') - | --- - | - true - | ... -box.space.sync:drop() +f = fiber.create(function() \ + ok, err = pcall(box.space.sync.insert, box.space.sync, {1}) \ +end) | --- | ... --- [RFC, Snapshot generation] snapshot started on replica, then rollback --- arrived, expected snapshot abort. 
-test_run:switch('default') +test_run:switch('replica') | --- | - true | ... -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +fiber = require('fiber') | --- | ... -_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +box.cfg{replication_synchro_timeout=1000} | --- | ... -_ = box.space.sync:create_index('pk') +ok, err = nil | --- | ... --- Testcase body. -box.space.sync:insert{1} +f = fiber.create(function() ok, err = pcall(box.snapshot) end) | --- - | - [1] | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... -test_run:switch('replica') + +test_run:switch('default') | --- | - true | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... -test_run:switch('default') +box.cfg{replication_synchro_timeout=0.0001} | --- - | - true | ... -test_run:cmd("setopt delimiter ';'") +test_run:wait_cond(function() return f:status() == 'dead' end) | --- | - true | ... -_ = fiber.create(function() - box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2} - box.space.sync:insert{2} -end); +ok, err | --- + | - false + | - Quorum collection for a synchronous transaction is timed out | ... -test_run:cmd("setopt delimiter ''"); + +test_run:switch('replica') | --- | - true | ... -test_run:switch('replica') +test_run:wait_cond(function() return f:status() == 'dead' end) | --- | - true | ... -box.snapshot() -- abort +ok, err | --- - | - error: A rollback for a synchronous transaction is received + | - false + | - A rollback for a synchronous transaction is received | ... -box.space.sync:select{} -- 1 +box.space.sync:select{} | --- - | - - [1] + | - [] | ... + test_run:switch('default') | --- | - true | ... -box.space.sync:select{} -- 1 +box.space.sync:select{} | --- - | - - [1] + | - [] | ... + -- Testcase cleanup. test_run:switch('default') | --- diff --git a/test/replication/qsync_snapshots.test.lua b/test/replication/qsync_snapshots.test.lua index b5990bce7..0db61da95 100644 --- a/test/replication/qsync_snapshots.test.lua +++ b/test/replication/qsync_snapshots.test.lua @@ -20,7 +20,7 @@ test_run:cmd('start server replica with wait=True, wait_load=True') -- expected success. -- Testcase setup. test_run:switch('default') -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000} _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) _ = box.space.sync:create_index('pk') -- Testcase body. @@ -35,7 +35,7 @@ box.space.sync:drop() -- expected success. -- Testcase setup. test_run:switch('default') -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000} _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) _ = box.space.sync:create_index('pk') -- Testcase body. @@ -43,79 +43,76 @@ box.space.sync:insert{1} box.space.sync:select{} -- 1 test_run:switch('replica') box.space.sync:select{} -- 1 +box.cfg{replication_synchro_timeout=1000} box.snapshot() box.space.sync:select{} -- 1 -- Testcase cleanup. test_run:switch('default') box.space.sync:drop() --- [RFC, Snapshot generation] rolled back operations are not snapshotted. --- Testcase setup. -test_run:switch('default') -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} -_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) -_ = box.space.sync:create_index('pk') --- Testcase body. 
-box.space.sync:insert{1} -box.space.sync:select{} -- 1 -test_run:switch('default') -box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.1} -box.space.sync:insert{2} -box.snapshot() -box.space.sync:select{} -- 1 -test_run:switch('replica') -box.space.sync:select{} -- 1 --- Testcase cleanup. -test_run:switch('default') -box.space.sync:drop() - -- [RFC, Snapshot generation] snapshot started on master, then rollback -- arrived, expected snapshot abort. -test_run:switch('default') -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} -_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) -_ = box.space.sync:create_index('pk') --- Testcase body. -box.space.sync:insert{1} -box.space.sync:select{} -- 1 -test_run:switch('default') -test_run:cmd("setopt delimiter ';'") -_ = fiber.create(function() - box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2} - box.space.sync:insert{2} -end); -test_run:cmd("setopt delimiter ''"); -box.snapshot() -- abort -box.space.sync:select{} -- 1 -test_run:switch('replica') -box.space.sync:select{} -- 1 --- Testcase cleanup. -test_run:switch('default') -box.space.sync:drop() +-- The test is temporary blocked on 5146 due to a crash when local +-- WAL write fails inside the WAL thread. Since this is the only +-- way to cause rollback of the transaction used in a snapshot +-- without triggering snapshot timeout. + +-- test_run:switch('default') +-- box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +-- _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +-- _ = box.space.sync:create_index('pk') +-- -- Testcase body. +-- box.space.sync:insert{1} +-- box.space.sync:select{} -- 1 +-- test_run:switch('default') +-- test_run:cmd("setopt delimiter ';'") +-- _ = fiber.create(function() +-- box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2} +-- box.space.sync:insert{2} +-- end); +-- test_run:cmd("setopt delimiter ''"); +-- box.snapshot() -- abort +-- box.space.sync:select{} -- 1 +-- test_run:switch('replica') +-- box.space.sync:select{} -- 1 +-- -- Testcase cleanup. +-- test_run:switch('default') +-- box.space.sync:drop() -- [RFC, Snapshot generation] snapshot started on replica, then rollback -- arrived, expected snapshot abort. test_run:switch('default') -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000} _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) _ = box.space.sync:create_index('pk') + -- Testcase body. 
-box.space.sync:insert{1} -box.space.sync:select{} -- 1 +test_run:switch('default') +box.cfg{replication_synchro_quorum=BROKEN_QUORUM} +ok, err = nil +f = fiber.create(function() \ + ok, err = pcall(box.space.sync.insert, box.space.sync, {1}) \ +end) + test_run:switch('replica') -box.space.sync:select{} -- 1 +fiber = require('fiber') +box.cfg{replication_synchro_timeout=1000} +ok, err = nil +f = fiber.create(function() ok, err = pcall(box.snapshot) end) + test_run:switch('default') -test_run:cmd("setopt delimiter ';'") -_ = fiber.create(function() - box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2} - box.space.sync:insert{2} -end); -test_run:cmd("setopt delimiter ''"); +box.cfg{replication_synchro_timeout=0.0001} +test_run:wait_cond(function() return f:status() == 'dead' end) +ok, err + test_run:switch('replica') -box.snapshot() -- abort -box.space.sync:select{} -- 1 +test_run:wait_cond(function() return f:status() == 'dead' end) +ok, err +box.space.sync:select{} + test_run:switch('default') -box.space.sync:select{} -- 1 +box.space.sync:select{} + -- Testcase cleanup. test_run:switch('default') box.space.sync:drop() diff --git a/test/unit/snap_quorum_delay.cc b/test/unit/snap_quorum_delay.cc index 8d50cfb27..ad0563345 100644 --- a/test/unit/snap_quorum_delay.cc +++ b/test/unit/snap_quorum_delay.cc @@ -78,7 +78,7 @@ enum process_type { * (to push a transaction to the limbo and simulate confirm). */ const int fake_lsn = 1; -const int instace_id = 1; +extern "C" int instance_id; const int relay_id = 2; int @@ -109,7 +109,7 @@ txn_process_func(va_list ap) * and call txn_commit (or another) later. */ struct txn_limbo_entry *entry = txn_limbo_append(&txn_limbo, - instace_id, txn); + instance_id, txn); /* * The trigger is used to verify that the transaction has been * completed. @@ -130,7 +130,7 @@ txn_process_func(va_list ap) unreachable(); } - txn_limbo_assign_lsn(&txn_limbo, entry, fake_lsn); + txn_limbo_assign_local_lsn(&txn_limbo, entry, fake_lsn); txn_limbo_ack(&txn_limbo, txn_limbo.instance_id, fake_lsn); txn_limbo_wait_complete(&txn_limbo, entry); @@ -239,6 +239,7 @@ main(void) fiber_init(fiber_c_invoke); gc_init(); txn_limbo_init(); + instance_id = 1; struct fiber *main_fiber = fiber_new("main", test_snap_delay); assert(main_fiber != NULL);
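================================================================================
Returning to the first fix above (the crash when a transaction got replicated
and ACKed before the local WAL write finished): the ACK recount done by
txn_limbo_assign_local_lsn() can be modeled by the following self-contained
sketch. It is not the real code - the names are illustrative and the real
version walks the limbo vclock instead of a plain array - but it shows why
the recount is needed.

#include <stdint.h>

/*
 * Once the local WAL write assigns the entry its LSN, count the
 * replicas whose last reported LSN already covers it. Without the
 * recount an early ACK would be lost, the quorum might never be
 * reached, and CONFIRM would never be written.
 */
static int
recount_acks(const int64_t *replica_lsn, int n_replicas, int64_t entry_lsn)
{
	int ack_count = 0;
	for (int i = 0; i < n_replicas; i++) {
		if (replica_lsn[i] >= entry_lsn)
			ack_count++;
	}
	return ack_count;
}

For example, with replica LSNs {5, 4, 2} and an entry that just received
LSN 4 the recount returns 2, so with replication_synchro_quorum = 2 the
entry is already committed at the moment of assignment and CONFIRM can be
written right away.
================================================================================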