[Tarantool-patches] [PATCH v2 00/19] Sync replication
Vladislav Shpilevoy
v.shpilevoy at tarantool.org
Fri Jul 10 03:50:48 MSK 2020
Here is a pack of final fixes before the branch goes to master.
There are lots of them, but I tried to explain each one
individually. There is no point in making them separate commits,
since they are all squashed into the older commits anyway.
================================================================================
diff --git a/src/box/txn.c b/src/box/txn.c
index ffc2ac6a5..a2df23833 100644
--- a/src/box/txn.c
+++ b/src/box/txn.c
@@ -749,7 +749,8 @@ txn_commit_async(struct txn *txn)
if (txn_has_flag(txn, TXN_WAIT_ACK)) {
int64_t lsn = req->rows[txn->n_applier_rows - 1]->lsn;
- txn_limbo_assign_lsn(&txn_limbo, limbo_entry, lsn);
+ txn_limbo_assign_remote_lsn(&txn_limbo, limbo_entry,
+ lsn);
}
/*
@@ -836,7 +837,8 @@ txn_commit(struct txn *txn)
if (is_sync) {
if (txn_has_flag(txn, TXN_WAIT_ACK)) {
int64_t lsn = req->rows[req->n_rows - 1]->lsn;
- txn_limbo_assign_lsn(&txn_limbo, limbo_entry, lsn);
+ txn_limbo_assign_local_lsn(&txn_limbo, limbo_entry,
+ lsn);
/* Local WAL write is a first 'ACK'. */
txn_limbo_ack(&txn_limbo, txn_limbo.instance_id, lsn);
}
diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c
index 71a47802a..e28432bfd 100644
--- a/src/box/txn_limbo.c
+++ b/src/box/txn_limbo.c
@@ -87,8 +87,7 @@ static inline void
txn_limbo_remove(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
{
assert(!rlist_empty(&entry->in_queue));
- assert(rlist_first_entry(&limbo->queue, struct txn_limbo_entry,
- in_queue) == entry);
+ assert(txn_limbo_first_entry(limbo) == entry);
(void) limbo;
rlist_del_entry(entry, in_queue);
}
@@ -97,8 +96,7 @@ static inline void
txn_limbo_pop(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
{
assert(!rlist_empty(&entry->in_queue));
- assert(rlist_last_entry(&limbo->queue, struct txn_limbo_entry,
- in_queue) == entry);
+ assert(txn_limbo_last_entry(limbo) == entry);
assert(entry->is_rollback);
(void) limbo;
rlist_del_entry(entry, in_queue);
@@ -119,10 +117,11 @@ txn_limbo_abort(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
}
void
-txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry,
- int64_t lsn)
+txn_limbo_assign_remote_lsn(struct txn_limbo *limbo,
+ struct txn_limbo_entry *entry, int64_t lsn)
{
assert(limbo->instance_id != REPLICA_ID_NIL);
+ assert(limbo->instance_id != instance_id);
assert(entry->lsn == -1);
assert(lsn > 0);
assert(txn_has_flag(entry->txn, TXN_WAIT_ACK));
@@ -130,27 +129,30 @@ txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry,
entry->lsn = lsn;
}
-static bool
-txn_limbo_check_complete(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
+void
+txn_limbo_assign_local_lsn(struct txn_limbo *limbo,
+ struct txn_limbo_entry *entry, int64_t lsn)
{
- if (txn_limbo_entry_is_complete(entry))
- return true;
+ assert(limbo->instance_id != REPLICA_ID_NIL);
+ assert(limbo->instance_id == instance_id);
+ assert(entry->lsn == -1);
+ assert(lsn > 0);
+ assert(txn_has_flag(entry->txn, TXN_WAIT_ACK));
+ (void) limbo;
+ entry->lsn = lsn;
/*
- * Async transaction can't complete itself. It is always
- * completed by a previous sync transaction.
+ * The entry just got its LSN after a WAL write. It could
+ * happen that this LSN was already ACKed by some
+ * replicas. Update the ACK counter to take them into
+ * account.
*/
- if (!txn_has_flag(entry->txn, TXN_WAIT_ACK))
- return false;
struct vclock_iterator iter;
vclock_iterator_init(&iter, &limbo->vclock);
int ack_count = 0;
- int64_t lsn = entry->lsn;
vclock_foreach(&iter, vc)
ack_count += vc.lsn >= lsn;
assert(ack_count >= entry->ack_count);
entry->ack_count = ack_count;
- entry->is_commit = ack_count >= replication_synchro_quorum;
- return entry->is_commit;
================================================================================
The changes above are motivated by a bug I found during stress
testing (by running the same test in parallel in 10-15 test-run
processes).
The bug was a crash happening when a transaction was replicated
and ACKed before the WAL thread responded ok to the TX thread. In
that case CONFIRM wasn't written at all.
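To illustrate the recount done in txn_limbo_assign_local_lsn()
above, below is a minimal standalone sketch of the same loop, with
a toy fixed-size vclock in place of Tarantool's struct vclock; the
replica LSNs and the quorum value are made up for the example.

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Toy stand-in for struct vclock: one LSN slot per replica. */
    #define TOY_REPLICA_COUNT 3

    struct toy_vclock {
    	int64_t lsn[TOY_REPLICA_COUNT];
    };

    /*
     * Count how many replicas have already acknowledged the given
     * LSN. This mirrors the vclock_foreach() loop above: ACKs
     * which arrived before the local WAL write finished are not
     * lost - they are simply recounted once the entry finally
     * gets its LSN.
     */
    static int
    toy_recount_acks(const struct toy_vclock *vclock, int64_t lsn)
    {
    	int ack_count = 0;
    	for (int i = 0; i < TOY_REPLICA_COUNT; i++)
    		ack_count += vclock->lsn[i] >= lsn;
    	return ack_count;
    }

    int
    main(void)
    {
    	/*
    	 * Two replicas managed to ACK LSN 10 while the local WAL
    	 * write was still in progress. Without the recount the
    	 * entry would keep ack_count == 0 and CONFIRM would
    	 * never be written.
    	 */
    	struct toy_vclock vclock = {{10, 10, 9}};
    	int quorum = 2; /* made-up replication_synchro_quorum */
    	int ack_count = toy_recount_acks(&vclock, 10);
    	assert(ack_count == 2);
    	printf("committed: %s\n", ack_count >= quorum ? "yes" : "no");
    	return 0;
    }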
================================================================================
}
static int
@@ -161,7 +163,7 @@ txn_limbo_wait_complete(struct txn_limbo *limbo, struct txn_limbo_entry *entry)
{
struct txn *txn = entry->txn;
assert(entry->lsn > 0 || !txn_has_flag(entry->txn, TXN_WAIT_ACK));
- if (txn_limbo_check_complete(limbo, entry))
+ if (txn_limbo_entry_is_complete(entry))
goto complete;
assert(!txn_has_flag(txn, TXN_IS_DONE));
@@ -226,7 +229,26 @@ complete:
diag_set(ClientError, ER_SYNC_ROLLBACK);
return -1;
}
- txn_limbo_remove(limbo, entry);
+ /*
+ * The entry might be not the first in the limbo. It
+ * happens when there was a sync transaction and async
+ * transaction. The sync and async went to WAL. After sync
+ * WAL write is done, it may be already ACKed by the
+ * needed replica count. Now it marks self as committed
+ * and does the same for the next async txn. Then it
+ * starts writing CONFIRM. During that the async
+ * transaction finishes its WAL write, sees it is
+ * committed and ends up here. Not being the first
+ * transaction in the limbo.
+ */
+ while (!rlist_empty(&entry->in_queue) &&
+ txn_limbo_first_entry(limbo) != entry) {
+ bool cancellable = fiber_set_cancellable(false);
+ fiber_yield();
+ fiber_set_cancellable(cancellable);
+ }
+ if (!rlist_empty(&entry->in_queue))
+ txn_limbo_remove(limbo, entry);
txn_clear_flag(txn, TXN_WAIT_SYNC);
txn_clear_flag(txn, TXN_WAIT_ACK);
return 0;
@@ -257,7 +279,7 @@ txn_limbo_write_confirm_rollback(struct txn_limbo *limbo, int64_t lsn,
* the last "safe" lsn is lsn - 1.
*/
res = xrow_encode_rollback(&row, &txn->region,
- limbo->instance_id, lsn - 1);
+ limbo->instance_id, lsn);
================================================================================
I asked Sergey to do that, but he temporarily left. There wasn't
a bug, just an inconsistency. For CONFIRM we use an inclusive LSN:
it commits everything <= LSN. But for ROLLBACK we used an exclusive
LSN: it rolled back everything > LSN. This is strange, so I made
the ROLLBACK LSN inclusive too. This is one step
towards https://github.com/tarantool/tarantool/issues/5151.
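To make the new semantics concrete, here is a minimal standalone
sketch (not the actual limbo code) of both predicates: CONFIRM with
LSN N commits every entry with lsn <= N, and ROLLBACK with LSN N
now rolls back every entry with lsn >= N, so CONFIRM{N} followed by
ROLLBACK{N+1} partitions the queue with no gap and no off-by-one
adjustment at the call sites.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* CONFIRM is inclusive: commits all entries with lsn <= confirm_lsn. */
    static bool
    is_confirmed(int64_t entry_lsn, int64_t confirm_lsn)
    {
    	return entry_lsn <= confirm_lsn;
    }

    /*
     * ROLLBACK is now inclusive as well: it rolls back all
     * entries with lsn >= rollback_lsn. Before this change it was
     * exclusive (lsn > rollback_lsn), which forced callers to
     * pass lsn - 1.
     */
    static bool
    is_rolled_back(int64_t entry_lsn, int64_t rollback_lsn)
    {
    	return entry_lsn >= rollback_lsn;
    }

    int
    main(void)
    {
    	int64_t lsns[] = {3, 4, 5, 6, 7};
    	/* CONFIRM{5} commits 3, 4, 5; ROLLBACK{6} undoes 6, 7. */
    	for (int i = 0; i < 5; i++) {
    		printf("lsn %lld: confirmed=%d rolled_back=%d\n",
    		       (long long)lsns[i],
    		       is_confirmed(lsns[i], 5),
    		       is_rolled_back(lsns[i], 6));
    	}
    	return 0;
    }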
================================================================================
}
if (res == -1)
goto rollback;
@@ -342,7 +364,7 @@ txn_limbo_read_rollback(struct txn_limbo *limbo, int64_t lsn)
rlist_foreach_entry_reverse(e, &limbo->queue, in_queue) {
if (!txn_has_flag(e->txn, TXN_WAIT_ACK))
continue;
- if (e->lsn <= lsn)
+ if (e->lsn < lsn)
break;
last_rollback = e;
}
@@ -542,7 +564,7 @@ txn_limbo_force_empty(struct txn_limbo *limbo, int64_t confirm_lsn)
}
if (rollback != NULL) {
txn_limbo_write_rollback(limbo, rollback->lsn);
- txn_limbo_read_rollback(limbo, rollback->lsn - 1);
+ txn_limbo_read_rollback(limbo, rollback->lsn);
}
}
diff --git a/src/box/txn_limbo.h b/src/box/txn_limbo.h
index 1ee416231..88614d4a6 100644
--- a/src/box/txn_limbo.h
+++ b/src/box/txn_limbo.h
@@ -158,13 +158,21 @@ void
txn_limbo_abort(struct txn_limbo *limbo, struct txn_limbo_entry *entry);
/**
- * Assign local LSN to the limbo entry. That happens when the
- * transaction is added to the limbo, writes to WAL, and gets an
- * LSN.
+ * Assign a remote LSN to a limbo entry. That happens when a
+ * remote transaction is added to the limbo and starts waiting for
+ * a confirm.
*/
void
-txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry,
- int64_t lsn);
+txn_limbo_assign_remote_lsn(struct txn_limbo *limbo,
+ struct txn_limbo_entry *entry, int64_t lsn);
+
+/**
+ * Assign a local LSN to a limbo entry. That happens when a local
+ * transaction is written to WAL.
+ */
+void
+txn_limbo_assign_local_lsn(struct txn_limbo *limbo,
+ struct txn_limbo_entry *entry, int64_t lsn);
/**
* Ack all transactions up to the given LSN on behalf of the
diff --git a/src/box/xrow.h b/src/box/xrow.h
index 7e6a4aceb..b325213e6 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -246,7 +246,7 @@ xrow_decode_confirm(struct xrow_header *row, uint32_t *replica_id, int64_t *lsn)
* @param row xrow header.
* @param region Region to use to encode the rollback body.
* @param replica_id master's instance id.
- * @param lsn lsn to rollback to.
+ * @param lsn lsn to rollback from, including it.
* @retval -1 on error.
* @retval 0 success.
*/
diff --git a/test/replication/qsync_basic.result b/test/replication/qsync_basic.result
index 6d1624798..6b55a0e5e 100644
--- a/test/replication/qsync_basic.result
+++ b/test/replication/qsync_basic.result
@@ -199,7 +199,7 @@ box.cfg{replication_timeout = 1000, replication_synchro_timeout = 1000}
-- Commit something non-sync. So as applier writer fiber would
-- flush the pending heartbeat and go to sleep with the new huge
-- replication timeout.
-s = box.schema.create_space('test')
+s = box.schema.create_space('test', {engine = engine})
| ---
| ...
pk = s:create_index('pk')
@@ -309,7 +309,7 @@ test_run:switch('default')
box.cfg{replication_synchro_timeout = 1000, replication_synchro_quorum = 2}
| ---
| ...
-_ = box.schema.create_space('locallocal', {is_local = true})
+_ = box.schema.create_space('locallocal', {is_local = true, engine = engine})
| ---
| ...
_ = _:create_index('pk')
@@ -551,6 +551,9 @@ test_run:switch('default')
| ---
| - true
| ...
+box.cfg{replication_synchro_timeout = 1000}
================================================================================
The timeout left over from the previous test case (< 1 second) was
used here, which made the test flaky.
================================================================================
+ | ---
+ | ...
ok, err = nil
| ---
| ...
diff --git a/test/replication/qsync_basic.test.lua b/test/replication/qsync_basic.test.lua
index 384b3593c..dcd1d6c76 100644
--- a/test/replication/qsync_basic.test.lua
+++ b/test/replication/qsync_basic.test.lua
@@ -83,7 +83,7 @@ box.cfg{replication_timeout = 1000, replication_synchro_timeout = 1000}
-- Commit something non-sync. So as applier writer fiber would
-- flush the pending heartbeat and go to sleep with the new huge
-- replication timeout.
-s = box.schema.create_space('test')
+s = box.schema.create_space('test', {engine = engine})
pk = s:create_index('pk')
s:replace{1}
-- Now commit something sync. It should return immediately even
@@ -123,7 +123,7 @@ box.space.sync:select{6}
--
test_run:switch('default')
box.cfg{replication_synchro_timeout = 1000, replication_synchro_quorum = 2}
-_ = box.schema.create_space('locallocal', {is_local = true})
+_ = box.schema.create_space('locallocal', {is_local = true, engine = engine})
_ = _:create_index('pk')
-- Propagate local vclock to some insane value to ensure it won't
-- affect anything.
@@ -217,6 +217,7 @@ box.space.sync:select{11}
-- Test it is possible to early ACK a transaction with a new quorum.
test_run:switch('default')
+box.cfg{replication_synchro_timeout = 1000}
ok, err = nil
f = fiber.create(function() \
ok, err = pcall(box.space.sync.insert, box.space.sync, {12}) \
diff --git a/test/replication/qsync_snapshots.result b/test/replication/qsync_snapshots.result
index 61cb7164b..2a126087a 100644
--- a/test/replication/qsync_snapshots.result
+++ b/test/replication/qsync_snapshots.result
@@ -48,7 +48,7 @@ test_run:switch('default')
| ---
| - true
| ...
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000}
================================================================================
The timeout was too small. The test assumed it wouldn't expire,
but 0.1 seconds is quite easy to exceed, especially when the tests
run in parallel. The same applies to some other fixes below.
================================================================================
| ---
| ...
_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
@@ -86,7 +86,7 @@ test_run:switch('default')
| ---
| - true
| ...
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000}
| ---
| ...
_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
@@ -112,58 +112,9 @@ box.space.sync:select{} -- 1
| ---
| - - [1]
| ...
-box.snapshot()
- | ---
- | - ok
- | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
--- Testcase cleanup.
-test_run:switch('default')
- | ---
- | - true
- | ...
-box.space.sync:drop()
+box.cfg{replication_synchro_timeout=1000}
| ---
| ...
-
--- [RFC, Snapshot generation] rolled back operations are not snapshotted.
--- Testcase setup.
-test_run:switch('default')
- | ---
- | - true
- | ...
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
- | ---
- | ...
-_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
- | ---
- | ...
-_ = box.space.sync:create_index('pk')
- | ---
- | ...
--- Testcase body.
-box.space.sync:insert{1}
- | ---
- | - [1]
- | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
-test_run:switch('default')
- | ---
- | - true
- | ...
-box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.1}
- | ---
- | ...
-box.space.sync:insert{2}
- | ---
- | - error: Quorum collection for a synchronous transaction is timed out
- | ...
box.snapshot()
| ---
| - ok
@@ -172,14 +123,6 @@ box.space.sync:select{} -- 1
| ---
| - - [1]
| ...
-test_run:switch('replica')
- | ---
- | - true
- | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
-- Testcase cleanup.
test_run:switch('default')
| ---
@@ -191,11 +134,40 @@ box.space.sync:drop()
-- [RFC, Snapshot generation] snapshot started on master, then rollback
-- arrived, expected snapshot abort.
+-- The test is temporary blocked on 5146 due to a crash when local
+-- WAL write fails inside the WAL thread. Since this is the only
+-- way to cause rollback of the transaction used in a snapshot
+-- without triggering snapshot timeout.
+
+-- test_run:switch('default')
+-- box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+-- _ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+-- _ = box.space.sync:create_index('pk')
+-- -- Testcase body.
+-- box.space.sync:insert{1}
+-- box.space.sync:select{} -- 1
+-- test_run:switch('default')
+-- test_run:cmd("setopt delimiter ';'")
+-- _ = fiber.create(function()
+-- box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2}
+-- box.space.sync:insert{2}
+-- end);
+-- test_run:cmd("setopt delimiter ''");
+-- box.snapshot() -- abort
+-- box.space.sync:select{} -- 1
+-- test_run:switch('replica')
+-- box.space.sync:select{} -- 1
+-- -- Testcase cleanup.
+-- test_run:switch('default')
+-- box.space.sync:drop()
+
+-- [RFC, Snapshot generation] snapshot started on replica, then rollback
+-- arrived, expected snapshot abort.
test_run:switch('default')
| ---
| - true
| ...
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000}
| ---
| ...
_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
@@ -204,128 +176,85 @@ _ = box.schema.space.create('sync', {is_sync=true, engine=engine})
_ = box.space.sync:create_index('pk')
| ---
| ...
+
-- Testcase body.
-box.space.sync:insert{1}
- | ---
- | - [1]
- | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
test_run:switch('default')
| ---
| - true
| ...
-test_run:cmd("setopt delimiter ';'")
- | ---
- | - true
- | ...
-_ = fiber.create(function()
- box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2}
- box.space.sync:insert{2}
-end);
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM}
================================================================================
The 2-second timeout was too long and still flaky. I made the test
faster and more stable by waiting on events (wait_cond on the
fiber status) instead of fixed timeouts.
================================================================================
| ---
| ...
-test_run:cmd("setopt delimiter ''");
+ok, err = nil
| ---
- | - true
- | ...
-box.snapshot() -- abort
- | ---
- | - error: A rollback for a synchronous transaction is received
| ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
-test_run:switch('replica')
- | ---
- | - true
- | ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
--- Testcase cleanup.
-test_run:switch('default')
- | ---
- | - true
- | ...
-box.space.sync:drop()
+f = fiber.create(function() \
+ ok, err = pcall(box.space.sync.insert, box.space.sync, {1}) \
+end)
| ---
| ...
--- [RFC, Snapshot generation] snapshot started on replica, then rollback
--- arrived, expected snapshot abort.
-test_run:switch('default')
+test_run:switch('replica')
| ---
| - true
| ...
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+fiber = require('fiber')
| ---
| ...
-_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+box.cfg{replication_synchro_timeout=1000}
| ---
| ...
-_ = box.space.sync:create_index('pk')
+ok, err = nil
| ---
| ...
--- Testcase body.
-box.space.sync:insert{1}
+f = fiber.create(function() ok, err = pcall(box.snapshot) end)
| ---
- | - [1]
| ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
-test_run:switch('replica')
+
+test_run:switch('default')
| ---
| - true
| ...
-box.space.sync:select{} -- 1
- | ---
- | - - [1]
- | ...
-test_run:switch('default')
+box.cfg{replication_synchro_timeout=0.0001}
| ---
- | - true
| ...
-test_run:cmd("setopt delimiter ';'")
+test_run:wait_cond(function() return f:status() == 'dead' end)
| ---
| - true
| ...
-_ = fiber.create(function()
- box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2}
- box.space.sync:insert{2}
-end);
+ok, err
| ---
+ | - false
+ | - Quorum collection for a synchronous transaction is timed out
| ...
-test_run:cmd("setopt delimiter ''");
+
+test_run:switch('replica')
| ---
| - true
| ...
-test_run:switch('replica')
+test_run:wait_cond(function() return f:status() == 'dead' end)
| ---
| - true
| ...
-box.snapshot() -- abort
+ok, err
| ---
- | - error: A rollback for a synchronous transaction is received
+ | - false
+ | - A rollback for a synchronous transaction is received
| ...
-box.space.sync:select{} -- 1
+box.space.sync:select{}
| ---
- | - - [1]
+ | - []
| ...
+
test_run:switch('default')
| ---
| - true
| ...
-box.space.sync:select{} -- 1
+box.space.sync:select{}
| ---
- | - - [1]
+ | - []
| ...
+
-- Testcase cleanup.
test_run:switch('default')
| ---
diff --git a/test/replication/qsync_snapshots.test.lua b/test/replication/qsync_snapshots.test.lua
index b5990bce7..0db61da95 100644
--- a/test/replication/qsync_snapshots.test.lua
+++ b/test/replication/qsync_snapshots.test.lua
@@ -20,7 +20,7 @@ test_run:cmd('start server replica with wait=True, wait_load=True')
-- expected success.
-- Testcase setup.
test_run:switch('default')
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000}
_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
_ = box.space.sync:create_index('pk')
-- Testcase body.
@@ -35,7 +35,7 @@ box.space.sync:drop()
-- expected success.
-- Testcase setup.
test_run:switch('default')
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000}
_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
_ = box.space.sync:create_index('pk')
-- Testcase body.
@@ -43,79 +43,76 @@ box.space.sync:insert{1}
box.space.sync:select{} -- 1
test_run:switch('replica')
box.space.sync:select{} -- 1
+box.cfg{replication_synchro_timeout=1000}
box.snapshot()
box.space.sync:select{} -- 1
-- Testcase cleanup.
test_run:switch('default')
box.space.sync:drop()
--- [RFC, Snapshot generation] rolled back operations are not snapshotted.
--- Testcase setup.
-test_run:switch('default')
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
-_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
-_ = box.space.sync:create_index('pk')
--- Testcase body.
-box.space.sync:insert{1}
-box.space.sync:select{} -- 1
-test_run:switch('default')
-box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.1}
-box.space.sync:insert{2}
-box.snapshot()
-box.space.sync:select{} -- 1
-test_run:switch('replica')
-box.space.sync:select{} -- 1
--- Testcase cleanup.
-test_run:switch('default')
-box.space.sync:drop()
-
-- [RFC, Snapshot generation] snapshot started on master, then rollback
-- arrived, expected snapshot abort.
-test_run:switch('default')
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
-_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
-_ = box.space.sync:create_index('pk')
--- Testcase body.
-box.space.sync:insert{1}
-box.space.sync:select{} -- 1
-test_run:switch('default')
-test_run:cmd("setopt delimiter ';'")
-_ = fiber.create(function()
- box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2}
- box.space.sync:insert{2}
-end);
-test_run:cmd("setopt delimiter ''");
-box.snapshot() -- abort
-box.space.sync:select{} -- 1
-test_run:switch('replica')
-box.space.sync:select{} -- 1
--- Testcase cleanup.
-test_run:switch('default')
-box.space.sync:drop()
+-- The test is temporary blocked on 5146 due to a crash when local
+-- WAL write fails inside the WAL thread. Since this is the only
+-- way to cause rollback of the transaction used in a snapshot
+-- without triggering snapshot timeout.
+
+-- test_run:switch('default')
+-- box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+-- _ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+-- _ = box.space.sync:create_index('pk')
+-- -- Testcase body.
+-- box.space.sync:insert{1}
+-- box.space.sync:select{} -- 1
+-- test_run:switch('default')
+-- test_run:cmd("setopt delimiter ';'")
+-- _ = fiber.create(function()
+-- box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2}
+-- box.space.sync:insert{2}
+-- end);
+-- test_run:cmd("setopt delimiter ''");
+-- box.snapshot() -- abort
+-- box.space.sync:select{} -- 1
+-- test_run:switch('replica')
+-- box.space.sync:select{} -- 1
+-- -- Testcase cleanup.
+-- test_run:switch('default')
+-- box.space.sync:drop()
-- [RFC, Snapshot generation] snapshot started on replica, then rollback
-- arrived, expected snapshot abort.
test_run:switch('default')
-box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000}
_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
_ = box.space.sync:create_index('pk')
+
-- Testcase body.
-box.space.sync:insert{1}
-box.space.sync:select{} -- 1
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=BROKEN_QUORUM}
+ok, err = nil
+f = fiber.create(function() \
+ ok, err = pcall(box.space.sync.insert, box.space.sync, {1}) \
+end)
+
test_run:switch('replica')
-box.space.sync:select{} -- 1
+fiber = require('fiber')
+box.cfg{replication_synchro_timeout=1000}
+ok, err = nil
+f = fiber.create(function() ok, err = pcall(box.snapshot) end)
+
test_run:switch('default')
-test_run:cmd("setopt delimiter ';'")
-_ = fiber.create(function()
- box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2}
- box.space.sync:insert{2}
-end);
-test_run:cmd("setopt delimiter ''");
+box.cfg{replication_synchro_timeout=0.0001}
+test_run:wait_cond(function() return f:status() == 'dead' end)
+ok, err
+
test_run:switch('replica')
-box.snapshot() -- abort
-box.space.sync:select{} -- 1
+test_run:wait_cond(function() return f:status() == 'dead' end)
+ok, err
+box.space.sync:select{}
+
test_run:switch('default')
-box.space.sync:select{} -- 1
+box.space.sync:select{}
+
-- Testcase cleanup.
test_run:switch('default')
box.space.sync:drop()
diff --git a/test/unit/snap_quorum_delay.cc b/test/unit/snap_quorum_delay.cc
index 8d50cfb27..ad0563345 100644
--- a/test/unit/snap_quorum_delay.cc
+++ b/test/unit/snap_quorum_delay.cc
@@ -78,7 +78,7 @@ enum process_type {
* (to push a transaction to the limbo and simulate confirm).
*/
const int fake_lsn = 1;
-const int instace_id = 1;
+extern "C" int instance_id;
const int relay_id = 2;
int
@@ -109,7 +109,7 @@ txn_process_func(va_list ap)
* and call txn_commit (or another) later.
*/
struct txn_limbo_entry *entry = txn_limbo_append(&txn_limbo,
- instace_id, txn);
+ instance_id, txn);
/*
* The trigger is used to verify that the transaction has been
* completed.
@@ -130,7 +130,7 @@ txn_process_func(va_list ap)
unreachable();
}
- txn_limbo_assign_lsn(&txn_limbo, entry, fake_lsn);
+ txn_limbo_assign_local_lsn(&txn_limbo, entry, fake_lsn);
txn_limbo_ack(&txn_limbo, txn_limbo.instance_id, fake_lsn);
txn_limbo_wait_complete(&txn_limbo, entry);
@@ -239,6 +239,7 @@ main(void)
fiber_init(fiber_c_invoke);
gc_init();
txn_limbo_init();
+ instance_id = 1;
struct fiber *main_fiber = fiber_new("main", test_snap_delay);
assert(main_fiber != NULL);