From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org> To: tarantool-patches@dev.tarantool.org, sergepetrenko@tarantool.org Subject: Re: [Tarantool-patches] [PATCH v2 00/19] Sync replication Date: Fri, 10 Jul 2020 02:50:48 +0200 [thread overview] Message-ID: <b61eaafd-6659-5aca-4918-61ee6944cf36@tarantool.org> (raw) In-Reply-To: <cover.1593472477.git.v.shpilevoy@tarantool.org> Here is a pack of final fixes before the branch goes to master. Lots of them, but I tried to explain them individually. Not point in making them separate commits since anyway they are all squashed into the older commits. ================================================================================ diff --git a/src/box/txn.c b/src/box/txn.c index ffc2ac6a5..a2df23833 100644 --- a/src/box/txn.c +++ b/src/box/txn.c @@ -749,7 +749,8 @@ txn_commit_async(struct txn *txn) if (txn_has_flag(txn, TXN_WAIT_ACK)) { int64_t lsn = req->rows[txn->n_applier_rows - 1]->lsn; - txn_limbo_assign_lsn(&txn_limbo, limbo_entry, lsn); + txn_limbo_assign_remote_lsn(&txn_limbo, limbo_entry, + lsn); } /* @@ -836,7 +837,8 @@ txn_commit(struct txn *txn) if (is_sync) { if (txn_has_flag(txn, TXN_WAIT_ACK)) { int64_t lsn = req->rows[req->n_rows - 1]->lsn; - txn_limbo_assign_lsn(&txn_limbo, limbo_entry, lsn); + txn_limbo_assign_local_lsn(&txn_limbo, limbo_entry, + lsn); /* Local WAL write is a first 'ACK'. */ txn_limbo_ack(&txn_limbo, txn_limbo.instance_id, lsn); } diff --git a/src/box/txn_limbo.c b/src/box/txn_limbo.c index 71a47802a..e28432bfd 100644 --- a/src/box/txn_limbo.c +++ b/src/box/txn_limbo.c @@ -87,8 +87,7 @@ static inline void txn_limbo_remove(struct txn_limbo *limbo, struct txn_limbo_entry *entry) { assert(!rlist_empty(&entry->in_queue)); - assert(rlist_first_entry(&limbo->queue, struct txn_limbo_entry, - in_queue) == entry); + assert(txn_limbo_first_entry(limbo) == entry); (void) limbo; rlist_del_entry(entry, in_queue); } @@ -97,8 +96,7 @@ static inline void txn_limbo_pop(struct txn_limbo *limbo, struct txn_limbo_entry *entry) { assert(!rlist_empty(&entry->in_queue)); - assert(rlist_last_entry(&limbo->queue, struct txn_limbo_entry, - in_queue) == entry); + assert(txn_limbo_last_entry(limbo) == entry); assert(entry->is_rollback); (void) limbo; rlist_del_entry(entry, in_queue); @@ -119,10 +117,11 @@ txn_limbo_abort(struct txn_limbo *limbo, struct txn_limbo_entry *entry) } void -txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry, - int64_t lsn) +txn_limbo_assign_remote_lsn(struct txn_limbo *limbo, + struct txn_limbo_entry *entry, int64_t lsn) { assert(limbo->instance_id != REPLICA_ID_NIL); + assert(limbo->instance_id != instance_id); assert(entry->lsn == -1); assert(lsn > 0); assert(txn_has_flag(entry->txn, TXN_WAIT_ACK)); @@ -130,27 +129,30 @@ txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry, entry->lsn = lsn; } -static bool -txn_limbo_check_complete(struct txn_limbo *limbo, struct txn_limbo_entry *entry) +void +txn_limbo_assign_local_lsn(struct txn_limbo *limbo, + struct txn_limbo_entry *entry, int64_t lsn) { - if (txn_limbo_entry_is_complete(entry)) - return true; + assert(limbo->instance_id != REPLICA_ID_NIL); + assert(limbo->instance_id == instance_id); + assert(entry->lsn == -1); + assert(lsn > 0); + assert(txn_has_flag(entry->txn, TXN_WAIT_ACK)); + (void) limbo; + entry->lsn = lsn; /* - * Async transaction can't complete itself. It is always - * completed by a previous sync transaction. + * The entry just got its LSN after a WAL write. 
It could + * happen that this LSN was already ACKed by some + * replicas. Update the ACK counter to take them into + * account. */ - if (!txn_has_flag(entry->txn, TXN_WAIT_ACK)) - return false; struct vclock_iterator iter; vclock_iterator_init(&iter, &limbo->vclock); int ack_count = 0; - int64_t lsn = entry->lsn; vclock_foreach(&iter, vc) ack_count += vc.lsn >= lsn; assert(ack_count >= entry->ack_count); entry->ack_count = ack_count; - entry->is_commit = ack_count >= replication_synchro_quorum; - return entry->is_commit; ================================================================================ The changes above are motivated by a bug I found during stress testing (by running the same test in parallel in test-run in 10-15 processes). The bug was a crash happening in case a transaction was replicated and ACKed earlier than WAL thread responded ok to TX thread. Then CONFIRM wasn't written at all. ================================================================================ } static int @@ -161,7 +163,7 @@ txn_limbo_wait_complete(struct txn_limbo *limbo, struct txn_limbo_entry *entry) { struct txn *txn = entry->txn; assert(entry->lsn > 0 || !txn_has_flag(entry->txn, TXN_WAIT_ACK)); - if (txn_limbo_check_complete(limbo, entry)) + if (txn_limbo_entry_is_complete(entry)) goto complete; assert(!txn_has_flag(txn, TXN_IS_DONE)); @@ -226,7 +229,26 @@ complete: diag_set(ClientError, ER_SYNC_ROLLBACK); return -1; } - txn_limbo_remove(limbo, entry); + /* + * The entry might be not the first in the limbo. It + * happens when there was a sync transaction and async + * transaction. The sync and async went to WAL. After sync + * WAL write is done, it may be already ACKed by the + * needed replica count. Now it marks self as committed + * and does the same for the next async txn. Then it + * starts writing CONFIRM. During that the async + * transaction finishes its WAL write, sees it is + * committed and ends up here. Not being the first + * transaction in the limbo. + */ + while (!rlist_empty(&entry->in_queue) && + txn_limbo_first_entry(limbo) != entry) { + bool cancellable = fiber_set_cancellable(false); + fiber_yield(); + fiber_set_cancellable(cancellable); + } + if (!rlist_empty(&entry->in_queue)) + txn_limbo_remove(limbo, entry); txn_clear_flag(txn, TXN_WAIT_SYNC); txn_clear_flag(txn, TXN_WAIT_ACK); return 0; @@ -257,7 +279,7 @@ txn_limbo_write_confirm_rollback(struct txn_limbo *limbo, int64_t lsn, * the last "safe" lsn is lsn - 1. */ res = xrow_encode_rollback(&row, &txn->region, - limbo->instance_id, lsn - 1); + limbo->instance_id, lsn); ================================================================================ I asked Sergey to do that, but he temporarily left. There wasn't no a bug, just inconsistency. For CONFIRM we use inclusive LSN - it commits all <= LSN. But for ROLLBACK we used exclusive LSN - it rolls back all > LSN. This is strange. So I made ROLLBACK LSN inclusive too. This is one step towards https://github.com/tarantool/tarantool/issues/5151. 
================================================================================ } if (res == -1) goto rollback; @@ -342,7 +364,7 @@ txn_limbo_read_rollback(struct txn_limbo *limbo, int64_t lsn) rlist_foreach_entry_reverse(e, &limbo->queue, in_queue) { if (!txn_has_flag(e->txn, TXN_WAIT_ACK)) continue; - if (e->lsn <= lsn) + if (e->lsn < lsn) break; last_rollback = e; } @@ -542,7 +564,7 @@ txn_limbo_force_empty(struct txn_limbo *limbo, int64_t confirm_lsn) } if (rollback != NULL) { txn_limbo_write_rollback(limbo, rollback->lsn); - txn_limbo_read_rollback(limbo, rollback->lsn - 1); + txn_limbo_read_rollback(limbo, rollback->lsn); } } diff --git a/src/box/txn_limbo.h b/src/box/txn_limbo.h index 1ee416231..88614d4a6 100644 --- a/src/box/txn_limbo.h +++ b/src/box/txn_limbo.h @@ -158,13 +158,21 @@ void txn_limbo_abort(struct txn_limbo *limbo, struct txn_limbo_entry *entry); /** - * Assign local LSN to the limbo entry. That happens when the - * transaction is added to the limbo, writes to WAL, and gets an - * LSN. + * Assign a remote LSN to a limbo entry. That happens when a + * remote transaction is added to the limbo and starts waiting for + * a confirm. */ void -txn_limbo_assign_lsn(struct txn_limbo *limbo, struct txn_limbo_entry *entry, - int64_t lsn); +txn_limbo_assign_remote_lsn(struct txn_limbo *limbo, + struct txn_limbo_entry *entry, int64_t lsn); + +/** + * Assign a local LSN to a limbo entry. That happens when a local + * transaction is written to WAL. + */ +void +txn_limbo_assign_local_lsn(struct txn_limbo *limbo, + struct txn_limbo_entry *entry, int64_t lsn); /** * Ack all transactions up to the given LSN on behalf of the diff --git a/src/box/xrow.h b/src/box/xrow.h index 7e6a4aceb..b325213e6 100644 --- a/src/box/xrow.h +++ b/src/box/xrow.h @@ -246,7 +246,7 @@ xrow_decode_confirm(struct xrow_header *row, uint32_t *replica_id, int64_t *lsn) * @param row xrow header. * @param region Region to use to encode the rollback body. * @param replica_id master's instance id. - * @param lsn lsn to rollback to. + * @param lsn lsn to rollback from, including it. * @retval -1 on error. * @retval 0 success. */ diff --git a/test/replication/qsync_basic.result b/test/replication/qsync_basic.result index 6d1624798..6b55a0e5e 100644 --- a/test/replication/qsync_basic.result +++ b/test/replication/qsync_basic.result @@ -199,7 +199,7 @@ box.cfg{replication_timeout = 1000, replication_synchro_timeout = 1000} -- Commit something non-sync. So as applier writer fiber would -- flush the pending heartbeat and go to sleep with the new huge -- replication timeout. -s = box.schema.create_space('test') +s = box.schema.create_space('test', {engine = engine}) | --- | ... pk = s:create_index('pk') @@ -309,7 +309,7 @@ test_run:switch('default') box.cfg{replication_synchro_timeout = 1000, replication_synchro_quorum = 2} | --- | ... -_ = box.schema.create_space('locallocal', {is_local = true}) +_ = box.schema.create_space('locallocal', {is_local = true, engine = engine}) | --- | ... _ = _:create_index('pk') @@ -551,6 +551,9 @@ test_run:switch('default') | --- | - true | ... +box.cfg{replication_synchro_timeout = 1000} ================================================================================ There was used timeout from the previous testcase, < 1 second. Was flaky. ================================================================================ + | --- + | ... ok, err = nil | --- | ... 
diff --git a/test/replication/qsync_basic.test.lua b/test/replication/qsync_basic.test.lua index 384b3593c..dcd1d6c76 100644 --- a/test/replication/qsync_basic.test.lua +++ b/test/replication/qsync_basic.test.lua @@ -83,7 +83,7 @@ box.cfg{replication_timeout = 1000, replication_synchro_timeout = 1000} -- Commit something non-sync. So as applier writer fiber would -- flush the pending heartbeat and go to sleep with the new huge -- replication timeout. -s = box.schema.create_space('test') +s = box.schema.create_space('test', {engine = engine}) pk = s:create_index('pk') s:replace{1} -- Now commit something sync. It should return immediately even @@ -123,7 +123,7 @@ box.space.sync:select{6} -- test_run:switch('default') box.cfg{replication_synchro_timeout = 1000, replication_synchro_quorum = 2} -_ = box.schema.create_space('locallocal', {is_local = true}) +_ = box.schema.create_space('locallocal', {is_local = true, engine = engine}) _ = _:create_index('pk') -- Propagate local vclock to some insane value to ensure it won't -- affect anything. @@ -217,6 +217,7 @@ box.space.sync:select{11} -- Test it is possible to early ACK a transaction with a new quorum. test_run:switch('default') +box.cfg{replication_synchro_timeout = 1000} ok, err = nil f = fiber.create(function() \ ok, err = pcall(box.space.sync.insert, box.space.sync, {12}) \ diff --git a/test/replication/qsync_snapshots.result b/test/replication/qsync_snapshots.result index 61cb7164b..2a126087a 100644 --- a/test/replication/qsync_snapshots.result +++ b/test/replication/qsync_snapshots.result @@ -48,7 +48,7 @@ test_run:switch('default') | --- | - true | ... -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000} ================================================================================ Too small timeout. The test assumed it doesn't fail, but 0.1 is quite easy to fail. Especially when runs in parallel. The same for some other fixes below. ================================================================================ | --- | ... _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) @@ -86,7 +86,7 @@ test_run:switch('default') | --- | - true | ... -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000} | --- | ... _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) @@ -112,58 +112,9 @@ box.space.sync:select{} -- 1 | --- | - - [1] | ... -box.snapshot() - | --- - | - ok - | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... --- Testcase cleanup. -test_run:switch('default') - | --- - | - true - | ... -box.space.sync:drop() +box.cfg{replication_synchro_timeout=1000} | --- | ... - --- [RFC, Snapshot generation] rolled back operations are not snapshotted. --- Testcase setup. -test_run:switch('default') - | --- - | - true - | ... -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} - | --- - | ... -_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) - | --- - | ... -_ = box.space.sync:create_index('pk') - | --- - | ... --- Testcase body. -box.space.sync:insert{1} - | --- - | - [1] - | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... -test_run:switch('default') - | --- - | - true - | ... -box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.1} - | --- - | ... 
-box.space.sync:insert{2} - | --- - | - error: Quorum collection for a synchronous transaction is timed out - | ... box.snapshot() | --- | - ok @@ -172,14 +123,6 @@ box.space.sync:select{} -- 1 | --- | - - [1] | ... -test_run:switch('replica') - | --- - | - true - | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... -- Testcase cleanup. test_run:switch('default') | --- @@ -191,11 +134,40 @@ box.space.sync:drop() -- [RFC, Snapshot generation] snapshot started on master, then rollback -- arrived, expected snapshot abort. +-- The test is temporary blocked on 5146 due to a crash when local +-- WAL write fails inside the WAL thread. Since this is the only +-- way to cause rollback of the transaction used in a snapshot +-- without triggering snapshot timeout. + +-- test_run:switch('default') +-- box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +-- _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +-- _ = box.space.sync:create_index('pk') +-- -- Testcase body. +-- box.space.sync:insert{1} +-- box.space.sync:select{} -- 1 +-- test_run:switch('default') +-- test_run:cmd("setopt delimiter ';'") +-- _ = fiber.create(function() +-- box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2} +-- box.space.sync:insert{2} +-- end); +-- test_run:cmd("setopt delimiter ''"); +-- box.snapshot() -- abort +-- box.space.sync:select{} -- 1 +-- test_run:switch('replica') +-- box.space.sync:select{} -- 1 +-- -- Testcase cleanup. +-- test_run:switch('default') +-- box.space.sync:drop() + +-- [RFC, Snapshot generation] snapshot started on replica, then rollback +-- arrived, expected snapshot abort. test_run:switch('default') | --- | - true | ... -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000} | --- | ... _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) @@ -204,128 +176,85 @@ _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) _ = box.space.sync:create_index('pk') | --- | ... + -- Testcase body. -box.space.sync:insert{1} - | --- - | - [1] - | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... test_run:switch('default') | --- | - true | ... -test_run:cmd("setopt delimiter ';'") - | --- - | - true - | ... -_ = fiber.create(function() - box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2} - box.space.sync:insert{2} -end); +box.cfg{replication_synchro_quorum=BROKEN_QUORUM} ================================================================================ 2 seconds was too long and was flaky. I made it faster and more stable using event-oriented instead of time oriented things. ================================================================================ | --- | ... -test_run:cmd("setopt delimiter ''"); +ok, err = nil | --- - | - true - | ... -box.snapshot() -- abort - | --- - | - error: A rollback for a synchronous transaction is received | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... -test_run:switch('replica') - | --- - | - true - | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... --- Testcase cleanup. -test_run:switch('default') - | --- - | - true - | ... -box.space.sync:drop() +f = fiber.create(function() \ + ok, err = pcall(box.space.sync.insert, box.space.sync, {1}) \ +end) | --- | ... --- [RFC, Snapshot generation] snapshot started on replica, then rollback --- arrived, expected snapshot abort. 
-test_run:switch('default') +test_run:switch('replica') | --- | - true | ... -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +fiber = require('fiber') | --- | ... -_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +box.cfg{replication_synchro_timeout=1000} | --- | ... -_ = box.space.sync:create_index('pk') +ok, err = nil | --- | ... --- Testcase body. -box.space.sync:insert{1} +f = fiber.create(function() ok, err = pcall(box.snapshot) end) | --- - | - [1] | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... -test_run:switch('replica') + +test_run:switch('default') | --- | - true | ... -box.space.sync:select{} -- 1 - | --- - | - - [1] - | ... -test_run:switch('default') +box.cfg{replication_synchro_timeout=0.0001} | --- - | - true | ... -test_run:cmd("setopt delimiter ';'") +test_run:wait_cond(function() return f:status() == 'dead' end) | --- | - true | ... -_ = fiber.create(function() - box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2} - box.space.sync:insert{2} -end); +ok, err | --- + | - false + | - Quorum collection for a synchronous transaction is timed out | ... -test_run:cmd("setopt delimiter ''"); + +test_run:switch('replica') | --- | - true | ... -test_run:switch('replica') +test_run:wait_cond(function() return f:status() == 'dead' end) | --- | - true | ... -box.snapshot() -- abort +ok, err | --- - | - error: A rollback for a synchronous transaction is received + | - false + | - A rollback for a synchronous transaction is received | ... -box.space.sync:select{} -- 1 +box.space.sync:select{} | --- - | - - [1] + | - [] | ... + test_run:switch('default') | --- | - true | ... -box.space.sync:select{} -- 1 +box.space.sync:select{} | --- - | - - [1] + | - [] | ... + -- Testcase cleanup. test_run:switch('default') | --- diff --git a/test/replication/qsync_snapshots.test.lua b/test/replication/qsync_snapshots.test.lua index b5990bce7..0db61da95 100644 --- a/test/replication/qsync_snapshots.test.lua +++ b/test/replication/qsync_snapshots.test.lua @@ -20,7 +20,7 @@ test_run:cmd('start server replica with wait=True, wait_load=True') -- expected success. -- Testcase setup. test_run:switch('default') -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000} _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) _ = box.space.sync:create_index('pk') -- Testcase body. @@ -35,7 +35,7 @@ box.space.sync:drop() -- expected success. -- Testcase setup. test_run:switch('default') -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000} _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) _ = box.space.sync:create_index('pk') -- Testcase body. @@ -43,79 +43,76 @@ box.space.sync:insert{1} box.space.sync:select{} -- 1 test_run:switch('replica') box.space.sync:select{} -- 1 +box.cfg{replication_synchro_timeout=1000} box.snapshot() box.space.sync:select{} -- 1 -- Testcase cleanup. test_run:switch('default') box.space.sync:drop() --- [RFC, Snapshot generation] rolled back operations are not snapshotted. --- Testcase setup. -test_run:switch('default') -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} -_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) -_ = box.space.sync:create_index('pk') --- Testcase body. 
-box.space.sync:insert{1} -box.space.sync:select{} -- 1 -test_run:switch('default') -box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.1} -box.space.sync:insert{2} -box.snapshot() -box.space.sync:select{} -- 1 -test_run:switch('replica') -box.space.sync:select{} -- 1 --- Testcase cleanup. -test_run:switch('default') -box.space.sync:drop() - -- [RFC, Snapshot generation] snapshot started on master, then rollback -- arrived, expected snapshot abort. -test_run:switch('default') -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} -_ = box.schema.space.create('sync', {is_sync=true, engine=engine}) -_ = box.space.sync:create_index('pk') --- Testcase body. -box.space.sync:insert{1} -box.space.sync:select{} -- 1 -test_run:switch('default') -test_run:cmd("setopt delimiter ';'") -_ = fiber.create(function() - box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2} - box.space.sync:insert{2} -end); -test_run:cmd("setopt delimiter ''"); -box.snapshot() -- abort -box.space.sync:select{} -- 1 -test_run:switch('replica') -box.space.sync:select{} -- 1 --- Testcase cleanup. -test_run:switch('default') -box.space.sync:drop() +-- The test is temporary blocked on 5146 due to a crash when local +-- WAL write fails inside the WAL thread. Since this is the only +-- way to cause rollback of the transaction used in a snapshot +-- without triggering snapshot timeout. + +-- test_run:switch('default') +-- box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +-- _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) +-- _ = box.space.sync:create_index('pk') +-- -- Testcase body. +-- box.space.sync:insert{1} +-- box.space.sync:select{} -- 1 +-- test_run:switch('default') +-- test_run:cmd("setopt delimiter ';'") +-- _ = fiber.create(function() +-- box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2} +-- box.space.sync:insert{2} +-- end); +-- test_run:cmd("setopt delimiter ''"); +-- box.snapshot() -- abort +-- box.space.sync:select{} -- 1 +-- test_run:switch('replica') +-- box.space.sync:select{} -- 1 +-- -- Testcase cleanup. +-- test_run:switch('default') +-- box.space.sync:drop() -- [RFC, Snapshot generation] snapshot started on replica, then rollback -- arrived, expected snapshot abort. test_run:switch('default') -box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1} +box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=1000} _ = box.schema.space.create('sync', {is_sync=true, engine=engine}) _ = box.space.sync:create_index('pk') + -- Testcase body. 
-box.space.sync:insert{1} -box.space.sync:select{} -- 1 +test_run:switch('default') +box.cfg{replication_synchro_quorum=BROKEN_QUORUM} +ok, err = nil +f = fiber.create(function() \ + ok, err = pcall(box.space.sync.insert, box.space.sync, {1}) \ +end) + test_run:switch('replica') -box.space.sync:select{} -- 1 +fiber = require('fiber') +box.cfg{replication_synchro_timeout=1000} +ok, err = nil +f = fiber.create(function() ok, err = pcall(box.snapshot) end) + test_run:switch('default') -test_run:cmd("setopt delimiter ';'") -_ = fiber.create(function() - box.cfg{replication_synchro_quorum=BROKEN_QUORUM, replication_synchro_timeout=2} - box.space.sync:insert{2} -end); -test_run:cmd("setopt delimiter ''"); +box.cfg{replication_synchro_timeout=0.0001} +test_run:wait_cond(function() return f:status() == 'dead' end) +ok, err + test_run:switch('replica') -box.snapshot() -- abort -box.space.sync:select{} -- 1 +test_run:wait_cond(function() return f:status() == 'dead' end) +ok, err +box.space.sync:select{} + test_run:switch('default') -box.space.sync:select{} -- 1 +box.space.sync:select{} + -- Testcase cleanup. test_run:switch('default') box.space.sync:drop() diff --git a/test/unit/snap_quorum_delay.cc b/test/unit/snap_quorum_delay.cc index 8d50cfb27..ad0563345 100644 --- a/test/unit/snap_quorum_delay.cc +++ b/test/unit/snap_quorum_delay.cc @@ -78,7 +78,7 @@ enum process_type { * (to push a transaction to the limbo and simulate confirm). */ const int fake_lsn = 1; -const int instace_id = 1; +extern "C" int instance_id; const int relay_id = 2; int @@ -109,7 +109,7 @@ txn_process_func(va_list ap) * and call txn_commit (or another) later. */ struct txn_limbo_entry *entry = txn_limbo_append(&txn_limbo, - instace_id, txn); + instance_id, txn); /* * The trigger is used to verify that the transaction has been * completed. @@ -130,7 +130,7 @@ txn_process_func(va_list ap) unreachable(); } - txn_limbo_assign_lsn(&txn_limbo, entry, fake_lsn); + txn_limbo_assign_local_lsn(&txn_limbo, entry, fake_lsn); txn_limbo_ack(&txn_limbo, txn_limbo.instance_id, fake_lsn); txn_limbo_wait_complete(&txn_limbo, entry); @@ -239,6 +239,7 @@ main(void) fiber_init(fiber_c_invoke); gc_init(); txn_limbo_init(); + instance_id = 1; struct fiber *main_fiber = fiber_new("main", test_snap_delay); assert(main_fiber != NULL);
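================================================================================
Returning to the first fix above (the crash when a transaction got replicated
and ACKed before the local WAL write finished): the ACK recount done by
txn_limbo_assign_local_lsn() can be modeled by the following self-contained
sketch. It is not the real code - the names are illustrative and the real
version walks the limbo vclock instead of a plain array - but it shows why
the recount is needed.

#include <stdint.h>

/*
 * Once the local WAL write assigns the entry its LSN, count the
 * replicas whose last reported LSN already covers it. Without the
 * recount an early ACK would be lost, the quorum might never be
 * reached, and CONFIRM would never be written.
 */
static int
recount_acks(const int64_t *replica_lsn, int n_replicas, int64_t entry_lsn)
{
	int ack_count = 0;
	for (int i = 0; i < n_replicas; i++) {
		if (replica_lsn[i] >= entry_lsn)
			ack_count++;
	}
	return ack_count;
}

For example, with replica LSNs {5, 4, 2} and an entry that just received
LSN 4 the recount returns 2, so with replication_synchro_quorum = 2 the
entry is already committed at the moment of assignment and CONFIRM can be
written right away.
================================================================================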