From: Serge Petrenko via Tarantool-patches <tarantool-patches@dev.tarantool.org> To: v.shpilevoy@tarantool.org, gorcunov@gmail.com Cc: tarantool-patches@dev.tarantool.org Subject: Re: [Tarantool-patches] [PATCH 6/9] raft: keep track of greatest known term and filter replication sources based on that Date: Mon, 12 Apr 2021 22:23:00 +0300 [thread overview] Message-ID: <56012d4a-bb15-d0e4-d948-b3a208d031f3@tarantool.org> (raw) In-Reply-To: <f0f142cdbe8ef87bec48dbdfbf1e125a72ac50bd.1618163409.git.sergepetrenko@tarantool.org> 11.04.2021 20:56, Serge Petrenko пишет: > Start writing the actual leader term together with the PROMOTE request > and process terms in PROMOTE requests on receiver side. > > Make applier only apply synchronous transactions from the instance which > has the greatest term as received in PROMOTE requests. > > Closes #5445 > --- Force-pushed a test and a couple of fixes. Please find an incremental diff below and the full patch in v2. ========================== diff --git a/src/box/box.cc b/src/box/box.cc index 9b6323b3f..aae57ec29 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -1507,7 +1507,12 @@ box_clear_synchro_queue(bool try_wait) return -1; } - if (!is_box_configured) + /* + * Do nothing when box isn't configured and when PROMOTE was already + * written for this term. 
+ */ + if (!is_box_configured || + raft_source_term(box_raft(), instance_id) == box_raft()->term) return 0; uint32_t former_leader_id = txn_limbo.owner_id; int64_t wait_lsn = txn_limbo.confirmed_lsn; @@ -1569,12 +1574,12 @@ promote: .replica_id = 0, /* unused */ .origin_id = instance_id, .lsn = wait_lsn, - .term = 0, /* unused */ + .term = box_raft()->term, }; txn_limbo_read_promote(&txn_limbo, &req); assert(txn_limbo_is_empty(&txn_limbo)); raft_source_update_term(box_raft(), req.origin_id, - req.lsn); + req.term); } } in_clear_synchro_queue = false; diff --git a/src/lib/raft/raft.h b/src/lib/raft/raft.h index cba45a67d..40c8630e9 100644 --- a/src/lib/raft/raft.h +++ b/src/lib/raft/raft.h @@ -256,20 +256,29 @@ raft_is_source_allowed(const struct raft *raft, uint32_t source_id) return !raft->is_enabled || raft->leader == source_id; } -static inline bool -raft_source_has_outdated_term(const struct raft *raft, uint32_t source_id) +/** + * Return the latest term as seen in PROMOTE requests from instance with id + * @a source_id. + */ +static inline uint64_t +raft_source_term(const struct raft *raft, uint32_t source_id) { - uint64_t source_term = vclock_get(&raft->term_map, source_id); - return raft->is_enabled && source_term < raft->greatest_known_term; + assert(source_id != 0 && source_id < VCLOCK_MAX); + return vclock_get(&raft->term_map, source_id); } -/** Check if Raft is enabled. */ +/** + * Check whether replica with id @a source_id is too old to apply synchronous + * data from it. The check remains valid even when elections are disabled. + */ static inline bool -raft_is_enabled(const struct raft *raft) +raft_source_has_outdated_term(const struct raft *raft, uint32_t source_id) { - return raft->is_enabled; + uint64_t source_term = vclock_get(&raft->term_map, source_id); + return source_term < raft->greatest_known_term; } +/** Remember the last term seen for replica with id @a source_id. 
*/ static inline void raft_source_update_term(struct raft *raft, uint32_t source_id, uint64_t term) { @@ -280,6 +289,13 @@ raft_source_update_term(struct raft *raft, uint32_t source_id, uint64_t term) raft->greatest_known_term = term; } +/** Check if Raft is enabled. */ +static inline bool +raft_is_enabled(const struct raft *raft) +{ + return raft->is_enabled; +} + /** Process a raft entry stored in WAL/snapshot. */ void raft_process_recovery(struct raft *raft, const struct raft_msg *req); diff --git a/test/replication/gh-5445-leader-incosistency.result b/test/replication/gh-5445-leader-incosistency.result new file mode 100644 index 000000000..b1f8a4ed1 --- /dev/null +++ b/test/replication/gh-5445-leader-incosistency.result @@ -0,0 +1,238 @@ +-- test-run result file version 2 +test_run = require("test_run").new() + | --- + | ... + +is_leader_cmd = "return box.info.election.state == 'leader'" + | --- + | ... + +-- Auxiliary. +test_run:cmd('setopt delimiter ";"') + | --- + | - true + | ... +function get_leader(nrs) + local leader_nr = 0 + test_run:wait_cond(function() + for nr, do_check in pairs(nrs) do + if do_check then + local is_leader = test_run:eval('election_replica'..nr, + is_leader_cmd)[1] + if is_leader then + leader_nr = nr + return true + end + end + end + return false + end) + assert(leader_nr ~= 0) + return leader_nr +end; + | --- + | ... + +function name(id) + return 'election_replica'..id +end; + | --- + | ... +test_run:cmd('setopt delimiter ""'); + | --- + | - true + | ... + +-- +-- gh-5445: make sure rolled back rows do not reappear once old leader returns +-- to cluster. +-- +SERVERS = {'election_replica1', 'election_replica2' ,'election_replica3'} + | --- + | ... +test_run:create_cluster(SERVERS, "replication", {args='2 0.4'}) + | --- + | ... +test_run:wait_fullmesh(SERVERS) + | --- + | ... + +-- Any of the three instances may bootstrap the cluster and become leader. +is_possible_leader = {true, true, true} + | --- + | ... 
+leader_nr = get_leader(is_possible_leader) + | --- + | ... +leader = name(leader_nr) + | --- + | ... +next_leader_nr = ((leader_nr - 1) % 3 + 1) % 3 + 1 -- {1, 2, 3} -> {2, 3, 1} + | --- + | ... +next_leader = name(next_leader_nr) + | --- + | ... +other_nr = ((leader_nr - 1) % 3 + 2) % 3 + 1 -- {1, 2, 3} -> {3, 1, 2} + | --- + | ... +other = name(other_nr) + | --- + | ... + +test_run:switch(leader) + | --- + | - true + | ... +box.ctl.wait_rw() + | --- + | ... +_ = box.schema.space.create('test', {is_sync=true}) + | --- + | ... +_ = box.space.test:create_index('pk') + | --- + | ... +box.space.test:insert{1} + | --- + | - [1] + | ... + +-- Simulate a situation when the instance which will become the next leader +-- doesn't know of unconfirmed rows. It should roll them back anyways and do not +-- accept them once they actually appear from the old leader. +-- So, stop the instance which'll be the next leader. +test_run:switch('default') + | --- + | - true + | ... +test_run:cmd('stop server '..next_leader) + | --- + | - true + | ... +test_run:switch(leader) + | --- + | - true + | ... +-- Insert some unconfirmed data. +box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=1000} + | --- + | ... +fib = require('fiber').create(box.space.test.insert, box.space.test, {2}) + | --- + | ... +fib:status() + | --- + | - suspended + | ... + +-- 'other', 'leader', 'next_leader' are defined on 'default' node, hence the +-- double switches. +test_run:switch('default') + | --- + | - true + | ... +test_run:switch(other) + | --- + | - true + | ... +-- Wait until the rows are replicated to the other instance. +test_run:wait_cond(function() return box.space.test:get{2} ~= nil end) + | --- + | - true + | ... +box.cfg{election_mode='voter'} + | --- + | ... +-- Old leader is gone. +test_run:switch('default') + | --- + | - true + | ... +test_run:cmd('stop server '..leader) + | --- + | - true + | ... +is_possible_leader[leader_nr] = false + | --- + | ... 
+ +-- Emulate a situation when next_leader wins the elections. It can't do that in +-- this configuration, obviously, because it's behind the 'other' node, so set +-- quorum to 1 and imagine there are 2 more servers which would vote for +-- next_leader. +-- Also, make the instance ignore synchronization with other replicas. +-- Otherwise it would stall for replication_sync_timeout. This is due to the +-- nature of the test and may be ignored (we restart the instance to simulate +-- a situation when some rows from the old leader were not received). +test_run:cmd('start server '..next_leader..' with args="1 0.4 candidate 1"') + | --- + | - true + | ... +assert(get_leader(is_possible_leader) == next_leader_nr) + | --- + | - true + | ... +test_run:switch(other) + | --- + | - true + | ... +-- New leader didn't know about the unconfirmed rows but still rolled them back. +test_run:wait_cond(function() return box.space.test:get{2} == nil end) + | --- + | - true + | ... + +test_run:switch('default') + | --- + | - true + | ... +test_run:switch(next_leader) + | --- + | - true + | ... +box.space.test:select{} -- 1 + | --- + | - - [1] + | ... +box.cfg{election_mode='voter'} + | --- + | ... +-- Old leader returns and old unconfirmed rows from it must be ignored. +test_run:switch('default') + | --- + | - true + | ... +-- Make old leader win the elections. +test_run:cmd('start server '..leader..' with args="1 0.4 candidate 1"') + | --- + | - true + | ... +is_possible_leader[leader_nr] = true + | --- + | ... +assert(get_leader(is_possible_leader) == leader_nr) + | --- + | - true + | ... +test_run:switch(next_leader) + | --- + | - true + | ... +box.space.test:select{} -- 1 + | --- + | - - [1] + | ... +test_run:wait_upstream(1, {status='follow'}) + | --- + | - true + | ... + +-- Cleanup. +test_run:switch('default') + | --- + | - true + | ... +test_run:drop_cluster(SERVERS) + | --- + | ... 
diff --git a/test/replication/gh-5445-leader-incosistency.test.lua b/test/replication/gh-5445-leader-incosistency.test.lua new file mode 100644 index 000000000..94beea966 --- /dev/null +++ b/test/replication/gh-5445-leader-incosistency.test.lua @@ -0,0 +1,108 @@ +test_run = require("test_run").new() + +is_leader_cmd = "return box.info.election.state == 'leader'" + +-- Auxiliary. +test_run:cmd('setopt delimiter ";"') +function get_leader(nrs) + local leader_nr = 0 + test_run:wait_cond(function() + for nr, do_check in pairs(nrs) do + if do_check then + local is_leader = test_run:eval('election_replica'..nr, + is_leader_cmd)[1] + if is_leader then + leader_nr = nr + return true + end + end + end + return false + end) + assert(leader_nr ~= 0) + return leader_nr +end; + +function name(id) + return 'election_replica'..id +end; +test_run:cmd('setopt delimiter ""'); + +-- +-- gh-5445: make sure rolled back rows do not reappear once old leader returns +-- to cluster. +-- +SERVERS = {'election_replica1', 'election_replica2' ,'election_replica3'} +test_run:create_cluster(SERVERS, "replication", {args='2 0.4'}) +test_run:wait_fullmesh(SERVERS) + +-- Any of the three instances may bootstrap the cluster and become leader. +is_possible_leader = {true, true, true} +leader_nr = get_leader(is_possible_leader) +leader = name(leader_nr) +next_leader_nr = ((leader_nr - 1) % 3 + 1) % 3 + 1 -- {1, 2, 3} -> {2, 3, 1} +next_leader = name(next_leader_nr) +other_nr = ((leader_nr - 1) % 3 + 2) % 3 + 1 -- {1, 2, 3} -> {3, 1, 2} +other = name(other_nr) + +test_run:switch(leader) +box.ctl.wait_rw() +_ = box.schema.space.create('test', {is_sync=true}) +_ = box.space.test:create_index('pk') +box.space.test:insert{1} + +-- Simulate a situation when the instance which will become the next leader +-- doesn't know of unconfirmed rows. It should roll them back anyways and do not +-- accept them once they actually appear from the old leader. +-- So, stop the instance which'll be the next leader. 
+test_run:switch('default') +test_run:cmd('stop server '..next_leader) +test_run:switch(leader) +-- Insert some unconfirmed data. +box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=1000} +fib = require('fiber').create(box.space.test.insert, box.space.test, {2}) +fib:status() + +-- 'other', 'leader', 'next_leader' are defined on 'default' node, hence the +-- double switches. +test_run:switch('default') +test_run:switch(other) +-- Wait until the rows are replicated to the other instance. +test_run:wait_cond(function() return box.space.test:get{2} ~= nil end) +box.cfg{election_mode='voter'} +-- Old leader is gone. +test_run:switch('default') +test_run:cmd('stop server '..leader) +is_possible_leader[leader_nr] = false + +-- Emulate a situation when next_leader wins the elections. It can't do that in +-- this configuration, obviously, because it's behind the 'other' node, so set +-- quorum to 1 and imagine there are 2 more servers which would vote for +-- next_leader. +-- Also, make the instance ignore synchronization with other replicas. +-- Otherwise it would stall for replication_sync_timeout. This is due to the +-- nature of the test and may be ignored (we restart the instance to simulate +-- a situation when some rows from the old leader were not received). +test_run:cmd('start server '..next_leader..' with args="1 0.4 candidate 1"') +assert(get_leader(is_possible_leader) == next_leader_nr) +test_run:switch(other) +-- New leader didn't know about the unconfirmed rows but still rolled them back. +test_run:wait_cond(function() return box.space.test:get{2} == nil end) + +test_run:switch('default') +test_run:switch(next_leader) +box.space.test:select{} -- 1 +box.cfg{election_mode='voter'} +-- Old leader returns and old unconfirmed rows from it must be ignored. +test_run:switch('default') +-- Make old leader win the elections. +test_run:cmd('start server '..leader..' 
with args="1 0.4 candidate 1"') +is_possible_leader[leader_nr] = true +assert(get_leader(is_possible_leader) == leader_nr) +test_run:switch(next_leader) +box.space.test:select{} -- 1 +test_run:wait_upstream(1, {status='follow'}) + +-- Cleanup. +test_run:switch('default') +test_run:drop_cluster(SERVERS) diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg index aff5fda26..0a270d3d6 100644 --- a/test/replication/suite.cfg +++ b/test/replication/suite.cfg @@ -17,6 +17,7 @@ "gh-4424-misc-orphan-on-reconfiguration-error.test.lua": {}, "gh-5426-election-on-off.test.lua": {}, "gh-5433-election-restart-recovery.test.lua": {}, + "gh-5445-leader-incosistency.test.lua": {}, "gh-5506-election-on-off.test.lua": {}, "once.test.lua": {}, "on_replace.test.lua": {}, -- Serge Petrenko
next prev parent reply other threads:[~2021-04-12 19:23 UTC|newest] Thread overview: 16+ messages / expand[flat|nested] mbox.gz Atom feed top 2021-04-11 17:55 [Tarantool-patches] [PATCH 0/9] raft: introduce manual elections and fix a bug with re-applying rolled back transactions Serge Petrenko via Tarantool-patches 2021-04-11 17:55 ` [Tarantool-patches] [PATCH 1/9] wal: enrich row's meta information with sync replication flags Serge Petrenko via Tarantool-patches 2021-04-12 13:06 ` Cyrill Gorcunov via Tarantool-patches 2021-04-13 13:26 ` Serge Petrenko via Tarantool-patches 2021-04-12 19:21 ` Serge Petrenko via Tarantool-patches 2021-04-11 17:55 ` [Tarantool-patches] [PATCH 2/9] xrow: introduce a PROMOTE entry Serge Petrenko via Tarantool-patches 2021-04-11 17:55 ` [Tarantool-patches] [PATCH 3/9] box: actualise iproto_key_type array Serge Petrenko via Tarantool-patches 2021-04-11 17:55 ` [Tarantool-patches] [PATCH 4/9] box: make clear_synchro_queue() write a PROMOTE entry instead of CONFIRM + ROLLBACK Serge Petrenko via Tarantool-patches 2021-04-11 17:56 ` [Tarantool-patches] [PATCH 5/9] box: write PROMOTE even for empty limbo Serge Petrenko via Tarantool-patches 2021-04-11 17:56 ` [Tarantool-patches] [PATCH 6/9] raft: keep track of greatest known term and filter replication sources based on that Serge Petrenko via Tarantool-patches 2021-04-12 19:23 ` Serge Petrenko via Tarantool-patches [this message] 2021-04-11 17:56 ` [Tarantool-patches] [PATCH 7/9] replication: introduce a new election mode: "manual" Serge Petrenko via Tarantool-patches 2021-04-11 17:56 ` [Tarantool-patches] [PATCH 8/9] Support manual elections in `box.ctl.clear_synchro_queue()` Serge Petrenko via Tarantool-patches 2021-04-12 19:23 ` Serge Petrenko via Tarantool-patches 2021-04-11 17:56 ` [Tarantool-patches] [PATCH 9/9] box.ctl: rename clear_synchro_queue to promote Serge Petrenko via Tarantool-patches 2021-04-12 19:24 ` Serge Petrenko via Tarantool-patches
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=56012d4a-bb15-d0e4-d948-b3a208d031f3@tarantool.org \ --to=tarantool-patches@dev.tarantool.org \ --cc=gorcunov@gmail.com \ --cc=sergepetrenko@tarantool.org \ --cc=v.shpilevoy@tarantool.org \ --subject='Re: [Tarantool-patches] [PATCH 6/9] raft: keep track of greatest known term and filter replication sources based on that' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox.