Tarantool development patches archive
From: Serge Petrenko via Tarantool-patches <tarantool-patches@dev.tarantool.org>
To: v.shpilevoy@tarantool.org, gorcunov@gmail.com
Cc: tarantool-patches@dev.tarantool.org
Subject: Re: [Tarantool-patches] [PATCH 6/9] raft: keep track of greatest known term and filter replication sources based on that
Date: Mon, 12 Apr 2021 22:23:00 +0300	[thread overview]
Message-ID: <56012d4a-bb15-d0e4-d948-b3a208d031f3@tarantool.org> (raw)
In-Reply-To: <f0f142cdbe8ef87bec48dbdfbf1e125a72ac50bd.1618163409.git.sergepetrenko@tarantool.org>



11.04.2021 20:56, Serge Petrenko wrote:
> Start writing the actual leader term together with the PROMOTE request
> and process terms in PROMOTE requests on receiver side.
>
> Make applier only apply synchronous transactions from the instance which
> has the greatest term as received in PROMOTE requests.
>
> Closes #5445
> ---

Force-pushed a test and a couple of fixes. Please find an incremental diff
below and the full patch in v2.
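
For reference, a tiny standalone sketch of the term bookkeeping this patch
introduces (not part of the diff; names like fake_raft and FAKE_VCLOCK_MAX are
invented stand-ins for struct raft, its vclock-based term_map and VCLOCK_MAX,
so this is an illustration rather than the real implementation):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum { FAKE_VCLOCK_MAX = 32 };

struct fake_raft {
	/* Latest term seen in a PROMOTE request from each instance id. */
	uint64_t term_map[FAKE_VCLOCK_MAX];
	/* Maximum over term_map[], mirrors raft->greatest_known_term. */
	uint64_t greatest_known_term;
};

/* Counterpart of raft_source_update_term(): remember a newer term. */
static void
fake_source_update_term(struct fake_raft *raft, uint32_t source_id,
			uint64_t term)
{
	assert(source_id != 0 && source_id < FAKE_VCLOCK_MAX);
	if (term <= raft->term_map[source_id])
		return;
	raft->term_map[source_id] = term;
	if (term > raft->greatest_known_term)
		raft->greatest_known_term = term;
}

/* Counterpart of raft_source_has_outdated_term(). */
static bool
fake_source_has_outdated_term(const struct fake_raft *raft,
			      uint32_t source_id)
{
	return raft->term_map[source_id] < raft->greatest_known_term;
}

int
main(void)
{
	struct fake_raft raft = {0};
	/* Instance 1 promoted itself in term 2, then instance 2 in term 3. */
	fake_source_update_term(&raft, 1, 2);
	fake_source_update_term(&raft, 2, 3);
	/* Synchronous rows from the stale ex-leader must now be filtered. */
	printf("instance 1 outdated: %d\n",
	       (int)fake_source_has_outdated_term(&raft, 1)); /* 1 */
	printf("instance 2 outdated: %d\n",
	       (int)fake_source_has_outdated_term(&raft, 2)); /* 0 */
	return 0;
}

The applier side is then expected to skip synchronous rows from any instance
whose last seen PROMOTE term is below the greatest known term.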

==========================
diff --git a/src/box/box.cc b/src/box/box.cc
index 9b6323b3f..aae57ec29 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1507,7 +1507,12 @@ box_clear_synchro_queue(bool try_wait)
          return -1;
      }

-    if (!is_box_configured)
+    /*
+     * Do nothing when box isn't configured and when PROMOTE was already
+     * written for this term.
+     */
+    if (!is_box_configured ||
+        raft_source_term(box_raft(), instance_id) == box_raft()->term)
          return 0;
      uint32_t former_leader_id = txn_limbo.owner_id;
      int64_t wait_lsn = txn_limbo.confirmed_lsn;
@@ -1569,12 +1574,12 @@ promote:
                  .replica_id = 0, /* unused */
                  .origin_id = instance_id,
                  .lsn = wait_lsn,
-                .term = 0, /* unused */
+                .term = box_raft()->term,
              };
              txn_limbo_read_promote(&txn_limbo, &req);
              assert(txn_limbo_is_empty(&txn_limbo));
              raft_source_update_term(box_raft(), req.origin_id,
-                        req.lsn);
+                        req.term);
          }
      }
      in_clear_synchro_queue = false;
diff --git a/src/lib/raft/raft.h b/src/lib/raft/raft.h
index cba45a67d..40c8630e9 100644
--- a/src/lib/raft/raft.h
+++ b/src/lib/raft/raft.h
@@ -256,20 +256,29 @@ raft_is_source_allowed(const struct raft *raft, uint32_t source_id)
      return !raft->is_enabled || raft->leader == source_id;
  }

-static inline bool
-raft_source_has_outdated_term(const struct raft *raft, uint32_t source_id)
+/**
+ * Return the latest term as seen in PROMOTE requests from instance with id
+ * @a source_id.
+ */
+static inline uint64_t
+raft_source_term(const struct raft *raft, uint32_t source_id)
  {
-    uint64_t source_term = vclock_get(&raft->term_map, source_id);
-    return raft->is_enabled && source_term < raft->greatest_known_term;
+    assert(source_id != 0 && source_id < VCLOCK_MAX);
+    return vclock_get(&raft->term_map, source_id);
  }

-/** Check if Raft is enabled. */
+/**
+ * Check whether replica with id @a source_id is too old to apply synchronous
+ * data from it. The check remains valid even when elections are disabled.
+ */
  static inline bool
-raft_is_enabled(const struct raft *raft)
+raft_source_has_outdated_term(const struct raft *raft, uint32_t source_id)
  {
-    return raft->is_enabled;
+    uint64_t source_term = vclock_get(&raft->term_map, source_id);
+    return source_term < raft->greatest_known_term;
  }

+/** Remember the last term seen for replica with id @a source_id. */
  static inline void
  raft_source_update_term(struct raft *raft, uint32_t source_id, uint64_t term)
  {
@@ -280,6 +289,13 @@ raft_source_update_term(struct raft *raft, uint32_t source_id, uint64_t term)
          raft->greatest_known_term = term;
  }

+/** Check if Raft is enabled. */
+static inline bool
+raft_is_enabled(const struct raft *raft)
+{
+    return raft->is_enabled;
+}
+
  /** Process a raft entry stored in WAL/snapshot. */
  void
  raft_process_recovery(struct raft *raft, const struct raft_msg *req);
diff --git a/test/replication/gh-5445-leader-incosistency.result b/test/replication/gh-5445-leader-incosistency.result
new file mode 100644
index 000000000..b1f8a4ed1
--- /dev/null
+++ b/test/replication/gh-5445-leader-incosistency.result
@@ -0,0 +1,238 @@
+-- test-run result file version 2
+test_run = require("test_run").new()
+ | ---
+ | ...
+
+is_leader_cmd = "return box.info.election.state == 'leader'"
+ | ---
+ | ...
+
+-- Auxiliary.
+test_run:cmd('setopt delimiter ";"')
+ | ---
+ | - true
+ | ...
+function get_leader(nrs)
+    local leader_nr = 0
+    test_run:wait_cond(function()
+        for nr, do_check in pairs(nrs) do
+            if do_check then
+                local is_leader = test_run:eval('election_replica'..nr,
+                                                is_leader_cmd)[1]
+                if is_leader then
+                    leader_nr = nr
+                    return true
+                end
+            end
+        end
+        return false
+    end)
+    assert(leader_nr ~= 0)
+    return leader_nr
+end;
+ | ---
+ | ...
+
+function name(id)
+    return 'election_replica'..id
+end;
+ | ---
+ | ...
+test_run:cmd('setopt delimiter ""');
+ | ---
+ | - true
+ | ...
+
+--
+-- gh-5445: make sure rolled back rows do not reappear once old leader returns
+-- to cluster.
+--
+SERVERS = {'election_replica1', 'election_replica2' ,'election_replica3'}
+ | ---
+ | ...
+test_run:create_cluster(SERVERS, "replication", {args='2 0.4'})
+ | ---
+ | ...
+test_run:wait_fullmesh(SERVERS)
+ | ---
+ | ...
+
+-- Any of the three instances may bootstrap the cluster and become leader.
+is_possible_leader = {true, true, true}
+ | ---
+ | ...
+leader_nr = get_leader(is_possible_leader)
+ | ---
+ | ...
+leader = name(leader_nr)
+ | ---
+ | ...
+next_leader_nr = ((leader_nr - 1) % 3 + 1) % 3 + 1 -- {1, 2, 3} -> {2, 3, 1}
+ | ---
+ | ...
+next_leader = name(next_leader_nr)
+ | ---
+ | ...
+other_nr = ((leader_nr - 1) % 3 + 2) % 3 + 1 -- {1, 2, 3} -> {3, 1, 2}
+ | ---
+ | ...
+other = name(other_nr)
+ | ---
+ | ...
+
+test_run:switch(leader)
+ | ---
+ | - true
+ | ...
+box.ctl.wait_rw()
+ | ---
+ | ...
+_ = box.schema.space.create('test', {is_sync=true})
+ | ---
+ | ...
+_ = box.space.test:create_index('pk')
+ | ---
+ | ...
+box.space.test:insert{1}
+ | ---
+ | - [1]
+ | ...
+
+-- Simulate a situation when the instance which will become the next leader
+-- doesn't know of unconfirmed rows. It should roll them back anyways and do not
+-- accept them once they actually appear from the old leader.
+-- So, stop the instance which'll be the next leader.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server '..next_leader)
+ | ---
+ | - true
+ | ...
+test_run:switch(leader)
+ | ---
+ | - true
+ | ...
+-- Insert some unconfirmed data.
+box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=1000}
+ | ---
+ | ...
+fib = require('fiber').create(box.space.test.insert, box.space.test, {2})
+ | ---
+ | ...
+fib:status()
+ | ---
+ | - suspended
+ | ...
+
+-- 'other', 'leader', 'next_leader' are defined on 'default' node, hence the
+-- double switches.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:switch(other)
+ | ---
+ | - true
+ | ...
+-- Wait until the rows are replicated to the other instance.
+test_run:wait_cond(function() return box.space.test:get{2} ~= nil end)
+ | ---
+ | - true
+ | ...
+box.cfg{election_mode='voter'}
+ | ---
+ | ...
+-- Old leader is gone.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server '..leader)
+ | ---
+ | - true
+ | ...
+is_possible_leader[leader_nr] = false
+ | ---
+ | ...
+
+-- Emulate a situation when next_leader wins the elections. It can't do that in
+-- this configuration, obviously, because it's behind the 'other' node, so set
+-- quorum to 1 and imagine there are 2 more servers which would vote for
+-- next_leader.
+-- Also, make the instance ignore synchronization with other replicas.
+-- Otherwise it would stall for replication_sync_timeout. This is due to the
+-- nature of the test and may be ignored (we restart the instance to simulate
+-- a situation when some rows from the old leader were not received).
+test_run:cmd('start server '..next_leader..' with args="1 0.4 candidate 1"')
+ | ---
+ | - true
+ | ...
+assert(get_leader(is_possible_leader) == next_leader_nr)
+ | ---
+ | - true
+ | ...
+test_run:switch(other)
+ | ---
+ | - true
+ | ...
+-- New leader didn't know about the unconfirmed rows but still rolled them back.
+test_run:wait_cond(function() return box.space.test:get{2} == nil end)
+ | ---
+ | - true
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:switch(next_leader)
+ | ---
+ | - true
+ | ...
+box.space.test:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+box.cfg{election_mode='voter'}
+ | ---
+ | ...
+-- Old leader returns and old unconfirmed rows from it must be ignored.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+-- Make old leader win the elections.
+test_run:cmd('start server '..leader..' with args="1 0.4 candidate 1"')
+ | ---
+ | - true
+ | ...
+is_possible_leader[leader_nr] = true
+ | ---
+ | ...
+assert(get_leader(is_possible_leader) == leader_nr)
+ | ---
+ | - true
+ | ...
+test_run:switch(next_leader)
+ | ---
+ | - true
+ | ...
+box.space.test:select{} -- 1
+ | ---
+ | - - [1]
+ | ...
+test_run:wait_upstream(1, {status='follow'})
+ | ---
+ | - true
+ | ...
+
+-- Cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:drop_cluster(SERVERS)
+ | ---
+ | ...
diff --git a/test/replication/gh-5445-leader-incosistency.test.lua b/test/replication/gh-5445-leader-incosistency.test.lua
new file mode 100644
index 000000000..94beea966
--- /dev/null
+++ b/test/replication/gh-5445-leader-incosistency.test.lua
@@ -0,0 +1,108 @@
+test_run = require("test_run").new()
+
+is_leader_cmd = "return box.info.election.state == 'leader'"
+
+-- Auxiliary.
+test_run:cmd('setopt delimiter ";"')
+function get_leader(nrs)
+    local leader_nr = 0
+    test_run:wait_cond(function()
+        for nr, do_check in pairs(nrs) do
+            if do_check then
+                local is_leader = test_run:eval('election_replica'..nr,
+                                                is_leader_cmd)[1]
+                if is_leader then
+                    leader_nr = nr
+                    return true
+                end
+            end
+        end
+        return false
+    end)
+    assert(leader_nr ~= 0)
+    return leader_nr
+end;
+
+function name(id)
+    return 'election_replica'..id
+end;
+test_run:cmd('setopt delimiter ""');
+
+--
+-- gh-5445: make sure rolled back rows do not reappear once old leader returns
+-- to cluster.
+--
+SERVERS = {'election_replica1', 'election_replica2' ,'election_replica3'}
+test_run:create_cluster(SERVERS, "replication", {args='2 0.4'})
+test_run:wait_fullmesh(SERVERS)
+
+-- Any of the three instances may bootstrap the cluster and become leader.
+is_possible_leader = {true, true, true}
+leader_nr = get_leader(is_possible_leader)
+leader = name(leader_nr)
+next_leader_nr = ((leader_nr - 1) % 3 + 1) % 3 + 1 -- {1, 2, 3} -> {2, 3, 1}
+next_leader = name(next_leader_nr)
+other_nr = ((leader_nr - 1) % 3 + 2) % 3 + 1 -- {1, 2, 3} -> {3, 1, 2}
+other = name(other_nr)
+
+test_run:switch(leader)
+box.ctl.wait_rw()
+_ = box.schema.space.create('test', {is_sync=true})
+_ = box.space.test:create_index('pk')
+box.space.test:insert{1}
+
+-- Simulate a situation when the instance which will become the next leader
+-- doesn't know of unconfirmed rows. It should roll them back anyways and do not
+-- accept them once they actually appear from the old leader.
+-- So, stop the instance which'll be the next leader.
+test_run:switch('default')
+test_run:cmd('stop server '..next_leader)
+test_run:switch(leader)
+-- Insert some unconfirmed data.
+box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=1000}
+fib = require('fiber').create(box.space.test.insert, box.space.test, {2})
+fib:status()
+
+-- 'other', 'leader', 'next_leader' are defined on 'default' node, hence the
+-- double switches.
+test_run:switch('default')
+test_run:switch(other)
+-- Wait until the rows are replicated to the other instance.
+test_run:wait_cond(function() return box.space.test:get{2} ~= nil end)
+box.cfg{election_mode='voter'}
+-- Old leader is gone.
+test_run:switch('default')
+test_run:cmd('stop server '..leader)
+is_possible_leader[leader_nr] = false
+
+-- Emulate a situation when next_leader wins the elections. It can't do that in
+-- this configuration, obviously, because it's behind the 'other' node, so set
+-- quorum to 1 and imagine there are 2 more servers which would vote for
+-- next_leader.
+-- Also, make the instance ignore synchronization with other replicas.
+-- Otherwise it would stall for replication_sync_timeout. This is due to the
+-- nature of the test and may be ignored (we restart the instance to simulate
+-- a situation when some rows from the old leader were not received).
+test_run:cmd('start server '..next_leader..' with args="1 0.4 candidate 1"')
+assert(get_leader(is_possible_leader) == next_leader_nr)
+test_run:switch(other)
+-- New leader didn't know about the unconfirmed rows but still rolled them back.
+test_run:wait_cond(function() return box.space.test:get{2} == nil end)
+
+test_run:switch('default')
+test_run:switch(next_leader)
+box.space.test:select{} -- 1
+box.cfg{election_mode='voter'}
+-- Old leader returns and old unconfirmed rows from it must be ignored.
+test_run:switch('default')
+-- Make old leader win the elections.
+test_run:cmd('start server '..leader..' with args="1 0.4 candidate 1"')
+is_possible_leader[leader_nr] = true
+assert(get_leader(is_possible_leader) == leader_nr)
+test_run:switch(next_leader)
+box.space.test:select{} -- 1
+test_run:wait_upstream(1, {status='follow'})
+
+-- Cleanup.
+test_run:switch('default')
+test_run:drop_cluster(SERVERS)
diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
index aff5fda26..0a270d3d6 100644
--- a/test/replication/suite.cfg
+++ b/test/replication/suite.cfg
@@ -17,6 +17,7 @@
      "gh-4424-misc-orphan-on-reconfiguration-error.test.lua": {},
      "gh-5426-election-on-off.test.lua": {},
      "gh-5433-election-restart-recovery.test.lua": {},
+    "gh-5445-leader-incosistency.test.lua": {},
      "gh-5506-election-on-off.test.lua": {},
      "once.test.lua": {},
      "on_replace.test.lua": {},

-- 
Serge Petrenko



Thread overview: 16+ messages
2021-04-11 17:55 [Tarantool-patches] [PATCH 0/9] raft: introduce manual elections and fix a bug with re-applying rolled back transactions Serge Petrenko via Tarantool-patches
2021-04-11 17:55 ` [Tarantool-patches] [PATCH 1/9] wal: enrich row's meta information with sync replication flags Serge Petrenko via Tarantool-patches
2021-04-12 13:06   ` Cyrill Gorcunov via Tarantool-patches
2021-04-13 13:26     ` Serge Petrenko via Tarantool-patches
2021-04-12 19:21   ` Serge Petrenko via Tarantool-patches
2021-04-11 17:55 ` [Tarantool-patches] [PATCH 2/9] xrow: introduce a PROMOTE entry Serge Petrenko via Tarantool-patches
2021-04-11 17:55 ` [Tarantool-patches] [PATCH 3/9] box: actualise iproto_key_type array Serge Petrenko via Tarantool-patches
2021-04-11 17:55 ` [Tarantool-patches] [PATCH 4/9] box: make clear_synchro_queue() write a PROMOTE entry instead of CONFIRM + ROLLBACK Serge Petrenko via Tarantool-patches
2021-04-11 17:56 ` [Tarantool-patches] [PATCH 5/9] box: write PROMOTE even for empty limbo Serge Petrenko via Tarantool-patches
2021-04-11 17:56 ` [Tarantool-patches] [PATCH 6/9] raft: keep track of greatest known term and filter replication sources based on that Serge Petrenko via Tarantool-patches
2021-04-12 19:23   ` Serge Petrenko via Tarantool-patches [this message]
2021-04-11 17:56 ` [Tarantool-patches] [PATCH 7/9] replication: introduce a new election mode: "manual" Serge Petrenko via Tarantool-patches
2021-04-11 17:56 ` [Tarantool-patches] [PATCH 8/9] Support manual elections in `box.ctl.clear_synchro_queue()` Serge Petrenko via Tarantool-patches
2021-04-12 19:23   ` Serge Petrenko via Tarantool-patches
2021-04-11 17:56 ` [Tarantool-patches] [PATCH 9/9] box.ctl: rename clear_synchro_queue to promote Serge Petrenko via Tarantool-patches
2021-04-12 19:24   ` Serge Petrenko via Tarantool-patches
