[Tarantool-patches] [PATCH] raft: make sure the leader stays ro till it clears the limbo

Serge Petrenko sergepetrenko at tarantool.org
Tue Nov 24 16:18:20 MSK 2020


When running a cluster with leader election, its useful to wait till the
instance is writeable to determine that it has become a leader. However,
sometimes the instance fails to write data right after transitioning to
leader because its limbo still contains pending transactions from the
old leader. Make sure the instance deals with pending transactions first
and becomes writeable only once the limbo is empty.

Closes #5440
---
https://github.com/tarantool/tarantool/issues/5440
sp/gh-5440-on-state-update

 src/box/raft.c                                  | 17 ++++++++++-------
 test/replication/election_qsync.result          | 10 ++++++++--
 test/replication/election_qsync.test.lua        |  6 ++++--
 test/replication/election_qsync_stress.result   |  4 ++++
 test/replication/election_qsync_stress.test.lua |  2 ++
 5 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/src/box/raft.c b/src/box/raft.c
index 6d98f5645..fa2396b4f 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -89,9 +89,12 @@ box_raft_update_synchro_queue(struct raft *raft)
 	 * If the node became a leader, it means it will ignore all records from
 	 * all the other nodes, and won't get late CONFIRM messages anyway. Can
 	 * clear the queue without waiting for confirmations.
+	 * Once the queue is empty it's safe to become writeable.
 	 */
-	if (raft->state == RAFT_STATE_LEADER)
+	if (raft->state == RAFT_STATE_LEADER) {
 		box_clear_synchro_queue(false);
+		box_update_ro_summary();
+	}
 }
 
 static int
@@ -155,14 +158,14 @@ box_raft_on_update_f(struct trigger *trigger, void *event)
 	struct raft *raft = (struct raft *)event;
 	assert(raft == box_raft());
 	/*
-	 * XXX: in case the instance became a leader, RO must be updated only
-	 * after clearing the synchro queue.
-	 *
-	 * When the instance became a follower, then on the contrary - make it
-	 * read-only ASAP, this is good.
+	 * When the instance becomes a follower, it's good to make it read-only
+	 * ASAP. This way we make sure followers don't write anything.
+	 * However, if the instance is transitioning to leader make it writeable
+	 * only after it clears its synchro queue. It won't be able to process
+	 * requests earlier, anyway.
 	 */
-	box_update_ro_summary();
 	if (raft->state != RAFT_STATE_LEADER)
+		box_update_ro_summary();
 		return 0;
 	/*
 	 * If the node became a leader, time to clear the synchro queue. But it
diff --git a/test/replication/election_qsync.result b/test/replication/election_qsync.result
index cb349efcc..c06400b38 100644
--- a/test/replication/election_qsync.result
+++ b/test/replication/election_qsync.result
@@ -75,7 +75,10 @@ box.cfg{
  | ---
  | ...
 
-test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+box.ctl.wait_rw()
+ | ---
+ | ...
+assert(box.info.election.state == 'leader')
  | ---
  | - true
  | ...
@@ -130,7 +133,10 @@ box.cfg{
  | ---
  | ...
 
-test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+box.ctl.wait_rw()
+ | ---
+ | ...
+assert(box.info.election.state == 'leader')
  | ---
  | - true
  | ...
diff --git a/test/replication/election_qsync.test.lua b/test/replication/election_qsync.test.lua
index eb89e5b79..ea6fc4a61 100644
--- a/test/replication/election_qsync.test.lua
+++ b/test/replication/election_qsync.test.lua
@@ -39,7 +39,8 @@ box.cfg{
     replication_timeout = 0.1,                                                  \
 }
 
-test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+box.ctl.wait_rw()
+assert(box.info.election.state == 'leader')
 lsn = box.info.lsn
 _ = fiber.create(function()                                                     \
     ok, err = pcall(box.space.test.replace, box.space.test, {1})                \
@@ -69,7 +70,8 @@ box.cfg{
     replication_timeout = 0.01,                                                 \
 }
 
-test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+box.ctl.wait_rw()
+assert(box.info.election.state == 'leader')
 _ = box.space.test:replace{2}
 box.space.test:select{}
 box.space.test:drop()
diff --git a/test/replication/election_qsync_stress.result b/test/replication/election_qsync_stress.result
index 9fab2f1d7..1380c910b 100644
--- a/test/replication/election_qsync_stress.result
+++ b/test/replication/election_qsync_stress.result
@@ -69,6 +69,9 @@ leader_port = test_run:eval(old_leader, 'box.cfg.listen')[1]
 c = netbox.connect(leader_port)
  | ---
  | ...
+c:eval('box.ctl.wait_rw()')
+ | ---
+ | ...
 
 _ = c:eval('box.schema.space.create("test", {is_sync=true})')
  | ---
@@ -94,6 +97,7 @@ for i = 1,10 do
     leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1]
     c = netbox.connect(leader_port)
     c:eval('box.cfg{replication_synchro_timeout=1000}')
+    c:eval('box.ctl.wait_rw()')
     c.space._schema:replace{'smth'}
     c.space.test:get{i}
     test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2 0.4"')
diff --git a/test/replication/election_qsync_stress.test.lua b/test/replication/election_qsync_stress.test.lua
index 0ba15eef7..d70601cc5 100644
--- a/test/replication/election_qsync_stress.test.lua
+++ b/test/replication/election_qsync_stress.test.lua
@@ -40,6 +40,7 @@ old_leader_nr = get_leader(nrs)
 old_leader = 'election_replica'..old_leader_nr
 leader_port = test_run:eval(old_leader, 'box.cfg.listen')[1]
 c = netbox.connect(leader_port)
+c:eval('box.ctl.wait_rw()')
 
 _ = c:eval('box.schema.space.create("test", {is_sync=true})')
 _ = c:eval('box.space.test:create_index("pk")')
@@ -58,6 +59,7 @@ for i = 1,10 do
     leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1]
     c = netbox.connect(leader_port)
     c:eval('box.cfg{replication_synchro_timeout=1000}')
+    c:eval('box.ctl.wait_rw()')
     c.space._schema:replace{'smth'}
     c.space.test:get{i}
     test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2 0.4"')
-- 
2.24.3 (Apple Git-128)



More information about the Tarantool-patches mailing list