Tarantool development patches archive
 help / color / mirror / Atom feed
* [Tarantool-patches] [PATCH] raft: make sure the leader stays ro till it clears the limbo
@ 2020-11-24 13:18 Serge Petrenko
  2020-11-24 14:14 ` Cyrill Gorcunov
  2020-11-26 21:01 ` Vladislav Shpilevoy
  0 siblings, 2 replies; 7+ messages in thread
From: Serge Petrenko @ 2020-11-24 13:18 UTC (permalink / raw)
  To: v.shpilevoy, gorcunov; +Cc: tarantool-patches

When running a cluster with leader election, its useful to wait till the
instance is writeable to determine that it has become a leader. However,
sometimes the instance fails to write data right after transitioning to
leader because its limbo still contains pending transactions from the
old leader. Make sure the instance deals with pending transactions first
and becomes writeable only once the limbo is empty.

Closes #5440
---
https://github.com/tarantool/tarantool/issues/5440
sp/gh-5440-on-state-update

 src/box/raft.c                                  | 17 ++++++++++-------
 test/replication/election_qsync.result          | 10 ++++++++--
 test/replication/election_qsync.test.lua        |  6 ++++--
 test/replication/election_qsync_stress.result   |  4 ++++
 test/replication/election_qsync_stress.test.lua |  2 ++
 5 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/src/box/raft.c b/src/box/raft.c
index 6d98f5645..fa2396b4f 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -89,9 +89,12 @@ box_raft_update_synchro_queue(struct raft *raft)
 	 * If the node became a leader, it means it will ignore all records from
 	 * all the other nodes, and won't get late CONFIRM messages anyway. Can
 	 * clear the queue without waiting for confirmations.
+	 * Once the queue is empty it's safe to become writeable.
 	 */
-	if (raft->state == RAFT_STATE_LEADER)
+	if (raft->state == RAFT_STATE_LEADER) {
 		box_clear_synchro_queue(false);
+		box_update_ro_summary();
+	}
 }
 
 static int
@@ -155,14 +158,14 @@ box_raft_on_update_f(struct trigger *trigger, void *event)
 	struct raft *raft = (struct raft *)event;
 	assert(raft == box_raft());
 	/*
-	 * XXX: in case the instance became a leader, RO must be updated only
-	 * after clearing the synchro queue.
-	 *
-	 * When the instance became a follower, then on the contrary - make it
-	 * read-only ASAP, this is good.
+	 * When the instance becomes a follower, it's good to make it read-only
+	 * ASAP. This way we make sure followers don't write anything.
+	 * However, if the instance is transitioning to leader make it writeable
+	 * only after it clears its synchro queue. It won't be able to process
+	 * requests earlier, anyway.
 	 */
-	box_update_ro_summary();
 	if (raft->state != RAFT_STATE_LEADER)
+		box_update_ro_summary();
 		return 0;
 	/*
 	 * If the node became a leader, time to clear the synchro queue. But it
diff --git a/test/replication/election_qsync.result b/test/replication/election_qsync.result
index cb349efcc..c06400b38 100644
--- a/test/replication/election_qsync.result
+++ b/test/replication/election_qsync.result
@@ -75,7 +75,10 @@ box.cfg{
  | ---
  | ...
 
-test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+box.ctl.wait_rw()
+ | ---
+ | ...
+assert(box.info.election.state == 'leader')
  | ---
  | - true
  | ...
@@ -130,7 +133,10 @@ box.cfg{
  | ---
  | ...
 
-test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+box.ctl.wait_rw()
+ | ---
+ | ...
+assert(box.info.election.state == 'leader')
  | ---
  | - true
  | ...
diff --git a/test/replication/election_qsync.test.lua b/test/replication/election_qsync.test.lua
index eb89e5b79..ea6fc4a61 100644
--- a/test/replication/election_qsync.test.lua
+++ b/test/replication/election_qsync.test.lua
@@ -39,7 +39,8 @@ box.cfg{
     replication_timeout = 0.1,                                                  \
 }
 
-test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+box.ctl.wait_rw()
+assert(box.info.election.state == 'leader')
 lsn = box.info.lsn
 _ = fiber.create(function()                                                     \
     ok, err = pcall(box.space.test.replace, box.space.test, {1})                \
@@ -69,7 +70,8 @@ box.cfg{
     replication_timeout = 0.01,                                                 \
 }
 
-test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+box.ctl.wait_rw()
+assert(box.info.election.state == 'leader')
 _ = box.space.test:replace{2}
 box.space.test:select{}
 box.space.test:drop()
diff --git a/test/replication/election_qsync_stress.result b/test/replication/election_qsync_stress.result
index 9fab2f1d7..1380c910b 100644
--- a/test/replication/election_qsync_stress.result
+++ b/test/replication/election_qsync_stress.result
@@ -69,6 +69,9 @@ leader_port = test_run:eval(old_leader, 'box.cfg.listen')[1]
 c = netbox.connect(leader_port)
  | ---
  | ...
+c:eval('box.ctl.wait_rw()')
+ | ---
+ | ...
 
 _ = c:eval('box.schema.space.create("test", {is_sync=true})')
  | ---
@@ -94,6 +97,7 @@ for i = 1,10 do
     leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1]
     c = netbox.connect(leader_port)
     c:eval('box.cfg{replication_synchro_timeout=1000}')
+    c:eval('box.ctl.wait_rw()')
     c.space._schema:replace{'smth'}
     c.space.test:get{i}
     test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2 0.4"')
diff --git a/test/replication/election_qsync_stress.test.lua b/test/replication/election_qsync_stress.test.lua
index 0ba15eef7..d70601cc5 100644
--- a/test/replication/election_qsync_stress.test.lua
+++ b/test/replication/election_qsync_stress.test.lua
@@ -40,6 +40,7 @@ old_leader_nr = get_leader(nrs)
 old_leader = 'election_replica'..old_leader_nr
 leader_port = test_run:eval(old_leader, 'box.cfg.listen')[1]
 c = netbox.connect(leader_port)
+c:eval('box.ctl.wait_rw()')
 
 _ = c:eval('box.schema.space.create("test", {is_sync=true})')
 _ = c:eval('box.space.test:create_index("pk")')
@@ -58,6 +59,7 @@ for i = 1,10 do
     leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1]
     c = netbox.connect(leader_port)
     c:eval('box.cfg{replication_synchro_timeout=1000}')
+    c:eval('box.ctl.wait_rw()')
     c.space._schema:replace{'smth'}
     c.space.test:get{i}
     test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2 0.4"')
-- 
2.24.3 (Apple Git-128)

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2020-11-30  9:40 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-24 13:18 [Tarantool-patches] [PATCH] raft: make sure the leader stays ro till it clears the limbo Serge Petrenko
2020-11-24 14:14 ` Cyrill Gorcunov
2020-11-25  8:48   ` Serge Petrenko
2020-11-26 21:01 ` Vladislav Shpilevoy
2020-11-27 13:00   ` Serge Petrenko
2020-11-27 22:10     ` Vladislav Shpilevoy
2020-11-30  9:40       ` Serge Petrenko

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox