Tarantool development patches archive
 help / color / mirror / Atom feed
From: Serge Petrenko <sergepetrenko@tarantool.org>
To: v.shpilevoy@tarantool.org, gorcunov@gmail.com
Cc: tarantool-patches@dev.tarantool.org
Subject: [Tarantool-patches] [PATCH] raft: make sure the leader stays ro till it clears the limbo
Date: Tue, 24 Nov 2020 16:18:20 +0300	[thread overview]
Message-ID: <20201124131820.32981-1-sergepetrenko@tarantool.org> (raw)

When running a cluster with leader election, its useful to wait till the
instance is writeable to determine that it has become a leader. However,
sometimes the instance fails to write data right after transitioning to
leader because its limbo still contains pending transactions from the
old leader. Make sure the instance deals with pending transactions first
and becomes writeable only once the limbo is empty.

Closes #5440
---
https://github.com/tarantool/tarantool/issues/5440
sp/gh-5440-on-state-update

 src/box/raft.c                                  | 17 ++++++++++-------
 test/replication/election_qsync.result          | 10 ++++++++--
 test/replication/election_qsync.test.lua        |  6 ++++--
 test/replication/election_qsync_stress.result   |  4 ++++
 test/replication/election_qsync_stress.test.lua |  2 ++
 5 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/src/box/raft.c b/src/box/raft.c
index 6d98f5645..fa2396b4f 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -89,9 +89,12 @@ box_raft_update_synchro_queue(struct raft *raft)
 	 * If the node became a leader, it means it will ignore all records from
 	 * all the other nodes, and won't get late CONFIRM messages anyway. Can
 	 * clear the queue without waiting for confirmations.
+	 * Once the queue is empty it's safe to become writeable.
 	 */
-	if (raft->state == RAFT_STATE_LEADER)
+	if (raft->state == RAFT_STATE_LEADER) {
 		box_clear_synchro_queue(false);
+		box_update_ro_summary();
+	}
 }
 
 static int
@@ -155,14 +158,14 @@ box_raft_on_update_f(struct trigger *trigger, void *event)
 	struct raft *raft = (struct raft *)event;
 	assert(raft == box_raft());
 	/*
-	 * XXX: in case the instance became a leader, RO must be updated only
-	 * after clearing the synchro queue.
-	 *
-	 * When the instance became a follower, then on the contrary - make it
-	 * read-only ASAP, this is good.
+	 * When the instance becomes a follower, it's good to make it read-only
+	 * ASAP. This way we make sure followers don't write anything.
+	 * However, if the instance is transitioning to leader make it writeable
+	 * only after it clears its synchro queue. It won't be able to process
+	 * requests earlier, anyway.
 	 */
-	box_update_ro_summary();
 	if (raft->state != RAFT_STATE_LEADER)
+		box_update_ro_summary();
 		return 0;
 	/*
 	 * If the node became a leader, time to clear the synchro queue. But it
diff --git a/test/replication/election_qsync.result b/test/replication/election_qsync.result
index cb349efcc..c06400b38 100644
--- a/test/replication/election_qsync.result
+++ b/test/replication/election_qsync.result
@@ -75,7 +75,10 @@ box.cfg{
  | ---
  | ...
 
-test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+box.ctl.wait_rw()
+ | ---
+ | ...
+assert(box.info.election.state == 'leader')
  | ---
  | - true
  | ...
@@ -130,7 +133,10 @@ box.cfg{
  | ---
  | ...
 
-test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+box.ctl.wait_rw()
+ | ---
+ | ...
+assert(box.info.election.state == 'leader')
  | ---
  | - true
  | ...
diff --git a/test/replication/election_qsync.test.lua b/test/replication/election_qsync.test.lua
index eb89e5b79..ea6fc4a61 100644
--- a/test/replication/election_qsync.test.lua
+++ b/test/replication/election_qsync.test.lua
@@ -39,7 +39,8 @@ box.cfg{
     replication_timeout = 0.1,                                                  \
 }
 
-test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+box.ctl.wait_rw()
+assert(box.info.election.state == 'leader')
 lsn = box.info.lsn
 _ = fiber.create(function()                                                     \
     ok, err = pcall(box.space.test.replace, box.space.test, {1})                \
@@ -69,7 +70,8 @@ box.cfg{
     replication_timeout = 0.01,                                                 \
 }
 
-test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+box.ctl.wait_rw()
+assert(box.info.election.state == 'leader')
 _ = box.space.test:replace{2}
 box.space.test:select{}
 box.space.test:drop()
diff --git a/test/replication/election_qsync_stress.result b/test/replication/election_qsync_stress.result
index 9fab2f1d7..1380c910b 100644
--- a/test/replication/election_qsync_stress.result
+++ b/test/replication/election_qsync_stress.result
@@ -69,6 +69,9 @@ leader_port = test_run:eval(old_leader, 'box.cfg.listen')[1]
 c = netbox.connect(leader_port)
  | ---
  | ...
+c:eval('box.ctl.wait_rw()')
+ | ---
+ | ...
 
 _ = c:eval('box.schema.space.create("test", {is_sync=true})')
  | ---
@@ -94,6 +97,7 @@ for i = 1,10 do
     leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1]
     c = netbox.connect(leader_port)
     c:eval('box.cfg{replication_synchro_timeout=1000}')
+    c:eval('box.ctl.wait_rw()')
     c.space._schema:replace{'smth'}
     c.space.test:get{i}
     test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2 0.4"')
diff --git a/test/replication/election_qsync_stress.test.lua b/test/replication/election_qsync_stress.test.lua
index 0ba15eef7..d70601cc5 100644
--- a/test/replication/election_qsync_stress.test.lua
+++ b/test/replication/election_qsync_stress.test.lua
@@ -40,6 +40,7 @@ old_leader_nr = get_leader(nrs)
 old_leader = 'election_replica'..old_leader_nr
 leader_port = test_run:eval(old_leader, 'box.cfg.listen')[1]
 c = netbox.connect(leader_port)
+c:eval('box.ctl.wait_rw()')
 
 _ = c:eval('box.schema.space.create("test", {is_sync=true})')
 _ = c:eval('box.space.test:create_index("pk")')
@@ -58,6 +59,7 @@ for i = 1,10 do
     leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1]
     c = netbox.connect(leader_port)
     c:eval('box.cfg{replication_synchro_timeout=1000}')
+    c:eval('box.ctl.wait_rw()')
     c.space._schema:replace{'smth'}
     c.space.test:get{i}
     test_run:cmd('start server '..old_leader..' with wait=True, wait_load=True, args="2 0.4"')
-- 
2.24.3 (Apple Git-128)

             reply	other threads:[~2020-11-24 13:18 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-11-24 13:18 Serge Petrenko [this message]
2020-11-24 14:14 ` Cyrill Gorcunov
2020-11-25  8:48   ` Serge Petrenko
2020-11-26 21:01 ` Vladislav Shpilevoy
2020-11-27 13:00   ` Serge Petrenko
2020-11-27 22:10     ` Vladislav Shpilevoy
2020-11-30  9:40       ` Serge Petrenko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201124131820.32981-1-sergepetrenko@tarantool.org \
    --to=sergepetrenko@tarantool.org \
    --cc=gorcunov@gmail.com \
    --cc=tarantool-patches@dev.tarantool.org \
    --cc=v.shpilevoy@tarantool.org \
    --subject='Re: [Tarantool-patches] [PATCH] raft: make sure the leader stays ro till it clears the limbo' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox