[Tarantool-patches] [PATCH 2/2] raft: fix crash on candidate cfg during WAL write

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Sat Nov 7 19:45:32 MSK 2020


Raft state machine crashed if it was configured to be a candidate
during a WAL write with a known leader.

It tried to start waiting for the leader death, but should have
waited for the WAL write end first.

The code tried to handle it, but the order of 'if' conditions was
wrong. WAL write being in progress was checked last, but should
have been checked first.

Closes #5506
---
 src/box/raft.c                                | 10 +--
 .../gh-5506-election-on-off.result            | 85 +++++++++++++++++++
 .../gh-5506-election-on-off.test.lua          | 37 ++++++++
 3 files changed, 127 insertions(+), 5 deletions(-)

diff --git a/src/box/raft.c b/src/box/raft.c
index 3a99a0f26..8f8b59ba6 100644
--- a/src/box/raft.c
+++ b/src/box/raft.c
@@ -960,17 +960,17 @@ raft_cfg_is_candidate(bool is_candidate)
 
 	if (raft.is_candidate) {
 		assert(raft.state == RAFT_STATE_FOLLOWER);
-		if (raft.leader != 0) {
-			raft_sm_wait_leader_dead();
-		} else if (raft_is_fully_on_disk()) {
-			raft_sm_wait_leader_found();
-		} else {
+		if (raft.is_write_in_progress) {
 			/*
 			 * If there is an on-going WAL write, it means there was
 			 * some node who sent newer data to this node. So it is
 			 * probably a better candidate. Anyway can't do anything
 			 * until the new state is fully persisted.
 			 */
+		} else if (raft.leader != 0) {
+			raft_sm_wait_leader_dead();
+		} else {
+			raft_sm_wait_leader_found();
 		}
 	} else {
 		if (raft.state != RAFT_STATE_LEADER) {
diff --git a/test/replication/gh-5506-election-on-off.result b/test/replication/gh-5506-election-on-off.result
index 1a718396f..4fbc31986 100644
--- a/test/replication/gh-5506-election-on-off.result
+++ b/test/replication/gh-5506-election-on-off.result
@@ -47,6 +47,91 @@ box.error.injection.set("ERRINJ_WAL_DELAY", false)
  | - ok
  | ...
 
+box.cfg{                                                                        \
+    election_mode = old_election_mode,                                          \
+}
+ | ---
+ | ...
+
+--
+-- Another crash could happen when election mode was configured to be
+-- 'candidate' with a known leader, but there was a not finished WAL write.
+-- The node tried to start waiting for the leader death, even though with an
+-- active WAL write it should wait for its end first.
+--
+box.schema.user.grant('guest', 'super')
+ | ---
+ | ...
+test_run:cmd('create server replica with rpl_master=default,\
+              script="replication/replica.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica with wait=True, wait_load=True')
+ | ---
+ | - true
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.cfg{election_mode = 'voter'}
+ | ---
+ | ...
+box.error.injection.set("ERRINJ_WAL_DELAY_COUNTDOWN", 0)
+ | ---
+ | - ok
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{election_mode = 'candidate'}
+ | ---
+ | ...
+test_run:wait_cond(function()                                                   \
+    return box.info.election.leader ~= 0                                        \
+end)
+ | ---
+ | - true
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+test_run:wait_cond(function()                                                   \
+    return box.error.injection.get("ERRINJ_WAL_DELAY")                          \
+end)
+ | ---
+ | - true
+ | ...
+box.cfg{election_mode = 'candidate'}
+ | ---
+ | ...
+box.error.injection.set("ERRINJ_WAL_DELAY", false)
+ | ---
+ | - ok
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
+
+box.schema.user.revoke('guest', 'super')
+ | ---
+ | ...
 box.cfg{                                                                        \
     election_mode = old_election_mode,                                          \
     replication_timeout = old_replication_timeout,                              \
diff --git a/test/replication/gh-5506-election-on-off.test.lua b/test/replication/gh-5506-election-on-off.test.lua
index 290408f06..bb89477d1 100644
--- a/test/replication/gh-5506-election-on-off.test.lua
+++ b/test/replication/gh-5506-election-on-off.test.lua
@@ -25,6 +25,43 @@ box.cfg{election_mode = 'off'}
 box.cfg{election_mode = 'candidate'}
 box.error.injection.set("ERRINJ_WAL_DELAY", false)
 
+box.cfg{                                                                        \
+    election_mode = old_election_mode,                                          \
+}
+
+--
+-- Another crash could happen when election mode was configured to be
+-- 'candidate' with a known leader, but there was a not finished WAL write.
+-- The node tried to start waiting for the leader death, even though with an
+-- active WAL write it should wait for its end first.
+--
+box.schema.user.grant('guest', 'super')
+test_run:cmd('create server replica with rpl_master=default,\
+              script="replication/replica.lua"')
+test_run:cmd('start server replica with wait=True, wait_load=True')
+
+test_run:switch('replica')
+box.cfg{election_mode = 'voter'}
+box.error.injection.set("ERRINJ_WAL_DELAY_COUNTDOWN", 0)
+
+test_run:switch('default')
+box.cfg{election_mode = 'candidate'}
+test_run:wait_cond(function()                                                   \
+    return box.info.election.leader ~= 0                                        \
+end)
+
+test_run:switch('replica')
+test_run:wait_cond(function()                                                   \
+    return box.error.injection.get("ERRINJ_WAL_DELAY")                          \
+end)
+box.cfg{election_mode = 'candidate'}
+box.error.injection.set("ERRINJ_WAL_DELAY", false)
+
+test_run:switch('default')
+test_run:cmd('stop server replica')
+test_run:cmd('delete server replica')
+
+box.schema.user.revoke('guest', 'super')
 box.cfg{                                                                        \
     election_mode = old_election_mode,                                          \
     replication_timeout = old_replication_timeout,                              \
-- 
2.21.1 (Apple Git-122.3)



More information about the Tarantool-patches mailing list