Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
To: tarantool-patches@dev.tarantool.org, sergepetrenko@tarantool.org
Subject: [Tarantool-patches] [PATCH 5/6] raft: auto-commit transactions of the old leader
Date: Wed, 14 Oct 2020 01:28:31 +0200	[thread overview]
Message-ID: <b6e57eecad25c845243568e613ead9c78bacb181.1602631481.git.v.shpilevoy@tarantool.org> (raw)
In-Reply-To: <cover.1602631481.git.v.shpilevoy@tarantool.org>

According to Raft, when a new leader is elected, it should finish
transactions of the old leader. In Raft this is done via adding a
new transaction originated from the new leader.

In case of Tarantool this can be done without a new transaction
due to WAL format specifics, and the function doing it is called
box_clear_synchro_queue().

Before the patch, when a node was elected as a leader, it didn't
finish the pending transactions. The queue clearance was expected
to be done by a user. There was no any issue with that, just
technical debt. The patch fixes it.

Now when a node becomes a leader, it finished synchronous
transactions of the old leader. This is done a bit differently
than in the public box.ctl.clear_synchro_queue().

The public box.ctl.clear_synchro_queue() tries to wait for CONFIRM
messages, which may be late. For replication_synchro_timeout * 2
time.

But when a new leader is elected, the leader will ignore all rows
from all the other nodes, as it thinks it is the only source of
truth. Therefore it makes no sense to wait for CONFIRMs here, and
the waiting is omitted.

Closes #5339
---
 src/box/box.cc                                |  42 ++++-
 src/box/box.h                                 |   3 +-
 src/box/lua/ctl.c                             |   2 +-
 test/replication/election_qsync.result        | 153 ++++++++++++++++++
 test/replication/election_qsync.test.lua      |  76 +++++++++
 test/replication/election_qsync_stress.result |   1 -
 .../election_qsync_stress.test.lua            |   1 -
 7 files changed, 266 insertions(+), 12 deletions(-)
 create mode 100644 test/replication/election_qsync.result
 create mode 100644 test/replication/election_qsync.test.lua

diff --git a/src/box/box.cc b/src/box/box.cc
index 566cb41ef..18568df3b 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -152,6 +152,11 @@ static struct fiber_pool tx_fiber_pool;
  * are too many messages in flight (gh-1892).
  */
 static struct cbus_endpoint tx_prio_endpoint;
+/**
+ * A trigger executed each time the Raft state machine updates any
+ * of its visible attributes.
+ */
+static struct trigger box_raft_on_update;
 
 void
 box_update_ro_summary(void)
@@ -1005,7 +1010,7 @@ box_set_replication_anon(void)
 }
 
 void
-box_clear_synchro_queue(void)
+box_clear_synchro_queue(bool try_wait)
 {
 	if (!is_box_configured || txn_limbo_is_empty(&txn_limbo))
 		return;
@@ -1014,13 +1019,15 @@ box_clear_synchro_queue(void)
 	if (former_leader_id == instance_id)
 		return;
 
-	/* Wait until pending confirmations/rollbacks reach us. */
-	double timeout = 2 * replication_synchro_timeout;
-	double start_tm = fiber_clock();
-	while (!txn_limbo_is_empty(&txn_limbo)) {
-		if (fiber_clock() - start_tm > timeout)
-			break;
-		fiber_sleep(0.001);
+	if (try_wait) {
+		/* Wait until pending confirmations/rollbacks reach us. */
+		double timeout = 2 * replication_synchro_timeout;
+		double start_tm = fiber_clock();
+		while (!txn_limbo_is_empty(&txn_limbo)) {
+			if (fiber_clock() - start_tm > timeout)
+				break;
+			fiber_sleep(0.001);
+		}
 	}
 
 	if (!txn_limbo_is_empty(&txn_limbo)) {
@@ -1053,6 +1060,22 @@ box_clear_synchro_queue(void)
 	}
 }
 
+static int
+box_raft_on_update_f(struct trigger *trigger, void *event)
+{
+	(void)trigger;
+	(void)event;
+	if (raft.state != RAFT_STATE_LEADER)
+		return 0;
+	/*
+	 * When the node became a leader, it means it will ignore all records
+	 * from all the other nodes, and won't get late CONFIRM messages anyway.
+	 * Can clear the queue without waiting for confirmations.
+	 */
+	box_clear_synchro_queue(false);
+	return 0;
+}
+
 void
 box_listen(void)
 {
@@ -2633,6 +2656,9 @@ box_init(void)
 	txn_limbo_init();
 	sequence_init();
 	raft_init();
+
+	trigger_create(&box_raft_on_update, box_raft_on_update_f, NULL, NULL);
+	raft_on_update(&box_raft_on_update);
 }
 
 bool
diff --git a/src/box/box.h b/src/box/box.h
index a151fb8f1..b47a220b7 100644
--- a/src/box/box.h
+++ b/src/box/box.h
@@ -266,7 +266,8 @@ extern "C" {
 
 typedef struct tuple box_tuple_t;
 
-void box_clear_synchro_queue(void);
+void
+box_clear_synchro_queue(bool try_wait);
 
 /* box_select is private and used only by FFI */
 API_EXPORT int
diff --git a/src/box/lua/ctl.c b/src/box/lua/ctl.c
index 2017ddc18..bf26465e6 100644
--- a/src/box/lua/ctl.c
+++ b/src/box/lua/ctl.c
@@ -82,7 +82,7 @@ static int
 lbox_ctl_clear_synchro_queue(struct lua_State *L)
 {
 	(void) L;
-	box_clear_synchro_queue();
+	box_clear_synchro_queue(true);
 	return 0;
 }
 
diff --git a/test/replication/election_qsync.result b/test/replication/election_qsync.result
new file mode 100644
index 000000000..5dc9d182a
--- /dev/null
+++ b/test/replication/election_qsync.result
@@ -0,0 +1,153 @@
+-- test-run result file version 2
+test_run = require('test_run').new()
+ | ---
+ | ...
+box.schema.user.grant('guest', 'super')
+ | ---
+ | ...
+
+old_election_mode = box.cfg.election_mode
+ | ---
+ | ...
+old_replication_synchro_timeout = box.cfg.replication_synchro_timeout
+ | ---
+ | ...
+old_replication_timeout = box.cfg.replication_timeout
+ | ---
+ | ...
+old_replication = box.cfg.replication
+ | ---
+ | ...
+
+test_run:cmd('create server replica with rpl_master=default,\
+              script="replication/replica.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica with wait=True, wait_load=True')
+ | ---
+ | - true
+ | ...
+-- Any election activities require fullmesh.
+box.cfg{replication = test_run:eval('replica', 'box.cfg.listen')[1]}
+ | ---
+ | ...
+
+--
+-- gh-5339: leader election manages transaction limbo automatically.
+--
+-- Idea of the test is that there are 2 nodes. A leader and a
+-- follower. The leader creates a synchronous transaction, it gets
+-- replicated to the follower, the leader dies. Now when the
+-- follower is elected as a new leader, it should finish the
+-- pending transaction.
+--
+_ = box.schema.create_space('test', {is_sync = true})
+ | ---
+ | ...
+_ = _:create_index('pk')
+ | ---
+ | ...
+box.cfg{election_mode = 'voter'}
+ | ---
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+fiber = require('fiber')
+ | ---
+ | ...
+-- Replication timeout is small to speed up a first election start.
+box.cfg{                                                                        \
+    election_mode = 'candidate',                                                \
+    replication_synchro_quorum = 3,                                             \
+    replication_timeout = 0.1,                                                  \
+}
+ | ---
+ | ...
+
+test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+ | ---
+ | - true
+ | ...
+lsn = box.info.lsn
+ | ---
+ | ...
+_ = fiber.create(function()                                                     \
+    ok, err = pcall(box.space.test.replace, box.space.test, {1})                \
+end)
+ | ---
+ | ...
+-- Wait WAL write.
+test_run:wait_cond(function() return box.info.lsn > lsn end)
+ | ---
+ | - true
+ | ...
+-- Wait replication to the other instance.
+test_run:wait_lsn('default', 'replica')
+ | ---
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+-- Will fail - the node is not a leader.
+box.space.test:replace{2}
+ | ---
+ | - error: Can't modify data because this instance is in read-only mode.
+ | ...
+
+-- Set synchro timeout to a huge value to ensure, that when a leader is elected,
+-- it won't wait for this timeout.
+box.cfg{replication_synchro_timeout = 1000000}
+ | ---
+ | ...
+
+-- Configure separately from synchro timeout not to depend on the order of
+-- synchro and election options appliance. Replication timeout is tiny to speed
+-- up notice of the old leader death.
+box.cfg{                                                                        \
+    election_mode = 'candidate',                                                \
+    replication_timeout = 0.01,                                                 \
+}
+ | ---
+ | ...
+
+test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+ | ---
+ | - true
+ | ...
+_ = box.space.test:replace{2}
+ | ---
+ | ...
+box.space.test:select{}
+ | ---
+ | - - [1]
+ |   - [2]
+ | ...
+box.space.test:drop()
+ | ---
+ | ...
+
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
+box.cfg{                                                                        \
+    election_mode = old_election_mode,                                          \
+    replication_timeout = old_replication_timeout,                              \
+    replication = old_replication,                                              \
+    replication_synchro_timeout = old_replication_synchro_timeout,              \
+}
+ | ---
+ | ...
+box.schema.user.revoke('guest', 'super')
+ | ---
+ | ...
diff --git a/test/replication/election_qsync.test.lua b/test/replication/election_qsync.test.lua
new file mode 100644
index 000000000..69ef33da2
--- /dev/null
+++ b/test/replication/election_qsync.test.lua
@@ -0,0 +1,76 @@
+test_run = require('test_run').new()
+box.schema.user.grant('guest', 'super')
+
+old_election_mode = box.cfg.election_mode
+old_replication_synchro_timeout = box.cfg.replication_synchro_timeout
+old_replication_timeout = box.cfg.replication_timeout
+old_replication = box.cfg.replication
+
+test_run:cmd('create server replica with rpl_master=default,\
+              script="replication/replica.lua"')
+test_run:cmd('start server replica with wait=True, wait_load=True')
+-- Any election activities require fullmesh.
+box.cfg{replication = test_run:eval('replica', 'box.cfg.listen')[1]}
+
+--
+-- gh-5339: leader election manages transaction limbo automatically.
+--
+-- Idea of the test is that there are 2 nodes. A leader and a
+-- follower. The leader creates a synchronous transaction, it gets
+-- replicated to the follower, the leader dies. Now when the
+-- follower is elected as a new leader, it should finish the
+-- pending transaction.
+--
+_ = box.schema.create_space('test', {is_sync = true})
+_ = _:create_index('pk')
+box.cfg{election_mode = 'voter'}
+
+test_run:switch('replica')
+fiber = require('fiber')
+-- Replication timeout is small to speed up a first election start.
+box.cfg{                                                                        \
+    election_mode = 'candidate',                                                \
+    replication_synchro_quorum = 3,                                             \
+    replication_timeout = 0.1,                                                  \
+}
+
+test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+lsn = box.info.lsn
+_ = fiber.create(function()                                                     \
+    ok, err = pcall(box.space.test.replace, box.space.test, {1})                \
+end)
+-- Wait WAL write.
+test_run:wait_cond(function() return box.info.lsn > lsn end)
+-- Wait replication to the other instance.
+test_run:wait_lsn('default', 'replica')
+
+test_run:switch('default')
+test_run:cmd('stop server replica')
+-- Will fail - the node is not a leader.
+box.space.test:replace{2}
+
+-- Set synchro timeout to a huge value to ensure, that when a leader is elected,
+-- it won't wait for this timeout.
+box.cfg{replication_synchro_timeout = 1000000}
+
+-- Configure separately from synchro timeout not to depend on the order of
+-- synchro and election options appliance. Replication timeout is tiny to speed
+-- up notice of the old leader death.
+box.cfg{                                                                        \
+    election_mode = 'candidate',                                                \
+    replication_timeout = 0.01,                                                 \
+}
+
+test_run:wait_cond(function() return box.info.election.state == 'leader' end)
+_ = box.space.test:replace{2}
+box.space.test:select{}
+box.space.test:drop()
+
+test_run:cmd('delete server replica')
+box.cfg{                                                                        \
+    election_mode = old_election_mode,                                          \
+    replication_timeout = old_replication_timeout,                              \
+    replication = old_replication,                                              \
+    replication_synchro_timeout = old_replication_synchro_timeout,              \
+}
+box.schema.user.revoke('guest', 'super')
diff --git a/test/replication/election_qsync_stress.result b/test/replication/election_qsync_stress.result
index 9497b37bf..bb419c3bf 100644
--- a/test/replication/election_qsync_stress.result
+++ b/test/replication/election_qsync_stress.result
@@ -93,7 +93,6 @@ for i = 1,10 do
     new_leader = 'election_replica'..new_leader_nr
     leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1]
     c = netbox.connect(leader_port)
-    c:eval('box.ctl.clear_synchro_queue()')
     c:eval('box.cfg{replication_synchro_timeout=1000}')
     c.space._schema:replace{'smth'}
     c.space.test:get{i}
diff --git a/test/replication/election_qsync_stress.test.lua b/test/replication/election_qsync_stress.test.lua
index bca1b20c7..2f379efdb 100644
--- a/test/replication/election_qsync_stress.test.lua
+++ b/test/replication/election_qsync_stress.test.lua
@@ -57,7 +57,6 @@ for i = 1,10 do
     new_leader = 'election_replica'..new_leader_nr
     leader_port = test_run:eval(new_leader, 'box.cfg.listen')[1]
     c = netbox.connect(leader_port)
-    c:eval('box.ctl.clear_synchro_queue()')
     c:eval('box.cfg{replication_synchro_timeout=1000}')
     c.space._schema:replace{'smth'}
     c.space.test:get{i}
-- 
2.21.1 (Apple Git-122.3)

  parent reply	other threads:[~2020-10-13 23:28 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-10-13 23:28 [Tarantool-patches] [PATCH 0/6] Raft auto-commit Vladislav Shpilevoy
2020-10-13 23:28 ` [Tarantool-patches] [PATCH 1/6] test: add '_stress' suffix to election_qsync test Vladislav Shpilevoy
2020-10-13 23:28 ` [Tarantool-patches] [PATCH 2/6] raft: factor out the code to wakeup worker fiber Vladislav Shpilevoy
2020-10-14 13:29   ` Cyrill Gorcunov
2020-10-14 22:40     ` Vladislav Shpilevoy
2020-10-15  6:50       ` Cyrill Gorcunov
2020-10-13 23:28 ` [Tarantool-patches] [PATCH 3/6] raft: new candidate should wait for leader death Vladislav Shpilevoy
2020-10-13 23:28 ` [Tarantool-patches] [PATCH 4/6] raft: introduce on_update trigger Vladislav Shpilevoy
2020-10-13 23:28 ` Vladislav Shpilevoy [this message]
2020-10-13 23:28 ` [Tarantool-patches] [PATCH 6/6] qsync: reset confirmed lsn in limbo on owner change Vladislav Shpilevoy
2020-11-24 23:23   ` Vladislav Shpilevoy
2020-10-14  7:34 ` [Tarantool-patches] [PATCH 0/6] Raft auto-commit Serge Petrenko
2020-10-14 22:40 ` Vladislav Shpilevoy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=b6e57eecad25c845243568e613ead9c78bacb181.1602631481.git.v.shpilevoy@tarantool.org \
    --to=v.shpilevoy@tarantool.org \
    --cc=sergepetrenko@tarantool.org \
    --cc=tarantool-patches@dev.tarantool.org \
    --subject='Re: [Tarantool-patches] [PATCH 5/6] raft: auto-commit transactions of the old leader' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox