[tarantool-patches] [PATCH] replication: add is_orphan field to ballot

Serge Petrenko sergepetrenko at tarantool.org
Mon Sep 30 20:54:21 MSK 2019


A successfully fetched remote instance ballot isn't updated during
bootstrap procedure. This leads to a case when different instances
choose different masters as their bootstrap leaders.

Imagine such a situation.
You start instance A without replication set up. Instance A successfully
bootstraps.
You also have instances B and C both with replication set up to {A, B,
C} and replication_connect_quorum set to 3
You first start instance B. It doesn't proceed to choosing a leader
until one of the events happens: either the replication_connect_timeout
runs out, or instance C is up and starts listening on its port.
B has established connection to A and fetched its ballot, with some
vclock, say, {1: 1}.
B retries connection to C every replication_timeout seconds.
Then you start instance C. Instance C succeeds in connecting to A and B
right away and bootstraps from instance A. Instance A registers C in its
_cluster table. This registration is replicated to instance C.
Meanwhile, instance C is trying to sync with quorum instances (which is
3), and stays in orphan mode.
Now replication_timeout on instance B finally runs out. It retries a
previously unsuccessful connection to C and succeeds. C sends its ballot
to B with vclock = {1: 2, 2:0} (in our example), since it has already
incremented it after _cluster registration.
B sees that C has a greater vclock than A, and chooses to bootstrap from
C instead of A. C is orphan and rejects B's attempt to join. B dies.

To fix such ungentlemanlike behaviour of C, we should at least include
orphan status in ballot and prefer fully bootstrapped instances to the
ones still syncing with other replicas.
We also need to use a separate flag instead of ballot's already existent
is_ro, since we still want to prefer orphan instances over the ones
explicitly configured to be read-only.

Closes #4527
---
https://github.com/tarantool/tarantool/tree/sp/gh-4527-ballot-vclock
https://github.com/tarantool/tarantool/issues/4527

 src/box/box.cc                             |  1 +
 src/box/iproto_constants.h                 |  1 +
 src/box/replication.cc                     | 10 +++-
 src/box/xrow.c                             | 13 ++++-
 src/box/xrow.h                             |  5 ++
 test/replication/bootstrap_leader.result   | 61 ++++++++++++++++++++++
 test/replication/bootstrap_leader.test.lua | 27 ++++++++++
 test/replication/replica_uuid_rw.lua       | 26 +++++++++
 test/replication/replica_uuid_rw1.lua      |  1 +
 test/replication/replica_uuid_rw2.lua      |  1 +
 test/replication/replica_uuid_rw3.lua      |  1 +
 11 files changed, 143 insertions(+), 4 deletions(-)
 create mode 100644 test/replication/bootstrap_leader.result
 create mode 100644 test/replication/bootstrap_leader.test.lua
 create mode 100644 test/replication/replica_uuid_rw.lua
 create mode 120000 test/replication/replica_uuid_rw1.lua
 create mode 120000 test/replication/replica_uuid_rw2.lua
 create mode 120000 test/replication/replica_uuid_rw3.lua

diff --git a/src/box/box.cc b/src/box/box.cc
index 40920e649..0680c7893 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1630,6 +1630,7 @@ void
 box_process_vote(struct ballot *ballot)
 {
 	ballot->is_ro = cfg_geti("read_only") != 0;
+	ballot->is_orphan = is_ro || is_orphan;
 	vclock_copy(&ballot->vclock, &replicaset.vclock);
 	vclock_copy(&ballot->gc_vclock, &gc.vclock);
 }
diff --git a/src/box/iproto_constants.h b/src/box/iproto_constants.h
index 724cce535..7b11568ce 100644
--- a/src/box/iproto_constants.h
+++ b/src/box/iproto_constants.h
@@ -137,6 +137,7 @@ enum iproto_ballot_key {
 	IPROTO_BALLOT_IS_RO = 0x01,
 	IPROTO_BALLOT_VCLOCK = 0x02,
 	IPROTO_BALLOT_GC_VCLOCK = 0x03,
+	IPROTO_BALLOT_IS_ORPHAN = 0x04,
 };
 
 #define bit(c) (1ULL<<IPROTO_##c)
diff --git a/src/box/replication.cc b/src/box/replication.cc
index d691ce487..2ff0bb05f 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -863,8 +863,8 @@ replicaset_next(struct replica *replica)
 }
 
 /**
- * Compare vclock and read only mode of all connected
- * replicas and elect a leader.
+ * Compare vclock, read only mode and orphan status
+ * of all connected replicas and elect a leader.
  * Initiallly, skip read-only replicas, since they
  * can not properly act as bootstrap masters (register
  * new nodes in _cluster table). If there are no read-write
@@ -892,6 +892,12 @@ replicaset_round(bool skip_ro)
 			leader = replica;
 			continue;
 		}
+		/*
+		 * Try to find a replica which has already left
+		 * orphan mode.
+		 */
+		if (applier->ballot.is_orphan && ! leader->applier->ballot.is_orphan)
+			continue;
 		/*
 		 * Choose the replica with the most advanced
 		 * vclock. If there are two or more replicas
diff --git a/src/box/xrow.c b/src/box/xrow.c
index 0ae5271c1..fde4fd2ae 100644
--- a/src/box/xrow.c
+++ b/src/box/xrow.c
@@ -441,8 +441,9 @@ iproto_reply_vote(struct obuf *out, const struct ballot *ballot,
 		  uint64_t sync, uint32_t schema_version)
 {
 	size_t max_size = IPROTO_HEADER_LEN + mp_sizeof_map(1) +
-		mp_sizeof_uint(UINT32_MAX) + mp_sizeof_map(3) +
+		mp_sizeof_uint(UINT32_MAX) + mp_sizeof_map(4) +
 		mp_sizeof_uint(UINT32_MAX) + mp_sizeof_bool(ballot->is_ro) +
+		mp_sizeof_uint(UINT32_MAX) + mp_sizeof_bool(ballot->is_orphan) +
 		mp_sizeof_uint(UINT32_MAX) + mp_sizeof_vclock(&ballot->vclock) +
 		mp_sizeof_uint(UINT32_MAX) + mp_sizeof_vclock(&ballot->gc_vclock);
 
@@ -456,9 +457,11 @@ iproto_reply_vote(struct obuf *out, const struct ballot *ballot,
 	char *data = buf + IPROTO_HEADER_LEN;
 	data = mp_encode_map(data, 1);
 	data = mp_encode_uint(data, IPROTO_BALLOT);
-	data = mp_encode_map(data, 3);
+	data = mp_encode_map(data, 4);
 	data = mp_encode_uint(data, IPROTO_BALLOT_IS_RO);
 	data = mp_encode_bool(data, ballot->is_ro);
+	data = mp_encode_uint(data, IPROTO_BALLOT_IS_ORPHAN);
+	data = mp_encode_bool(data, ballot->is_orphan);
 	data = mp_encode_uint(data, IPROTO_BALLOT_VCLOCK);
 	data = mp_encode_vclock(data, &ballot->vclock);
 	data = mp_encode_uint(data, IPROTO_BALLOT_GC_VCLOCK);
@@ -1077,6 +1080,7 @@ int
 xrow_decode_ballot(struct xrow_header *row, struct ballot *ballot)
 {
 	ballot->is_ro = false;
+	ballot->is_orphan = false;
 	vclock_create(&ballot->vclock);
 
 	const char *start = NULL;
@@ -1121,6 +1125,11 @@ xrow_decode_ballot(struct xrow_header *row, struct ballot *ballot)
 				goto err;
 			ballot->is_ro = mp_decode_bool(&data);
 			break;
+		case IPROTO_BALLOT_IS_ORPHAN:
+			if (mp_typeof(*data) != MP_BOOL)
+				goto err;
+			ballot->is_orphan = mp_decode_bool(&data);
+			break;
 		case IPROTO_BALLOT_VCLOCK:
 			if (mp_decode_vclock(&data, &ballot->vclock) != 0)
 				goto err;
diff --git a/src/box/xrow.h b/src/box/xrow.h
index 35ec06dc0..ae786d10e 100644
--- a/src/box/xrow.h
+++ b/src/box/xrow.h
@@ -275,6 +275,11 @@ xrow_encode_auth(struct xrow_header *row, const char *salt, size_t salt_len,
 struct ballot {
 	/** Set if the instance is running in read-only mode. */
 	bool is_ro;
+	/**
+	 * Set if the instance is in orphan mode, i.e. hasn't
+	 * finished bootstrap yet.
+	 */
+	bool is_orphan;
 	/** Current instance vclock. */
 	struct vclock vclock;
 	/** Oldest vclock available on the instance. */
diff --git a/test/replication/bootstrap_leader.result b/test/replication/bootstrap_leader.result
new file mode 100644
index 000000000..ef4265cd3
--- /dev/null
+++ b/test/replication/bootstrap_leader.result
@@ -0,0 +1,61 @@
+-- test-run result file version 2
+test_run = require('test_run').new()
+ | ---
+ | ...
+
+create_server_cmd = "create server replica%d with script='replication/replica_uuid_rw%d.lua'"
+ | ---
+ | ...
+
+test_run:cmd("setopt delimiter ';'")
+ | ---
+ | - true
+ | ...
+
+for i = 1,3 do
+    test_run:cmd(string.format(create_server_cmd, i, i))
+end;
+ | ---
+ | ...
+
+test_run:cmd("start server replica1 with wait_load=False, wait=False");
+ | ---
+ | - true
+ | ...
+test_run:cmd("start server replica2 with args='1,2,3 1.0 100500 0.1', wait_load=False, wait=False");
+ | ---
+ | - true
+ | ...
+test_run:cmd("start server replica3 with args='1,2,3 0.1 0.5 100500', wait_load=False, wait=False");
+ | ---
+ | - true
+ | ...
+
+test_run:cmd("switch replica3");
+ | ---
+ | - true
+ | ...
+fiber = require('fiber');
+ | ---
+ | ...
+while (#box.info.replication < 3) do
+    fiber.sleep(0.05)
+end;
+ | ---
+ | ...
+test_run:cmd("switch default");
+ | ---
+ | - true
+ | ...
+
+for i = 1,3 do
+    test_run:cmd("stop server replica"..i.." with cleanup=1")
+    test_run:cmd("delete server replica"..i)
+end;
+ | ---
+ | ...
+
+test_run:cmd("setopt delimiter ''");
+ | ---
+ | - true
+ | ...
diff --git a/test/replication/bootstrap_leader.test.lua b/test/replication/bootstrap_leader.test.lua
new file mode 100644
index 000000000..eb9aa293f
--- /dev/null
+++ b/test/replication/bootstrap_leader.test.lua
@@ -0,0 +1,27 @@
+test_run = require('test_run').new()
+
+create_server_cmd = "create server replica%d with script='replication/replica_uuid_rw%d.lua'"
+
+test_run:cmd("setopt delimiter ';'")
+
+for i = 1,3 do
+    test_run:cmd(string.format(create_server_cmd, i, i))
+end;
+
+test_run:cmd("start server replica1 with wait_load=False, wait=False");
+test_run:cmd("start server replica2 with args='1,2,3 1.0 100500 0.1', wait_load=False, wait=False");
+test_run:cmd("start server replica3 with args='1,2,3 0.1 0.5 100500', wait_load=False, wait=False");
+
+test_run:cmd("switch replica3");
+fiber = require('fiber');
+while (#box.info.replication < 3) do
+    fiber.sleep(0.05)
+end;
+test_run:cmd("switch default");
+
+for i = 1,3 do
+    test_run:cmd("stop server replica"..i.." with cleanup=1")
+    test_run:cmd("delete server replica"..i)
+end;
+
+test_run:cmd("setopt delimiter ''");
diff --git a/test/replication/replica_uuid_rw.lua b/test/replication/replica_uuid_rw.lua
new file mode 100644
index 000000000..60f2d6362
--- /dev/null
+++ b/test/replication/replica_uuid_rw.lua
@@ -0,0 +1,26 @@
+#!/usr/bin/env tarantool
+
+local INSTANCE_ID = string.match(arg[0], "%d")
+local SOCKET_DIR = require('fio').cwd()
+
+local function instance_uri(instance_id)
+    return SOCKET_DIR..'/replica_uuid_rw'..instance_id..'.sock'
+end
+
+local repl_tbl = {}
+for num in string.gmatch(arg[1] or "", "%d") do
+    table.insert(repl_tbl, instance_uri(num))
+end
+
+require('console').listen(os.getenv('ADMIN'))
+
+box.cfg({
+    instance_uuid       = "aaaaaaaa-aaaa-0000-0000-00000000000"..INSTANCE_ID,
+    listen              = instance_uri(INSTANCE_ID),
+    replication         = repl_tbl,
+    replication_timeout = arg[2] and tonumber(arg[2]) or 0.1,
+    replication_connect_timeout = arg[3] and tonumber(arg[3]) or 0.5,
+    replication_sync_timeout = arg[4] and tonumber(arg[4]) or 1.0,
+})
+
+box.once("bootstrap", function() box.schema.user.grant('guest', 'replication') end)
diff --git a/test/replication/replica_uuid_rw1.lua b/test/replication/replica_uuid_rw1.lua
new file mode 120000
index 000000000..328386f50
--- /dev/null
+++ b/test/replication/replica_uuid_rw1.lua
@@ -0,0 +1 @@
+replica_uuid_rw.lua
\ No newline at end of file
diff --git a/test/replication/replica_uuid_rw2.lua b/test/replication/replica_uuid_rw2.lua
new file mode 120000
index 000000000..328386f50
--- /dev/null
+++ b/test/replication/replica_uuid_rw2.lua
@@ -0,0 +1 @@
+replica_uuid_rw.lua
\ No newline at end of file
diff --git a/test/replication/replica_uuid_rw3.lua b/test/replication/replica_uuid_rw3.lua
new file mode 120000
index 000000000..328386f50
--- /dev/null
+++ b/test/replication/replica_uuid_rw3.lua
@@ -0,0 +1 @@
+replica_uuid_rw.lua
\ No newline at end of file
-- 
2.20.1 (Apple Git-117)





More information about the Tarantool-patches mailing list