[PATCH] replication: fix rebootstrap race that results in broken subscription

Vladimir Davydov vdavydov.dev at gmail.com
Thu Feb 15 15:30:07 MSK 2018


While a node of the cluster is re-bootstrapping (joining again),
other nodes may try to re-subscribe to it. They will fail, because
the rebootstrapped node hasn't tried to subscribe yet, hence hasn't
been added to the _cluster table, and so is not present in the hash
on the subscriber's side, where replica_on_applier_reconnect() looks
it up.

Fix this by making a subscriber create an id-less (REPLICA_ID_NIL)
struct replica in this case and reattach the applier to it. It will
be assigned an id when it finally subscribes and is registered in
_cluster.

Fixes: 71b3340537e9 ("replication: reconnect applier on master rebootstrap")
---
https://github.com/tarantool/tarantool/tree/gh-3151-replication-cfg-improvements

 src/box/box.cc                   |  9 ++---
 src/box/replication.cc           | 13 +++++---
 test/replication/quorum.result   | 71 +++++++++++++++++++++++++++++++++++++---
 test/replication/quorum.test.lua | 35 +++++++++++++++++---
 4 files changed, 111 insertions(+), 17 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index fa1eb051..33d237bc 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1152,7 +1152,7 @@ box_register_replica(uint32_t id, const struct tt_uuid *uuid)
 	if (boxk(IPROTO_INSERT, BOX_CLUSTER_ID, "[%u%s]",
 		 (unsigned) id, tt_uuid_str(uuid)) != 0)
 		diag_raise();
-	assert(replica_by_uuid(uuid) != NULL);
+	assert(replica_by_uuid(uuid)->id == id);
 }
 
 /**
@@ -1166,7 +1166,7 @@ static void
 box_on_join(const tt_uuid *instance_uuid)
 {
 	struct replica *replica = replica_by_uuid(instance_uuid);
-	if (replica != NULL)
+	if (replica != NULL && replica->id != REPLICA_ID_NIL)
 		return; /* nothing to do - already registered */
 
 	box_check_writable_xc();
@@ -1270,7 +1270,8 @@ box_process_join(struct ev_io *io, struct xrow_header *header)
 	 * is complete. Fail early if the caller does not have
 	 * appropriate access privileges.
 	 */
-	if (replica_by_uuid(&instance_uuid) == NULL) {
+	struct replica *replica = replica_by_uuid(&instance_uuid);
+	if (replica == NULL || replica->id == REPLICA_ID_NIL) {
 		box_check_writable_xc();
 		struct space *space = space_cache_find_xc(BOX_CLUSTER_ID);
 		access_check_space_xc(space, PRIV_W);
@@ -1323,7 +1324,7 @@ box_process_join(struct ev_io *io, struct xrow_header *header)
 	 */
 	box_on_join(&instance_uuid);
 
-	struct replica *replica = replica_by_uuid(&instance_uuid);
+	replica = replica_by_uuid(&instance_uuid);
 	assert(replica != NULL);
 	replica->gc = gc;
 	gc_guard.is_active = false;
diff --git a/src/box/replication.cc b/src/box/replication.cc
index fc8f900b..b1c84d36 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -284,10 +284,15 @@ replica_on_applier_reconnect(struct replica *replica)
 		 * the new UUID and reassign the applier to it.
 		 */
 		struct replica *orig = replica_by_uuid(&applier->uuid);
-		if (orig == NULL || orig->applier != NULL) {
-			tnt_raise(ClientError, ER_INSTANCE_UUID_MISMATCH,
-				  tt_uuid_str(&replica->uuid),
-				  tt_uuid_str(&applier->uuid));
+		if (orig == NULL) {
+			orig = replica_new();
+			orig->uuid = applier->uuid;
+			replica_hash_insert(&replicaset.hash, orig);
+		}
+
+		if (orig->applier != NULL) {
+			tnt_raise(ClientError, ER_CFG, "replication",
+				  "duplicate connection to the same replica");
 		}
 
 		replica_set_applier(orig, applier);
diff --git a/test/replication/quorum.result b/test/replication/quorum.result
index 5a294103..34408d42 100644
--- a/test/replication/quorum.result
+++ b/test/replication/quorum.result
@@ -133,7 +133,7 @@ test_run:cmd('switch quorum3')
 ---
 - true
 ...
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.01)
+box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.001)
 ---
 - ok
 ...
@@ -141,7 +141,7 @@ test_run:cmd('switch quorum2')
 ---
 - true
 ...
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.01)
+box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.001)
 ---
 - ok
 ...
@@ -149,7 +149,7 @@ test_run:cmd('stop server quorum1')
 ---
 - true
 ...
-for i = 1, 10 do box.space.test:insert{i} end
+for i = 1, 100 do box.space.test:insert{i} end
 ---
 ...
 fiber = require('fiber')
@@ -166,9 +166,70 @@ test_run:cmd('switch quorum1')
 ---
 - true
 ...
-box.space.test:count() -- 10
+box.space.test:count() -- 100
 ---
-- 10
+- 100
+...
+-- Rebootstrap one node of the cluster and check that others follow.
+-- Note, due to ERRINJ_RELAY_TIMEOUT there is a substantial delay
+-- between the moment the node starts listening and the moment it
+-- completes bootstrap and subscribes. Other nodes will try and
+-- fail to subscribe to the restarted node during this period.
+-- This is OK - they have to retry until the bootstrap is complete.
+test_run:cmd('switch quorum3')
+---
+- true
+...
+box.snapshot()
+---
+- ok
+...
+test_run:cmd('switch quorum2')
+---
+- true
+...
+box.snapshot()
+---
+- ok
+...
+test_run:cmd('switch quorum1')
+---
+- true
+...
+test_run:cmd('restart server quorum1 with cleanup=1')
+box.space.test:count() -- 100
+---
+- 100
+...
+-- The rebootstrapped replica will be assigned id = 4,
+-- because ids 1..3 are busy.
+test_run:cmd('switch quorum2')
+---
+- true
+...
+fiber = require('fiber')
+---
+...
+while box.info.replication[4].upstream.status ~= 'follow' do fiber.sleep(0.001) end
+---
+...
+box.info.replication[4].upstream.status
+---
+- follow
+...
+test_run:cmd('switch quorum3')
+---
+- true
+...
+fiber = require('fiber')
+---
+...
+while box.info.replication[4].upstream.status ~= 'follow' do fiber.sleep(0.001) end
+---
+...
+box.info.replication[4].upstream.status
+---
+- follow
 ...
 -- Cleanup.
 test_run:cmd('switch default')
diff --git a/test/replication/quorum.test.lua b/test/replication/quorum.test.lua
index 192f1369..85600684 100644
--- a/test/replication/quorum.test.lua
+++ b/test/replication/quorum.test.lua
@@ -54,18 +54,45 @@ box.info.id == 3 or box.info.replication[3].upstream.status == 'follow'
 -- Check that box.cfg() doesn't return until the instance
 -- catches up with all configured replicas.
 test_run:cmd('switch quorum3')
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.01)
+box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.001)
 test_run:cmd('switch quorum2')
-box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.01)
+box.error.injection.set("ERRINJ_RELAY_TIMEOUT", 0.001)
 test_run:cmd('stop server quorum1')
 
-for i = 1, 10 do box.space.test:insert{i} end
+for i = 1, 100 do box.space.test:insert{i} end
 fiber = require('fiber')
 fiber.sleep(0.1)
 
 test_run:cmd('start server quorum1')
 test_run:cmd('switch quorum1')
-box.space.test:count() -- 10
+box.space.test:count() -- 100
+
+-- Rebootstrap one node of the cluster and check that others follow.
+-- Note, due to ERRINJ_RELAY_TIMEOUT there is a substantial delay
+-- between the moment the node starts listening and the moment it
+-- completes bootstrap and subscribes. Other nodes will try and
+-- fail to subscribe to the restarted node during this period.
+-- This is OK - they have to retry until the bootstrap is complete.
+test_run:cmd('switch quorum3')
+box.snapshot()
+test_run:cmd('switch quorum2')
+box.snapshot()
+
+test_run:cmd('switch quorum1')
+test_run:cmd('restart server quorum1 with cleanup=1')
+
+box.space.test:count() -- 100
+
+-- The rebootstrapped replica will be assigned id = 4,
+-- because ids 1..3 are busy.
+test_run:cmd('switch quorum2')
+fiber = require('fiber')
+while box.info.replication[4].upstream.status ~= 'follow' do fiber.sleep(0.001) end
+box.info.replication[4].upstream.status
+test_run:cmd('switch quorum3')
+fiber = require('fiber')
+while box.info.replication[4].upstream.status ~= 'follow' do fiber.sleep(0.001) end
+box.info.replication[4].upstream.status
 
 -- Cleanup.
 test_run:cmd('switch default')
-- 
2.11.0




More information about the Tarantool-patches mailing list