[Tarantool-patches] [PATCH 1/1] replication: set replica ID before _cluster commit

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Sun Jul 25 19:53:02 MSK 2021


Replica registration works via looking for the smallest not
occupied ID in _cluster and inserting it into the space.

It works not so good when mvcc is enabled. In particular, if more
than 1 replica try to register at the same time, they might get
the same replica_id because don't see changes of each other until
the registration in _cluster is complete.

This in the end leads to all replicas failing the registration
except one with the 'duplicate key' error (primary index in
_cluster is replica ID).

The patch makes the replicas occupy their ID before they commit it
into _cluster. And new replica ID search now uses the replica ID
map instead of _cluster iterator.

This way the registration works like before - like MVCC does not
exist which is fine.

Part of #5430
---
Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5430-cluster-duplicate
Issue: https://github.com/tarantool/tarantool/issues/5430

 .../gh-5430-cluster-mvcc-duplicate.md         |   7 +
 src/box/alter.cc                              |  96 ++++++------
 src/box/box.cc                                |  19 +--
 src/box/replication.cc                        |  13 ++
 src/box/replication.h                         |   4 +
 test/replication/gh-5430-cluster-mvcc.result  | 146 ++++++++++++++++++
 .../replication/gh-5430-cluster-mvcc.test.lua |  62 ++++++++
 test/replication/gh-5430-mvcc-master.lua      |  11 ++
 test/replication/gh-5430-mvcc-replica1.lua    |  10 ++
 test/replication/gh-5430-mvcc-replica2.lua    |   1 +
 test/replication/suite.cfg                    |   1 +
 test/replication/suite.ini                    |   2 +-
 12 files changed, 306 insertions(+), 66 deletions(-)
 create mode 100644 changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md
 create mode 100644 test/replication/gh-5430-cluster-mvcc.result
 create mode 100644 test/replication/gh-5430-cluster-mvcc.test.lua
 create mode 100644 test/replication/gh-5430-mvcc-master.lua
 create mode 100644 test/replication/gh-5430-mvcc-replica1.lua
 create mode 120000 test/replication/gh-5430-mvcc-replica2.lua

diff --git a/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md b/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md
new file mode 100644
index 000000000..59b90f026
--- /dev/null
+++ b/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md
@@ -0,0 +1,7 @@
+## bugfix/replication
+
+* Fixed a rare error appearing when MVCC (`box.cfg.memtx_use_mvcc_engine`) was
+  enabled and more than one replica was joined to a cluster. The join could fail
+  with the error `"ER_TUPLE_FOUND: Duplicate key exists in unique index
+  'primary' in space '_cluster'"`. The same could happen at bootstrap of a
+  cluster having >= 3 nodes (gh-5430).
diff --git a/src/box/alter.cc b/src/box/alter.cc
index 89bb5946c..64ba09021 100644
--- a/src/box/alter.cc
+++ b/src/box/alter.cc
@@ -4178,47 +4178,11 @@ on_replace_dd_schema(struct trigger * /* trigger */, void *event)
 	return 0;
 }
 
-/**
- * A record with id of the new instance has been synced to the
- * write ahead log. Update the cluster configuration cache
- * with it.
- */
-static int
-register_replica(struct trigger *trigger, void * /* event */)
-{
-	struct tuple *new_tuple = (struct tuple *)trigger->data;
-	uint32_t id;
-	if (tuple_field_u32(new_tuple, BOX_CLUSTER_FIELD_ID, &id) != 0)
-		return -1;
-	tt_uuid uuid;
-	if (tuple_field_uuid(new_tuple, BOX_CLUSTER_FIELD_UUID, &uuid) != 0)
-		return -1;
-	struct replica *replica = replica_by_uuid(&uuid);
-	if (replica != NULL) {
-		replica_set_id(replica, id);
-	} else {
-		try {
-			replica = replicaset_add(id, &uuid);
-			/* Can't throw exceptions from on_commit trigger */
-		} catch(Exception *e) {
-			panic("Can't register replica: %s", e->errmsg);
-		}
-	}
-	return 0;
-}
-
+/** Unregister the replica affected by the change. */
 static int
-unregister_replica(struct trigger *trigger, void * /* event */)
+on_replace_cluster_clear_id(struct trigger *trigger, void * /* event */)
 {
-	struct tuple *old_tuple = (struct tuple *)trigger->data;
-
-	struct tt_uuid old_uuid;
-	if (tuple_field_uuid(old_tuple, BOX_CLUSTER_FIELD_UUID, &old_uuid) != 0)
-		return -1;
-
-	struct replica *replica = replica_by_uuid(&old_uuid);
-	assert(replica != NULL);
-	replica_clear_id(replica);
+	replica_clear_id((struct replica *)trigger->data);
 	return 0;
 }
 
@@ -4280,14 +4244,34 @@ on_replace_dd_cluster(struct trigger *trigger, void *event)
 					  "updates of instance uuid");
 				return -1;
 			}
-		} else {
-			struct trigger *on_commit;
-			on_commit = txn_alter_trigger_new(register_replica,
-							  new_tuple);
-			if (on_commit == NULL)
-				return -1;
-			txn_stmt_on_commit(stmt, on_commit);
+			return 0;
+		}
+		/*
+		 * With read-views enabled there might be already a replica
+		 * whose registration is in progress in another transaction.
+		 * With the same replica ID.
+		 */
+		if (replica_by_id(replica_id) != NULL) {
+			diag_set(ClientError, ER_UNSUPPORTED, "Tarantool",
+				 "more than 1 replica with the same ID");
+			return -1;
 		}
====================

I couldn't test this check because of the bug in mvcc:
https://github.com/tarantool/tarantool/issues/6246

====================
+		struct trigger *on_rollback = txn_alter_trigger_new(
+			on_replace_cluster_clear_id, NULL);
+		if (on_rollback == NULL)
+			return -1;
+		/*
+		 * Register the replica before commit so as to occupy the
+		 * replica ID now. While WAL write is in progress, new replicas
+		 * might come, they should see the ID is already in use.
+		 */
+		struct replica *replica = replica_by_uuid(&replica_uuid);
+		if (replica != NULL)
+			replica_set_id(replica, replica_id);
+		else
+			replica = replicaset_add(replica_id, &replica_uuid);
+		on_rollback->data = replica;
+		txn_stmt_on_rollback(stmt, on_rollback);
 	} else {
 		/*
 		 * Don't allow deletion of the record for this instance
@@ -4300,9 +4284,23 @@ on_replace_dd_cluster(struct trigger *trigger, void *event)
 		if (replica_check_id(replica_id) != 0)
 			return -1;
 
-		struct trigger *on_commit;
-		on_commit = txn_alter_trigger_new(unregister_replica,
-						  old_tuple);
+		struct replica *replica = replica_by_id(replica_id);
+		if (replica == NULL) {
+			/*
+			 * Impossible, but it is important not to leave
+			 * undefined behaviour if there is a bug. Too sensitive
+			 * subsystem is affected.
+			 */
+			panic("Tried to unregister a replica not stored in "
+			      "replica_by_id map, id is %u", replica_id);
+		}
+		/*
+		 * Unregister only after commit. Otherwise if the transaction
+		 * would be rolled back, there might be already another replica
+		 * taken the freed ID.
+		 */
+		struct trigger *on_commit = txn_alter_trigger_new(
+			on_replace_cluster_clear_id, replica);
 		if (on_commit == NULL)
 			return -1;
 		txn_stmt_on_commit(stmt, on_commit);
diff --git a/src/box/box.cc b/src/box/box.cc
index 8c10a99dd..5c10aceff 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -2407,22 +2407,9 @@ box_on_join(const tt_uuid *instance_uuid)
 		return; /* nothing to do - already registered */
 
 	box_check_writable_xc();
-
-	/** Find the largest existing replica id. */
-	struct space *space = space_cache_find_xc(BOX_CLUSTER_ID);
-	struct index *index = index_find_system_xc(space, 0);
-	struct iterator *it = index_create_iterator_xc(index, ITER_ALL,
-						       NULL, 0);
-	IteratorGuard iter_guard(it);
-	struct tuple *tuple;
-	/** Assign a new replica id. */
-	uint32_t replica_id = 1;
-	while ((tuple = iterator_next_xc(it)) != NULL) {
-		if (tuple_field_u32_xc(tuple,
-				       BOX_CLUSTER_FIELD_ID) != replica_id)
-			break;
-		replica_id++;
-	}
+	uint32_t replica_id;
+	if (replica_find_new_id(&replica_id) != 0)
+		diag_raise();
 	box_register_replica(replica_id, instance_uuid);
 }
 
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 45ad03dfd..1288bc9b1 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -1032,3 +1032,16 @@ replica_by_id(uint32_t replica_id)
 {
 	return replicaset.replica_by_id[replica_id];
 }
+
+int
+replica_find_new_id(uint32_t *replica_id)
+{
+	for (uint32_t i = 1; i < VCLOCK_MAX; ++i) {
+		if (replicaset.replica_by_id[i] == NULL) {
+			*replica_id = i;
+			return 0;
+		}
+	}
+	diag_set(ClientError, ER_REPLICA_MAX, VCLOCK_MAX);
+	return -1;
+}
diff --git a/src/box/replication.h b/src/box/replication.h
index 57e0f10ae..5d1fa1255 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -360,6 +360,10 @@ replica_by_uuid(const struct tt_uuid *uuid);
 struct replica *
 replica_by_id(uint32_t replica_id);
 
+/** Find the smallest free replica ID in the available range. */
+int
+replica_find_new_id(uint32_t *replica_id);
+
 /**
  * Find a node in the replicaset on which the instance can try to register to
  * join the replicaset.
diff --git a/test/replication/gh-5430-cluster-mvcc.result b/test/replication/gh-5430-cluster-mvcc.result
new file mode 100644
index 000000000..831689fc2
--- /dev/null
+++ b/test/replication/gh-5430-cluster-mvcc.result
@@ -0,0 +1,146 @@
+-- test-run result file version 2
+--
+-- gh-5430: when MVCC was enabled for memtx, new replica registration attempt
+-- could fail with 'duplicate error' in _cluster space. This was happening,
+-- because _cluster is memtx. Changes to it were not visible for newer requests
+-- until commit.
+-- New replica ID was looked up in the space by its full scan. The scan used a
+-- plain iterator and didn't see replicas, whose registration was in progress of
+-- being written to WAL.
+-- As a result, if 2 replicas came to register at the same time, they got the
+-- same replica ID because didn't see each other in _cluster. One of them would
+-- fail to register in the end due to the conflict.
+--
+-- The test reproduces it by doing anon replica registration. Because during
+-- normal join there are more than one access to WAL, the ID assignment is the
+-- last one. It makes hard to block the ID assignment only. With anon replica
+-- ID assignment the join is already done, the only WAL write is the ID
+-- assignment, easy to block and yet not block new replicas registration
+-- attempts.
+--
+test_run = require('test_run').new()
+ | ---
+ | ...
+
+test_run:cmd('create server master with '..                                     \
+             'script="replication/gh-5430-mvcc-master.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server master')
+ | ---
+ | - true
+ | ...
+
+test_run:cmd('create server replica1 with '..                                   \
+             'script="replication/gh-5430-mvcc-replica1.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica1')
+ | ---
+ | - true
+ | ...
+
+test_run:cmd('create server replica2 with '..                                   \
+             'script="replication/gh-5430-mvcc-replica2.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica2')
+ | ---
+ | - true
+ | ...
+
+test_run:switch('master')
+ | ---
+ | - true
+ | ...
+box.error.injection.set('ERRINJ_WAL_DELAY', true)
+ | ---
+ | - ok
+ | ...
+
+test_run:switch('replica1')
+ | ---
+ | - true
+ | ...
+_ = require('fiber').create(function() box.cfg{replication_anon = false} end)
+ | ---
+ | ...
+str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-')
+ | ---
+ | ...
+_ = test_run:wait_log('master', str)
+ | ---
+ | ...
+
+test_run:switch('replica2')
+ | ---
+ | - true
+ | ...
+_ = require('fiber').create(function() box.cfg{replication_anon = false} end)
+ | ---
+ | ...
+str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-')
+ | ---
+ | ...
+_ = test_run:wait_log('master', str)
+ | ---
+ | ...
+
+test_run:switch('master')
+ | ---
+ | - true
+ | ...
+box.error.injection.set('ERRINJ_WAL_DELAY', false)
+ | ---
+ | - ok
+ | ...
+
+test_run:switch('replica1')
+ | ---
+ | - true
+ | ...
+test_run:wait_cond(function() return box.info.id > 1 end)
+ | ---
+ | - true
+ | ...
+
+test_run:switch('replica2')
+ | ---
+ | - true
+ | ...
+test_run:wait_cond(function() return box.info.id > 1 end)
+ | ---
+ | - true
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica2')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica2')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica1')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica1')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server master')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server master')
+ | ---
+ | - true
+ | ...
diff --git a/test/replication/gh-5430-cluster-mvcc.test.lua b/test/replication/gh-5430-cluster-mvcc.test.lua
new file mode 100644
index 000000000..bb34ba540
--- /dev/null
+++ b/test/replication/gh-5430-cluster-mvcc.test.lua
@@ -0,0 +1,62 @@
+--
+-- gh-5430: when MVCC was enabled for memtx, new replica registration attempt
+-- could fail with 'duplicate error' in _cluster space. This was happening,
+-- because _cluster is memtx. Changes to it were not visible for newer requests
+-- until commit.
+-- New replica ID was looked up in the space by its full scan. The scan used a
+-- plain iterator and didn't see replicas, whose registration was in progress of
+-- being written to WAL.
+-- As a result, if 2 replicas came to register at the same time, they got the
+-- same replica ID because didn't see each other in _cluster. One of them would
+-- fail to register in the end due to the conflict.
+--
+-- The test reproduces it by doing anon replica registration. Because during
+-- normal join there are more than one access to WAL, the ID assignment is the
+-- last one. It makes hard to block the ID assignment only. With anon replica
+-- ID assignment the join is already done, the only WAL write is the ID
+-- assignment, easy to block and yet not block new replicas registration
+-- attempts.
+--
+test_run = require('test_run').new()
+
+test_run:cmd('create server master with '..                                     \
+             'script="replication/gh-5430-mvcc-master.lua"')
+test_run:cmd('start server master')
+
+test_run:cmd('create server replica1 with '..                                   \
+             'script="replication/gh-5430-mvcc-replica1.lua"')
+test_run:cmd('start server replica1')
+
+test_run:cmd('create server replica2 with '..                                   \
+             'script="replication/gh-5430-mvcc-replica2.lua"')
+test_run:cmd('start server replica2')
+
+test_run:switch('master')
+box.error.injection.set('ERRINJ_WAL_DELAY', true)
+
+test_run:switch('replica1')
+_ = require('fiber').create(function() box.cfg{replication_anon = false} end)
+str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-')
+_ = test_run:wait_log('master', str)
+
+test_run:switch('replica2')
+_ = require('fiber').create(function() box.cfg{replication_anon = false} end)
+str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-')
+_ = test_run:wait_log('master', str)
+
+test_run:switch('master')
+box.error.injection.set('ERRINJ_WAL_DELAY', false)
+
+test_run:switch('replica1')
+test_run:wait_cond(function() return box.info.id > 1 end)
+
+test_run:switch('replica2')
+test_run:wait_cond(function() return box.info.id > 1 end)
+
+test_run:switch('default')
+test_run:cmd('stop server replica2')
+test_run:cmd('delete server replica2')
+test_run:cmd('stop server replica1')
+test_run:cmd('delete server replica1')
+test_run:cmd('stop server master')
+test_run:cmd('delete server master')
diff --git a/test/replication/gh-5430-mvcc-master.lua b/test/replication/gh-5430-mvcc-master.lua
new file mode 100644
index 000000000..6c7d0de3d
--- /dev/null
+++ b/test/replication/gh-5430-mvcc-master.lua
@@ -0,0 +1,11 @@
+#!/usr/bin/env tarantool
+
+require('console').listen(os.getenv('ADMIN'))
+
+box.cfg({
+    listen = 'unix/:./master.sock',
+    replication_timeout = 0.1,
+    memtx_use_mvcc_engine = true,
+})
+
+box.schema.user.grant('guest', 'super')
diff --git a/test/replication/gh-5430-mvcc-replica1.lua b/test/replication/gh-5430-mvcc-replica1.lua
new file mode 100644
index 000000000..f1ad7b8a4
--- /dev/null
+++ b/test/replication/gh-5430-mvcc-replica1.lua
@@ -0,0 +1,10 @@
+#!/usr/bin/env tarantool
+require('console').listen(os.getenv('ADMIN'))
+
+box.cfg({
+    listen = os.getenv("LISTEN"),
+    replication = 'unix/:./master.sock',
+    replication_timeout = 0.1,
+    read_only = true,
+    replication_anon = true,
+})
diff --git a/test/replication/gh-5430-mvcc-replica2.lua b/test/replication/gh-5430-mvcc-replica2.lua
new file mode 120000
index 000000000..28c968a8a
--- /dev/null
+++ b/test/replication/gh-5430-mvcc-replica2.lua
@@ -0,0 +1 @@
+gh-5430-mvcc-replica1.lua
\ No newline at end of file
diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
index e0bbe2676..0acc66816 100644
--- a/test/replication/suite.cfg
+++ b/test/replication/suite.cfg
@@ -18,6 +18,7 @@
     "gh-5213-qsync-applier-order.test.lua": {},
     "gh-5213-qsync-applier-order-3.test.lua": {},
     "gh-5426-election-on-off.test.lua": {},
+    "gh-5430-cluster-mvcc.test.lua": {},
     "gh-5433-election-restart-recovery.test.lua": {},
     "gh-5445-leader-inconsistency.test.lua": {},
     "gh-5506-election-on-off.test.lua": {},
diff --git a/test/replication/suite.ini b/test/replication/suite.ini
index 18981996d..907fd0be9 100644
--- a/test/replication/suite.ini
+++ b/test/replication/suite.ini
@@ -3,7 +3,7 @@ core = tarantool
 script =  master.lua
 description = tarantool/box, replication
 disabled = consistent.test.lua
-release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-qsync-rollback-snap.test.lua gh-5506-election-on-off.test.lua gh-5536-wal-limit.test.lua hang_on_synchro_fail.test.lua anon_register_gap.test.lua gh-5213-qsync-applier-order.test.lua gh-5213-qsync-applier-order-3.test.lua gh-6027-applier-error-show.test.lua gh-6032-promote-wal-write.test.lua gh-6057-qsync-confirm-async-no-wal.test.lua gh-5447-downstream-lag.test.lua gh-4040-invalid-msgpack.test.lua
+release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-qsync-rollback-snap.test.lua gh-5430-cluster-mvcc.test.lua gh-5506-election-on-off.test.lua gh-5536-wal-limit.test.lua hang_on_synchro_fail.test.lua anon_register_gap.test.lua gh-5213-qsync-applier-order.test.lua gh-5213-qsync-applier-order-3.test.lua gh-6027-applier-error-show.test.lua gh-6032-promote-wal-write.test.lua gh-6057-qsync-confirm-async-no-wal.test.lua gh-5447-downstream-lag.test.lua gh-4040-invalid-msgpack.test.lua
 config = suite.cfg
 lua_libs = lua/fast_replica.lua lua/rlimit.lua
 use_unix_sockets = True
-- 
2.24.3 (Apple Git-128)



More information about the Tarantool-patches mailing list