[Tarantool-patches] [PATCH 1/1] replication: set replica ID before _cluster commit

Sun Jul 25 21:31:51 MSK 2021

25.07.2021 19:53, Vladislav Shpilevoy пишет:
> Replica registration works by looking for the smallest unoccupied
> ID in _cluster and inserting it into the space.
>
> It does not work well when MVCC is enabled. In particular, if more
> than one replica tries to register at the same time, they might get
> the same replica_id because they don't see each other's changes
> until the registration in _cluster is complete.
>
> In the end, this leads to all replicas but one failing the
> registration with the 'duplicate key' error (the primary index in
> _cluster is the replica ID).
>
> The patch makes the replicas occupy their ID before they commit it
> into _cluster. The search for a new replica ID now uses the replica
> ID map instead of a _cluster iterator.
>
> This way the registration works like before - as if MVCC did not
> exist, which is fine.
>
> Part of #5430
> ---

Hi! Thanks for the patch!

Please, find a couple of comments below.

> Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5430-cluster-duplicate
> Issue: https://github.com/tarantool/tarantool/issues/5430
>
>   .../gh-5430-cluster-mvcc-duplicate.md         |   7 +
>   src/box/alter.cc                              |  96 ++++++------
>   src/box/box.cc                                |  19 +--
>   src/box/replication.cc                        |  13 ++
>   src/box/replication.h                         |   4 +
>   test/replication/gh-5430-cluster-mvcc.result  | 146 ++++++++++++++++++
>   .../replication/gh-5430-cluster-mvcc.test.lua |  62 ++++++++
>   test/replication/gh-5430-mvcc-master.lua      |  11 ++
>   test/replication/gh-5430-mvcc-replica1.lua    |  10 ++
>   test/replication/gh-5430-mvcc-replica2.lua    |   1 +
>   test/replication/suite.cfg                    |   1 +
>   test/replication/suite.ini                    |   2 +-
>   12 files changed, 306 insertions(+), 66 deletions(-)
>   create mode 100644 changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md
>   create mode 100644 test/replication/gh-5430-cluster-mvcc.result
>   create mode 100644 test/replication/gh-5430-cluster-mvcc.test.lua
>   create mode 100644 test/replication/gh-5430-mvcc-master.lua
>   create mode 100644 test/replication/gh-5430-mvcc-replica1.lua
>   create mode 120000 test/replication/gh-5430-mvcc-replica2.lua
>
> diff --git a/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md b/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md
> new file mode 100644
> index 000000000..59b90f026
> --- /dev/null
> +++ b/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md
> @@ -0,0 +1,7 @@
> +## bugfix/replication
> +
> +* Fixed a rare error appearing when MVCC (`box.cfg.memtx_use_mvcc_engine`) was
> +  enabled and more than one replica was joined to a cluster. The join could fail
> +  with the error `"ER_TUPLE_FOUND: Duplicate key exists in unique index
> +  'primary' in space '_cluster'"`. The same could happen at bootstrap of a
> +  cluster having >= 3 nodes (gh-5430).
> diff --git a/src/box/alter.cc b/src/box/alter.cc
> index 89bb5946c..64ba09021 100644
> --- a/src/box/alter.cc
> +++ b/src/box/alter.cc
> @@ -4178,47 +4178,11 @@ on_replace_dd_schema(struct trigger * /* trigger */, void *event)
>   	return 0;
>   }
>   
> -/**
> - * A record with id of the new instance has been synced to the
> - * write ahead log. Update the cluster configuration cache
> - * with it.
> - */
> -static int
> -register_replica(struct trigger *trigger, void * /* event */)
> -{
> -	struct tuple *new_tuple = (struct tuple *)trigger->data;
> -	uint32_t id;
> -	if (tuple_field_u32(new_tuple, BOX_CLUSTER_FIELD_ID, &id) != 0)
> -		return -1;
> -	tt_uuid uuid;
> -	if (tuple_field_uuid(new_tuple, BOX_CLUSTER_FIELD_UUID, &uuid) != 0)
> -		return -1;
> -	struct replica *replica = replica_by_uuid(&uuid);
> -	if (replica != NULL) {
> -		replica_set_id(replica, id);
> -	} else {
> -		try {
> -			replica = replicaset_add(id, &uuid);
> -			/* Can't throw exceptions from on_commit trigger */
> -		} catch(Exception *e) {
> -			panic("Can't register replica: %s", e->errmsg);
> -		}
> -	}
> -	return 0;
> -}
> -
> +/** Unregister the replica affected by the change. */
>   static int
> -unregister_replica(struct trigger *trigger, void * /* event */)
> +on_replace_cluster_clear_id(struct trigger *trigger, void * /* event */)
>   {
> -	struct tuple *old_tuple = (struct tuple *)trigger->data;
> -
> -	struct tt_uuid old_uuid;
> -	if (tuple_field_uuid(old_tuple, BOX_CLUSTER_FIELD_UUID, &old_uuid) != 0)
> -		return -1;
> -
> -	struct replica *replica = replica_by_uuid(&old_uuid);
> -	assert(replica != NULL);
> -	replica_clear_id(replica);
> +	replica_clear_id((struct replica *)trigger->data);
>   	return 0;
>   }
>   
> @@ -4280,14 +4244,34 @@ on_replace_dd_cluster(struct trigger *trigger, void *event)
>   					  "updates of instance uuid");
>   				return -1;
>   			}
> -		} else {
> -			struct trigger *on_commit;
> -			on_commit = txn_alter_trigger_new(register_replica,
> -							  new_tuple);
> -			if (on_commit == NULL)
> -				return -1;
> -			txn_stmt_on_commit(stmt, on_commit);
> +			return 0;
> +		}
> +		/*
> +		 * With read-views enabled there might be already a replica
> +		 * whose registration is in progress in another transaction.
> +		 * With the same replica ID.
> +		 */
> +		if (replica_by_id(replica_id) != NULL) {
> +			diag_set(ClientError, ER_UNSUPPORTED, "Tarantool",
> +				 "more than 1 replica with the same ID");
> +			return -1;
>   		}
> ====================
>
> I couldn't test this check because of the bug in mvcc:
> https://github.com/tarantool/tarantool/issues/6246
>
> ====================

I don't understand how this could happen
(I mean the if branch being taken).

Would you mind explaining?

> +		struct trigger *on_rollback = txn_alter_trigger_new(
> +			on_replace_cluster_clear_id, NULL);
> +		if (on_rollback == NULL)
> +			return -1;
> +		/*
> +		 * Register the replica before commit so as to occupy the
> +		 * replica ID now. While WAL write is in progress, new replicas
> +		 * might come, they should see the ID is already in use.
> +		 */
> +		struct replica *replica = replica_by_uuid(&replica_uuid);
> +		if (replica != NULL)
> +			replica_set_id(replica, replica_id);
> +		else
> +			replica = replicaset_add(replica_id, &replica_uuid);
> +		on_rollback->data = replica;
> +		txn_stmt_on_rollback(stmt, on_rollback);
>   	} else {
>   		/*
>   		 * Don't allow deletion of the record for this instance
> @@ -4300,9 +4284,23 @@ on_replace_dd_cluster(struct trigger *trigger, void *event)
>   		if (replica_check_id(replica_id) != 0)
>   			return -1;
>   
> -		struct trigger *on_commit;
> -		on_commit = txn_alter_trigger_new(unregister_replica,
> -						  old_tuple);
> +		struct replica *replica = replica_by_id(replica_id);
> +		if (replica == NULL) {
> +			/*
> +			 * Impossible, but it is important not to leave
> +			 * undefined behaviour if there is a bug. Too sensitive
> +			 * subsystem is affected.
> +			 */
> +			panic("Tried to unregister a replica not stored in "
> +			      "replica_by_id map, id is %u", replica_id);
> +		}
> +		/*
> +		 * Unregister only after commit. Otherwise if the transaction
> +		 * would be rolled back, there might be already another replica
> +		 * taken the freed ID.
> +		 */
> +		struct trigger *on_commit = txn_alter_trigger_new(
> +			on_replace_cluster_clear_id, replica);
>   		if (on_commit == NULL)
>   			return -1;
>   		txn_stmt_on_commit(stmt, on_commit);
> diff --git a/src/box/box.cc b/src/box/box.cc
> index 8c10a99dd..5c10aceff 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -2407,22 +2407,9 @@ box_on_join(const tt_uuid *instance_uuid)
>   		return; /* nothing to do - already registered */
>   
>   	box_check_writable_xc();
> -
> -	/** Find the largest existing replica id. */
> -	struct space *space = space_cache_find_xc(BOX_CLUSTER_ID);
> -	struct index *index = index_find_system_xc(space, 0);
> -	struct iterator *it = index_create_iterator_xc(index, ITER_ALL,
> -						       NULL, 0);
> -	IteratorGuard iter_guard(it);
> -	struct tuple *tuple;
> -	/** Assign a new replica id. */
> -	uint32_t replica_id = 1;
> -	while ((tuple = iterator_next_xc(it)) != NULL) {
> -		if (tuple_field_u32_xc(tuple,
> -				       BOX_CLUSTER_FIELD_ID) != replica_id)
> -			break;
> -		replica_id++;
> -	}
> +	uint32_t replica_id;
> +	if (replica_find_new_id(&replica_id) != 0)
> +		diag_raise();
>   	box_register_replica(replica_id, instance_uuid);
>   }
>   
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index 45ad03dfd..1288bc9b1 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -1032,3 +1032,16 @@ replica_by_id(uint32_t replica_id)
>   {
>   	return replicaset.replica_by_id[replica_id];
>   }
> +
> +int
> +replica_find_new_id(uint32_t *replica_id)
> +{
> +	for (uint32_t i = 1; i < VCLOCK_MAX; ++i) {
> +		if (replicaset.replica_by_id[i] == NULL) {
> +			*replica_id = i;
> +			return 0;
> +		}
> +	}
> +	diag_set(ClientError, ER_REPLICA_MAX, VCLOCK_MAX);
> +	return -1;
> +}
> diff --git a/src/box/replication.h b/src/box/replication.h
> index 57e0f10ae..5d1fa1255 100644
> --- a/src/box/replication.h
> +++ b/src/box/replication.h
> @@ -360,6 +360,10 @@ replica_by_uuid(const struct tt_uuid *uuid);
>   struct replica *
>   replica_by_id(uint32_t replica_id);
>   
> +/** Find the smallest free replica ID in the available range. */

nit: free -> empty / unoccupied

> +int
> +replica_find_new_id(uint32_t *replica_id);
> +
>   /**
>    * Find a node in the replicaset on which the instance can try to register to
>    * join the replicaset.