[Tarantool-patches] [PATCH 1/1] replication: set replica ID before _cluster commit
Sergey Ostanevich
sergos at tarantool.org
Tue Jul 27 18:10:54 MSK 2021
Hi!
Thanks for the patch!
Please, see my 3 comments below.
> On 25 Jul 2021, at 19:53, Vladislav Shpilevoy via Tarantool-patches <tarantool-patches at dev.tarantool.org> wrote:
>
> Replica registration works via looking for the smallest not
> occupied ID in _cluster and inserting it into the space.
>
> It works not so good when mvcc is enabled. In particular, if more
> than 1 replica try to register at the same time, they might get
> the same replica_id because don't see changes of each other until
> the registration in _cluster is complete.
>
> This in the end leads to all replicas failing the registration
> except one with the 'duplicate key' error (primary index in
> _cluster is replica ID).
>
> The patch makes the replicas occupy their ID before they commit it
> into _cluster. And new replica ID search now uses the replica ID
> map instead of _cluster iterator.
>
> This way the registration works like before - like MVCC does not
> exist which is fine.
Did you discuss addressing this inside MVCC itself — for example, by
setting a dedicated flag on this particular space (and perhaps on some
other internal spaces) to suppress MVCC ‘coverage’ for them? That
solution would be more general, and presumably involve less hassle with
local structures, triggers, and so on.
>
> Part of #5430
> ---
> Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5430-cluster-duplicate
> Issue: https://github.com/tarantool/tarantool/issues/5430
>
> .../gh-5430-cluster-mvcc-duplicate.md | 7 +
> src/box/alter.cc | 96 ++++++------
> src/box/box.cc | 19 +--
> src/box/replication.cc | 13 ++
> src/box/replication.h | 4 +
> test/replication/gh-5430-cluster-mvcc.result | 146 ++++++++++++++++++
> .../replication/gh-5430-cluster-mvcc.test.lua | 62 ++++++++
> test/replication/gh-5430-mvcc-master.lua | 11 ++
> test/replication/gh-5430-mvcc-replica1.lua | 10 ++
> test/replication/gh-5430-mvcc-replica2.lua | 1 +
> test/replication/suite.cfg | 1 +
> test/replication/suite.ini | 2 +-
> 12 files changed, 306 insertions(+), 66 deletions(-)
> create mode 100644 changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md
> create mode 100644 test/replication/gh-5430-cluster-mvcc.result
> create mode 100644 test/replication/gh-5430-cluster-mvcc.test.lua
> create mode 100644 test/replication/gh-5430-mvcc-master.lua
> create mode 100644 test/replication/gh-5430-mvcc-replica1.lua
> create mode 120000 test/replication/gh-5430-mvcc-replica2.lua
>
> diff --git a/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md b/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md
> new file mode 100644
> index 000000000..59b90f026
> --- /dev/null
> +++ b/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md
> @@ -0,0 +1,7 @@
> +## bugfix/replication
> +
> +* Fixed a rare error appearing when MVCC (`box.cfg.memtx_use_mvcc_engine`) was
> + enabled and more than one replica was joined to a cluster. The join could fail
> + with the error `"ER_TUPLE_FOUND: Duplicate key exists in unique index
> + 'primary' in space '_cluster'"`. The same could happen at bootstrap of a
> + cluster having >= 3 nodes (gh-5430).
> diff --git a/src/box/alter.cc b/src/box/alter.cc
> index 89bb5946c..64ba09021 100644
> --- a/src/box/alter.cc
> +++ b/src/box/alter.cc
> @@ -4178,47 +4178,11 @@ on_replace_dd_schema(struct trigger * /* trigger */, void *event)
> return 0;
> }
>
> -/**
> - * A record with id of the new instance has been synced to the
> - * write ahead log. Update the cluster configuration cache
> - * with it.
> - */
> -static int
> -register_replica(struct trigger *trigger, void * /* event */)
> -{
> - struct tuple *new_tuple = (struct tuple *)trigger->data;
> - uint32_t id;
> - if (tuple_field_u32(new_tuple, BOX_CLUSTER_FIELD_ID, &id) != 0)
> - return -1;
> - tt_uuid uuid;
> - if (tuple_field_uuid(new_tuple, BOX_CLUSTER_FIELD_UUID, &uuid) != 0)
> - return -1;
> - struct replica *replica = replica_by_uuid(&uuid);
> - if (replica != NULL) {
> - replica_set_id(replica, id);
> - } else {
> - try {
> - replica = replicaset_add(id, &uuid);
> - /* Can't throw exceptions from on_commit trigger */
> - } catch(Exception *e) {
> - panic("Can't register replica: %s", e->errmsg);
> - }
> - }
> - return 0;
> -}
> -
> +/** Unregister the replica affected by the change. */
> static int
> -unregister_replica(struct trigger *trigger, void * /* event */)
> +on_replace_cluster_clear_id(struct trigger *trigger, void * /* event */)
> {
> - struct tuple *old_tuple = (struct tuple *)trigger->data;
> -
> - struct tt_uuid old_uuid;
> - if (tuple_field_uuid(old_tuple, BOX_CLUSTER_FIELD_UUID, &old_uuid) != 0)
> - return -1;
> -
> - struct replica *replica = replica_by_uuid(&old_uuid);
> - assert(replica != NULL);
> - replica_clear_id(replica);
> + replica_clear_id((struct replica *)trigger->data);
> return 0;
> }
>
> @@ -4280,14 +4244,34 @@ on_replace_dd_cluster(struct trigger *trigger, void *event)
> "updates of instance uuid");
> return -1;
> }
> - } else {
> - struct trigger *on_commit;
> - on_commit = txn_alter_trigger_new(register_replica,
> - new_tuple);
> - if (on_commit == NULL)
> - return -1;
> - txn_stmt_on_commit(stmt, on_commit);
> + return 0;
> + }
> + /*
> + * With read-views enabled there might be already a replica
> + * whose registration is in progress in another transaction.
> + * With the same replica ID.
> + */
> + if (replica_by_id(replica_id) != NULL) {
> + diag_set(ClientError, ER_UNSUPPORTED, "Tarantool",
> + "more than 1 replica with the same ID");
There should be more details here about the replica ID/UUID/name, the
same as in the unregister panic message below.
> + return -1;
> }
> ====================
>
> I couldn't test this check because of the bug in mvcc:
> https://github.com/tarantool/tarantool/issues/6246
>
> ====================
> + struct trigger *on_rollback = txn_alter_trigger_new(
> + on_replace_cluster_clear_id, NULL);
> + if (on_rollback == NULL)
> + return -1;
> + /*
> + * Register the replica before commit so as to occupy the
> + * replica ID now. While WAL write is in progress, new replicas
> + * might come, they should see the ID is already in use.
> + */
> + struct replica *replica = replica_by_uuid(&replica_uuid);
> + if (replica != NULL)
> + replica_set_id(replica, replica_id);
> + else
> + replica = replicaset_add(replica_id, &replica_uuid);
> + on_rollback->data = replica;
> + txn_stmt_on_rollback(stmt, on_rollback);
> } else {
> /*
> * Don't allow deletion of the record for this instance
> @@ -4300,9 +4284,23 @@ on_replace_dd_cluster(struct trigger *trigger, void *event)
> if (replica_check_id(replica_id) != 0)
> return -1;
>
> - struct trigger *on_commit;
> - on_commit = txn_alter_trigger_new(unregister_replica,
> - old_tuple);
> + struct replica *replica = replica_by_id(replica_id);
> + if (replica == NULL) {
> + /*
> + * Impossible, but it is important not to leave
> + * undefined behaviour if there is a bug. Too sensitive
> + * subsystem is affected.
> + */
> + panic("Tried to unregister a replica not stored in "
> + "replica_by_id map, id is %u", replica_id);
> + }
> + /*
> + * Unregister only after commit. Otherwise if the transaction
> + * would be rolled back, there might be already another replica
> + * taken the freed ID.
> + */
> + struct trigger *on_commit = txn_alter_trigger_new(
> + on_replace_cluster_clear_id, replica);
> if (on_commit == NULL)
> return -1;
> txn_stmt_on_commit(stmt, on_commit);
> diff --git a/src/box/box.cc b/src/box/box.cc
> index 8c10a99dd..5c10aceff 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -2407,22 +2407,9 @@ box_on_join(const tt_uuid *instance_uuid)
> return; /* nothing to do - already registered */
>
> box_check_writable_xc();
> -
> - /** Find the largest existing replica id. */
> - struct space *space = space_cache_find_xc(BOX_CLUSTER_ID);
> - struct index *index = index_find_system_xc(space, 0);
> - struct iterator *it = index_create_iterator_xc(index, ITER_ALL,
> - NULL, 0);
> - IteratorGuard iter_guard(it);
> - struct tuple *tuple;
> - /** Assign a new replica id. */
> - uint32_t replica_id = 1;
> - while ((tuple = iterator_next_xc(it)) != NULL) {
> - if (tuple_field_u32_xc(tuple,
> - BOX_CLUSTER_FIELD_ID) != replica_id)
> - break;
> - replica_id++;
> - }
> + uint32_t replica_id;
> + if (replica_find_new_id(&replica_id) != 0)
> + diag_raise();
Is there any info on why the registration fails?
> box_register_replica(replica_id, instance_uuid);
> }
>
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index 45ad03dfd..1288bc9b1 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -1032,3 +1032,16 @@ replica_by_id(uint32_t replica_id)
> {
> return replicaset.replica_by_id[replica_id];
> }
> +
> +int
> +replica_find_new_id(uint32_t *replica_id)
> +{
> + for (uint32_t i = 1; i < VCLOCK_MAX; ++i) {
> + if (replicaset.replica_by_id[i] == NULL) {
> + *replica_id = i;
> + return 0;
> + }
> + }
> + diag_set(ClientError, ER_REPLICA_MAX, VCLOCK_MAX);
> + return -1;
> +}
> diff --git a/src/box/replication.h b/src/box/replication.h
> index 57e0f10ae..5d1fa1255 100644
> --- a/src/box/replication.h
> +++ b/src/box/replication.h
> @@ -360,6 +360,10 @@ replica_by_uuid(const struct tt_uuid *uuid);
> struct replica *
> replica_by_id(uint32_t replica_id);
>
> +/** Find the smallest free replica ID in the available range. */
> +int
> +replica_find_new_id(uint32_t *replica_id);
> +
> /**
> * Find a node in the replicaset on which the instance can try to register to
> * join the replicaset.
> diff --git a/test/replication/gh-5430-cluster-mvcc.result b/test/replication/gh-5430-cluster-mvcc.result
> new file mode 100644
> index 000000000..831689fc2
> --- /dev/null
> +++ b/test/replication/gh-5430-cluster-mvcc.result
> @@ -0,0 +1,146 @@
> +-- test-run result file version 2
> +--
> +-- gh-5430: when MVCC was enabled for memtx, new replica registration attempt
> +-- could fail with 'duplicate error' in _cluster space. This was happening,
> +-- because _cluster is memtx. Changes to it were not visible for newer requests
> +-- until commit.
> +-- New replica ID was looked up in the space by its full scan. The scan used a
> +-- plain iterator and didn't see replicas, whose registration was in progress of
> +-- being written to WAL.
> +-- As a result, if 2 replicas came to register at the same time, they got the
> +-- same replica ID because didn't see each other in _cluster. One of them would
> +-- fail to register in the end due to the conflict.
> +--
> +-- The test reproduces it by doing anon replica registration. Because during
> +-- normal join there are more than one access to WAL, the ID assignment is the
> +-- last one. It makes hard to block the ID assignment only. With anon replica
> +-- ID assignment the join is already done, the only WAL write is the ID
> +-- assignment, easy to block and yet not block new replicas registration
> +-- attempts.
> +--
> +test_run = require('test_run').new()
> + | ---
> + | ...
> +
> +test_run:cmd('create server master with '.. \
> + 'script="replication/gh-5430-mvcc-master.lua"')
> + | ---
> + | - true
> + | ...
> +test_run:cmd('start server master')
> + | ---
> + | - true
> + | ...
> +
> +test_run:cmd('create server replica1 with '.. \
> + 'script="replication/gh-5430-mvcc-replica1.lua"')
> + | ---
> + | - true
> + | ...
> +test_run:cmd('start server replica1')
> + | ---
> + | - true
> + | ...
> +
> +test_run:cmd('create server replica2 with '.. \
> + 'script="replication/gh-5430-mvcc-replica2.lua"')
> + | ---
> + | - true
> + | ...
> +test_run:cmd('start server replica2')
> + | ---
> + | - true
> + | ...
> +
> +test_run:switch('master')
> + | ---
> + | - true
> + | ...
> +box.error.injection.set('ERRINJ_WAL_DELAY', true)
> + | ---
> + | - ok
> + | ...
> +
> +test_run:switch('replica1')
> + | ---
> + | - true
> + | ...
> +_ = require('fiber').create(function() box.cfg{replication_anon = false} end)
> + | ---
> + | ...
> +str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-')
> + | ---
> + | ...
> +_ = test_run:wait_log('master', str)
> + | ---
> + | ...
> +
> +test_run:switch('replica2')
> + | ---
> + | - true
> + | ...
> +_ = require('fiber').create(function() box.cfg{replication_anon = false} end)
> + | ---
> + | ...
> +str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-')
> + | ---
> + | ...
> +_ = test_run:wait_log('master', str)
> + | ---
> + | ...
> +
> +test_run:switch('master')
> + | ---
> + | - true
> + | ...
> +box.error.injection.set('ERRINJ_WAL_DELAY', false)
> + | ---
> + | - ok
> + | ...
> +
> +test_run:switch('replica1')
> + | ---
> + | - true
> + | ...
> +test_run:wait_cond(function() return box.info.id > 1 end)
> + | ---
> + | - true
> + | ...
> +
> +test_run:switch('replica2')
> + | ---
> + | - true
> + | ...
> +test_run:wait_cond(function() return box.info.id > 1 end)
> + | ---
> + | - true
> + | ...
> +
> +test_run:switch('default')
> + | ---
> + | - true
> + | ...
> +test_run:cmd('stop server replica2')
> + | ---
> + | - true
> + | ...
> +test_run:cmd('delete server replica2')
> + | ---
> + | - true
> + | ...
> +test_run:cmd('stop server replica1')
> + | ---
> + | - true
> + | ...
> +test_run:cmd('delete server replica1')
> + | ---
> + | - true
> + | ...
> +test_run:cmd('stop server master')
> + | ---
> + | - true
> + | ...
> +test_run:cmd('delete server master')
> + | ---
> + | - true
> + | ...
> diff --git a/test/replication/gh-5430-cluster-mvcc.test.lua b/test/replication/gh-5430-cluster-mvcc.test.lua
> new file mode 100644
> index 000000000..bb34ba540
> --- /dev/null
> +++ b/test/replication/gh-5430-cluster-mvcc.test.lua
> @@ -0,0 +1,62 @@
> +--
> +-- gh-5430: when MVCC was enabled for memtx, new replica registration attempt
> +-- could fail with 'duplicate error' in _cluster space. This was happening,
> +-- because _cluster is memtx. Changes to it were not visible for newer requests
> +-- until commit.
> +-- New replica ID was looked up in the space by its full scan. The scan used a
> +-- plain iterator and didn't see replicas, whose registration was in progress of
> +-- being written to WAL.
> +-- As a result, if 2 replicas came to register at the same time, they got the
> +-- same replica ID because didn't see each other in _cluster. One of them would
> +-- fail to register in the end due to the conflict.
> +--
> +-- The test reproduces it by doing anon replica registration. Because during
> +-- normal join there are more than one access to WAL, the ID assignment is the
> +-- last one. It makes hard to block the ID assignment only. With anon replica
> +-- ID assignment the join is already done, the only WAL write is the ID
> +-- assignment, easy to block and yet not block new replicas registration
> +-- attempts.
> +--
> +test_run = require('test_run').new()
> +
> +test_run:cmd('create server master with '.. \
> + 'script="replication/gh-5430-mvcc-master.lua"')
> +test_run:cmd('start server master')
> +
> +test_run:cmd('create server replica1 with '.. \
> + 'script="replication/gh-5430-mvcc-replica1.lua"')
> +test_run:cmd('start server replica1')
> +
> +test_run:cmd('create server replica2 with '.. \
> + 'script="replication/gh-5430-mvcc-replica2.lua"')
> +test_run:cmd('start server replica2')
> +
> +test_run:switch('master')
> +box.error.injection.set('ERRINJ_WAL_DELAY', true)
> +
> +test_run:switch('replica1')
> +_ = require('fiber').create(function() box.cfg{replication_anon = false} end)
> +str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-')
> +_ = test_run:wait_log('master', str)
> +
> +test_run:switch('replica2')
> +_ = require('fiber').create(function() box.cfg{replication_anon = false} end)
> +str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-')
> +_ = test_run:wait_log('master', str)
> +
> +test_run:switch('master')
> +box.error.injection.set('ERRINJ_WAL_DELAY', false)
> +
> +test_run:switch('replica1')
> +test_run:wait_cond(function() return box.info.id > 1 end)
> +
> +test_run:switch('replica2')
> +test_run:wait_cond(function() return box.info.id > 1 end)
> +
> +test_run:switch('default')
> +test_run:cmd('stop server replica2')
> +test_run:cmd('delete server replica2')
> +test_run:cmd('stop server replica1')
> +test_run:cmd('delete server replica1')
> +test_run:cmd('stop server master')
> +test_run:cmd('delete server master')
> diff --git a/test/replication/gh-5430-mvcc-master.lua b/test/replication/gh-5430-mvcc-master.lua
> new file mode 100644
> index 000000000..6c7d0de3d
> --- /dev/null
> +++ b/test/replication/gh-5430-mvcc-master.lua
> @@ -0,0 +1,11 @@
> +#!/usr/bin/env tarantool
> +
> +require('console').listen(os.getenv('ADMIN'))
> +
> +box.cfg({
> + listen = 'unix/:./master.sock',
> + replication_timeout = 0.1,
> + memtx_use_mvcc_engine = true,
> +})
> +
> +box.schema.user.grant('guest', 'super')
> diff --git a/test/replication/gh-5430-mvcc-replica1.lua b/test/replication/gh-5430-mvcc-replica1.lua
> new file mode 100644
> index 000000000..f1ad7b8a4
> --- /dev/null
> +++ b/test/replication/gh-5430-mvcc-replica1.lua
> @@ -0,0 +1,10 @@
> +#!/usr/bin/env tarantool
> +require('console').listen(os.getenv('ADMIN'))
> +
> +box.cfg({
> + listen = os.getenv("LISTEN"),
> + replication = 'unix/:./master.sock',
> + replication_timeout = 0.1,
> + read_only = true,
> + replication_anon = true,
> +})
> diff --git a/test/replication/gh-5430-mvcc-replica2.lua b/test/replication/gh-5430-mvcc-replica2.lua
> new file mode 120000
> index 000000000..28c968a8a
> --- /dev/null
> +++ b/test/replication/gh-5430-mvcc-replica2.lua
> @@ -0,0 +1 @@
> +gh-5430-mvcc-replica1.lua
> \ No newline at end of file
> diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
> index e0bbe2676..0acc66816 100644
> --- a/test/replication/suite.cfg
> +++ b/test/replication/suite.cfg
> @@ -18,6 +18,7 @@
> "gh-5213-qsync-applier-order.test.lua": {},
> "gh-5213-qsync-applier-order-3.test.lua": {},
> "gh-5426-election-on-off.test.lua": {},
> + "gh-5430-cluster-mvcc.test.lua": {},
> "gh-5433-election-restart-recovery.test.lua": {},
> "gh-5445-leader-inconsistency.test.lua": {},
> "gh-5506-election-on-off.test.lua": {},
> diff --git a/test/replication/suite.ini b/test/replication/suite.ini
> index 18981996d..907fd0be9 100644
> --- a/test/replication/suite.ini
> +++ b/test/replication/suite.ini
> @@ -3,7 +3,7 @@ core = tarantool
> script = master.lua
> description = tarantool/box, replication
> disabled = consistent.test.lua
> -release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-qsync-rollback-snap.test.lua gh-5506-election-on-off.test.lua gh-5536-wal-limit.test.lua hang_on_synchro_fail.test.lua anon_register_gap.test.lua gh-5213-qsync-applier-order.test.lua gh-5213-qsync-applier-order-3.test.lua gh-6027-applier-error-show.test.lua gh-6032-promote-wal-write.test.lua gh-6057-qsync-confirm-async-no-wal.test.lua gh-5447-downstream-lag.test.lua gh-4040-invalid-msgpack.test.lua
> +release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-qsync-rollback-snap.test.lua gh-5430-cluster-mvcc.test.lua gh-5506-election-on-off.test.lua gh-5536-wal-limit.test.lua hang_on_synchro_fail.test.lua anon_register_gap.test.lua gh-5213-qsync-applier-order.test.lua gh-5213-qsync-applier-order-3.test.lua gh-6027-applier-error-show.test.lua gh-6032-promote-wal-write.test.lua gh-6057-qsync-confirm-async-no-wal.test.lua gh-5447-downstream-lag.test.lua gh-4040-invalid-msgpack.test.lua
> config = suite.cfg
> lua_libs = lua/fast_replica.lua lua/rlimit.lua
> use_unix_sockets = True
> --
> 2.24.3 (Apple Git-128)
>
More information about the Tarantool-patches
mailing list