Replica registration works via looking for the smallest not occupied ID in _cluster and inserting it into the space. It works not so good when mvcc is enabled. In particular, if more than 1 replica try to register at the same time, they might get the same replica_id because don't see changes of each other until the registration in _cluster is complete. This in the end leads to all replicas failing the registration except one with the 'duplicate key' error (primary index in _cluster is replica ID). The patch makes the replicas occupy their ID before they commit it into _cluster. And new replica ID search now uses the replica ID map instead of _cluster iterator. This way the registration works like before - like MVCC does not exist which is fine. Part of #5430 --- Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5430-cluster-duplicate Issue: https://github.com/tarantool/tarantool/issues/5430 .../gh-5430-cluster-mvcc-duplicate.md | 7 + src/box/alter.cc | 96 ++++++------ src/box/box.cc | 19 +-- src/box/replication.cc | 13 ++ src/box/replication.h | 4 + test/replication/gh-5430-cluster-mvcc.result | 146 ++++++++++++++++++ .../replication/gh-5430-cluster-mvcc.test.lua | 62 ++++++++ test/replication/gh-5430-mvcc-master.lua | 11 ++ test/replication/gh-5430-mvcc-replica1.lua | 10 ++ test/replication/gh-5430-mvcc-replica2.lua | 1 + test/replication/suite.cfg | 1 + test/replication/suite.ini | 2 +- 12 files changed, 306 insertions(+), 66 deletions(-) create mode 100644 changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md create mode 100644 test/replication/gh-5430-cluster-mvcc.result create mode 100644 test/replication/gh-5430-cluster-mvcc.test.lua create mode 100644 test/replication/gh-5430-mvcc-master.lua create mode 100644 test/replication/gh-5430-mvcc-replica1.lua create mode 120000 test/replication/gh-5430-mvcc-replica2.lua diff --git a/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md b/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md new 
file mode 100644 index 000000000..59b90f026 --- /dev/null +++ b/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md @@ -0,0 +1,7 @@ +## bugfix/replication + +* Fixed a rare error appearing when MVCC (`box.cfg.memtx_use_mvcc_engine`) was + enabled and more than one replica was joined to a cluster. The join could fail + with the error `"ER_TUPLE_FOUND: Duplicate key exists in unique index + 'primary' in space '_cluster'"`. The same could happen at bootstrap of a + cluster having >= 3 nodes (gh-5430). diff --git a/src/box/alter.cc b/src/box/alter.cc index 89bb5946c..64ba09021 100644 --- a/src/box/alter.cc +++ b/src/box/alter.cc @@ -4178,47 +4178,11 @@ on_replace_dd_schema(struct trigger * /* trigger */, void *event) return 0; } -/** - * A record with id of the new instance has been synced to the - * write ahead log. Update the cluster configuration cache - * with it. - */ -static int -register_replica(struct trigger *trigger, void * /* event */) -{ - struct tuple *new_tuple = (struct tuple *)trigger->data; - uint32_t id; - if (tuple_field_u32(new_tuple, BOX_CLUSTER_FIELD_ID, &id) != 0) - return -1; - tt_uuid uuid; - if (tuple_field_uuid(new_tuple, BOX_CLUSTER_FIELD_UUID, &uuid) != 0) - return -1; - struct replica *replica = replica_by_uuid(&uuid); - if (replica != NULL) { - replica_set_id(replica, id); - } else { - try { - replica = replicaset_add(id, &uuid); - /* Can't throw exceptions from on_commit trigger */ - } catch(Exception *e) { - panic("Can't register replica: %s", e->errmsg); - } - } - return 0; -} - +/** Unregister the replica affected by the change. 
*/ static int -unregister_replica(struct trigger *trigger, void * /* event */) +on_replace_cluster_clear_id(struct trigger *trigger, void * /* event */) { - struct tuple *old_tuple = (struct tuple *)trigger->data; - - struct tt_uuid old_uuid; - if (tuple_field_uuid(old_tuple, BOX_CLUSTER_FIELD_UUID, &old_uuid) != 0) - return -1; - - struct replica *replica = replica_by_uuid(&old_uuid); - assert(replica != NULL); - replica_clear_id(replica); + replica_clear_id((struct replica *)trigger->data); return 0; } @@ -4280,14 +4244,34 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) "updates of instance uuid"); return -1; } - } else { - struct trigger *on_commit; - on_commit = txn_alter_trigger_new(register_replica, - new_tuple); - if (on_commit == NULL) - return -1; - txn_stmt_on_commit(stmt, on_commit); + return 0; + } + /* + * With read-views enabled there might be already a replica + * whose registration is in progress in another transaction. + * With the same replica ID. + */ + if (replica_by_id(replica_id) != NULL) { + diag_set(ClientError, ER_UNSUPPORTED, "Tarantool", + "more than 1 replica with the same ID"); + return -1; } ==================== I couldn't test this check because of the bug in mvcc: https://github.com/tarantool/tarantool/issues/6246 ==================== + struct trigger *on_rollback = txn_alter_trigger_new( + on_replace_cluster_clear_id, NULL); + if (on_rollback == NULL) + return -1; + /* + * Register the replica before commit so as to occupy the + * replica ID now. While WAL write is in progress, new replicas + * might come, they should see the ID is already in use. 
+ */ + struct replica *replica = replica_by_uuid(&replica_uuid); + if (replica != NULL) + replica_set_id(replica, replica_id); + else + replica = replicaset_add(replica_id, &replica_uuid); + on_rollback->data = replica; + txn_stmt_on_rollback(stmt, on_rollback); } else { /* * Don't allow deletion of the record for this instance @@ -4300,9 +4284,23 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) if (replica_check_id(replica_id) != 0) return -1; - struct trigger *on_commit; - on_commit = txn_alter_trigger_new(unregister_replica, - old_tuple); + struct replica *replica = replica_by_id(replica_id); + if (replica == NULL) { + /* + * Impossible, but it is important not to leave + * undefined behaviour if there is a bug. Too sensitive + * subsystem is affected. + */ + panic("Tried to unregister a replica not stored in " + "replica_by_id map, id is %u", replica_id); + } + /* + * Unregister only after commit. Otherwise if the transaction + * would be rolled back, there might be already another replica + * taken the freed ID. + */ + struct trigger *on_commit = txn_alter_trigger_new( + on_replace_cluster_clear_id, replica); if (on_commit == NULL) return -1; txn_stmt_on_commit(stmt, on_commit); diff --git a/src/box/box.cc b/src/box/box.cc index 8c10a99dd..5c10aceff 100644 --- a/src/box/box.cc +++ b/src/box/box.cc @@ -2407,22 +2407,9 @@ box_on_join(const tt_uuid *instance_uuid) return; /* nothing to do - already registered */ box_check_writable_xc(); - - /** Find the largest existing replica id. */ - struct space *space = space_cache_find_xc(BOX_CLUSTER_ID); - struct index *index = index_find_system_xc(space, 0); - struct iterator *it = index_create_iterator_xc(index, ITER_ALL, - NULL, 0); - IteratorGuard iter_guard(it); - struct tuple *tuple; - /** Assign a new replica id. 
*/ - uint32_t replica_id = 1; - while ((tuple = iterator_next_xc(it)) != NULL) { - if (tuple_field_u32_xc(tuple, - BOX_CLUSTER_FIELD_ID) != replica_id) - break; - replica_id++; - } + uint32_t replica_id; + if (replica_find_new_id(&replica_id) != 0) + diag_raise(); box_register_replica(replica_id, instance_uuid); } diff --git a/src/box/replication.cc b/src/box/replication.cc index 45ad03dfd..1288bc9b1 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -1032,3 +1032,16 @@ replica_by_id(uint32_t replica_id) { return replicaset.replica_by_id[replica_id]; } + +int +replica_find_new_id(uint32_t *replica_id) +{ + for (uint32_t i = 1; i < VCLOCK_MAX; ++i) { + if (replicaset.replica_by_id[i] == NULL) { + *replica_id = i; + return 0; + } + } + diag_set(ClientError, ER_REPLICA_MAX, VCLOCK_MAX); + return -1; +} diff --git a/src/box/replication.h b/src/box/replication.h index 57e0f10ae..5d1fa1255 100644 --- a/src/box/replication.h +++ b/src/box/replication.h @@ -360,6 +360,10 @@ replica_by_uuid(const struct tt_uuid *uuid); struct replica * replica_by_id(uint32_t replica_id); +/** Find the smallest free replica ID in the available range. */ +int +replica_find_new_id(uint32_t *replica_id); + /** * Find a node in the replicaset on which the instance can try to register to * join the replicaset. diff --git a/test/replication/gh-5430-cluster-mvcc.result b/test/replication/gh-5430-cluster-mvcc.result new file mode 100644 index 000000000..831689fc2 --- /dev/null +++ b/test/replication/gh-5430-cluster-mvcc.result @@ -0,0 +1,146 @@ +-- test-run result file version 2 +-- +-- gh-5430: when MVCC was enabled for memtx, new replica registration attempt +-- could fail with 'duplicate error' in _cluster space. This was happening, +-- because _cluster is memtx. Changes to it were not visible for newer requests +-- until commit. +-- New replica ID was looked up in the space by its full scan. 
The scan used a +-- plain iterator and didn't see replicas, whose registration was in progress of +-- being written to WAL. +-- As a result, if 2 replicas came to register at the same time, they got the +-- same replica ID because didn't see each other in _cluster. One of them would +-- fail to register in the end due to the conflict. +-- +-- The test reproduces it by doing anon replica registration. Because during +-- normal join there are more than one access to WAL, the ID assignment is the +-- last one. It makes hard to block the ID assignment only. With anon replica +-- ID assignment the join is already done, the only WAL write is the ID +-- assignment, easy to block and yet not block new replicas registration +-- attempts. +-- +test_run = require('test_run').new() + | --- + | ... + +test_run:cmd('create server master with '.. \ + 'script="replication/gh-5430-mvcc-master.lua"') + | --- + | - true + | ... +test_run:cmd('start server master') + | --- + | - true + | ... + +test_run:cmd('create server replica1 with '.. \ + 'script="replication/gh-5430-mvcc-replica1.lua"') + | --- + | - true + | ... +test_run:cmd('start server replica1') + | --- + | - true + | ... + +test_run:cmd('create server replica2 with '.. \ + 'script="replication/gh-5430-mvcc-replica2.lua"') + | --- + | - true + | ... +test_run:cmd('start server replica2') + | --- + | - true + | ... + +test_run:switch('master') + | --- + | - true + | ... +box.error.injection.set('ERRINJ_WAL_DELAY', true) + | --- + | - ok + | ... + +test_run:switch('replica1') + | --- + | - true + | ... +_ = require('fiber').create(function() box.cfg{replication_anon = false} end) + | --- + | ... +str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-') + | --- + | ... +_ = test_run:wait_log('master', str) + | --- + | ... + +test_run:switch('replica2') + | --- + | - true + | ... +_ = require('fiber').create(function() box.cfg{replication_anon = false} end) + | --- + | ... 
+str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-') + | --- + | ... +_ = test_run:wait_log('master', str) + | --- + | ... + +test_run:switch('master') + | --- + | - true + | ... +box.error.injection.set('ERRINJ_WAL_DELAY', false) + | --- + | - ok + | ... + +test_run:switch('replica1') + | --- + | - true + | ... +test_run:wait_cond(function() return box.info.id > 1 end) + | --- + | - true + | ... + +test_run:switch('replica2') + | --- + | - true + | ... +test_run:wait_cond(function() return box.info.id > 1 end) + | --- + | - true + | ... + +test_run:switch('default') + | --- + | - true + | ... +test_run:cmd('stop server replica2') + | --- + | - true + | ... +test_run:cmd('delete server replica2') + | --- + | - true + | ... +test_run:cmd('stop server replica1') + | --- + | - true + | ... +test_run:cmd('delete server replica1') + | --- + | - true + | ... +test_run:cmd('stop server master') + | --- + | - true + | ... +test_run:cmd('delete server master') + | --- + | - true + | ... diff --git a/test/replication/gh-5430-cluster-mvcc.test.lua b/test/replication/gh-5430-cluster-mvcc.test.lua new file mode 100644 index 000000000..bb34ba540 --- /dev/null +++ b/test/replication/gh-5430-cluster-mvcc.test.lua @@ -0,0 +1,62 @@ +-- +-- gh-5430: when MVCC was enabled for memtx, new replica registration attempt +-- could fail with 'duplicate error' in _cluster space. This was happening, +-- because _cluster is memtx. Changes to it were not visible for newer requests +-- until commit. +-- New replica ID was looked up in the space by its full scan. The scan used a +-- plain iterator and didn't see replicas, whose registration was in progress of +-- being written to WAL. +-- As a result, if 2 replicas came to register at the same time, they got the +-- same replica ID because didn't see each other in _cluster. One of them would +-- fail to register in the end due to the conflict. +-- +-- The test reproduces it by doing anon replica registration. 
Because during +-- normal join there are more than one access to WAL, the ID assignment is the +-- last one. It makes hard to block the ID assignment only. With anon replica +-- ID assignment the join is already done, the only WAL write is the ID +-- assignment, easy to block and yet not block new replicas registration +-- attempts. +-- +test_run = require('test_run').new() + +test_run:cmd('create server master with '.. \ + 'script="replication/gh-5430-mvcc-master.lua"') +test_run:cmd('start server master') + +test_run:cmd('create server replica1 with '.. \ + 'script="replication/gh-5430-mvcc-replica1.lua"') +test_run:cmd('start server replica1') + +test_run:cmd('create server replica2 with '.. \ + 'script="replication/gh-5430-mvcc-replica2.lua"') +test_run:cmd('start server replica2') + +test_run:switch('master') +box.error.injection.set('ERRINJ_WAL_DELAY', true) + +test_run:switch('replica1') +_ = require('fiber').create(function() box.cfg{replication_anon = false} end) +str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-') +_ = test_run:wait_log('master', str) + +test_run:switch('replica2') +_ = require('fiber').create(function() box.cfg{replication_anon = false} end) +str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-') +_ = test_run:wait_log('master', str) + +test_run:switch('master') +box.error.injection.set('ERRINJ_WAL_DELAY', false) + +test_run:switch('replica1') +test_run:wait_cond(function() return box.info.id > 1 end) + +test_run:switch('replica2') +test_run:wait_cond(function() return box.info.id > 1 end) + +test_run:switch('default') +test_run:cmd('stop server replica2') +test_run:cmd('delete server replica2') +test_run:cmd('stop server replica1') +test_run:cmd('delete server replica1') +test_run:cmd('stop server master') +test_run:cmd('delete server master') diff --git a/test/replication/gh-5430-mvcc-master.lua b/test/replication/gh-5430-mvcc-master.lua new file mode 100644 index 000000000..6c7d0de3d 
--- /dev/null +++ b/test/replication/gh-5430-mvcc-master.lua @@ -0,0 +1,11 @@ +#!/usr/bin/env tarantool + +require('console').listen(os.getenv('ADMIN')) + +box.cfg({ + listen = 'unix/:./master.sock', + replication_timeout = 0.1, + memtx_use_mvcc_engine = true, +}) + +box.schema.user.grant('guest', 'super') diff --git a/test/replication/gh-5430-mvcc-replica1.lua b/test/replication/gh-5430-mvcc-replica1.lua new file mode 100644 index 000000000..f1ad7b8a4 --- /dev/null +++ b/test/replication/gh-5430-mvcc-replica1.lua @@ -0,0 +1,10 @@ +#!/usr/bin/env tarantool +require('console').listen(os.getenv('ADMIN')) + +box.cfg({ + listen = os.getenv("LISTEN"), + replication = 'unix/:./master.sock', + replication_timeout = 0.1, + read_only = true, + replication_anon = true, +}) diff --git a/test/replication/gh-5430-mvcc-replica2.lua b/test/replication/gh-5430-mvcc-replica2.lua new file mode 120000 index 000000000..28c968a8a --- /dev/null +++ b/test/replication/gh-5430-mvcc-replica2.lua @@ -0,0 +1 @@ +gh-5430-mvcc-replica1.lua \ No newline at end of file diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg index e0bbe2676..0acc66816 100644 --- a/test/replication/suite.cfg +++ b/test/replication/suite.cfg @@ -18,6 +18,7 @@ "gh-5213-qsync-applier-order.test.lua": {}, "gh-5213-qsync-applier-order-3.test.lua": {}, "gh-5426-election-on-off.test.lua": {}, + "gh-5430-cluster-mvcc.test.lua": {}, "gh-5433-election-restart-recovery.test.lua": {}, "gh-5445-leader-inconsistency.test.lua": {}, "gh-5506-election-on-off.test.lua": {}, diff --git a/test/replication/suite.ini b/test/replication/suite.ini index 18981996d..907fd0be9 100644 --- a/test/replication/suite.ini +++ b/test/replication/suite.ini @@ -3,7 +3,7 @@ core = tarantool script = master.lua description = tarantool/box, replication disabled = consistent.test.lua -release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua 
quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-qsync-rollback-snap.test.lua gh-5506-election-on-off.test.lua gh-5536-wal-limit.test.lua hang_on_synchro_fail.test.lua anon_register_gap.test.lua gh-5213-qsync-applier-order.test.lua gh-5213-qsync-applier-order-3.test.lua gh-6027-applier-error-show.test.lua gh-6032-promote-wal-write.test.lua gh-6057-qsync-confirm-async-no-wal.test.lua gh-5447-downstream-lag.test.lua gh-4040-invalid-msgpack.test.lua +release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-qsync-rollback-snap.test.lua gh-5430-cluster-mvcc.test.lua gh-5506-election-on-off.test.lua gh-5536-wal-limit.test.lua hang_on_synchro_fail.test.lua anon_register_gap.test.lua gh-5213-qsync-applier-order.test.lua gh-5213-qsync-applier-order-3.test.lua gh-6027-applier-error-show.test.lua gh-6032-promote-wal-write.test.lua gh-6057-qsync-confirm-async-no-wal.test.lua gh-5447-downstream-lag.test.lua gh-4040-invalid-msgpack.test.lua config = suite.cfg lua_libs = lua/fast_replica.lua lua/rlimit.lua use_unix_sockets = True -- 2.24.3 (Apple Git-128)
25.07.2021 19:53, Vladislav Shpilevoy пишет: > Replica registration works via looking for the smallest not > occupied ID in _cluster and inserting it into the space. > > It works not so good when mvcc is enabled. In particular, if more > than 1 replica try to register at the same time, they might get > the same replica_id because don't see changes of each other until > the registration in _cluster is complete. > > This in the end leads to all replicas failing the registration > except one with the 'duplicate key' error (primary index in > _cluster is replica ID). > > The patch makes the replicas occupy their ID before they commit it > into _cluster. And new replica ID search now uses the replica ID > map instead of _cluster iterator. > > This way the registration works like before - like MVCC does not > exist which is fine. > > Part of #5430 > --- Hi! Thanks for the patch! Please, find a couple of comments below. > Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5430-cluster-duplicate > Issue: https://github.com/tarantool/tarantool/issues/5430 > > .../gh-5430-cluster-mvcc-duplicate.md | 7 + > src/box/alter.cc | 96 ++++++------ > src/box/box.cc | 19 +-- > src/box/replication.cc | 13 ++ > src/box/replication.h | 4 + > test/replication/gh-5430-cluster-mvcc.result | 146 ++++++++++++++++++ > .../replication/gh-5430-cluster-mvcc.test.lua | 62 ++++++++ > test/replication/gh-5430-mvcc-master.lua | 11 ++ > test/replication/gh-5430-mvcc-replica1.lua | 10 ++ > test/replication/gh-5430-mvcc-replica2.lua | 1 + > test/replication/suite.cfg | 1 + > test/replication/suite.ini | 2 +- > 12 files changed, 306 insertions(+), 66 deletions(-) > create mode 100644 changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md > create mode 100644 test/replication/gh-5430-cluster-mvcc.result > create mode 100644 test/replication/gh-5430-cluster-mvcc.test.lua > create mode 100644 test/replication/gh-5430-mvcc-master.lua > create mode 100644 
test/replication/gh-5430-mvcc-replica1.lua > create mode 120000 test/replication/gh-5430-mvcc-replica2.lua > > diff --git a/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md b/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md > new file mode 100644 > index 000000000..59b90f026 > --- /dev/null > +++ b/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md > @@ -0,0 +1,7 @@ > +## bugfix/replication > + > +* Fixed a rare error appearing when MVCC (`box.cfg.memtx_use_mvcc_engine`) was > + enabled and more than one replica was joined to a cluster. The join could fail > + with the error `"ER_TUPLE_FOUND: Duplicate key exists in unique index > + 'primary' in space '_cluster'"`. The same could happen at bootstrap of a > + cluster having >= 3 nodes (gh-5430). > diff --git a/src/box/alter.cc b/src/box/alter.cc > index 89bb5946c..64ba09021 100644 > --- a/src/box/alter.cc > +++ b/src/box/alter.cc > @@ -4178,47 +4178,11 @@ on_replace_dd_schema(struct trigger * /* trigger */, void *event) > return 0; > } > > -/** > - * A record with id of the new instance has been synced to the > - * write ahead log. Update the cluster configuration cache > - * with it. > - */ > -static int > -register_replica(struct trigger *trigger, void * /* event */) > -{ > - struct tuple *new_tuple = (struct tuple *)trigger->data; > - uint32_t id; > - if (tuple_field_u32(new_tuple, BOX_CLUSTER_FIELD_ID, &id) != 0) > - return -1; > - tt_uuid uuid; > - if (tuple_field_uuid(new_tuple, BOX_CLUSTER_FIELD_UUID, &uuid) != 0) > - return -1; > - struct replica *replica = replica_by_uuid(&uuid); > - if (replica != NULL) { > - replica_set_id(replica, id); > - } else { > - try { > - replica = replicaset_add(id, &uuid); > - /* Can't throw exceptions from on_commit trigger */ > - } catch(Exception *e) { > - panic("Can't register replica: %s", e->errmsg); > - } > - } > - return 0; > -} > - > +/** Unregister the replica affected by the change. 
*/ > static int > -unregister_replica(struct trigger *trigger, void * /* event */) > +on_replace_cluster_clear_id(struct trigger *trigger, void * /* event */) > { > - struct tuple *old_tuple = (struct tuple *)trigger->data; > - > - struct tt_uuid old_uuid; > - if (tuple_field_uuid(old_tuple, BOX_CLUSTER_FIELD_UUID, &old_uuid) != 0) > - return -1; > - > - struct replica *replica = replica_by_uuid(&old_uuid); > - assert(replica != NULL); > - replica_clear_id(replica); > + replica_clear_id((struct replica *)trigger->data); > return 0; > } > > @@ -4280,14 +4244,34 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) > "updates of instance uuid"); > return -1; > } > - } else { > - struct trigger *on_commit; > - on_commit = txn_alter_trigger_new(register_replica, > - new_tuple); > - if (on_commit == NULL) > - return -1; > - txn_stmt_on_commit(stmt, on_commit); > + return 0; > + } > + /* > + * With read-views enabled there might be already a replica > + * whose registration is in progress in another transaction. > + * With the same replica ID. > + */ > + if (replica_by_id(replica_id) != NULL) { > + diag_set(ClientError, ER_UNSUPPORTED, "Tarantool", > + "more than 1 replica with the same ID"); > + return -1; > } > ==================== > > I couldn't test this check because of the bug in mvcc: > https://github.com/tarantool/tarantool/issues/6246 > > ==================== I don't understand how this could happen (I mean the if branch being taken). Would you mind explaining? > + struct trigger *on_rollback = txn_alter_trigger_new( > + on_replace_cluster_clear_id, NULL); > + if (on_rollback == NULL) > + return -1; > + /* > + * Register the replica before commit so as to occupy the > + * replica ID now. While WAL write is in progress, new replicas > + * might come, they should see the ID is already in use. 
> + */ > + struct replica *replica = replica_by_uuid(&replica_uuid); > + if (replica != NULL) > + replica_set_id(replica, replica_id); > + else > + replica = replicaset_add(replica_id, &replica_uuid); > + on_rollback->data = replica; > + txn_stmt_on_rollback(stmt, on_rollback); > } else { > /* > * Don't allow deletion of the record for this instance > @@ -4300,9 +4284,23 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) > if (replica_check_id(replica_id) != 0) > return -1; > > - struct trigger *on_commit; > - on_commit = txn_alter_trigger_new(unregister_replica, > - old_tuple); > + struct replica *replica = replica_by_id(replica_id); > + if (replica == NULL) { > + /* > + * Impossible, but it is important not to leave > + * undefined behaviour if there is a bug. Too sensitive > + * subsystem is affected. > + */ > + panic("Tried to unregister a replica not stored in " > + "replica_by_id map, id is %u", replica_id); > + } > + /* > + * Unregister only after commit. Otherwise if the transaction > + * would be rolled back, there might be already another replica > + * taken the freed ID. > + */ > + struct trigger *on_commit = txn_alter_trigger_new( > + on_replace_cluster_clear_id, replica); > if (on_commit == NULL) > return -1; > txn_stmt_on_commit(stmt, on_commit); > diff --git a/src/box/box.cc b/src/box/box.cc > index 8c10a99dd..5c10aceff 100644 > --- a/src/box/box.cc > +++ b/src/box/box.cc > @@ -2407,22 +2407,9 @@ box_on_join(const tt_uuid *instance_uuid) > return; /* nothing to do - already registered */ > > box_check_writable_xc(); > - > - /** Find the largest existing replica id. */ > - struct space *space = space_cache_find_xc(BOX_CLUSTER_ID); > - struct index *index = index_find_system_xc(space, 0); > - struct iterator *it = index_create_iterator_xc(index, ITER_ALL, > - NULL, 0); > - IteratorGuard iter_guard(it); > - struct tuple *tuple; > - /** Assign a new replica id. 
*/ > - uint32_t replica_id = 1; > - while ((tuple = iterator_next_xc(it)) != NULL) { > - if (tuple_field_u32_xc(tuple, > - BOX_CLUSTER_FIELD_ID) != replica_id) > - break; > - replica_id++; > - } > + uint32_t replica_id; > + if (replica_find_new_id(&replica_id) != 0) > + diag_raise(); > box_register_replica(replica_id, instance_uuid); > } > > diff --git a/src/box/replication.cc b/src/box/replication.cc > index 45ad03dfd..1288bc9b1 100644 > --- a/src/box/replication.cc > +++ b/src/box/replication.cc > @@ -1032,3 +1032,16 @@ replica_by_id(uint32_t replica_id) > { > return replicaset.replica_by_id[replica_id]; > } > + > +int > +replica_find_new_id(uint32_t *replica_id) > +{ > + for (uint32_t i = 1; i < VCLOCK_MAX; ++i) { > + if (replicaset.replica_by_id[i] == NULL) { > + *replica_id = i; > + return 0; > + } > + } > + diag_set(ClientError, ER_REPLICA_MAX, VCLOCK_MAX); > + return -1; > +} > diff --git a/src/box/replication.h b/src/box/replication.h > index 57e0f10ae..5d1fa1255 100644 > --- a/src/box/replication.h > +++ b/src/box/replication.h > @@ -360,6 +360,10 @@ replica_by_uuid(const struct tt_uuid *uuid); > struct replica * > replica_by_id(uint32_t replica_id); > > +/** Find the smallest free replica ID in the available range. */ nit: free -> empty / unoccupied > +int > +replica_find_new_id(uint32_t *replica_id); > + > /** > * Find a node in the replicaset on which the instance can try to register to > * join the replicaset.
Thanks for the review! I add Sergey Os. to the review as Sergey P. is on vacation. >> diff --git a/src/box/alter.cc b/src/box/alter.cc >> index 89bb5946c..64ba09021 100644 >> --- a/src/box/alter.cc >> +++ b/src/box/alter.cc >> @@ -4280,14 +4244,34 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) >> "updates of instance uuid"); >> return -1; >> } >> - } else { >> - struct trigger *on_commit; >> - on_commit = txn_alter_trigger_new(register_replica, >> - new_tuple); >> - if (on_commit == NULL) >> - return -1; >> - txn_stmt_on_commit(stmt, on_commit); >> + return 0; >> + } >> + /* >> + * With read-views enabled there might be already a replica >> + * whose registration is in progress in another transaction. >> + * With the same replica ID. >> + */ >> + if (replica_by_id(replica_id) != NULL) { >> + diag_set(ClientError, ER_UNSUPPORTED, "Tarantool", >> + "more than 1 replica with the same ID"); >> + return -1; >> } >> ==================== >> >> I couldn't test this check because of the bug in mvcc: >> https://github.com/tarantool/tarantool/issues/6246 >> >> ==================== > > I don't understand how this could happen > (I mean the if branch being taken). > > Would you mind explaining? When MVCC will be fixed, it can happen that there are 2 transactions trying to change _cluster. If they add the same replica ID and the second transaction starts before the first one is finished, it is going to see replicaset.replica_by_id[new_id] != NULL from the first transaction. Even before any of them is committed. Hence it can't add its own new struct replica into the same place nor can it touch the struct replica from the foreign transaction. Therefore the only choice is to fail with an error. The same would happen if MVCC is turned off - then space:insert() would fail with duplicate tuple error right away due to dirty reads. 
>> diff --git a/src/box/replication.h b/src/box/replication.h >> index 57e0f10ae..5d1fa1255 100644 >> --- a/src/box/replication.h >> +++ b/src/box/replication.h >> @@ -360,6 +360,10 @@ replica_by_uuid(const struct tt_uuid *uuid); >> struct replica * >> replica_by_id(uint32_t replica_id); >> +/** Find the smallest free replica ID in the available range. */ > > > nit: free -> empty / unoccupied Oxford dictionary says 'free' meanings are, among others, "not busy", "not being used", "not blocked", "ready to give". https://www.oxfordlearnersdictionaries.com/definition/english/free_1?q=free Which means the current word is fine. But I don't mind to change it: ==================== @@ -360,7 +360,7 @@ replica_by_uuid(const struct tt_uuid *uuid); struct replica * replica_by_id(uint32_t replica_id); -/** Find the smallest free replica ID in the available range. */ +/** Find the smallest empty replica ID in the available range. */ int replica_find_new_id(uint32_t *replica_id);
On Sun, Jul 25, 2021 at 06:53:02PM +0200, Vladislav Shpilevoy wrote:
> Replica registration works via looking for the smallest not
> occupied ID in _cluster and inserting it into the space.
>
> It does not work so well when MVCC is enabled. In particular, if more
> than one replica tries to register at the same time, they might get
> the same replica_id, because they don't see each other's changes until
> the registration in _cluster is complete.
>
> In the end this leads to all replicas except one failing the
> registration with the 'duplicate key' error (the primary index in
> _cluster is the replica ID).
>
> The patch makes the replicas occupy their ID before they commit it
> into _cluster. And new replica ID search now uses the replica ID
> map instead of _cluster iterator.
>
> This way the registration works like before - as if MVCC did not
> exist, which is fine.
>
> Part of #5430
I don't see any obvious problems in the patch, so Ack.
Hi! Thanks for the patch! Please, see my 3 comments below. > On 25 Jul 2021, at 19:53, Vladislav Shpilevoy via Tarantool-patches <tarantool-patches@dev.tarantool.org> wrote: > > Replica registration works via looking for the smallest not > occupied ID in _cluster and inserting it into the space. > > It works not so good when mvcc is enabled. In particular, if more > than 1 replica try to register at the same time, they might get > the same replica_id because don't see changes of each other until > the registration in _cluster is complete. > > This in the end leads to all replicas failing the registration > except one with the 'duplicate key' error (primary index in > _cluster is replica ID). > > The patch makes the replicas occupy their ID before they commit it > into _cluster. And new replica ID search now uses the replica ID > map instead of _cluster iterator. > > This way the registration works like before - like MVCC does not > exist which is fine. Did you discuss the MVCC capabilities - if we can address it there, by setting a dedicated flag for this particular - and perhaps for some other internal - space(s) to suppress MVCC ‘coverage’ for them. The solution will be way more common with supposedly less hassle with local structures, triggers, and so on? 
> > Part of #5430 > --- > Branch: http://github.com/tarantool/tarantool/tree/gerold103/gh-5430-cluster-duplicate > Issue: https://github.com/tarantool/tarantool/issues/5430 > > .../gh-5430-cluster-mvcc-duplicate.md | 7 + > src/box/alter.cc | 96 ++++++------ > src/box/box.cc | 19 +-- > src/box/replication.cc | 13 ++ > src/box/replication.h | 4 + > test/replication/gh-5430-cluster-mvcc.result | 146 ++++++++++++++++++ > .../replication/gh-5430-cluster-mvcc.test.lua | 62 ++++++++ > test/replication/gh-5430-mvcc-master.lua | 11 ++ > test/replication/gh-5430-mvcc-replica1.lua | 10 ++ > test/replication/gh-5430-mvcc-replica2.lua | 1 + > test/replication/suite.cfg | 1 + > test/replication/suite.ini | 2 +- > 12 files changed, 306 insertions(+), 66 deletions(-) > create mode 100644 changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md > create mode 100644 test/replication/gh-5430-cluster-mvcc.result > create mode 100644 test/replication/gh-5430-cluster-mvcc.test.lua > create mode 100644 test/replication/gh-5430-mvcc-master.lua > create mode 100644 test/replication/gh-5430-mvcc-replica1.lua > create mode 120000 test/replication/gh-5430-mvcc-replica2.lua > > diff --git a/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md b/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md > new file mode 100644 > index 000000000..59b90f026 > --- /dev/null > +++ b/changelogs/unreleased/gh-5430-cluster-mvcc-duplicate.md > @@ -0,0 +1,7 @@ > +## bugfix/replication > + > +* Fixed a rare error appearing when MVCC (`box.cfg.memtx_use_mvcc_engine`) was > + enabled and more than one replica was joined to a cluster. The join could fail > + with the error `"ER_TUPLE_FOUND: Duplicate key exists in unique index > + 'primary' in space '_cluster'"`. The same could happen at bootstrap of a > + cluster having >= 3 nodes (gh-5430). 
> diff --git a/src/box/alter.cc b/src/box/alter.cc > index 89bb5946c..64ba09021 100644 > --- a/src/box/alter.cc > +++ b/src/box/alter.cc > @@ -4178,47 +4178,11 @@ on_replace_dd_schema(struct trigger * /* trigger */, void *event) > return 0; > } > > -/** > - * A record with id of the new instance has been synced to the > - * write ahead log. Update the cluster configuration cache > - * with it. > - */ > -static int > -register_replica(struct trigger *trigger, void * /* event */) > -{ > - struct tuple *new_tuple = (struct tuple *)trigger->data; > - uint32_t id; > - if (tuple_field_u32(new_tuple, BOX_CLUSTER_FIELD_ID, &id) != 0) > - return -1; > - tt_uuid uuid; > - if (tuple_field_uuid(new_tuple, BOX_CLUSTER_FIELD_UUID, &uuid) != 0) > - return -1; > - struct replica *replica = replica_by_uuid(&uuid); > - if (replica != NULL) { > - replica_set_id(replica, id); > - } else { > - try { > - replica = replicaset_add(id, &uuid); > - /* Can't throw exceptions from on_commit trigger */ > - } catch(Exception *e) { > - panic("Can't register replica: %s", e->errmsg); > - } > - } > - return 0; > -} > - > +/** Unregister the replica affected by the change. 
*/ > static int > -unregister_replica(struct trigger *trigger, void * /* event */) > +on_replace_cluster_clear_id(struct trigger *trigger, void * /* event */) > { > - struct tuple *old_tuple = (struct tuple *)trigger->data; > - > - struct tt_uuid old_uuid; > - if (tuple_field_uuid(old_tuple, BOX_CLUSTER_FIELD_UUID, &old_uuid) != 0) > - return -1; > - > - struct replica *replica = replica_by_uuid(&old_uuid); > - assert(replica != NULL); > - replica_clear_id(replica); > + replica_clear_id((struct replica *)trigger->data); > return 0; > } > > @@ -4280,14 +4244,34 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) > "updates of instance uuid"); > return -1; > } > - } else { > - struct trigger *on_commit; > - on_commit = txn_alter_trigger_new(register_replica, > - new_tuple); > - if (on_commit == NULL) > - return -1; > - txn_stmt_on_commit(stmt, on_commit); > + return 0; > + } > + /* > + * With read-views enabled there might be already a replica > + * whose registration is in progress in another transaction. > + * With the same replica ID. > + */ > + if (replica_by_id(replica_id) != NULL) { > + diag_set(ClientError, ER_UNSUPPORTED, "Tarantool", > + "more than 1 replica with the same ID"); There should be more details on replica ID/UUID/name here, same as in unregister panic below. > + return -1; > } > ==================== > > I couldn't test this check because of the bug in mvcc: > https://github.com/tarantool/tarantool/issues/6246 > > ==================== > + struct trigger *on_rollback = txn_alter_trigger_new( > + on_replace_cluster_clear_id, NULL); > + if (on_rollback == NULL) > + return -1; > + /* > + * Register the replica before commit so as to occupy the > + * replica ID now. While WAL write is in progress, new replicas > + * might come, they should see the ID is already in use. 
> + */ > + struct replica *replica = replica_by_uuid(&replica_uuid); > + if (replica != NULL) > + replica_set_id(replica, replica_id); > + else > + replica = replicaset_add(replica_id, &replica_uuid); > + on_rollback->data = replica; > + txn_stmt_on_rollback(stmt, on_rollback); > } else { > /* > * Don't allow deletion of the record for this instance > @@ -4300,9 +4284,23 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) > if (replica_check_id(replica_id) != 0) > return -1; > > - struct trigger *on_commit; > - on_commit = txn_alter_trigger_new(unregister_replica, > - old_tuple); > + struct replica *replica = replica_by_id(replica_id); > + if (replica == NULL) { > + /* > + * Impossible, but it is important not to leave > + * undefined behaviour if there is a bug. Too sensitive > + * subsystem is affected. > + */ > + panic("Tried to unregister a replica not stored in " > + "replica_by_id map, id is %u", replica_id); > + } > + /* > + * Unregister only after commit. Otherwise if the transaction > + * would be rolled back, there might be already another replica > + * taken the freed ID. > + */ > + struct trigger *on_commit = txn_alter_trigger_new( > + on_replace_cluster_clear_id, replica); > if (on_commit == NULL) > return -1; > txn_stmt_on_commit(stmt, on_commit); > diff --git a/src/box/box.cc b/src/box/box.cc > index 8c10a99dd..5c10aceff 100644 > --- a/src/box/box.cc > +++ b/src/box/box.cc > @@ -2407,22 +2407,9 @@ box_on_join(const tt_uuid *instance_uuid) > return; /* nothing to do - already registered */ > > box_check_writable_xc(); > - > - /** Find the largest existing replica id. */ > - struct space *space = space_cache_find_xc(BOX_CLUSTER_ID); > - struct index *index = index_find_system_xc(space, 0); > - struct iterator *it = index_create_iterator_xc(index, ITER_ALL, > - NULL, 0); > - IteratorGuard iter_guard(it); > - struct tuple *tuple; > - /** Assign a new replica id. 
*/ > - uint32_t replica_id = 1; > - while ((tuple = iterator_next_xc(it)) != NULL) { > - if (tuple_field_u32_xc(tuple, > - BOX_CLUSTER_FIELD_ID) != replica_id) > - break; > - replica_id++; > - } > + uint32_t replica_id; > + if (replica_find_new_id(&replica_id) != 0) > + diag_raise(); Any info on why register fails? > box_register_replica(replica_id, instance_uuid); > } > > diff --git a/src/box/replication.cc b/src/box/replication.cc > index 45ad03dfd..1288bc9b1 100644 > --- a/src/box/replication.cc > +++ b/src/box/replication.cc > @@ -1032,3 +1032,16 @@ replica_by_id(uint32_t replica_id) > { > return replicaset.replica_by_id[replica_id]; > } > + > +int > +replica_find_new_id(uint32_t *replica_id) > +{ > + for (uint32_t i = 1; i < VCLOCK_MAX; ++i) { > + if (replicaset.replica_by_id[i] == NULL) { > + *replica_id = i; > + return 0; > + } > + } > + diag_set(ClientError, ER_REPLICA_MAX, VCLOCK_MAX); > + return -1; > +} > diff --git a/src/box/replication.h b/src/box/replication.h > index 57e0f10ae..5d1fa1255 100644 > --- a/src/box/replication.h > +++ b/src/box/replication.h > @@ -360,6 +360,10 @@ replica_by_uuid(const struct tt_uuid *uuid); > struct replica * > replica_by_id(uint32_t replica_id); > > +/** Find the smallest free replica ID in the available range. */ > +int > +replica_find_new_id(uint32_t *replica_id); > + > /** > * Find a node in the replicaset on which the instance can try to register to > * join the replicaset. > diff --git a/test/replication/gh-5430-cluster-mvcc.result b/test/replication/gh-5430-cluster-mvcc.result > new file mode 100644 > index 000000000..831689fc2 > --- /dev/null > +++ b/test/replication/gh-5430-cluster-mvcc.result > @@ -0,0 +1,146 @@ > +-- test-run result file version 2 > +-- > +-- gh-5430: when MVCC was enabled for memtx, new replica registration attempt > +-- could fail with 'duplicate error' in _cluster space. This was happening, > +-- because _cluster is memtx. 
Changes to it were not visible for newer requests > +-- until commit. > +-- New replica ID was looked up in the space by its full scan. The scan used a > +-- plain iterator and didn't see replicas, whose registration was in progress of > +-- being written to WAL. > +-- As a result, if 2 replicas came to register at the same time, they got the > +-- same replica ID because didn't see each other in _cluster. One of them would > +-- fail to register in the end due to the conflict. > +-- > +-- The test reproduces it by doing anon replica registration. Because during > +-- normal join there are more than one access to WAL, the ID assignment is the > +-- last one. It makes hard to block the ID assignment only. With anon replica > +-- ID assignment the join is already done, the only WAL write is the ID > +-- assignment, easy to block and yet not block new replicas registration > +-- attempts. > +-- > +test_run = require('test_run').new() > + | --- > + | ... > + > +test_run:cmd('create server master with '.. \ > + 'script="replication/gh-5430-mvcc-master.lua"') > + | --- > + | - true > + | ... > +test_run:cmd('start server master') > + | --- > + | - true > + | ... > + > +test_run:cmd('create server replica1 with '.. \ > + 'script="replication/gh-5430-mvcc-replica1.lua"') > + | --- > + | - true > + | ... > +test_run:cmd('start server replica1') > + | --- > + | - true > + | ... > + > +test_run:cmd('create server replica2 with '.. \ > + 'script="replication/gh-5430-mvcc-replica2.lua"') > + | --- > + | - true > + | ... > +test_run:cmd('start server replica2') > + | --- > + | - true > + | ... > + > +test_run:switch('master') > + | --- > + | - true > + | ... > +box.error.injection.set('ERRINJ_WAL_DELAY', true) > + | --- > + | - ok > + | ... > + > +test_run:switch('replica1') > + | --- > + | - true > + | ... > +_ = require('fiber').create(function() box.cfg{replication_anon = false} end) > + | --- > + | ... 
> +str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-') > + | --- > + | ... > +_ = test_run:wait_log('master', str) > + | --- > + | ... > + > +test_run:switch('replica2') > + | --- > + | - true > + | ... > +_ = require('fiber').create(function() box.cfg{replication_anon = false} end) > + | --- > + | ... > +str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-') > + | --- > + | ... > +_ = test_run:wait_log('master', str) > + | --- > + | ... > + > +test_run:switch('master') > + | --- > + | - true > + | ... > +box.error.injection.set('ERRINJ_WAL_DELAY', false) > + | --- > + | - ok > + | ... > + > +test_run:switch('replica1') > + | --- > + | - true > + | ... > +test_run:wait_cond(function() return box.info.id > 1 end) > + | --- > + | - true > + | ... > + > +test_run:switch('replica2') > + | --- > + | - true > + | ... > +test_run:wait_cond(function() return box.info.id > 1 end) > + | --- > + | - true > + | ... > + > +test_run:switch('default') > + | --- > + | - true > + | ... > +test_run:cmd('stop server replica2') > + | --- > + | - true > + | ... > +test_run:cmd('delete server replica2') > + | --- > + | - true > + | ... > +test_run:cmd('stop server replica1') > + | --- > + | - true > + | ... > +test_run:cmd('delete server replica1') > + | --- > + | - true > + | ... > +test_run:cmd('stop server master') > + | --- > + | - true > + | ... > +test_run:cmd('delete server master') > + | --- > + | - true > + | ... > diff --git a/test/replication/gh-5430-cluster-mvcc.test.lua b/test/replication/gh-5430-cluster-mvcc.test.lua > new file mode 100644 > index 000000000..bb34ba540 > --- /dev/null > +++ b/test/replication/gh-5430-cluster-mvcc.test.lua > @@ -0,0 +1,62 @@ > +-- > +-- gh-5430: when MVCC was enabled for memtx, new replica registration attempt > +-- could fail with 'duplicate error' in _cluster space. This was happening, > +-- because _cluster is memtx. Changes to it were not visible for newer requests > +-- until commit. 
> +-- New replica ID was looked up in the space by its full scan. The scan used a > +-- plain iterator and didn't see replicas, whose registration was in progress of > +-- being written to WAL. > +-- As a result, if 2 replicas came to register at the same time, they got the > +-- same replica ID because didn't see each other in _cluster. One of them would > +-- fail to register in the end due to the conflict. > +-- > +-- The test reproduces it by doing anon replica registration. Because during > +-- normal join there are more than one access to WAL, the ID assignment is the > +-- last one. It makes hard to block the ID assignment only. With anon replica > +-- ID assignment the join is already done, the only WAL write is the ID > +-- assignment, easy to block and yet not block new replicas registration > +-- attempts. > +-- > +test_run = require('test_run').new() > + > +test_run:cmd('create server master with '.. \ > + 'script="replication/gh-5430-mvcc-master.lua"') > +test_run:cmd('start server master') > + > +test_run:cmd('create server replica1 with '.. \ > + 'script="replication/gh-5430-mvcc-replica1.lua"') > +test_run:cmd('start server replica1') > + > +test_run:cmd('create server replica2 with '.. 
\ > + 'script="replication/gh-5430-mvcc-replica2.lua"') > +test_run:cmd('start server replica2') > + > +test_run:switch('master') > +box.error.injection.set('ERRINJ_WAL_DELAY', true) > + > +test_run:switch('replica1') > +_ = require('fiber').create(function() box.cfg{replication_anon = false} end) > +str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-') > +_ = test_run:wait_log('master', str) > + > +test_run:switch('replica2') > +_ = require('fiber').create(function() box.cfg{replication_anon = false} end) > +str = string.format('registering replica %s', box.info.uuid):gsub('-', '%%-') > +_ = test_run:wait_log('master', str) > + > +test_run:switch('master') > +box.error.injection.set('ERRINJ_WAL_DELAY', false) > + > +test_run:switch('replica1') > +test_run:wait_cond(function() return box.info.id > 1 end) > + > +test_run:switch('replica2') > +test_run:wait_cond(function() return box.info.id > 1 end) > + > +test_run:switch('default') > +test_run:cmd('stop server replica2') > +test_run:cmd('delete server replica2') > +test_run:cmd('stop server replica1') > +test_run:cmd('delete server replica1') > +test_run:cmd('stop server master') > +test_run:cmd('delete server master') > diff --git a/test/replication/gh-5430-mvcc-master.lua b/test/replication/gh-5430-mvcc-master.lua > new file mode 100644 > index 000000000..6c7d0de3d > --- /dev/null > +++ b/test/replication/gh-5430-mvcc-master.lua > @@ -0,0 +1,11 @@ > +#!/usr/bin/env tarantool > + > +require('console').listen(os.getenv('ADMIN')) > + > +box.cfg({ > + listen = 'unix/:./master.sock', > + replication_timeout = 0.1, > + memtx_use_mvcc_engine = true, > +}) > + > +box.schema.user.grant('guest', 'super') > diff --git a/test/replication/gh-5430-mvcc-replica1.lua b/test/replication/gh-5430-mvcc-replica1.lua > new file mode 100644 > index 000000000..f1ad7b8a4 > --- /dev/null > +++ b/test/replication/gh-5430-mvcc-replica1.lua > @@ -0,0 +1,10 @@ > +#!/usr/bin/env tarantool > 
+require('console').listen(os.getenv('ADMIN')) > + > +box.cfg({ > + listen = os.getenv("LISTEN"), > + replication = 'unix/:./master.sock', > + replication_timeout = 0.1, > + read_only = true, > + replication_anon = true, > +}) > diff --git a/test/replication/gh-5430-mvcc-replica2.lua b/test/replication/gh-5430-mvcc-replica2.lua > new file mode 120000 > index 000000000..28c968a8a > --- /dev/null > +++ b/test/replication/gh-5430-mvcc-replica2.lua > @@ -0,0 +1 @@ > +gh-5430-mvcc-replica1.lua > \ No newline at end of file > diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg > index e0bbe2676..0acc66816 100644 > --- a/test/replication/suite.cfg > +++ b/test/replication/suite.cfg > @@ -18,6 +18,7 @@ > "gh-5213-qsync-applier-order.test.lua": {}, > "gh-5213-qsync-applier-order-3.test.lua": {}, > "gh-5426-election-on-off.test.lua": {}, > + "gh-5430-cluster-mvcc.test.lua": {}, > "gh-5433-election-restart-recovery.test.lua": {}, > "gh-5445-leader-inconsistency.test.lua": {}, > "gh-5506-election-on-off.test.lua": {}, > diff --git a/test/replication/suite.ini b/test/replication/suite.ini > index 18981996d..907fd0be9 100644 > --- a/test/replication/suite.ini > +++ b/test/replication/suite.ini > @@ -3,7 +3,7 @@ core = tarantool > script = master.lua > description = tarantool/box, replication > disabled = consistent.test.lua > -release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-qsync-rollback-snap.test.lua gh-5506-election-on-off.test.lua gh-5536-wal-limit.test.lua hang_on_synchro_fail.test.lua anon_register_gap.test.lua gh-5213-qsync-applier-order.test.lua gh-5213-qsync-applier-order-3.test.lua gh-6027-applier-error-show.test.lua 
gh-6032-promote-wal-write.test.lua gh-6057-qsync-confirm-async-no-wal.test.lua gh-5447-downstream-lag.test.lua gh-4040-invalid-msgpack.test.lua > +release_disabled = catch.test.lua errinj.test.lua gc.test.lua gc_no_space.test.lua before_replace.test.lua qsync_advanced.test.lua qsync_errinj.test.lua quorum.test.lua recover_missing_xlog.test.lua sync.test.lua long_row_timeout.test.lua gh-4739-vclock-assert.test.lua gh-4730-applier-rollback.test.lua gh-5140-qsync-casc-rollback.test.lua gh-5144-qsync-dup-confirm.test.lua gh-5167-qsync-rollback-snap.test.lua gh-5430-cluster-mvcc.test.lua gh-5506-election-on-off.test.lua gh-5536-wal-limit.test.lua hang_on_synchro_fail.test.lua anon_register_gap.test.lua gh-5213-qsync-applier-order.test.lua gh-5213-qsync-applier-order-3.test.lua gh-6027-applier-error-show.test.lua gh-6032-promote-wal-write.test.lua gh-6057-qsync-confirm-async-no-wal.test.lua gh-5447-downstream-lag.test.lua gh-4040-invalid-msgpack.test.lua > config = suite.cfg > lua_libs = lua/fast_replica.lua lua/rlimit.lua > use_unix_sockets = True > -- > 2.24.3 (Apple Git-128) >
Hi! Thanks for the review! >> On 25 Jul 2021, at 19:53, Vladislav Shpilevoy via Tarantool-patches <tarantool-patches@dev.tarantool.org> wrote: >> >> Replica registration works via looking for the smallest not >> occupied ID in _cluster and inserting it into the space. >> >> It works not so good when mvcc is enabled. In particular, if more >> than 1 replica try to register at the same time, they might get >> the same replica_id because don't see changes of each other until >> the registration in _cluster is complete. >> >> This in the end leads to all replicas failing the registration >> except one with the 'duplicate key' error (primary index in >> _cluster is replica ID). >> >> The patch makes the replicas occupy their ID before they commit it >> into _cluster. And new replica ID search now uses the replica ID >> map instead of _cluster iterator. >> >> This way the registration works like before - like MVCC does not >> exist which is fine. > > Did you discuss the MVCC capabilities - if we can address it there, by > setting a dedicated flag for this particular - and perhaps for some > other internal - space(s) to suppress MVCC ‘coverage’ for them. The > solution will be way more common with supposedly less hassle with > local structures, triggers, and so on? I did think of that in the beginning, but then I decided it is a bad idea. Firstly, from the side of unnecessary complication of MVCC which is complex enough already. Secondly, _cluster DQL is going to see the uncommitted changes despite MVCC turned on. Does not look well. In my patch I made so that you do not see the tuples in :get() and :select() in _cluster and you won't stumble into the "dirty" errors because you are not supposed to make any :insert() nor :replace() into this space manually. Automatic registration bypasses the busy IDs before trying to insert them. 
>> diff --git a/src/box/alter.cc b/src/box/alter.cc >> index 89bb5946c..64ba09021 100644 >> --- a/src/box/alter.cc >> +++ b/src/box/alter.cc >> @@ -4280,14 +4244,34 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) >> "updates of instance uuid"); >> return -1; >> } >> - } else { >> - struct trigger *on_commit; >> - on_commit = txn_alter_trigger_new(register_replica, >> - new_tuple); >> - if (on_commit == NULL) >> - return -1; >> - txn_stmt_on_commit(stmt, on_commit); >> + return 0; >> + } >> + /* >> + * With read-views enabled there might be already a replica >> + * whose registration is in progress in another transaction. >> + * With the same replica ID. >> + */ >> + if (replica_by_id(replica_id) != NULL) { >> + diag_set(ClientError, ER_UNSUPPORTED, "Tarantool", >> + "more than 1 replica with the same ID"); > > There should be more details on replica ID/UUID/name here, same as in unregister > panic below. The info would be visible in the logs anyway because before something is added to _cluster, there are always more logs about 'joining ...', 'subscribed replica ...' and so on. But I don't mind to extend the logs. ==================== diff --git a/src/box/alter.cc b/src/box/alter.cc index 64ba09021..390199298 100644 --- a/src/box/alter.cc +++ b/src/box/alter.cc @@ -4251,9 +4251,14 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) * whose registration is in progress in another transaction. * With the same replica ID. 
*/ - if (replica_by_id(replica_id) != NULL) { - diag_set(ClientError, ER_UNSUPPORTED, "Tarantool", - "more than 1 replica with the same ID"); + struct replica *replica = replica_by_id(replica_id); + if (replica != NULL) { + const char *msg = tt_sprintf( + "more than 1 replica with the same ID %u: new " + "uuid - %s, old uuid - %s", replica_id, + tt_uuid_str(&replica_uuid), + tt_uuid_str(&replica->uuid)); + diag_set(ClientError, ER_UNSUPPORTED, "Tarantool", msg); return -1; } struct trigger *on_rollback = txn_alter_trigger_new( @@ -4265,7 +4270,7 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) * replica ID now. While WAL write is in progress, new replicas * might come, they should see the ID is already in use. */ - struct replica *replica = replica_by_uuid(&replica_uuid); + replica = replica_by_uuid(&replica_uuid); if (replica != NULL) replica_set_id(replica, replica_id); else @@ -4283,6 +4288,10 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) return -1; if (replica_check_id(replica_id) != 0) return -1; + tt_uuid replica_uuid; + if (tuple_field_uuid(old_tuple, BOX_CLUSTER_FIELD_UUID, + &replica_uuid) != 0) + return -1; struct replica *replica = replica_by_id(replica_id); if (replica == NULL) { @@ -4292,7 +4301,15 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) * subsystem is affected. */ panic("Tried to unregister a replica not stored in " - "replica_by_id map, id is %u", replica_id); + "replica_by_id map, id is %u, uuid is %s", + replica_id, tt_uuid_str(&replica_uuid)); + } + if (!tt_uuid_is_equal(&replica->uuid, &replica_uuid)) { + panic("Tried to unregister a replica with id %u, but " + "its uuid is different from stored internally: " + "in space - %s, internally - %s", replica_id, + tt_uuid_str(&replica_uuid), + tt_uuid_str(&replica->uuid)); } /* * Unregister only after commit. 
Otherwise if the transaction ==================== >> diff --git a/src/box/box.cc b/src/box/box.cc >> index 8c10a99dd..5c10aceff 100644 >> --- a/src/box/box.cc >> +++ b/src/box/box.cc >> @@ -2407,22 +2407,9 @@ box_on_join(const tt_uuid *instance_uuid) >> return; /* nothing to do - already registered */ >> >> box_check_writable_xc(); >> - >> - /** Find the largest existing replica id. */ >> - struct space *space = space_cache_find_xc(BOX_CLUSTER_ID); >> - struct index *index = index_find_system_xc(space, 0); >> - struct iterator *it = index_create_iterator_xc(index, ITER_ALL, >> - NULL, 0); >> - IteratorGuard iter_guard(it); >> - struct tuple *tuple; >> - /** Assign a new replica id. */ >> - uint32_t replica_id = 1; >> - while ((tuple = iterator_next_xc(it)) != NULL) { >> - if (tuple_field_u32_xc(tuple, >> - BOX_CLUSTER_FIELD_ID) != replica_id) >> - break; >> - replica_id++; >> - } >> + uint32_t replica_id; >> + if (replica_find_new_id(&replica_id) != 0) >> + diag_raise(); > > Any info on why register fails? replica_find_new_id() sets diag with a proper error. This is why I do diag_raise() here.
[-- Attachment #1: Type: text/plain, Size: 6914 bytes --] Thanks for updates! LGTM. Best regards, Sergos Wednesday, 28 July 2021, 00:53 +0300 from Shpilevoy Vladislav <v.shpilevoy@tarantool.org>: >Hi! Thanks for the review! > >>> On 25 Jul 2021, at 19:53, Vladislav Shpilevoy via Tarantool-patches < tarantool-patches@dev.tarantool.org > wrote: >>> >>> Replica registration works via looking for the smallest not >>> occupied ID in _cluster and inserting it into the space. >>> >>> It works not so good when mvcc is enabled. In particular, if more >>> than 1 replica try to register at the same time, they might get >>> the same replica_id because don't see changes of each other until >>> the registration in _cluster is complete. >>> >>> This in the end leads to all replicas failing the registration >>> except one with the 'duplicate key' error (primary index in >>> _cluster is replica ID). >>> >>> The patch makes the replicas occupy their ID before they commit it >>> into _cluster. And new replica ID search now uses the replica ID >>> map instead of _cluster iterator. >>> >>> This way the registration works like before - like MVCC does not >>> exist which is fine. >> >> Did you discuss the MVCC capabilities - if we can address it there, by >> setting a dedicated flag for this particular - and perhaps for some >> other internal - space(s) to suppress MVCC ‘coverage’ for them. The >> solution will be way more common with supposedly less hassle with >> local structures, triggers, and so on? > >I did think of that in the beginning, but then I decided it is a bad >idea. Firstly, from the side of unnecessary complication of MVCC which >is complex enough already. Secondly, _cluster DQL is going to see the >uncommitted changes despite MVCC turned on. Does not look well. 
In my >patch I made so that you do not see the tuples in :get() and :select() >in _cluster and you won't stumble into the "dirty" errors because you >are not supposed to make any :insert() nor :replace() into this space >manually. Automatic registration bypasses the busy IDs before trying >to insert them. > >>> diff --git a/src/box/alter.cc b/src/box/alter.cc >>> index 89bb5946c..64ba09021 100644 >>> --- a/src/box/alter.cc >>> +++ b/src/box/alter.cc >>> @@ -4280,14 +4244,34 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) >>> "updates of instance uuid"); >>> return -1; >>> } >>> - } else { >>> - struct trigger *on_commit; >>> - on_commit = txn_alter_trigger_new(register_replica, >>> - new_tuple); >>> - if (on_commit == NULL) >>> - return -1; >>> - txn_stmt_on_commit(stmt, on_commit); >>> + return 0; >>> + } >>> + /* >>> + * With read-views enabled there might be already a replica >>> + * whose registration is in progress in another transaction. >>> + * With the same replica ID. >>> + */ >>> + if (replica_by_id(replica_id) != NULL) { >>> + diag_set(ClientError, ER_UNSUPPORTED, "Tarantool", >>> + "more than 1 replica with the same ID"); >> >> There should be more details on replica ID/UUID/name here, same as in unregister >> panic below. > >The info would be visible in the logs anyway because before something >is added to _cluster, there are always more logs about 'joining ...', >'subscribed replica ...' and so on. > >But I don't mind to extend the logs. > >==================== >diff --git a/src/box/alter.cc b/src/box/alter.cc >index 64ba09021..390199298 100644 >--- a/src/box/alter.cc >+++ b/src/box/alter.cc >@@ -4251,9 +4251,14 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) > * whose registration is in progress in another transaction. > * With the same replica ID. 
> */ >- if (replica_by_id(replica_id) != NULL) { >- diag_set(ClientError, ER_UNSUPPORTED, "Tarantool", >- "more than 1 replica with the same ID"); >+ struct replica *replica = replica_by_id(replica_id); >+ if (replica != NULL) { >+ const char *msg = tt_sprintf( >+ "more than 1 replica with the same ID %u: new " >+ "uuid - %s, old uuid - %s", replica_id, >+ tt_uuid_str(&replica_uuid), >+ tt_uuid_str(&replica->uuid)); >+ diag_set(ClientError, ER_UNSUPPORTED, "Tarantool", msg); > return -1; > } > struct trigger *on_rollback = txn_alter_trigger_new( >@@ -4265,7 +4270,7 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) > * replica ID now. While WAL write is in progress, new replicas > * might come, they should see the ID is already in use. > */ >- struct replica *replica = replica_by_uuid(&replica_uuid); >+ replica = replica_by_uuid(&replica_uuid); > if (replica != NULL) > replica_set_id(replica, replica_id); > else >@@ -4283,6 +4288,10 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) > return -1; > if (replica_check_id(replica_id) != 0) > return -1; >+ tt_uuid replica_uuid; >+ if (tuple_field_uuid(old_tuple, BOX_CLUSTER_FIELD_UUID, >+ &replica_uuid) != 0) >+ return -1; > > struct replica *replica = replica_by_id(replica_id); > if (replica == NULL) { >@@ -4292,7 +4301,15 @@ on_replace_dd_cluster(struct trigger *trigger, void *event) > * subsystem is affected. > */ > panic("Tried to unregister a replica not stored in " >- "replica_by_id map, id is %u", replica_id); >+ "replica_by_id map, id is %u, uuid is %s", >+ replica_id, tt_uuid_str(&replica_uuid)); >+ } >+ if (!tt_uuid_is_equal(&replica->uuid, &replica_uuid)) { >+ panic("Tried to unregister a replica with id %u, but " >+ "its uuid is different from stored internally: " >+ "in space - %s, internally - %s", replica_id, >+ tt_uuid_str(&replica_uuid), >+ tt_uuid_str(&replica->uuid)); > } > /* > * Unregister only after commit. 
Otherwise if the transaction >==================== > >>> diff --git a/src/box/box.cc b/src/box/box.cc >>> index 8c10a99dd..5c10aceff 100644 >>> --- a/src/box/box.cc >>> +++ b/src/box/box.cc >>> @@ -2407,22 +2407,9 @@ box_on_join(const tt_uuid *instance_uuid) >>> return; /* nothing to do - already registered */ >>> >>> box_check_writable_xc(); >>> - >>> - /** Find the largest existing replica id. */ >>> - struct space *space = space_cache_find_xc(BOX_CLUSTER_ID); >>> - struct index *index = index_find_system_xc(space, 0); >>> - struct iterator *it = index_create_iterator_xc(index, ITER_ALL, >>> - NULL, 0); >>> - IteratorGuard iter_guard(it); >>> - struct tuple *tuple; >>> - /** Assign a new replica id. */ >>> - uint32_t replica_id = 1; >>> - while ((tuple = iterator_next_xc(it)) != NULL) { >>> - if (tuple_field_u32_xc(tuple, >>> - BOX_CLUSTER_FIELD_ID) != replica_id) >>> - break; >>> - replica_id++; >>> - } >>> + uint32_t replica_id; >>> + if (replica_find_new_id(&replica_id) != 0) >>> + diag_raise(); >> >> Any info on why register fails? > >replica_find_new_id() sets diag with a proper error. This is >why I do diag_raise() here. [-- Attachment #2: Type: text/html, Size: 14833 bytes --]
I realized I used the wrong issue number in the commit message and the changelog. The real ticket is 5601. Fixed and pushed to master, 2.8, and 2.7.