[tarantool-patches] [PATCH v5] replication: fix assertion with duplicate connection
Vladimir Davydov
vdavydov.dev at gmail.com
Tue Oct 2 15:19:38 MSK 2018
On Tue, Oct 02, 2018 at 01:12:49PM +0300, Olga Arkhangelskaia wrote:
> Patch fixes behavior when replica tries to connect to the same master
> more than once. In case when it is initial configuration we raise the
> exception. If it in not initial config we print the error and disconnect
> the applier.
>
> Closes #3610
> ---
> https://github.com/tarantool/tarantool/issues/3610
> https://github.com/tarantool/tarantool/tree/OKriw/gh-3610-assert_fail_when_connect_master_twice-1.10
replication/misc.test.lua failed on Travis CI:
https://travis-ci.org/tarantool/tarantool/jobs/436060141
Please fix.
> diff --git a/src/box/box.h b/src/box/box.h
> index 9930d4a1a..21c60e8f0 100644
> --- a/src/box/box.h
> +++ b/src/box/box.h
> @@ -38,6 +38,9 @@
> extern "C" {
> #endif /* defined(__cplusplus) */
>
> +/* Size of buffer for error message */
> +#define MSG_SIZE 50
> +
> /*
> * Box - data storage (spaces, indexes) and query
> * processor (INSERT, UPDATE, DELETE, SELECT, Lua)
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index 5755ad45e..f97f54d53 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -271,6 +271,8 @@ replica_on_applier_connect(struct replica *replica)
> assert(replica->applier_sync_state == APPLIER_DISCONNECTED);
>
> replica->uuid = applier->uuid;
> + replica->applier_sync_state = APPLIER_CONNECTED;
> + replicaset.applier.connected++;
>
> struct replica *orig = replica_hash_search(&replicaset.hash, replica);
> if (orig != NULL && orig->applier != NULL) {
> @@ -290,6 +292,8 @@ replica_on_applier_connect(struct replica *replica)
>
> if (orig != NULL) {
> /* Use existing struct replica */
> + assert(orig->applier_sync_state == APPLIER_DISCONNECTED);
> + orig->applier_sync_state = replica->applier_sync_state;
> replica_set_applier(orig, applier);
> replica_clear_applier(replica);
> replica_delete(replica);
> @@ -299,8 +303,6 @@ replica_on_applier_connect(struct replica *replica)
> replica_hash_insert(&replicaset.hash, replica);
> }
>
> - replica->applier_sync_state = APPLIER_CONNECTED;
> - replicaset.applier.connected++;
Nit: as I pointed out in my previous review, an extra empty line is left
at the end of this function after this change. Please remove.
> }
>
> static void
> @@ -427,6 +429,7 @@ replicaset_update(struct applier **appliers, int count)
> auto uniq_guard = make_scoped_guard([&]{
> replica_hash_foreach_safe(&uniq, replica, next) {
> replica_hash_remove(&uniq, replica);
> + replica_clear_applier(replica);
> replica_delete(replica);
> }
> });
> @@ -454,6 +457,8 @@ replicaset_update(struct applier **appliers, int count)
> replica->uuid = applier->uuid;
>
> if (replica_hash_search(&uniq, replica) != NULL) {
> + replica_clear_applier(replica);
> + replica_delete(replica);
> tnt_raise(ClientError, ER_CFG, "replication",
> "duplicate connection to the same replica");
> }
> @@ -596,6 +601,9 @@ replicaset_connect(struct applier **appliers, int count,
>
> double timeout = replication_connect_timeout;
> int quorum = MIN(count, replication_connect_quorum);
> + /* Default error message in case if something will fail */
> + char err_msg [MSG_SIZE] = "failed to connect to one or more replicas";
> + char *msg = err_msg;
>
> /* Add triggers and start simulations connection to remote peers */
> for (int i = 0; i < count; i++) {
> @@ -641,8 +649,17 @@ replicaset_connect(struct applier **appliers, int count,
> }
>
> /* Now all the appliers are connected, update the replica set. */
> - replicaset_update(appliers, count);
> - return;
> + try {
> + replicaset_update(appliers, count);
> + return;
> + } catch (Exception *e) {
> + /* If exception occurred in replicaset_update we need to rewrite
> + * error message.
> + */
> + char dupl_msg[MSG_SIZE] = "duplicate connection to the same replica";
> + msg = dupl_msg;
> + goto error;
> + }
What's this juggling with messages for? You can pass an error in diag -
in fact, that's what diag is for. BTW catching an Exception sets diag
automatically. replicaset_update() already throws a correct error so all
you need to do is catch it, destroy appliers, and then re-raise it.
> error:
> /* Destroy appliers */
> for (int i = 0; i < count; i++) {
> @@ -651,8 +668,7 @@ error:
> }
>
> /* ignore original error */
> - tnt_raise(ClientError, ER_CFG, "replication",
> - "failed to connect to one or more replicas");
> + tnt_raise(ClientError, ER_CFG, "replication", msg);
> }
>
> bool
> diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
> index 56e1bab69..5eb440837 100644
> --- a/test/replication/misc.test.lua
> +++ b/test/replication/misc.test.lua
> @@ -161,5 +161,44 @@ _ = test_run:wait_vclock('replica_auth', vclock)
> test_run:cmd("stop server replica_auth")
> test_run:cmd("cleanup server replica_auth")
> test_run:cmd("delete server replica_auth")
> -
> box.schema.user.drop('cluster')
> +
> +--
> +-- Test case for gh-3610. Before the fix replica would fail with the assertion
> +-- when trying to connect to the same master twice.
> +--
> +box.schema.user.grant('guest', 'replication')
> +test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
> +test_run:cmd("start server replica")
> +test_run:cmd("switch replica")
> +replication = box.cfg.replication
> +box.cfg{replication = {replication, replication}}
> +
> +test_run:cmd("switch default")
> +test_run:cmd('cleanup server replica')
Why cleanup the replica? You can check the next test case on the same
replica, can you not?
> +
> +-- case when replica reconnects master with duplication in new configuration
Nit: the comment is misleading IMO. Should be:
Check the case when duplicate connection is detected in the background.
> +
> +listen = box.cfg.listen
> +test_run:cmd("switch replica")
> +box.cfg{replication_connect_quorum=0, replication_connect_timeout = 0.1}
> +
> +test_run:cmd("switch default")
> +box.cfg{listen = ''}
> +
> +test_run:cmd("switch replica")
> +replication = box.cfg.replication
> +box.cfg{replication = {replication, replication}}
> +
> +test_run:cmd("switch default")
> +box.cfg{listen = listen}
> +fiber.sleep(1)
Don't use long sleeps. Wait in a loop if you need to.
> +test_run:cmd("switch replica")
> +
> +test_run:cmd("switch default")
> +test_run:grep_log('replica', 'duplicate connection')
> +
> +box.schema.user.revoke('guest', 'replication')
> +test_run:cmd("stop server replica")
> +test_run:cmd('cleanup server replica')
> +test_run:cmd("delete server replica")
> diff --git a/test/replication/replica_dupl.lua b/test/replication/replica_dupl.lua
> new file mode 100644
> index 000000000..29e94551d
This file isn't needed anymore. Please self-review your patches before
submitting them.
More information about the Tarantool-patches
mailing list