From: Olga Arkhangelskaia <arkholga@tarantool.org> To: tarantool-patches@freelists.org Cc: Olga Arkhangelskaia <arkholga@tarantool.org> Subject: [tarantool-patches] [PATCH v6] replication: fix assertion with duplicate connection Date: Tue, 2 Oct 2018 21:50:39 +0300 [thread overview] Message-ID: <20181002185039.5761-1-arkholga@tarantool.org> (raw) Patch fixes behavior when replica tries to connect to the same master more than once. If it is the initial configuration, we raise an exception. If it is not the initial config, we print the error and disconnect the applier. Closes #3610 --- https://github.com/tarantool/tarantool/issues/3610 https://github.com/tarantool/tarantool/tree/OKriw/gh-3610-assert_fail_when_connect_master_twice-1.10 v1: https://www.freelists.org/post/tarantool-patches/PATCH-box-fix-assertion-with-duplication-in-repl-source v2: https://www.freelists.org/post/tarantool-patches/PATCH-v2-replication-fix-assertion-with-duplicated-connect-to-same-master v3: https://www.freelists.org/post/tarantool-patches/PATCH-v3-replication-fix-assertion-with-duplicate-connection v4: https://www.freelists.org/post/tarantool-patches/v4-PATCH-replication-fix-assertion-with-duplicate-connection v5: https://www.freelists.org/post/tarantool-patches/PATCH-v5-replication-fix-assertion-with-duplicate-connection Changes in v2: - changed completely, now we let duplicated params proceed - stop the applier at the moment when replica has the same hash as another one - changed test Changes in v3: - changed the way and order of how we clean up applier replica on duplication - test is now in misc.test.lua - need to think over second test case Changes in v4: - added test case - no changes in applier states or etc. - appliers left in stopped state in case of dup. 
conf Changes in v5: - in case of error appliers returned to the previous state - test: added search for "duplicate" in replica.log Changes in v6: - changed the way we raise the error: diag_raise instead of tnt_raise - deleted long sleep in test src/box/replication.cc | 26 ++++++++----- test/replication/misc.result | 87 ++++++++++++++++++++++++++++++++++++++++++ test/replication/misc.test.lua | 37 +++++++++++++++++- 3 files changed, 140 insertions(+), 10 deletions(-) diff --git a/src/box/replication.cc b/src/box/replication.cc index 5755ad45e..809887984 100644 --- a/src/box/replication.cc +++ b/src/box/replication.cc @@ -271,6 +271,8 @@ replica_on_applier_connect(struct replica *replica) assert(replica->applier_sync_state == APPLIER_DISCONNECTED); replica->uuid = applier->uuid; + replica->applier_sync_state = APPLIER_CONNECTED; + replicaset.applier.connected++; struct replica *orig = replica_hash_search(&replicaset.hash, replica); if (orig != NULL && orig->applier != NULL) { @@ -290,6 +292,8 @@ replica_on_applier_connect(struct replica *replica) if (orig != NULL) { /* Use existing struct replica */ + assert(orig->applier_sync_state == APPLIER_DISCONNECTED); + orig->applier_sync_state = replica->applier_sync_state; replica_set_applier(orig, applier); replica_clear_applier(replica); replica_delete(replica); @@ -298,9 +302,6 @@ replica_on_applier_connect(struct replica *replica) /* Add a new struct replica */ replica_hash_insert(&replicaset.hash, replica); } - - replica->applier_sync_state = APPLIER_CONNECTED; - replicaset.applier.connected++; } static void @@ -427,6 +428,7 @@ replicaset_update(struct applier **appliers, int count) auto uniq_guard = make_scoped_guard([&]{ replica_hash_foreach_safe(&uniq, replica, next) { replica_hash_remove(&uniq, replica); + replica_clear_applier(replica); replica_delete(replica); } }); @@ -454,6 +456,8 @@ replicaset_update(struct applier **appliers, int count) replica->uuid = applier->uuid; if (replica_hash_search(&uniq, replica) != NULL) 
{ + replica_clear_applier(replica); + replica_delete(replica); tnt_raise(ClientError, ER_CFG, "replication", "duplicate connection to the same replica"); } @@ -621,8 +625,11 @@ replicaset_connect(struct applier **appliers, int count, say_crit("failed to connect to %d out of %d replicas", count - state.connected, count); /* Timeout or connection failure. */ - if (connect_quorum && state.connected < quorum) + if (connect_quorum && state.connected < quorum) { + diag_set(ClientError, ER_CFG, "replication", + "failed to connect to one or more replicas"); goto error; + } } else { say_info("connected to %d replicas", state.connected); } @@ -641,8 +648,11 @@ replicaset_connect(struct applier **appliers, int count, } /* Now all the appliers are connected, update the replica set. */ - replicaset_update(appliers, count); - return; + try { + replicaset_update(appliers, count); + return; + } catch (Exception *e) {} + error: /* Destroy appliers */ for (int i = 0; i < count; i++) { @@ -650,9 +660,7 @@ error: applier_stop(appliers[i]); } - /* ignore original error */ - tnt_raise(ClientError, ER_CFG, "replication", - "failed to connect to one or more replicas"); + diag_raise(); } bool diff --git a/test/replication/misc.result b/test/replication/misc.result index 0ac48ba34..d9582d760 100644 --- a/test/replication/misc.result +++ b/test/replication/misc.result @@ -394,3 +394,90 @@ test_run:cmd("delete server replica_auth") box.schema.user.drop('cluster') --- ... +-- +-- Test case for gh-3610. Before the fix replica would fail with the assertion +-- when trying to connect to the same master twice. +-- +box.schema.user.grant('guest', 'replication') +--- +... +test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +--- +- true +... +test_run:cmd("start server replica") +--- +- true +... +test_run:cmd("switch replica") +--- +- true +... +replication = box.cfg.replication +--- +... 
+box.cfg{replication = {replication, replication}} +--- +- error: 'Incorrect value for option ''replication'': duplicate connection to the + same replica' +... +test_run:cmd("switch default") +--- +- true +... +-- Check the case when duplicate connection is detected in the background. +listen = box.cfg.listen +--- +... +test_run:cmd("switch replica") +--- +- true +... +box.cfg{replication_connect_quorum=0, replication_connect_timeout = 0.1} +--- +... +test_run:cmd("switch default") +--- +- true +... +box.cfg{listen = ''} +--- +... +test_run:cmd("switch replica") +--- +- true +... +replication = box.cfg.replication +--- +... +box.cfg{replication = {replication, replication}} +--- +... +test_run:cmd("switch default") +--- +- true +... +box.cfg{listen = listen} +--- +... +find = test_run:grep_log('replica', 'duplicate connection') +--- +... +while (find == test_run:grep_log('replica', 'duplicate connection')) do fiber.sleep(0.1) end +--- +... +box.schema.user.revoke('guest', 'replication') +--- +... +test_run:cmd("stop server replica") +--- +- true +... +test_run:cmd('cleanup server replica') +--- +- true +... +test_run:cmd("delete server replica") +--- +- true +... diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua index 56e1bab69..4bfec4056 100644 --- a/test/replication/misc.test.lua +++ b/test/replication/misc.test.lua @@ -161,5 +161,40 @@ _ = test_run:wait_vclock('replica_auth', vclock) test_run:cmd("stop server replica_auth") test_run:cmd("cleanup server replica_auth") test_run:cmd("delete server replica_auth") - box.schema.user.drop('cluster') + +-- +-- Test case for gh-3610. Before the fix replica would fail with the assertion +-- when trying to connect to the same master twice. 
+-- +box.schema.user.grant('guest', 'replication') +test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'") +test_run:cmd("start server replica") +test_run:cmd("switch replica") +replication = box.cfg.replication +box.cfg{replication = {replication, replication}} + +test_run:cmd("switch default") + +-- Check the case when duplicate connection is detected in the background. + +listen = box.cfg.listen +test_run:cmd("switch replica") +box.cfg{replication_connect_quorum=0, replication_connect_timeout = 0.1} + +test_run:cmd("switch default") +box.cfg{listen = ''} + +test_run:cmd("switch replica") +replication = box.cfg.replication +box.cfg{replication = {replication, replication}} + +test_run:cmd("switch default") +box.cfg{listen = listen} +find = test_run:grep_log('replica', 'duplicate connection') +while (find == test_run:grep_log('replica', 'duplicate connection')) do fiber.sleep(0.1) end + +box.schema.user.revoke('guest', 'replication') +test_run:cmd("stop server replica") +test_run:cmd('cleanup server replica') +test_run:cmd("delete server replica") -- 2.14.3 (Apple Git-98)
next reply other threads:[~2018-10-02 18:51 UTC|newest] Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top 2018-10-02 18:50 Olga Arkhangelskaia [this message] 2018-10-03 9:30 ` Vladimir Davydov
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20181002185039.5761-1-arkholga@tarantool.org \ --to=arkholga@tarantool.org \ --cc=tarantool-patches@freelists.org \ --subject='Re: [tarantool-patches] [PATCH v6] replication: fix assertion with duplicate connection' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox