Why didn't you use asserts:assert_server_follow_upstream(server, id) from asserts.lua here?   +local function check_follow_master(server) + return t.assert_equals( + server:eval('return box.info.replication[1].upstream.status'), 'follow') +end     Also, do we want to check this patch only for memtx? I ask because you haven’t provided an engine for the test.   -- Vitaliia Ioffe     >Среда, 27 октября 2021, 10:20 +03:00 от Serge Petrenko < sergepetrenko@tarantool.org >: >  >Replication reconfiguration used to work as follows: upon receiving a >new config disconnect from all the existing masters and try to connect >to all the masters from the new config. > >This lead to instances doing extra work when old config and new config >had the same nodes in them: instead of doing nothing, tarantool >reinitialized the connection. > >There was another problem: when an existing connection is broken, master >takes some time to notice that. So, when replica resets the connection, >it may try to reconnect faster than the master is able to notice its >absence. In this case replica wouldn't be able to reconnect due to >`duplicate connection with the same replica UUID`. So replication would >stop for a replication_timeout, which may be quite large (seconds or >tens of seconds). > >Let's prevent tarantool from reconnecting to the same instance, if there >already is a working connection. 
> >Closes #4669 >--- > .../gh-4669-applier-reconfig-reconnect.md | 5 ++ > src/box/box.cc | 31 ++++++++--- > src/box/replication.cc | 41 +++++++++----- > src/box/replication.h | 4 +- > .../gh_4669_applier_reconnect_test.lua | 53 +++++++++++++++++++ > 5 files changed, 113 insertions(+), 21 deletions(-) > create mode 100644 changelogs/unreleased/gh-4669-applier-reconfig-reconnect.md > create mode 100644 test/replication-luatest/gh_4669_applier_reconnect_test.lua > >diff --git a/changelogs/unreleased/gh-4669-applier-reconfig-reconnect.md b/changelogs/unreleased/gh-4669-applier-reconfig-reconnect.md >new file mode 100644 >index 000000000..2a776872b >--- /dev/null >+++ b/changelogs/unreleased/gh-4669-applier-reconfig-reconnect.md >@@ -0,0 +1,5 @@ >+## bugfix/replication >+ >+* Fixed replica reconnecting to a living master on any `box.cfg{replication=...}` >+ change. Such reconnects could lead to replica failing to restore connection >+ for `replication_timeout` seconds (gh-4669). >diff --git a/src/box/box.cc b/src/box/box.cc >index b1154affa..c2e4a55a7 100644 >--- a/src/box/box.cc >+++ b/src/box/box.cc >@@ -1261,7 +1261,7 @@ cfg_get_replication(int *p_count) >  * don't start appliers. 
>  */ > static void >-box_sync_replication(bool connect_quorum) >+box_sync_replication(bool do_quorum, bool do_reuse) > { >  int count = 0; >  struct applier **appliers = cfg_get_replication(&count); >@@ -1272,12 +1272,27 @@ box_sync_replication(bool connect_quorum) >  for (int i = 0; i < count; i++) >  applier_delete(appliers[i]); /* doesn't affect diag */ >  }); >- >- replicaset_connect(appliers, count, connect_quorum); >+ replicaset_connect(appliers, count, do_quorum, do_reuse); >  >  guard.is_active = false; > } >  >+static inline void >+box_restart_replication(void) >+{ >+ const bool do_quorum = true; >+ const bool do_reuse = false; >+ box_sync_replication(do_quorum, do_reuse); >+} >+ >+static inline void >+box_update_replication(void) >+{ >+ const bool do_quorum = false; >+ const bool do_reuse = true; >+ box_sync_replication(do_quorum, do_reuse); >+} >+ > void > box_set_replication(void) > { >@@ -1296,7 +1311,7 @@ box_set_replication(void) >  * Stay in orphan mode in case we fail to connect to at least >  * 'replication_connect_quorum' remote instances. >  */ >- box_sync_replication(false); >+ box_update_replication(); >  /* Follow replica */ >  replicaset_follow(); >  /* Wait until appliers are in sync */ >@@ -1416,7 +1431,7 @@ box_set_replication_anon(void) >  * them can register and others resend a >  * non-anonymous subscribe. >  */ >- box_sync_replication(true); >+ box_restart_replication(); >  /* >  * Wait until the master has registered this >  * instance. >@@ -3279,7 +3294,7 @@ bootstrap(const struct tt_uuid *instance_uuid, >  * with connecting to 'replication_connect_quorum' masters. >  * If this also fails, throw an error. 
>  */ >- box_sync_replication(true); >+ box_restart_replication(); >  >  struct replica *master = replicaset_find_join_master(); >  assert(master == NULL || master->applier != NULL); >@@ -3356,7 +3371,7 @@ local_recovery(const struct tt_uuid *instance_uuid, >  if (wal_dir_lock >= 0) { >  if (box_listen() != 0) >  diag_raise(); >- box_sync_replication(false); >+ box_update_replication(); >  >  struct replica *master; >  if (replicaset_needs_rejoin(&master)) { >@@ -3435,7 +3450,7 @@ local_recovery(const struct tt_uuid *instance_uuid, >  vclock_copy(&replicaset.vclock, &recovery->vclock); >  if (box_listen() != 0) >  diag_raise(); >- box_sync_replication(false); >+ box_update_replication(); >  } >  stream_guard.is_active = false; >  recovery_finalize(recovery); >diff --git a/src/box/replication.cc b/src/box/replication.cc >index 1288bc9b1..10b4ac915 100644 >--- a/src/box/replication.cc >+++ b/src/box/replication.cc >@@ -507,7 +507,7 @@ replica_on_applier_state_f(struct trigger *trigger, void *event) >  * upon reconfiguration of box.cfg.replication. 
>  */ > static void >-replicaset_update(struct applier **appliers, int count) >+replicaset_update(struct applier **appliers, int count, bool keep_connect) > { >  replica_hash_t uniq; >  memset(&uniq, 0, sizeof(uniq)); >@@ -572,22 +572,39 @@ replicaset_update(struct applier **appliers, int count) >  applier_stop(applier); >  applier_delete(applier); >  } >+ >+ replicaset.applier.total = count; >+ replicaset.applier.connected = 0; >+ replicaset.applier.loading = 0; >+ replicaset.applier.synced = 0; >  replicaset_foreach(replica) { >  if (replica->applier == NULL) >  continue; >- applier = replica->applier; >- replica_clear_applier(replica); >- replica->applier_sync_state = APPLIER_DISCONNECTED; >+ struct replica *other = replica_hash_search(&uniq, replica); >+ if (keep_connect && other != NULL && >+ (replica->applier->state == APPLIER_FOLLOW || >+ replica->applier->state == APPLIER_SYNC)) { >+ /* >+ * Try not to interrupt working appliers upon >+ * reconfiguration. >+ */ >+ replicaset.applier.connected++; >+ replicaset.applier.synced++; >+ replica_hash_remove(&uniq, other); >+ applier = other->applier; >+ replica_clear_applier(other); >+ replica_delete(other); >+ } else { >+ applier = replica->applier; >+ replica_clear_applier(replica); >+ replica->applier_sync_state = APPLIER_DISCONNECTED; >+ } >  applier_stop(applier); >  applier_delete(applier); >  } >  >- /* Save new appliers */ >- replicaset.applier.total = count; >- replicaset.applier.connected = 0; >- replicaset.applier.loading = 0; >- replicaset.applier.synced = 0; >  >+ /* Save new appliers */ >  replica_hash_foreach_safe(&uniq, replica, next) { >  replica_hash_remove(&uniq, replica); >  >@@ -664,11 +681,11 @@ applier_on_connect_f(struct trigger *trigger, void *event) >  > void > replicaset_connect(struct applier **appliers, int count, >- bool connect_quorum) >+ bool connect_quorum, bool keep_connect) > { >  if (count == 0) { >  /* Cleanup the replica set. 
*/ >- replicaset_update(appliers, count); >+ replicaset_update(appliers, 0, false); >  return; >  } >  >@@ -772,7 +789,7 @@ replicaset_connect(struct applier **appliers, int count, >  >  /* Now all the appliers are connected, update the replica set. */ >  try { >- replicaset_update(appliers, count); >+ replicaset_update(appliers, count, keep_connect); >  } catch (Exception *e) { >  goto error; >  } >diff --git a/src/box/replication.h b/src/box/replication.h >index 4c82cd839..a8fed45e8 100644 >--- a/src/box/replication.h >+++ b/src/box/replication.h >@@ -439,10 +439,12 @@ replicaset_add_anon(const struct tt_uuid *replica_uuid); >  * \param connect_quorum if this flag is set, fail unless at >  * least replication_connect_quorum >  * appliers have successfully connected. >+ * \param keep_connect if this flag is set do not force a reconnect if the >+ * old connection to the replica is fine. >  */ > void > replicaset_connect(struct applier **appliers, int count, >- bool connect_quorum); >+ bool connect_quorum, bool keep_connect); >  > /** >  * Check if the current instance fell too much behind its >diff --git a/test/replication-luatest/gh_4669_applier_reconnect_test.lua b/test/replication-luatest/gh_4669_applier_reconnect_test.lua >new file mode 100644 >index 000000000..9c6d26036 >--- /dev/null >+++ b/test/replication-luatest/gh_4669_applier_reconnect_test.lua >@@ -0,0 +1,53 @@ >+local t = require('luatest') >+local cluster = require('test.luatest_helpers.cluster') >+local helpers = require('test.luatest_helpers') >+ >+local g = t.group('gh-4669-applier-reconnect') >+ >+local function check_follow_master(server) >+ return t.assert_equals( >+ server:eval('return box.info.replication[1].upstream.status'), 'follow') >+end >+ >+g.before_each(function() >+ g.cluster = cluster:new({}) >+ g.master = g.cluster:build_server({alias = 'master'}) >+ local box_cfg = { >+ replication = ('%s/master.iproto'):format(helpers.SOCKET_DIR), >+ } >+ g.replica = g.cluster:build_server({alias 
= 'replica', box_cfg = box_cfg}) >+ g.replica2 = g.cluster:build_server({alias = 'replica2', box_cfg = box_cfg}) >+ >+ g.cluster:add_server(g.master) >+ g.cluster:add_server(g.replica) >+ g.cluster:add_server(g.replica2) >+ g.cluster:start() >+ check_follow_master(g.replica) >+end) >+ >+g.after_each(function() >+ g.cluster:stop() >+end) >+ >+-- Test that appliers aren't recreated upon replication reconfiguration. >+-- Add and then remove two extra replicas to the configuration. The master >+-- connection should stay intact. >+g.test_applier_connection_on_reconfig = function(g) >+ g.replica:eval(([[ >+ box.cfg{ >+ replication = { >+ box.cfg.listen, >+ box.cfg.replication[1], >+ "%s/replica2.iproto", >+ } >+ }]]):format(helpers.SOCKET_DIR)) >+ g.replica:eval([[ >+ box.cfg{ >+ replication = { >+ box.cfg.replication[2] >+ } >+ } >+ ]]) >+ check_follow_master(g.replica) >+ t.assert_equals(g.master:grep_log("exiting the relay loop"), nil) >+end >-- >2.30.1 (Apple Git-130)