Tarantool development patches archive
 help / color / mirror / Atom feed
* [tarantool-patches] [PATCH v5] replication: fix assertion with duplicate connection
@ 2018-10-02 10:12 Olga Arkhangelskaia
  2018-10-02 12:19 ` Vladimir Davydov
  0 siblings, 1 reply; 2+ messages in thread
From: Olga Arkhangelskaia @ 2018-10-02 10:12 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Olga Arkhangelskaia

Patch fixes behavior when replica tries to connect to the same master
more than once. In case when it is initial configuration we raise the
exception. If it in not initial config we print the error and disconnect
the applier.

Closes #3610
---
https://github.com/tarantool/tarantool/issues/3610                              
https://github.com/tarantool/tarantool/tree/OKriw/gh-3610-assert_fail_when_connect_master_twice-1.10
                                                                                
v1:                                                                             
https://www.freelists.org/post/tarantool-patches/PATCH-box-fix-assertion-with-duplication-in-repl-source
                                                                                
v2:                                                                             
https://www.freelists.org/post/tarantool-patches/PATCH-v2-replication-fix-assertion-with-duplicated-connect-to-same-master

v3:
https://www.freelists.org/post/tarantool-patches/PATCH-v3-replication-fix-assertion-with-duplicate-connection

v4:
https://www.freelists.org/post/tarantool-patches/v4-PATCH-replication-fix-assertion-with-duplicate-connection       
                                                                         
Changes in v2:                                                                  
- changed completely, now we let duplicated params to proceed                   
- stop the applier at the moment when replica has the same hash that other one  
- changed test

Changes in v3:                                                                  
- changed the wsy and order of how we clean up applier replica on duplication   
- test is now in misc.test.lua                                                  
- need to think over second test case

Changes in v4:
- added test case
- no changes in applier states or etc.
- appliers lwft in stopped states in case of dup. conf

Changes in v5:
- in case of error appliers returned to the previous state
- test: added search for "duplicate" in replica.log


 src/box/box.h                     |   3 ++
 src/box/replication.cc            |  28 ++++++++---
 test/replication/misc.result      | 100 ++++++++++++++++++++++++++++++++++++++
 test/replication/misc.test.lua    |  41 +++++++++++++++-
 test/replication/replica_dupl.lua |  10 ++++
 5 files changed, 175 insertions(+), 7 deletions(-)
 create mode 100644 test/replication/replica_dupl.lua

diff --git a/src/box/box.h b/src/box/box.h
index 9930d4a1a..21c60e8f0 100644
--- a/src/box/box.h
+++ b/src/box/box.h
@@ -38,6 +38,9 @@
 extern "C" {
 #endif /* defined(__cplusplus) */
 
+/* Size of buffer for error message */
+#define MSG_SIZE 50
+
 /*
  * Box - data storage (spaces, indexes) and query
  * processor (INSERT, UPDATE, DELETE, SELECT, Lua)
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 5755ad45e..f97f54d53 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -271,6 +271,8 @@ replica_on_applier_connect(struct replica *replica)
 	assert(replica->applier_sync_state == APPLIER_DISCONNECTED);
 
 	replica->uuid = applier->uuid;
+	replica->applier_sync_state = APPLIER_CONNECTED;
+	replicaset.applier.connected++;
 
 	struct replica *orig = replica_hash_search(&replicaset.hash, replica);
 	if (orig != NULL && orig->applier != NULL) {
@@ -290,6 +292,8 @@ replica_on_applier_connect(struct replica *replica)
 
 	if (orig != NULL) {
 		/* Use existing struct replica */
+		assert(orig->applier_sync_state == APPLIER_DISCONNECTED);
+		orig->applier_sync_state = replica->applier_sync_state;
 		replica_set_applier(orig, applier);
 		replica_clear_applier(replica);
 		replica_delete(replica);
@@ -299,8 +303,6 @@ replica_on_applier_connect(struct replica *replica)
 		replica_hash_insert(&replicaset.hash, replica);
 	}
 
-	replica->applier_sync_state = APPLIER_CONNECTED;
-	replicaset.applier.connected++;
 }
 
 static void
@@ -427,6 +429,7 @@ replicaset_update(struct applier **appliers, int count)
 	auto uniq_guard = make_scoped_guard([&]{
 		replica_hash_foreach_safe(&uniq, replica, next) {
 			replica_hash_remove(&uniq, replica);
+			replica_clear_applier(replica);
 			replica_delete(replica);
 		}
 	});
@@ -454,6 +457,8 @@ replicaset_update(struct applier **appliers, int count)
 		replica->uuid = applier->uuid;
 
 		if (replica_hash_search(&uniq, replica) != NULL) {
+			replica_clear_applier(replica);
+			replica_delete(replica);
 			tnt_raise(ClientError, ER_CFG, "replication",
 				  "duplicate connection to the same replica");
 		}
@@ -596,6 +601,9 @@ replicaset_connect(struct applier **appliers, int count,
 
 	double timeout = replication_connect_timeout;
 	int quorum = MIN(count, replication_connect_quorum);
+	/* Default error message in case if something will fail */
+	char err_msg [MSG_SIZE] = "failed to connect to one or more replicas";
+	char *msg = err_msg;
 
 	/* Add triggers and start simulations connection to remote peers */
 	for (int i = 0; i < count; i++) {
@@ -641,8 +649,17 @@ replicaset_connect(struct applier **appliers, int count,
 	}
 
 	/* Now all the appliers are connected, update the replica set. */
-	replicaset_update(appliers, count);
-	return;
+	try {
+		replicaset_update(appliers, count);
+		return;
+	} catch (Exception *e) {
+		/* If exception occurred in replicaset_update we need to rewrite
+		 * error message.
+		 */
+		char dupl_msg[MSG_SIZE] = "duplicate connection to the same replica";
+		msg = dupl_msg;
+		goto error;
+	}
 error:
 	/* Destroy appliers */
 	for (int i = 0; i < count; i++) {
@@ -651,8 +668,7 @@ error:
 	}
 
 	/* ignore original error */
-	tnt_raise(ClientError, ER_CFG, "replication",
-		  "failed to connect to one or more replicas");
+	tnt_raise(ClientError, ER_CFG, "replication", msg);
 }
 
 bool
diff --git a/test/replication/misc.result b/test/replication/misc.result
index 0ac48ba34..5087586cf 100644
--- a/test/replication/misc.result
+++ b/test/replication/misc.result
@@ -394,3 +394,103 @@ test_run:cmd("delete server replica_auth")
 box.schema.user.drop('cluster')
 ---
 ...
+--
+-- Test case for gh-3610. Before the fix replica would fail with the assertion
+-- when trying to connect to the same master twice.
+--
+box.schema.user.grant('guest', 'replication')
+---
+...
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+---
+- true
+...
+test_run:cmd("start server replica")
+---
+- true
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+replication = box.cfg.replication
+---
+...
+box.cfg{replication = {replication, replication}}
+---
+- error: 'Incorrect value for option ''replication'': duplicate connection to the
+    same replica'
+...
+test_run:cmd("switch default")
+---
+- true
+...
+test_run:cmd('cleanup server replica')
+---
+- true
+...
+-- case when replica reconnects master with duplication in new configuration
+listen = box.cfg.listen
+---
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+box.cfg{replication_connect_quorum=0, replication_connect_timeout = 0.1}
+---
+...
+test_run:cmd("switch default")
+---
+- true
+...
+box.cfg{listen = ''}
+---
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+replication = box.cfg.replication
+---
+...
+box.cfg{replication = {replication, replication}}
+---
+...
+test_run:cmd("switch default")
+---
+- true
+...
+box.cfg{listen = listen}
+---
+...
+fiber.sleep(1)
+---
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+test_run:cmd("switch default")
+---
+- true
+...
+test_run:grep_log('replica', 'duplicate connection')
+---
+- duplicate connection
+...
+box.schema.user.revoke('guest', 'replication')
+---
+...
+test_run:cmd("stop server replica")
+---
+- true
+...
+test_run:cmd('cleanup server replica')
+---
+- true
+...
+test_run:cmd("delete server replica")
+---
+- true
+...
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
index 56e1bab69..5eb440837 100644
--- a/test/replication/misc.test.lua
+++ b/test/replication/misc.test.lua
@@ -161,5 +161,44 @@ _ = test_run:wait_vclock('replica_auth', vclock)
 test_run:cmd("stop server replica_auth")
 test_run:cmd("cleanup server replica_auth")
 test_run:cmd("delete server replica_auth")
-
 box.schema.user.drop('cluster')
+
+--
+-- Test case for gh-3610. Before the fix replica would fail with the assertion
+-- when trying to connect to the same master twice.
+--
+box.schema.user.grant('guest', 'replication')
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+test_run:cmd("start server replica")
+test_run:cmd("switch replica")
+replication = box.cfg.replication
+box.cfg{replication = {replication, replication}}
+
+test_run:cmd("switch default")
+test_run:cmd('cleanup server replica')
+
+-- case when replica reconnects master with duplication in new configuration
+
+listen = box.cfg.listen
+test_run:cmd("switch replica")
+box.cfg{replication_connect_quorum=0, replication_connect_timeout = 0.1}
+
+test_run:cmd("switch default")
+box.cfg{listen = ''}
+
+test_run:cmd("switch replica")
+replication = box.cfg.replication
+box.cfg{replication = {replication, replication}}
+
+test_run:cmd("switch default")
+box.cfg{listen = listen}
+fiber.sleep(1)
+test_run:cmd("switch replica")
+
+test_run:cmd("switch default")
+test_run:grep_log('replica', 'duplicate connection')
+
+box.schema.user.revoke('guest', 'replication')
+test_run:cmd("stop server replica")
+test_run:cmd('cleanup server replica')
+test_run:cmd("delete server replica")
diff --git a/test/replication/replica_dupl.lua b/test/replication/replica_dupl.lua
new file mode 100644
index 000000000..29e94551d
--- /dev/null
+++ b/test/replication/replica_dupl.lua
@@ -0,0 +1,10 @@
+#!/usr/bin/env tarantool
+
+box.cfg({
+    replication         = os.getenv("MASTER"),
+    memtx_memory        = 107374182,
+    replication_connect_quorum=0,
+    replication_connect_timeout = 0.1,
+})
+
+require('console').listen(os.getenv('ADMIN'))
-- 
2.14.3 (Apple Git-98)

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [tarantool-patches] [PATCH v5] replication: fix assertion with duplicate connection
  2018-10-02 10:12 [tarantool-patches] [PATCH v5] replication: fix assertion with duplicate connection Olga Arkhangelskaia
@ 2018-10-02 12:19 ` Vladimir Davydov
  0 siblings, 0 replies; 2+ messages in thread
From: Vladimir Davydov @ 2018-10-02 12:19 UTC (permalink / raw)
  To: Olga Arkhangelskaia; +Cc: tarantool-patches

On Tue, Oct 02, 2018 at 01:12:49PM +0300, Olga Arkhangelskaia wrote:
> Patch fixes behavior when replica tries to connect to the same master
> more than once. In case when it is initial configuration we raise the
> exception. If it in not initial config we print the error and disconnect
> the applier.
> 
> Closes #3610
> ---
> https://github.com/tarantool/tarantool/issues/3610                              
> https://github.com/tarantool/tarantool/tree/OKriw/gh-3610-assert_fail_when_connect_master_twice-1.10

replication/misc.test.lua failed on Travis CI:

https://travis-ci.org/tarantool/tarantool/jobs/436060141

Please fix.

> diff --git a/src/box/box.h b/src/box/box.h
> index 9930d4a1a..21c60e8f0 100644
> --- a/src/box/box.h
> +++ b/src/box/box.h
> @@ -38,6 +38,9 @@
>  extern "C" {
>  #endif /* defined(__cplusplus) */
>  
> +/* Size of buffer for error message */
> +#define MSG_SIZE 50
> +
>  /*
>   * Box - data storage (spaces, indexes) and query
>   * processor (INSERT, UPDATE, DELETE, SELECT, Lua)
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index 5755ad45e..f97f54d53 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -271,6 +271,8 @@ replica_on_applier_connect(struct replica *replica)
>  	assert(replica->applier_sync_state == APPLIER_DISCONNECTED);
>  
>  	replica->uuid = applier->uuid;
> +	replica->applier_sync_state = APPLIER_CONNECTED;
> +	replicaset.applier.connected++;
>  
>  	struct replica *orig = replica_hash_search(&replicaset.hash, replica);
>  	if (orig != NULL && orig->applier != NULL) {
> @@ -290,6 +292,8 @@ replica_on_applier_connect(struct replica *replica)
>  
>  	if (orig != NULL) {
>  		/* Use existing struct replica */
> +		assert(orig->applier_sync_state == APPLIER_DISCONNECTED);
> +		orig->applier_sync_state = replica->applier_sync_state;
>  		replica_set_applier(orig, applier);
>  		replica_clear_applier(replica);
>  		replica_delete(replica);
> @@ -299,8 +303,6 @@ replica_on_applier_connect(struct replica *replica)
>  		replica_hash_insert(&replicaset.hash, replica);
>  	}
>  
> -	replica->applier_sync_state = APPLIER_CONNECTED;
> -	replicaset.applier.connected++;

Nit: as I pointed out in my previous review, an extra empty line is left
at the end of this function after this change. Please remove.

>  }
>  
>  static void
> @@ -427,6 +429,7 @@ replicaset_update(struct applier **appliers, int count)
>  	auto uniq_guard = make_scoped_guard([&]{
>  		replica_hash_foreach_safe(&uniq, replica, next) {
>  			replica_hash_remove(&uniq, replica);
> +			replica_clear_applier(replica);
>  			replica_delete(replica);
>  		}
>  	});
> @@ -454,6 +457,8 @@ replicaset_update(struct applier **appliers, int count)
>  		replica->uuid = applier->uuid;
>  
>  		if (replica_hash_search(&uniq, replica) != NULL) {
> +			replica_clear_applier(replica);
> +			replica_delete(replica);
>  			tnt_raise(ClientError, ER_CFG, "replication",
>  				  "duplicate connection to the same replica");
>  		}
> @@ -596,6 +601,9 @@ replicaset_connect(struct applier **appliers, int count,
>  
>  	double timeout = replication_connect_timeout;
>  	int quorum = MIN(count, replication_connect_quorum);
> +	/* Default error message in case if something will fail */
> +	char err_msg [MSG_SIZE] = "failed to connect to one or more replicas";
> +	char *msg = err_msg;
>  
>  	/* Add triggers and start simulations connection to remote peers */
>  	for (int i = 0; i < count; i++) {
> @@ -641,8 +649,17 @@ replicaset_connect(struct applier **appliers, int count,
>  	}
>  
>  	/* Now all the appliers are connected, update the replica set. */
> -	replicaset_update(appliers, count);
> -	return;
> +	try {
> +		replicaset_update(appliers, count);
> +		return;
> +	} catch (Exception *e) {
> +		/* If exception occurred in replicaset_update we need to rewrite
> +		 * error message.
> +		 */
> +		char dupl_msg[MSG_SIZE] = "duplicate connection to the same replica";
> +		msg = dupl_msg;
> +		goto error;
> +	}

What's this juggling with messages for? You can pass an error in diag -
in fact, that's what diag is for. BTW catching an Exception sets diag
automatically. replicaset_update() already throws a correct error so all
you need to do is catch it, destroy appliers, and then re-raise it.

>  error:
>  	/* Destroy appliers */
>  	for (int i = 0; i < count; i++) {
> @@ -651,8 +668,7 @@ error:
>  	}
>  
>  	/* ignore original error */
> -	tnt_raise(ClientError, ER_CFG, "replication",
> -		  "failed to connect to one or more replicas");
> +	tnt_raise(ClientError, ER_CFG, "replication", msg);
>  }
>  
>  bool
> diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
> index 56e1bab69..5eb440837 100644
> --- a/test/replication/misc.test.lua
> +++ b/test/replication/misc.test.lua
> @@ -161,5 +161,44 @@ _ = test_run:wait_vclock('replica_auth', vclock)
>  test_run:cmd("stop server replica_auth")
>  test_run:cmd("cleanup server replica_auth")
>  test_run:cmd("delete server replica_auth")
> -
>  box.schema.user.drop('cluster')
> +
> +--
> +-- Test case for gh-3610. Before the fix replica would fail with the assertion
> +-- when trying to connect to the same master twice.
> +--
> +box.schema.user.grant('guest', 'replication')
> +test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
> +test_run:cmd("start server replica")
> +test_run:cmd("switch replica")
> +replication = box.cfg.replication
> +box.cfg{replication = {replication, replication}}
> +
> +test_run:cmd("switch default")
> +test_run:cmd('cleanup server replica')

Why cleanup the replica? You can check the next test case on the same
replica, can you not?

> +
> +-- case when replica reconnects master with duplication in new configuration

Nit: the comment is misleading IMO. Should be:

Check the case when duplicate connection is detected in the background.

> +
> +listen = box.cfg.listen
> +test_run:cmd("switch replica")
> +box.cfg{replication_connect_quorum=0, replication_connect_timeout = 0.1}
> +
> +test_run:cmd("switch default")
> +box.cfg{listen = ''}
> +
> +test_run:cmd("switch replica")
> +replication = box.cfg.replication
> +box.cfg{replication = {replication, replication}}
> +
> +test_run:cmd("switch default")
> +box.cfg{listen = listen}
> +fiber.sleep(1)

Don't use long sleeps. Wait in a loop if you need to.

> +test_run:cmd("switch replica")
> +
> +test_run:cmd("switch default")
> +test_run:grep_log('replica', 'duplicate connection')
> +
> +box.schema.user.revoke('guest', 'replication')
> +test_run:cmd("stop server replica")
> +test_run:cmd('cleanup server replica')
> +test_run:cmd("delete server replica")
> diff --git a/test/replication/replica_dupl.lua b/test/replication/replica_dupl.lua
> new file mode 100644
> index 000000000..29e94551d

This file isn't needed anymore. Please self-review your patches before
submitting them.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2018-10-02 12:19 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-02 10:12 [tarantool-patches] [PATCH v5] replication: fix assertion with duplicate connection Olga Arkhangelskaia
2018-10-02 12:19 ` Vladimir Davydov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox