Tarantool development patches archive
 help / color / mirror / Atom feed
* [tarantool-patches] [PATCH v3] replication: fix assertion with duplicate connection
@ 2018-09-25  8:53 Olga Arkhangelskaia
  2018-09-25 10:26 ` Vladimir Davydov
  0 siblings, 1 reply; 3+ messages in thread
From: Olga Arkhangelskaia @ 2018-09-25  8:53 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Olga Arkhangelskaia

Patch fixes behavior when replica tries to connect to the same master
more than once. In case when it is initial configuration we raise the
exception. If it in not initial config we print the error and disconnect
the applier.

Closes #3610
---
https://github.com/tarantool/tarantool/issues/3610                              
https://github.com/tarantool/tarantool/tree/OKriw/gh-3610-assert_fail_when_connect_master_twice-1.10
                                                                                
v1:                                                                             
https://www.freelists.org/post/tarantool-patches/PATCH-box-fix-assertion-with-duplication-in-repl-source
                                                                                
v2:                                                                             
https://www.freelists.org/post/tarantool-patches/PATCH-v2-replication-fix-assertion-with-duplicated-connect-to-same-master
                                                                                
Changes in v2:                                                                  
- changed completely, now we let duplicated params to proceed                   
- stop the applier at the moment when replica has the same hash that other one  
- changed test                                                                  
                                                                                
Changes in v3:                                                                  
- changed the wsy and order of how we clean up applier replica on duplication   
- test is now in misc.test.lua                                                  
- need to think over second test case

 src/box/applier.cc             |  2 +-
 src/box/box.cc                 |  4 +++-
 src/box/replication.cc         |  4 ++++
 test/replication/misc.result   | 46 ++++++++++++++++++++++++++++++++++++++++++
 test/replication/misc.test.lua | 18 +++++++++++++++++
 5 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/src/box/applier.cc b/src/box/applier.cc
index 604119e9d..1f99f8090 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -630,7 +630,7 @@ applier_f(va_list ap)
 				return -1;
 			}
 		} catch (FiberIsCancelled *e) {
-			applier_disconnect(applier, APPLIER_OFF);
+			applier_disconnect(applier, APPLIER_DISCONNECTED);
 			break;
 		} catch (SocketError *e) {
 			applier_log_error(applier, e);
diff --git a/src/box/box.cc b/src/box/box.cc
index f25146d01..d3aeb5de0 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -667,8 +667,10 @@ box_sync_replication(bool connect_quorum)
 		diag_raise();
 
 	auto guard = make_scoped_guard([=]{
-		for (int i = 0; i < count; i++)
+		for (int i = 0; i < count; i++) {
+			applier_stop(appliers[i]);
 			applier_delete(appliers[i]); /* doesn't affect diag */
+		}
 	});
 
 	replicaset_connect(appliers, count, connect_quorum);
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 5755ad45e..0f205212e 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -427,6 +427,7 @@ replicaset_update(struct applier **appliers, int count)
 	auto uniq_guard = make_scoped_guard([&]{
 		replica_hash_foreach_safe(&uniq, replica, next) {
 			replica_hash_remove(&uniq, replica);
+			replica_clear_applier(replica);
 			replica_delete(replica);
 		}
 	});
@@ -454,6 +455,9 @@ replicaset_update(struct applier **appliers, int count)
 		replica->uuid = applier->uuid;
 
 		if (replica_hash_search(&uniq, replica) != NULL) {
+			applier = replica->applier;
+			replica_clear_applier(replica);
+			replica_delete(replica);
 			tnt_raise(ClientError, ER_CFG, "replication",
 				  "duplicate connection to the same replica");
 		}
diff --git a/test/replication/misc.result b/test/replication/misc.result
index 0ac48ba34..d2ed4508d 100644
--- a/test/replication/misc.result
+++ b/test/replication/misc.result
@@ -391,6 +391,52 @@ test_run:cmd("delete server replica_auth")
 ---
 - true
 ...
+--
+-- Test case for gh-3610. Before the fix replica would fail with the assertion
+-- when trying to connect to the same master twice.
+--
+box.schema.user.grant('guest', 'replication')
+---
+...
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+---
+- true
+...
+test_run:cmd("start server replica")
+---
+- true
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+replication = box.cfg.replication
+---
+...
+box.cfg{replication = {replication, replication}}
+---
+- error: 'Incorrect value for option ''replication'': duplicate connection to the
+    same replica'
+...
+test_run:cmd("switch default")
+---
+- true
+...
+box.schema.user.revoke('guest', 'replication')
+---
+...
+test_run:cmd("stop server replica")
+---
+- true
+...
+test_run:cmd('cleanup server replica')
+---
+- true
+...
+test_run:cmd("delete server replica")
+---
+- true
+...
 box.schema.user.drop('cluster')
 ---
 ...
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
index 56e1bab69..95677e12b 100644
--- a/test/replication/misc.test.lua
+++ b/test/replication/misc.test.lua
@@ -162,4 +162,22 @@ test_run:cmd("stop server replica_auth")
 test_run:cmd("cleanup server replica_auth")
 test_run:cmd("delete server replica_auth")
 
+
+--
+-- Test case for gh-3610. Before the fix replica would fail with the assertion
+-- when trying to connect to the same master twice.
+--
+box.schema.user.grant('guest', 'replication')
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+test_run:cmd("start server replica")
+test_run:cmd("switch replica")
+replication = box.cfg.replication
+box.cfg{replication = {replication, replication}}
+
+test_run:cmd("switch default")
+box.schema.user.revoke('guest', 'replication')
+test_run:cmd("stop server replica")
+test_run:cmd('cleanup server replica')
+test_run:cmd("delete server replica")
+
 box.schema.user.drop('cluster')
-- 
2.14.3 (Apple Git-98)

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [tarantool-patches] [PATCH v3] replication: fix assertion with duplicate connection
  2018-09-25  8:53 [tarantool-patches] [PATCH v3] replication: fix assertion with duplicate connection Olga Arkhangelskaia
@ 2018-09-25 10:26 ` Vladimir Davydov
  0 siblings, 0 replies; 3+ messages in thread
From: Vladimir Davydov @ 2018-09-25 10:26 UTC (permalink / raw)
  To: Olga Arkhangelskaia; +Cc: tarantool-patches

On Tue, Sep 25, 2018 at 11:53:52AM +0300, Olga Arkhangelskaia wrote:
> diff --git a/src/box/applier.cc b/src/box/applier.cc
> index 604119e9d..1f99f8090 100644
> --- a/src/box/applier.cc
> +++ b/src/box/applier.cc
> @@ -630,7 +630,7 @@ applier_f(va_list ap)
>  				return -1;
>  			}
>  		} catch (FiberIsCancelled *e) {
> -			applier_disconnect(applier, APPLIER_OFF);
> +			applier_disconnect(applier, APPLIER_DISCONNECTED);

APPLIER_DISCONNECTED is a state that denotes temporary errors.
It means that the applier will reconnect soon. In case when an
applier is permanently stopped, we should use APPLIER_OFF.

>  			break;
>  		} catch (SocketError *e) {
>  			applier_log_error(applier, e);
> diff --git a/src/box/box.cc b/src/box/box.cc
> index f25146d01..d3aeb5de0 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -667,8 +667,10 @@ box_sync_replication(bool connect_quorum)
>  		diag_raise();
>  
>  	auto guard = make_scoped_guard([=]{
> -		for (int i = 0; i < count; i++)
> +		for (int i = 0; i < count; i++) {
> +			applier_stop(appliers[i]);
>  			applier_delete(appliers[i]); /* doesn't affect diag */
> +		}

As I mentioned in the previous review, this makes replicaset_connect
protocol obscure: turns out it may or may not leave appliers running.
I think that replicaset_connect should revert all appliers to the state
they were in before this function was called, i.e. stopped.

>  	});
>  
>  	replicaset_connect(appliers, count, connect_quorum);
> diff --git a/src/box/replication.cc b/src/box/replication.cc
> index 5755ad45e..0f205212e 100644
> --- a/src/box/replication.cc
> +++ b/src/box/replication.cc
> @@ -427,6 +427,7 @@ replicaset_update(struct applier **appliers, int count)
>  	auto uniq_guard = make_scoped_guard([&]{
>  		replica_hash_foreach_safe(&uniq, replica, next) {
>  			replica_hash_remove(&uniq, replica);
> +			replica_clear_applier(replica);
>  			replica_delete(replica);
>  		}
>  	});
> @@ -454,6 +455,9 @@ replicaset_update(struct applier **appliers, int count)
>  		replica->uuid = applier->uuid;
>  
>  		if (replica_hash_search(&uniq, replica) != NULL) {
> +			applier = replica->applier;

This line is pointless.

> +			replica_clear_applier(replica);
> +			replica_delete(replica);
>  			tnt_raise(ClientError, ER_CFG, "replication",
>  				  "duplicate connection to the same replica");
>  		}
> diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
> index 56e1bab69..95677e12b 100644
> --- a/test/replication/misc.test.lua
> +++ b/test/replication/misc.test.lua
> @@ -162,4 +162,22 @@ test_run:cmd("stop server replica_auth")
>  test_run:cmd("cleanup server replica_auth")
>  test_run:cmd("delete server replica_auth")
>  
> +
> +--
> +-- Test case for gh-3610. Before the fix replica would fail with the assertion
> +-- when trying to connect to the same master twice.
> +--
> +box.schema.user.grant('guest', 'replication')
> +test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
> +test_run:cmd("start server replica")
> +test_run:cmd("switch replica")
> +replication = box.cfg.replication
> +box.cfg{replication = {replication, replication}}
> +
> +test_run:cmd("switch default")
> +box.schema.user.revoke('guest', 'replication')
> +test_run:cmd("stop server replica")
> +test_run:cmd('cleanup server replica')
> +test_run:cmd("delete server replica")
> +
>  box.schema.user.drop('cluster')

Your test case should go before this line, because user 'cluster' was
created by the previous test case.

Also, there's still no test for the case when applier detects duplicate
connection asynchronously. It should be pretty easy to implement:

 1. On the master (default instance). Disable replication by clearing
    box.cfg.listen.
 2. On the replica. Set box.cfg.replication_connect_quorum to 0 and
    replication_connect_timeout to 0.001. Then set box.cfg.replication
    so that the replica tries to connect to the master twice. Wait for
    box.cfg() to return.
 3. On the master. Enable replication by setting box.cfg.listen back to
    its original value.
 4. On the replica. Check that replication connection is established and
    that 'duplicate connection' error is printed to the log.

^ permalink raw reply	[flat|nested] 3+ messages in thread

* [tarantool-patches] [PATCH v3] replication: fix assertion with duplicate connection
@ 2018-09-25  8:26 Olga Arkhangelskaia
  0 siblings, 0 replies; 3+ messages in thread
From: Olga Arkhangelskaia @ 2018-09-25  8:26 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Olga Arkhangelskaia

Patch fixes behavior when replica tries to connect to the same master
more than once. In case when it is initial configuration we raise the
exception. If it in not initial config we print the error and disconnect
the applier.

Closes #3610
---
https://github.com/tarantool/tarantool/issues/3610
https://github.com/tarantool/tarantool/tree/OKriw/gh-3610-assert_fail_when_connect_master_twice-1.10

v1:
https://www.freelists.org/post/tarantool-patches/PATCH-box-fix-assertion-with-duplication-in-repl-source

v2:
https://www.freelists.org/post/tarantool-patches/PATCH-v2-replication-fix-assertion-with-duplicated-connect-to-same-master

Changes in v2:
- changed completely, now we let duplicated params to proceed
- stop the applier at the moment when replica has the same hash that other one
- changed test

Changes in v3:
- changed the wsy and order of how we clean up applier replica on duplication
- test is now in misc.test.lua
- need to think over second test case

 src/box/applier.cc             |  2 +-
 src/box/box.cc                 |  4 +++-
 src/box/replication.cc         |  4 ++++
 test/replication/misc.result   | 43 ++++++++++++++++++++++++++++++++++++++++++
 test/replication/misc.test.lua | 17 +++++++++++++++++
 5 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/src/box/applier.cc b/src/box/applier.cc
index 604119e9d..1f99f8090 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -630,7 +630,7 @@ applier_f(va_list ap)
 				return -1;
 			}
 		} catch (FiberIsCancelled *e) {
-			applier_disconnect(applier, APPLIER_OFF);
+			applier_disconnect(applier, APPLIER_DISCONNECTED);
 			break;
 		} catch (SocketError *e) {
 			applier_log_error(applier, e);
diff --git a/src/box/box.cc b/src/box/box.cc
index f25146d01..d3aeb5de0 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -667,8 +667,10 @@ box_sync_replication(bool connect_quorum)
 		diag_raise();
 
 	auto guard = make_scoped_guard([=]{
-		for (int i = 0; i < count; i++)
+		for (int i = 0; i < count; i++) {
+			applier_stop(appliers[i]);
 			applier_delete(appliers[i]); /* doesn't affect diag */
+		}
 	});
 
 	replicaset_connect(appliers, count, connect_quorum);
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 5755ad45e..0f205212e 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -427,6 +427,7 @@ replicaset_update(struct applier **appliers, int count)
 	auto uniq_guard = make_scoped_guard([&]{
 		replica_hash_foreach_safe(&uniq, replica, next) {
 			replica_hash_remove(&uniq, replica);
+			replica_clear_applier(replica);
 			replica_delete(replica);
 		}
 	});
@@ -454,6 +455,9 @@ replicaset_update(struct applier **appliers, int count)
 		replica->uuid = applier->uuid;
 
 		if (replica_hash_search(&uniq, replica) != NULL) {
+			applier = replica->applier;
+			replica_clear_applier(replica);
+			replica_delete(replica);
 			tnt_raise(ClientError, ER_CFG, "replication",
 				  "duplicate connection to the same replica");
 		}
diff --git a/test/replication/misc.result b/test/replication/misc.result
index 0ac48ba34..9e6ca75d8 100644
--- a/test/replication/misc.result
+++ b/test/replication/misc.result
@@ -391,6 +391,49 @@ test_run:cmd("delete server replica_auth")
 ---
 - true
 ...
+--
+-- Test case for gh-3610. Before the fix replica would fail with the assertion
+-- when trying to connect to the same master twice.
+--
+box.schema.user.grant('guest', 'replication')
+---
+...
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+---
+- true
+...
+test_run:cmd("start server replica")
+---
+- true
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+replication = box.cfg.replication
+---
+...
+box.cfg{replication = {replication, replication}}
+---
+- error: 'Incorrect value for option ''replication'': duplicate connection to the
+    same replica'
+...
+test_run:cmd("switch default")
+---
+- true
+...
+test_run:cmd("stop server replica")
+---
+- true
+...
+test_run:cmd('cleanup server replica')
+---
+- true
+...
+test_run:cmd("delete server replica")
+---
+- true
+...
 box.schema.user.drop('cluster')
 ---
 ...
diff --git a/test/replication/misc.test.lua b/test/replication/misc.test.lua
index 56e1bab69..f32464d19 100644
--- a/test/replication/misc.test.lua
+++ b/test/replication/misc.test.lua
@@ -162,4 +162,21 @@ test_run:cmd("stop server replica_auth")
 test_run:cmd("cleanup server replica_auth")
 test_run:cmd("delete server replica_auth")
 
+
+--
+-- Test case for gh-3610. Before the fix replica would fail with the assertion
+-- when trying to connect to the same master twice.
+--
+box.schema.user.grant('guest', 'replication')
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
+test_run:cmd("start server replica")
+test_run:cmd("switch replica")
+replication = box.cfg.replication
+box.cfg{replication = {replication, replication}}
+
+test_run:cmd("switch default")
+test_run:cmd("stop server replica")
+test_run:cmd('cleanup server replica')
+test_run:cmd("delete server replica")
+
 box.schema.user.drop('cluster')
-- 
2.14.3 (Apple Git-98)

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2018-09-25 10:26 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-25  8:53 [tarantool-patches] [PATCH v3] replication: fix assertion with duplicate connection Olga Arkhangelskaia
2018-09-25 10:26 ` Vladimir Davydov
  -- strict thread matches above, loose matches on Subject: below --
2018-09-25  8:26 Olga Arkhangelskaia

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox