Tarantool development patches archive
 help / color / mirror / Atom feed
* [tarantool-patches] [PATCH v2] replication: adds replication sync after cfg. update
@ 2018-08-22 15:59 Olga Arkhangelskaia
  2018-08-23 15:06 ` Vladimir Davydov
  0 siblings, 1 reply; 2+ messages in thread
From: Olga Arkhangelskaia @ 2018-08-22 15:59 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Olga Arkhangelskaia

When replica reconnects to replica set not for the first time, we
suffer from absence of synchronization. Such behavior leads to giving
away outdated data.

Closes #3427
---
https://github.com/tarantool/tarantool/issues/3427
https://github.com/tarantool/tarantool/tree/OKriw/replication_no_sync-1.9

v1:
https://www.freelists.org/post/tarantool-patches/PATCH-replication-adds-replication-sync-after-cfg-update

Changes in v2:
- fixed test
- changed replicaset_sync

 src/box/box.cc                      |  7 ++-
 src/box/replication.cc              | 34 ++++++++------
 src/box/replication.h               |  7 ++-
 test/replication/orphan.result      | 92 +++++++++++++++++++++++++++++++++++++
 test/replication/orphan.test.lua    | 41 +++++++++++++++++
 test/replication/replica_orphan.lua | 12 +++++
 6 files changed, 177 insertions(+), 16 deletions(-)
 create mode 100644 test/replication/orphan.result
 create mode 100644 test/replication/orphan.test.lua
 create mode 100644 test/replication/replica_orphan.lua

diff --git a/src/box/box.cc b/src/box/box.cc
index 8d7454d1f..8c67c79e8 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -634,6 +634,9 @@ box_set_replication(void)
 	box_sync_replication(true);
 	/* Follow replica */
 	replicaset_follow();
+	/* Sync replica up to quorum */
+	replicaset_sync();
+	say_verbose("synchronization complete");
 }
 
 void
@@ -1941,8 +1944,10 @@ box_cfg_xc(void)
 	fiber_gc();
 	is_box_configured = true;
 
-	if (!is_bootstrap_leader)
+	if (!is_bootstrap_leader) {
 		replicaset_sync();
+		replicaset_is_synced();
+	}
 
 	say_info("ready to accept requests");
 }
diff --git a/src/box/replication.cc b/src/box/replication.cc
index 861ce34ea..c7b11a5d6 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -662,24 +662,12 @@ replicaset_follow(void)
 }
 
 void
-replicaset_sync(void)
+replicaset_is_synced(void)
 {
 	int quorum = replicaset_quorum();
 
 	if (quorum == 0)
 		return;
-
-	say_verbose("synchronizing with %d replicas", quorum);
-
-	/*
-	 * Wait until all connected replicas synchronize up to
-	 * replication_sync_lag
-	 */
-	while (replicaset.applier.synced < quorum &&
-	       replicaset.applier.connected +
-	       replicaset.applier.loading >= quorum)
-		fiber_cond_wait(&replicaset.applier.cond);
-
 	if (replicaset.applier.synced < quorum) {
 		/*
 		 * Not enough replicas connected to form a quorum.
@@ -694,6 +682,26 @@ replicaset_sync(void)
 		 "replicas formed", quorum);
 }
 
+void
+replicaset_sync(void)
+{
+	int quorum = replicaset_quorum();
+
+	if (quorum == 0)
+		return;
+
+	say_verbose("synchronizing with %d replicas", quorum);
+
+	/*
+	 * Wait until all connected replicas synchronize up to
+	 * replication_sync_lag
+	 */
+	while (replicaset.applier.synced < quorum &&
+	       replicaset.applier.connected +
+	       replicaset.applier.loading >= quorum)
+		fiber_cond_wait(&replicaset.applier.cond);
+}
+
 void
 replicaset_check_quorum(void)
 {
diff --git a/src/box/replication.h b/src/box/replication.h
index 06a2867b6..728a73704 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -371,10 +371,13 @@ replicaset_connect(struct applier **appliers, int count,
 void
 replicaset_follow(void);
 
+/**
+ * Check if replicaset is synced with quorum
+ */
+void
+replicaset_is_synced(void);
 /**
  * Wait until a replication quorum is formed.
- * Return immediately if a quorum cannot be
- * formed because of errors.
  */
 void
 replicaset_sync(void);
diff --git a/test/replication/orphan.result b/test/replication/orphan.result
new file mode 100644
index 000000000..818ea75de
--- /dev/null
+++ b/test/replication/orphan.result
@@ -0,0 +1,92 @@
+--
+-- gh-3427: no sync after configuration update
+--
+env = require('test_run')
+---
+...
+test_run = env.new()
+---
+...
+engine = test_run:get_cfg('engine')
+---
+...
+box.schema.user.grant('guest', 'read,write,execute', 'universe')
+---
+...
+box.schema.user.grant('guest', 'replication')
+---
+...
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica_orphan.lua'")
+---
+- true
+...
+test_run:cmd("start server replica")
+---
+- true
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+test_run:cmd("switch default")
+---
+- true
+...
+s = box.schema.space.create('test', {engine = engine})
+---
+...
+index = s:create_index('primary')
+---
+...
+-- change replica configuration
+test_run:cmd("switch replica")
+---
+- true
+...
+box.cfg{replication={}}
+---
+...
+test_run:cmd("switch default")
+---
+- true
+...
+-- insert values on the master while replica is unconfigured
+a = 100000 box.begin() while a > 0 do a = a-1 box.space.test:insert{a,a} end box.commit()
+---
+...
+test_run:cmd("switch replica")
+---
+- true
+...
+require'fiber'.sleep(0.1)
+---
+...
+box.cfg{replication = os.getenv("MASTER")}
+---
+...
+box.info.replication[1].upstream.lag > 0.1
+---
+- false
+...
+test_run:cmd("switch default")
+---
+- true
+...
+-- cleanup
+test_run:cmd("stop server replica")
+---
+- true
+...
+test_run:cmd("cleanup server replica")
+---
+- true
+...
+box.space.test:drop()
+---
+...
+box.schema.user.revoke('guest', 'replication')
+---
+...
+box.schema.user.revoke('guest', 'read,write,execute', 'universe')
+---
+...
diff --git a/test/replication/orphan.test.lua b/test/replication/orphan.test.lua
new file mode 100644
index 000000000..862fd69af
--- /dev/null
+++ b/test/replication/orphan.test.lua
@@ -0,0 +1,41 @@
+--
+-- gh-3427: no sync after configuration update
+--
+
+env = require('test_run')
+test_run = env.new()
+engine = test_run:get_cfg('engine')
+
+box.schema.user.grant('guest', 'read,write,execute', 'universe')
+
+box.schema.user.grant('guest', 'replication')
+test_run:cmd("create server replica with rpl_master=default, script='replication/replica_orphan.lua'")
+test_run:cmd("start server replica")
+
+test_run:cmd("switch replica")
+test_run:cmd("switch default")
+
+s = box.schema.space.create('test', {engine = engine})
+index = s:create_index('primary')
+
+-- change replica configuration
+test_run:cmd("switch replica")
+box.cfg{replication={}}
+
+test_run:cmd("switch default")
+-- insert values on the master while replica is unconfigured
+a = 100000 box.begin() while a > 0 do a = a-1 box.space.test:insert{a,a} end box.commit()
+
+test_run:cmd("switch replica")
+require'fiber'.sleep(0.1)
+box.cfg{replication = os.getenv("MASTER")}
+
+box.info.replication[1].upstream.lag > 0.1
+test_run:cmd("switch default")
+
+-- cleanup
+test_run:cmd("stop server replica")
+test_run:cmd("cleanup server replica")
+box.space.test:drop()
+box.schema.user.revoke('guest', 'replication')
+box.schema.user.revoke('guest', 'read,write,execute', 'universe')
diff --git a/test/replication/replica_orphan.lua b/test/replication/replica_orphan.lua
new file mode 100644
index 000000000..97740d69a
--- /dev/null
+++ b/test/replication/replica_orphan.lua
@@ -0,0 +1,12 @@
+#!/usr/bin/env tarantool
+
+local TIMEOUT = tonumber(arg[1])
+
+box.cfg({
+    listen              = os.getenv("LISTEN"),
+    replication         = os.getenv("MASTER"),
+    replication_connect_timeout = 0.5,
+    replication_sync_lag = 0.01,
+})
+
+require('console').listen(os.getenv('ADMIN'))
-- 
2.14.3 (Apple Git-98)

^ permalink raw reply	[flat|nested] 2+ messages in thread

* Re: [tarantool-patches] [PATCH v2] replication: adds replication sync after cfg. update
  2018-08-22 15:59 [tarantool-patches] [PATCH v2] replication: adds replication sync after cfg. update Olga Arkhangelskaia
@ 2018-08-23 15:06 ` Vladimir Davydov
  0 siblings, 0 replies; 2+ messages in thread
From: Vladimir Davydov @ 2018-08-23 15:06 UTC (permalink / raw)
  To: Olga Arkhangelskaia; +Cc: tarantool-patches

On Wed, Aug 22, 2018 at 06:59:30PM +0300, Olga Arkhangelskaia wrote:
> When replica reconnects to replica set not for the first time, we
> suffer from absence of synchronization. Such behavior leads to giving
> away outdated data.
> 
> Closes #3427
> ---
> https://github.com/tarantool/tarantool/issues/3427
> https://github.com/tarantool/tarantool/tree/OKriw/replication_no_sync-1.9
> 
> v1:
> https://www.freelists.org/post/tarantool-patches/PATCH-replication-adds-replication-sync-after-cfg-update
> 
> Changes in v2:
> - fixed test
> - changed replicaset_sync
> 
>  src/box/box.cc                      |  7 ++-
>  src/box/replication.cc              | 34 ++++++++------
>  src/box/replication.h               |  7 ++-
>  test/replication/orphan.result      | 92 +++++++++++++++++++++++++++++++++++++
>  test/replication/orphan.test.lua    | 41 +++++++++++++++++
>  test/replication/replica_orphan.lua | 12 +++++

The test isn't about the 'orphan' state. It's about replica set
synchronization. Please call it appropriately.

Also, the test takes prohibitively long - 15 seconds for memtx+vinyl.
Please try to reduce the test run time.

Also, the test passes with and without the changes done to the code, in
other words it's useless. Please make sure, it doesn't pass without your
patch.


>  6 files changed, 177 insertions(+), 16 deletions(-)
>  create mode 100644 test/replication/orphan.result
>  create mode 100644 test/replication/orphan.test.lua
>  create mode 100644 test/replication/replica_orphan.lua
> 
> diff --git a/src/box/box.cc b/src/box/box.cc
> index 8d7454d1f..8c67c79e8 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -634,6 +634,9 @@ box_set_replication(void)
>  	box_sync_replication(true);
>  	/* Follow replica */
>  	replicaset_follow();
> +	/* Sync replica up to quorum */
> +	replicaset_sync();
> +	say_verbose("synchronization complete");

This message is pointless IMO. I think that replication reconfiguration
should throw an error if it failed to synchronize with the quorum. It
would be consistent with the fact that it throws an error in case it
failed to connect to the quorum.

>  }
>  
>  void
> @@ -1941,8 +1944,10 @@ box_cfg_xc(void)
>  	fiber_gc();
>  	is_box_configured = true;
>  
> -	if (!is_bootstrap_leader)
> +	if (!is_bootstrap_leader) {
>  		replicaset_sync();
> +		replicaset_is_synced();
> +	}

Instead of introducing another function, you could add a return value to
replicaset_sync() - true in case it successfully synchronized, false
otherwise - then use the return value to either log or throw an error,
depending whether its initial configuration or reconfiguration. IMO it
would look neater.

^ permalink raw reply	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2018-08-23 15:06 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-22 15:59 [tarantool-patches] [PATCH v2] replication: adds replication sync after cfg. update Olga Arkhangelskaia
2018-08-23 15:06 ` Vladimir Davydov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox