Tarantool development patches archive
* [Tarantool-patches] [PATCH 1/3] replication: print number of txs in limbo before its clear
       [not found] <cover.1594314820.git.sergeyb@tarantool.org>
@ 2020-07-09 17:16 ` sergeyb
  2020-07-09 20:07   ` Vladislav Shpilevoy
  2020-07-20 21:59   ` Vladislav Shpilevoy
  2020-07-09 17:16 ` [Tarantool-patches] [PATCH 2/3] replication: test clear_synchro_queue function sergeyb
  2020-07-09 17:16 ` [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion sergeyb
  2 siblings, 2 replies; 18+ messages in thread
From: sergeyb @ 2020-07-09 17:16 UTC (permalink / raw)
  To: tarantool-patches, v.shpilevoy, sergepetrenko, gorcunov, lvasiliev

From: Sergey Bronnikov <sergeyb@tarantool.org>

---
 src/box/box.cc    | 10 ++++++----
 src/box/box.h     |  2 +-
 src/box/lua/ctl.c |  5 +++--
 3 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index 749c96ca1..a400ed551 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -946,15 +946,15 @@ box_set_replication_anon(void)
 
 }
 
-void
+int
 box_clear_synchro_queue(void)
 {
 	if (!is_box_configured || txn_limbo_is_empty(&txn_limbo))
-		return;
+		return -1;
 	uint32_t former_leader_id = txn_limbo.instance_id;
 	assert(former_leader_id != REPLICA_ID_NIL);
 	if (former_leader_id == instance_id)
-		return;
+		return -2;
 
 	/* Wait until pending confirmations/rollbacks reach us. */
 	double timeout = 2 * txn_limbo_confirm_timeout(&txn_limbo);
@@ -965,9 +965,9 @@ box_clear_synchro_queue(void)
 		fiber_sleep(0.001);
 	}
 
+	int len = 0;
 	if (!txn_limbo_is_empty(&txn_limbo)) {
 		int64_t lsns[VCLOCK_MAX];
-		int len = 0;
 		const struct vclock  *vclock;
 		replicaset_foreach(replica) {
 			if (replica->relay != NULL &&
@@ -993,6 +993,8 @@ box_clear_synchro_queue(void)
 		txn_limbo_force_empty(&txn_limbo, confirm_lsn);
 		assert(txn_limbo_is_empty(&txn_limbo));
 	}
+
+	return len;
 }
 
 void
diff --git a/src/box/box.h b/src/box/box.h
index 5c4a5ed78..608e26b83 100644
--- a/src/box/box.h
+++ b/src/box/box.h
@@ -258,7 +258,7 @@ extern "C" {
 
 typedef struct tuple box_tuple_t;
 
-void box_clear_synchro_queue(void);
+int box_clear_synchro_queue(void);
 
 /* box_select is private and used only by FFI */
 API_EXPORT int
diff --git a/src/box/lua/ctl.c b/src/box/lua/ctl.c
index 2017ddc18..c3cf44f0c 100644
--- a/src/box/lua/ctl.c
+++ b/src/box/lua/ctl.c
@@ -82,8 +82,9 @@ static int
 lbox_ctl_clear_synchro_queue(struct lua_State *L)
 {
 	(void) L;
-	box_clear_synchro_queue();
-	return 0;
+	int len = box_clear_synchro_queue();
+	lua_pushinteger(L, len);
+	return 1;
 }
 
 static const struct luaL_Reg lbox_ctl_lib[] = {
-- 
2.26.2

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [Tarantool-patches] [PATCH 2/3] replication: test clear_synchro_queue function
       [not found] <cover.1594314820.git.sergeyb@tarantool.org>
  2020-07-09 17:16 ` [Tarantool-patches] [PATCH 1/3] replication: print number of txs in limbo before its clear sergeyb
@ 2020-07-09 17:16 ` sergeyb
  2020-07-20 22:00   ` Vladislav Shpilevoy
  2020-07-09 17:16 ` [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion sergeyb
  2 siblings, 1 reply; 18+ messages in thread
From: sergeyb @ 2020-07-09 17:16 UTC (permalink / raw)
  To: tarantool-patches, v.shpilevoy, sergepetrenko, gorcunov, lvasiliev

From: Sergey Bronnikov <sergeyb@tarantool.org>

Part of #5055
Part of #4849
---
 test/replication/qsync_basic.result   | 85 +++++++++++++++++++++++++++
 test/replication/qsync_basic.test.lua | 31 ++++++++++
 2 files changed, 116 insertions(+)

diff --git a/test/replication/qsync_basic.result b/test/replication/qsync_basic.result
index ab4be0c7e..464df75a7 100644
--- a/test/replication/qsync_basic.result
+++ b/test/replication/qsync_basic.result
@@ -32,6 +32,14 @@ s2.is_sync
  | - false
  | ...
 
+--
+-- gh-4849: clear synchro queue with unconfigured box
+--
+box.ctl.clear_synchro_queue()
+ | ---
+ | - -1
+ | ...
+
 -- Net.box takes sync into account.
 box.schema.user.grant('guest', 'super')
  | ---
@@ -553,6 +561,82 @@ box.space.sync:select{7}
  | - - [7]
  | ...
 
+--
+-- gh-4849: clear synchro queue on a replica
+--
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 2}
+ | ---
+ | ...
+f1 = fiber.create(box.space.sync.replace, box.space.sync, {9})
+ | ---
+ | ...
+f1:status()
+ | ---
+ | - suspended
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.ctl.clear_synchro_queue()
+ | ---
+ | - 0
+ | ...
+box.space.sync:select{9}
+ | ---
+ | - []
+ | ...
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{9}
+ | ---
+ | - []
+ | ...
+f1:status()
+ | ---
+ | - dead
+ | ...
+
+--
+-- gh-4849: clear synchro queue on a master
+--
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 2}
+ | ---
+ | ...
+f1 = fiber.create(box.space.sync.replace, box.space.sync, {10})
+ | ---
+ | ...
+f1:status()
+ | ---
+ | - suspended
+ | ...
+box.ctl.clear_synchro_queue()
+ | ---
+ | - -2
+ | ...
+box.space.sync:select{10}
+ | ---
+ | - - [10]
+ | ...
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+box.space.sync:select{10}
+ | ---
+ | - - [10]
+ | ...
+
 -- Cleanup.
 test_run:cmd('switch default')
  | ---
@@ -576,6 +660,7 @@ test_run:cmd('delete server replica')
  | ...
 box.space.test:drop()
  | ---
+ | - error: A rollback for a synchronous transaction is received
  | ...
 box.space.sync:drop()
  | ---
diff --git a/test/replication/qsync_basic.test.lua b/test/replication/qsync_basic.test.lua
index d2df7a4fb..084ad4abb 100644
--- a/test/replication/qsync_basic.test.lua
+++ b/test/replication/qsync_basic.test.lua
@@ -13,6 +13,11 @@ s1:select{}
 s2 = box.schema.create_space('test2')
 s2.is_sync
 
+--
+-- gh-4849: clear synchro queue with unconfigured box
+--
+box.ctl.clear_synchro_queue()
+
 -- Net.box takes sync into account.
 box.schema.user.grant('guest', 'super')
 netbox = require('net.box')
@@ -222,6 +227,32 @@ assert(newlsn >= oldlsn + 2)
 test_run:switch('replica')
 box.space.sync:select{7}
 
+--
+-- gh-4849: clear synchro queue on a replica
+--
+test_run:switch('default')
+box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 2}
+f1 = fiber.create(box.space.sync.replace, box.space.sync, {9})
+f1:status()
+test_run:switch('replica')
+box.ctl.clear_synchro_queue()
+box.space.sync:select{9}
+test_run:switch('default')
+box.space.sync:select{9}
+f1:status()
+
+--
+-- gh-4849: clear synchro queue on a master
+--
+test_run:switch('default')
+box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 2}
+f1 = fiber.create(box.space.sync.replace, box.space.sync, {10})
+f1:status()
+box.ctl.clear_synchro_queue()
+box.space.sync:select{10}
+test_run:switch('replica')
+box.space.sync:select{10}
+
 -- Cleanup.
 test_run:cmd('switch default')
 
-- 
2.26.2

^ permalink raw reply	[flat|nested] 18+ messages in thread

* [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion
       [not found] <cover.1594314820.git.sergeyb@tarantool.org>
  2020-07-09 17:16 ` [Tarantool-patches] [PATCH 1/3] replication: print number of txs in limbo before its clear sergeyb
  2020-07-09 17:16 ` [Tarantool-patches] [PATCH 2/3] replication: test clear_synchro_queue function sergeyb
@ 2020-07-09 17:16 ` sergeyb
  2020-07-09 20:07   ` Vladislav Shpilevoy
  2020-07-20 22:01   ` Vladislav Shpilevoy
  2 siblings, 2 replies; 18+ messages in thread
From: sergeyb @ 2020-07-09 17:16 UTC (permalink / raw)
  To: tarantool-patches, v.shpilevoy, sergepetrenko, gorcunov, lvasiliev

From: Sergey Bronnikov <sergeyb@tarantool.org>

Part of #5055
---
 test/replication/qsync.lua                    |  62 ++++++++++
 test/replication/qsync1.lua                   |   1 +
 test/replication/qsync10.lua                  |   1 +
 test/replication/qsync11.lua                  |   1 +
 test/replication/qsync12.lua                  |   1 +
 test/replication/qsync13.lua                  |   1 +
 test/replication/qsync14.lua                  |   1 +
 test/replication/qsync15.lua                  |   1 +
 test/replication/qsync16.lua                  |   1 +
 test/replication/qsync17.lua                  |   1 +
 test/replication/qsync18.lua                  |   1 +
 test/replication/qsync19.lua                  |   1 +
 test/replication/qsync2.lua                   |   1 +
 test/replication/qsync20.lua                  |   1 +
 test/replication/qsync21.lua                  |   1 +
 test/replication/qsync22.lua                  |   1 +
 test/replication/qsync23.lua                  |   1 +
 test/replication/qsync24.lua                  |   1 +
 test/replication/qsync25.lua                  |   1 +
 test/replication/qsync26.lua                  |   1 +
 test/replication/qsync27.lua                  |   1 +
 test/replication/qsync28.lua                  |   1 +
 test/replication/qsync29.lua                  |   1 +
 test/replication/qsync3.lua                   |   1 +
 test/replication/qsync30.lua                  |   1 +
 test/replication/qsync31.lua                  |   1 +
 test/replication/qsync4.lua                   |   1 +
 test/replication/qsync5.lua                   |   1 +
 test/replication/qsync6.lua                   |   1 +
 test/replication/qsync7.lua                   |   1 +
 test/replication/qsync8.lua                   |   1 +
 test/replication/qsync9.lua                   |   1 +
 test/replication/qsync_random_leader.result   | 117 ++++++++++++++++++
 test/replication/qsync_random_leader.test.lua |  52 ++++++++
 34 files changed, 262 insertions(+)
 create mode 100644 test/replication/qsync.lua
 create mode 120000 test/replication/qsync1.lua
 create mode 120000 test/replication/qsync10.lua
 create mode 120000 test/replication/qsync11.lua
 create mode 120000 test/replication/qsync12.lua
 create mode 120000 test/replication/qsync13.lua
 create mode 120000 test/replication/qsync14.lua
 create mode 120000 test/replication/qsync15.lua
 create mode 120000 test/replication/qsync16.lua
 create mode 120000 test/replication/qsync17.lua
 create mode 120000 test/replication/qsync18.lua
 create mode 120000 test/replication/qsync19.lua
 create mode 120000 test/replication/qsync2.lua
 create mode 120000 test/replication/qsync20.lua
 create mode 120000 test/replication/qsync21.lua
 create mode 120000 test/replication/qsync22.lua
 create mode 120000 test/replication/qsync23.lua
 create mode 120000 test/replication/qsync24.lua
 create mode 120000 test/replication/qsync25.lua
 create mode 120000 test/replication/qsync26.lua
 create mode 120000 test/replication/qsync27.lua
 create mode 120000 test/replication/qsync28.lua
 create mode 120000 test/replication/qsync29.lua
 create mode 120000 test/replication/qsync3.lua
 create mode 120000 test/replication/qsync30.lua
 create mode 120000 test/replication/qsync31.lua
 create mode 120000 test/replication/qsync4.lua
 create mode 120000 test/replication/qsync5.lua
 create mode 120000 test/replication/qsync6.lua
 create mode 120000 test/replication/qsync7.lua
 create mode 120000 test/replication/qsync8.lua
 create mode 120000 test/replication/qsync9.lua
 create mode 100644 test/replication/qsync_random_leader.result
 create mode 100644 test/replication/qsync_random_leader.test.lua

diff --git a/test/replication/qsync.lua b/test/replication/qsync.lua
new file mode 100644
index 000000000..383aa5272
--- /dev/null
+++ b/test/replication/qsync.lua
@@ -0,0 +1,62 @@
+#!/usr/bin/env tarantool
+
+-- get instance name from filename (qsync1.lua => qsync1)
+local INSTANCE_ID = string.match(arg[0], "%d")
+
+local SOCKET_DIR = require('fio').cwd()
+
+local TIMEOUT = tonumber(arg[1])
+
+local function instance_uri(instance_id)
+    return SOCKET_DIR..'/qsync'..instance_id..'.sock';
+end
+
+-- start console first
+require('console').listen(os.getenv('ADMIN'))
+
+box.cfg({
+    listen = instance_uri(INSTANCE_ID);
+    replication_timeout = TIMEOUT;
+    replication_sync_lag = 0.01;
+    replication_connect_quorum = 3;
+    replication = {
+        instance_uri(1);
+        instance_uri(2);
+        instance_uri(3);
+        instance_uri(4);
+        instance_uri(5);
+        instance_uri(6);
+        instance_uri(7);
+        instance_uri(8);
+        instance_uri(9);
+        instance_uri(10);
+        instance_uri(11);
+        instance_uri(12);
+        instance_uri(13);
+        instance_uri(14);
+        instance_uri(15);
+        instance_uri(16);
+        instance_uri(17);
+        instance_uri(18);
+        instance_uri(19);
+        instance_uri(20);
+        instance_uri(21);
+        instance_uri(22);
+        instance_uri(23);
+        instance_uri(24);
+        instance_uri(25);
+        instance_uri(26);
+        instance_uri(27);
+        instance_uri(28);
+        instance_uri(29);
+        instance_uri(30);
+        instance_uri(31);
+    };
+})
+
+box.once("bootstrap", function()
+    local test_run = require('test_run').new()
+    box.schema.user.grant("guest", 'replication')
+    box.schema.space.create('test', {engine = test_run:get_cfg('engine')})
+    box.space.test:create_index('primary')
+end)
diff --git a/test/replication/qsync1.lua b/test/replication/qsync1.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync1.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync10.lua b/test/replication/qsync10.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync10.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync11.lua b/test/replication/qsync11.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync11.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync12.lua b/test/replication/qsync12.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync12.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync13.lua b/test/replication/qsync13.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync13.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync14.lua b/test/replication/qsync14.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync14.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync15.lua b/test/replication/qsync15.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync15.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync16.lua b/test/replication/qsync16.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync16.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync17.lua b/test/replication/qsync17.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync17.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync18.lua b/test/replication/qsync18.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync18.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync19.lua b/test/replication/qsync19.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync19.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync2.lua b/test/replication/qsync2.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync2.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync20.lua b/test/replication/qsync20.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync20.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync21.lua b/test/replication/qsync21.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync21.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync22.lua b/test/replication/qsync22.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync22.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync23.lua b/test/replication/qsync23.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync23.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync24.lua b/test/replication/qsync24.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync24.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync25.lua b/test/replication/qsync25.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync25.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync26.lua b/test/replication/qsync26.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync26.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync27.lua b/test/replication/qsync27.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync27.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync28.lua b/test/replication/qsync28.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync28.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync29.lua b/test/replication/qsync29.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync29.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync3.lua b/test/replication/qsync3.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync3.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync30.lua b/test/replication/qsync30.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync30.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync31.lua b/test/replication/qsync31.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync31.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync4.lua b/test/replication/qsync4.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync4.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync5.lua b/test/replication/qsync5.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync5.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync6.lua b/test/replication/qsync6.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync6.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync7.lua b/test/replication/qsync7.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync7.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync8.lua b/test/replication/qsync8.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync8.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync9.lua b/test/replication/qsync9.lua
new file mode 120000
index 000000000..df9f3a883
--- /dev/null
+++ b/test/replication/qsync9.lua
@@ -0,0 +1 @@
+qsync.lua
\ No newline at end of file
diff --git a/test/replication/qsync_random_leader.result b/test/replication/qsync_random_leader.result
new file mode 100644
index 000000000..5d0c5d18f
--- /dev/null
+++ b/test/replication/qsync_random_leader.result
@@ -0,0 +1,117 @@
+-- test-run result file version 2
+env = require('test_run')
+ | ---
+ | ...
+math = require('math')
+ | ---
+ | ...
+fiber = require('fiber')
+ | ---
+ | ...
+test_run = env.new()
+ | ---
+ | ...
+engine = test_run:get_cfg('engine')
+ | ---
+ | ...
+
+MAX_REPLICAS = 3
+ | ---
+ | ...
+NUM_INSTANCES = MAX_REPLICAS + 1
+ | ---
+ | ...
+BROKEN_QUORUM = NUM_INSTANCES + 1
+ | ---
+ | ...
+
+SERVERS = {}
+ | ---
+ | ...
+test_run:cmd("setopt delimiter ';'")
+ | ---
+ | - true
+ | ...
+for i=1,MAX_REPLICAS do
+    SERVERS[i] = 'qsync' .. i
+end;
+ | ---
+ | ...
+test_run:cmd("setopt delimiter ''");
+ | ---
+ | - true
+ | ...
+SERVERS -- print instance names
+ | ---
+ | - - qsync1
+ |   - qsync2
+ |   - qsync3
+ | ...
+
+test_run:create_cluster(SERVERS, "replication", {args="0.1"})
+ | ---
+ | ...
+test_run:wait_fullmesh(SERVERS)
+ | ---
+ | ...
+box.schema.user.grant('guest', 'replication')
+ | ---
+ | ...
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+ | ---
+ | ...
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+
+-- Testcase body.
+current_leader = 'default'
+ | ---
+ | ...
+for i=1,100 do                                               \
+    new_leader_id = math.random(1, #SERVERS)                 \
+    new_leader = SERVERS[new_leader_id]                      \
+    test_run:switch(new_leader)                              \
+    box.cfg{read_only=false}                                 \
+    fiber = require('fiber')                                 \
+    f = fiber.create(function() box.space.sync:delete{} end) \
+    f = fiber.create(function() for i=1,10 do box.space.sync:insert{i} end end) \
+    f.status()                                               \
+    test_run:switch('default')                               \
+    test_run:switch(current_leader)                          \
+    box.cfg{read_only=true}                                  \
+    test_run:switch('default')                               \
+    current_leader = new_leader                              \
+    fiber.sleep(0.1)                                         \
+end
+ | ---
+ | ...
+
+-- Teardown.
+test_run:switch(current_leader)
+ | ---
+ | - true
+ | ...
+box.cfg{read_only=true} -- demote master to replica
+ | ---
+ | ...
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+box.cfg{read_only=false} -- promote replica to master
+ | ---
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+test_run:drop_cluster(SERVERS)
+ | ---
+ | ...
diff --git a/test/replication/qsync_random_leader.test.lua b/test/replication/qsync_random_leader.test.lua
new file mode 100644
index 000000000..3bf534e23
--- /dev/null
+++ b/test/replication/qsync_random_leader.test.lua
@@ -0,0 +1,52 @@
+env = require('test_run')
+math = require('math')
+fiber = require('fiber')
+test_run = env.new()
+engine = test_run:get_cfg('engine')
+
+MAX_REPLICAS = 3
+NUM_INSTANCES = MAX_REPLICAS + 1
+BROKEN_QUORUM = NUM_INSTANCES + 1
+
+SERVERS = {}
+test_run:cmd("setopt delimiter ';'")
+for i=1,MAX_REPLICAS do
+    SERVERS[i] = 'qsync' .. i
+end;
+test_run:cmd("setopt delimiter ''");
+SERVERS -- print instance names
+
+test_run:create_cluster(SERVERS, "replication", {args="0.1"})
+test_run:wait_fullmesh(SERVERS)
+box.schema.user.grant('guest', 'replication')
+test_run:switch('default')
+box.cfg{replication_synchro_quorum=NUM_INSTANCES, replication_synchro_timeout=0.1}
+_ = box.schema.space.create('sync', {is_sync=true, engine=engine})
+_ = box.space.sync:create_index('pk')
+
+-- Testcase body.
+current_leader = 'default'
+for i=1,100 do                                               \
+    new_leader_id = math.random(1, #SERVERS)                 \
+    new_leader = SERVERS[new_leader_id]                      \
+    test_run:switch(new_leader)                              \
+    box.cfg{read_only=false}                                 \
+    fiber = require('fiber')                                 \
+    f = fiber.create(function() box.space.sync:delete{} end) \
+    f = fiber.create(function() for i=1,10 do box.space.sync:insert{i} end end) \
+    f.status()                                               \
+    test_run:switch('default')                               \
+    test_run:switch(current_leader)                          \
+    box.cfg{read_only=true}                                  \
+    test_run:switch('default')                               \
+    current_leader = new_leader                              \
+    fiber.sleep(0.1)                                         \
+end
+
+-- Teardown.
+test_run:switch(current_leader)
+box.cfg{read_only=true} -- demote master to replica
+test_run:switch('default')
+box.cfg{read_only=false} -- promote replica to master
+box.space.sync:drop()
+test_run:drop_cluster(SERVERS)
-- 
2.26.2

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion
  2020-07-09 17:16 ` [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion sergeyb
@ 2020-07-09 20:07   ` Vladislav Shpilevoy
  2020-07-10  8:05     ` Sergey Bronnikov
  2020-07-20 22:01   ` Vladislav Shpilevoy
  1 sibling, 1 reply; 18+ messages in thread
From: Vladislav Shpilevoy @ 2020-07-09 20:07 UTC (permalink / raw)
  To: sergeyb, tarantool-patches, sergepetrenko, gorcunov, lvasiliev

Hi!

On 09/07/2020 19:16, sergeyb@tarantool.org wrote:
> From: Sergey Bronnikov <sergeyb@tarantool.org>
> 
> Part of #5055
> ---
>  test/replication/qsync.lua                    |  62 ++++++++++
>  test/replication/qsync1.lua                   |   1 +
>  test/replication/qsync10.lua                  |   1 +
>  test/replication/qsync11.lua                  |   1 +
>  test/replication/qsync12.lua                  |   1 +
>  test/replication/qsync13.lua                  |   1 +
>  test/replication/qsync14.lua                  |   1 +
>  test/replication/qsync15.lua                  |   1 +
>  test/replication/qsync16.lua                  |   1 +
>  test/replication/qsync17.lua                  |   1 +
>  test/replication/qsync18.lua                  |   1 +
>  test/replication/qsync19.lua                  |   1 +
>  test/replication/qsync2.lua                   |   1 +
>  test/replication/qsync20.lua                  |   1 +
>  test/replication/qsync21.lua                  |   1 +
>  test/replication/qsync22.lua                  |   1 +
>  test/replication/qsync23.lua                  |   1 +
>  test/replication/qsync24.lua                  |   1 +
>  test/replication/qsync25.lua                  |   1 +
>  test/replication/qsync26.lua                  |   1 +
>  test/replication/qsync27.lua                  |   1 +
>  test/replication/qsync28.lua                  |   1 +
>  test/replication/qsync29.lua                  |   1 +
>  test/replication/qsync3.lua                   |   1 +
>  test/replication/qsync30.lua                  |   1 +
>  test/replication/qsync31.lua                  |   1 +
>  test/replication/qsync4.lua                   |   1 +
>  test/replication/qsync5.lua                   |   1 +
>  test/replication/qsync6.lua                   |   1 +
>  test/replication/qsync7.lua                   |   1 +
>  test/replication/qsync8.lua                   |   1 +
>  test/replication/qsync9.lua                   |   1 +

I have not looked at the patch properly yet, but honestly, this is overkill.
Please do not start 30 replicas; 5 at most is enough, with random jumps
between them. I cannot imagine how to debug such a test if something goes
wrong, nor how long it takes to run.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 1/3] replication: print number of txs in limbo before its clear
  2020-07-09 17:16 ` [Tarantool-patches] [PATCH 1/3] replication: print number of txs in limbo before its clear sergeyb
@ 2020-07-09 20:07   ` Vladislav Shpilevoy
  2020-07-10 12:55     ` Sergey Bronnikov
  2020-07-20 21:59   ` Vladislav Shpilevoy
  1 sibling, 1 reply; 18+ messages in thread
From: Vladislav Shpilevoy @ 2020-07-09 20:07 UTC (permalink / raw)
  To: sergeyb, tarantool-patches, sergepetrenko, gorcunov, lvasiliev

Hi!

I do not see the patches on the branch.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion
  2020-07-09 20:07   ` Vladislav Shpilevoy
@ 2020-07-10  8:05     ` Sergey Bronnikov
  0 siblings, 0 replies; 18+ messages in thread
From: Sergey Bronnikov @ 2020-07-10  8:05 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches

On 22:07 Thu 09 Jul , Vladislav Shpilevoy wrote:
> Hi!
> 
> On 09/07/2020 19:16, sergeyb@tarantool.org wrote:
> > From: Sergey Bronnikov <sergeyb@tarantool.org>
> > 
> > Part of #5055
> > ---
> >  test/replication/qsync.lua                    |  62 ++++++++++
> >  test/replication/qsync1.lua                   |   1 +
> >  test/replication/qsync10.lua                  |   1 +
> >  test/replication/qsync11.lua                  |   1 +
> >  test/replication/qsync12.lua                  |   1 +
> >  test/replication/qsync13.lua                  |   1 +
> >  test/replication/qsync14.lua                  |   1 +
> >  test/replication/qsync15.lua                  |   1 +
> >  test/replication/qsync16.lua                  |   1 +
> >  test/replication/qsync17.lua                  |   1 +
> >  test/replication/qsync18.lua                  |   1 +
> >  test/replication/qsync19.lua                  |   1 +
> >  test/replication/qsync2.lua                   |   1 +
> >  test/replication/qsync20.lua                  |   1 +
> >  test/replication/qsync21.lua                  |   1 +
> >  test/replication/qsync22.lua                  |   1 +
> >  test/replication/qsync23.lua                  |   1 +
> >  test/replication/qsync24.lua                  |   1 +
> >  test/replication/qsync25.lua                  |   1 +
> >  test/replication/qsync26.lua                  |   1 +
> >  test/replication/qsync27.lua                  |   1 +
> >  test/replication/qsync28.lua                  |   1 +
> >  test/replication/qsync29.lua                  |   1 +
> >  test/replication/qsync3.lua                   |   1 +
> >  test/replication/qsync30.lua                  |   1 +
> >  test/replication/qsync31.lua                  |   1 +
> >  test/replication/qsync4.lua                   |   1 +
> >  test/replication/qsync5.lua                   |   1 +
> >  test/replication/qsync6.lua                   |   1 +
> >  test/replication/qsync7.lua                   |   1 +
> >  test/replication/qsync8.lua                   |   1 +
> >  test/replication/qsync9.lua                   |   1 +
> 
> I have not looked at the patch properly yet, but honestly, this is overkill.
> Please do not start 30 replicas; 5 at most is enough, with random jumps
> between them. I cannot imagine how to debug such a test if something goes
> wrong, nor how long it takes to run.

We can have at most VCLOCK_MAX replicas, which is currently 30.
Initially I wanted to make a test that at least checks that synchronous
replication works with the maximum number of replicas the same way as with
2 or 3. That is why I made a separate config to bring up such a cluster.

The test with random promote/demote will then use 5 instances.

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 1/3] replication: print number of txs in limbo before its clear
  2020-07-09 20:07   ` Vladislav Shpilevoy
@ 2020-07-10 12:55     ` Sergey Bronnikov
  0 siblings, 0 replies; 18+ messages in thread
From: Sergey Bronnikov @ 2020-07-10 12:55 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches

On 22:07 Thu 09 Jul , Vladislav Shpilevoy wrote:
> Hi!
> 
> I do not see the patches on the branch.

I hesitated to put this onto the common branch.
For now both tests (box.ctl.clear_synchro_queue() and random leader) live
here: ligurio/gh-4842-qsync-testing

GH branch: https://github.com/tarantool/tarantool/tree/ligurio/gh-4842-qsync-testing
CI: https://gitlab.com/tarantool/tarantool/-/pipelines/165241959

-- 
sergeyb@

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 1/3] replication: print number of txs in limbo before its clear
  2020-07-09 17:16 ` [Tarantool-patches] [PATCH 1/3] replication: print number of txs in limbo before its clear sergeyb
  2020-07-09 20:07   ` Vladislav Shpilevoy
@ 2020-07-20 21:59   ` Vladislav Shpilevoy
  1 sibling, 0 replies; 18+ messages in thread
From: Vladislav Shpilevoy @ 2020-07-20 21:59 UTC (permalink / raw)
  To: sergeyb, tarantool-patches, sergepetrenko, gorcunov, lvasiliev

Hi! Thanks for the patch!

On 09.07.2020 19:16, sergeyb@tarantool.org wrote:
> From: Sergey Bronnikov <sergeyb@tarantool.org>
> 
> ---
>  src/box/box.cc    | 10 ++++++----
>  src/box/box.h     |  2 +-
>  src/box/lua/ctl.c |  5 +++--
>  3 files changed, 10 insertions(+), 7 deletions(-)
> 
> diff --git a/src/box/box.cc b/src/box/box.cc
> index 749c96ca1..a400ed551 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -946,15 +946,15 @@ box_set_replication_anon(void)
>  
>  }
>  
> -void
> +int
>  box_clear_synchro_queue(void)

There are 3 reasons why it won't/doesn't work:

- Here you count not the transactions but the replicas: 'len' is not the
  queue size in this function.

- Functions should set an error in the diagnostics area. Here you
  silently return -1, -2. Moreover, in Lua we never use 0/-1 notation.

- I don't think I want this exposed to users. And there is no way to
  change this function without affecting the users.

Also, there are ways to check that the queue was actually cleared (or not),
depending on what exactly you want to test. See my comments on the other
commits.

As for other purposes, such as statistics: yes, we can try to think of
something. The queue size would be a useful stat, e.g. to see whether it
grows too fast. But it should not be available only when the queue is
cleared; it should be constantly available somewhere in box.info.

You can provide your ideas here:
https://github.com/tarantool/tarantool/issues/5191
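
Just to sketch the idea (illustration only; the 'box.info.synchro.queue.len'
field below is assumed and does not exist yet), a constantly available counter
could be polled by a monitoring fiber instead of being returned from the
queue cleanup:

    fiber = require('fiber')
    log = require('log')
    -- Hypothetical monitoring loop; logs the limbo size once a second.
    _ = fiber.create(function()
        while true do
            local synchro = box.info.synchro
            if synchro ~= nil and synchro.queue ~= nil then
                log.info('synchro queue size: %d', synchro.queue.len)
            end
            fiber.sleep(1)
        end
    end)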

>  {
>  	if (!is_box_configured || txn_limbo_is_empty(&txn_limbo))
> -		return;
> +		return -1;
>  	uint32_t former_leader_id = txn_limbo.instance_id;
>  	assert(former_leader_id != REPLICA_ID_NIL);
>  	if (former_leader_id == instance_id)
> -		return;
> +		return -2;
>  
>  	/* Wait until pending confirmations/rollbacks reach us. */
>  	double timeout = 2 * txn_limbo_confirm_timeout(&txn_limbo);
> @@ -965,9 +965,9 @@ box_clear_synchro_queue(void)
>  		fiber_sleep(0.001);
>  	}
>  
> +	int len = 0;
>  	if (!txn_limbo_is_empty(&txn_limbo)) {
>  		int64_t lsns[VCLOCK_MAX];
> -		int len = 0;
>  		const struct vclock  *vclock;
>  		replicaset_foreach(replica) {
>  			if (replica->relay != NULL &&
> @@ -993,6 +993,8 @@ box_clear_synchro_queue(void)
>  		txn_limbo_force_empty(&txn_limbo, confirm_lsn);
>  		assert(txn_limbo_is_empty(&txn_limbo));
>  	}
> +
> +	return len;
>  }
>  
>  void
> diff --git a/src/box/box.h b/src/box/box.h
> index 5c4a5ed78..608e26b83 100644
> --- a/src/box/box.h
> +++ b/src/box/box.h
> @@ -258,7 +258,7 @@ extern "C" {
>  
>  typedef struct tuple box_tuple_t;
>  
> -void box_clear_synchro_queue(void);
> +int box_clear_synchro_queue(void);
>  
>  /* box_select is private and used only by FFI */
>  API_EXPORT int
> diff --git a/src/box/lua/ctl.c b/src/box/lua/ctl.c
> index 2017ddc18..c3cf44f0c 100644
> --- a/src/box/lua/ctl.c
> +++ b/src/box/lua/ctl.c
> @@ -82,8 +82,9 @@ static int
>  lbox_ctl_clear_synchro_queue(struct lua_State *L)
>  {
>  	(void) L;
> -	box_clear_synchro_queue();
> -	return 0;
> +	int len = box_clear_synchro_queue();
> +	lua_pushinteger(L, len);
> +	return 1;
>  }
>  
>  static const struct luaL_Reg lbox_ctl_lib[] = {
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 2/3] replication: test clear_synchro_queue function
  2020-07-09 17:16 ` [Tarantool-patches] [PATCH 2/3] replication: test clear_synchro_queue function sergeyb
@ 2020-07-20 22:00   ` Vladislav Shpilevoy
  2020-08-25 12:49     ` Sergey Bronnikov
  0 siblings, 1 reply; 18+ messages in thread
From: Vladislav Shpilevoy @ 2020-07-20 22:00 UTC (permalink / raw)
  To: sergeyb, tarantool-patches, sergepetrenko, gorcunov, lvasiliev

Thanks for the patch!

See 8 comments below.

On 09.07.2020 19:16, sergeyb@tarantool.org wrote:
> From: Sergey Bronnikov <sergeyb@tarantool.org>
> 
> Part of #5055
> Part of #4849
> ---
>  test/replication/qsync_basic.result   | 85 +++++++++++++++++++++++++++
>  test/replication/qsync_basic.test.lua | 31 ++++++++++
>  2 files changed, 116 insertions(+)
> 
> diff --git a/test/replication/qsync_basic.result b/test/replication/qsync_basic.result
> index ab4be0c7e..464df75a7 100644
> --- a/test/replication/qsync_basic.result
> +++ b/test/replication/qsync_basic.result
> @@ -32,6 +32,14 @@ s2.is_sync
>   | - false
>   | ...
>  
> +--
> +-- gh-4849: clear synchro queue with unconfigured box
> +--
> +box.ctl.clear_synchro_queue()

1. Enough to test that it does not crash here. No need to
check result.

> + | ---
> + | - -1
> + | ...
> +
>  -- Net.box takes sync into account.
>  box.schema.user.grant('guest', 'super')
>   | ---
> @@ -553,6 +561,82 @@ box.space.sync:select{7}
>   | - - [7]
>   | ...
>  
> +--
> +-- gh-4849: clear synchro queue on a replica
> +--
> +test_run:switch('default')
> + | ---
> + | - true
> + | ...
> +box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 2}

2. If you want this timeout to fail, it is too big. The test will be too
long. If it is supposed not to fail, then it is way too small. If you want
it to fail, it should be something like 0.001. If you want it to hold, it
should be 1000 to be sure. Keep in mind that you can change the timeout on
the fly. That allows building quite complex reactive tests, completely event-based.
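
For example, the reactive shape could be (just a sketch, reusing the 'sync'
space of this test): hold the entry in the limbo with a huge timeout, and
shrink the timeout right before the step that is supposed to fail, instead of
sleeping for a fixed time.

    box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 1000}
    -- ... start the synchronous write and run the checks that need it to
    -- stay in the limbo ...
    box.cfg{replication_synchro_timeout = 0.001}
    -- ... now the step that waits on the queue (e.g. clear_synchro_queue())
    -- times out quickly instead of hanging ...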

> + | ---
> + | ...
> +f1 = fiber.create(box.space.sync.replace, box.space.sync, {9})

3. You need to extract the exact result value. Use pcall for that, and print
its result after the fiber is dead. See other examples with fiber.create() in
qsync test suite. The same for the next test case.
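
I.e. something along these lines (a sketch; the space and tuple match this
test, the exact values do not matter):

    ok, err = nil, nil
    f = fiber.create(function() ok, err = pcall(box.space.sync.replace, box.space.sync, {9}) end)
    -- ... the test body ...
    test_run:wait_cond(function() return f:status() == 'dead' end)
    ok, err -- the exact result of the replace, checked after the fiber is dead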

> + | ---
> + | ...
> +f1:status()
> + | ---
> + | - suspended
> + | ...
> +test_run:switch('replica')
> + | ---
> + | - true

4. Better wait until the value is delivered. Otherwise you can switch to
replica before master finishes WAL write, and the queue will be empty here.
Try

    test_run:wait_cond(function() return box.space.sync:get{9} ~= nil end)

> + | ...
> +box.ctl.clear_synchro_queue()
> + | ---
> + | - 0
> + | ...
> +box.space.sync:select{9}
> + | ---
> + | - []

5. If select returns 9 before queue clean, and returns empty afterwards,
it means the queue was cleared. So here the queue size is not really needed,
as you can see.

> + | ...
> +test_run:switch('default')
> + | ---
> + | - true
> + | ...
> +box.space.sync:select{9}
> + | ---
> + | - []
> + | ...
> +f1:status()
> + | ---
> + | - dead
> + | ...
> +
> +--
> +-- gh-4849: clear synchro queue on a master

6. Since the previous test the replica's WAL is different from master's.
I am not sure the replication is still alive.
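
If needed, the test can first make sure the link is alive, e.g. (a sketch;
assumes the master has replica id 1 from the replica's point of view):

    test_run:switch('replica')
    test_run:wait_cond(function() return box.info.replication[1].upstream.status == 'follow' end)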

> +--
> +test_run:switch('default')
> + | ---
> + | - true
> + | ...
> +box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 2}
> + | ---
> + | ...
> +f1 = fiber.create(box.space.sync.replace, box.space.sync, {10})
> + | ---
> + | ...
> +f1:status()
> + | ---
> + | - suspended
> + | ...
> +box.ctl.clear_synchro_queue()
> + | ---
> + | - -2
> + | ...
> +box.space.sync:select{10}
> + | ---
> + | - - [10]

7. Why is it 10? The quorum is not reached, it should have been rolled back.

> + | ...
> +test_run:switch('replica')
> + | ---
> + | - true
> + | ...
> +box.space.sync:select{10}
> + | ---
> + | - - [10]
> + | ...
> +
>  -- Cleanup.
>  test_run:cmd('switch default')
>   | ---
> @@ -576,6 +660,7 @@ test_run:cmd('delete server replica')
>   | ...
>  box.space.test:drop()
>   | ---
> + | - error: A rollback for a synchronous transaction is received

8. Why is it changed?

>   | ...
>  box.space.sync:drop()
>   | ---

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion
  2020-07-09 17:16 ` [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion sergeyb
  2020-07-09 20:07   ` Vladislav Shpilevoy
@ 2020-07-20 22:01   ` Vladislav Shpilevoy
  2020-08-26 14:45     ` Sergey Bronnikov
  1 sibling, 1 reply; 18+ messages in thread
From: Vladislav Shpilevoy @ 2020-07-20 22:01 UTC (permalink / raw)
  To: sergeyb, tarantool-patches, sergepetrenko, gorcunov, lvasiliev

Thanks for the patch!

See 11 comments below.

> diff --git a/test/replication/qsync.lua b/test/replication/qsync.lua
> new file mode 100644
> index 000000000..383aa5272
> --- /dev/null
> +++ b/test/replication/qsync.lua
> @@ -0,0 +1,62 @@
> +#!/usr/bin/env tarantool
> +
> +-- get instance name from filename (qsync1.lua => qsync1)
> +local INSTANCE_ID = string.match(arg[0], "%d")
> +
> +local SOCKET_DIR = require('fio').cwd()
> +
> +local TIMEOUT = tonumber(arg[1])
> +
> +local function instance_uri(instance_id)
> +    return SOCKET_DIR..'/qsync'..instance_id..'.sock';
> +end
> +
> +-- start console first
> +require('console').listen(os.getenv('ADMIN'))
> +
> +box.cfg({
> +    listen = instance_uri(INSTANCE_ID);
> +    replication_timeout = TIMEOUT;

1. Why do you need the custom replication_timeout?

> +    replication_sync_lag = 0.01;

2. Why do you need the lag setting?

> +    replication_connect_quorum = 3;
> +    replication = {
> +        instance_uri(1);
> +        instance_uri(2);
> +        instance_uri(3);
> +        instance_uri(4);
> +        instance_uri(5);
> +        instance_uri(6);
> +        instance_uri(7);
> +        instance_uri(8);
> +        instance_uri(9);
> +        instance_uri(10);
> +        instance_uri(11);
> +        instance_uri(12);
> +        instance_uri(13);
> +        instance_uri(14);
> +        instance_uri(15);
> +        instance_uri(16);
> +        instance_uri(17);
> +        instance_uri(18);
> +        instance_uri(19);
> +        instance_uri(20);
> +        instance_uri(21);
> +        instance_uri(22);
> +        instance_uri(23);
> +        instance_uri(24);
> +        instance_uri(25);
> +        instance_uri(26);
> +        instance_uri(27);
> +        instance_uri(28);
> +        instance_uri(29);
> +        instance_uri(30);
> +        instance_uri(31);

3. Seems like in the test you use only 3 instances, not 32. Also the
quorum is set to 3.

> +    };
> +})
> +
> +box.once("bootstrap", function()
> +    local test_run = require('test_run').new()
> +    box.schema.user.grant("guest", 'replication')
> +    box.schema.space.create('test', {engine = test_run:get_cfg('engine')})
> +    box.space.test:create_index('primary')

4. Where do you use this space?

> +end)
> diff --git a/test/replication/qsync_random_leader.result b/test/replication/qsync_random_leader.result
> new file mode 100644
> index 000000000..cb1b5e232
> --- /dev/null
> +++ b/test/replication/qsync_random_leader.result
> @@ -0,0 +1,123 @@
> +-- test-run result file version 2
> +os = require('os')
> + | ---
> + | ...
> +env = require('test_run')
> + | ---
> + | ...
> +math = require('math')
> + | ---
> + | ...
> +fiber = require('fiber')
> + | ---
> + | ...
> +test_run = env.new()
> + | ---
> + | ...
> +engine = test_run:get_cfg('engine')
> + | ---
> + | ...
> +
> +NUM_INSTANCES = 3
> + | ---
> + | ...
> +BROKEN_QUORUM = NUM_INSTANCES + 1
> + | ---
> + | ...
> +
> +SERVERS = {}
> + | ---
> + | ...
> +test_run:cmd("setopt delimiter ';'")
> + | ---
> + | - true
> + | ...
> +for i=1,NUM_INSTANCES do
> +    SERVERS[i] = 'qsync' .. i
> +end;
> + | ---
> + | ...
> +test_run:cmd("setopt delimiter ''");

5. Please, let's be consistent and use either \ or the delimiter. Currently
it is inconsistent: you use \ for big code blocks and a custom delimiter for
tiny blocks which could even be one-liners. Personally, I would use \
everywhere.
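
For instance, this particular block does not need a delimiter at all; it fits
into a single statement (a sketch):

    SERVERS = {}
    for i = 1, NUM_INSTANCES do SERVERS[i] = 'qsync' .. i end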

> + | ---
> + | - true
> + | ...
> +SERVERS -- print instance names
> + | ---
> + | - - qsync1
> + |   - qsync2
> + |   - qsync3
> + | ...
> +
> +random = function(excluded_num, min, max)       \

6. It would be better to align all \ at column 80 in this file. That makes it
easier to add new, longer lines in the future without moving all the old \.

> +    math.randomseed(os.time())                  \
> +    local r = math.random(min, max)             \
> +    if (r == excluded_num) then                 \
> +        return random(excluded_num, min, max)   \
> +    end                                         \
> +    return r                                    \
> +end
> + | ---
> + | ...
> +
> +test_run:create_cluster(SERVERS, "replication", {args="0.1"})
> + | ---
> + | ...
> +test_run:wait_fullmesh(SERVERS)
> + | ---
> + | ...
> +current_leader_id = 1
> + | ---
> + | ...
> +test_run:switch(SERVERS[current_leader_id])
> + | ---
> + | - true
> + | ...
> +box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.1}

7. The timeout is tiny. It will lead to flakiness sooner or later, 100%.

> + | ---
> + | ...
> +_ = box.schema.space.create('sync', {is_sync=true})
> + | ---
> + | ...
> +_ = box.space.sync:create_index('pk')
> + | ---
> + | ...
> +test_run:switch('default')
> + | ---
> + | - true
> + | ...
> +
> +-- Testcase body.
> +for i=1,10 do                                                 \
> +    new_leader_id = random(current_leader_id, 1, #SERVERS)    \
> +    test_run:switch(SERVERS[new_leader_id])                   \
> +    box.cfg{read_only=false}                                  \
> +    fiber = require('fiber')                                  \
> +    f1 = fiber.create(function() box.space.sync:delete{} end) \

8. Delete without a key will fail. You would notice it if you checked the
results of the DML operations. Please do that via pcall.
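
For example (a sketch of the idea, using this test's 'sync' space):

    -- Capture the DML results instead of ignoring them.
    ok1, err1 = pcall(box.space.sync.delete, box.space.sync, {})   -- delete without a key fails
    ok2, err2 = pcall(function() for i = 1, 10000 do box.space.sync:insert{i} end end)
    -- Check ok1, err1, ok2, err2 afterwards, outside the '\'-joined statement.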

> +    f2 = fiber.create(function() for i=1,10000 do box.space.sync:insert{i} end end) \

9. You have \ exactly to avoid such long lines.

> +    f1.status()                                               \
> +    f2.status()                                               \

10. Output is not printed inside one statement. This whole loop is
one statement because of \, so these status() calls are useless.

> +    test_run:switch('default')                                \
> +    test_run:switch(SERVERS[current_leader_id])               \
> +    box.cfg{read_only=true}                                   \
> +    test_run:switch('default')                                \
> +    current_leader_id = new_leader_id                         \
> +    fiber.sleep(0.1)                                          \

11. Why do you need this fiber.sleep()?

> +end
> + | ---
> + | ...
> +
> +-- Teardown.
> +test_run:switch(SERVERS[current_leader_id])
> + | ---
> + | - true
> + | ...
> +box.space.sync:drop()
> + | ---
> + | ...
> +test_run:switch('default')
> + | ---
> + | - true
> + | ...
> +test_run:drop_cluster(SERVERS)
> + | ---
> + | ...

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 2/3] replication: test clear_synchro_queue function
  2020-07-20 22:00   ` Vladislav Shpilevoy
@ 2020-08-25 12:49     ` Sergey Bronnikov
  2020-08-26  7:31       ` Serge Petrenko
  0 siblings, 1 reply; 18+ messages in thread
From: Sergey Bronnikov @ 2020-08-25 12:49 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches

Vlad, thanks for the review!
The patches are updated on the branch; please see my answers inline.

On 00:00 Tue 21 Jul , Vladislav Shpilevoy wrote:
> Thanks for the patch!
> 
> See 8 comments below.
> 
> On 09.07.2020 19:16, sergeyb@tarantool.org wrote:
> > From: Sergey Bronnikov <sergeyb@tarantool.org>
> > 
> > Part of #5055
> > Part of #4849
> > ---
> >  test/replication/qsync_basic.result   | 85 +++++++++++++++++++++++++++
> >  test/replication/qsync_basic.test.lua | 31 ++++++++++
> >  2 files changed, 116 insertions(+)
> > 
> > diff --git a/test/replication/qsync_basic.result b/test/replication/qsync_basic.result
> > index ab4be0c7e..464df75a7 100644
> > --- a/test/replication/qsync_basic.result
> > +++ b/test/replication/qsync_basic.result
> > @@ -32,6 +32,14 @@ s2.is_sync
> >   | - false
> >   | ...
> >  
> > +--
> > +-- gh-4849: clear synchro queue with unconfigured box
> > +--
> > +box.ctl.clear_synchro_queue()
> 
> 1. Enough to test that it does not crash here. No need to
> check result.

Well, removed assignment to variable.

> > + | ---
> > + | - -1
> > + | ...
> > +
> >  -- Net.box takes sync into account.
> >  box.schema.user.grant('guest', 'super')
> >   | ---
> > @@ -553,6 +561,82 @@ box.space.sync:select{7}
> >   | - - [7]
> >   | ...
> >  
> > +--
> > +-- gh-4849: clear synchro queue on a replica
> > +--
> > +test_run:switch('default')
> > + | ---
> > + | - true
> > + | ...
> > +box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 2}
> 
> 2. If you want this timeout to fail, it is too big. The test will be too
> long. If it is supposed not to fail, then it is way too small. If you want
> it to fail, it should be something like 0.001. If you want it to hold, it
> should be 1000 to be sure. Keep in mind that you can change the timeout on
> the fly. That allows building quite complex reactive tests, completely event-based.

set replication_synchro_timeout value to 1000

> > + | ---
> > + | ...
> > +f1 = fiber.create(box.space.sync.replace, box.space.sync, {9})
> 
> 3. You need to extract the exact result value. Use pcall for that, and print
> its result after the fiber is dead. See other examples with fiber.create() in
> qsync test suite. The same for the next test case.

Done.

> > + | ---
> > + | ...
> > +f1:status()
> > + | ---
> > + | - suspended
> > + | ...
> > +test_run:switch('replica')
> > + | ---
> > + | - true
> 
> 4. Better wait until the value is delivered. Otherwise you can switch to
> replica before master finishes WAL write, and the queue will be empty here.
> Try
> 
>     test_run:wait_cond(function() return box.space.sync:get{9} ~= nil end)

Agree, added wait_cond().

> > + | ...
> > +box.ctl.clear_synchro_queue()
> > + | ---
> > + | - 0
> > + | ...
> > +box.space.sync:select{9}
> > + | ---
> > + | - []
> 
> 5. If select returns 9 before queue clean, and returns empty afterwards,
> it means the queue was cleared. So here the queue size is not really needed,
> as you can see.

removed this and next select{} statements

> > + | ...
> > +test_run:switch('default')
> > + | ---
> > + | - true
> > + | ...
> > +box.space.sync:select{9}
> > + | ---
> > + | - []
> > + | ...
> > +f1:status()
> > + | ---
> > + | - dead
> > + | ...
> > +
> > +--
> > +-- gh-4849: clear synchro queue on a master
> 
> 6. Since the previous test the replica's WAL is different from master's.
> I am not sure the replication is still alive.

The test with clear_synchro_queue() on the master keeps the master alive at
the end of the test, so I moved it before the same test for the replica. I
also added a note that the cluster may be in a broken state here.

> > +--
> > +test_run:switch('default')
> > + | ---
> > + | - true
> > + | ...
> > +box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 2}
> > + | ---
> > + | ...
> > +f1 = fiber.create(box.space.sync.replace, box.space.sync, {10})
> > + | ---
> > + | ...
> > +f1:status()
> > + | ---
> > + | - suspended
> > + | ...
> > +box.ctl.clear_synchro_queue()
> > + | ---
> > + | - -2
> > + | ...
> > +box.space.sync:select{10}
> > + | ---
> > + | - - [10]
> 
> 7. Why is it 10? The quorum is not reached, it should have been rolled back.

I added "box.cfg{replication_synchro_timeout = 0.1}" before the clear_synchro_queue()
call, and now the tx is rolled back. I have also added checks of the current value
with wait_cond(), which is more reliable than select{} on the master and the replica.
Updated output looks like this:

box.ctl.clear_synchro_queue()
 | ---
 | ...
test_run:switch('replica')
 | ---
 | - true
 | ...
test_run:wait_cond(function() return box.space.sync:get{10} == nil end)
 | ---
 | - true
 | ...
test_run:switch('default')
 | ---
 | - true
 | ...
test_run:wait_cond(function() return f:status() == 'dead' end)
 | ---
 | - true
 | ...
ok, err
 | ---
 | - false
 | - Quorum collection for a synchronous transaction is timed out
 | ...
test_run:wait_cond(function() return box.space.sync:get{10} == nil end)
 | ---
 | - true
 | ...

> > + | ...
> > +test_run:switch('replica')
> > + | ---
> > + | - true
> > + | ...
> > +box.space.sync:select{10}
> > + | ---
> > + | - - [10]
> > + | ...
> > +
> >  -- Cleanup.
> >  test_run:cmd('switch default')
> >   | ---
> > @@ -576,6 +660,7 @@ test_run:cmd('delete server replica')
> >   | ...
> >  box.space.test:drop()
> >   | ---
> > + | - error: A rollback for a synchronous transaction is received
> 
> 8. Why is it changed?

Perhaps it is because in the previous version of the test I did not wait for the fibers to become 'dead'.
There is no error now:

box.space.test:drop()
 | ---
 | ...
box.space.sync:drop()
 | ---
 | ...

> 
> >   | ...
> >  box.space.sync:drop()
> >   | ---

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 2/3] replication: test clear_synchro_queue function
  2020-08-25 12:49     ` Sergey Bronnikov
@ 2020-08-26  7:31       ` Serge Petrenko
  2020-08-26 14:48         ` Sergey Bronnikov
  0 siblings, 1 reply; 18+ messages in thread
From: Serge Petrenko @ 2020-08-26  7:31 UTC (permalink / raw)
  To: Sergey Bronnikov; +Cc: tarantool-patches, Vladislav Shpilevoy

Hi, Sergey!
Thanks for the patch and the fixes. The new version LGTM. I couldn't find the other patches in my mailbox, though; could you please resend them if you need my review?
>Tuesday, 25 August 2020, 15:49 +03:00 from Sergey Bronnikov <sergeyb@tarantool.org>:
> 
>Vlad, thanks for the review!
>The patches are updated on the branch; please see my answers inline.
>
>On 00:00 Tue 21 Jul , Vladislav Shpilevoy wrote:
>> Thanks for the patch!
>>
>> See 8 comments below.
>>
>> On 09.07.2020 19:16,  sergeyb@tarantool.org wrote:
>> > From: Sergey Bronnikov < sergeyb@tarantool.org >
>> >
>> > Part of #5055
>> > Part of #4849
>> > ---
>> > test/replication/qsync_basic.result | 85 +++++++++++++++++++++++++++
>> > test/replication/qsync_basic.test.lua | 31 ++++++++++
>> > 2 files changed, 116 insertions(+)
>> >
>> > diff --git a/test/replication/qsync_basic.result b/test/replication/qsync_basic.result
>> > index ab4be0c7e..464df75a7 100644
>> > --- a/test/replication/qsync_basic.result
>> > +++ b/test/replication/qsync_basic.result
>> > @@ -32,6 +32,14 @@ s2.is_sync
>> > | - false
>> > | ...
>> >
>> > +--
>> > +-- gh-4849: clear synchro queue with unconfigured box
>> > +--
>> > +box.ctl.clear_synchro_queue()
>>
>> 1. Enough to test that it does not crash here. No need to
>> check result.
>
>Well, removed assignment to variable.
>
>> > + | ---
>> > + | - -1
>> > + | ...
>> > +
>> > -- Net.box takes sync into account.
>> > box.schema.user.grant('guest', 'super')
>> > | ---
>> > @@ -553,6 +561,82 @@ box.space.sync:select{7}
>> > | - - [7]
>> > | ...
>> >
>> > +--
>> > +-- gh-4849: clear synchro queue on a replica
>> > +--
>> > +test_run:switch('default')
>> > + | ---
>> > + | - true
>> > + | ...
>> > +box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 2}
>>
>> 2. If you want this timeout to fail, it is too big. The test will be too
>> long. If it is supposed not to fail, then it is way too small. If you want
>> it to fail, it should be something like 0.001. If you want it to hold, it
>> should be 1000 to be sure. Keep in mind that you can change the timeout on
>> the fly. That allows building quite complex reactive tests, completely event-based.
>
>set replication_synchro_timeout value to 1000
>
>> > + | ---
>> > + | ...
>> > +f1 = fiber.create(box.space.sync.replace, box.space.sync, {9})
>>
>> 3. You need to extract the exact result value. Use pcall for that, and print
>> its result after the fiber is dead. See other examples with fiber.create() in
>> qsync test suite. The same for the next test case.
>
>Done.
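>
>The resulting pattern looks roughly like this (a sketch; the exact statements are in the updated test):
>
>f = fiber.create(function() ok, err = pcall(box.space.sync.replace, box.space.sync, {9}) end)
>-- ... and once the fiber is dead, the exact result can be printed:
>test_run:wait_cond(function() return f:status() == 'dead' end)
>ok, err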
>
>> > + | ---
>> > + | ...
>> > +f1:status()
>> > + | ---
>> > + | - suspended
>> > + | ...
>> > +test_run:switch('replica')
>> > + | ---
>> > + | - true
>>
>> 4. Better wait until the value is delivered. Otherwise you can switch to
>> replica before master finishes WAL write, and the queue will be empty here.
>> Try
>>
>> test_run:wait_cond(function() return box.space.sync:get{9} ~= nil end)
>
>Agreed, added wait_cond().
>
>> > + | ...
>> > +box.ctl.clear_synchro_queue()
>> > + | ---
>> > + | - 0
>> > + | ...
>> > +box.space.sync:select{9}
>> > + | ---
>> > + | - []
>>
>> 5. If select returns 9 before queue clean, and returns empty afterwards,
>> it means the queue was cleared. So here the queue size is not really needed,
>> as you can see.
>
>Removed this and the next select{} statements.
>
>> > + | ...
>> > +test_run:switch('default')
>> > + | ---
>> > + | - true
>> > + | ...
>> > +box.space.sync:select{9}
>> > + | ---
>> > + | - []
>> > + | ...
>> > +f1:status()
>> > + | ---
>> > + | - dead
>> > + | ...
>> > +
>> > +--
>> > +-- gh-4849: clear synchro queue on a master
>>
>> 6. Since the previous test the replica's WAL is different from master's.
>> I am not sure the replication is still alive.
>
>The test with clear_synchro_queue() on the master keeps the master alive at the end
>of the test, so I moved it before the same test for the replica. I also added a
>note for others that the cluster may be in a broken state here.
>
>> > +--
>> > +test_run:switch('default')
>> > + | ---
>> > + | - true
>> > + | ...
>> > +box.cfg{replication_synchro_quorum = 3, replication_synchro_timeout = 2}
>> > + | ---
>> > + | ...
>> > +f1 = fiber.create(box.space.sync.replace, box.space.sync, {10})
>> > + | ---
>> > + | ...
>> > +f1:status()
>> > + | ---
>> > + | - suspended
>> > + | ...
>> > +box.ctl.clear_synchro_queue()
>> > + | ---
>> > + | - -2
>> > + | ...
>> > +box.space.sync:select{10}
>> > + | ---
>> > + | - - [10]
>>
>> 7. Why is it 10? The quorum is not reached, it should have been rolled back.
>
>Added "box.cfg{replication_synchro_timeout = 0.1}" before clear_synchro_queue()
>call and now tx is rolled back. Also I've added checks of current value with
>wait_cond() that is more reliable than select{} on a master and replica.
>Updated output looks like this:
>
>box.ctl.clear_synchro_queue()
> | ---
> | ...
>test_run:switch('replica')
> | ---
> | - true
> | ...
>test_run:wait_cond(function() return box.space.sync:get{10} == nil end)
> | ---
> | - true
> | ...
>test_run:switch('default')
> | ---
> | - true
> | ...
>test_run:wait_cond(function() return f:status() == 'dead' end)
> | ---
> | - true
> | ...
>ok, err
> | ---
> | - false
> | - Quorum collection for a synchronous transaction is timed out
> | ...
>test_run:wait_cond(function() return box.space.sync:get{10} == nil end)
> | ---
> | - true
> | ...
>
>> > + | ...
>> > +test_run:switch('replica')
>> > + | ---
>> > + | - true
>> > + | ...
>> > +box.space.sync:select{10}
>> > + | ---
>> > + | - - [10]
>> > + | ...
>> > +
>> > -- Cleanup.
>> > test_run:cmd('switch default')
>> > | ---
>> > @@ -576,6 +660,7 @@ test_run:cmd('delete server replica')
>> > | ...
>> > box.space.test:drop()
>> > | ---
>> > + | - error: A rollback for a synchronous transaction is received
>>
>> 8. Why is it changed?
>
>Perhaps it is because in the previous version of the test I did not wait for the fibers to become 'dead'.
>There is no error now:
>
>box.space.test:drop()
> | ---
> | ...
>box.space.sync:drop()
> | ---
> | ...
>
>>
>> > | ...
>> > box.space.sync:drop()
>> > | ---
 
--
Serge Petrenko

[-- Attachment #2: Type: text/html, Size: 7929 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion
  2020-07-20 22:01   ` Vladislav Shpilevoy
@ 2020-08-26 14:45     ` Sergey Bronnikov
  2020-08-27 10:49       ` Serge Petrenko
  0 siblings, 1 reply; 18+ messages in thread
From: Sergey Bronnikov @ 2020-08-26 14:45 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches

Vlad, thanks for the review!
The patch is updated in the branch.

To make sure the patch doesn't make the test flaky, I ran the test 100 times using 10
workers in parallel without problems.
../../test/test-run.py --builddir=/home/s.bronnikov/tarantool/build --vardir=/home/s.bronnikov/tarantool
/build/test/var -j 10 $(yes replication/qsync_random_leader.test.lua | head -n 100)

On 00:01 Tue 21 Jul , Vladislav Shpilevoy wrote:
> Thanks for the patch!
> 
> See 11 comments below.
> 
> > diff --git a/test/replication/qsync.lua b/test/replication/qsync.lua
> > new file mode 100644
> > index 000000000..383aa5272
> > --- /dev/null
> > +++ b/test/replication/qsync.lua
> > @@ -0,0 +1,62 @@
> > +#!/usr/bin/env tarantool
> > +
> > +-- get instance name from filename (qsync1.lua => qsync1)
> > +local INSTANCE_ID = string.match(arg[0], "%d")
> > +
> > +local SOCKET_DIR = require('fio').cwd()
> > +
> > +local TIMEOUT = tonumber(arg[1])
> > +
> > +local function instance_uri(instance_id)
> > +    return SOCKET_DIR..'/qsync'..instance_id..'.sock';
> > +end
> > +
> > +-- start console first
> > +require('console').listen(os.getenv('ADMIN'))
> > +
> > +box.cfg({
> > +    listen = instance_uri(INSTANCE_ID);
> > +    replication_timeout = TIMEOUT;
> 
> 1. Why do you need the custom replication_timeout?

>It is actually a copy-paste from the original cluster initialization script;
>removed replication_timeout.

> > +    replication_sync_lag = 0.01;
> 
> 2. Why do you need the lag setting?

the same as above

> > +    replication_connect_quorum = 3;
> > +    replication = {
> > +        instance_uri(1);
> > +        instance_uri(2);
> > +        instance_uri(3);
> > +        instance_uri(4);
> > +        instance_uri(5);
> > +        instance_uri(6);
> > +        instance_uri(7);
> > +        instance_uri(8);
> > +        instance_uri(9);
> > +        instance_uri(10);
> > +        instance_uri(11);
> > +        instance_uri(12);
> > +        instance_uri(13);
> > +        instance_uri(14);
> > +        instance_uri(15);
> > +        instance_uri(16);
> > +        instance_uri(17);
> > +        instance_uri(18);
> > +        instance_uri(19);
> > +        instance_uri(20);
> > +        instance_uri(21);
> > +        instance_uri(22);
> > +        instance_uri(23);
> > +        instance_uri(24);
> > +        instance_uri(25);
> > +        instance_uri(26);
> > +        instance_uri(27);
> > +        instance_uri(28);
> > +        instance_uri(29);
> > +        instance_uri(30);
> > +        instance_uri(31);
> 
> 3. Seems like in the test you use only 3 instances, not 32. Also the
> quorum is set to 3.

>In the updated test 5 instances are in use; the others were removed from the initialization script.

> > +    };
> > +})
> > +
> > +box.once("bootstrap", function()
> > +    local test_run = require('test_run').new()
> > +    box.schema.user.grant("guest", 'replication')
> > +    box.schema.space.create('test', {engine = test_run:get_cfg('engine')})
> > +    box.space.test:create_index('primary')
> 
> 4. Where do you use this space?

>The space has been renamed to "sync" and is used in the test.

> > +end)
> > diff --git a/test/replication/qsync_random_leader.result b/test/replication/qsync_random_leader.result
> > new file mode 100644
> > index 000000000..cb1b5e232
> > --- /dev/null
> > +++ b/test/replication/qsync_random_leader.result
> > @@ -0,0 +1,123 @@
> > +-- test-run result file version 2
> > +os = require('os')
> > + | ---
> > + | ...
> > +env = require('test_run')
> > + | ---
> > + | ...
> > +math = require('math')
> > + | ---
> > + | ...
> > +fiber = require('fiber')
> > + | ---
> > + | ...
> > +test_run = env.new()
> > + | ---
> > + | ...
> > +engine = test_run:get_cfg('engine')
> > + | ---
> > + | ...
> > +
> > +NUM_INSTANCES = 3
> > + | ---
> > + | ...
> > +BROKEN_QUORUM = NUM_INSTANCES + 1
> > + | ---
> > + | ...
> > +
> > +SERVERS = {}
> > + | ---
> > + | ...
> > +test_run:cmd("setopt delimiter ';'")
> > + | ---
> > + | - true
> > + | ...
> > +for i=1,NUM_INSTANCES do
> > +    SERVERS[i] = 'qsync' .. i
> > +end;
> > + | ---
> > + | ...
> > +test_run:cmd("setopt delimiter ''");
> 
> 5. Please, lets be consistent and use either \ or the delimiter. Currently
> it is irrational - you use \ for big code blocks, and a custom delimiter for
> tiny blocks which could even be one line. Personally, I would use \
> everywhere.

>Replaced the delimiter-based constructions with \-continued multiline statements.

> > + | ---
> > + | - true
> > + | ...
> > +SERVERS -- print instance names
> > + | ---
> > + | - - qsync1
> > + |   - qsync2
> > + |   - qsync3
> > + | ...
> > +
> > +random = function(excluded_num, min, max)       \
> 
> 6. Would be better to align all \ by 80 in this file. Makes easier to add
> new longer lines in future without moving all the old \.

Done.

> > +    math.randomseed(os.time())                  \
> > +    local r = math.random(min, max)             \
> > +    if (r == excluded_num) then                 \
> > +        return random(excluded_num, min, max)   \
> > +    end                                         \
> > +    return r                                    \
> > +end
> > + | ---
> > + | ...
> > +
> > +test_run:create_cluster(SERVERS, "replication", {args="0.1"})
> > + | ---
> > + | ...
> > +test_run:wait_fullmesh(SERVERS)
> > + | ---
> > + | ...
> > +current_leader_id = 1
> > + | ---
> > + | ...
> > +test_run:switch(SERVERS[current_leader_id])
> > + | ---
> > + | - true
> > + | ...
> > +box.cfg{replication_synchro_quorum=3, replication_synchro_timeout=0.1}
> 
> 7. The timeout is tiny. It will lead to flakiness sooner or later, 100%.

increased to 1 sec

> > + | ---
> > + | ...
> > +_ = box.schema.space.create('sync', {is_sync=true})
> > + | ---
> > + | ...
> > +_ = box.space.sync:create_index('pk')
> > + | ---
> > + | ...
> > +test_run:switch('default')
> > + | ---
> > + | - true
> > + | ...
> > +
> > +-- Testcase body.
> > +for i=1,10 do                                                 \
> > +    new_leader_id = random(current_leader_id, 1, #SERVERS)    \
> > +    test_run:switch(SERVERS[new_leader_id])                   \
> > +    box.cfg{read_only=false}                                  \
> > +    fiber = require('fiber')                                  \
> > +    f1 = fiber.create(function() box.space.sync:delete{} end) \
> 
> 8. Delete without a key will fail. You would notice it if you
> would check results of the DML operations. Please, do that via pcall.
> 

replaced delete{} with truncate{}

> > +    f2 = fiber.create(function() for i=1,10000 do box.space.sync:insert{i} end end) \
> 
> 9. You have \ exactly to avoid such long lines.

>Split into shorter lines.

> 
> > +    f1.status()                                               \
> > +    f2.status()                                               \
> 
> 10. Output is not printed inside one statement. This whole cycle is
> one statement because of \, so these status() calls are useless.

removed

> > +    test_run:switch('default')                                \
> > +    test_run:switch(SERVERS[current_leader_id])               \
> > +    box.cfg{read_only=true}                                   \
> > +    test_run:switch('default')                                \
> > +    current_leader_id = new_leader_id                         \
> > +    fiber.sleep(0.1)                                          \
> 
> 11. Why do you need this fiber.sleep()?

>I don't remember the reason for adding it, but the test works fine without it,
>so I removed it in the updated patch.

> > +end
> > + | ---
> > + | ...
> > +
> > +-- Teardown.
> > +test_run:switch(SERVERS[current_leader_id])
> > + | ---
> > + | - true
> > + | ...
> > +box.space.sync:drop()
> > + | ---
> > + | ...
> > +test_run:switch('default')
> > + | ---
> > + | - true
> > + | ...
> > +test_run:drop_cluster(SERVERS)
> > + | ---
> > + | ...

-- 
sergeyb@

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 2/3] replication: test clear_synchro_queue function
  2020-08-26  7:31       ` Serge Petrenko
@ 2020-08-26 14:48         ` Sergey Bronnikov
  0 siblings, 0 replies; 18+ messages in thread
From: Sergey Bronnikov @ 2020-08-26 14:48 UTC (permalink / raw)
  To: Serge Petrenko; +Cc: tarantool-patches, Vladislav Shpilevoy

On 10:31 Wed 26 Aug , Serge Petrenko wrote:
> 
> Hi, Sergey!
> Thanks for the patch and the fixes. New version LGTM. I couldn’t find
> the other patches in my mailbox though, could you please resend them? If
> you need my review, of course.

Thanks for review! Second patch in a series updated [1]. You are in CC.
Branch: ligurio/gh-4842-qsync-testing

1. https://lists.tarantool.org/pipermail/tarantool-patches/2020-August/019197.html

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion
  2020-08-26 14:45     ` Sergey Bronnikov
@ 2020-08-27 10:49       ` Serge Petrenko
  2020-08-31 10:05         ` Sergey Bronnikov
  0 siblings, 1 reply; 18+ messages in thread
From: Serge Petrenko @ 2020-08-27 10:49 UTC (permalink / raw)
  To: Sergey Bronnikov; +Cc: tarantool-patches, Vladislav Shpilevoy

[-- Attachment #1: Type: text/plain, Size: 3856 bytes --]


Hi! Thanks for the fixes!
I’m pasting parts of the patch below to comment on.
 
 
+box.cfg({
+    listen = instance_uri(INSTANCE_ID);
+    replication_connect_quorum = 3;
+    replication = {
+        instance_uri(1);
+        instance_uri(2);
+        instance_uri(3);
+        instance_uri(4);
+        instance_uri(5);
+    };
+})
 
 
*  You should either omit `replication_connect_quorum` altogether or set it to 5.
Omitting it will have the same effect.
I think you meant `replication_synchro_quorum` here; then it makes sense
to set it to 3. Also, `replication_synchro_timeout` should be set here,
I’ll mention it again below.
 
 
+
+NUM_INSTANCES = 5
+BROKEN_QUORUM = NUM_INSTANCES + 1
+
 
 
*  BROKEN_QUORUM assigned but never used.
 
 
+
+test_run:create_cluster(SERVERS, "replication", {args="0.1"})
+test_run:wait_fullmesh(SERVERS)
 
 
*  You’re passing some argument to qsync1, … qsync5 instances, but you never use  it.
 
 
+current_leader_id = 1
+test_run:switch(SERVERS[current_leader_id])
+box.cfg{replication_synchro_timeout=1}
 
 
*  You should set `replication_synchro_timeout` on every instance, not only
on qsync1, so you’d better move this box.cfg call to the instance file.
Besides, the timeout should be bigger (much bigger), like Vlad said.
We typically use 30 seconds for various replication timeouts.
It’s fairly common for a test that is stable on your machine to be flaky on
the testing machines.
 
 
+test_run:switch('default')
+
+-- Testcase body.
+for i=1,10 do                                                                  \
+    new_leader_id = random(current_leader_id, 1, #SERVERS)                     \
+    test_run:switch(SERVERS[new_leader_id])                                    \
+    box.cfg{read_only=false}                                                   \
+    f1 = fiber.create(function()                                               \
+        pcall(box.space.sync:truncate{})                                       \
+    end)                                                                       \
 
 
*  Why put the truncate call in a separate fiber?

Why use truncate at all? You may just replace all your `insert` calls below
with `replace`, and then truncate won’t be needed. This is up to you,
though.
 
 
+    f2 = fiber.create(function()                                               \
+        for i=1,10000 do box.space.sync:insert{i} end                          \
+    end)                                                                       \
 
*  So you’re testing a case where a leader has some unconfirmed transactions in
the limbo and then a leader change happens. Then you need to call
`clear_synchro_queue` on the new leader to wait for confirmation of the old txns. Otherwise
the new leader fails to insert its data, but the test doesn’t show this, because you
don’t check the fiber state or the `insert()` return values.
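
Roughly, that step could look like this (an illustrative sketch, not the exact code the test should use):

-- on the new leader:
box.cfg{read_only = false, replication_synchro_quorum = 3}
box.ctl.clear_synchro_queue()  -- waits for the old leader's pending txns to be confirmed
ok, err = pcall(box.space.sync.insert, box.space.sync, {1})  -- the new leader's own insert; check ok/err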
 
+    test_run:switch('default')                                                 \
+    test_run:switch(SERVERS[current_leader_id])                                \
 
 
*  The 2 lines above are useless.
 
 
+    box.cfg{read_only=true}                                                    \
+    test_run:switch('default')                                                 \
+    current_leader_id = new_leader_id                                          \
+end
+
+-- Teardown.
 
 
 
--
Serge Petrenko

[-- Attachment #2: Type: text/html, Size: 13274 bytes --]

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion
  2020-08-27 10:49       ` Serge Petrenko
@ 2020-08-31 10:05         ` Sergey Bronnikov
  2020-09-01 13:03           ` Serge Petrenko
  0 siblings, 1 reply; 18+ messages in thread
From: Sergey Bronnikov @ 2020-08-31 10:05 UTC (permalink / raw)
  To: Serge Petrenko; +Cc: tarantool-patches, Vladislav Shpilevoy

Sergey, thanks for the review!
See my comments inline; the test is updated in the branch.

On 13:49 Thu 27 Aug , Serge Petrenko wrote:
>
> Hi! Thanks for the fixes!
> I’m pasting parts of the patch below to comment on.
>
> +box.cfg({
> +    listen = instance_uri(INSTANCE_ID);
> +    replication_connect_quorum = 3;
> +    replication = {
> +        instance_uri(1);
> +        instance_uri(2);
> +        instance_uri(3);
> +        instance_uri(4);
> +        instance_uri(5);
> +    };
> +})
>  
>  
> *  You should either omit `replication_connect_quorum` altogether or set it to 5.
> Omitting it will have the same effect.
> I think you meant `replication_synchro_quorum` here; then it makes sense
> to set it to 3. Also, `replication_synchro_timeout` should be set here,
> I’ll mention it again below.

>Removed replication_connect_quorum altogether.

> +
> +NUM_INSTANCES = 5
> +BROKEN_QUORUM = NUM_INSTANCES + 1
> +
>  
>  
> *  BROKEN_QUORUM assigned but never used.

removed

>  
> +
> +test_run:create_cluster(SERVERS, "replication", {args="0.1"})
> +test_run:wait_fullmesh(SERVERS)
>  
>  
> *  You’re passing some argument to qsync1, … qsync5 instances, but you never use  it.

removed argument

>  
> +current_leader_id = 1
> +test_run:switch(SERVERS[current_leader_id])
> +box.cfg{replication_synchro_timeout=1}
>  
>  
> *  You should set `replication_synchro_timeout` on every instance, not only
> on qsync1 so   you better move this box.cfg call to the instance file.
> Besides, the timeout should be bigger (much bigger), like Vlad said.
> We typically use 30 seconds for various replication timeouts.
> It’s fairly common when a test is stable on your machine, but is flaky on
> testing machines.

>Increased the timeout up to 1000 and set the synchro settings in the qsync.lua file.

> +test_run:switch('default')
> +
> +-- Testcase body.
> +for i=1,10 do                                                                  \
> +    new_leader_id = random(current_leader_id, 1, #SERVERS)                     \
> +    test_run:switch(SERVERS[new_leader_id])                                    \
> +    box.cfg{read_only=false}                                                   \
> +    f1 = fiber.create(function()                                               \
> +        pcall(box.space.sync:truncate{})                                       \
> +    end)                                                                       \
>  
>  
> *  Why put truncate call in a separate fiber?
> 
> Why use truncate at all? You may just replace all your `insert` calls below
> with `replace`, and then truncate won’t be needed. This is up to you
> though.

>As far as I remember, the original idea was to clean up before the next insert.
>I rewrote the body of the test to keep all inserted values in the space and
>to check the number of inserted values after the loop.

>  
> +    f2 = fiber.create(function()                                               \
> +        for i=1,10000 do box.space.sync:insert{i} end                          \
> +    end)                                                                       \
>  
> *  So you’re testing a case when a leader has some unconfirmed transactions in
> limbo and then a leader change happens. Then you need to call
> `clear_synchro_queue` on a new leader to wait for confirmation of old txns. Otherwise
> the new leader fails to insert its data, but the test doesn’t show this, because you
> don’t check fiber state or `insert()` return values.

>Added a clear_synchro_queue() call to the updated test.

> +    test_run:switch('default')                                                 \
> +    test_run:switch(SERVERS[current_leader_id])                                \
>  
>  
> *  The 2 lines above are useless.

>Removed in the updated test:

-- Testcase body.
for i=1,100 do                                                                 \
    test_run:eval(SERVERS[current_leader_id],                                  \
        "box.cfg{replication_synchro_quorum=6}")                               \
    test_run:eval(SERVERS[current_leader_id],                                  \
        string.format("box.space.sync:insert{%d}", i))                         \
    new_leader_id = random(current_leader_id, 1, #SERVERS)                     \
    test_run:eval(SERVERS[new_leader_id],                                      \
        "box.cfg{replication_synchro_quorum=3}")                               \
    test_run:eval(SERVERS[new_leader_id], "box.ctl.clear_synchro_queue()")     \
    replica = random(new_leader_id, 1, #SERVERS)                               \
    test_run:eval(SERVERS[replica],                                            \
        string.format("box.space.sync:get{%d}", i))                            \
    test_run:switch('default')                                                 \
    current_leader_id = new_leader_id                                          \
end

test_run:switch('qsync1')
box.space.sync:count() -- 100

>  
> +    box.cfg{read_only=true}                                                    \
> +    test_run:switch('default')                                                 \
> +    current_leader_id = new_leader_id                                          \
> +end
> +
> +-- Teardown.
> --
> Serge Petrenko

-- 
sergeyb@

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion
  2020-08-31 10:05         ` Sergey Bronnikov
@ 2020-09-01 13:03           ` Serge Petrenko
  2020-11-12  9:32             ` Sergey Bronnikov
  0 siblings, 1 reply; 18+ messages in thread
From: Serge Petrenko @ 2020-09-01 13:03 UTC (permalink / raw)
  To: Sergey Bronnikov; +Cc: tarantool-patches, Vladislav Shpilevoy


31.08.2020 13:05, Sergey Bronnikov wrote:
> Sergey, thanks for review!
> See my comments inline, test updated in a branch.

Hi! Thanks for the fixes.

See my comments below.


>
> On 13:49 Thu 27 Aug , Serge Petrenko wrote:
>> Hi! Thanks for the fixes!
>> I’m pasting parts of the patch below to comment on.
>>
>> +box.cfg({
>> +    listen = instance_uri(INSTANCE_ID);
>> +    replication_connect_quorum = 3;
>> +    replication = {
>> +        instance_uri(1);
>> +        instance_uri(2);
>> +        instance_uri(3);
>> +        instance_uri(4);
>> +        instance_uri(5);
>> +    };
>> +})
>>   
>>   
>> *  You should either omit `replication_connect_quorum` altogether or set it to 5.
>> Omitting it will have the same effect.
>> I think you meant `replication_synchro_quorum` here; then it makes sense
>> to set it to 3. Also, `replication_synchro_timeout` should be set here,
>> I’ll mention it again below.
> removed replication_connect_quorum at all

+
+box.once("bootstrap", function()
+    local test_run = require('test_run').new()
+    box.cfg{replication_synchro_timeout = 1000, replication_synchro_quorum = 5}
+    box.cfg{read_only = false}
+    box.schema.user.grant("guest", 'replication')
+end)

Since you moved space creation out of `box.once` call, you don't need 
test_run here.


>> +
>> +NUM_INSTANCES = 5
>> +BROKEN_QUORUM = NUM_INSTANCES + 1
>> +
>>   
>>   
>> *  BROKEN_QUORUM assigned but never used.
> removed
>
>>   
>> +
>> +test_run:create_cluster(SERVERS, "replication", {args="0.1"})
>> +test_run:wait_fullmesh(SERVERS)
>>   
>>   
>> *  You’re passing some argument to qsync1, … qsync5 instances, but you never use  it.
> removed argument
>
>>   
>> +current_leader_id = 1
>> +test_run:switch(SERVERS[current_leader_id])
>> +box.cfg{replication_synchro_timeout=1}
>>   
>>   
>> *  You should set `replication_synchro_timeout` on every instance, not only
>> on qsync1 so   you better move this box.cfg call to the instance file.
>> Besides, the timeout should be bigger (much bigger), like Vlad said.
>> We typically use 30 seconds for various replication timeouts.
>> It’s fairly common when a test is stable on your machine, but is flaky on
>> testing machines.
> increased timeout up to 1000 and set synchro settings in a qsync.lua file
>
>> +test_run:switch('default')
>> +
>> +-- Testcase body.
>> +for i=1,10 do                                                                  \
>> +    new_leader_id = random(current_leader_id, 1, #SERVERS)                     \
>> +    test_run:switch(SERVERS[new_leader_id])                                    \
>> +    box.cfg{read_only=false}                                                   \
>> +    f1 = fiber.create(function()                                               \
>> +        pcall(box.space.sync:truncate{})                                       \
>> +    end)                                                                       \
>>   
>>   
>> *  Why put truncate call in a separate fiber?
>>
>> Why use truncate at all? You may just replace all your `insert` calls below
>> with `replace`, and then truncate won’t be needed. This is up to you
>> though.
> As far as I remember original idea was to cleanup before next insert.
> I rewrote a body of test and keep all inserted values in a space and
> after loop check amount of inserted values.
>
>>   
>> +    f2 = fiber.create(function()                                               \
>> +        for i=1,10000 do box.space.sync:insert{i} end                          \
>> +    end)                                                                       \
>>   
>> *  So you’re testing a case when a leader has some unconfirmed transactions in
>> limbo and then a leader change happens. Then you need to call
>> `clear_synchro_queue` on a new leader to wait for confirmation of old txns. Otherwise
>> the new leader fails to insert its data, but the test doesn’t show this, because you
>> don’t check fiber state or `insert()` return values.
> added clear_synchro_queue() call to updated test
>
>> +    test_run:switch('default')                                                 \
>> +    test_run:switch(SERVERS[current_leader_id])                                \
>>   
>>   
>> *  The 2 lines above are useless.
> removed in updated test:
>
> -- Testcase body.
> for i=1,100 do                                                                 \
>      test_run:eval(SERVERS[current_leader_id],                                  \
>          "box.cfg{replication_synchro_quorum=6}")                               \
>      test_run:eval(SERVERS[current_leader_id],                                  \
>          string.format("box.space.sync:insert{%d}", i))                         \
>      new_leader_id = random(current_leader_id, 1, #SERVERS)                     \
>      test_run:eval(SERVERS[new_leader_id],                                      \
>          "box.cfg{replication_synchro_quorum=3}")                               \
>      test_run:eval(SERVERS[new_leader_id], "box.ctl.clear_synchro_queue()")     \
>      replica = random(new_leader_id, 1, #SERVERS)                               \
>      test_run:eval(SERVERS[replica],                                            \
>          string.format("box.space.sync:get{%d}", i))                            \

It is not checked whether get returns any data or not.
You only make a call but never check the return value.

It'd also be nice to check whether the insert on the old leader (the one with
quorum = 6) succeeds after the new leader executes `clear_synchro_queue`.


>      test_run:switch('default')                                                 \
>      current_leader_id = new_leader_id                                          \
> end
>
> test_run:switch('qsync1')
> box.space.sync:count() -- 100
>
>>   
>> +    box.cfg{read_only=true}                                                    \
>> +    test_run:switch('default')                                                 \
>> +    current_leader_id = new_leader_id                                          \
>> +end
>> +
>> +-- Teardown.
>> --
>> Serge Petrenko

-- 
Serge Petrenko

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion
  2020-09-01 13:03           ` Serge Petrenko
@ 2020-11-12  9:32             ` Sergey Bronnikov
  0 siblings, 0 replies; 18+ messages in thread
From: Sergey Bronnikov @ 2020-11-12  9:32 UTC (permalink / raw)
  To: Serge Petrenko; +Cc: tarantool-patches, Vladislav Shpilevoy

Hi, Sergey!

Fixes applied in updated patch [1]

1. https://lists.tarantool.org/pipermail/tarantool-patches/2020-November/020636.html

On 16:03 Tue 01 Sep , Serge Petrenko wrote:
> > -- Testcase body.
> > for i=1,100 do                                                                 \
> >      test_run:eval(SERVERS[current_leader_id],                                  \
> >          "box.cfg{replication_synchro_quorum=6}")                               \
> >      test_run:eval(SERVERS[current_leader_id],                                  \
> >          string.format("box.space.sync:insert{%d}", i))                         \
> >      new_leader_id = random(current_leader_id, 1, #SERVERS)                     \
> >      test_run:eval(SERVERS[new_leader_id],                                      \
> >          "box.cfg{replication_synchro_quorum=3}")                               \
> >      test_run:eval(SERVERS[new_leader_id], "box.ctl.clear_synchro_queue()")     \
> >      replica = random(new_leader_id, 1, #SERVERS)                               \
> >      test_run:eval(SERVERS[replica],                                            \
> >          string.format("box.space.sync:get{%d}", i))                            \
> 
> It is not checked whether get returns any data or not.
> You only make a call but never check the return value.
> 
> It'd also be nice to check whether the insert on the old leader (the one with
> quorum = 6) succeeds after the new leader executes `clear_synchro_queue`.

Added two wait_cond() calls to check the value on a random replica and on the old leader.
Additionally, the expected total number of values is checked right after the loop with
box.space.sync:count().
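
Roughly, the added checks inside the loop look like this (a sketch based on the description above, not the verbatim test code; it assumes test_run:eval() returns the evaluation result as a table):

    test_run:wait_cond(function()                                              \
        return test_run:eval(SERVERS[replica],                                 \
            string.format("return box.space.sync:get{%d} ~= nil", i))[1]       \
    end)                                                                       \
    test_run:wait_cond(function()                                              \
        return test_run:eval(SERVERS[current_leader_id],                       \
            string.format("return box.space.sync:get{%d} ~= nil", i))[1]       \
    end)                                                                       \

The total number of rows is then verified right after the loop with box.space.sync:count().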

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2020-11-12  9:32 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <cover.1594314820.git.sergeyb@tarantool.org>
2020-07-09 17:16 ` [Tarantool-patches] [PATCH 1/3] replication: print number of txs in limbo before its clear sergeyb
2020-07-09 20:07   ` Vladislav Shpilevoy
2020-07-10 12:55     ` Sergey Bronnikov
2020-07-20 21:59   ` Vladislav Shpilevoy
2020-07-09 17:16 ` [Tarantool-patches] [PATCH 2/3] replication: test clear_synchro_queue function sergeyb
2020-07-20 22:00   ` Vladislav Shpilevoy
2020-08-25 12:49     ` Sergey Bronnikov
2020-08-26  7:31       ` Serge Petrenko
2020-08-26 14:48         ` Sergey Bronnikov
2020-07-09 17:16 ` [Tarantool-patches] [PATCH 3/3] replication: add test with random leaders promotion and demotion sergeyb
2020-07-09 20:07   ` Vladislav Shpilevoy
2020-07-10  8:05     ` Sergey Bronnikov
2020-07-20 22:01   ` Vladislav Shpilevoy
2020-08-26 14:45     ` Sergey Bronnikov
2020-08-27 10:49       ` Serge Petrenko
2020-08-31 10:05         ` Sergey Bronnikov
2020-09-01 13:03           ` Serge Petrenko
2020-11-12  9:32             ` Sergey Bronnikov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox