[Tarantool-patches] [PATCH v2 6/7] replication: tolerate synchro rollback during final join

Wed Mar 24 15:24:16 MSK 2021

Both box_process_register and box_process_join had guards ensuring that
not a single rollback occured for transactions residing in WAL around
replica's _cluster registration.
Both functions would error on a rollback and make the replica retry
final join.

The reason for that was that replica couldn't process synchronous
transactions correctly during final join, because it applied the final
join stream row-by-row.

This path with retrying final join was a dead end, because even if
master manages to receive no ROLLBACK messages around N-th retry of
box.space._cluster:insert{}, replica would still have to receive and
process all the data dating back to its first _cluster registration
attempt.
In other words, the guard against sending synchronous rows to the
replica didn't work.

Let's remove the guard altogether, since now replica is capable of
processing synchronous txs in final join stream and even retrying final
join in case the _cluster registration was rolled back.

Closes #5566
---
 changelogs/unreleased/synchro-final-join.md   |   4 +
 src/box/applier.cc                            |   2 +
 src/box/box.cc                                |  24 ---
 src/box/relay.cc                              |   1 +
 .../gh-5566-final-join-synchro.result         | 139 ++++++++++++++++++
 .../gh-5566-final-join-synchro.test.lua       |  61 ++++++++
 test/replication/suite.cfg                    |   1 +
 7 files changed, 208 insertions(+), 24 deletions(-)
 create mode 100644 changelogs/unreleased/synchro-final-join.md
 create mode 100644 test/replication/gh-5566-final-join-synchro.result
 create mode 100644 test/replication/gh-5566-final-join-synchro.test.lua

diff --git a/changelogs/unreleased/synchro-final-join.md b/changelogs/unreleased/synchro-final-join.md
new file mode 100644
index 000000000..cef77df87
--- /dev/null
+++ b/changelogs/unreleased/synchro-final-join.md
@@ -0,0 +1,4 @@
+## bugfix/core
+
+* Fix a bug in applier erroring with `Unknown request type 40` during final join
+  when master has synchronous spaces (gh-5566).
diff --git a/src/box/applier.cc b/src/box/applier.cc
index 9a8b0f0fc..0d1b4d28d 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -109,6 +109,8 @@ applier_log_error(struct applier *applier, struct error *e)
 	case ER_PASSWORD_MISMATCH:
 	case ER_XLOG_GAP:
 	case ER_TOO_EARLY_SUBSCRIBE:
+	case ER_SYNC_QUORUM_TIMEOUT:
+	case ER_SYNC_ROLLBACK:
 		say_info("will retry every %.2lf second",
 			 replication_reconnect_interval());
 		break;
diff --git a/src/box/box.cc b/src/box/box.cc
index cc59564e1..292a54213 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -2163,8 +2163,6 @@ box_process_register(struct ev_io *io, struct xrow_header *header)
 	say_info("registering replica %s at %s",
 		 tt_uuid_str(&instance_uuid), sio_socketname(io->fd));
 
-	/* See box_process_join() */
-	int64_t limbo_rollback_count = txn_limbo.rollback_count;
 	struct vclock start_vclock;
 	vclock_copy(&start_vclock, &replicaset.vclock);
 
@@ -2180,12 +2178,6 @@ box_process_register(struct ev_io *io, struct xrow_header *header)
 	struct vclock stop_vclock;
 	vclock_copy(&stop_vclock, &replicaset.vclock);
 
-	if (txn_limbo.rollback_count != limbo_rollback_count)
-		tnt_raise(ClientError, ER_SYNC_ROLLBACK);
-
-	if (txn_limbo_wait_confirm(&txn_limbo) != 0)
-		diag_raise();
-
 	/*
 	 * Feed replica with WALs in range
 	 * (start_vclock, stop_vclock) so that it gets its
@@ -2307,15 +2299,6 @@ box_process_join(struct ev_io *io, struct xrow_header *header)
 	say_info("joining replica %s at %s",
 		 tt_uuid_str(&instance_uuid), sio_socketname(io->fd));
 
-	/*
-	 * In order to join a replica, master has to make sure it
-	 * doesn't send unconfirmed data. We have to check that
-	 * there are no rolled back transactions between
-	 * start_vclock and stop_vclock, and that the data right
-	 * before stop_vclock is confirmed, before we can proceed
-	 * to final join.
-	 */
-	int64_t limbo_rollback_count = txn_limbo.rollback_count;
 	/*
 	 * Initial stream: feed replica with dirty data from engines.
 	 */
@@ -2336,13 +2319,6 @@ box_process_join(struct ev_io *io, struct xrow_header *header)
 	/* Remember master's vclock after the last request */
 	struct vclock stop_vclock;
 	vclock_copy(&stop_vclock, &replicaset.vclock);
-
-	if (txn_limbo.rollback_count != limbo_rollback_count)
-		tnt_raise(ClientError, ER_SYNC_ROLLBACK);
-
-	if (txn_limbo_wait_confirm(&txn_limbo) != 0)
-		diag_raise();
-
 	/* Send end of initial stage data marker */
 	struct xrow_header row;
 	xrow_encode_vclock_xc(&row, &stop_vclock);
diff --git a/src/box/relay.cc b/src/box/relay.cc
index 41f949e8e..dd7a167e4 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -1035,6 +1035,7 @@ relay_send_row(struct xstream *stream, struct xrow_header *packet)
 					    ERRINJ_INT);
 		if (inj != NULL && packet->lsn == inj->iparam) {
 			packet->lsn = inj->iparam - 1;
+			packet->tsn = packet->lsn;
 			say_warn("injected broken lsn: %lld",
 				 (long long) packet->lsn);
 		}
diff --git a/test/replication/gh-5566-final-join-synchro.result b/test/replication/gh-5566-final-join-synchro.result
new file mode 100644
index 000000000..32749bf12
--- /dev/null
+++ b/test/replication/gh-5566-final-join-synchro.result
@@ -0,0 +1,139 @@
+-- test-run result file version 2
+test_run = require('test_run').new()
+ | ---
+ | ...
+
+--
+-- gh-5566 replica tolerates synchronous transactions in final join stream.
+--
+_ = box.schema.space.create('sync', {is_sync=true})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+
+box.schema.user.grant('guest', 'replication')
+ | ---
+ | ...
+box.schema.user.grant('guest', 'write', 'space', 'sync')
+ | ---
+ | ...
+
+-- Part 1. Make sure a joining instance tolerates synchronous rows in final join
+-- stream.
+trig = function()\
+    box.space.sync:replace{1}\
+end
+ | ---
+ | ...
+-- The trigger will generate synchronous rows each time a replica joins.
+_ = box.space._cluster:on_replace(trig)
+ | ---
+ | ...
+
+orig_synchro_quorum = box.cfg.replication_synchro_quorum
+ | ---
+ | ...
+box.cfg{replication_synchro_quorum=1}
+ | ---
+ | ...
+
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica')
+ | ---
+ | - true
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+test_run:wait_upstream(1, {status='follow'})
+ | ---
+ | - true
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
+
+-- Part 2. Make sure master aborts final join if insert to _cluster is rolled
+-- back and replica is capable of retrying it.
+orig_synchro_timeout = box.cfg.replication_synchro_timeout
+ | ---
+ | ...
+-- Make the trigger we used above fail with no quorum.
+box.cfg{replication_synchro_quorum=2, replication_synchro_timeout=0.01}
+ | ---
+ | ...
+-- Try to join the replica once again.
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica with wait=False')
+ | ---
+ | - true
+ | ...
+
+test_run:wait_log('replica', 'ER_SYNC_QUORUM_TIMEOUT', nil, 10)
+ | ---
+ | - ER_SYNC_QUORUM_TIMEOUT
+ | ...
+-- Remove the trigger to let the replica connect.
+box.space._cluster:on_replace(nil, trig)
+ | ---
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+test_run:wait_upstream(1, {status='follow'})
+ | ---
+ | - true
+ | ...
+
+-- Cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
+box.cfg{\
+    replication_synchro_quorum=orig_synchro_quorum,\
+    replication_synchro_timeout=orig_synchro_timeout\
+}
+ | ---
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+test_run:cleanup_cluster()
+ | ---
+ | ...
+box.schema.user.revoke('guest', 'replication')
+ | ---
+ | ...
diff --git a/test/replication/gh-5566-final-join-synchro.test.lua b/test/replication/gh-5566-final-join-synchro.test.lua
new file mode 100644
index 000000000..14302f6e6
--- /dev/null
+++ b/test/replication/gh-5566-final-join-synchro.test.lua
@@ -0,0 +1,61 @@
+test_run = require('test_run').new()
+
+--
+-- gh-5566 replica tolerates synchronous transactions in final join stream.
+--
+_ = box.schema.space.create('sync', {is_sync=true})
+_ = box.space.sync:create_index('pk')
+
+box.schema.user.grant('guest', 'replication')
+box.schema.user.grant('guest', 'write', 'space', 'sync')
+
+-- Part 1. Make sure a joining instance tolerates synchronous rows in final join
+-- stream.
+trig = function()\
+    box.space.sync:replace{1}\
+end
+-- The trigger will generate synchronous rows each time a replica joins.
+_ = box.space._cluster:on_replace(trig)
+
+orig_synchro_quorum = box.cfg.replication_synchro_quorum
+box.cfg{replication_synchro_quorum=1}
+
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+test_run:cmd('start server replica')
+
+test_run:switch('replica')
+test_run:wait_upstream(1, {status='follow'})
+
+test_run:switch('default')
+test_run:cmd('stop server replica')
+test_run:cmd('delete server replica')
+
+-- Part 2. Make sure master aborts final join if insert to _cluster is rolled
+-- back and replica is capable of retrying it.
+orig_synchro_timeout = box.cfg.replication_synchro_timeout
+-- Make the trigger we used above fail with no quorum.
+box.cfg{replication_synchro_quorum=2, replication_synchro_timeout=0.01}
+-- Try to join the replica once again.
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+test_run:cmd('start server replica with wait=False')
+
+test_run:wait_log('replica', 'ER_SYNC_QUORUM_TIMEOUT', nil, 10)
+-- Remove the trigger to let the replica connect.
+box.space._cluster:on_replace(nil, trig)
+
+test_run:switch('replica')
+test_run:wait_upstream(1, {status='follow'})
+
+-- Cleanup.
+test_run:switch('default')
+test_run:cmd('stop server replica')
+test_run:cmd('delete server replica')
+box.cfg{\
+    replication_synchro_quorum=orig_synchro_quorum,\
+    replication_synchro_timeout=orig_synchro_timeout\
+}
+box.space.sync:drop()
+test_run:cleanup_cluster()
+box.schema.user.revoke('guest', 'replication')
diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
index 7e7004592..04a3c4bb2 100644
--- a/test/replication/suite.cfg
+++ b/test/replication/suite.cfg
@@ -38,6 +38,7 @@
     "gh-5440-qsync-ro.test.lua": {},
     "gh-5435-qsync-clear-synchro-queue-commit-all.test.lua": {},
     "gh-5536-wal-limit.test.lua": {},
+    "gh-5566-final-join-synchro.test.lua": {},
     "*": {
         "memtx": {"engine": "memtx"},
         "vinyl": {"engine": "vinyl"}
-- 
2.24.3 (Apple Git-128)