Tarantool development patches archive
 help / color / mirror / Atom feed
From: Serge Petrenko via Tarantool-patches <tarantool-patches@dev.tarantool.org>
To: v.shpilevoy@tarantool.org, gorcunov@gmail.com
Cc: tarantool-patches@dev.tarantool.org
Subject: [Tarantool-patches] [PATCH] replication: send only confirmed data during final join
Date: Wed, 10 Mar 2021 16:48:24 +0300	[thread overview]
Message-ID: <20210310134824.33794-1-sergepetrenko@tarantool.org> (raw)

Make relay wait for confirmation of synchronous transactions once final
join stage is entered.
Also let applier silently ignore CONFIRM rows coming during final join.
This is done because everything coming during final join ends up in
replica's initial snapshot, so it must be confirmed.

Closes #5566
---
https://github.com/tarantool/tarantool/issues/5566
https://github.com/tarantool/tarantool/tree/sp/gh-5566-final-join-synchro

 changelogs/unreleased/synchro-final-join.md   |   4 +
 src/box/applier.cc                            |   5 +-
 src/box/relay.cc                              |   4 +
 .../gh-5566-final-join-synchro.result         | 139 ++++++++++++++++++
 .../gh-5566-final-join-synchro.test.lua       |  61 ++++++++
 test/replication/suite.cfg                    |   1 +
 6 files changed, 213 insertions(+), 1 deletion(-)
 create mode 100644 changelogs/unreleased/synchro-final-join.md
 create mode 100644 test/replication/gh-5566-final-join-synchro.result
 create mode 100644 test/replication/gh-5566-final-join-synchro.test.lua

diff --git a/changelogs/unreleased/synchro-final-join.md b/changelogs/unreleased/synchro-final-join.md
new file mode 100644
index 000000000..cef77df87
--- /dev/null
+++ b/changelogs/unreleased/synchro-final-join.md
@@ -0,0 +1,4 @@
+## bugfix/core
+
+* Fix a bug in applier erroring with `Unknown request type 40` during final join
+  when master has synchronous spaces (gh-5566).
diff --git a/src/box/applier.cc b/src/box/applier.cc
index 8d7ce5d99..2af7693a0 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -109,6 +109,8 @@ applier_log_error(struct applier *applier, struct error *e)
 	case ER_PASSWORD_MISMATCH:
 	case ER_XLOG_GAP:
 	case ER_TOO_EARLY_SUBSCRIBE:
+	case ER_SYNC_QUORUM_TIMEOUT:
+	case ER_SYNC_ROLLBACK:
 		say_info("will retry every %.2lf second",
 			 replication_reconnect_interval());
 		break;
@@ -526,7 +528,8 @@ applier_wait_register(struct applier *applier, uint64_t row_count)
 	while (true) {
 		coio_read_xrow(coio, ibuf, &row);
 		applier->last_row_time = ev_monotonic_now(loop());
-		if (iproto_type_is_dml(row.type)) {
+		if (iproto_type_is_dml(row.type) ||
+		    row.type == IPROTO_CONFIRM) {
 			vclock_follow_xrow(&replicaset.vclock, &row);
 			if (apply_final_join_row(&row) != 0)
 				diag_raise();
diff --git a/src/box/relay.cc b/src/box/relay.cc
index 186f77ed4..5699b8a63 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -381,6 +381,10 @@ relay_final_join(int fd, uint64_t sync, struct vclock *start_vclock,
 		relay_delete(relay);
 	});
 
+	/* \sa relay_initial_join(). */
+	if (txn_limbo_wait_confirm(&txn_limbo) != 0)
+		diag_raise();
+
 	relay->r = recovery_new(wal_dir(), false, start_vclock);
 	vclock_copy(&relay->stop_vclock, stop_vclock);
 
diff --git a/test/replication/gh-5566-final-join-synchro.result b/test/replication/gh-5566-final-join-synchro.result
new file mode 100644
index 000000000..a386de2a4
--- /dev/null
+++ b/test/replication/gh-5566-final-join-synchro.result
@@ -0,0 +1,139 @@
+-- test-run result file version 2
+test_run = require('test_run').new()
+ | ---
+ | ...
+
+--
+-- gh-5566 only confirmed synchronous rows are sent during final join.
+--
+_ = box.schema.space.create('sync', {is_sync=true})
+ | ---
+ | ...
+_ = box.space.sync:create_index('pk')
+ | ---
+ | ...
+
+box.schema.user.grant('guest', 'replication')
+ | ---
+ | ...
+box.schema.user.grant('guest', 'write', 'space', 'sync')
+ | ---
+ | ...
+
+-- Part 1. Make sure a joining instance tolerates synchronous rows in final join
+-- stream.
+trig = function()\
+    box.space.sync:replace{1}\
+end
+ | ---
+ | ...
+-- The trigger will generate synchronous rows each time a replica joins.
+_ = box.space._cluster:on_replace(trig)
+ | ---
+ | ...
+
+orig_synchro_quorum = box.cfg.replication_synchro_quorum
+ | ---
+ | ...
+box.cfg{replication_synchro_quorum=1}
+ | ---
+ | ...
+
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica')
+ | ---
+ | - true
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+test_run:wait_upstream(1, {status='follow'})
+ | ---
+ | - true
+ | ...
+
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
+
+-- Part 2. Make sure master aborts final join once it faces a synchronous
+-- rollback.
+orig_synchro_timeout = box.cfg.replication_synchro_timeout
+ | ---
+ | ...
+-- Make the trigger we used above fail with no quorum.
+box.cfg{replication_synchro_quorum=2, replication_synchro_timeout=0.01}
+ | ---
+ | ...
+-- Try to join the replica once again.
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+ | ---
+ | - true
+ | ...
+test_run:cmd('start server replica with wait=False')
+ | ---
+ | - true
+ | ...
+
+test_run:wait_log('replica', 'ER_SYNC_QUORUM_TIMEOUT', nil, 10)
+ | ---
+ | - ER_SYNC_QUORUM_TIMEOUT
+ | ...
+-- Remove the trigger to let the replica connect.
+box.space._cluster:on_replace(nil, trig)
+ | ---
+ | ...
+
+test_run:switch('replica')
+ | ---
+ | - true
+ | ...
+test_run:wait_upstream(1, {status='follow'})
+ | ---
+ | - true
+ | ...
+
+-- Cleanup.
+test_run:switch('default')
+ | ---
+ | - true
+ | ...
+test_run:cmd('stop server replica')
+ | ---
+ | - true
+ | ...
+test_run:cmd('delete server replica')
+ | ---
+ | - true
+ | ...
+box.cfg{\
+    replication_synchro_quorum=orig_synchro_quorum,\
+    replication_synchro_timeout=orig_synchro_timeout\
+}
+ | ---
+ | ...
+box.space.sync:drop()
+ | ---
+ | ...
+test_run:cleanup_cluster()
+ | ---
+ | ...
+box.schema.user.revoke('guest', 'replication')
+ | ---
+ | ...
diff --git a/test/replication/gh-5566-final-join-synchro.test.lua b/test/replication/gh-5566-final-join-synchro.test.lua
new file mode 100644
index 000000000..5b55943c9
--- /dev/null
+++ b/test/replication/gh-5566-final-join-synchro.test.lua
@@ -0,0 +1,61 @@
+test_run = require('test_run').new()
+
+--
+-- gh-5566 only confirmed synchronous rows are sent during final join.
+--
+_ = box.schema.space.create('sync', {is_sync=true})
+_ = box.space.sync:create_index('pk')
+
+box.schema.user.grant('guest', 'replication')
+box.schema.user.grant('guest', 'write', 'space', 'sync')
+
+-- Part 1. Make sure a joining instance tolerates synchronous rows in final join
+-- stream.
+trig = function()\
+    box.space.sync:replace{1}\
+end
+-- The trigger will generate synchronous rows each time a replica joins.
+_ = box.space._cluster:on_replace(trig)
+
+orig_synchro_quorum = box.cfg.replication_synchro_quorum
+box.cfg{replication_synchro_quorum=1}
+
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+test_run:cmd('start server replica')
+
+test_run:switch('replica')
+test_run:wait_upstream(1, {status='follow'})
+
+test_run:switch('default')
+test_run:cmd('stop server replica')
+test_run:cmd('delete server replica')
+
+-- Part 2. Make sure master aborts final join once it faces a synchronous
+-- rollback.
+orig_synchro_timeout = box.cfg.replication_synchro_timeout
+-- Make the trigger we used above fail with no quorum.
+box.cfg{replication_synchro_quorum=2, replication_synchro_timeout=0.01}
+-- Try to join the replica once again.
+test_run:cmd('create server replica with rpl_master=default,\
+                                         script="replication/replica.lua"')
+test_run:cmd('start server replica with wait=False')
+
+test_run:wait_log('replica', 'ER_SYNC_QUORUM_TIMEOUT', nil, 10)
+-- Remove the trigger to let the replica connect.
+box.space._cluster:on_replace(nil, trig)
+
+test_run:switch('replica')
+test_run:wait_upstream(1, {status='follow'})
+
+-- Cleanup.
+test_run:switch('default')
+test_run:cmd('stop server replica')
+test_run:cmd('delete server replica')
+box.cfg{\
+    replication_synchro_quorum=orig_synchro_quorum,\
+    replication_synchro_timeout=orig_synchro_timeout\
+}
+box.space.sync:drop()
+test_run:cleanup_cluster()
+box.schema.user.revoke('guest', 'replication')
diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg
index c80430afc..5754c0531 100644
--- a/test/replication/suite.cfg
+++ b/test/replication/suite.cfg
@@ -37,6 +37,7 @@
     "gh-4928-tx-boundaries.test.lua": {},
     "gh-5440-qsync-ro.test.lua": {},
     "gh-5435-qsync-clear-synchro-queue-commit-all.test.lua": {},
+    "gh-5566-final-join-synchro.test.lua": {},
     "*": {
         "memtx": {"engine": "memtx"},
         "vinyl": {"engine": "vinyl"}
-- 
2.24.3 (Apple Git-128)


             reply	other threads:[~2021-03-10 13:48 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-03-10 13:48 Serge Petrenko via Tarantool-patches [this message]
2021-03-10 19:12 ` Vladislav Shpilevoy via Tarantool-patches

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210310134824.33794-1-sergepetrenko@tarantool.org \
    --to=tarantool-patches@dev.tarantool.org \
    --cc=gorcunov@gmail.com \
    --cc=sergepetrenko@tarantool.org \
    --cc=v.shpilevoy@tarantool.org \
    --subject='Re: [Tarantool-patches] [PATCH] replication: send only confirmed data during final join' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox