Tarantool development patches archive
 help / color / mirror / Atom feed
From: Serge Petrenko <sergepetrenko@tarantool.org>
To: v.shpilevoy@tarantool.org, kostja.osipov@gmail.com,
	kirichenkoga@gmail.com
Cc: tarantool-patches@dev.tarantool.org
Subject: [Tarantool-patches] [PATCH] replication: fix rebootstrap in case the instance is listed in box.cfg.replication
Date: Fri, 28 Feb 2020 20:01:30 +0300	[thread overview]
Message-ID: <20200228170130.81713-1-sergepetrenko@tarantool.org> (raw)

When checking wheter rejoin is needed, replica loops through all the
instances in box.cfg.replication, which makes it believe that there is a
master holding files, needed by it, since it accounts itself just like
all other instances.
So make replica skip itself when finding an instance which holds files
needed by it, and determining whether rebootstrap is needed.

We already have a working test for the issue, it missed the issue due to
replica.lua replication settings. Fix replica.lua to optionally include
itself in box.cfg.replication, so that the corresponding test works
correctly.

Closes #4759
---
https://github.com/tarantool/tarantool/issues/4759
https://github.com/tarantool/tarantool/tree/sp/gh-4759-rebootstrap-fix

@ChangeLog
 - fix rebootstrap procedure not working in case replica itself
   is listed in `box.cfg.replication`

 src/box/replication.cc                   | 13 ++++++++++++-
 test/replication/replica.lua             | 11 ++++++++++-
 test/replication/replica_rejoin.result   | 12 ++++++------
 test/replication/replica_rejoin.test.lua | 12 ++++++------
 4 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/src/box/replication.cc b/src/box/replication.cc
index e7bfa22ab..01edc0fb2 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -768,8 +768,19 @@ replicaset_needs_rejoin(struct replica **master)
 	struct replica *leader = NULL;
 	replicaset_foreach(replica) {
 		struct applier *applier = replica->applier;
-		if (applier == NULL)
+		/*
+		 * The function is called right after
+		 * box_sync_replication(), which in turn calls
+		 * replicaset_connect(), which ensures that
+		 * appliers are either stopped (APPLIER OFF) or
+		 * connected.
+		 * Also ignore self, as self applier might not
+		 * have disconnected yet.
+		 */
+		if (applier == NULL || applier->state == APPLIER_OFF ||
+		    tt_uuid_is_equal(&replica->uuid, &INSTANCE_UUID))
 			continue;
+		assert(applier->state == APPLIER_CONNECTED);
 
 		const struct ballot *ballot = &applier->ballot;
 		if (vclock_compare(&ballot->gc_vclock,
diff --git a/test/replication/replica.lua b/test/replication/replica.lua
index 20ac064e1..1cd60e062 100644
--- a/test/replication/replica.lua
+++ b/test/replication/replica.lua
@@ -1,8 +1,17 @@
 #!/usr/bin/env tarantool
 
+repl_include_self = arg[1] and arg[1] == 'true' or false
+repl_list = nil
+
+if repl_include_self then
+    repl_list = {os.getenv("MASTER"), os.getenv("LISTEN")}
+else
+    repl_list = os.getenv("MASTER")
+end
+
 box.cfg({
     listen              = os.getenv("LISTEN"),
-    replication         = os.getenv("MASTER"),
+    replication         = repl_list,
     memtx_memory        = 107374182,
     replication_timeout = 0.1,
     replication_connect_timeout = 0.5,
diff --git a/test/replication/replica_rejoin.result b/test/replication/replica_rejoin.result
index f71292da1..b8ed79f14 100644
--- a/test/replication/replica_rejoin.result
+++ b/test/replication/replica_rejoin.result
@@ -37,7 +37,7 @@ test_run:cmd("create server replica with rpl_master=default, script='replication
 ---
 - true
 ...
-test_run:cmd("start server replica")
+test_run:cmd("start server replica with args='true'")
 ---
 - true
 ...
@@ -111,7 +111,7 @@ box.cfg{checkpoint_count = checkpoint_count}
 ...
 -- Restart the replica. Since xlogs have been removed,
 -- it is supposed to rejoin without changing id.
-test_run:cmd("start server replica")
+test_run:cmd("start server replica with args='true'")
 ---
 - true
 ...
@@ -158,7 +158,7 @@ box.space.test:select()
   - [30, 30]
 ...
 -- Check that restart works as usual.
-test_run:cmd("restart server replica")
+test_run:cmd("restart server replica with args='true'")
 box.info.replication[1].upstream.status == 'follow' or box.info
 ---
 - true
@@ -210,7 +210,7 @@ test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.
 box.cfg{checkpoint_count = checkpoint_count}
 ---
 ...
-test_run:cmd("start server replica")
+test_run:cmd("start server replica with args='true'")
 ---
 - true
 ...
@@ -252,7 +252,7 @@ test_run:cleanup_cluster()
 box.space.test:truncate()
 ---
 ...
-test_run:cmd("start server replica")
+test_run:cmd("start server replica with args='true'")
 ---
 - true
 ...
@@ -349,7 +349,7 @@ _ = test_run:wait_vclock('default', vclock)
 ---
 ...
 -- Restart the replica. It should successfully rebootstrap.
-test_run:cmd("restart server replica")
+test_run:cmd("restart server replica with args='true'")
 box.space.test:select()
 ---
 - - [1]
diff --git a/test/replication/replica_rejoin.test.lua b/test/replication/replica_rejoin.test.lua
index 22a91d8d7..25c0849ec 100644
--- a/test/replication/replica_rejoin.test.lua
+++ b/test/replication/replica_rejoin.test.lua
@@ -17,7 +17,7 @@ _ = box.space.test:insert{3}
 
 -- Join a replica, then stop it.
 test_run:cmd("create server replica with rpl_master=default, script='replication/replica.lua'")
-test_run:cmd("start server replica")
+test_run:cmd("start server replica with args='true'")
 test_run:cmd("switch replica")
 box.info.replication[1].upstream.status == 'follow' or box.info
 box.space.test:select()
@@ -45,7 +45,7 @@ box.cfg{checkpoint_count = checkpoint_count}
 
 -- Restart the replica. Since xlogs have been removed,
 -- it is supposed to rejoin without changing id.
-test_run:cmd("start server replica")
+test_run:cmd("start server replica with args='true'")
 box.info.replication[2].downstream.vclock ~= nil or box.info
 test_run:cmd("switch replica")
 box.info.replication[1].upstream.status == 'follow' or box.info
@@ -60,7 +60,7 @@ test_run:cmd("switch replica")
 box.space.test:select()
 
 -- Check that restart works as usual.
-test_run:cmd("restart server replica")
+test_run:cmd("restart server replica with args='true'")
 box.info.replication[1].upstream.status == 'follow' or box.info
 box.space.test:select()
 
@@ -78,7 +78,7 @@ for i = 1, 3 do box.space.test:insert{i * 100} end
 fio = require('fio')
 test_run:wait_cond(function() return #fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog')) == 1 end) or fio.pathjoin(box.cfg.wal_dir, '*.xlog')
 box.cfg{checkpoint_count = checkpoint_count}
-test_run:cmd("start server replica")
+test_run:cmd("start server replica with args='true'")
 test_run:cmd("switch replica")
 box.info.status -- orphan
 box.space.test:select()
@@ -94,7 +94,7 @@ test_run:cmd("stop server replica")
 test_run:cmd("cleanup server replica")
 test_run:cleanup_cluster()
 box.space.test:truncate()
-test_run:cmd("start server replica")
+test_run:cmd("start server replica with args='true'")
 -- Subscribe the master to the replica.
 replica_listen = test_run:cmd("eval replica 'return box.cfg.listen'")
 replica_listen ~= nil
@@ -128,7 +128,7 @@ for i = 1, 10 do box.space.test:replace{2} end
 vclock = test_run:get_vclock('replica')
 _ = test_run:wait_vclock('default', vclock)
 -- Restart the replica. It should successfully rebootstrap.
-test_run:cmd("restart server replica")
+test_run:cmd("restart server replica with args='true'")
 box.space.test:select()
 box.snapshot()
 box.space.test:replace{2}
-- 
2.21.1 (Apple Git-122.3)

             reply	other threads:[~2020-02-28 17:01 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-02-28 17:01 Serge Petrenko [this message]
2020-02-28 23:43 ` Vladislav Shpilevoy
2020-02-29  8:54   ` Konstantin Osipov
2020-02-29  9:52   ` Serge Petrenko
2020-02-29 14:19     ` Vladislav Shpilevoy
2020-03-02 17:18       ` Serge Petrenko
2020-03-02 21:26         ` Vladislav Shpilevoy
2020-03-03  8:25 ` Kirill Yukhin
2020-03-05  4:52   ` Kirill Yukhin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200228170130.81713-1-sergepetrenko@tarantool.org \
    --to=sergepetrenko@tarantool.org \
    --cc=kirichenkoga@gmail.com \
    --cc=kostja.osipov@gmail.com \
    --cc=tarantool-patches@dev.tarantool.org \
    --cc=v.shpilevoy@tarantool.org \
    --subject='Re: [Tarantool-patches] [PATCH] replication: fix rebootstrap in case the instance is listed in box.cfg.replication' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox