Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: kostja@tarantool.org
Cc: tarantool-patches@freelists.org
Subject: [RFC PATCH 10/12] box: factor out local recovery function
Date: Wed,  6 Jun 2018 20:45:10 +0300	[thread overview]
Message-ID: <3e7bc40b285ca152cbbc2c9fdbd1a8a8b05c9318.1528305232.git.vdavydov.dev@gmail.com> (raw)
In-Reply-To: <cover.1528305232.git.vdavydov.dev@gmail.com>
In-Reply-To: <cover.1528305232.git.vdavydov.dev@gmail.com>

 - Factor out local_recovery() from box_cfg_xc(). Make it set up
   replication and handle local recovery and hot standby cases.
 - Move replication setup in case of initial bootstrap from box_cfg_xc()
   to bootstrap() to make bootstrap() consistent with local_recovery().
 - Move initial snapshot creation from bootstrap() to bootstrap_master()
   and bootstrap_from_master().

Needed for #461
---
 src/box/box.cc | 277 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 150 insertions(+), 127 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index 2ff9fb5f..9b2c2e2a 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1644,6 +1644,11 @@ bootstrap_master(const struct tt_uuid *replicaset_uuid)
 
 	/* Set UUID of a new replica set */
 	box_set_replicaset_uuid(replicaset_uuid);
+
+	/* Make the initial checkpoint */
+	if (engine_begin_checkpoint() ||
+	    engine_commit_checkpoint(&replicaset.vclock))
+		panic("failed to create a checkpoint");
 }
 
 /**
@@ -1698,6 +1703,11 @@ bootstrap_from_master(struct replica *master)
 	/* Switch applier to initial state */
 	applier_resume_to_state(applier, APPLIER_READY, TIMEOUT_INFINITY);
 	assert(applier->state == APPLIER_READY);
+
+	/* Make the initial checkpoint */
+	if (engine_begin_checkpoint() ||
+	    engine_commit_checkpoint(&replicaset.vclock))
+		panic("failed to create a checkpoint");
 }
 
 /**
@@ -1708,8 +1718,31 @@ bootstrap_from_master(struct replica *master)
  *                                  the leader of a new cluster
  */
 static void
-bootstrap(const struct tt_uuid *replicaset_uuid, bool *is_bootstrap_leader)
-{
+bootstrap(const struct tt_uuid *instance_uuid,
+	  const struct tt_uuid *replicaset_uuid,
+	  bool *is_bootstrap_leader)
+{
+	/* Initialize instance UUID. */
+	assert(tt_uuid_is_nil(&INSTANCE_UUID));
+	if (!tt_uuid_is_nil(instance_uuid))
+		INSTANCE_UUID = *instance_uuid;
+	else
+		tt_uuid_create(&INSTANCE_UUID);
+	/*
+	 * Begin listening on the socket to enable
+	 * master-master replication leader election.
+	 */
+	box_listen();
+	/*
+	 * Wait for the cluster to start up.
+	 *
+	 * Note, when bootstrapping a new instance, we have to
+	 * connect to all masters to make sure all replicas
+	 * receive the same replica set UUID when a new cluster
+	 * is deployed.
+	 */
+	box_sync_replication(TIMEOUT_INFINITY, true);
+
 	/* Use the first replica by URI as a bootstrap leader */
 	struct replica *master = replicaset_leader();
 	assert(master == NULL || master->applier != NULL);
@@ -1727,9 +1760,116 @@ bootstrap(const struct tt_uuid *replicaset_uuid, bool *is_bootstrap_leader)
 		bootstrap_master(replicaset_uuid);
 		*is_bootstrap_leader = true;
 	}
-	if (engine_begin_checkpoint() ||
-	    engine_commit_checkpoint(&replicaset.vclock))
-		panic("failed to create a checkpoint");
+}
+
+/**
+ * Recover the instance from the local directory.
+ * Enter hot standby if the directory is locked.
+ */
+static void
+local_recovery(const struct tt_uuid *instance_uuid,
+	       const struct tt_uuid *replicaset_uuid,
+	       const struct vclock *checkpoint_vclock)
+{
+	/* Check instance UUID. */
+	assert(!tt_uuid_is_nil(&INSTANCE_UUID));
+	if (!tt_uuid_is_nil(instance_uuid) &&
+	    !tt_uuid_is_equal(instance_uuid, &INSTANCE_UUID)) {
+		tnt_raise(ClientError, ER_INSTANCE_UUID_MISMATCH,
+			  tt_uuid_str(instance_uuid),
+			  tt_uuid_str(&INSTANCE_UUID));
+	}
+
+	struct wal_stream wal_stream;
+	wal_stream_create(&wal_stream, cfg_geti64("rows_per_wal"));
+
+	struct recovery *recovery;
+	recovery = recovery_new(cfg_gets("wal_dir"),
+				cfg_geti("force_recovery"),
+				checkpoint_vclock);
+	auto guard = make_scoped_guard([=]{ recovery_delete(recovery); });
+
+	/*
+	 * Initialize the replica set vclock from recovery.
+	 * The local WAL may contain rows from remote masters,
+	 * so we must reflect this in replicaset vclock to
+	 * not attempt to apply these rows twice.
+	 */
+	recovery_end_vclock(recovery, &replicaset.vclock);
+
+	if (wal_dir_lock >= 0) {
+		box_listen();
+		box_sync_replication(replication_connect_timeout, false);
+	}
+
+	/*
+	 * recovery->vclock is needed by Vinyl to filter
+	 * WAL rows that were dumped before restart.
+	 *
+	 * XXX: Passing an internal member of the recovery
+	 * object to an engine is an ugly hack. Instead we
+	 * should introduce space_vtab::apply_wal_row method
+	 * and explicitly pass the statement LSN to it.
+	 */
+	engine_begin_initial_recovery_xc(&recovery->vclock);
+
+	struct memtx_engine *memtx;
+	memtx = (struct memtx_engine *)engine_by_name("memtx");
+	assert(memtx != NULL);
+
+	struct recovery_journal journal;
+	recovery_journal_create(&journal, &recovery->vclock);
+	journal_set(&journal.base);
+
+	/*
+	 * We explicitly request memtx to recover its
+	 * snapshot as a separate phase since it contains
+	 * data for system spaces, and triggers on
+	 * recovery of system spaces issue DDL events in
+	 * other engines.
+	 */
+	memtx_engine_recover_snapshot_xc(memtx, checkpoint_vclock);
+
+	engine_begin_final_recovery_xc();
+	recover_remaining_wals(recovery, &wal_stream.base, NULL, true);
+	/*
+	 * Leave hot standby mode, if any, only after
+	 * acquiring the lock.
+	 */
+	if (wal_dir_lock < 0) {
+		title("hot_standby");
+		say_info("Entering hot standby mode");
+		recovery_follow_local(recovery, &wal_stream.base, "hot_standby",
+				      cfg_getd("wal_dir_rescan_delay"));
+		while (true) {
+			if (path_lock(cfg_gets("wal_dir"), &wal_dir_lock))
+				diag_raise();
+			if (wal_dir_lock >= 0)
+				break;
+			fiber_sleep(0.1);
+		}
+		recovery_stop_local(recovery);
+		/*
+		 * Advance replica set vclock to reflect records
+		 * applied in hot standby mode.
+		 */
+		vclock_copy(&replicaset.vclock, &recovery->vclock);
+		box_listen();
+		box_sync_replication(replication_connect_timeout, false);
+	}
+	recovery_finalize(recovery);
+	engine_end_recovery_xc();
+
+	/* Check replica set UUID. */
+	if (!tt_uuid_is_nil(replicaset_uuid) &&
+	    !tt_uuid_is_equal(replicaset_uuid, &REPLICASET_UUID)) {
+		tnt_raise(ClientError, ER_REPLICASET_UUID_MISMATCH,
+			  tt_uuid_str(replicaset_uuid),
+			  tt_uuid_str(&REPLICASET_UUID));
+	}
+
+	/* Clear the pointer to journal before it goes out of scope */
+	journal_set(NULL);
 }
 
 static void
@@ -1826,130 +1966,13 @@ box_cfg_xc(void)
 	}
 	bool is_bootstrap_leader = false;
 	if (last_checkpoint_lsn >= 0) {
-		/* Check instance UUID. */
-		assert(!tt_uuid_is_nil(&INSTANCE_UUID));
-		if (!tt_uuid_is_nil(&instance_uuid) &&
-		    !tt_uuid_is_equal(&instance_uuid, &INSTANCE_UUID)) {
-			tnt_raise(ClientError, ER_INSTANCE_UUID_MISMATCH,
-				  tt_uuid_str(&instance_uuid),
-				  tt_uuid_str(&INSTANCE_UUID));
-		}
-
-		struct wal_stream wal_stream;
-		wal_stream_create(&wal_stream, cfg_geti64("rows_per_wal"));
-
-		struct recovery *recovery;
-		recovery = recovery_new(cfg_gets("wal_dir"),
-					cfg_geti("force_recovery"),
-					&last_checkpoint_vclock);
-		auto guard = make_scoped_guard([=]{ recovery_delete(recovery); });
-
-		/*
-		 * Initialize the replica set vclock from recovery.
-		 * The local WAL may contain rows from remote masters,
-		 * so we must reflect this in replicaset vclock to
-		 * not attempt to apply these rows twice.
-		 */
-		recovery_end_vclock(recovery, &replicaset.vclock);
-
-		if (wal_dir_lock >= 0) {
-			box_listen();
-			box_sync_replication(replication_connect_timeout, false);
-		}
-
-		/*
-		 * recovery->vclock is needed by Vinyl to filter
-		 * WAL rows that were dumped before restart.
-		 *
-		 * XXX: Passing an internal member of the recovery
-		 * object to an engine is an ugly hack. Instead we
-		 * should introduce Engine::applyWALRow method and
-		 * explicitly pass the statement LSN to it.
-		 */
-		engine_begin_initial_recovery_xc(&recovery->vclock);
-
-		struct memtx_engine *memtx;
-		memtx = (struct memtx_engine *)engine_by_name("memtx");
-		assert(memtx != NULL);
-
-		struct recovery_journal journal;
-		recovery_journal_create(&journal, &recovery->vclock);
-		journal_set(&journal.base);
-
-		/**
-		 * We explicitly request memtx to recover its
-		 * snapshot as a separate phase since it contains
-		 * data for system spaces, and triggers on
-		 * recovery of system spaces issue DDL events in
-		 * other engines.
-		 */
-		memtx_engine_recover_snapshot_xc(memtx,
-				&last_checkpoint_vclock);
-
-		engine_begin_final_recovery_xc();
-		recover_remaining_wals(recovery, &wal_stream.base, NULL, true);
-		/*
-		 * Leave hot standby mode, if any, only
-		 * after acquiring the lock.
-		 */
-		if (wal_dir_lock < 0) {
-			title("hot_standby");
-			say_info("Entering hot standby mode");
-			recovery_follow_local(recovery, &wal_stream.base,
-					      "hot_standby",
-					      cfg_getd("wal_dir_rescan_delay"));
-			while (true) {
-				if (path_lock(cfg_gets("wal_dir"),
-					      &wal_dir_lock))
-					diag_raise();
-				if (wal_dir_lock >= 0)
-					break;
-				fiber_sleep(0.1);
-			}
-			recovery_stop_local(recovery);
-			/*
-			 * Advance replica set vclock to reflect records
-			 * applied in hot standby mode.
-			 */
-			vclock_copy(&replicaset.vclock, &recovery->vclock);
-			box_listen();
-			box_sync_replication(replication_connect_timeout, false);
-		}
-		recovery_finalize(recovery);
-		engine_end_recovery_xc();
-
-		/* Check replica set UUID. */
-		if (!tt_uuid_is_nil(&replicaset_uuid) &&
-		    !tt_uuid_is_equal(&replicaset_uuid, &REPLICASET_UUID)) {
-			tnt_raise(ClientError, ER_REPLICASET_UUID_MISMATCH,
-				  tt_uuid_str(&replicaset_uuid),
-				  tt_uuid_str(&REPLICASET_UUID));
-		}
-
-		/* Clear the pointer to journal before it goes out of scope */
-		journal_set(NULL);
+		/* Recover the instance from the local directory */
+		local_recovery(&instance_uuid, &replicaset_uuid,
+			       &last_checkpoint_vclock);
 	} else {
-		if (!tt_uuid_is_nil(&instance_uuid))
-			INSTANCE_UUID = instance_uuid;
-		else
-			tt_uuid_create(&INSTANCE_UUID);
-		/*
-		 * Begin listening on the socket to enable
-		 * master-master replication leader election.
-		 */
-		box_listen();
-
-		/*
-		 * Wait for the cluster to start up.
-		 *
-		 * Note, when bootstrapping a new instance, we have to
-		 * connect to all masters to make sure all replicas
-		 * receive the same replica set UUID when a new cluster
-		 * is deployed.
-		 */
-		box_sync_replication(TIMEOUT_INFINITY, true);
 		/* Bootstrap a new master */
-		bootstrap(&replicaset_uuid, &is_bootstrap_leader);
+		bootstrap(&instance_uuid, &replicaset_uuid,
+			  &is_bootstrap_leader);
 	}
 	fiber_gc();
 
-- 
2.11.0

  parent reply	other threads:[~2018-06-06 17:45 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-06-06 17:45 [RFC PATCH 00/12] Replica rejoin Vladimir Davydov
2018-06-06 17:45 ` [RFC PATCH 01/12] recovery: drop unused recovery_exit Vladimir Davydov
2018-06-08  4:13   ` Konstantin Osipov
2018-06-06 17:45 ` [RFC PATCH 02/12] recovery: constify vclock argument Vladimir Davydov
2018-06-08  4:14   ` Konstantin Osipov
2018-06-06 17:45 ` [RFC PATCH 03/12] applier: remove extra new line in log message printed on connect Vladimir Davydov
2018-06-08  4:15   ` Konstantin Osipov
2018-06-06 17:45 ` [RFC PATCH 04/12] xrow: add helper function for encoding vclock Vladimir Davydov
2018-06-08  4:16   ` Konstantin Osipov
2018-06-06 17:45 ` [RFC PATCH 05/12] box: retrieve instance uuid before starting local recovery Vladimir Davydov
2018-06-08  4:22   ` Konstantin Osipov
2018-06-06 17:45 ` [RFC PATCH 06/12] box: refactor hot standby recovery Vladimir Davydov
2018-06-08  4:40   ` Konstantin Osipov
2018-06-08  6:43     ` Vladimir Davydov
2018-06-08 13:15       ` Konstantin Osipov
2018-06-08 13:30         ` Vladimir Davydov
2018-06-06 17:45 ` [RFC PATCH 07/12] box: retrieve end vclock before starting local recovery Vladimir Davydov
2018-06-06 17:45 ` [RFC PATCH 08/12] box: open the port " Vladimir Davydov
2018-06-06 17:45 ` [RFC PATCH 09/12] box: connect to remote peers " Vladimir Davydov
2018-06-06 17:45 ` Vladimir Davydov [this message]
2018-06-06 17:45 ` [RFC PATCH 11/12] applier: inquire oldest vclock on connect Vladimir Davydov
2018-06-06 17:45 ` [RFC PATCH 12/12] replication: rebootstrap instance on startup if it fell behind Vladimir Davydov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=3e7bc40b285ca152cbbc2c9fdbd1a8a8b05c9318.1528305232.git.vdavydov.dev@gmail.com \
    --to=vdavydov.dev@gmail.com \
    --cc=kostja@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [RFC PATCH 10/12] box: factor out local recovery function' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox