From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: kostja@tarantool.org
Cc: tarantool-patches@freelists.org
Subject: [PATCH v2 06/11] box: factor out local recovery function
Date: Fri, 8 Jun 2018 20:34:24 +0300
Message-ID: <75f728c07f3a850e22228880e97bf39d882b4bdf.1528478913.git.vdavydov.dev@gmail.com>
In-Reply-To: <cover.1528478913.git.vdavydov.dev@gmail.com>

- Factor out local_recovery() from box_cfg_xc(). Make it set up
  replication and handle the local recovery and hot standby cases.
- Move replication setup in the initial bootstrap case from
  box_cfg_xc() to bootstrap() to make bootstrap() consistent with
  local_recovery().
- Move initial snapshot creation from bootstrap() to bootstrap_master()
  and bootstrap_from_master().

Needed for #461
---
 src/box/box.cc | 280 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 151 insertions(+), 129 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index 3f0c1176..922e8604 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1644,6 +1644,11 @@ bootstrap_master(const struct tt_uuid *replicaset_uuid)
 
 	/* Set UUID of a new replica set */
 	box_set_replicaset_uuid(replicaset_uuid);
+
+	/* Make the initial checkpoint */
+	if (engine_begin_checkpoint() ||
+	    engine_commit_checkpoint(&replicaset.vclock))
+		panic("failed to create a checkpoint");
 }
 
 /**
@@ -1698,6 +1703,11 @@ bootstrap_from_master(struct replica *master)
 	/* Switch applier to initial state */
 	applier_resume_to_state(applier, APPLIER_READY, TIMEOUT_INFINITY);
 	assert(applier->state == APPLIER_READY);
+
+	/* Make the initial checkpoint */
+	if (engine_begin_checkpoint() ||
+	    engine_commit_checkpoint(&replicaset.vclock))
+		panic("failed to create a checkpoint");
 }
 
 /**
@@ -1708,8 +1718,31 @@ bootstrap_from_master(struct replica *master)
  * the leader of a new cluster
  */
 static void
-bootstrap(const struct tt_uuid *replicaset_uuid, bool *is_bootstrap_leader)
-{
+bootstrap(const struct tt_uuid *instance_uuid,
+	  const struct tt_uuid *replicaset_uuid,
+	  bool *is_bootstrap_leader)
+{
+	/* Initialize instance UUID. */
+	assert(tt_uuid_is_nil(&INSTANCE_UUID));
+	if (!tt_uuid_is_nil(instance_uuid))
+		INSTANCE_UUID = *instance_uuid;
+	else
+		tt_uuid_create(&INSTANCE_UUID);
+	/*
+	 * Begin listening on the socket to enable
+	 * master-master replication leader election.
+	 */
+	box_listen();
+	/*
+	 * Wait for the cluster to start up.
+	 *
+	 * Note, when bootstrapping a new instance, we have to
+	 * connect to all masters to make sure all replicas
+	 * receive the same replica set UUID when a new cluster
+	 * is deployed.
+	 */
+	box_sync_replication(TIMEOUT_INFINITY, true);
+
 	/* Use the first replica by URI as a bootstrap leader */
 	struct replica *master = replicaset_leader();
 	assert(master == NULL || master->applier != NULL);
@@ -1727,9 +1760,117 @@ bootstrap(const struct tt_uuid *replicaset_uuid, bool *is_bootstrap_leader)
 		bootstrap_master(replicaset_uuid);
 		*is_bootstrap_leader = true;
 	}
-	if (engine_begin_checkpoint() ||
-	    engine_commit_checkpoint(&replicaset.vclock))
-		panic("failed to create a checkpoint");
+}
+
+/**
+ * Recover the instance from the local directory.
+ * Enter hot standby if the directory is locked.
+ */
+static void
+local_recovery(const struct tt_uuid *instance_uuid,
+	       const struct tt_uuid *replicaset_uuid,
+	       const struct vclock *checkpoint_vclock)
+{
+	/* Check instance UUID. */
+	assert(!tt_uuid_is_nil(&INSTANCE_UUID));
+	if (!tt_uuid_is_nil(instance_uuid) &&
+	    !tt_uuid_is_equal(instance_uuid, &INSTANCE_UUID)) {
+		tnt_raise(ClientError, ER_INSTANCE_UUID_MISMATCH,
+			  tt_uuid_str(instance_uuid),
+			  tt_uuid_str(&INSTANCE_UUID));
+	}
+
+	struct wal_stream wal_stream;
+	wal_stream_create(&wal_stream, cfg_geti64("rows_per_wal"));
+
+	struct recovery *recovery;
+	recovery = recovery_new(cfg_gets("wal_dir"),
+				cfg_geti("force_recovery"),
+				checkpoint_vclock);
+	auto guard = make_scoped_guard([=]{ recovery_delete(recovery); });
+
+	/*
+	 * Initialize the replica set vclock from recovery.
+	 * The local WAL may contain rows from remote masters,
+	 * so we must reflect this in replicaset vclock to
+	 * not attempt to apply these rows twice.
+	 */
+	recovery_end_vclock(recovery, &replicaset.vclock);
+
+	if (wal_dir_lock >= 0) {
+		box_listen();
+		box_sync_replication(replication_connect_timeout, false);
+	}
+
+	/*
+	 * recovery->vclock is needed by Vinyl to filter
+	 * WAL rows that were dumped before restart.
+	 *
+	 * XXX: Passing an internal member of the recovery
+	 * object to an engine is an ugly hack. Instead we
+	 * should introduce space_vtab::apply_wal_row method
+	 * and explicitly pass the statement LSN to it.
+	 */
+	engine_begin_initial_recovery_xc(&recovery->vclock);
+
+	struct memtx_engine *memtx;
+	memtx = (struct memtx_engine *)engine_by_name("memtx");
+	assert(memtx != NULL);
+
+	struct recovery_journal journal;
+	recovery_journal_create(&journal, &recovery->vclock);
+	journal_set(&journal.base);
+
+	/*
+	 * We explicitly request memtx to recover its
+	 * snapshot as a separate phase since it contains
+	 * data for system spaces, and triggers on
+	 * recovery of system spaces issue DDL events in
+	 * other engines.
+	 */
+	memtx_engine_recover_snapshot_xc(memtx, checkpoint_vclock);
+
+	engine_begin_final_recovery_xc();
+	recover_remaining_wals(recovery, &wal_stream.base, NULL, true);
+	/*
+	 * Leave hot standby mode, if any, only after
+	 * acquiring the lock.
+	 */
+	if (wal_dir_lock < 0) {
+		title("hot_standby");
+		say_info("Entering hot standby mode");
+		recovery_follow_local(recovery, &wal_stream.base, "hot_standby",
+				      cfg_getd("wal_dir_rescan_delay"));
+		while (true) {
+			if (path_lock(cfg_gets("wal_dir"), &wal_dir_lock))
+				diag_raise();
+			if (wal_dir_lock >= 0)
+				break;
+			fiber_sleep(0.1);
+		}
+		recovery_stop_local(recovery);
+		recover_remaining_wals(recovery, &wal_stream.base, NULL, true);
+		/*
+		 * Advance replica set vclock to reflect records
+		 * applied in hot standby mode.
+		 */
+		vclock_copy(&replicaset.vclock, &recovery->vclock);
+		box_listen();
+		box_sync_replication(replication_connect_timeout, false);
+	}
+	recovery_finalize(recovery);
+	engine_end_recovery_xc();
+
+	/* Check replica set UUID. */
+	if (!tt_uuid_is_nil(replicaset_uuid) &&
+	    !tt_uuid_is_equal(replicaset_uuid, &REPLICASET_UUID)) {
+		tnt_raise(ClientError, ER_REPLICASET_UUID_MISMATCH,
+			  tt_uuid_str(replicaset_uuid),
+			  tt_uuid_str(&REPLICASET_UUID));
+	}
+
+	/* Clear the pointer to journal before it goes out of scope */
+	journal_set(NULL);
 }
 
 static void
@@ -1826,132 +1967,13 @@ box_cfg_xc(void)
 	}
 	bool is_bootstrap_leader = false;
 	if (last_checkpoint_lsn >= 0) {
-		/* Check instance UUID. */
-		assert(!tt_uuid_is_nil(&INSTANCE_UUID));
-		if (!tt_uuid_is_nil(&instance_uuid) &&
-		    !tt_uuid_is_equal(&instance_uuid, &INSTANCE_UUID)) {
-			tnt_raise(ClientError, ER_INSTANCE_UUID_MISMATCH,
-				  tt_uuid_str(&instance_uuid),
-				  tt_uuid_str(&INSTANCE_UUID));
-		}
-
-		struct wal_stream wal_stream;
-		wal_stream_create(&wal_stream, cfg_geti64("rows_per_wal"));
-
-		struct recovery *recovery;
-		recovery = recovery_new(cfg_gets("wal_dir"),
-					cfg_geti("force_recovery"),
-					&last_checkpoint_vclock);
-		auto guard = make_scoped_guard([=]{ recovery_delete(recovery); });
-
-		/*
-		 * Initialize the replica set vclock from recovery.
-		 * The local WAL may contain rows from remote masters,
-		 * so we must reflect this in replicaset vclock to
-		 * not attempt to apply these rows twice.
-		 */
-		recovery_end_vclock(recovery, &replicaset.vclock);
-
-		if (wal_dir_lock >= 0) {
-			box_listen();
-			box_sync_replication(replication_connect_timeout, false);
-		}
-
-		/*
-		 * recovery->vclock is needed by Vinyl to filter
-		 * WAL rows that were dumped before restart.
-		 *
-		 * XXX: Passing an internal member of the recovery
-		 * object to an engine is an ugly hack. Instead we
-		 * should introduce Engine::applyWALRow method and
-		 * explicitly pass the statement LSN to it.
-		 */
-		engine_begin_initial_recovery_xc(&recovery->vclock);
-
-		struct memtx_engine *memtx;
-		memtx = (struct memtx_engine *)engine_by_name("memtx");
-		assert(memtx != NULL);
-
-		struct recovery_journal journal;
-		recovery_journal_create(&journal, &recovery->vclock);
-		journal_set(&journal.base);
-
-		/**
-		 * We explicitly request memtx to recover its
-		 * snapshot as a separate phase since it contains
-		 * data for system spaces, and triggers on
-		 * recovery of system spaces issue DDL events in
-		 * other engines.
-		 */
-		memtx_engine_recover_snapshot_xc(memtx,
-						 &last_checkpoint_vclock);
-
-		engine_begin_final_recovery_xc();
-		recover_remaining_wals(recovery, &wal_stream.base, NULL, true);
-		/*
-		 * Leave hot standby mode, if any, only
-		 * after acquiring the lock.
-		 */
-		if (wal_dir_lock < 0) {
-			title("hot_standby");
-			say_info("Entering hot standby mode");
-			recovery_follow_local(recovery, &wal_stream.base,
-					      "hot_standby",
-					      cfg_getd("wal_dir_rescan_delay"));
-			while (true) {
-				if (path_lock(cfg_gets("wal_dir"),
-					      &wal_dir_lock))
-					diag_raise();
-				if (wal_dir_lock >= 0)
-					break;
-				fiber_sleep(0.1);
-			}
-			recovery_stop_local(recovery);
-			recover_remaining_wals(recovery, &wal_stream.base,
-					       NULL, true);
-			/*
-			 * Advance replica set vclock to reflect records
-			 * applied in hot standby mode.
-			 */
-			vclock_copy(&replicaset.vclock, &recovery->vclock);
-			box_listen();
-			box_sync_replication(replication_connect_timeout, false);
-		}
-		recovery_finalize(recovery);
-		engine_end_recovery_xc();
-
-		/* Check replica set UUID. */
-		if (!tt_uuid_is_nil(&replicaset_uuid) &&
-		    !tt_uuid_is_equal(&replicaset_uuid, &REPLICASET_UUID)) {
-			tnt_raise(ClientError, ER_REPLICASET_UUID_MISMATCH,
-				  tt_uuid_str(&replicaset_uuid),
-				  tt_uuid_str(&REPLICASET_UUID));
-		}
-
-		/* Clear the pointer to journal before it goes out of scope */
-		journal_set(NULL);
+		/* Recover the instance from the local directory */
+		local_recovery(&instance_uuid, &replicaset_uuid,
+			       &last_checkpoint_vclock);
 	} else {
-		if (!tt_uuid_is_nil(&instance_uuid))
-			INSTANCE_UUID = instance_uuid;
-		else
-			tt_uuid_create(&INSTANCE_UUID);
-		/*
-		 * Begin listening on the socket to enable
-		 * master-master replication leader election.
-		 */
-		box_listen();
-
-		/*
-		 * Wait for the cluster to start up.
-		 *
-		 * Note, when bootstrapping a new instance, we have to
-		 * connect to all masters to make sure all replicas
-		 * receive the same replica set UUID when a new cluster
-		 * is deployed.
-		 */
-		box_sync_replication(TIMEOUT_INFINITY, true);
 		/* Bootstrap a new master */
-		bootstrap(&replicaset_uuid, &is_bootstrap_leader);
+		bootstrap(&instance_uuid, &replicaset_uuid,
+			  &is_bootstrap_leader);
 	}
 	fiber_gc();
-- 
2.11.0
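
For reviewers: after this patch, the startup sequence in box_cfg_xc()
reduces to a plain dispatch between the two recovery paths. A simplified
sketch, condensed from the last hunk above (the surrounding configuration
checks and the final fiber_gc() call are omitted):

	bool is_bootstrap_leader = false;
	if (last_checkpoint_lsn >= 0) {
		/* A checkpoint exists: replay it and the local WALs. */
		local_recovery(&instance_uuid, &replicaset_uuid,
			       &last_checkpoint_vclock);
	} else {
		/* No local data: bootstrap a new instance. */
		bootstrap(&instance_uuid, &replicaset_uuid,
			  &is_bootstrap_leader);
	}

Each path is now responsible for calling box_listen() and
box_sync_replication() itself, and the initial checkpoint is created
inside bootstrap_master() and bootstrap_from_master() rather than at the
end of bootstrap().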