From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH v2 11/11] vinyl: implement rebootstrap support Date: Fri, 8 Jun 2018 20:34:29 +0300 Message-Id: <34b1716f6b4960ed2483b4ebf64b693e3b3002a9.1528478913.git.vdavydov.dev@gmail.com> In-Reply-To: References: In-Reply-To: References: To: kostja@tarantool.org Cc: tarantool-patches@freelists.org List-ID: If vy_log_bootstrap() finds a vylog file in the vinyl directory, it assumes the instance is being rebootstrapped and calls vy_log_rebootstrap(). The latter scans the old vylog file to find the max vinyl object id, from which it will start numbering objects created during rebootstrap to avoid conflicts with old objects, then it writes a VY_LOG_REBOOTSTRAP record to the old vylog to denote the beginning of a rebootstrap section. After that, initial join proceeds as usual, writing information about new objects to the old vylog file after the VY_LOG_REBOOTSTRAP marker. Upon successful rebootstrap completion, checkpoint, which is always called right after bootstrap, rotates the old vylog and marks all objects created before the VY_LOG_REBOOTSTRAP marker as dropped in the new vylog. The old objects will be purged by the garbage collector as usual. In case rebootstrap fails and checkpoint never happens, local recovery writes a VY_LOG_ABORT_REBOOTSTRAP record to the vylog. This marker indicates that the rebootstrap attempt failed and all objects created during rebootstrap should be discarded. They will be purged by the garbage collector on checkpoint. Thus even if rebootstrap fails, it is possible to recover the database to the state that existed right before a failed rebootstrap attempt. TODO: write a test checking that garbage collection works as expected. 
Closes #461 --- src/box/vy_log.c | 133 +++++++++++++++++++++++++++++-- src/box/vy_log.h | 34 ++++++++ test/replication/replica_rejoin.result | 11 ++- test/replication/replica_rejoin.test.lua | 7 +- test/replication/suite.cfg | 1 - 5 files changed, 169 insertions(+), 17 deletions(-) diff --git a/src/box/vy_log.c b/src/box/vy_log.c index e44db5b8..e01a802a 100644 --- a/src/box/vy_log.c +++ b/src/box/vy_log.c @@ -122,6 +122,8 @@ static const char *vy_log_type_name[] = { [VY_LOG_MODIFY_LSM] = "modify_lsm", [VY_LOG_FORGET_LSM] = "forget_lsm", [VY_LOG_PREPARE_LSM] = "prepare_lsm", + [VY_LOG_REBOOTSTRAP] = "rebootstrap", + [VY_LOG_ABORT_REBOOTSTRAP] = "abort_rebootstrap", }; /** Metadata log object. */ @@ -835,17 +837,43 @@ vy_log_next_id(void) return vy_log.next_id++; } +/** + * If a vylog file already exists, we are doing a rebootstrap: + * - Load the vylog to find out the id to start indexing new + * objects with. + * - Mark the beginning of a new rebootstrap attempt by writing + * VY_LOG_REBOOTSTRAP record. + */ +static int +vy_log_rebootstrap(void) +{ + struct vy_recovery *recovery; + recovery = vy_recovery_new(vclock_sum(&vy_log.last_checkpoint), + VY_RECOVERY_ABORT_REBOOTSTRAP); + if (recovery == NULL) + return -1; + + vy_log.next_id = recovery->max_id + 1; + vy_recovery_delete(recovery); + + struct vy_log_record record; + vy_log_record_init(&record); + record.type = VY_LOG_REBOOTSTRAP; + vy_log_tx_begin(); + vy_log_write(&record); + if (vy_log_tx_commit() != 0) + return -1; + + return 0; +} + int vy_log_bootstrap(void) { - /* - * Scan the directory to make sure there is no - * vylog files left from previous setups. - */ if (xdir_scan(&vy_log.dir) < 0 && errno != ENOENT) return -1; - if (xdir_last_vclock(&vy_log.dir, NULL) >= 0) - panic("vinyl directory is not empty"); + if (xdir_last_vclock(&vy_log.dir, &vy_log.last_checkpoint) >= 0) + return vy_log_rebootstrap(); /* Add initial vclock to the xdir. 
*/ struct vclock *vclock = malloc(sizeof(*vclock)); @@ -897,11 +925,29 @@ vy_log_begin_recovery(const struct vclock *vclock) return NULL; } + /* + * If we are recovering from a vylog that has an unfinished + * rebootstrap section, checkpoint (and hence rebootstrap) + * failed, and we need to mark rebootstrap as aborted. + */ struct vy_recovery *recovery; - recovery = vy_recovery_new(vclock_sum(&vy_log.last_checkpoint), 0); + recovery = vy_recovery_new(vclock_sum(&vy_log.last_checkpoint), + VY_RECOVERY_ABORT_REBOOTSTRAP); if (recovery == NULL) return NULL; + if (recovery->in_rebootstrap) { + struct vy_log_record record; + vy_log_record_init(&record); + record.type = VY_LOG_ABORT_REBOOTSTRAP; + vy_log_tx_begin(); + vy_log_write(&record); + if (vy_log_tx_commit() != 0) { + vy_recovery_delete(recovery); + return NULL; + } + } + vy_log.next_id = recovery->max_id + 1; vy_log.recovery = recovery; return recovery; @@ -1272,6 +1318,7 @@ vy_recovery_do_create_lsm(struct vy_recovery *recovery, int64_t id, * before the final version. */ rlist_add_tail_entry(&recovery->lsms, lsm, in_recovery); + lsm->in_rebootstrap = recovery->in_rebootstrap; if (recovery->max_id < id) recovery->max_id = id; return lsm; @@ -1852,6 +1899,42 @@ vy_recovery_delete_slice(struct vy_recovery *recovery, int64_t slice_id) } /** + * Mark all LSM trees created during rebootstrap as dropped so + * that they will be purged on the next garbage collection. + */ +static void +vy_recovery_do_abort_rebootstrap(struct vy_recovery *recovery) +{ + struct vy_lsm_recovery_info *lsm; + rlist_foreach_entry(lsm, &recovery->lsms, in_recovery) { + if (lsm->in_rebootstrap) { + lsm->in_rebootstrap = false; + lsm->create_lsn = -1; + lsm->modify_lsn = -1; + lsm->drop_lsn = 0; + } + } +} + +/** Handle a VY_LOG_REBOOTSTRAP log record. 
*/ +static void +vy_recovery_rebootstrap(struct vy_recovery *recovery) +{ + if (recovery->in_rebootstrap) + vy_recovery_do_abort_rebootstrap(recovery); + recovery->in_rebootstrap = true; +} + +/** Handle VY_LOG_ABORT_REBOOTSTRAP record. */ +static void +vy_recovery_abort_rebootstrap(struct vy_recovery *recovery) +{ + if (recovery->in_rebootstrap) + vy_recovery_do_abort_rebootstrap(recovery); + recovery->in_rebootstrap = false; +} + +/** * Update a recovery context with a new log record. * Return 0 on success, -1 on failure. * @@ -1862,7 +1945,7 @@ static int vy_recovery_process_record(struct vy_recovery *recovery, const struct vy_log_record *record) { - int rc; + int rc = 0; switch (record->type) { case VY_LOG_PREPARE_LSM: rc = vy_recovery_prepare_lsm(recovery, record->lsm_id, @@ -1926,6 +2009,12 @@ vy_recovery_process_record(struct vy_recovery *recovery, /* Not used anymore, ignore. */ rc = 0; break; + case VY_LOG_REBOOTSTRAP: + vy_recovery_rebootstrap(recovery); + break; + case VY_LOG_ABORT_REBOOTSTRAP: + vy_recovery_abort_rebootstrap(recovery); + break; default: unreachable(); } @@ -1936,6 +2025,26 @@ vy_recovery_process_record(struct vy_recovery *recovery, } /** + * Commit the last rebootstrap attempt - drop all objects created + * before rebootstrap. + */ +static void +vy_recovery_commit_rebootstrap(struct vy_recovery *recovery) +{ + assert(recovery->in_rebootstrap); + struct vy_lsm_recovery_info *lsm; + rlist_foreach_entry(lsm, &recovery->lsms, in_recovery) { + if (!lsm->in_rebootstrap && lsm->drop_lsn < 0) { + /* + * The files will be removed when the current + * checkpoint is purged by garbage collector. + */ + lsm->drop_lsn = vy_log_signature(); + } + } +} + +/** * Fill index_id_hash with LSM trees recovered from vylog. 
*/ static int @@ -2026,6 +2135,7 @@ vy_recovery_new_f(va_list ap) recovery->run_hash = NULL; recovery->slice_hash = NULL; recovery->max_id = -1; + recovery->in_rebootstrap = false; recovery->index_id_hash = mh_i64ptr_new(); recovery->lsm_hash = mh_i64ptr_new(); @@ -2079,6 +2189,13 @@ vy_recovery_new_f(va_list ap) xlog_cursor_close(&cursor, false); + if (recovery->in_rebootstrap) { + if ((flags & VY_RECOVERY_ABORT_REBOOTSTRAP) != 0) + vy_recovery_do_abort_rebootstrap(recovery); + else + vy_recovery_commit_rebootstrap(recovery); + } + if (vy_recovery_build_index_id_hash(recovery) != 0) goto fail_free; out: diff --git a/src/box/vy_log.h b/src/box/vy_log.h index cdac293e..c724d36a 100644 --- a/src/box/vy_log.h +++ b/src/box/vy_log.h @@ -195,6 +195,27 @@ enum vy_log_record_type { * a VY_LOG_CREATE_LSM record to commit it. */ VY_LOG_PREPARE_LSM = 15, + /** + * This record denotes the beginning of a rebootstrap section. + * A rebootstrap section ends either by another record of this + * type or by VY_LOG_ABORT_REBOOTSTRAP or at the end of the file. + * All objects created between a VY_LOG_REBOOTSTRAP record and + * VY_LOG_ABORT_REBOOTSTRAP or another VY_LOG_REBOOTSTRAP are + * considered to be garbage and marked as dropped on recovery. + * + * We write a record of this type if a vylog file already exists + * at bootstrap time, which means we are going to rebootstrap. + * If rebootstrap succeeds, we rotate the vylog on checkpoint and + * mark all objects written before the last VY_LOG_REBOOTSTRAP + * record as dropped in the rotated vylog. If rebootstrap fails, + * we write VY_LOG_ABORT_REBOOTSTRAP on recovery. + */ + VY_LOG_REBOOTSTRAP = 16, + /** + * This record is written on recovery if rebootstrap failed. + * See also VY_LOG_REBOOTSTRAP. + */ + VY_LOG_ABORT_REBOOTSTRAP = 17, vy_log_record_type_MAX }; @@ -273,6 +294,12 @@ struct vy_recovery { * or -1 in case no vinyl objects were recovered. 
*/ int64_t max_id; + /** + * Set if we are currently processing a rebootstrap section, + * i.e. we encountered a VY_LOG_REBOOTSTRAP record and haven't + * seen matching VY_LOG_ABORT_REBOOTSTRAP. + */ + bool in_rebootstrap; }; /** LSM tree info stored in a recovery context. */ @@ -321,6 +348,8 @@ struct vy_lsm_recovery_info { * this one after successful ALTER. */ struct vy_lsm_recovery_info *prepared; + /** Set if this LSM tree was created during rebootstrap. */ + bool in_rebootstrap; }; /** Vinyl range info stored in a recovery context. */ @@ -528,6 +557,11 @@ enum vy_recovery_flag { * of the last checkpoint. */ VY_RECOVERY_LOAD_CHECKPOINT = 1 << 0, + /** + * Consider the last attempt to rebootstrap aborted even if + * there's no VY_LOG_ABORT_REBOOTSTRAP record. + */ + VY_RECOVERY_ABORT_REBOOTSTRAP = 1 << 1, }; /** diff --git a/test/replication/replica_rejoin.result b/test/replication/replica_rejoin.result index 2148625c..e8b76056 100644 --- a/test/replication/replica_rejoin.result +++ b/test/replication/replica_rejoin.result @@ -4,9 +4,12 @@ env = require('test_run') test_run = env.new() --- ... --- Cleanup the instance to remove vylog files left from previous --- tests, since vinyl doesn't support rebootstrap yet. -test_run:cmd('restart server default with cleanup=1') +engine = test_run:get_cfg('engine') +--- +... +test_run:cleanup_cluster() +--- +... -- -- gh-461: check that a replica refetches the last checkpoint -- in case it fell behind the master. @@ -14,7 +17,7 @@ test_run:cmd('restart server default with cleanup=1') box.schema.user.grant('guest', 'replication') --- ... -_ = box.schema.space.create('test') +_ = box.schema.space.create('test', {engine = engine}) --- ... 
_ = box.space.test:create_index('pk') diff --git a/test/replication/replica_rejoin.test.lua b/test/replication/replica_rejoin.test.lua index ea38bf81..b598c4fb 100644 --- a/test/replication/replica_rejoin.test.lua +++ b/test/replication/replica_rejoin.test.lua @@ -1,16 +1,15 @@ env = require('test_run') test_run = env.new() +engine = test_run:get_cfg('engine') --- Cleanup the instance to remove vylog files left from previous --- tests, since vinyl doesn't support rebootstrap yet. -test_run:cmd('restart server default with cleanup=1') +test_run:cleanup_cluster() -- -- gh-461: check that a replica refetches the last checkpoint -- in case it fell behind the master. -- box.schema.user.grant('guest', 'replication') -_ = box.schema.space.create('test') +_ = box.schema.space.create('test', {engine = engine}) _ = box.space.test:create_index('pk') _ = box.space.test:insert{1} _ = box.space.test:insert{2} diff --git a/test/replication/suite.cfg b/test/replication/suite.cfg index 2b609f16..95e94e5a 100644 --- a/test/replication/suite.cfg +++ b/test/replication/suite.cfg @@ -6,7 +6,6 @@ "wal_off.test.lua": {}, "hot_standby.test.lua": {}, "rebootstrap.test.lua": {}, - "replica_rejoin.test.lua": {}, "*": { "memtx": {"engine": "memtx"}, "vinyl": {"engine": "vinyl"} -- 2.11.0