From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH v2 3/6] recovery: make LSN gap check more thorough Date: Fri, 29 Jun 2018 19:48:30 +0300 Message-Id: In-Reply-To: References: In-Reply-To: References: To: kostja@tarantool.org Cc: tarantool-patches@freelists.org List-ID: Currently, the lsn gap check is rather sloppy: when we open an xlog file for recovery, we check that its vclock equals the vclock of the last replayed row (see recover_remaining_wals), so if there were WAL write errors at the end of an xlog file, we will report a false-positive gap error (because wal doesn't rollback lsn counter). Let's use PrevVclock xlog meta key introduced earlier to improve the check. --- src/box/recovery.cc | 62 +++++++++++++++++++++++++------------ test/xlog/panic_on_lsn_gap.result | 34 ++++++++++++++++++++ test/xlog/panic_on_lsn_gap.test.lua | 13 ++++++++ 3 files changed, 90 insertions(+), 19 deletions(-) diff --git a/src/box/recovery.cc b/src/box/recovery.cc index 71f6bd8c..8ac89cc2 100644 --- a/src/box/recovery.cc +++ b/src/box/recovery.cc @@ -152,6 +152,48 @@ recovery_close_log(struct recovery *r) trigger_run_xc(&r->on_close_log, NULL); } +static void +recovery_open_log(struct recovery *r, const struct vclock *vclock) +{ + XlogGapError *e; + struct xlog_meta meta = r->cursor.meta; + enum xlog_cursor_state state = r->cursor.state; + + recovery_close_log(r); + + xdir_open_cursor_xc(&r->wal_dir, vclock_sum(vclock), &r->cursor); + + if (state == XLOG_CURSOR_UNINITIALIZED && + vclock_compare(vclock, &r->vclock) > 0) { + /* + * This is the first WAL we are about to scan + * and the best clock we could find is greater + * or is incomparable with the initial recovery + * position. + */ + goto gap_error; + } + + if (state != XLOG_CURSOR_UNINITIALIZED && + r->cursor.meta.has_prev_vclock && + vclock_compare(&r->cursor.meta.prev_vclock, &meta.vclock) != 0) { + /* + * WALs are missing between the last scanned WAL + * and the next one. + */ + goto gap_error; + } + return; + +gap_error: + e = tnt_error(XlogGapError, &r->vclock, vclock); + if (!r->wal_dir.force_recovery) + throw e; + /* Ignore missing WALs if force_recovery is set. */ + e->log(); + say_warn("ignoring a gap in LSN"); +} + void recovery_delete(struct recovery *r) { @@ -277,25 +319,7 @@ recover_remaining_wals(struct recovery *r, struct xstream *stream, continue; } - if (vclock_compare(clock, &r->vclock) > 0) { - /** - * The best clock we could find is - * greater or is incomparable with the - * current state of recovery. - */ - XlogGapError *e = - tnt_error(XlogGapError, &r->vclock, clock); - - if (!r->wal_dir.force_recovery) - throw e; - e->log(); - /* Ignore missing WALs */ - say_warn("ignoring a gap in LSN"); - } - - recovery_close_log(r); - - xdir_open_cursor_xc(&r->wal_dir, vclock_sum(clock), &r->cursor); + recovery_open_log(r, clock); say_info("recover from `%s'", r->cursor.name); diff --git a/test/xlog/panic_on_lsn_gap.result b/test/xlog/panic_on_lsn_gap.result index 313850a6..c93fcdd6 100644 --- a/test/xlog/panic_on_lsn_gap.result +++ b/test/xlog/panic_on_lsn_gap.result @@ -280,6 +280,40 @@ box.space._schema:select{'key'} --- - - ['key', 'test 4'] ... +-- +-- Check that if there's an LSN gap between two WALs +-- that appeared due to a disk error and no files is +-- actually missing, we won't panic on recovery. +-- +box.space._schema:replace{'key', 'test 4'} -- creates new WAL +--- +- ['key', 'test 4'] +... +box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true) +--- +- ok +... +box.space._schema:replace{'key', 'test 5'} -- fails, makes gap +--- +- error: Failed to write to disk +... +box.snapshot() -- fails, rotates WAL +--- +- error: Error injection 'xlog write injection' +... +box.error.injection.set("ERRINJ_WAL_WRITE_DISK", false) +--- +- ok +... +box.space._schema:replace{'key', 'test 5'} -- creates new WAL +--- +- ['key', 'test 5'] +... +test_run:cmd("restart server panic") +box.space._schema:select{'key'} +--- +- - ['key', 'test 5'] +... test_run:cmd('switch default') --- - true diff --git a/test/xlog/panic_on_lsn_gap.test.lua b/test/xlog/panic_on_lsn_gap.test.lua index 248a3e63..b1ede320 100644 --- a/test/xlog/panic_on_lsn_gap.test.lua +++ b/test/xlog/panic_on_lsn_gap.test.lua @@ -108,6 +108,19 @@ require('fio').glob(name .. "/*.xlog") -- restart is ok test_run:cmd("restart server panic") box.space._schema:select{'key'} +-- +-- Check that if there's an LSN gap between two WALs +-- that appeared due to a disk error and no files is +-- actually missing, we won't panic on recovery. +-- +box.space._schema:replace{'key', 'test 4'} -- creates new WAL +box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true) +box.space._schema:replace{'key', 'test 5'} -- fails, makes gap +box.snapshot() -- fails, rotates WAL +box.error.injection.set("ERRINJ_WAL_WRITE_DISK", false) +box.space._schema:replace{'key', 'test 5'} -- creates new WAL +test_run:cmd("restart server panic") +box.space._schema:select{'key'} test_run:cmd('switch default') test_run:cmd("stop server panic") test_run:cmd("cleanup server panic") -- 2.11.0