From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: kostja@tarantool.org
Cc: tarantool-patches@freelists.org
Subject: [PATCH v2 3/6] recovery: make LSN gap check more thorough
Date: Fri, 29 Jun 2018 19:48:30 +0300 [thread overview]
Message-ID: <cf339d9cc086534e5f1d2359876abc2e23da4cbf.1530287767.git.vdavydov.dev@gmail.com> (raw)
In-Reply-To: <cover.1530287767.git.vdavydov.dev@gmail.com>
In-Reply-To: <cover.1530287767.git.vdavydov.dev@gmail.com>
Currently, the LSN gap check is rather sloppy: when we open an xlog file
for recovery, we check that its vclock equals the vclock of the last
replayed row (see recover_remaining_wals), so if there were WAL write
errors at the end of an xlog file, we will report a false-positive gap
error (because WAL doesn't roll back the LSN counter). Let's use the
PrevVclock xlog meta key introduced earlier to improve the check.
---
src/box/recovery.cc | 62 +++++++++++++++++++++++++------------
test/xlog/panic_on_lsn_gap.result | 34 ++++++++++++++++++++
test/xlog/panic_on_lsn_gap.test.lua | 13 ++++++++
3 files changed, 90 insertions(+), 19 deletions(-)
diff --git a/src/box/recovery.cc b/src/box/recovery.cc
index 71f6bd8c..8ac89cc2 100644
--- a/src/box/recovery.cc
+++ b/src/box/recovery.cc
@@ -152,6 +152,48 @@ recovery_close_log(struct recovery *r)
trigger_run_xc(&r->on_close_log, NULL);
}
+static void
+recovery_open_log(struct recovery *r, const struct vclock *vclock)
+{
+ XlogGapError *e;
+ struct xlog_meta meta = r->cursor.meta;
+ enum xlog_cursor_state state = r->cursor.state;
+
+ recovery_close_log(r);
+
+ xdir_open_cursor_xc(&r->wal_dir, vclock_sum(vclock), &r->cursor);
+
+ if (state == XLOG_CURSOR_UNINITIALIZED &&
+ vclock_compare(vclock, &r->vclock) > 0) {
+ /*
+ * This is the first WAL we are about to scan
+ * and the best clock we could find is greater
+ * than or incomparable with the initial recovery
+ * position.
+ */
+ goto gap_error;
+ }
+
+ if (state != XLOG_CURSOR_UNINITIALIZED &&
+ r->cursor.meta.has_prev_vclock &&
+ vclock_compare(&r->cursor.meta.prev_vclock, &meta.vclock) != 0) {
+ /*
+ * WALs are missing between the last scanned WAL
+ * and the next one.
+ */
+ goto gap_error;
+ }
+ return;
+
+gap_error:
+ e = tnt_error(XlogGapError, &r->vclock, vclock);
+ if (!r->wal_dir.force_recovery)
+ throw e;
+ /* Ignore missing WALs if force_recovery is set. */
+ e->log();
+ say_warn("ignoring a gap in LSN");
+}
+
void
recovery_delete(struct recovery *r)
{
@@ -277,25 +319,7 @@ recover_remaining_wals(struct recovery *r, struct xstream *stream,
continue;
}
- if (vclock_compare(clock, &r->vclock) > 0) {
- /**
- * The best clock we could find is
- * greater or is incomparable with the
- * current state of recovery.
- */
- XlogGapError *e =
- tnt_error(XlogGapError, &r->vclock, clock);
-
- if (!r->wal_dir.force_recovery)
- throw e;
- e->log();
- /* Ignore missing WALs */
- say_warn("ignoring a gap in LSN");
- }
-
- recovery_close_log(r);
-
- xdir_open_cursor_xc(&r->wal_dir, vclock_sum(clock), &r->cursor);
+ recovery_open_log(r, clock);
say_info("recover from `%s'", r->cursor.name);
diff --git a/test/xlog/panic_on_lsn_gap.result b/test/xlog/panic_on_lsn_gap.result
index 313850a6..c93fcdd6 100644
--- a/test/xlog/panic_on_lsn_gap.result
+++ b/test/xlog/panic_on_lsn_gap.result
@@ -280,6 +280,40 @@ box.space._schema:select{'key'}
---
- - ['key', 'test 4']
...
+--
+-- Check that if there's an LSN gap between two WALs
+-- that appeared due to a disk error and no files is
+-- actually missing, we won't panic on recovery.
+--
+box.space._schema:replace{'key', 'test 4'} -- creates new WAL
+---
+- ['key', 'test 4']
+...
+box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
+---
+- ok
+...
+box.space._schema:replace{'key', 'test 5'} -- fails, makes gap
+---
+- error: Failed to write to disk
+...
+box.snapshot() -- fails, rotates WAL
+---
+- error: Error injection 'xlog write injection'
+...
+box.error.injection.set("ERRINJ_WAL_WRITE_DISK", false)
+---
+- ok
+...
+box.space._schema:replace{'key', 'test 5'} -- creates new WAL
+---
+- ['key', 'test 5']
+...
+test_run:cmd("restart server panic")
+box.space._schema:select{'key'}
+---
+- - ['key', 'test 5']
+...
test_run:cmd('switch default')
---
- true
diff --git a/test/xlog/panic_on_lsn_gap.test.lua b/test/xlog/panic_on_lsn_gap.test.lua
index 248a3e63..b1ede320 100644
--- a/test/xlog/panic_on_lsn_gap.test.lua
+++ b/test/xlog/panic_on_lsn_gap.test.lua
@@ -108,6 +108,19 @@ require('fio').glob(name .. "/*.xlog")
-- restart is ok
test_run:cmd("restart server panic")
box.space._schema:select{'key'}
+--
+-- Check that if there's an LSN gap between two WALs
+-- that appeared due to a disk error and no files is
+-- actually missing, we won't panic on recovery.
+--
+box.space._schema:replace{'key', 'test 4'} -- creates new WAL
+box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
+box.space._schema:replace{'key', 'test 5'} -- fails, makes gap
+box.snapshot() -- fails, rotates WAL
+box.error.injection.set("ERRINJ_WAL_WRITE_DISK", false)
+box.space._schema:replace{'key', 'test 5'} -- creates new WAL
+test_run:cmd("restart server panic")
+box.space._schema:select{'key'}
test_run:cmd('switch default')
test_run:cmd("stop server panic")
test_run:cmd("cleanup server panic")
--
2.11.0
next prev parent reply other threads:[~2018-06-29 16:48 UTC|newest]
Thread overview: 12+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-06-29 16:48 [PATCH v2 0/6] Create empty xlog on shutdown Vladimir Davydov
2018-06-29 16:48 ` [PATCH v2 1/6] xlog: store prev vclock in xlog header Vladimir Davydov
2018-07-05 6:49 ` Konstantin Osipov
2018-07-05 6:52 ` Konstantin Osipov
2018-07-05 8:23 ` Vladimir Davydov
2018-07-05 11:22 ` Konstantin Osipov
2018-07-10 16:28 ` [PATCH] xlog: get rid of xlog_meta::has_prev_vclock Vladimir Davydov
2018-06-29 16:48 ` [PATCH v2 2/6] xlog: differentiate between closed and never opened cursor Vladimir Davydov
2018-06-29 16:48 ` Vladimir Davydov [this message]
2018-06-29 16:48 ` [PATCH v2 4/6] recovery: promote recovery clock even if the WAL is empty Vladimir Davydov
2018-06-29 16:48 ` [PATCH v2 5/6] wal: create empty xlog on shutdown Vladimir Davydov
2018-06-29 16:48 ` [PATCH v2 6/6] error: move XlogGapError to box/error.h Vladimir Davydov
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=cf339d9cc086534e5f1d2359876abc2e23da4cbf.1530287767.git.vdavydov.dev@gmail.com \
--to=vdavydov.dev@gmail.com \
--cc=kostja@tarantool.org \
--cc=tarantool-patches@freelists.org \
--subject='Re: [PATCH v2 3/6] recovery: make LSN gap check more thorough' \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox