Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: kostja@tarantool.org
Cc: tarantool-patches@freelists.org
Subject: [PATCH v2 3/6] recovery: make LSN gap check more thorough
Date: Fri, 29 Jun 2018 19:48:30 +0300	[thread overview]
Message-ID: <cf339d9cc086534e5f1d2359876abc2e23da4cbf.1530287767.git.vdavydov.dev@gmail.com> (raw)
In-Reply-To: <cover.1530287767.git.vdavydov.dev@gmail.com>

Currently, the LSN gap check is rather sloppy: when we open an xlog file
for recovery, we check that its vclock equals the vclock of the last
replayed row (see recover_remaining_wals). So if there were WAL write
errors at the end of an xlog file, we will report a false-positive gap
error, because the WAL doesn't roll back the LSN counter. Let's use the
PrevVclock xlog meta key introduced earlier to improve the check.
---
 src/box/recovery.cc                 | 62 +++++++++++++++++++++++++------------
 test/xlog/panic_on_lsn_gap.result   | 34 ++++++++++++++++++++
 test/xlog/panic_on_lsn_gap.test.lua | 13 ++++++++
 3 files changed, 90 insertions(+), 19 deletions(-)

diff --git a/src/box/recovery.cc b/src/box/recovery.cc
index 71f6bd8c..8ac89cc2 100644
--- a/src/box/recovery.cc
+++ b/src/box/recovery.cc
@@ -152,6 +152,48 @@ recovery_close_log(struct recovery *r)
 	trigger_run_xc(&r->on_close_log, NULL);
 }
 
+static void
+recovery_open_log(struct recovery *r, const struct vclock *vclock)
+{
+	XlogGapError *e;
+	struct xlog_meta meta = r->cursor.meta;
+	enum xlog_cursor_state state = r->cursor.state;
+
+	recovery_close_log(r);
+
+	xdir_open_cursor_xc(&r->wal_dir, vclock_sum(vclock), &r->cursor);
+
+	if (state == XLOG_CURSOR_UNINITIALIZED &&
+	    vclock_compare(vclock, &r->vclock) > 0) {
+		/*
+		 * This is the first WAL we are about to scan
+		 * and the best clock we could find is greater
+		 * or is incomparable with the initial recovery
+		 * position.
+		 */
+		goto gap_error;
+	}
+
+	if (state != XLOG_CURSOR_UNINITIALIZED &&
+	    r->cursor.meta.has_prev_vclock &&
+	    vclock_compare(&r->cursor.meta.prev_vclock, &meta.vclock) != 0) {
+		/*
+		 * WALs are missing between the last scanned WAL
+		 * and the next one.
+		 */
+		goto gap_error;
+	}
+	return;
+
+gap_error:
+	e = tnt_error(XlogGapError, &r->vclock, vclock);
+	if (!r->wal_dir.force_recovery)
+		throw e;
+	/* Ignore missing WALs if force_recovery is set. */
+	e->log();
+	say_warn("ignoring a gap in LSN");
+}
+
 void
 recovery_delete(struct recovery *r)
 {
@@ -277,25 +319,7 @@ recover_remaining_wals(struct recovery *r, struct xstream *stream,
 			continue;
 		}
 
-		if (vclock_compare(clock, &r->vclock) > 0) {
-			/**
-			 * The best clock we could find is
-			 * greater or is incomparable with the
-			 * current state of recovery.
-			 */
-			XlogGapError *e =
-				tnt_error(XlogGapError, &r->vclock, clock);
-
-			if (!r->wal_dir.force_recovery)
-				throw e;
-			e->log();
-			/* Ignore missing WALs */
-			say_warn("ignoring a gap in LSN");
-		}
-
-		recovery_close_log(r);
-
-		xdir_open_cursor_xc(&r->wal_dir, vclock_sum(clock), &r->cursor);
+		recovery_open_log(r, clock);
 
 		say_info("recover from `%s'", r->cursor.name);
 
diff --git a/test/xlog/panic_on_lsn_gap.result b/test/xlog/panic_on_lsn_gap.result
index 313850a6..c93fcdd6 100644
--- a/test/xlog/panic_on_lsn_gap.result
+++ b/test/xlog/panic_on_lsn_gap.result
@@ -280,6 +280,40 @@ box.space._schema:select{'key'}
 ---
 - - ['key', 'test 4']
 ...
+--
+-- Check that if there's an LSN gap between two WALs
+-- that appeared due to a disk error and no files is
+-- actually missing, we won't panic on recovery.
+--
+box.space._schema:replace{'key', 'test 4'} -- creates new WAL
+---
+- ['key', 'test 4']
+...
+box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
+---
+- ok
+...
+box.space._schema:replace{'key', 'test 5'} -- fails, makes gap
+---
+- error: Failed to write to disk
+...
+box.snapshot() -- fails, rotates WAL
+---
+- error: Error injection 'xlog write injection'
+...
+box.error.injection.set("ERRINJ_WAL_WRITE_DISK", false)
+---
+- ok
+...
+box.space._schema:replace{'key', 'test 5'} -- creates new WAL
+---
+- ['key', 'test 5']
+...
+test_run:cmd("restart server panic")
+box.space._schema:select{'key'}
+---
+- - ['key', 'test 5']
+...
 test_run:cmd('switch default')
 ---
 - true
diff --git a/test/xlog/panic_on_lsn_gap.test.lua b/test/xlog/panic_on_lsn_gap.test.lua
index 248a3e63..b1ede320 100644
--- a/test/xlog/panic_on_lsn_gap.test.lua
+++ b/test/xlog/panic_on_lsn_gap.test.lua
@@ -108,6 +108,19 @@ require('fio').glob(name .. "/*.xlog")
 -- restart is ok
 test_run:cmd("restart server panic")
 box.space._schema:select{'key'}
+--
+-- Check that if there's an LSN gap between two WALs
+-- that appeared due to a disk error and no files is
+-- actually missing, we won't panic on recovery.
+--
+box.space._schema:replace{'key', 'test 4'} -- creates new WAL
+box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
+box.space._schema:replace{'key', 'test 5'} -- fails, makes gap
+box.snapshot() -- fails, rotates WAL
+box.error.injection.set("ERRINJ_WAL_WRITE_DISK", false)
+box.space._schema:replace{'key', 'test 5'} -- creates new WAL
+test_run:cmd("restart server panic")
+box.space._schema:select{'key'}
 test_run:cmd('switch default')
 test_run:cmd("stop server panic")
 test_run:cmd("cleanup server panic")
-- 
2.11.0

  parent reply	other threads:[~2018-06-29 16:48 UTC|newest]

Thread overview: 12+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-06-29 16:48 [PATCH v2 0/6] Create empty xlog on shutdown Vladimir Davydov
2018-06-29 16:48 ` [PATCH v2 1/6] xlog: store prev vclock in xlog header Vladimir Davydov
2018-07-05  6:49   ` Konstantin Osipov
2018-07-05  6:52   ` Konstantin Osipov
2018-07-05  8:23     ` Vladimir Davydov
2018-07-05 11:22       ` Konstantin Osipov
2018-07-10 16:28         ` [PATCH] xlog: get rid of xlog_meta::has_prev_vclock Vladimir Davydov
2018-06-29 16:48 ` [PATCH v2 2/6] xlog: differentiate between closed and never opened cursor Vladimir Davydov
2018-06-29 16:48 ` Vladimir Davydov [this message]
2018-06-29 16:48 ` [PATCH v2 4/6] recovery: promote recovery clock even if the WAL is empty Vladimir Davydov
2018-06-29 16:48 ` [PATCH v2 5/6] wal: create empty xlog on shutdown Vladimir Davydov
2018-06-29 16:48 ` [PATCH v2 6/6] error: move XlogGapError to box/error.h Vladimir Davydov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=cf339d9cc086534e5f1d2359876abc2e23da4cbf.1530287767.git.vdavydov.dev@gmail.com \
    --to=vdavydov.dev@gmail.com \
    --cc=kostja@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [PATCH v2 3/6] recovery: make LSN gap check more thorough' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox