[PATCH v2 4/6] recovery: promote recovery clock even if the WAL is empty

Vladimir Davydov vdavydov.dev at gmail.com
Fri Jun 29 19:48:31 MSK 2018


Currently, if the last WAL in the directory happens to be corrupted or
empty so that we don't recover anything from it, the recovery clock will
be that of the last record of the previous WAL. If the previous WAL
happens to have a gap at the end, the next WAL will be created between
the last WAL (the empty one) and the next-to-last one (with a gap at the
end), breaking the file order in the WAL directory. Therefore, we must
promote the recovery clock even if we don't recover anything from a WAL.
---
 src/box/recovery.cc                 | 12 ++++++++++
 test/xlog/panic_on_lsn_gap.result   | 47 +++++++++++++++++++++++++++++++++++++
 test/xlog/panic_on_lsn_gap.test.lua | 10 ++++++++
 3 files changed, 69 insertions(+)

diff --git a/src/box/recovery.cc b/src/box/recovery.cc
index 8ac89cc2..70eb7d74 100644
--- a/src/box/recovery.cc
+++ b/src/box/recovery.cc
@@ -183,6 +183,17 @@ recovery_open_log(struct recovery *r, const struct vclock *vclock)
 		 */
 		goto gap_error;
 	}
+out:
+	/*
+	 * We must promote recovery clock even if we don't recover
+	 * anything from the next WAL. Otherwise if the last WAL
+	 * in the directory is corrupted or empty and the previous
+	 * one has an LSN gap at the end (due to a write error),
+	 * we will create the next WAL between two existing ones,
+	 * thus breaking the file order.
+	 */
+	if (vclock_compare(&r->vclock, vclock) < 0)
+		vclock_copy(&r->vclock, vclock);
 	return;
 
 gap_error:
@@ -192,6 +203,7 @@ gap_error:
 	/* Ignore missing WALs if force_recovery is set. */
 	e->log();
 	say_warn("ignoring a gap in LSN");
+	goto out;
 }
 
 void
diff --git a/test/xlog/panic_on_lsn_gap.result b/test/xlog/panic_on_lsn_gap.result
index c93fcdd6..d5064ce6 100644
--- a/test/xlog/panic_on_lsn_gap.result
+++ b/test/xlog/panic_on_lsn_gap.result
@@ -309,11 +309,58 @@ box.space._schema:replace{'key', 'test 5'} -- creates new WAL
 ---
 - ['key', 'test 5']
 ...
+box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
+---
+- ok
+...
+box.space._schema:replace{'key', 'test 6'} -- fails, makes gap
+---
+- error: Failed to write to disk
+...
+box.snapshot() -- fails, rotates WAL
+---
+- error: Error injection 'xlog write injection'
+...
+box.space._schema:replace{'key', 'test 6'} -- fails, creates empty WAL
+---
+- error: Failed to write to disk
+...
+name = string.match(arg[0], "([^,]+)%.lua")
+---
+...
+require('fio').glob(name .. "/*.xlog")
+---
+- - panic/00000000000000000000.xlog
+  - panic/00000000000000000001.xlog
+  - panic/00000000000000000012.xlog
+  - panic/00000000000000000022.xlog
+  - panic/00000000000000000025.xlog
+  - panic/00000000000000000027.xlog
+  - panic/00000000000000000029.xlog
+...
 test_run:cmd("restart server panic")
 box.space._schema:select{'key'}
 ---
 - - ['key', 'test 5']
 ...
+-- Check that we don't create a WAL in the gap between the last two.
+box.space._schema:replace{'key', 'test 6'}
+---
+- ['key', 'test 6']
+...
+name = string.match(arg[0], "([^,]+)%.lua")
+---
+...
+require('fio').glob(name .. "/*.xlog")
+---
+- - panic/00000000000000000000.xlog
+  - panic/00000000000000000001.xlog
+  - panic/00000000000000000012.xlog
+  - panic/00000000000000000022.xlog
+  - panic/00000000000000000025.xlog
+  - panic/00000000000000000027.xlog
+  - panic/00000000000000000029.xlog
+...
 test_run:cmd('switch default')
 ---
 - true
diff --git a/test/xlog/panic_on_lsn_gap.test.lua b/test/xlog/panic_on_lsn_gap.test.lua
index b1ede320..d72552d0 100644
--- a/test/xlog/panic_on_lsn_gap.test.lua
+++ b/test/xlog/panic_on_lsn_gap.test.lua
@@ -119,8 +119,18 @@ box.space._schema:replace{'key', 'test 5'} -- fails, makes gap
 box.snapshot() -- fails, rotates WAL
 box.error.injection.set("ERRINJ_WAL_WRITE_DISK", false)
 box.space._schema:replace{'key', 'test 5'} -- creates new WAL
+box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
+box.space._schema:replace{'key', 'test 6'} -- fails, makes gap
+box.snapshot() -- fails, rotates WAL
+box.space._schema:replace{'key', 'test 6'} -- fails, creates empty WAL
+name = string.match(arg[0], "([^,]+)%.lua")
+require('fio').glob(name .. "/*.xlog")
 test_run:cmd("restart server panic")
 box.space._schema:select{'key'}
+-- Check that we don't create a WAL in the gap between the last two.
+box.space._schema:replace{'key', 'test 6'}
+name = string.match(arg[0], "([^,]+)%.lua")
+require('fio').glob(name .. "/*.xlog")
 test_run:cmd('switch default')
 test_run:cmd("stop server panic")
 test_run:cmd("cleanup server panic")
-- 
2.11.0




More information about the Tarantool-patches mailing list