[PATCH 3/3] wal: create empty xlog on shutdown

Vladimir Davydov vdavydov.dev at gmail.com
Fri Jun 15 18:48:22 MSK 2018


In order to determine whether we need to rebootstrap the instance on
startup, we need to know its vclock. To find it out, we scan the last
xlog file before proceeding to local recovery. However, if rebootstrap
turns out not to be required, the last xlog ends up being scanned
twice, which is sub-optimal. To speed up this procedure, let's create
a new empty xlog before shutting down the server and reopen it after
restart.
---
 src/box/recovery.cc                   | 23 ----------------
 src/box/wal.c                         | 50 +++++++++++++++++++++++++++++++++--
 test/replication/hot_standby.result   | 12 ++++-----
 test/replication/hot_standby.test.lua |  4 +--
 test/xlog-py/dup_key.result           | 20 ++++----------
 test/xlog-py/dup_key.test.py          | 29 +++++++-------------
 test/xlog/panic_on_lsn_gap.result     |  1 +
 test/xlog/panic_on_wal_error.result   | 23 +---------------
 test/xlog/panic_on_wal_error.test.lua |  9 +------
 9 files changed, 74 insertions(+), 97 deletions(-)

diff --git a/src/box/recovery.cc b/src/box/recovery.cc
index eb77476d..1f7a11e6 100644
--- a/src/box/recovery.cc
+++ b/src/box/recovery.cc
@@ -339,29 +339,6 @@ void
 recovery_finalize(struct recovery *r)
 {
 	recovery_close_log(r);
-
-	/*
-	 * Check if next xlog exists. If it's true this xlog is
-	 * corrupted and we should rename it (to avoid getting
-	 * problem on the next xlog write with the same name).
-	 * Possible reasons are:
-	 *  - last xlog has corrupted rows
-	 *  - last xlog has corrupted header
-	 *  - last xlog has zero size
-	 */
-	char *name = xdir_format_filename(&r->wal_dir,
-					  vclock_sum(&r->vclock),
-					  NONE);
-	if (access(name, F_OK) == 0) {
-		say_info("rename corrupted xlog %s", name);
-		char to[PATH_MAX];
-		snprintf(to, sizeof(to), "%s.corrupted", name);
-		if (rename(name, to) != 0) {
-			tnt_raise(SystemError,
-				  "%s: can't rename corrupted xlog",
-				  name);
-		}
-	}
 }
 
 
diff --git a/src/box/wal.c b/src/box/wal.c
index 1c6d2422..1456d3e7 100644
--- a/src/box/wal.c
+++ b/src/box/wal.c
@@ -310,6 +310,39 @@ wal_thread_start()
 	cpipe_set_max_input(&wal_thread.wal_pipe, IOV_MAX);
 }
 
+static int
+wal_init_f(struct cbus_call_msg *msg)
+{
+	(void)msg;
+
+	struct wal_writer *writer = &wal_writer_singleton;
+
+	/*
+	 * Check if the next WAL file already exists. If it does,
+	 * it must have been created on shutdown, try to reopen it.
+	 */
+	const char *path = xdir_format_filename(&writer->wal_dir,
+				vclock_sum(&writer->vclock), NONE);
+	if (access(path, F_OK) == 0) {
+		if (xlog_open(&writer->current_wal, path) == 0)
+			return 0;
+		/*
+		 * The WAL file seems to be corrupted. Rename it
+		 * so that we can proceed.
+		 */
+		say_info("rename corrupted %s", path);
+		char new_path[PATH_MAX];
+		snprintf(new_path, sizeof(new_path), "%s.corrupted", path);
+		if (rename(path, new_path) != 0) {
+			diag_set(SystemError,
+				 "%s: can't rename corrupted xlog", path);
+			diag_log();
+			return -1;
+		}
+	}
+	return 0;
+}
+
 /**
  * Initialize WAL writer.
  *
@@ -332,6 +365,11 @@ wal_init(enum wal_mode wal_mode, const char *wal_dirname,
 	if (xdir_scan(&writer->wal_dir))
 		return -1;
 
+	struct cbus_call_msg msg;
+	if (cbus_call(&wal_thread.wal_pipe, &wal_thread.tx_pipe, &msg,
+		      wal_init_f, NULL, TIMEOUT_INFINITY) != 0)
+		return -1;
+
 	journal_set(&writer->base);
 	return 0;
 }
@@ -382,8 +420,7 @@ wal_checkpoint_f(struct cmsg *data)
 
 		xlog_close(&writer->current_wal, false);
 		/*
-		 * Avoid creating an empty xlog if this is the
-		 * last snapshot before shutdown.
+		 * The next WAL will be created on first write.
 		 */
 	}
 	vclock_copy(msg->vclock, &writer->vclock);
@@ -712,6 +749,15 @@ wal_thread_f(va_list ap)
 	if (xlog_is_open(&writer->current_wal))
 		xlog_close(&writer->current_wal, false);
 
+	/*
+	 * Create a new empty WAL on shutdown so that we don't have
+	 * to rescan the last WAL to find the instance vclock.
+	 */
+	if (writer->wal_mode != WAL_NONE &&
+	    xdir_create_xlog(&writer->wal_dir, &writer->current_wal,
+			     &writer->vclock) == 0)
+		xlog_close(&writer->current_wal, false);
+
 	if (xlog_is_open(&vy_log_writer.xlog))
 		xlog_close(&vy_log_writer.xlog, false);
 
diff --git a/test/replication/hot_standby.result b/test/replication/hot_standby.result
index 66ede5b7..24be0a94 100644
--- a/test/replication/hot_standby.result
+++ b/test/replication/hot_standby.result
@@ -284,27 +284,27 @@ _select(11, 20)
   - [19, 'the tuple 19']
   - [20, 'the tuple 20']
 ...
-test_run:cmd("deploy server default")
+test_run:cmd("stop server hot_standby")
 ---
 - true
 ...
-test_run:cmd("start server default")
+test_run:cmd("cleanup server hot_standby")
 ---
 - true
 ...
-test_run:cmd("switch default")
+test_run:cmd("deploy server default")
 ---
 - true
 ...
-test_run:cmd("stop server hot_standby")
+test_run:cmd("start server default")
 ---
 - true
 ...
-test_run:cmd("stop server replica")
+test_run:cmd("switch default")
 ---
 - true
 ...
-test_run:cmd("cleanup server hot_standby")
+test_run:cmd("stop server replica")
 ---
 - true
 ...
diff --git a/test/replication/hot_standby.test.lua b/test/replication/hot_standby.test.lua
index 8a7c837e..adb3fb6f 100644
--- a/test/replication/hot_standby.test.lua
+++ b/test/replication/hot_standby.test.lua
@@ -109,10 +109,10 @@ test_run:cmd("switch replica")
 _wait_lsn(10)
 _select(11, 20)
 
+test_run:cmd("stop server hot_standby")
+test_run:cmd("cleanup server hot_standby")
 test_run:cmd("deploy server default")
 test_run:cmd("start server default")
 test_run:cmd("switch default")
-test_run:cmd("stop server hot_standby")
 test_run:cmd("stop server replica")
-test_run:cmd("cleanup server hot_standby")
 test_run:cmd("cleanup server replica")
diff --git a/test/xlog-py/dup_key.result b/test/xlog-py/dup_key.result
index 53ae7322..f387e8e8 100644
--- a/test/xlog-py/dup_key.result
+++ b/test/xlog-py/dup_key.result
@@ -4,6 +4,10 @@ space = box.schema.space.create('test')
 index = box.space.test:create_index('primary')
 ---
 ...
+box.snapshot()
+---
+- ok
+...
 box.space.test:insert{1, 'first tuple'}
 ---
 - [1, 'first tuple']
@@ -13,20 +17,6 @@ box.space.test:insert{2, 'second tuple'}
 - [2, 'second tuple']
 ...
 .xlog exists
-space = box.schema.space.create('test')
----
-...
-index = box.space.test:create_index('primary')
----
-...
-box.space.test:insert{1, 'first tuple'}
----
-- [1, 'first tuple']
-...
-box.space.test:delete{1}
----
-- [1, 'first tuple']
-...
 box.space.test:insert{1, 'third tuple'}
 ---
 - [1, 'third tuple']
@@ -35,7 +25,7 @@ box.space.test:insert{2, 'fourth tuple'}
 ---
 - [2, 'fourth tuple']
 ...
-.xlog exists
+.xlog does not exist
 check log line for 'Duplicate key'
 
 'Duplicate key' exists in server log
diff --git a/test/xlog-py/dup_key.test.py b/test/xlog-py/dup_key.test.py
index 058d9e3f..1c033da4 100644
--- a/test/xlog-py/dup_key.test.py
+++ b/test/xlog-py/dup_key.test.py
@@ -8,6 +8,11 @@ import yaml
 
 server.stop()
 server.deploy()
+
+server.admin("space = box.schema.space.create('test')")
+server.admin("index = box.space.test:create_index('primary')")
+server.admin("box.snapshot()")
+
 lsn = int(yaml.load(server.admin("box.info.lsn", silent=True))[0])
 filename = str(lsn).zfill(20) + ".xlog"
 vardir = os.path.join(server.vardir, server.name)
@@ -15,40 +20,26 @@ wal_old = os.path.join(vardir, "old_" + filename)
 wal = os.path.join(vardir, filename)
 
 # Create wal#1
-server.admin("space = box.schema.space.create('test')")
-server.admin("index = box.space.test:create_index('primary')")
 server.admin("box.space.test:insert{1, 'first tuple'}")
 server.admin("box.space.test:insert{2, 'second tuple'}")
 server.stop()
 
-# Save wal #1
+# Save wal#1
 if os.access(wal, os.F_OK):
     print ".xlog exists"
     os.rename(wal, wal_old)
 
-lsn += 4
-
-# Create another wal#1
-server.start()
-server.admin("space = box.schema.space.create('test')")
-server.admin("index = box.space.test:create_index('primary')")
-server.admin("box.space.test:insert{1, 'first tuple'}")
-server.admin("box.space.test:delete{1}")
-server.stop()
-
-# Create wal#2
+# Write wal#2
 server.start()
 server.admin("box.space.test:insert{1, 'third tuple'}")
 server.admin("box.space.test:insert{2, 'fourth tuple'}")
 server.stop()
 
-if os.access(wal, os.F_OK):
-    print ".xlog exists"
-    # Replace wal#1 with saved copy
-    os.unlink(wal)
+# Restore wal#1
+if not os.access(wal, os.F_OK):
+    print ".xlog does not exist"
     os.rename(wal_old, wal)
 
-
 server.start()
 line = 'Duplicate key'
 print "check log line for '%s'" % line
diff --git a/test/xlog/panic_on_lsn_gap.result b/test/xlog/panic_on_lsn_gap.result
index 731eec4e..d0978e40 100644
--- a/test/xlog/panic_on_lsn_gap.result
+++ b/test/xlog/panic_on_lsn_gap.result
@@ -188,6 +188,7 @@ require('fio').glob(name .. "/*.xlog")
 ---
 - - panic/00000000000000000000.xlog
   - panic/00000000000000000001.xlog
+  - panic/00000000000000000002.xlog
 ...
 -- now insert 10 rows - so that the next
 -- row will need to switch the WAL
diff --git a/test/xlog/panic_on_wal_error.result b/test/xlog/panic_on_wal_error.result
index 267b5340..345534ba 100644
--- a/test/xlog/panic_on_wal_error.result
+++ b/test/xlog/panic_on_wal_error.result
@@ -5,28 +5,7 @@ env = require('test_run')
 test_run = env.new()
 ---
 ...
-fio = require('fio')
----
-...
-glob = fio.pathjoin(box.cfg.wal_dir, '*.xlog')
----
-...
-for _, file in pairs(fio.glob(glob)) do fio.unlink(file) end
----
-...
-glob = fio.pathjoin(box.cfg.vinyl_dir, '*.vylog')
----
-...
-for _, file in pairs(fio.glob(glob)) do fio.unlink(file) end
----
-...
-glob = fio.pathjoin(box.cfg.memtx_dir, '*.snap')
----
-...
-for _, file in pairs(fio.glob(glob)) do fio.unlink(file) end
----
-...
-test_run:cmd("restart server default")
+test_run:cmd("restart server default with cleanup=True")
 box.schema.user.grant('guest', 'replication')
 ---
 ...
diff --git a/test/xlog/panic_on_wal_error.test.lua b/test/xlog/panic_on_wal_error.test.lua
index 4f598e33..29410cb2 100644
--- a/test/xlog/panic_on_wal_error.test.lua
+++ b/test/xlog/panic_on_wal_error.test.lua
@@ -2,14 +2,7 @@
 env = require('test_run')
 test_run = env.new()
 
-fio = require('fio')
-glob = fio.pathjoin(box.cfg.wal_dir, '*.xlog')
-for _, file in pairs(fio.glob(glob)) do fio.unlink(file) end
-glob = fio.pathjoin(box.cfg.vinyl_dir, '*.vylog')
-for _, file in pairs(fio.glob(glob)) do fio.unlink(file) end
-glob = fio.pathjoin(box.cfg.memtx_dir, '*.snap')
-for _, file in pairs(fio.glob(glob)) do fio.unlink(file) end
-test_run:cmd("restart server default")
+test_run:cmd("restart server default with cleanup=True")
 box.schema.user.grant('guest', 'replication')
 _ = box.schema.space.create('test')
 _ = box.space.test:create_index('pk')
-- 
2.11.0




More information about the Tarantool-patches mailing list