From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <tarantool-patches-bounce@freelists.org>
Received: from localhost (localhost [127.0.0.1])
	by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id D5466272A9
	for <tarantool-patches@freelists.org>; Thu,  7 Feb 2019 12:25:31 -0500 (EST)
Received: from turing.freelists.org ([127.0.0.1])
	by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024)
	with ESMTP id bZmht8lCDRQz for <tarantool-patches@freelists.org>;
	Thu,  7 Feb 2019 12:25:31 -0500 (EST)
Received: from smtp16.mail.ru (smtp16.mail.ru [94.100.176.153])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 6D67327278
	for <tarantool-patches@freelists.org>; Thu,  7 Feb 2019 12:25:31 -0500 (EST)
From: Georgy Kirichenko <georgy@tarantool.org>
Subject: [tarantool-patches] [PATCH v2 1/2] Do not promote wal vclock for failed writes
Date: Thu,  7 Feb 2019 20:27:30 +0300
Message-Id: <79a9eeb9870e15a826d7e64f186b8010ad836a64.1549556742.git.georgy@tarantool.org>
In-Reply-To: <cover.1549556742.git.georgy@tarantool.org>
References: <cover.1549556742.git.georgy@tarantool.org>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
Sender: tarantool-patches-bounce@freelists.org
Errors-to: tarantool-patches-bounce@freelists.org
Reply-To: tarantool-patches@freelists.org
List-help: <mailto:ecartis@freelists.org?Subject=help>
List-unsubscribe: <tarantool-patches-request@freelists.org?Subject=unsubscribe>
List-software: Ecartis version 1.0.0
List-Id: tarantool-patches <tarantool-patches.freelists.org>
List-subscribe: <tarantool-patches-request@freelists.org?Subject=subscribe>
List-owner: <mailto:>
List-post: <mailto:tarantool-patches@freelists.org>
List-archive: <http://www.freelists.org/archives/tarantool-patches>
To: tarantool-patches@freelists.org
Cc: Georgy Kirichenko <georgy@tarantool.org>

Wal used to promote vclock prior to write the row. This lead to a
situation when master's row would be skipped forever in case there is
an error trying to write it. However, some errors are transient, and we
might be able to successfully apply the same row later. So we do not
promote writer vclock in order to be able to restart replication from
failing point.

Obsoletes xlog/panic_on_lsn_gap.test.

Needed for #2283
---
 src/box/wal.c                       |  24 +-
 test/box/errinj.result              |  23 ++
 test/box/errinj.test.lua            |  20 ++
 test/xlog/errinj.result             |   1 -
 test/xlog/panic_on_lsn_gap.result   | 377 ----------------------------
 test/xlog/panic_on_lsn_gap.test.lua | 136 ----------
 6 files changed, 61 insertions(+), 520 deletions(-)
 delete mode 100644 test/xlog/panic_on_lsn_gap.result
 delete mode 100644 test/xlog/panic_on_lsn_gap.test.lua

diff --git a/src/box/wal.c b/src/box/wal.c
index 3b50d3629..966f3bfb9 100644
--- a/src/box/wal.c
+++ b/src/box/wal.c
@@ -901,16 +901,16 @@ wal_writer_begin_rollback(struct wal_writer *writer)
 }
 
 static void
-wal_assign_lsn(struct wal_writer *writer, struct xrow_header **row,
+wal_assign_lsn(struct vclock *vclock, struct xrow_header **row,
 	       struct xrow_header **end)
 {
 	/** Assign LSN to all local rows. */
 	for ( ; row < end; row++) {
 		if ((*row)->replica_id == 0) {
-			(*row)->lsn = vclock_inc(&writer->vclock, instance_id);
+			(*row)->lsn = vclock_inc(vclock, instance_id);
 			(*row)->replica_id = instance_id;
 		} else {
-			vclock_follow_xrow(&writer->vclock, *row);
+			vclock_follow_xrow(vclock, *row);
 		}
 	}
 }
@@ -922,6 +922,16 @@ wal_write_to_disk(struct cmsg *msg)
 	struct wal_msg *wal_msg = (struct wal_msg *) msg;
 	struct error *error;
 
+	/*
+	 * In order to not to promote writer's vclock in case of an error we
+	 * create a copy to assign lsn's before rows were actually written.
+	 * After successful xlog flash we update writer vclock to the
+	 * last written vclock value.
+	 */
+	struct vclock vclock;
+	vclock_create(&vclock);
+	vclock_copy(&vclock, &writer->vclock);
+
 	struct errinj *inj = errinj(ERRINJ_WAL_DELAY, ERRINJ_BOOL);
 	while (inj != NULL && inj->bparam)
 		usleep(10);
@@ -974,14 +984,15 @@ wal_write_to_disk(struct cmsg *msg)
 	struct journal_entry *entry;
 	struct stailq_entry *last_committed = NULL;
 	stailq_foreach_entry(entry, &wal_msg->commit, fifo) {
-		wal_assign_lsn(writer, entry->rows, entry->rows + entry->n_rows);
-		entry->res = vclock_sum(&writer->vclock);
+		wal_assign_lsn(&vclock, entry->rows, entry->rows + entry->n_rows);
+		entry->res = vclock_sum(&vclock);
 		rc = xlog_write_entry(l, entry);
 		if (rc < 0)
 			goto done;
 		if (rc > 0) {
 			writer->checkpoint_wal_size += rc;
 			last_committed = &entry->fifo;
+			vclock_copy(&writer->vclock, &vclock);
 		}
 		/* rc == 0: the write is buffered in xlog_tx */
 	}
@@ -991,6 +1002,7 @@ wal_write_to_disk(struct cmsg *msg)
 
 	writer->checkpoint_wal_size += rc;
 	last_committed = stailq_last(&wal_msg->commit);
+	vclock_copy(&writer->vclock, &vclock);
 
 	/*
 	 * Notify TX if the checkpoint threshold has been exceeded.
@@ -1185,7 +1197,7 @@ wal_write_in_wal_mode_none(struct journal *journal,
 			   struct journal_entry *entry)
 {
 	struct wal_writer *writer = (struct wal_writer *) journal;
-	wal_assign_lsn(writer, entry->rows, entry->rows + entry->n_rows);
+	wal_assign_lsn(&writer->vclock, entry->rows, entry->rows + entry->n_rows);
 	int64_t old_lsn = vclock_get(&replicaset.vclock, instance_id);
 	int64_t new_lsn = vclock_get(&writer->vclock, instance_id);
 	if (new_lsn > old_lsn) {
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 12303670e..1d9a16d8d 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -141,6 +141,15 @@ errinj.set("ERRINJ_TESTING", false)
 ---
 - ok
 ...
+env = require('test_run')
+---
+...
+test_run = env.new()
+---
+...
+lsn1 = box.info.vclock[box.info.id]
+---
+...
 -- Check how well we handle a failed log write
 errinj.set("ERRINJ_WAL_IO", true)
 ---
@@ -161,6 +170,11 @@ space:insert{1}
 ---
 - [1]
 ...
+-- Check vclock was promoted only one time
+box.info.vclock[box.info.id] == lsn1 + 1
+---
+- true
+...
 errinj.set("ERRINJ_WAL_IO", true)
 ---
 - ok
@@ -180,6 +194,15 @@ errinj.set("ERRINJ_WAL_IO", false)
 ---
 - ok
 ...
+space:update(1, {{'=', 2, 2}})
+---
+- [1, 2]
+...
+-- Check vclock was promoted only two times
+box.info.vclock[box.info.id] == lsn1 + 2
+---
+- true
+...
 space:truncate()
 ---
 ...
diff --git a/test/box/errinj.test.lua b/test/box/errinj.test.lua
index 6f491d623..39d73b57e 100644
--- a/test/box/errinj.test.lua
+++ b/test/box/errinj.test.lua
@@ -18,11 +18,31 @@ space:insert{1}
 space:get{1}
 errinj.set("ERRINJ_WAL_IO", false)
 space:insert{1}
+-- Check vclock was promoted only one time
 errinj.set("ERRINJ_WAL_IO", true)
 space:update(1, {{'=', 2, 2}})
 space:get{1}
 space:get{2}
 errinj.set("ERRINJ_WAL_IO", false)
+space:update(1, {{'=', 2, 2}})
+-- Check vclock was promoted only two times
+space:truncate()
+
+lsn1 = box.info.vclock[box.info.id]
+-- Check how well we handle a failed log write
+errinj.set("ERRINJ_WAL_WRITE_PARTIAL", 0)
+space:insert{1}
+errinj.set("ERRINJ_WAL_WRITE_PARTIAL", -1)
+space:insert{1}
+-- Check vclock was promoted only one time
+box.info.vclock[box.info.id] == lsn1 + 1
+errinj.set("ERRINJ_WAL_WRITE_PARTIAL", 0)
+space:update(1, {{'=', 2, 2}})
+space:get{1}
+errinj.set("ERRINJ_WAL_WRITE_PARTIAL", -1)
+space:update(1, {{'=', 2, 2}})
+-- Check vclock was promoted only two times
+box.info.vclock[box.info.id] == lsn1 + 2
 space:truncate()
 
 -- Check a failed log rotation
diff --git a/test/xlog/errinj.result b/test/xlog/errinj.result
index 390404b47..7f15bef35 100644
--- a/test/xlog/errinj.result
+++ b/test/xlog/errinj.result
@@ -43,7 +43,6 @@ require('fio').glob(name .. "/*.xlog")
 ---
 - - xlog/00000000000000000000.xlog
   - xlog/00000000000000000001.xlog
-  - xlog/00000000000000000002.xlog
 ...
 test_run:cmd('restart server default with cleanup=1')
 -- gh-881 iproto request with wal IO error
diff --git a/test/xlog/panic_on_lsn_gap.result b/test/xlog/panic_on_lsn_gap.result
deleted file mode 100644
index 4dd1291f8..000000000
--- a/test/xlog/panic_on_lsn_gap.result
+++ /dev/null
@@ -1,377 +0,0 @@
---
--- we actually need to know what xlogs the server creates,
--- so start from a clean state
---
---
--- Check how the server is able to find the next
--- xlog if there are failed writes (lsn gaps).
---
-env = require('test_run')
----
-...
-test_run = env.new()
----
-...
-test_run:cmd("create server panic with script='xlog/panic.lua'")
----
-- true
-...
-test_run:cmd("start server panic")
----
-- true
-...
-test_run:cmd("switch panic")
----
-- true
-...
-box.info.vclock
----
-- {}
-...
-s = box.space._schema
----
-...
--- we need to have at least one record in the
--- xlog otherwise the server believes that there
--- is an lsn gap during recovery.
---
-s:replace{"key", 'test 1'}
----
-- ['key', 'test 1']
-...
-box.info.vclock
----
-- {1: 1}
-...
-box.error.injection.set("ERRINJ_WAL_WRITE", true)
----
-- ok
-...
-t = {}
----
-...
---
--- Try to insert rows, so that it's time to
--- switch WALs. No switch will happen though,
--- since no writes were made.
---
-test_run:cmd("setopt delimiter ';'")
----
-- true
-...
-for i=1,box.cfg.rows_per_wal do
-    status, msg = pcall(s.replace, s, {"key"})
-    table.insert(t, msg)
-end;
----
-...
-test_run:cmd("setopt delimiter ''");
----
-- true
-...
-t
----
-- - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-...
---
--- Before restart: our LSN is 1, because
--- LSN is promoted in tx only on successful
--- WAL write.
---
-name = string.match(arg[0], "([^,]+)%.lua")
----
-...
-box.info.vclock
----
-- {1: 1}
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-...
-test_run:cmd("restart server panic")
---
--- After restart: our LSN is the LSN of the
--- last empty WAL created on shutdown, i.e. 11.
---
-box.info.vclock
----
-- {1: 11}
-...
-box.space._schema:select{'key'}
----
-- - ['key', 'test 1']
-...
-box.error.injection.set("ERRINJ_WAL_WRITE", true)
----
-- ok
-...
-t = {}
----
-...
-s = box.space._schema
----
-...
---
--- now do the same
---
-test_run:cmd("setopt delimiter ';'")
----
-- true
-...
-for i=1,box.cfg.rows_per_wal do
-    status, msg = pcall(s.replace, s, {"key"})
-    table.insert(t, msg)
-end;
----
-...
-test_run:cmd("setopt delimiter ''");
----
-- true
-...
-t
----
-- - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-  - Failed to write to disk
-...
-box.info.vclock
----
-- {1: 11}
-...
-box.error.injection.set("ERRINJ_WAL_WRITE", false)
----
-- ok
-...
---
--- Write a good row after a series of failed
--- rows. There is a gap in LSN, correct,
--- but it's *inside* a single WAL, so doesn't
--- affect WAL search in recover_remaining_wals()
---
-s:replace{'key', 'test 2'}
----
-- ['key', 'test 2']
-...
---
--- notice that vclock before and after
--- server stop is the same -- because it's
--- recorded in the last row
---
-box.info.vclock
----
-- {1: 22}
-...
-test_run:cmd("restart server panic")
-box.info.vclock
----
-- {1: 22}
-...
-box.space._schema:select{'key'}
----
-- - ['key', 'test 2']
-...
--- list all the logs
-name = string.match(arg[0], "([^,]+)%.lua")
----
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-...
--- now insert 10 rows - so that the next
--- row will need to switch the WAL
-test_run:cmd("setopt delimiter ';'")
----
-- true
-...
-for i=1,box.cfg.rows_per_wal do
-    box.space._schema:replace{"key", 'test 3'}
-end;
----
-...
-test_run:cmd("setopt delimiter ''");
----
-- true
-...
--- the next insert should switch xlog, but aha - it fails
--- a new xlog file is created but has 0 rows
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-...
-box.error.injection.set("ERRINJ_WAL_WRITE", true)
----
-- ok
-...
-box.space._schema:replace{"key", 'test 3'}
----
-- error: Failed to write to disk
-...
-box.info.vclock
----
-- {1: 32}
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-  - panic/00000000000000000032.xlog
-...
--- and the next one (just to be sure
-box.space._schema:replace{"key", 'test 3'}
----
-- error: Failed to write to disk
-...
-box.info.vclock
----
-- {1: 32}
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-  - panic/00000000000000000032.xlog
-...
-box.error.injection.set("ERRINJ_WAL_WRITE", false)
----
-- ok
-...
--- then a success
-box.space._schema:replace{"key", 'test 4'}
----
-- ['key', 'test 4']
-...
-box.info.vclock
----
-- {1: 35}
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-  - panic/00000000000000000032.xlog
-...
--- restart is ok
-test_run:cmd("restart server panic")
-box.space._schema:select{'key'}
----
-- - ['key', 'test 4']
-...
---
--- Check that if there's an LSN gap between two WALs
--- that appeared due to a disk error and no files is
--- actually missing, we won't panic on recovery.
---
-box.space._schema:replace{'key', 'test 4'} -- creates new WAL
----
-- ['key', 'test 4']
-...
-box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
----
-- ok
-...
-box.space._schema:replace{'key', 'test 5'} -- fails, makes gap
----
-- error: Failed to write to disk
-...
-box.snapshot() -- fails, rotates WAL
----
-- error: Error injection 'xlog write injection'
-...
-box.error.injection.set("ERRINJ_WAL_WRITE_DISK", false)
----
-- ok
-...
-box.space._schema:replace{'key', 'test 5'} -- creates new WAL
----
-- ['key', 'test 5']
-...
-box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
----
-- ok
-...
-box.space._schema:replace{'key', 'test 6'} -- fails, makes gap
----
-- error: Failed to write to disk
-...
-box.snapshot() -- fails, rotates WAL
----
-- error: Error injection 'xlog write injection'
-...
-box.space._schema:replace{'key', 'test 6'} -- fails, creates empty WAL
----
-- error: Failed to write to disk
-...
-name = string.match(arg[0], "([^,]+)%.lua")
----
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-  - panic/00000000000000000032.xlog
-  - panic/00000000000000000035.xlog
-  - panic/00000000000000000037.xlog
-  - panic/00000000000000000039.xlog
-...
-test_run:cmd("restart server panic")
-box.space._schema:select{'key'}
----
-- - ['key', 'test 5']
-...
--- Check that we don't create a WAL in the gap between the last two.
-box.space._schema:replace{'key', 'test 6'}
----
-- ['key', 'test 6']
-...
-name = string.match(arg[0], "([^,]+)%.lua")
----
-...
-require('fio').glob(name .. "/*.xlog")
----
-- - panic/00000000000000000000.xlog
-  - panic/00000000000000000011.xlog
-  - panic/00000000000000000022.xlog
-  - panic/00000000000000000032.xlog
-  - panic/00000000000000000035.xlog
-  - panic/00000000000000000037.xlog
-  - panic/00000000000000000039.xlog
-  - panic/00000000000000000040.xlog
-...
-test_run:cmd('switch default')
----
-- true
-...
-test_run:cmd("stop server panic")
----
-- true
-...
-test_run:cmd("cleanup server panic")
----
-- true
-...
diff --git a/test/xlog/panic_on_lsn_gap.test.lua b/test/xlog/panic_on_lsn_gap.test.lua
deleted file mode 100644
index 6221261a7..000000000
--- a/test/xlog/panic_on_lsn_gap.test.lua
+++ /dev/null
@@ -1,136 +0,0 @@
---
--- we actually need to know what xlogs the server creates,
--- so start from a clean state
---
---
--- Check how the server is able to find the next
--- xlog if there are failed writes (lsn gaps).
---
-env = require('test_run')
-test_run = env.new()
-test_run:cmd("create server panic with script='xlog/panic.lua'")
-test_run:cmd("start server panic")
-test_run:cmd("switch panic")
-box.info.vclock
-s = box.space._schema
--- we need to have at least one record in the
--- xlog otherwise the server believes that there
--- is an lsn gap during recovery.
---
-s:replace{"key", 'test 1'}
-box.info.vclock
-box.error.injection.set("ERRINJ_WAL_WRITE", true)
-t = {}
---
--- Try to insert rows, so that it's time to
--- switch WALs. No switch will happen though,
--- since no writes were made.
---
-test_run:cmd("setopt delimiter ';'")
-for i=1,box.cfg.rows_per_wal do
-    status, msg = pcall(s.replace, s, {"key"})
-    table.insert(t, msg)
-end;
-test_run:cmd("setopt delimiter ''");
-t
---
--- Before restart: our LSN is 1, because
--- LSN is promoted in tx only on successful
--- WAL write.
---
-name = string.match(arg[0], "([^,]+)%.lua")
-box.info.vclock
-require('fio').glob(name .. "/*.xlog")
-test_run:cmd("restart server panic")
---
--- After restart: our LSN is the LSN of the
--- last empty WAL created on shutdown, i.e. 11.
---
-box.info.vclock
-box.space._schema:select{'key'}
-box.error.injection.set("ERRINJ_WAL_WRITE", true)
-t = {}
-s = box.space._schema
---
--- now do the same
---
-test_run:cmd("setopt delimiter ';'")
-for i=1,box.cfg.rows_per_wal do
-    status, msg = pcall(s.replace, s, {"key"})
-    table.insert(t, msg)
-end;
-test_run:cmd("setopt delimiter ''");
-t
-box.info.vclock
-box.error.injection.set("ERRINJ_WAL_WRITE", false)
---
--- Write a good row after a series of failed
--- rows. There is a gap in LSN, correct,
--- but it's *inside* a single WAL, so doesn't
--- affect WAL search in recover_remaining_wals()
---
-s:replace{'key', 'test 2'}
---
--- notice that vclock before and after
--- server stop is the same -- because it's
--- recorded in the last row
---
-box.info.vclock
-test_run:cmd("restart server panic")
-box.info.vclock
-box.space._schema:select{'key'}
--- list all the logs
-name = string.match(arg[0], "([^,]+)%.lua")
-require('fio').glob(name .. "/*.xlog")
--- now insert 10 rows - so that the next
--- row will need to switch the WAL
-test_run:cmd("setopt delimiter ';'")
-for i=1,box.cfg.rows_per_wal do
-    box.space._schema:replace{"key", 'test 3'}
-end;
-test_run:cmd("setopt delimiter ''");
--- the next insert should switch xlog, but aha - it fails
--- a new xlog file is created but has 0 rows
-require('fio').glob(name .. "/*.xlog")
-box.error.injection.set("ERRINJ_WAL_WRITE", true)
-box.space._schema:replace{"key", 'test 3'}
-box.info.vclock
-require('fio').glob(name .. "/*.xlog")
--- and the next one (just to be sure
-box.space._schema:replace{"key", 'test 3'}
-box.info.vclock
-require('fio').glob(name .. "/*.xlog")
-box.error.injection.set("ERRINJ_WAL_WRITE", false)
--- then a success
-box.space._schema:replace{"key", 'test 4'}
-box.info.vclock
-require('fio').glob(name .. "/*.xlog")
--- restart is ok
-test_run:cmd("restart server panic")
-box.space._schema:select{'key'}
---
--- Check that if there's an LSN gap between two WALs
--- that appeared due to a disk error and no files is
--- actually missing, we won't panic on recovery.
---
-box.space._schema:replace{'key', 'test 4'} -- creates new WAL
-box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
-box.space._schema:replace{'key', 'test 5'} -- fails, makes gap
-box.snapshot() -- fails, rotates WAL
-box.error.injection.set("ERRINJ_WAL_WRITE_DISK", false)
-box.space._schema:replace{'key', 'test 5'} -- creates new WAL
-box.error.injection.set("ERRINJ_WAL_WRITE_DISK", true)
-box.space._schema:replace{'key', 'test 6'} -- fails, makes gap
-box.snapshot() -- fails, rotates WAL
-box.space._schema:replace{'key', 'test 6'} -- fails, creates empty WAL
-name = string.match(arg[0], "([^,]+)%.lua")
-require('fio').glob(name .. "/*.xlog")
-test_run:cmd("restart server panic")
-box.space._schema:select{'key'}
--- Check that we don't create a WAL in the gap between the last two.
-box.space._schema:replace{'key', 'test 6'}
-name = string.match(arg[0], "([^,]+)%.lua")
-require('fio').glob(name .. "/*.xlog")
-test_run:cmd('switch default')
-test_run:cmd("stop server panic")
-test_run:cmd("cleanup server panic")
-- 
2.20.1