From: Vladimir Davydov
Subject: [PATCH 3/6] box: do not rotate WAL when replica subscribes
Date: Sun, 25 Nov 2018 16:48:10 +0300
Message-Id: <51805fb0af1b8192c8226348bb46683b14ead224.1543152574.git.vdavydov.dev@gmail.com>
To: kostja@tarantool.org
Cc: tarantool-patches@freelists.org

Because this is pointless and confusing. This "feature" was silently
introduced by commit f2bccc18485d ("Use WAL vclock instead of TX vclock
in most places"). Let's revert this change. This will allow us to
clearly separate WAL checkpointing from WAL flushing, which will in
turn facilitate implementation of the checkpoint-on-WAL-threshold
feature.

There are two problems here, however. First, not rotating the log
breaks the expectations of the replication/gc test: an xlog file
doesn't get deleted in time as a consequence. This happens because we
don't delete xlogs relayed to a replica after the join stage is
complete - we only do that during the subscribe stage - and if we don't
rotate the WAL on subscribe, the garbage collector won't be invoked.
This is actually a bug - we should advance the WAL consumer associated
with a replica once the join stage is complete. This patch fixes it,
but that reveals another problem - this time in the WAL garbage
collection procedure.

It turns out that, when passed a vclock, the WAL garbage collection
procedure removes all WAL files created before that vclock. This isn't
quite correct - if a consumer is in the middle of a WAL file, we must
not delete that file, but we do. This works only as long as consumers
never track vclocks inside WAL files - currently they are advanced only
when a WAL file is closed, so they naturally point to the beginning of
the next WAL file. However, if we advance the consumer associated with
a replica when the join stage ends (this is what the previous paragraph
is about), we may end up advancing it to the middle of a WAL file. If
that happens, the WAL garbage collector may remove a file that is still
in use by a replica. Fix this as well.
---
 src/box/box.cc                                      | 19 ++++++++++---------
 src/box/wal.c                                       | 21 ++++++++++++++++++++-
 src/box/wal.h                                       |  4 +++-
 test/replication/gc.result                          |  8 +-------
 test/replication/gc.test.lua                        |  6 +-----
 test/replication/gc_no_space.result                 |  4 ++--
 test/replication/gc_no_space.test.lua               |  4 ++--
 test/replication/show_error_on_disconnect.result    | 12 +++++++-----
 test/replication/show_error_on_disconnect.test.lua  | 10 ++++++++--
 9 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index 21b84991..8a1a2668 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1511,21 +1511,22 @@ box_process_join(struct ev_io *io, struct xrow_header *header)
 	 */
 	box_on_join(&instance_uuid);
 
-	replica = replica_by_uuid(&instance_uuid);
-	assert(replica != NULL);
+	/* Remember master's vclock after the last request */
+	struct vclock stop_vclock;
+	wal_checkpoint(&stop_vclock, false);
 
-	/* Register the replica as a WAL consumer. */
+	/*
+	 * Register the replica as a WAL consumer so that
+	 * it can resume SUBSCRIBE where FINAL JOIN ends.
+ */ + replica = replica_by_uuid(&instance_uuid); if (replica->gc != NULL) gc_consumer_unregister(replica->gc); - replica->gc = gc_consumer_register(&start_vclock, "replica %s", + replica->gc = gc_consumer_register(&stop_vclock, "replica %s", tt_uuid_str(&instance_uuid)); if (replica->gc == NULL) diag_raise(); - /* Remember master's vclock after the last request */ - struct vclock stop_vclock; - wal_checkpoint(&stop_vclock, false); - /* Send end of initial stage data marker */ xrow_encode_vclock_xc(&row, &stop_vclock); row.sync = header->sync; @@ -1608,7 +1609,7 @@ box_process_subscribe(struct ev_io *io, struct xrow_header *header) */ struct xrow_header row; struct vclock current_vclock; - wal_checkpoint(¤t_vclock, true); + wal_checkpoint(¤t_vclock, false); xrow_encode_vclock_xc(&row, ¤t_vclock); /* * Identify the message with the replica id of this diff --git a/src/box/wal.c b/src/box/wal.c index cbf2c121..11aae5fc 100644 --- a/src/box/wal.c +++ b/src/box/wal.c @@ -553,8 +553,27 @@ wal_collect_garbage_f(struct cbus_call_msg *data) { struct wal_writer *writer = &wal_writer_singleton; struct wal_gc_msg *msg = (struct wal_gc_msg *)data; + const struct vclock *vclock = msg->wal_vclock; + + if (!xlog_is_open(&writer->current_wal) && + vclock_sum(vclock) >= vclock_sum(&writer->vclock)) { + /* + * The last available WAL file has been sealed and + * all registered consumers have done reading it. + * We can delete it now. + */ + } else { + /* + * Find the most recent WAL file that contains rows + * required by registered consumers and delete all + * older WAL files. + */ + vclock = vclockset_psearch(&writer->wal_dir.index, vclock); + } + if (vclock != NULL) + xdir_collect_garbage(&writer->wal_dir, vclock_sum(vclock), 0); + vclock_copy(&writer->checkpoint_vclock, msg->checkpoint_vclock); - xdir_collect_garbage(&writer->wal_dir, vclock_sum(msg->wal_vclock), 0); return 0; } diff --git a/src/box/wal.h b/src/box/wal.h index 808af76e..e4094b1e 100644 --- a/src/box/wal.h +++ b/src/box/wal.h @@ -175,7 +175,9 @@ int wal_checkpoint(struct vclock *vclock, bool rotate); /** - * Remove all WAL files whose signature is less than @wal_vclock. + * Remove WAL files that are not needed by consumers reading + * rows at @wal_vclock or newer. + * * Update the oldest checkpoint signature with @checkpoint_vclock. * WAL thread will delete WAL files that are not needed to * recover from the oldest checkpoint if it runs out of disk diff --git a/test/replication/gc.result b/test/replication/gc.result index f6266004..72fdcf77 100644 --- a/test/replication/gc.result +++ b/test/replication/gc.result @@ -308,13 +308,7 @@ box.snapshot() --- - true ... -xlog_count = #fio.glob('./master/*.xlog') ---- -... --- the replica may have managed to download all data --- from xlog #1 before it was stopped, in which case --- it's OK to collect xlog #1 -xlog_count == 3 or xlog_count == 2 or fio.listdir('./master') +#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master') --- - true ... 
diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua index 30108249..2707d5e2 100644 --- a/test/replication/gc.test.lua +++ b/test/replication/gc.test.lua @@ -140,11 +140,7 @@ box.snapshot() _ = s:auto_increment{} box.snapshot() #box.info.gc().checkpoints == 1 or box.info.gc() -xlog_count = #fio.glob('./master/*.xlog') --- the replica may have managed to download all data --- from xlog #1 before it was stopped, in which case --- it's OK to collect xlog #1 -xlog_count == 3 or xlog_count == 2 or fio.listdir('./master') +#fio.glob('./master/*.xlog') == 2 or fio.listdir('./master') -- The xlog should only be deleted after the replica -- is unregistered. diff --git a/test/replication/gc_no_space.result b/test/replication/gc_no_space.result index 8e663cdf..ceea8ab3 100644 --- a/test/replication/gc_no_space.result +++ b/test/replication/gc_no_space.result @@ -152,7 +152,7 @@ s:auto_increment{} --- - [5] ... -check_wal_count(7) +check_wal_count(5) --- - true ... @@ -168,7 +168,7 @@ check_snap_count(2) -- Inject a ENOSPC error and check that the WAL thread deletes -- old WAL files to prevent the user from seeing the error. -- -errinj.set('ERRINJ_WAL_FALLOCATE', 3) +errinj.set('ERRINJ_WAL_FALLOCATE', 2) --- - ok ... diff --git a/test/replication/gc_no_space.test.lua b/test/replication/gc_no_space.test.lua index 4bab2b0e..be2e3229 100644 --- a/test/replication/gc_no_space.test.lua +++ b/test/replication/gc_no_space.test.lua @@ -68,7 +68,7 @@ s:auto_increment{} box.snapshot() s:auto_increment{} -check_wal_count(7) +check_wal_count(5) check_snap_count(2) #box.info.gc().consumers -- 3 @@ -76,7 +76,7 @@ check_snap_count(2) -- Inject a ENOSPC error and check that the WAL thread deletes -- old WAL files to prevent the user from seeing the error. -- -errinj.set('ERRINJ_WAL_FALLOCATE', 3) +errinj.set('ERRINJ_WAL_FALLOCATE', 2) s:auto_increment{} -- success errinj.info()['ERRINJ_WAL_FALLOCATE'].state -- 0 diff --git a/test/replication/show_error_on_disconnect.result b/test/replication/show_error_on_disconnect.result index c5a91c00..af082203 100644 --- a/test/replication/show_error_on_disconnect.result +++ b/test/replication/show_error_on_disconnect.result @@ -46,17 +46,18 @@ box.snapshot() --- - ok ... -test_run:cmd("switch default") +-- Manually remove all xlogs on master_quorum2 to break replication to master_quorum1. +fio = require('fio') --- -- true ... -fio = require('fio') +for _, path in ipairs(fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog'))) do fio.unlink(path) end --- ... -fio.unlink(fio.pathjoin(fio.abspath("."), string.format('master_quorum2/%020d.xlog', 5))) +box.space.test:insert{3} --- -- true +- [3] ... +-- Check error reporting. test_run:cmd("switch master_quorum1") --- - true @@ -90,6 +91,7 @@ box.space.test:select() --- - - [1] - [2] + - [3] ... other_id = box.info.id % 2 + 1 --- diff --git a/test/replication/show_error_on_disconnect.test.lua b/test/replication/show_error_on_disconnect.test.lua index 64a75025..40e9dbc5 100644 --- a/test/replication/show_error_on_disconnect.test.lua +++ b/test/replication/show_error_on_disconnect.test.lua @@ -5,6 +5,7 @@ -- test_run = require('test_run').new() SERVERS = {'master_quorum1', 'master_quorum2'} + -- Deploy a cluster. test_run:create_cluster(SERVERS) test_run:wait_fullmesh(SERVERS) @@ -16,9 +17,14 @@ box.space.test:insert{1} box.snapshot() box.space.test:insert{2} box.snapshot() -test_run:cmd("switch default") + +-- Manually remove all xlogs on master_quorum2 to break replication to master_quorum1. 
fio = require('fio') -fio.unlink(fio.pathjoin(fio.abspath("."), string.format('master_quorum2/%020d.xlog', 5))) +for _, path in ipairs(fio.glob(fio.pathjoin(box.cfg.wal_dir, '*.xlog'))) do fio.unlink(path) end + +box.space.test:insert{3} + +-- Check error reporting. test_run:cmd("switch master_quorum1") box.cfg{replication = repl} require('fiber').sleep(0.1) -- 2.11.0
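
The garbage collection fix above boils down to one rule: keep the newest
xlog whose start vclock does not exceed the consumer's vclock, and only
collect the files older than it. Below is a minimal, illustrative sketch
of that rule - it is not part of the patch, and it uses plain integer
signatures and made-up names (first_xlog_to_keep, xlog_start) instead of
struct vclock and vclockset_psearch() - just to show why a consumer that
stopped in the middle of a file keeps that file alive.

/*
 * Illustrative sketch only: given the start signatures of the WAL
 * files on disk, in ascending order, and the signature a consumer has
 * acknowledged, find the first file that must be kept.  Everything
 * before it may be collected.
 */
#include <stdio.h>
#include <stdint.h>

/* Index of the newest xlog whose start signature is <= consumer_signature. */
static int
first_xlog_to_keep(const int64_t *xlog_start, int count,
		   int64_t consumer_signature)
{
	int keep = 0;
	for (int i = 0; i < count; i++) {
		if (xlog_start[i] <= consumer_signature)
			keep = i;	/* consumer may still need rows from this file */
		else
			break;
	}
	return keep;
}

int
main(void)
{
	/* Three xlogs starting at signatures 0, 100 and 200. */
	const int64_t xlogs[] = {0, 100, 200};
	/*
	 * A consumer stopped in the middle of the second file (at
	 * signature 150): only the first file may be removed, the
	 * second one must stay.  Prints "keep from xlog #1".
	 */
	printf("keep from xlog #%d\n", first_xlog_to_keep(xlogs, 3, 150));
	return 0;
}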