[PATCH 6/6] wal: remove files needed for recovery from backup checkpoints on ENOSPC

Sun Nov 25 16:48:13 MSK 2018

Tarantool always keeps the box.cfg.checkpoint_count latest checkpoints.
For the sake of redundancy, it also never deletes WAL files needed for
recovery from any of them, even if it gets ENOSPC while trying to write
to WAL. This patch changes that behavior: on an emergency ENOSPC, the
WAL thread is now allowed to delete WAL files that are needed only for
recovery from backup checkpoints, i.e. any checkpoint but the most
recent one. After all, losing redundancy is better than stopping
operation.

Closes #3822
---
 src/box/box.cc                        | 17 +++++++------
 src/box/gc.c                          | 17 ++++++++-----
 src/box/gc.h                          | 14 ----------
 src/box/wal.c                         | 48 +++++++++++++++++++++++------------
 src/box/wal.h                         | 22 ++++++++--------
 test/replication/gc_no_space.result   | 47 +++++++++++++++++++++++++++++-----
 test/replication/gc_no_space.test.lua | 23 ++++++++++++-----
 7 files changed, 121 insertions(+), 67 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index 5ea2f014..72788f82 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -2104,6 +2104,8 @@ box_cfg_xc(void)
 		/* Bootstrap a new master */
 		bootstrap(&instance_uuid, &replicaset_uuid,
 			  &is_bootstrap_leader);
+		checkpoint = gc_last_checkpoint();
+		assert(checkpoint != NULL);
 	}
 	fiber_gc();
 
@@ -2117,16 +2119,13 @@ box_cfg_xc(void)
 		}
 	}
 
-	struct gc_checkpoint *first_checkpoint = gc_first_checkpoint();
-	assert(first_checkpoint != NULL);
-
 	/* Start WAL writer */
 	int64_t wal_max_rows = box_check_wal_max_rows(cfg_geti64("rows_per_wal"));
 	int64_t wal_max_size = box_check_wal_max_size(cfg_geti64("wal_max_size"));
 	enum wal_mode wal_mode = box_check_wal_mode(cfg_gets("wal_mode"));
 	if (wal_init(wal_mode, cfg_gets("wal_dir"), wal_max_rows,
 		     wal_max_size, &INSTANCE_UUID, &replicaset.vclock,
-		     &first_checkpoint->vclock) != 0) {
+		     &checkpoint->vclock) != 0) {
 		diag_raise();
 	}
 	gc_set_wal_watcher();
@@ -2190,15 +2189,17 @@ box_checkpoint()
 		goto end;
 
 	struct vclock vclock;
-	if ((rc = wal_checkpoint(&vclock)))
+	if ((rc = wal_begin_checkpoint(&vclock)))
+		goto end;
+
+	if ((rc = engine_commit_checkpoint(&vclock)))
 		goto end;
 
-	rc = engine_commit_checkpoint(&vclock);
+	wal_commit_checkpoint(&vclock);
+	gc_add_checkpoint(&vclock);
 end:
 	if (rc)
 		engine_abort_checkpoint();
-	else
-		gc_add_checkpoint(&vclock);
 
 	latch_unlock(&schema_lock);
 	box_checkpoint_is_in_progress = false;
diff --git a/src/box/gc.c b/src/box/gc.c
index 55c36d15..05773b91 100644
--- a/src/box/gc.c
+++ b/src/box/gc.c
@@ -218,13 +218,8 @@ gc_run(void)
 	int rc = 0;
 	if (run_engine_gc)
 		rc = engine_collect_garbage(&checkpoint->vclock);
-	/*
-	 * Run wal_collect_garbage() even if we don't need to
-	 * delete any WAL files, because we have to apprise
-	 * the WAL thread of the oldest checkpoint signature.
-	 */
-	if (rc == 0)
-		wal_collect_garbage(vclock, &checkpoint->vclock);
+	if (run_wal_gc && rc == 0)
+		wal_collect_garbage(vclock);
 	latch_unlock(&gc.latch);
 }
 
@@ -236,6 +231,14 @@ gc_process_wal_event(struct wal_watcher_msg *msg)
 {
 	assert((msg->events & WAL_EVENT_GC) != 0);
 
+	/*
+	 * In case of emergency ENOSPC, the WAL thread may delete
+	 * WAL files needed to restore from backup checkpoints,
+	 * which would be kept by the garbage collector otherwise.
+	 * Bring the garbage collector vclock up to date.
+	 */
+	vclock_copy(&gc.vclock, &msg->gc_vclock);
+
 	struct gc_consumer *consumer = gc_tree_first(&gc.consumers);
 	while (consumer != NULL &&
 	       vclock_sum(&consumer->vclock) < vclock_sum(&msg->gc_vclock)) {
diff --git a/src/box/gc.h b/src/box/gc.h
index e1241baa..eab19ba3 100644
--- a/src/box/gc.h
+++ b/src/box/gc.h
@@ -156,20 +156,6 @@ extern struct gc_state gc;
 	rlist_foreach_entry(ref, &(checkpoint)->refs, in_refs)
 
 /**
- * Return the first (oldest) checkpoint known to the garbage
- * collector. If there's no checkpoint, return NULL.
- */
-static inline struct gc_checkpoint *
-gc_first_checkpoint(void)
-{
-	if (rlist_empty(&gc.checkpoints))
-		return NULL;
-
-	return rlist_first_entry(&gc.checkpoints, struct gc_checkpoint,
-				 in_checkpoints);
-}
-
-/**
  * Return the last (newest) checkpoint known to the garbage
  * collector. If there's no checkpoint, return NULL.
  */
diff --git a/src/box/wal.c b/src/box/wal.c
index 3e6c1e7f..f0b19b7c 100644
--- a/src/box/wal.c
+++ b/src/box/wal.c
@@ -120,7 +120,7 @@ struct wal_writer
 	 */
 	struct vclock vclock;
 	/**
-	 * VClock of the oldest checkpoint available on the instance.
+	 * VClock of the most recent successfully created checkpoint.
 	 * The WAL writer must not delete WAL files that are needed to
 	 * recover from it even if it is running out of disk space.
 	 */
@@ -419,14 +419,14 @@ wal_open(struct wal_writer *writer)
 int
 wal_init(enum wal_mode wal_mode, const char *wal_dirname, int64_t wal_max_rows,
 	 int64_t wal_max_size, const struct tt_uuid *instance_uuid,
-	 const struct vclock *vclock, const struct vclock *first_checkpoint_vclock)
+	 const struct vclock *vclock, const struct vclock *checkpoint_vclock)
 {
 	assert(wal_max_rows > 1);
 
 	struct wal_writer *writer = &wal_writer_singleton;
 	wal_writer_create(writer, wal_mode, wal_dirname, wal_max_rows,
 			  wal_max_size, instance_uuid, vclock,
-			  first_checkpoint_vclock);
+			  checkpoint_vclock);
 
 	/*
 	 * Scan the WAL directory to build an index of all
@@ -473,7 +473,7 @@ struct wal_checkpoint_msg {
 };
 
 static int
-wal_checkpoint_f(struct cbus_call_msg *data)
+wal_begin_checkpoint_f(struct cbus_call_msg *data)
 {
 	struct wal_checkpoint_msg *msg = (struct wal_checkpoint_msg *)data;
 	struct wal_writer *writer = &wal_writer_singleton;
@@ -481,7 +481,7 @@ wal_checkpoint_f(struct cbus_call_msg *data)
 		/*
 		 * We're rolling back a failed write and so
 		 * can't make a checkpoint - see the comment
-		 * in wal_checkpoint() for the explanation.
+		 * in wal_begin_checkpoint() for the explanation.
 		 */
 		diag_set(ClientError, ER_CHECKPOINT_ROLLBACK);
 		return -1;
@@ -503,7 +503,7 @@ wal_checkpoint_f(struct cbus_call_msg *data)
 }
 
 int
-wal_checkpoint(struct vclock *vclock)
+wal_begin_checkpoint(struct vclock *vclock)
 {
 	struct wal_writer *writer = &wal_writer_singleton;
 	if (writer->wal_mode == WAL_NONE) {
@@ -524,7 +524,8 @@ wal_checkpoint(struct vclock *vclock)
 	struct wal_checkpoint_msg msg;
 	bool cancellable = fiber_set_cancellable(false);
 	int rc = cbus_call(&wal_thread.wal_pipe, &wal_thread.tx_prio_pipe,
-			   &msg.base, wal_checkpoint_f, NULL, TIMEOUT_INFINITY);
+			   &msg.base, wal_begin_checkpoint_f, NULL,
+			   TIMEOUT_INFINITY);
 	fiber_set_cancellable(cancellable);
 	if (rc != 0)
 		return -1;
@@ -532,19 +533,50 @@ wal_checkpoint(struct vclock *vclock)
 	return 0;
 }
 
+/**
+ * WAL thread part of wal_commit_checkpoint(): remember the vclock
+ * carried by the message as the last checkpoint vclock.
+ */
+static int
+wal_commit_checkpoint_f(struct cbus_call_msg *data)
+{
+	struct wal_checkpoint_msg *msg = (struct wal_checkpoint_msg *)data;
+	struct wal_writer *writer = &wal_writer_singleton;
+	vclock_copy(&writer->checkpoint_vclock, &msg->vclock);
+	return 0;
+}
+
+void
+wal_commit_checkpoint(const struct vclock *vclock)
+{
+	struct wal_writer *writer = &wal_writer_singleton;
+	if (writer->wal_mode == WAL_NONE) {
+		/*
+		 * Like wal_begin_checkpoint() and wal_collect_garbage(),
+		 * do not call into the WAL thread in WAL_NONE mode.
+		 */
+		vclock_copy(&writer->checkpoint_vclock, vclock);
+		return;
+	}
+	struct wal_checkpoint_msg msg;
+	vclock_copy(&msg.vclock, vclock);
+	bool cancellable = fiber_set_cancellable(false);
+	cbus_call(&wal_thread.wal_pipe, &wal_thread.tx_prio_pipe,
+		  &msg.base, wal_commit_checkpoint_f, NULL, TIMEOUT_INFINITY);
+	fiber_set_cancellable(cancellable);
+}
+
 struct wal_gc_msg
 {
 	struct cbus_call_msg base;
-	const struct vclock *wal_vclock;
-	const struct vclock *checkpoint_vclock;
+	const struct vclock *vclock;
 };
 
 static int
 wal_collect_garbage_f(struct cbus_call_msg *data)
 {
 	struct wal_writer *writer = &wal_writer_singleton;
-	struct wal_gc_msg *msg = (struct wal_gc_msg *)data;
-	const struct vclock *vclock = msg->wal_vclock;
+	const struct vclock *vclock = ((struct wal_gc_msg *)data)->vclock;
 
 	if (!xlog_is_open(&writer->current_wal) &&
 	    vclock_sum(vclock) >= vclock_sum(&writer->vclock)) {
@@ -564,20 +583,17 @@ wal_collect_garbage_f(struct cbus_call_msg *data)
 	if (vclock != NULL)
 		xdir_collect_garbage(&writer->wal_dir, vclock_sum(vclock), 0);
 
-	vclock_copy(&writer->checkpoint_vclock, msg->checkpoint_vclock);
 	return 0;
 }
 
 void
-wal_collect_garbage(const struct vclock *wal_vclock,
-		    const struct vclock *checkpoint_vclock)
+wal_collect_garbage(const struct vclock *vclock)
 {
 	struct wal_writer *writer = &wal_writer_singleton;
 	if (writer->wal_mode == WAL_NONE)
 		return;
 	struct wal_gc_msg msg;
-	msg.wal_vclock = wal_vclock;
-	msg.checkpoint_vclock = checkpoint_vclock;
+	msg.vclock = vclock;
 	bool cancellable = fiber_set_cancellable(false);
 	cbus_call(&wal_thread.wal_pipe, &wal_thread.tx_prio_pipe, &msg.base,
 		  wal_collect_garbage_f, NULL, TIMEOUT_INFINITY);
diff --git a/src/box/wal.h b/src/box/wal.h
index 7ca27f1a..5f3a66ce 100644
--- a/src/box/wal.h
+++ b/src/box/wal.h
@@ -58,7 +58,7 @@ wal_thread_start();
 int
 wal_init(enum wal_mode wal_mode, const char *wal_dirname, int64_t wal_max_rows,
 	 int64_t wal_max_size, const struct tt_uuid *instance_uuid,
-	 const struct vclock *vclock, const struct vclock *first_checkpoint_vclock);
+	 const struct vclock *vclock, const struct vclock *checkpoint_vclock);
 
 void
 wal_thread_stop();
@@ -179,20 +179,22 @@ wal_flush(void);
  * is supposed to be used to identify the new checkpoint.
  */
 int
-wal_checkpoint(struct vclock *vclock);
+wal_begin_checkpoint(struct vclock *vclock);
+
+/**
+ * This function is called upon successful checkpoint creation.
+ * It updates the WAL thread's version of the last checkpoint
+ * vclock.
+ */
+void
+wal_commit_checkpoint(const struct vclock *vclock);
 
 /**
  * Remove WAL files that are not needed by consumers reading
- * rows at @wal_vclock or newer.
- *
- * Update the oldest checkpoint signature with @checkpoint_vclock.
- * WAL thread will delete WAL files that are not needed to
- * recover from the oldest checkpoint if it runs out of disk
- * space.
+ * rows at @vclock or newer.
  */
 void
-wal_collect_garbage(const struct vclock *wal_vclock,
-		    const struct vclock *checkpoint_vclock);
+wal_collect_garbage(const struct vclock *vclock);
 
 void
 wal_init_vy_log();
diff --git a/test/replication/gc_no_space.result b/test/replication/gc_no_space.result
index ceea8ab3..5c64bea4 100644
--- a/test/replication/gc_no_space.result
+++ b/test/replication/gc_no_space.result
@@ -160,10 +160,21 @@ check_snap_count(2)
 ---
 - true
 ...
-#box.info.gc().consumers -- 3
+gc = box.info.gc()
+---
+...
+#gc.consumers -- 3
 ---
 - 3
 ...
+#gc.checkpoints -- 2
+---
+- 2
+...
+gc.signature == gc.consumers[1].signature
+---
+- true
+...
 --
 -- Inject a ENOSPC error and check that the WAL thread deletes
 -- old WAL files to prevent the user from seeing the error.
@@ -188,15 +199,28 @@ check_snap_count(2)
 ---
 - true
 ...
-#box.info.gc().consumers -- 1
+gc = box.info.gc()
+---
+...
+#gc.consumers -- 1
 ---
 - 1
 ...
+#gc.checkpoints -- 2
+---
+- 2
+...
+gc.signature == gc.consumers[1].signature
+---
+- true
+...
 --
 -- Check that the WAL thread never deletes WAL files that are
--- needed for recovery from a checkpoint.
+-- needed for recovery from the last checkpoint, but may delete
+-- older WAL files that would be kept otherwise for recovery
+-- from backup checkpoints.
 --
-errinj.set('ERRINJ_WAL_FALLOCATE', 2)
+errinj.set('ERRINJ_WAL_FALLOCATE', 3)
 ---
 - ok
 ...
@@ -208,7 +232,7 @@ errinj.info()['ERRINJ_WAL_FALLOCATE'].state -- 0
 ---
 - 0
 ...
-check_wal_count(2)
+check_wal_count(1)
 ---
 - true
 ...
@@ -216,10 +240,21 @@ check_snap_count(2)
 ---
 - true
 ...
-#box.info.gc().consumers -- 0
+gc = box.info.gc()
+---
+...
+#gc.consumers -- 0
 ---
 - 0
 ...
+#gc.checkpoints -- 2
+---
+- 2
+...
+gc.signature == gc.checkpoints[2].signature
+---
+- true
+...
 s:drop()
 ---
 ...
diff --git a/test/replication/gc_no_space.test.lua b/test/replication/gc_no_space.test.lua
index be2e3229..7f5ab803 100644
--- a/test/replication/gc_no_space.test.lua
+++ b/test/replication/gc_no_space.test.lua
@@ -70,7 +70,10 @@ s:auto_increment{}
 
 check_wal_count(5)
 check_snap_count(2)
-#box.info.gc().consumers -- 3
+gc = box.info.gc()
+#gc.consumers -- 3
+#gc.checkpoints -- 2
+gc.signature == gc.consumers[1].signature
 
 --
 -- Inject a ENOSPC error and check that the WAL thread deletes
@@ -82,19 +85,27 @@ errinj.info()['ERRINJ_WAL_FALLOCATE'].state -- 0
 
 check_wal_count(3)
 check_snap_count(2)
-#box.info.gc().consumers -- 1
+gc = box.info.gc()
+#gc.consumers -- 1
+#gc.checkpoints -- 2
+gc.signature == gc.consumers[1].signature
 
 --
 -- Check that the WAL thread never deletes WAL files that are
--- needed for recovery from a checkpoint.
+-- needed for recovery from the last checkpoint, but may delete
+-- older WAL files that would be kept otherwise for recovery
+-- from backup checkpoints.
 --
-errinj.set('ERRINJ_WAL_FALLOCATE', 2)
+errinj.set('ERRINJ_WAL_FALLOCATE', 3)
 s:auto_increment{} -- failure
 errinj.info()['ERRINJ_WAL_FALLOCATE'].state -- 0
 
-check_wal_count(2)
+check_wal_count(1)
 check_snap_count(2)
-#box.info.gc().consumers -- 0
+gc = box.info.gc()
+#gc.consumers -- 0
+#gc.checkpoints -- 2
+gc.signature == gc.checkpoints[2].signature
 
 s:drop()
 box.schema.user.revoke('guest', 'replication')
-- 
2.11.0