Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: kostja@tarantool.org
Cc: tarantool-patches@freelists.org
Subject: [PATCH 6/6] wal: remove files needed for recovery from backup checkpoints on ENOSPC
Date: Sun, 25 Nov 2018 16:48:13 +0300	[thread overview]
Message-ID: <42ab54945eb5d9154668ad80e640c4e52c01835f.1543152574.git.vdavydov.dev@gmail.com> (raw)
In-Reply-To: <cover.1543152574.git.vdavydov.dev@gmail.com>

Tarantool always keeps the box.cfg.checkpoint_count latest checkpoints.
For the sake of redundancy, it also never deletes WAL files needed for
recovery from any of them, even if it gets ENOSPC while trying to write
to the WAL. This patch changes that behavior: in case of an emergency
ENOSPC, the WAL thread is now allowed to delete WAL files that are
needed only for recovery from backup checkpoints, i.e. any checkpoint
but the most recent one - after all, that is better than stopping
operation.

Closes #3822
---
 src/box/box.cc                        | 17 +++++++------
 src/box/gc.c                          | 17 ++++++++-----
 src/box/gc.h                          | 14 ----------
 src/box/wal.c                         | 48 +++++++++++++++++++++++------------
 src/box/wal.h                         | 22 ++++++++--------
 test/replication/gc_no_space.result   | 47 +++++++++++++++++++++++++++++-----
 test/replication/gc_no_space.test.lua | 23 ++++++++++++-----
 7 files changed, 121 insertions(+), 67 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index 5ea2f014..72788f82 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -2104,6 +2104,8 @@ box_cfg_xc(void)
 		/* Bootstrap a new master */
 		bootstrap(&instance_uuid, &replicaset_uuid,
 			  &is_bootstrap_leader);
+		checkpoint = gc_last_checkpoint();
+		assert(checkpoint != NULL);
 	}
 	fiber_gc();
 
@@ -2117,16 +2119,13 @@ box_cfg_xc(void)
 		}
 	}
 
-	struct gc_checkpoint *first_checkpoint = gc_first_checkpoint();
-	assert(first_checkpoint != NULL);
-
 	/* Start WAL writer */
 	int64_t wal_max_rows = box_check_wal_max_rows(cfg_geti64("rows_per_wal"));
 	int64_t wal_max_size = box_check_wal_max_size(cfg_geti64("wal_max_size"));
 	enum wal_mode wal_mode = box_check_wal_mode(cfg_gets("wal_mode"));
 	if (wal_init(wal_mode, cfg_gets("wal_dir"), wal_max_rows,
 		     wal_max_size, &INSTANCE_UUID, &replicaset.vclock,
-		     &first_checkpoint->vclock) != 0) {
+		     &checkpoint->vclock) != 0) {
 		diag_raise();
 	}
 	gc_set_wal_watcher();
@@ -2190,15 +2189,17 @@ box_checkpoint()
 		goto end;
 
 	struct vclock vclock;
-	if ((rc = wal_checkpoint(&vclock)))
+	if ((rc = wal_begin_checkpoint(&vclock)))
+		goto end;
+
+	if ((rc = engine_commit_checkpoint(&vclock)))
 		goto end;
 
-	rc = engine_commit_checkpoint(&vclock);
+	wal_commit_checkpoint(&vclock);
+	gc_add_checkpoint(&vclock);
 end:
 	if (rc)
 		engine_abort_checkpoint();
-	else
-		gc_add_checkpoint(&vclock);
 
 	latch_unlock(&schema_lock);
 	box_checkpoint_is_in_progress = false;
diff --git a/src/box/gc.c b/src/box/gc.c
index 55c36d15..05773b91 100644
--- a/src/box/gc.c
+++ b/src/box/gc.c
@@ -218,13 +218,8 @@ gc_run(void)
 	int rc = 0;
 	if (run_engine_gc)
 		rc = engine_collect_garbage(&checkpoint->vclock);
-	/*
-	 * Run wal_collect_garbage() even if we don't need to
-	 * delete any WAL files, because we have to apprise
-	 * the WAL thread of the oldest checkpoint signature.
-	 */
-	if (rc == 0)
-		wal_collect_garbage(vclock, &checkpoint->vclock);
+	if (run_wal_gc && rc == 0)
+		wal_collect_garbage(vclock);
 	latch_unlock(&gc.latch);
 }
 
@@ -236,6 +231,14 @@ gc_process_wal_event(struct wal_watcher_msg *msg)
 {
 	assert((msg->events & WAL_EVENT_GC) != 0);
 
+	/*
+	 * In case of emergency ENOSPC, the WAL thread may delete
+	 * WAL files needed to restore from backup checkpoints,
+	 * which would be kept by the garbage collector otherwise.
+	 * Bring the garbage collector vclock up to date.
+	 */
+	vclock_copy(&gc.vclock, &msg->gc_vclock);
+
 	struct gc_consumer *consumer = gc_tree_first(&gc.consumers);
 	while (consumer != NULL &&
 	       vclock_sum(&consumer->vclock) < vclock_sum(&msg->gc_vclock)) {
diff --git a/src/box/gc.h b/src/box/gc.h
index e1241baa..eab19ba3 100644
--- a/src/box/gc.h
+++ b/src/box/gc.h
@@ -156,20 +156,6 @@ extern struct gc_state gc;
 	rlist_foreach_entry(ref, &(checkpoint)->refs, in_refs)
 
 /**
- * Return the first (oldest) checkpoint known to the garbage
- * collector. If there's no checkpoint, return NULL.
- */
-static inline struct gc_checkpoint *
-gc_first_checkpoint(void)
-{
-	if (rlist_empty(&gc.checkpoints))
-		return NULL;
-
-	return rlist_first_entry(&gc.checkpoints, struct gc_checkpoint,
-				 in_checkpoints);
-}
-
-/**
  * Return the last (newest) checkpoint known to the garbage
  * collector. If there's no checkpoint, return NULL.
  */
diff --git a/src/box/wal.c b/src/box/wal.c
index 3e6c1e7f..f0b19b7c 100644
--- a/src/box/wal.c
+++ b/src/box/wal.c
@@ -120,7 +120,7 @@ struct wal_writer
 	 */
 	struct vclock vclock;
 	/**
-	 * VClock of the oldest checkpoint available on the instance.
+	 * VClock of the most recent successfully created checkpoint.
 	 * The WAL writer must not delete WAL files that are needed to
 	 * recover from it even if it is running out of disk space.
 	 */
@@ -419,14 +419,14 @@ wal_open(struct wal_writer *writer)
 int
 wal_init(enum wal_mode wal_mode, const char *wal_dirname, int64_t wal_max_rows,
 	 int64_t wal_max_size, const struct tt_uuid *instance_uuid,
-	 const struct vclock *vclock, const struct vclock *first_checkpoint_vclock)
+	 const struct vclock *vclock, const struct vclock *checkpoint_vclock)
 {
 	assert(wal_max_rows > 1);
 
 	struct wal_writer *writer = &wal_writer_singleton;
 	wal_writer_create(writer, wal_mode, wal_dirname, wal_max_rows,
 			  wal_max_size, instance_uuid, vclock,
-			  first_checkpoint_vclock);
+			  checkpoint_vclock);
 
 	/*
 	 * Scan the WAL directory to build an index of all
@@ -473,7 +473,7 @@ struct wal_checkpoint_msg {
 };
 
 static int
-wal_checkpoint_f(struct cbus_call_msg *data)
+wal_begin_checkpoint_f(struct cbus_call_msg *data)
 {
 	struct wal_checkpoint_msg *msg = (struct wal_checkpoint_msg *)data;
 	struct wal_writer *writer = &wal_writer_singleton;
@@ -481,7 +481,7 @@ wal_checkpoint_f(struct cbus_call_msg *data)
 		/*
 		 * We're rolling back a failed write and so
 		 * can't make a checkpoint - see the comment
-		 * in wal_checkpoint() for the explanation.
+		 * in wal_begin_checkpoint() for the explanation.
 		 */
 		diag_set(ClientError, ER_CHECKPOINT_ROLLBACK);
 		return -1;
@@ -503,7 +503,7 @@ wal_checkpoint_f(struct cbus_call_msg *data)
 }
 
 int
-wal_checkpoint(struct vclock *vclock)
+wal_begin_checkpoint(struct vclock *vclock)
 {
 	struct wal_writer *writer = &wal_writer_singleton;
 	if (writer->wal_mode == WAL_NONE) {
@@ -524,7 +524,8 @@ wal_checkpoint(struct vclock *vclock)
 	struct wal_checkpoint_msg msg;
 	bool cancellable = fiber_set_cancellable(false);
 	int rc = cbus_call(&wal_thread.wal_pipe, &wal_thread.tx_prio_pipe,
-			   &msg.base, wal_checkpoint_f, NULL, TIMEOUT_INFINITY);
+			   &msg.base, wal_begin_checkpoint_f, NULL,
+			   TIMEOUT_INFINITY);
 	fiber_set_cancellable(cancellable);
 	if (rc != 0)
 		return -1;
@@ -532,19 +533,37 @@ wal_checkpoint(struct vclock *vclock)
 	return 0;
 }
 
+static int
+wal_commit_checkpoint_f(struct cbus_call_msg *data)
+{
+	struct wal_checkpoint_msg *msg = (struct wal_checkpoint_msg *)data;
+	struct wal_writer *writer = &wal_writer_singleton;
+	vclock_copy(&writer->checkpoint_vclock, &msg->vclock);
+	return 0;
+}
+
+void
+wal_commit_checkpoint(const struct vclock *vclock)
+{
+	struct wal_checkpoint_msg msg;
+	vclock_copy(&msg.vclock, vclock);
+	bool cancellable = fiber_set_cancellable(false);
+	cbus_call(&wal_thread.wal_pipe, &wal_thread.tx_prio_pipe,
+		  &msg.base, wal_commit_checkpoint_f, NULL, TIMEOUT_INFINITY);
+	fiber_set_cancellable(cancellable);
+}
+
 struct wal_gc_msg
 {
 	struct cbus_call_msg base;
-	const struct vclock *wal_vclock;
-	const struct vclock *checkpoint_vclock;
+	const struct vclock *vclock;
 };
 
 static int
 wal_collect_garbage_f(struct cbus_call_msg *data)
 {
 	struct wal_writer *writer = &wal_writer_singleton;
-	struct wal_gc_msg *msg = (struct wal_gc_msg *)data;
-	const struct vclock *vclock = msg->wal_vclock;
+	const struct vclock *vclock = ((struct wal_gc_msg *)data)->vclock;
 
 	if (!xlog_is_open(&writer->current_wal) &&
 	    vclock_sum(vclock) >= vclock_sum(&writer->vclock)) {
@@ -564,20 +583,17 @@ wal_collect_garbage_f(struct cbus_call_msg *data)
 	if (vclock != NULL)
 		xdir_collect_garbage(&writer->wal_dir, vclock_sum(vclock), 0);
 
-	vclock_copy(&writer->checkpoint_vclock, msg->checkpoint_vclock);
 	return 0;
 }
 
 void
-wal_collect_garbage(const struct vclock *wal_vclock,
-		    const struct vclock *checkpoint_vclock)
+wal_collect_garbage(const struct vclock *vclock)
 {
 	struct wal_writer *writer = &wal_writer_singleton;
 	if (writer->wal_mode == WAL_NONE)
 		return;
 	struct wal_gc_msg msg;
-	msg.wal_vclock = wal_vclock;
-	msg.checkpoint_vclock = checkpoint_vclock;
+	msg.vclock = vclock;
 	bool cancellable = fiber_set_cancellable(false);
 	cbus_call(&wal_thread.wal_pipe, &wal_thread.tx_prio_pipe, &msg.base,
 		  wal_collect_garbage_f, NULL, TIMEOUT_INFINITY);
diff --git a/src/box/wal.h b/src/box/wal.h
index 7ca27f1a..5f3a66ce 100644
--- a/src/box/wal.h
+++ b/src/box/wal.h
@@ -58,7 +58,7 @@ wal_thread_start();
 int
 wal_init(enum wal_mode wal_mode, const char *wal_dirname, int64_t wal_max_rows,
 	 int64_t wal_max_size, const struct tt_uuid *instance_uuid,
-	 const struct vclock *vclock, const struct vclock *first_checkpoint_vclock);
+	 const struct vclock *vclock, const struct vclock *checkpoint_vclock);
 
 void
 wal_thread_stop();
@@ -179,20 +179,22 @@ wal_flush(void);
  * is supposed to be used to identify the new checkpoint.
  */
 int
-wal_checkpoint(struct vclock *vclock);
+wal_begin_checkpoint(struct vclock *vclock);
+
+/**
+ * This function is called upon successful checkpoint creation.
+ * It updates the WAL thread's version of the last checkpoint
+ * vclock.
+ */
+void
+wal_commit_checkpoint(const struct vclock *vclock);
 
 /**
  * Remove WAL files that are not needed by consumers reading
- * rows at @wal_vclock or newer.
- *
- * Update the oldest checkpoint signature with @checkpoint_vclock.
- * WAL thread will delete WAL files that are not needed to
- * recover from the oldest checkpoint if it runs out of disk
- * space.
+ * rows at @vclock or newer.
  */
 void
-wal_collect_garbage(const struct vclock *wal_vclock,
-		    const struct vclock *checkpoint_vclock);
+wal_collect_garbage(const struct vclock *vclock);
 
 void
 wal_init_vy_log();
diff --git a/test/replication/gc_no_space.result b/test/replication/gc_no_space.result
index ceea8ab3..5c64bea4 100644
--- a/test/replication/gc_no_space.result
+++ b/test/replication/gc_no_space.result
@@ -160,10 +160,21 @@ check_snap_count(2)
 ---
 - true
 ...
-#box.info.gc().consumers -- 3
+gc = box.info.gc()
+---
+...
+#gc.consumers -- 3
 ---
 - 3
 ...
+#gc.checkpoints -- 2
+---
+- 2
+...
+gc.signature == gc.consumers[1].signature
+---
+- true
+...
 --
 -- Inject a ENOSPC error and check that the WAL thread deletes
 -- old WAL files to prevent the user from seeing the error.
@@ -188,15 +199,28 @@ check_snap_count(2)
 ---
 - true
 ...
-#box.info.gc().consumers -- 1
+gc = box.info.gc()
+---
+...
+#gc.consumers -- 1
 ---
 - 1
 ...
+#gc.checkpoints -- 2
+---
+- 2
+...
+gc.signature == gc.consumers[1].signature
+---
+- true
+...
 --
 -- Check that the WAL thread never deletes WAL files that are
--- needed for recovery from a checkpoint.
+-- needed for recovery from the last checkpoint, but may delete
+-- older WAL files that would be kept otherwise for recovery
+-- from backup checkpoints.
 --
-errinj.set('ERRINJ_WAL_FALLOCATE', 2)
+errinj.set('ERRINJ_WAL_FALLOCATE', 3)
 ---
 - ok
 ...
@@ -208,7 +232,7 @@ errinj.info()['ERRINJ_WAL_FALLOCATE'].state -- 0
 ---
 - 0
 ...
-check_wal_count(2)
+check_wal_count(1)
 ---
 - true
 ...
@@ -216,10 +240,21 @@ check_snap_count(2)
 ---
 - true
 ...
-#box.info.gc().consumers -- 0
+gc = box.info.gc()
+---
+...
+#gc.consumers -- 0
 ---
 - 0
 ...
+#gc.checkpoints -- 2
+---
+- 2
+...
+gc.signature == gc.checkpoints[2].signature
+---
+- true
+...
 s:drop()
 ---
 ...
diff --git a/test/replication/gc_no_space.test.lua b/test/replication/gc_no_space.test.lua
index be2e3229..7f5ab803 100644
--- a/test/replication/gc_no_space.test.lua
+++ b/test/replication/gc_no_space.test.lua
@@ -70,7 +70,10 @@ s:auto_increment{}
 
 check_wal_count(5)
 check_snap_count(2)
-#box.info.gc().consumers -- 3
+gc = box.info.gc()
+#gc.consumers -- 3
+#gc.checkpoints -- 2
+gc.signature == gc.consumers[1].signature
 
 --
 -- Inject a ENOSPC error and check that the WAL thread deletes
@@ -82,19 +85,27 @@ errinj.info()['ERRINJ_WAL_FALLOCATE'].state -- 0
 
 check_wal_count(3)
 check_snap_count(2)
-#box.info.gc().consumers -- 1
+gc = box.info.gc()
+#gc.consumers -- 1
+#gc.checkpoints -- 2
+gc.signature == gc.consumers[1].signature
 
 --
 -- Check that the WAL thread never deletes WAL files that are
--- needed for recovery from a checkpoint.
+-- needed for recovery from the last checkpoint, but may delete
+-- older WAL files that would be kept otherwise for recovery
+-- from backup checkpoints.
 --
-errinj.set('ERRINJ_WAL_FALLOCATE', 2)
+errinj.set('ERRINJ_WAL_FALLOCATE', 3)
 s:auto_increment{} -- failure
 errinj.info()['ERRINJ_WAL_FALLOCATE'].state -- 0
 
-check_wal_count(2)
+check_wal_count(1)
 check_snap_count(2)
-#box.info.gc().consumers -- 0
+gc = box.info.gc()
+#gc.consumers -- 0
+#gc.checkpoints -- 2
+gc.signature == gc.checkpoints[2].signature
 
 s:drop()
 box.schema.user.revoke('guest', 'replication')
-- 
2.11.0

  parent reply	other threads:[~2018-11-25 13:48 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-25 13:48 [PATCH 0/6] WAL garbage collection and checkpointing fixes Vladimir Davydov
2018-11-25 13:48 ` [PATCH 1/6] vclock: allow to use const vclock as search key Vladimir Davydov
2018-11-26 17:38   ` Konstantin Osipov
2018-11-27  9:56     ` Vladimir Davydov
2018-11-25 13:48 ` [PATCH 2/6] engine: pass vclock instead of lsn to collect_garbage callback Vladimir Davydov
2018-11-26 17:41   ` Konstantin Osipov
2018-11-27  9:56     ` Vladimir Davydov
2018-11-25 13:48 ` [PATCH 3/6] box: do not rotate WAL when replica subscribes Vladimir Davydov
2018-11-26 17:50   ` Konstantin Osipov
2018-11-27  9:57     ` Vladimir Davydov
2018-11-25 13:48 ` [PATCH 4/6] box: use replicaset.vclock in replica join/subscribe Vladimir Davydov
2018-11-26 17:54   ` Konstantin Osipov
2018-11-27  9:57     ` Vladimir Davydov
2018-11-25 13:48 ` [PATCH 5/6] wal: separate checkpoint and flush paths Vladimir Davydov
2018-11-26 17:58   ` Konstantin Osipov
2018-11-26 20:19     ` Vladimir Davydov
2018-11-28 16:46       ` Konstantin Osipov
2018-11-25 13:48 ` Vladimir Davydov [this message]
2018-11-28 16:14   ` [PATCH 6/6] wal: remove files needed for recovery from backup checkpoints on ENOSPC Vladimir Davydov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=42ab54945eb5d9154668ad80e640c4e52c01835f.1543152574.git.vdavydov.dev@gmail.com \
    --to=vdavydov.dev@gmail.com \
    --cc=kostja@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [PATCH 6/6] wal: remove files needed for recovery from backup checkpoints on ENOSPC' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.