[PATCH] vinyl: delete runs compacted during join immediately

Vladimir Davydov vdavydov.dev at gmail.com
Fri Feb 16 20:44:28 MSK 2018


We keep run files corresponding to (at least) the last snapshot, because
we need them for backups and replication. Deletion of compacted run
files is postponed until the next snapshot. As a consequence, we don't
delete run files created on a replica during the join stage. However, in
contrast to run files created during normal operation, these are pure
garbage and should be deleted right away. Not deleting them can result
in depletion of disk space, because vinyl has quite high write
amplification by design.

We can't write a functional test for this, because there's no way to
guarantee that compaction started during join will finish before join
completion - if it doesn't, compacted runs won't be removed, because
they will be assigned to the snapshot created by join.

Closes #3162
---
 src/box/vinyl.c                                  | 17 ++------------
 src/box/vy_run.c                                 | 19 ++++++++++++++++
 src/box/vy_run.h                                 |  9 ++++++++
 src/box/vy_scheduler.c                           | 17 ++++++++++++++
 test/vinyl/{join_quota.lua => replica_quota.lua} |  0
 test/vinyl/replica_quota.result                  | 28 ++++++++++++++++++++++--
 test/vinyl/replica_quota.test.lua                | 16 ++++++++++++--
 7 files changed, 87 insertions(+), 19 deletions(-)
 rename test/vinyl/{join_quota.lua => replica_quota.lua} (100%)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 03ad2bf4..e4ef6173 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -54,7 +54,6 @@
 #include <small/region.h>
 #include <small/mempool.h>
 
-#include "coio_file.h"
 #include "coio_task.h"
 #include "cbus.h"
 #include "histogram.h"
@@ -3334,20 +3333,8 @@ vy_gc_cb(const struct vy_log_record *record, void *cb_arg)
 				(long long)record->run_id); goto out;});
 
 	/* Try to delete files. */
-	bool forget = true;
-	char path[PATH_MAX];
-	for (int type = 0; type < vy_file_MAX; type++) {
-		vy_run_snprint_path(path, sizeof(path), arg->env->path,
-				    arg->space_id, arg->index_id,
-				    record->run_id, type);
-		say_info("removing %s", path);
-		if (coio_unlink(path) < 0 && errno != ENOENT) {
-			say_syserror("error while removing %s", path);
-			forget = false;
-		}
-	}
-
-	if (!forget)
+	if (vy_run_remove_files(arg->env->path, arg->space_id,
+				arg->index_id, record->run_id) != 0)
 		goto out;
 
 	/* Forget the run on success. */
diff --git a/src/box/vy_run.c b/src/box/vy_run.c
index ccbb0ca1..4935b4ed 100644
--- a/src/box/vy_run.c
+++ b/src/box/vy_run.c
@@ -37,6 +37,7 @@
 #include "fio.h"
 #include "cbus.h"
 #include "memory.h"
+#include "coio_file.h"
 
 #include "replication.h"
 #include "tuple_hash.h" /* for bloom filter */
@@ -2602,6 +2603,24 @@ close_err:
 	return -1;
 }
 
+int
+vy_run_remove_files(const char *dir, uint32_t space_id,
+		    uint32_t iid, int64_t run_id)
+{
+	int ret = 0;
+	char path[PATH_MAX];
+	for (int type = 0; type < vy_file_MAX; type++) {
+		vy_run_snprint_path(path, sizeof(path), dir,
+				    space_id, iid, run_id, type);
+		say_info("removing %s", path);
+		if (coio_unlink(path) < 0 && errno != ENOENT) {
+			say_syserror("error while removing %s", path);
+			ret = -1;
+		}
+	}
+	return ret;
+}
+
 /**
  * Read a page with stream->page_no from the run and save it in stream->page.
  * Support function of slice stream.
diff --git a/src/box/vy_run.h b/src/box/vy_run.h
index cf0569c2..1dde0776 100644
--- a/src/box/vy_run.h
+++ b/src/box/vy_run.h
@@ -403,6 +403,15 @@ vy_run_snprint_path(char *buf, int size, const char *dir,
 	return total;
 }
 
+/**
+ * Remove all files (data, index) corresponding to a run
+ * with the given id. Return 0 on success, -1 if unlink()
+ * failed.
+ */
+int
+vy_run_remove_files(const char *dir, uint32_t space_id,
+		    uint32_t iid, int64_t run_id);
+
 int
 vy_run_write(struct vy_run *run, const char *dirpath,
 	     uint32_t space_id, uint32_t iid,
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 35fc24f4..df8eb87f 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -1071,6 +1071,23 @@ vy_task_compact_complete(struct vy_scheduler *scheduler, struct vy_task *task)
 		return -1;
 	}
 
+	if (gc_lsn < 0) {
+		/*
+		 * If there is no last snapshot, i.e. we are in
+		 * the middle of join, we can delete compacted
+		 * run files right away.
+		 */
+		vy_log_tx_begin();
+		rlist_foreach_entry(run, &unused_runs, in_unused) {
+			if (vy_run_remove_files(index->env->path,
+						index->space_id, index->id,
+						run->id) == 0) {
+				vy_log_forget_run(run->id);
+			}
+		}
+		vy_log_tx_try_commit();
+	}
+
 	/*
 	 * Account the new run if it is not empty,
 	 * otherwise discard it.
diff --git a/test/vinyl/join_quota.lua b/test/vinyl/replica_quota.lua
similarity index 100%
rename from test/vinyl/join_quota.lua
rename to test/vinyl/replica_quota.lua
diff --git a/test/vinyl/replica_quota.result b/test/vinyl/replica_quota.result
index b85c7398..460cc1e6 100644
--- a/test/vinyl/replica_quota.result
+++ b/test/vinyl/replica_quota.result
@@ -10,7 +10,7 @@ box.schema.user.grant('guest', 'replication')
 s = box.schema.space.create('test', { engine = 'vinyl' })
 ---
 ...
-_ = s:create_index('pk')
+_ = s:create_index('pk', {run_count_per_level = 1})
 ---
 ...
 -- Send > 2 MB to replica.
@@ -36,7 +36,7 @@ for i = 1001,2000 do s:insert{i, pad} end
 -- a dump to complete and hence would result in bootstrap failure
 -- were the timeout not ignored.
 --
-_ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/join_quota.lua'")
+_ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/replica_quota.lua'")
 ---
 ...
 _ = test_run:cmd("start server replica")
@@ -58,6 +58,30 @@ _ = test_run:cmd("start server replica")
 _ = test_run:wait_lsn('replica', 'default')
 ---
 ...
+-- During join we remove compacted run files immediately (gh-3162).
+-- Check that we don't delete files that are still in use.
+_ = test_run:cmd("stop server replica")
+---
+...
+_ = test_run:cmd("cleanup server replica")
+---
+...
+box.snapshot()
+---
+- ok
+...
+for i = 3001,4000 do s:insert{i, pad} end
+---
+...
+_ = test_run:cmd("start server replica") -- join
+---
+...
+_ = test_run:cmd("stop server replica")
+---
+...
+_ = test_run:cmd("start server replica") -- recovery
+---
+...
 _ = test_run:cmd("stop server replica")
 ---
 ...
diff --git a/test/vinyl/replica_quota.test.lua b/test/vinyl/replica_quota.test.lua
index ab89c1bc..eade6f2f 100644
--- a/test/vinyl/replica_quota.test.lua
+++ b/test/vinyl/replica_quota.test.lua
@@ -4,7 +4,7 @@ box.schema.user.grant('guest', 'read,write,execute', 'universe')
 box.schema.user.grant('guest', 'replication')
 
 s = box.schema.space.create('test', { engine = 'vinyl' })
-_ = s:create_index('pk')
+_ = s:create_index('pk', {run_count_per_level = 1})
 
 -- Send > 2 MB to replica.
 pad = string.rep('x', 1100)
@@ -21,7 +21,7 @@ for i = 1001,2000 do s:insert{i, pad} end
 -- a dump to complete and hence would result in bootstrap failure
 -- were the timeout not ignored.
 --
-_ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/join_quota.lua'")
+_ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/replica_quota.lua'")
 _ = test_run:cmd("start server replica")
 _ = test_run:wait_lsn('replica', 'default')
 
@@ -31,6 +31,18 @@ for i = 2001,3000 do s:insert{i, pad} end
 _ = test_run:cmd("start server replica")
 _ = test_run:wait_lsn('replica', 'default')
 
+-- During join we remove compacted run files immediately (gh-3162).
+-- Check that we don't delete files that are still in use.
+_ = test_run:cmd("stop server replica")
+_ = test_run:cmd("cleanup server replica")
+
+box.snapshot()
+for i = 3001,4000 do s:insert{i, pad} end
+
+_ = test_run:cmd("start server replica") -- join
+_ = test_run:cmd("stop server replica")
+_ = test_run:cmd("start server replica") -- recovery
+
 _ = test_run:cmd("stop server replica")
 _ = test_run:cmd("cleanup server replica")
 
-- 
2.11.0




More information about the Tarantool-patches mailing list