[PATCH] vinyl: fix compaction vs checkpoint race resulting in invalid gc

Vladimir Davydov vdavydov.dev at gmail.com
Thu May 31 15:51:15 MSK 2018


The callback invoked upon compaction completion uses checkpoint_last()
to determine whether compacted runs may be deleted: if the max LSN
stored in a compacted run (run->dump_lsn) is greater than the LSN of the
last checkpoint (gc_lsn) then the run doesn't belong to the last
checkpoint and hence is safe to delete, see commit 35db70fac55c ("vinyl:
remove runs not referenced by any checkpoint immediately").

The problem is checkpoint_last() isn't synced with vylog rotation - it
returns the signature of the last successfully created memtx snapshot
and is updated in memtx_engine_commit_checkpoint() after vylog is
rotated. If a compaction task completes after vylog is rotated but
before snap file is renamed, it will assume that compacted runs do not
belong to the last checkpoint, although they do (as they have been
appended to the rotated vylog), and delete them.

To eliminate this race, let's use vylog signature instead of snap
signature in vy_task_compact_complete().

Closes #3437
---
This patch is for 1.9

https://github.com/tarantool/tarantool/issues/3437
https://github.com/tarantool/tarantool/commits/gh-3437-vy-fix-invalid-gc

 src/box/memtx_engine.c     |  8 ++++++
 src/box/vy_log.c           |  6 ++++
 src/box/vy_log.h           |  6 ++++
 src/box/vy_scheduler.c     |  2 +-
 src/errinj.h               |  1 +
 test/box/errinj.result     |  2 ++
 test/vinyl/errinj.result   | 69 ++++++++++++++++++++++++++++++++++++++++++++++
 test/vinyl/errinj.test.lua | 29 +++++++++++++++++++
 8 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/src/box/memtx_engine.c b/src/box/memtx_engine.c
index cf88ca26..9a9aff51 100644
--- a/src/box/memtx_engine.c
+++ b/src/box/memtx_engine.c
@@ -724,6 +724,14 @@ memtx_engine_commit_checkpoint(struct engine *engine, struct vclock *vclock)
 		snprintf(to, sizeof(to), "%s",
 			 xdir_format_filename(dir, lsn, NONE));
 		char *from = xdir_format_filename(dir, lsn, INPROGRESS);
+#ifndef NDEBUG
+		struct errinj *delay = errinj(ERRINJ_SNAP_COMMIT_DELAY,
+					       ERRINJ_BOOL);
+		if (delay != NULL && delay->bparam) {
+			while (delay->bparam)
+				fiber_sleep(0.001);
+		}
+#endif
 		int rc = coio_rename(from, to);
 		if (rc != 0)
 			panic("can't rename .snap.inprogress");
diff --git a/src/box/vy_log.c b/src/box/vy_log.c
index c31a588e..4301e609 100644
--- a/src/box/vy_log.c
+++ b/src/box/vy_log.c
@@ -1154,6 +1154,12 @@ vy_log_collect_garbage(int64_t signature)
 	xdir_collect_garbage(&vy_log.dir, signature, true);
 }
 
+int64_t
+vy_log_signature(void)
+{
+	return vclock_sum(&vy_log.last_checkpoint);
+}
+
 const char *
 vy_log_backup_path(struct vclock *vclock)
 {
diff --git a/src/box/vy_log.h b/src/box/vy_log.h
index f17b122a..5ed99488 100644
--- a/src/box/vy_log.h
+++ b/src/box/vy_log.h
@@ -253,6 +253,12 @@ void
 vy_log_collect_garbage(int64_t signature);
 
 /**
+ * Return the signature of the newest vylog to the time.
+ */
+int64_t
+vy_log_signature(void);
+
+/**
  * Return the path to the log file that needs to be backed up
  * in order to recover to checkpoint @vclock.
  */
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 4c9103cf..3dd81629 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -1123,7 +1123,7 @@ vy_task_compact_complete(struct vy_scheduler *scheduler, struct vy_task *task)
 		if (slice == last_slice)
 			break;
 	}
-	int64_t gc_lsn = checkpoint_last(NULL);
+	int64_t gc_lsn = vy_log_signature();
 	rlist_foreach_entry(run, &unused_runs, in_unused)
 		vy_log_drop_run(run->id, gc_lsn);
 	if (new_slice != NULL) {
diff --git a/src/errinj.h b/src/errinj.h
index ed69b6cb..331529cd 100644
--- a/src/errinj.h
+++ b/src/errinj.h
@@ -109,6 +109,7 @@ struct errinj {
 	_(ERRINJ_VY_RUN_WRITE_STMT_TIMEOUT, ERRINJ_DOUBLE, {.dparam = 0}) \
 	_(ERRINJ_IPROTO_TX_DELAY, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_LOG_ROTATE, ERRINJ_BOOL, {.bparam = false}) \
+	_(ERRINJ_SNAP_COMMIT_DELAY, ERRINJ_BOOL, {.bparam = 0}) \
 
 ENUM0(errinj_id, ERRINJ_LIST);
 extern struct errinj errinjs[];
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 9e3a0bfa..df044369 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -54,6 +54,8 @@ errinj.info()
     state: false
   ERRINJ_VY_RUN_WRITE:
     state: false
+  ERRINJ_SNAP_COMMIT_DELAY:
+    state: false
   ERRINJ_RELAY_FINAL_SLEEP:
     state: false
   ERRINJ_VY_RUN_DISCARD:
diff --git a/test/vinyl/errinj.result b/test/vinyl/errinj.result
index 91351086..094d4fa9 100644
--- a/test/vinyl/errinj.result
+++ b/test/vinyl/errinj.result
@@ -4,6 +4,9 @@
 test_run = require('test_run').new()
 ---
 ...
+fio = require('fio')
+---
+...
 fiber = require('fiber')
 ---
 ...
@@ -1320,3 +1323,69 @@ ret
 s:drop()
 ---
 ...
+--
+-- gh-3437: if compaction races with checkpointing, it may remove
+-- files needed for backup.
+--
+s = box.schema.space.create('test', {engine = 'vinyl'})
+---
+...
+_ = s:create_index('pk', {run_count_per_level = 1})
+---
+...
+-- Create a run file.
+_ = s:replace{1}
+---
+...
+box.snapshot()
+---
+- ok
+...
+-- Create another run file. This will trigger compaction
+-- as run_count_per_level is set to 1. Due to the error
+-- injection compaction will finish before snapshot.
+_ = s:replace{2}
+---
+...
+errinj.set('ERRINJ_SNAP_COMMIT_DELAY', true)
+---
+- ok
+...
+c = fiber.channel(1)
+---
+...
+_ = fiber.create(function() box.snapshot() c:put(true) end)
+---
+...
+while s.index.pk:info().disk.compact.count == 0 do fiber.sleep(0.001) end
+---
+...
+errinj.set('ERRINJ_SNAP_COMMIT_DELAY', false)
+---
+- ok
+...
+c:get()
+---
+- true
+...
+-- Check that all files corresponding to the last checkpoint
+-- are present.
+files = box.backup.start()
+---
+...
+missing = {}
+---
+...
+for _, f in pairs(files) do if not fio.path.exists(f) then table.insert(missing, f) end end
+---
+...
+missing
+---
+- []
+...
+box.backup.stop()
+---
+...
+s:drop()
+---
+...
diff --git a/test/vinyl/errinj.test.lua b/test/vinyl/errinj.test.lua
index 9724a69b..46ffdcac 100644
--- a/test/vinyl/errinj.test.lua
+++ b/test/vinyl/errinj.test.lua
@@ -2,6 +2,7 @@
 -- gh-1681: vinyl: crash in vy_rollback on ER_WAL_WRITE
 --
 test_run = require('test_run').new()
+fio = require('fio')
 fiber = require('fiber')
 errinj = box.error.injection
 errinj.set("ERRINJ_VY_SCHED_TIMEOUT", 0.040)
@@ -513,3 +514,31 @@ errinj.set("ERRINJ_VY_DELAY_PK_LOOKUP", false)
 while ret == nil do fiber.sleep(0.01) end
 ret
 s:drop()
+
+--
+-- gh-3437: if compaction races with checkpointing, it may remove
+-- files needed for backup.
+--
+s = box.schema.space.create('test', {engine = 'vinyl'})
+_ = s:create_index('pk', {run_count_per_level = 1})
+-- Create a run file.
+_ = s:replace{1}
+box.snapshot()
+-- Create another run file. This will trigger compaction
+-- as run_count_per_level is set to 1. Due to the error
+-- injection compaction will finish before snapshot.
+_ = s:replace{2}
+errinj.set('ERRINJ_SNAP_COMMIT_DELAY', true)
+c = fiber.channel(1)
+_ = fiber.create(function() box.snapshot() c:put(true) end)
+while s.index.pk:info().disk.compact.count == 0 do fiber.sleep(0.001) end
+errinj.set('ERRINJ_SNAP_COMMIT_DELAY', false)
+c:get()
+-- Check that all files corresponding to the last checkpoint
+-- are present.
+files = box.backup.start()
+missing = {}
+for _, f in pairs(files) do if not fio.path.exists(f) then table.insert(missing, f) end end
+missing
+box.backup.stop()
+s:drop()
-- 
2.11.0




More information about the Tarantool-patches mailing list