From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH] vinyl: fix compaction vs checkpoint race resulting in invalid gc Date: Thu, 31 May 2018 15:51:15 +0300 Message-Id: <09ab67b920b5207ac3b47d797412da94b1fe8336.1527769684.git.vdavydov.dev@gmail.com> To: kostja@tarantool.org Cc: tarantool-patches@freelists.org List-ID: The callback invoked upon compaction completion uses checkpoint_last() to determine whether compacted runs may be deleted: if the max LSN stored in a compacted run (run->dump_lsn) is greater than the LSN of the last checkpoint (gc_lsn) then the run doesn't belong to the last checkpoint and hence is safe to delete, see commit 35db70fac55c ("vinyl: remove runs not referenced by any checkpoint immediately"). The problem is checkpoint_last() isn't synced with vylog rotation - it returns the signature of the last successfully created memtx snapshot and is updated in memtx_engine_commit_checkpoint() after vylog is rotated. If a compaction task completes after vylog is rotated but before snap file is renamed, it will assume that compacted runs do not belong to the last checkpoint, although they do (as they have been appended to the rotated vylog), and delete them. To eliminate this race, let's use vylog signature instead of snap signature in vy_task_compact_complete(). Closes #3437 --- This patch is for 1.9 https://github.com/tarantool/tarantool/issues/3437 https://github.com/tarantool/tarantool/commits/gh-3437-vy-fix-invalid-gc src/box/memtx_engine.c | 8 ++++++ src/box/vy_log.c | 6 ++++ src/box/vy_log.h | 6 ++++ src/box/vy_scheduler.c | 2 +- src/errinj.h | 1 + test/box/errinj.result | 2 ++ test/vinyl/errinj.result | 69 ++++++++++++++++++++++++++++++++++++++++++++++ test/vinyl/errinj.test.lua | 29 +++++++++++++++++++ 8 files changed, 122 insertions(+), 1 deletion(-) diff --git a/src/box/memtx_engine.c b/src/box/memtx_engine.c index cf88ca26..9a9aff51 100644 --- a/src/box/memtx_engine.c +++ b/src/box/memtx_engine.c @@ -724,6 +724,14 @@ memtx_engine_commit_checkpoint(struct engine *engine, struct vclock *vclock) snprintf(to, sizeof(to), "%s", xdir_format_filename(dir, lsn, NONE)); char *from = xdir_format_filename(dir, lsn, INPROGRESS); +#ifndef NDEBUG + struct errinj *delay = errinj(ERRINJ_SNAP_COMMIT_DELAY, + ERRINJ_BOOL); + if (delay != NULL && delay->bparam) { + while (delay->bparam) + fiber_sleep(0.001); + } +#endif int rc = coio_rename(from, to); if (rc != 0) panic("can't rename .snap.inprogress"); diff --git a/src/box/vy_log.c b/src/box/vy_log.c index c31a588e..4301e609 100644 --- a/src/box/vy_log.c +++ b/src/box/vy_log.c @@ -1154,6 +1154,12 @@ vy_log_collect_garbage(int64_t signature) xdir_collect_garbage(&vy_log.dir, signature, true); } +int64_t +vy_log_signature(void) +{ + return vclock_sum(&vy_log.last_checkpoint); +} + const char * vy_log_backup_path(struct vclock *vclock) { diff --git a/src/box/vy_log.h b/src/box/vy_log.h index f17b122a..5ed99488 100644 --- a/src/box/vy_log.h +++ b/src/box/vy_log.h @@ -253,6 +253,12 @@ void vy_log_collect_garbage(int64_t signature); /** + * Return the signature of the newest vylog to the time. + */ +int64_t +vy_log_signature(void); + +/** * Return the path to the log file that needs to be backed up * in order to recover to checkpoint @vclock. */ diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c index 4c9103cf..3dd81629 100644 --- a/src/box/vy_scheduler.c +++ b/src/box/vy_scheduler.c @@ -1123,7 +1123,7 @@ vy_task_compact_complete(struct vy_scheduler *scheduler, struct vy_task *task) if (slice == last_slice) break; } - int64_t gc_lsn = checkpoint_last(NULL); + int64_t gc_lsn = vy_log_signature(); rlist_foreach_entry(run, &unused_runs, in_unused) vy_log_drop_run(run->id, gc_lsn); if (new_slice != NULL) { diff --git a/src/errinj.h b/src/errinj.h index ed69b6cb..331529cd 100644 --- a/src/errinj.h +++ b/src/errinj.h @@ -109,6 +109,7 @@ struct errinj { _(ERRINJ_VY_RUN_WRITE_STMT_TIMEOUT, ERRINJ_DOUBLE, {.dparam = 0}) \ _(ERRINJ_IPROTO_TX_DELAY, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_LOG_ROTATE, ERRINJ_BOOL, {.bparam = false}) \ + _(ERRINJ_SNAP_COMMIT_DELAY, ERRINJ_BOOL, {.bparam = 0}) \ ENUM0(errinj_id, ERRINJ_LIST); extern struct errinj errinjs[]; diff --git a/test/box/errinj.result b/test/box/errinj.result index 9e3a0bfa..df044369 100644 --- a/test/box/errinj.result +++ b/test/box/errinj.result @@ -54,6 +54,8 @@ errinj.info() state: false ERRINJ_VY_RUN_WRITE: state: false + ERRINJ_SNAP_COMMIT_DELAY: + state: false ERRINJ_RELAY_FINAL_SLEEP: state: false ERRINJ_VY_RUN_DISCARD: diff --git a/test/vinyl/errinj.result b/test/vinyl/errinj.result index 91351086..094d4fa9 100644 --- a/test/vinyl/errinj.result +++ b/test/vinyl/errinj.result @@ -4,6 +4,9 @@ test_run = require('test_run').new() --- ... +fio = require('fio') +--- +... fiber = require('fiber') --- ... @@ -1320,3 +1323,69 @@ ret s:drop() --- ... +-- +-- gh-3437: if compaction races with checkpointing, it may remove +-- files needed for backup. +-- +s = box.schema.space.create('test', {engine = 'vinyl'}) +--- +... +_ = s:create_index('pk', {run_count_per_level = 1}) +--- +... +-- Create a run file. +_ = s:replace{1} +--- +... +box.snapshot() +--- +- ok +... +-- Create another run file. This will trigger compaction +-- as run_count_per_level is set to 1. Due to the error +-- injection compaction will finish before snapshot. +_ = s:replace{2} +--- +... +errinj.set('ERRINJ_SNAP_COMMIT_DELAY', true) +--- +- ok +... +c = fiber.channel(1) +--- +... +_ = fiber.create(function() box.snapshot() c:put(true) end) +--- +... +while s.index.pk:info().disk.compact.count == 0 do fiber.sleep(0.001) end +--- +... +errinj.set('ERRINJ_SNAP_COMMIT_DELAY', false) +--- +- ok +... +c:get() +--- +- true +... +-- Check that all files corresponding to the last checkpoint +-- are present. +files = box.backup.start() +--- +... +missing = {} +--- +... +for _, f in pairs(files) do if not fio.path.exists(f) then table.insert(missing, f) end end +--- +... +missing +--- +- [] +... +box.backup.stop() +--- +... +s:drop() +--- +... diff --git a/test/vinyl/errinj.test.lua b/test/vinyl/errinj.test.lua index 9724a69b..46ffdcac 100644 --- a/test/vinyl/errinj.test.lua +++ b/test/vinyl/errinj.test.lua @@ -2,6 +2,7 @@ -- gh-1681: vinyl: crash in vy_rollback on ER_WAL_WRITE -- test_run = require('test_run').new() +fio = require('fio') fiber = require('fiber') errinj = box.error.injection errinj.set("ERRINJ_VY_SCHED_TIMEOUT", 0.040) @@ -513,3 +514,31 @@ errinj.set("ERRINJ_VY_DELAY_PK_LOOKUP", false) while ret == nil do fiber.sleep(0.01) end ret s:drop() + +-- +-- gh-3437: if compaction races with checkpointing, it may remove +-- files needed for backup. +-- +s = box.schema.space.create('test', {engine = 'vinyl'}) +_ = s:create_index('pk', {run_count_per_level = 1}) +-- Create a run file. +_ = s:replace{1} +box.snapshot() +-- Create another run file. This will trigger compaction +-- as run_count_per_level is set to 1. Due to the error +-- injection compaction will finish before snapshot. +_ = s:replace{2} +errinj.set('ERRINJ_SNAP_COMMIT_DELAY', true) +c = fiber.channel(1) +_ = fiber.create(function() box.snapshot() c:put(true) end) +while s.index.pk:info().disk.compact.count == 0 do fiber.sleep(0.001) end +errinj.set('ERRINJ_SNAP_COMMIT_DELAY', false) +c:get() +-- Check that all files corresponding to the last checkpoint +-- are present. +files = box.backup.start() +missing = {} +for _, f in pairs(files) do if not fio.path.exists(f) then table.insert(missing, f) end end +missing +box.backup.stop() +s:drop() -- 2.11.0