From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Date: Sun, 10 Jun 2018 15:02:48 +0300 From: Vladimir Davydov Subject: Re: [PATCH v2 11/11] vinyl: implement rebootstrap support Message-ID: <20180610120248.76s5hmzht57euupk@esperanza> References: <34b1716f6b4960ed2483b4ebf64b693e3b3002a9.1528478913.git.vdavydov.dev@gmail.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <34b1716f6b4960ed2483b4ebf64b693e3b3002a9.1528478913.git.vdavydov.dev@gmail.com> To: kostja@tarantool.org Cc: tarantool-patches@freelists.org List-ID: On Fri, Jun 08, 2018 at 08:34:29PM +0300, Vladimir Davydov wrote: > If vy_log_bootstrap() finds a vylog file in the vinyl directory, it > assumes it has to be rebootstrapped and calls vy_log_rebootstrap(). > The latter scans the old vylog file to find the max vinyl object id, > from which it will start numbering objects created during rebootstrap to > avoid conflicts with old objects, then it writes VY_LOG_REBOOTSTRAP > record to the old vylog to denote the beginning of a rebootstrap > section. After that initial join proceeds as usual, writing information > about new objects to the old vylog file after VY_LOG_REBOOTSTRAP marker. > Upon successful rebootstrap completion, checkpoint, which is always > called right after bootstrap, rotates the old vylog and marks all > objects created before the VY_LOG_REBOOTSTRAP marker as dropped in the > new vylog. The old objects will be purged by the garbage collector as > usual. > > In case rebootstrap fails and checkpoint never happens, local recovery > writes VY_LOG_ABORT_REBOOTSTRAP record to the vylog. This marker > indicates that the rebootstrap attempt failed and all objects created > during rebootstrap should be discarded. They will be purged by the > garbage collector on checkpoint. Thus even if rebootstrap fails, it is > possible to recover the database to the state that existed right before > a failed rebootstrap attempt. 
> > TODO: write a test checking that garbage collection works as expected. Here goes the test. Note that it requires the following pull request to be merged into test-run (updated on the branch): https://github.com/tarantool/test-run/pull/93 --- >From 5c2127a1b600559e5e51f19a8b6bea1a75c5aad0 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sat, 9 Jun 2018 20:31:18 +0300 Subject: [PATCH] test: check that gc works as expected after rebootstrap Follow-up #461 diff --git a/src/box/relay.cc b/src/box/relay.cc index a25cc540..985a3e5a 100644 --- a/src/box/relay.cc +++ b/src/box/relay.cc @@ -287,6 +287,9 @@ relay_final_join(struct replica *replica, int fd, uint64_t sync, if (rc != 0) diag_raise(); + ERROR_INJECT(ERRINJ_RELAY_FINAL_JOIN, + tnt_raise(ClientError, ER_INJECTION, "relay final join")); + ERROR_INJECT(ERRINJ_RELAY_FINAL_SLEEP, { while (vclock_compare(stop_vclock, &replicaset.vclock) == 0) fiber_sleep(0.001); diff --git a/src/errinj.h b/src/errinj.h index ab578274..e3adb7d7 100644 --- a/src/errinj.h +++ b/src/errinj.h @@ -97,6 +97,7 @@ struct errinj { _(ERRINJ_RELAY_TIMEOUT, ERRINJ_DOUBLE, {.dparam = 0}) \ _(ERRINJ_RELAY_REPORT_INTERVAL, ERRINJ_DOUBLE, {.dparam = 0}) \ _(ERRINJ_RELAY_FINAL_SLEEP, ERRINJ_BOOL, {.bparam = false}) \ + _(ERRINJ_RELAY_FINAL_JOIN, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_PORT_DUMP, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_XLOG_GARBAGE, ERRINJ_BOOL, {.bparam = false}) \ _(ERRINJ_XLOG_META, ERRINJ_BOOL, {.bparam = false}) \ diff --git a/test/box/errinj.result b/test/box/errinj.result index e25a4594..aad07c4c 100644 --- a/test/box/errinj.result +++ b/test/box/errinj.result @@ -58,6 +58,8 @@ errinj.info() state: false ERRINJ_VY_LOG_FLUSH_DELAY: state: false + ERRINJ_RELAY_FINAL_JOIN: + state: false ERRINJ_SNAP_COMMIT_DELAY: state: false ERRINJ_RELAY_FINAL_SLEEP: diff --git a/test/vinyl/replica_rejoin.lua b/test/vinyl/replica_rejoin.lua new file mode 100644 index 00000000..7cb7e09a --- /dev/null +++ b/test/vinyl/replica_rejoin.lua @@ -0,0 
+1,13 @@ +#!/usr/bin/env tarantool + +local replication = os.getenv("MASTER") +if arg[1] == 'disable_replication' then + replication = nil +end + +box.cfg({ + replication = replication, + vinyl_memory = 1024 * 1024, +}) + +require('console').listen(os.getenv('ADMIN')) diff --git a/test/vinyl/replica_rejoin.result b/test/vinyl/replica_rejoin.result new file mode 100644 index 00000000..9116dfbb --- /dev/null +++ b/test/vinyl/replica_rejoin.result @@ -0,0 +1,257 @@ +env = require('test_run') +--- +... +test_run = env.new() +--- +... +-- +-- gh-461: check that garbage collection works as expected +-- after rebootstrap. +-- +box.schema.user.grant('guest', 'replication') +--- +... +_ = box.schema.space.create('test', { id = 9000, engine = 'vinyl' }) +--- +... +_ = box.space.test:create_index('pk') +--- +... +pad = string.rep('x', 15 * 1024) +--- +... +for i = 1, 100 do box.space.test:replace{i, pad} end +--- +... +box.snapshot() +--- +- ok +... +-- Join a replica. Check its files. +test_run:cmd("create server replica with rpl_master=default, script='vinyl/replica_rejoin.lua'") +--- +- true +... +test_run:cmd("start server replica") +--- +- true +... +test_run:cmd("switch replica") +--- +- true +... +fio = require('fio') +--- +... +fio.chdir(box.cfg.vinyl_dir) +--- +- true +... +fio.glob(fio.pathjoin(box.space.test.id, 0, '*')) +--- +- - 9000/0/00000000000000000002.index + - 9000/0/00000000000000000002.run + - 9000/0/00000000000000000004.index + - 9000/0/00000000000000000004.run +... +test_run:cmd("switch default") +--- +- true +... +test_run:cmd("stop server replica") +--- +- true +... +-- Invoke garbage collector on the master. +test_run:cmd("restart server default") +checkpoint_count = box.cfg.checkpoint_count +--- +... +box.cfg{checkpoint_count = 1} +--- +... +box.space.test:delete(1) +--- +... +box.snapshot() +--- +- ok +... +box.cfg{checkpoint_count = checkpoint_count} +--- +... +-- Rebootstrap the replica. Check that old files are removed +-- by garbage collector. 
+test_run:cmd("start server replica") +--- +- true +... +test_run:cmd("switch replica") +--- +- true +... +box.cfg{checkpoint_count = 1} +--- +... +box.snapshot() +--- +- ok +... +fio = require('fio') +--- +... +fio.chdir(box.cfg.vinyl_dir) +--- +- true +... +fio.glob(fio.pathjoin(box.space.test.id, 0, '*')) +--- +- - 9000/0/00000000000000000008.index + - 9000/0/00000000000000000008.run + - 9000/0/00000000000000000010.index + - 9000/0/00000000000000000010.run +... +box.space.test:count() -- 99 +--- +- 99 +... +test_run:cmd("switch default") +--- +- true +... +test_run:cmd("stop server replica") +--- +- true +... +-- Invoke garbage collector on the master. +test_run:cmd("restart server default") +checkpoint_count = box.cfg.checkpoint_count +--- +... +box.cfg{checkpoint_count = 1} +--- +... +box.space.test:delete(2) +--- +... +box.snapshot() +--- +- ok +... +box.cfg{checkpoint_count = checkpoint_count} +--- +... +-- Make the master fail join after sending data. Check that +-- files written during failed rebootstrap attempt are removed +-- by garbage collector. +box.error.injection.set('ERRINJ_RELAY_FINAL_JOIN', true) +--- +- ok +... +test_run:cmd("start server replica") -- fail +--- +- Can't start server 'replica' +... +test_run:cmd("start server replica") -- fail again +--- +- Can't start server 'replica' +... +test_run:cmd("start server replica with args='disable_replication'") +--- +- true +... +test_run:cmd("switch replica") +--- +- true +... +box.cfg{checkpoint_count = 1} +--- +... +box.snapshot() +--- +- ok +... +fio = require('fio') +--- +... +fio.chdir(box.cfg.vinyl_dir) +--- +- true +... +fio.glob(fio.pathjoin(box.space.test.id, 0, '*')) +--- +- - 9000/0/00000000000000000008.index + - 9000/0/00000000000000000008.run + - 9000/0/00000000000000000010.index + - 9000/0/00000000000000000010.run +... +box.space.test:count() -- 99 +--- +- 99 +... +test_run:cmd("switch default") +--- +- true +... +test_run:cmd("stop server replica") +--- +- true +... 
+box.error.injection.set('ERRINJ_RELAY_FINAL_JOIN', false) +--- +- ok +... +-- Rebootstrap after several failed attempts and make sure +-- old files are removed. +test_run:cmd("start server replica") +--- +- true +... +test_run:cmd("switch replica") +--- +- true +... +box.cfg{checkpoint_count = 1} +--- +... +box.snapshot() +--- +- ok +... +fio = require('fio') +--- +... +fio.chdir(box.cfg.vinyl_dir) +--- +- true +... +fio.glob(fio.pathjoin(box.space.test.id, 0, '*')) +--- +- - 9000/0/00000000000000000022.index + - 9000/0/00000000000000000022.run + - 9000/0/00000000000000000024.index + - 9000/0/00000000000000000024.run +... +box.space.test:count() -- 98 +--- +- 98 +... +test_run:cmd("switch default") +--- +- true +... +test_run:cmd("stop server replica") +--- +- true +... +-- Cleanup. +test_run:cmd("cleanup server replica") +--- +- true +... +box.space.test:drop() +--- +... +box.schema.user.revoke('guest', 'replication') +--- +... diff --git a/test/vinyl/replica_rejoin.test.lua b/test/vinyl/replica_rejoin.test.lua new file mode 100644 index 00000000..61e8199d --- /dev/null +++ b/test/vinyl/replica_rejoin.test.lua @@ -0,0 +1,88 @@ +env = require('test_run') +test_run = env.new() + +-- +-- gh-461: check that garbage collection works as expected +-- after rebootstrap. +-- +box.schema.user.grant('guest', 'replication') +_ = box.schema.space.create('test', { id = 9000, engine = 'vinyl' }) +_ = box.space.test:create_index('pk') +pad = string.rep('x', 15 * 1024) +for i = 1, 100 do box.space.test:replace{i, pad} end +box.snapshot() + +-- Join a replica. Check its files. +test_run:cmd("create server replica with rpl_master=default, script='vinyl/replica_rejoin.lua'") +test_run:cmd("start server replica") +test_run:cmd("switch replica") +fio = require('fio') +fio.chdir(box.cfg.vinyl_dir) +fio.glob(fio.pathjoin(box.space.test.id, 0, '*')) +test_run:cmd("switch default") +test_run:cmd("stop server replica") + +-- Invoke garbage collector on the master. 
+test_run:cmd("restart server default") +checkpoint_count = box.cfg.checkpoint_count +box.cfg{checkpoint_count = 1} +box.space.test:delete(1) +box.snapshot() +box.cfg{checkpoint_count = checkpoint_count} + +-- Rebootstrap the replica. Check that old files are removed +-- by garbage collector. +test_run:cmd("start server replica") +test_run:cmd("switch replica") +box.cfg{checkpoint_count = 1} +box.snapshot() +fio = require('fio') +fio.chdir(box.cfg.vinyl_dir) +fio.glob(fio.pathjoin(box.space.test.id, 0, '*')) +box.space.test:count() -- 99 +test_run:cmd("switch default") +test_run:cmd("stop server replica") + +-- Invoke garbage collector on the master. +test_run:cmd("restart server default") +checkpoint_count = box.cfg.checkpoint_count +box.cfg{checkpoint_count = 1} +box.space.test:delete(2) +box.snapshot() +box.cfg{checkpoint_count = checkpoint_count} + +-- Make the master fail join after sending data. Check that +-- files written during failed rebootstrap attempt are removed +-- by garbage collector. +box.error.injection.set('ERRINJ_RELAY_FINAL_JOIN', true) +test_run:cmd("start server replica") -- fail +test_run:cmd("start server replica") -- fail again +test_run:cmd("start server replica with args='disable_replication'") +test_run:cmd("switch replica") +box.cfg{checkpoint_count = 1} +box.snapshot() +fio = require('fio') +fio.chdir(box.cfg.vinyl_dir) +fio.glob(fio.pathjoin(box.space.test.id, 0, '*')) +box.space.test:count() -- 99 +test_run:cmd("switch default") +test_run:cmd("stop server replica") +box.error.injection.set('ERRINJ_RELAY_FINAL_JOIN', false) + +-- Rebootstrap after several failed attempts and make sure +-- old files are removed. 
+test_run:cmd("start server replica") +test_run:cmd("switch replica") +box.cfg{checkpoint_count = 1} +box.snapshot() +fio = require('fio') +fio.chdir(box.cfg.vinyl_dir) +fio.glob(fio.pathjoin(box.space.test.id, 0, '*')) +box.space.test:count() -- 98 +test_run:cmd("switch default") +test_run:cmd("stop server replica") + +-- Cleanup. +test_run:cmd("cleanup server replica") +box.space.test:drop() +box.schema.user.revoke('guest', 'replication') diff --git a/test/vinyl/suite.ini b/test/vinyl/suite.ini index ca964289..b9dae380 100644 --- a/test/vinyl/suite.ini +++ b/test/vinyl/suite.ini @@ -2,7 +2,7 @@ core = tarantool description = vinyl integration tests script = vinyl.lua -release_disabled = errinj.test.lua errinj_gc.test.lua errinj_vylog.test.lua partial_dump.test.lua quota_timeout.test.lua recovery_quota.test.lua +release_disabled = errinj.test.lua errinj_gc.test.lua errinj_vylog.test.lua partial_dump.test.lua quota_timeout.test.lua recovery_quota.test.lua replica_rejoin.test.lua config = suite.cfg lua_libs = suite.lua stress.lua large.lua txn_proxy.lua ../box/lua/utils.lua use_unix_sockets = True