From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Vladimir Davydov Subject: [PATCH 2/2] replication: fix garbage collection logic Date: Tue, 9 Apr 2019 20:07:11 +0300 Message-Id: <44bdf3f4affee3c95cf9b23add24c30dece44151.1554829366.git.vdavydov.dev@gmail.com> In-Reply-To: <24d1ce5202869981a2182ff1c7947d7d4914bb5b.1554829366.git.vdavydov.dev@gmail.com> References: <24d1ce5202869981a2182ff1c7947d7d4914bb5b.1554829366.git.vdavydov.dev@gmail.com> In-Reply-To: <24d1ce5202869981a2182ff1c7947d7d4914bb5b.1554829366.git.vdavydov.dev@gmail.com> References: <24d1ce5202869981a2182ff1c7947d7d4914bb5b.1554829366.git.vdavydov.dev@gmail.com> To: kostja@tarantool.org Cc: tarantool-patches@freelists.org List-ID: Currently, the garbage collector works with vclock signatures and doesn't take into account vclock components. This works as long as the caller (i.e. relay) makes sure that it doesn't advance a consumer associated with a replica unless its acknowledged vclock is greater than or equal to the vclock of a WAL file fed to it. The bug is that it does not - it only compares vclock signatures. As a result, if a replica has some local changes or changes pulled from other members of the cluster, which render its signature greater, the master may remove files that are still needed by the replica, permanently breaking replication and requiring rebootstrap. I guess the proper fix would be teaching the garbage collector operate on vclock components rather than signatures, but it's rather difficult to implement. This patch is a quick fix, which simply replaces vclock signature comparison in relay with vclock_compare. Closes #4106 --- https://github.com/tarantool/tarantool/issues/4106 https://github.com/tarantool/tarantool/commits/dv/gh-4106-replica-gc-fix src/box/relay.cc | 12 ++++- test/replication/gc.result | 124 +++++++++++++++++++++++++++++++++++++++++-- test/replication/gc.test.lua | 59 +++++++++++++++++++- 3 files changed, 189 insertions(+), 6 deletions(-) diff --git a/src/box/relay.cc b/src/box/relay.cc index 29570323..e3d7f0ed 100644 --- a/src/box/relay.cc +++ b/src/box/relay.cc @@ -422,7 +422,17 @@ relay_schedule_pending_gc(struct relay *relay, const struct vclock *vclock) { struct relay_gc_msg *curr, *next, *gc_msg = NULL; stailq_foreach_entry_safe(curr, next, &relay->pending_gc, in_pending) { - if (vclock_sum(&curr->vclock) > vclock_sum(vclock)) + /* + * We may delete a WAL file only if its vclock is + * less than or equal to the vclock acknowledged by + * the replica. Even if the replica's signature is + * is greater, but the vclocks are incomparable, we + * must not delete the WAL, because there may still + * be rows not applied by the replica in it while + * the greater signatures is due to changes pulled + * from other members of the cluster. + */ + if (vclock_compare(&curr->vclock, vclock) > 0) break; stailq_shift(&relay->pending_gc); free(gc_msg); diff --git a/test/replication/gc.result b/test/replication/gc.result index 5b7057ad..65785f47 100644 --- a/test/replication/gc.result +++ b/test/replication/gc.result @@ -404,10 +404,6 @@ test_run:cmd("cleanup server replica") --- - true ... -test_run:cmd("delete server replica") ---- -- true -... _ = s:auto_increment{} --- ... @@ -450,7 +446,127 @@ wait_xlog(0, 10) or fio.listdir('./master') box.cfg{replication = {}} --- ... +s:truncate() +--- +... +-- +-- Check that a master doesn't remove WAL files needed by a replica +-- in case the replica has local changes due to which its vclock has +-- greater signature than the master's (gh-4106). +-- +test_run:cmd("start server replica") +--- +- true +... +-- Temporarily disable replication. +test_run:cmd("switch replica") +--- +- true +... +replication = box.cfg.replication +--- +... +box.cfg{replication = {}} +--- +... +-- Generate some WALs on the master. +test_run:cmd("switch default") +--- +- true +... +test_run:cmd("setopt delimiter ';'") +--- +- true +... +for i = 1, 4 do + for j = 1, 100 do + s:replace{1, i, j} + end + box.snapshot() +end; +--- +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... +-- Stall WAL writes on the replica and reestablish replication, +-- then bump local LSN and break replication on WAL error so that +-- the master sends all rows and receives ack with the replica's +-- vclock, but not all rows are actually applied on the replica. +-- The master must not delete unconfirmed WALs even though the +-- replica's vclock has a greater signature (checked later on). +test_run:cmd("switch replica") +--- +- true +... +box.error.injection.set('ERRINJ_WAL_DELAY', true) +--- +- ok +... +box.cfg{replication_connect_quorum = 0} -- disable sync +--- +... +box.cfg{replication = replication} +--- +... +fiber = require('fiber') +--- +... +test_run:cmd("setopt delimiter ';'"); +--- +- true +... +_ = fiber.create(function() + box.begin() + for i = 1, 1000 do + box.space.test:replace{1, i} + end + box.commit() + box.error.injection.set('ERRINJ_WAL_WRITE_DISK', true) +end); +--- +... +test_run:cmd("setopt delimiter ''"); +--- +- true +... +box.error.injection.set('ERRINJ_WAL_DELAY', false) +--- +- ok +... +-- Restart the replica and wait for it to sync. +test_run:cmd("switch default") +--- +- true +... +test_run:cmd("stop server replica") +--- +- true +... +test_run:cmd("start server replica") +--- +- true +... +test_run:wait_lsn('replica', 'default') +--- +... -- Cleanup. +test_run:cmd("stop server replica") +--- +- true +... +test_run:cmd("cleanup server replica") +--- +- true +... +test_run:cmd("delete server replica") +--- +- true +... +test_run:cleanup_cluster() +--- +... s:drop() --- ... diff --git a/test/replication/gc.test.lua b/test/replication/gc.test.lua index a92a6ed7..890fe29a 100644 --- a/test/replication/gc.test.lua +++ b/test/replication/gc.test.lua @@ -189,7 +189,6 @@ box.cfg{replication = replica_port} -- Stop the replica and write a few WALs. test_run:cmd("stop server replica") test_run:cmd("cleanup server replica") -test_run:cmd("delete server replica") _ = s:auto_increment{} box.snapshot() _ = s:auto_increment{} @@ -207,7 +206,65 @@ wait_xlog(0, 10) or fio.listdir('./master') -- Restore the config. box.cfg{replication = {}} +s:truncate() + +-- +-- Check that a master doesn't remove WAL files needed by a replica +-- in case the replica has local changes due to which its vclock has +-- greater signature than the master's (gh-4106). +-- +test_run:cmd("start server replica") + +-- Temporarily disable replication. +test_run:cmd("switch replica") +replication = box.cfg.replication +box.cfg{replication = {}} + +-- Generate some WALs on the master. +test_run:cmd("switch default") +test_run:cmd("setopt delimiter ';'") +for i = 1, 4 do + for j = 1, 100 do + s:replace{1, i, j} + end + box.snapshot() +end; +test_run:cmd("setopt delimiter ''"); + +-- Stall WAL writes on the replica and reestablish replication, +-- then bump local LSN and break replication on WAL error so that +-- the master sends all rows and receives ack with the replica's +-- vclock, but not all rows are actually applied on the replica. +-- The master must not delete unconfirmed WALs even though the +-- replica's vclock has a greater signature (checked later on). +test_run:cmd("switch replica") +box.error.injection.set('ERRINJ_WAL_DELAY', true) +box.cfg{replication_connect_quorum = 0} -- disable sync +box.cfg{replication = replication} +fiber = require('fiber') +test_run:cmd("setopt delimiter ';'"); +_ = fiber.create(function() + box.begin() + for i = 1, 1000 do + box.space.test:replace{1, i} + end + box.commit() + box.error.injection.set('ERRINJ_WAL_WRITE_DISK', true) +end); +test_run:cmd("setopt delimiter ''"); +box.error.injection.set('ERRINJ_WAL_DELAY', false) + +-- Restart the replica and wait for it to sync. +test_run:cmd("switch default") +test_run:cmd("stop server replica") +test_run:cmd("start server replica") +test_run:wait_lsn('replica', 'default') + -- Cleanup. +test_run:cmd("stop server replica") +test_run:cmd("cleanup server replica") +test_run:cmd("delete server replica") +test_run:cleanup_cluster() s:drop() box.error.injection.set("ERRINJ_RELAY_REPORT_INTERVAL", 0) box.schema.user.revoke('guest', 'replication') -- 2.11.0