[Tarantool-patches] [PATCH 3/3] raft: don't drop GC when restarting relay recovery

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Sat Oct 17 20:17:57 MSK 2020


When a node becomes a leader, it restarts relay recovery cursors
to re-send all the data since the last acked row.

But during the recovery restart the relay lost its on_close_log
trigger, which is used to update the GC state in the TX thread.

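The patch preserves the trigger: the new recovery object is created
first, the on_close_log trigger list is swapped into it, and only
then is the old recovery object deleted.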

Follow-up for #5433
---
 src/box/relay.cc                              |  4 +-
 .../gh-5433-election-restart-recovery.result  | 55 +++++++++++++++++++
 ...gh-5433-election-restart-recovery.test.lua | 28 ++++++++++
 3 files changed, 86 insertions(+), 1 deletion(-)

diff --git a/src/box/relay.cc b/src/box/relay.cc
index 285f96108..b68b45e00 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -882,8 +882,10 @@ relay_restart_recovery(struct relay *relay)
 	struct vclock restart_vclock;
 	vclock_copy(&restart_vclock, &relay->recv_vclock);
 	vclock_reset(&restart_vclock, 0, vclock_get(&relay->r->vclock, 0));
+	struct recovery *r = recovery_new(wal_dir(), false, &restart_vclock);
+	rlist_swap(&relay->r->on_close_log, &r->on_close_log);
 	recovery_delete(relay->r);
-	relay->r = recovery_new(wal_dir(), false, &restart_vclock);
+	relay->r = r;
 	recover_remaining_wals(relay->r, &relay->stream, NULL, true);
 }
 
diff --git a/test/replication/gh-5433-election-restart-recovery.result b/test/replication/gh-5433-election-restart-recovery.result
index a8d7893bd..f8f32416e 100644
--- a/test/replication/gh-5433-election-restart-recovery.result
+++ b/test/replication/gh-5433-election-restart-recovery.result
@@ -100,6 +100,61 @@ s:drop()
  | ---
  | ...
 
+-- Ensure the restarted recovery correctly propagates GC state. For that create
+-- some noise xlog files, snapshots, and check if the relay reports to GC that
+-- it does not use them anymore after scanning.
+fiber = require('fiber')
+ | ---
+ | ...
+s = box.schema.create_space('test')
+ | ---
+ | ...
+_ = s:create_index('pk')
+ | ---
+ | ...
+s:replace{1}
+ | ---
+ | - [1]
+ | ...
+box.snapshot()
+ | ---
+ | - ok
+ | ...
+s:replace{2}
+ | ---
+ | - [2]
+ | ...
+box.snapshot()
+ | ---
+ | - ok
+ | ...
+test_run:wait_lsn('replica', 'default')
+ | ---
+ | ...
+lsn = test_run:get_lsn('replica', box.info.id)
+ | ---
+ | ...
+-- Eventually GC should get the last relayed LSN as it is reported on each
+-- relayed xlog file.
+test_run:wait_cond(function()                                                   \
+    local consumers = box.info.gc().consumers                                   \
+    assert(#consumers == 1)                                                     \
+    local vclock = consumers[1].vclock                                          \
+    if vclock[box.info.id] >= lsn then                                          \
+        return true                                                             \
+    end                                                                         \
+    s:replace{3}                                                                \
+    box.snapshot()                                                              \
+    test_run:wait_lsn('replica', 'default')                                     \
+    return false                                                                \
+end)
+ | ---
+ | - true
+ | ...
+s:drop()
+ | ---
+ | ...
+
 test_run:cmd('stop server replica')
  | ---
  | - true
diff --git a/test/replication/gh-5433-election-restart-recovery.test.lua b/test/replication/gh-5433-election-restart-recovery.test.lua
index 0339a5504..4aff000bf 100644
--- a/test/replication/gh-5433-election-restart-recovery.test.lua
+++ b/test/replication/gh-5433-election-restart-recovery.test.lua
@@ -50,6 +50,34 @@ test_run:switch('default')
 assert(not test_run:grep_log('default', 'XlogGapError', 1000))
 s:drop()
 
+-- Ensure the restarted recovery correctly propagates GC state. For that create
+-- some noise xlog files, snapshots, and check if the relay reports to GC that
+-- it does not use them anymore after scanning.
+fiber = require('fiber')
+s = box.schema.create_space('test')
+_ = s:create_index('pk')
+s:replace{1}
+box.snapshot()
+s:replace{2}
+box.snapshot()
+test_run:wait_lsn('replica', 'default')
+lsn = test_run:get_lsn('replica', box.info.id)
+-- Eventually GC should get the last relayed LSN as it is reported on each
+-- relayed xlog file.
+test_run:wait_cond(function()                                                   \
+    local consumers = box.info.gc().consumers                                   \
+    assert(#consumers == 1)                                                     \
+    local vclock = consumers[1].vclock                                          \
+    if vclock[box.info.id] >= lsn then                                          \
+        return true                                                             \
+    end                                                                         \
+    s:replace{3}                                                                \
+    box.snapshot()                                                              \
+    test_run:wait_lsn('replica', 'default')                                     \
+    return false                                                                \
+end)
+s:drop()
+
 test_run:cmd('stop server replica')
 test_run:cmd('delete server replica')
 box.cfg{                                                                        \
-- 
2.21.1 (Apple Git-122.3)



More information about the Tarantool-patches mailing list