[Tarantool-patches] [PATCH v9 2/2] relay: provide information about downstream lag

Cyrill Gorcunov gorcunov at gmail.com
Mon Jun 21 19:17:20 MSK 2021


On Sun, Jun 20, 2021 at 04:37:21PM +0200, Vladislav Shpilevoy wrote:
> Hi! Thanks for the patch!
> 
> The test fails when I run it multiple times:
> 
> [014] Test failed! Result content mismatch:
> [014] --- replication/gh-5447-downstream-lag.result	Sun Jun 20 16:10:26 2021
> [014] +++ var/rejects/replication/gh-5447-downstream-lag.reject	Sun Jun 20 16:33:01 2021
> [014] @@ -37,7 +37,7 @@
> [014]  -- Upon replica startup there is no ACKs to process.
> [014]  assert(box.info.replication[replica_id].downstream.lag == 0)
> [014]   | ---
> [014] - | - true
> [014] + | - error: assertion failed!
> 
> See 4 comments below.

Vlad, here is an update; I force-pushed it to the same branch.
I'll fix the error injection nit. Could you please retry running
the test in parallel? (I ran it locally with 200 tests, but it
didn't trigger anything.) I also rebased the series on top of
master.
---
>From da969da89beab720c91c7e895613ab9cf6ab2ea7 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov at gmail.com>
Date: Mon, 21 Jun 2021 14:30:52 +0300
Subject: [PATCH] Update

Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
---
 src/box/relay.cc                              | 19 +------------------
 .../replication/gh-5447-downstream-lag.result | 10 +++-------
 .../gh-5447-downstream-lag.test.lua           |  3 +--
 3 files changed, 5 insertions(+), 27 deletions(-)

diff --git a/src/box/relay.cc b/src/box/relay.cc
index 14c9b0f03..115037fc3 100644
--- a/src/box/relay.cc
+++ b/src/box/relay.cc
@@ -160,11 +160,6 @@ struct relay {
 	struct stailq pending_gc;
 	/** Time when last row was sent to peer. */
 	double last_row_time;
-	/**
-	 * Last timestamp observed from remote node to
-	 * compute @a txn_lag value.
-	 */
-	double txn_acked_tm;
 	/**
 	 * A time difference between the moment when we
 	 * wrote a transaction to the local WAL and when
@@ -310,15 +305,6 @@ relay_start(struct relay *relay, int fd, uint64_t sync,
 	relay->state = RELAY_FOLLOW;
 	relay->row_count = 0;
 	relay->last_row_time = ev_monotonic_now(loop());
-	/*
-	 * We assume that previously written rows in WAL
-	 * are older than current node real time which allows
-	 * to simplify @a tx.txn_lag calculation. In worst
-	 * scenario when runtime has been adjusted backwards
-	 * between restart we simply get some big value in
-	 * @a tx.txn_lag until next transaction get replicated.
-	 */
-	relay->txn_acked_tm = ev_now(loop());
 }
 
 void
@@ -375,7 +361,6 @@ relay_stop(struct relay *relay)
 	 * If relay is stopped then lag statistics should
 	 * be updated on next new ACK packets obtained.
 	 */
-	relay->txn_acked_tm = 0;
 	relay->txn_lag = 0;
 	relay->tx.txn_lag = 0;
 }
@@ -682,10 +667,8 @@ relay_reader_f(va_list ap)
 			 * can compute time spent regardless of the clock
 			 * value on remote replica.
 			 */
-			if (relay->txn_acked_tm < xrow.tm) {
-				relay->txn_acked_tm = xrow.tm;
+			if (xrow.tm != 0)
 				relay->txn_lag = ev_now(loop()) - xrow.tm;
-			}
 			fiber_cond_signal(&relay->reader_cond);
 		}
 	} catch (Exception *e) {
diff --git a/test/replication/gh-5447-downstream-lag.result b/test/replication/gh-5447-downstream-lag.result
index 2cc020451..0d5de2564 100644
--- a/test/replication/gh-5447-downstream-lag.result
+++ b/test/replication/gh-5447-downstream-lag.result
@@ -70,17 +70,10 @@ test_run:switch('default')
  | ---
  | - true
  | ...
-lsn = box.info.lsn
- | ---
- | ...
 box.space.test:insert({1})
  | ---
  | - [1]
  | ...
-test_run:wait_cond(function() return box.info.lsn > lsn end)
- | ---
- | - true
- | ...
 -- The record is written on the master node.
 test_run:switch('replica')
  | ---
@@ -111,6 +104,9 @@ test_run:switch('default')
  | ---
  | - true
  | ...
+box.space.test:drop()
+ | ---
+ | ...
 box.schema.user.revoke('guest', 'replication')
  | ---
  | ...
diff --git a/test/replication/gh-5447-downstream-lag.test.lua b/test/replication/gh-5447-downstream-lag.test.lua
index 3096e2ac3..dd1d2e2c9 100644
--- a/test/replication/gh-5447-downstream-lag.test.lua
+++ b/test/replication/gh-5447-downstream-lag.test.lua
@@ -35,9 +35,7 @@ box.error.injection.set("ERRINJ_WAL_DELAY", true)
 --
 -- Insert a record and wakeup replica's WAL to process data.
 test_run:switch('default')
-lsn = box.info.lsn
 box.space.test:insert({1})
-test_run:wait_cond(function() return box.info.lsn > lsn end)
 -- The record is written on the master node.
 test_run:switch('replica')
 box.error.injection.set("ERRINJ_WAL_DELAY", false)
@@ -51,6 +49,7 @@ assert(box.info.replication[replica_id].downstream.lag > 0)
 --
 -- Cleanup everything.
 test_run:switch('default')
+box.space.test:drop()
 box.schema.user.revoke('guest', 'replication')
 test_run:cmd('stop server replica')
 test_run:cmd('cleanup server replica')
-- 
2.31.1



More information about the Tarantool-patches mailing list