[PATCH] vinyl: ignore quota timeout on replication

Vladimir Davydov vdavydov.dev at gmail.com
Mon Jan 29 16:20:25 MSK 2018


If vinyl fails to do memory dumps in time on a replica (e.g. because
it ran out of disk space), replication stops forever with an error,
and the admin has to call box.cfg() to restart it. Since replication
is asynchronous anyway, we shouldn't stop it on a vinyl quota timeout:
the timeout isn't critical, as the replica will recover as soon as the
admin fixes the problem (e.g. frees up some disk space). Let's ignore
vinyl_timeout altogether for applier fibers (currently, we ignore it
only on join) - the admin can monitor how badly a replica lags behind
the master via the lag/idle fields of box.info.replication.

Closes #3087
---
Branch: gh-3087-vy-ignore-quota-timeout-on-replication

 src/box/applier.cc                |  1 +
 src/box/vinyl.c                   | 10 +++++-----
 src/fiber.c                       |  1 +
 src/fiber.h                       | 29 +++++++++++++++++++++++++++++
 test/vinyl/replica_quota.result   | 13 +++++++++++++
 test/vinyl/replica_quota.test.lua |  7 +++++++
 6 files changed, 56 insertions(+), 5 deletions(-)

diff --git a/src/box/applier.cc b/src/box/applier.cc
index f0073bad..6f05d223 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -576,6 +576,7 @@ applier_start(struct applier *applier)
 	uri_format(name + pos, sizeof(name) - pos, &applier->uri, false);
 
 	struct fiber *f = fiber_new_xc(name, applier_f);
+	fiber_set_type(f, FIBER_TYPE_APPLIER);
 	/**
 	 * So that we can safely grab the status of the
 	 * fiber any time we want.
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 59bd6e7d..fac8089a 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -2324,12 +2324,12 @@ vinyl_engine_prepare(struct engine *engine, struct txn *txn)
 		return -1;
 
 	/*
-	 * A replica receives a lot of data during initial join.
-	 * If the network connection is fast enough, it might fail
-	 * to keep up with dumps. To avoid replication failure due
-	 * to this, we ignore the quota timeout during bootstrap.
+	 * Do not abort join/subscribe on quota timeout - replication
+	 * is asynchronous anyway and there's box.info.replication
+	 * available for the admin to track the lag so let the applier
+	 * wait as long as necessary for memory dump to complete.
 	 */
-	double timeout = (env->status == VINYL_ONLINE ?
+	double timeout = (fiber_type(fiber()) != FIBER_TYPE_APPLIER ?
 			  env->timeout : TIMEOUT_INFINITY);
 	/*
 	 * Reserve quota needed by the transaction before allocating
diff --git a/src/fiber.c b/src/fiber.c
index 94157392..c1701079 100644
--- a/src/fiber.c
+++ b/src/fiber.c
@@ -808,6 +808,7 @@ fiber_new_ex(const char *name, const struct fiber_attr *fiber_attr,
 		cord->max_fid = 101;
 	fiber->fid = cord->max_fid;
 	fiber_set_name(fiber, name);
+	fiber_set_type(fiber, FIBER_TYPE_UNKNOWN);
 	register_fid(fiber);
 
 	return fiber;
diff --git a/src/fiber.h b/src/fiber.h
index 94b3f445..a2a407c2 100644
--- a/src/fiber.h
+++ b/src/fiber.h
@@ -108,6 +108,14 @@ enum fiber_key {
 	FIBER_KEY_MAX = 5
 };
 
+enum fiber_type {
+	/** Unknown fiber (default). */
+	FIBER_TYPE_UNKNOWN = 0,
+	/** Applier fiber. */
+	FIBER_TYPE_APPLIER = 1,
+	FIBER_TYPE_MAX
+};
+
 /** \cond public */
 
 /**
@@ -397,6 +405,9 @@ struct fiber {
 	void *fls[FIBER_KEY_MAX];
 	/** Exception which caused this fiber's death. */
 	struct diag diag;
+	/** Fiber type. */
+	enum fiber_type type;
+	/** Fiber name. */
 	char name[FIBER_NAME_MAX];
 };
 
@@ -550,6 +561,24 @@ fiber_name(struct fiber *f)
 	return f->name;
 }
 
+/**
+ * Set fiber type.
+ * @param fiber Fiber to set type for.
+ * @param type New type of @a fiber.
+ */
+static inline void
+fiber_set_type(struct fiber *fiber, enum fiber_type type)
+{
+	assert(type < FIBER_TYPE_MAX);
+	fiber->type = type;
+}
+
+static inline enum fiber_type
+fiber_type(struct fiber *fiber)
+{
+	return fiber->type;
+}
+
 bool
 fiber_checkstack();
 
diff --git a/test/vinyl/replica_quota.result b/test/vinyl/replica_quota.result
index 485efde7..b85c7398 100644
--- a/test/vinyl/replica_quota.result
+++ b/test/vinyl/replica_quota.result
@@ -45,6 +45,19 @@ _ = test_run:cmd("start server replica")
 _ = test_run:wait_lsn('replica', 'default')
 ---
 ...
+-- Check vinyl_timeout is ignored on 'subscribe' (gh-3087).
+_ = test_run:cmd("stop server replica")
+---
+...
+for i = 2001,3000 do s:insert{i, pad} end
+---
+...
+_ = test_run:cmd("start server replica")
+---
+...
+_ = test_run:wait_lsn('replica', 'default')
+---
+...
 _ = test_run:cmd("stop server replica")
 ---
 ...
diff --git a/test/vinyl/replica_quota.test.lua b/test/vinyl/replica_quota.test.lua
index bc6cfb0d..ab89c1bc 100644
--- a/test/vinyl/replica_quota.test.lua
+++ b/test/vinyl/replica_quota.test.lua
@@ -24,6 +24,13 @@ for i = 1001,2000 do s:insert{i, pad} end
 _ = test_run:cmd("create server replica with rpl_master=default, script='vinyl/join_quota.lua'")
 _ = test_run:cmd("start server replica")
 _ = test_run:wait_lsn('replica', 'default')
+
+-- Check vinyl_timeout is ignored on 'subscribe' (gh-3087).
+_ = test_run:cmd("stop server replica")
+for i = 2001,3000 do s:insert{i, pad} end
+_ = test_run:cmd("start server replica")
+_ = test_run:wait_lsn('replica', 'default')
+
 _ = test_run:cmd("stop server replica")
 _ = test_run:cmd("cleanup server replica")
 
-- 
2.11.0




More information about the Tarantool-patches mailing list