[PATCH 7/8] vinyl: don't start scheduler fiber until local recovery is complete

Vladimir Davydov vdavydov.dev at gmail.com
Tue Sep 4 20:23:50 MSK 2018


We must not schedule any background jobs during local recovery, because
they may disrupt yet to be recovered data stored on disk. Since we start
the scheduler fiber as soon as the engine is initialized, we have to
pull some tricks to make sure it doesn't schedule any tasks: the
scheduler fiber function yields immediately upon startup; we assume
that it won't be woken up until local recovery is complete, because we
don't set the memory limit until then.

This looks rather flimsy, because the logic is spread among several
seemingly unrelated functions: the scheduler fiber (vy_scheduler_f),
the quota watermark callback (vy_env_quota_exceeded_cb), and the engine
recovery callback (vinyl_engine_begin_initial_recovery), where we leave
the memory limit unset until recovery is complete. The latter isn't even
mentioned in comments, which makes the code difficult to follow. Think
how everything would fall apart should we try to wake up the scheduler
fiber somewhere else for some reason.

This patch attempts to make the code more straightforward by postponing
startup of the scheduler fiber until recovery completion. It also moves
the comment explaining why we can't schedule tasks during local recovery
from vy_env_quota_exceeded_cb to vinyl_engine_begin_initial_recovery,
because this is where we actually omit the scheduler fiber startup.

Note, since now the scheduler fiber goes straight to business once
started, we can't start worker threads in the fiber function as we used
to, because then workers threads would be running even if vinyl was
unused. So we move this code to vy_worker_pool_get, which is called when
a worker is actually needed to run a task.
---
 src/box/vinyl.c        | 39 ++++++++++++++++++++-------------------
 src/box/vy_scheduler.c | 27 ++++++++++++---------------
 src/box/vy_scheduler.h |  6 ++++++
 3 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 416c9824..1cf9ad16 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -2420,21 +2420,6 @@ vy_env_quota_exceeded_cb(struct vy_quota *quota)
 {
 	struct vy_env *env = container_of(quota, struct vy_env, quota);
 
-	/*
-	 * The scheduler must be disabled during local recovery so as
-	 * not to distort data stored on disk. Not that we really need
-	 * it anyway, because the memory footprint is limited by the
-	 * memory limit from the previous run.
-	 *
-	 * On the contrary, remote recovery does require the scheduler
-	 * to be up and running, because the amount of data received
-	 * when bootstrapping from a remote master is only limited by
-	 * its disk size, which can exceed the size of available
-	 * memory by orders of magnitude.
-	 */
-	assert(env->status != VINYL_INITIAL_RECOVERY_LOCAL &&
-	       env->status != VINYL_FINAL_RECOVERY_LOCAL);
-
 	if (lsregion_used(&env->mem_env.allocator) == 0) {
 		/*
 		 * The memory limit has been exceeded, but there's
@@ -2710,13 +2695,15 @@ vy_set_deferred_delete_trigger(void)
 static int
 vinyl_engine_bootstrap(struct engine *engine)
 {
+	vy_set_deferred_delete_trigger();
+
 	struct vy_env *e = vy_env(engine);
 	assert(e->status == VINYL_OFFLINE);
 	if (vy_log_bootstrap() != 0)
 		return -1;
+	vy_scheduler_start(&e->scheduler);
 	vy_quota_set_limit(&e->quota, e->memory);
 	e->status = VINYL_ONLINE;
-	vy_set_deferred_delete_trigger();
 	return 0;
 }
 
@@ -2724,6 +2711,8 @@ static int
 vinyl_engine_begin_initial_recovery(struct engine *engine,
 				    const struct vclock *recovery_vclock)
 {
+	vy_set_deferred_delete_trigger();
+
 	struct vy_env *e = vy_env(engine);
 	assert(e->status == VINYL_OFFLINE);
 	if (recovery_vclock != NULL) {
@@ -2732,14 +2721,25 @@ vinyl_engine_begin_initial_recovery(struct engine *engine,
 		e->recovery = vy_log_begin_recovery(recovery_vclock);
 		if (e->recovery == NULL)
 			return -1;
+		/*
+		 * We can't schedule any background tasks until
+		 * local recovery is complete, because they would
+		 * disrupt yet to be recovered data stored on disk.
+		 * So we don't start the scheduler fiber or enable
+		 * memory limit until then.
+		 *
+		 * This is OK, because during recovery an instance
+		 * can't consume more memory than it used before
+		 * restart and hence memory dumps are not necessary.
+		 */
 		e->status = VINYL_INITIAL_RECOVERY_LOCAL;
 	} else {
 		if (vy_log_bootstrap() != 0)
 			return -1;
+		vy_scheduler_start(&e->scheduler);
 		vy_quota_set_limit(&e->quota, e->memory);
 		e->status = VINYL_INITIAL_RECOVERY_REMOTE;
 	}
-	vy_set_deferred_delete_trigger();
 	return 0;
 }
 
@@ -2780,11 +2780,10 @@ vinyl_engine_end_recovery(struct engine *engine)
 		vy_recovery_delete(e->recovery);
 		e->recovery = NULL;
 		e->recovery_vclock = NULL;
-		e->status = VINYL_ONLINE;
+		vy_scheduler_start(&e->scheduler);
 		vy_quota_set_limit(&e->quota, e->memory);
 		break;
 	case VINYL_FINAL_RECOVERY_REMOTE:
-		e->status = VINYL_ONLINE;
 		break;
 	default:
 		unreachable();
@@ -2796,6 +2795,8 @@ vinyl_engine_end_recovery(struct engine *engine)
 	 */
 	if (e->lsm_env.lsm_count > 0)
 		vy_run_env_enable_coio(&e->run_env, e->read_threads);
+
+	e->status = VINYL_ONLINE;
 	return 0;
 }
 
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 4f57c0a0..5daaf4f1 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -399,6 +399,14 @@ vy_worker_pool_destroy(struct vy_worker_pool *pool)
 static struct vy_worker *
 vy_worker_pool_get(struct vy_worker_pool *pool)
 {
+	/*
+	 * Start worker threads only when a task is scheduled
+	 * so that they are not dangling around if vinyl is
+	 * not used.
+	 */
+	if (pool->workers == NULL)
+		vy_worker_pool_start(pool);
+
 	struct vy_worker *worker = NULL;
 	if (!stailq_empty(&pool->idle_workers)) {
 		worker = stailq_shift_entry(&pool->idle_workers,
@@ -465,7 +473,11 @@ vy_scheduler_create(struct vy_scheduler *scheduler, int write_threads,
 
 	diag_create(&scheduler->diag);
 	fiber_cond_create(&scheduler->dump_cond);
+}
 
+void
+vy_scheduler_start(struct vy_scheduler *scheduler)
+{
 	fiber_start(scheduler->scheduler_fiber, scheduler);
 }
 
@@ -1947,21 +1959,6 @@ vy_scheduler_f(va_list va)
 {
 	struct vy_scheduler *scheduler = va_arg(va, struct vy_scheduler *);
 
-	/*
-	 * Yield immediately, until the quota watermark is reached
-	 * for the first time or a checkpoint is made.
-	 * Then start the worker threads: we know they will be
-	 * needed. If quota watermark is never reached, workers
-	 * are not started and the scheduler is idle until
-	 * shutdown or checkpoint.
-	 */
-	fiber_cond_wait(&scheduler->scheduler_cond);
-	if (scheduler->scheduler_fiber == NULL)
-		return 0; /* destroyed */
-
-	vy_worker_pool_start(&scheduler->dump_pool);
-	vy_worker_pool_start(&scheduler->compact_pool);
-
 	while (scheduler->scheduler_fiber != NULL) {
 		struct stailq processed_tasks;
 		struct vy_task *task, *next;
diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
index 841d5a53..96ce721b 100644
--- a/src/box/vy_scheduler.h
+++ b/src/box/vy_scheduler.h
@@ -172,6 +172,12 @@ vy_scheduler_create(struct vy_scheduler *scheduler, int write_threads,
 		    struct vy_run_env *run_env, struct rlist *read_views);
 
 /**
+ * Start a scheduler fiber.
+ */
+void
+vy_scheduler_start(struct vy_scheduler *scheduler);
+
+/**
  * Destroy a scheduler instance.
  */
 void
-- 
2.11.0




More information about the Tarantool-patches mailing list