Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: kostja@tarantool.org
Cc: tarantool-patches@freelists.org
Subject: [PATCH 5/8] vinyl: use separate thread pools for dump and compaction tasks
Date: Tue,  4 Sep 2018 20:23:48 +0300	[thread overview]
Message-ID: <e3b17a179bc25a4298801ac8edd32af0f76496bb.1536080993.git.vdavydov.dev@gmail.com> (raw)
In-Reply-To: <cover.1536080993.git.vdavydov.dev@gmail.com>
In-Reply-To: <cover.1536080993.git.vdavydov.dev@gmail.com>

Using the same thread pool for both dump and compaction tasks makes
estimation of dump bandwidth unstable. For instance, if we have four
worker threads, then the observed dump bandwidth may vary from X, when
there's high compaction demand and all worker threads tend to be busy
with compaction tasks, to 4 * X, when there's no compaction demand. As a
result, we can overestimate the dump bandwidth and trigger dump too
late, which will result in hitting the memory limit before the dump is
complete and hence stalling write transactions, which is unacceptable.

To avoid that, let's use separate thread pools for dump and compaction
tasks. Since an LSM-tree-based design typically implies high levels of
write amplification, let's allocate 1/4th of all threads for dump tasks
and use the rest exclusively for compaction.
---
 src/box/vy_scheduler.c | 74 ++++++++++++++++++++++++++++++--------------------
 src/box/vy_scheduler.h | 11 ++++----
 2 files changed, 51 insertions(+), 34 deletions(-)

diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 5d6960cd..dbc3ca69 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -88,6 +88,8 @@ struct vy_worker {
 	struct cpipe worker_pipe;
 	/** Pipe from the worker thread to tx. */
 	struct cpipe tx_pipe;
+	/** Pool this worker was allocated from. */
+	struct vy_worker_pool *pool;
 	/**
 	 * Task that is currently being executed by the worker
 	 * or NULL if the worker is idle.
@@ -337,8 +339,6 @@ static void
 vy_worker_pool_start(struct vy_worker_pool *pool)
 {
 	assert(pool->workers == NULL);
-	/* One thread is reserved for dumps, see vy_schedule(). */
-	assert(pool->size >= 2);
 
 	pool->idle_worker_count = pool->size;
 	pool->workers = calloc(pool->size, sizeof(*pool->workers));
@@ -347,10 +347,12 @@ vy_worker_pool_start(struct vy_worker_pool *pool)
 
 	for (int i = 0; i < pool->size; i++) {
 		char name[FIBER_NAME_MAX];
-		snprintf(name, sizeof(name), "vinyl.writer.%d", i);
+		snprintf(name, sizeof(name), "vinyl.%s.%d", pool->name, i);
 		struct vy_worker *worker = &pool->workers[i];
 		if (cord_costart(&worker->cord, name, vy_worker_f, worker) != 0)
 			panic("failed to start vinyl worker thread");
+
+		worker->pool = pool;
 		cpipe_create(&worker->worker_pipe, name);
 		stailq_add_tail_entry(&pool->idle_workers, worker, in_idle);
 
@@ -377,8 +379,9 @@ vy_worker_pool_stop(struct vy_worker_pool *pool)
 }
 
 static void
-vy_worker_pool_create(struct vy_worker_pool *pool, int size)
+vy_worker_pool_create(struct vy_worker_pool *pool, const char *name, int size)
 {
+	pool->name = name;
 	pool->size = size;
 	pool->workers = NULL;
 	stailq_create(&pool->idle_workers);
@@ -404,16 +407,19 @@ vy_worker_pool_get(struct vy_worker_pool *pool)
 		pool->idle_worker_count--;
 		worker = stailq_shift_entry(&pool->idle_workers,
 					    struct vy_worker, in_idle);
+		assert(worker->pool == pool);
 	}
 	return worker;
 }
 
 /**
- * Put a worker back to a pool once it's done its job.
+ * Put a worker back to the pool it was allocated from once
+ * it's done its job.
  */
 static void
-vy_worker_pool_put(struct vy_worker_pool *pool, struct vy_worker *worker)
+vy_worker_pool_put(struct vy_worker *worker)
 {
+	struct vy_worker_pool *pool = worker->pool;
 	assert(pool->idle_worker_count < pool->size);
 	pool->idle_worker_count++;
 	stailq_add_entry(&pool->idle_workers, worker, in_idle);
@@ -436,7 +442,28 @@ vy_scheduler_create(struct vy_scheduler *scheduler, int write_threads,
 		panic("failed to allocate vinyl scheduler fiber");
 
 	fiber_cond_create(&scheduler->scheduler_cond);
-	vy_worker_pool_create(&scheduler->worker_pool, write_threads);
+
+	/*
+	 * Dump tasks must be scheduled as soon as possible,
+	 * otherwise we may run out of memory quota and have
+	 * to stall transactions. To avoid unpredictably long
+	 * stalls caused by ongoing compaction tasks blocking
+	 * scheduling of dump tasks, we use separate thread
+	 * pools for dump and compaction tasks.
+	 *
+	 * Since a design based on LSM trees typically implies
+	 * high write amplification, we allocate only 1/4th of
+	 * all available threads to dump tasks while the rest
+	 * is used exclusively for compaction.
+	 */
+	assert(write_threads > 1);
+	int dump_threads = MAX(1, write_threads / 4);
+	int compact_threads = write_threads - dump_threads;
+	vy_worker_pool_create(&scheduler->dump_pool,
+			      "dump", dump_threads);
+	vy_worker_pool_create(&scheduler->compact_pool,
+			      "compact", compact_threads);
+
 	stailq_create(&scheduler->processed_tasks);
 
 	vy_dump_heap_create(&scheduler->dump_heap);
@@ -457,7 +484,8 @@ vy_scheduler_destroy(struct vy_scheduler *scheduler)
 	fiber_cond_signal(&scheduler->dump_cond);
 	fiber_cond_signal(&scheduler->scheduler_cond);
 
-	vy_worker_pool_destroy(&scheduler->worker_pool);
+	vy_worker_pool_destroy(&scheduler->dump_pool);
+	vy_worker_pool_destroy(&scheduler->compact_pool);
 	diag_destroy(&scheduler->diag);
 	fiber_cond_destroy(&scheduler->dump_cond);
 	fiber_cond_destroy(&scheduler->scheduler_cond);
@@ -1791,12 +1819,12 @@ retry:
 		 * round. Try to create a task for it.
 		 */
 		if (worker == NULL) {
-			worker = vy_worker_pool_get(&scheduler->worker_pool);
+			worker = vy_worker_pool_get(&scheduler->dump_pool);
 			if (worker == NULL)
 				return 0; /* all workers are busy */
 		}
 		if (vy_task_dump_new(scheduler, worker, lsm, ptask) != 0) {
-			vy_worker_pool_put(&scheduler->worker_pool, worker);
+			vy_worker_pool_put(worker);
 			return -1;
 		}
 		if (*ptask != NULL)
@@ -1816,7 +1844,7 @@ retry:
 	assert(scheduler->dump_task_count > 0);
 no_task:
 	if (worker != NULL)
-		vy_worker_pool_put(&scheduler->worker_pool, worker);
+		vy_worker_pool_put(worker);
 	return 0;
 }
 
@@ -1846,12 +1874,12 @@ retry:
 	if (vy_lsm_compact_priority(lsm) <= 1)
 		goto no_task; /* nothing to do */
 	if (worker == NULL) {
-		worker = vy_worker_pool_get(&scheduler->worker_pool);
+		worker = vy_worker_pool_get(&scheduler->compact_pool);
 		if (worker == NULL)
 			return 0; /* all workers are busy */
 	}
 	if (vy_task_compact_new(scheduler, worker, lsm, ptask) != 0) {
-		vy_worker_pool_put(&scheduler->worker_pool, worker);
+		vy_worker_pool_put(worker);
 		return -1;
 	}
 	if (*ptask == NULL)
@@ -1859,7 +1887,7 @@ retry:
 	return 0; /* new task */
 no_task:
 	if (worker != NULL)
-		vy_worker_pool_put(&scheduler->worker_pool, worker);
+		vy_worker_pool_put(worker);
 	return 0;
 }
 
@@ -1873,18 +1901,6 @@ vy_schedule(struct vy_scheduler *scheduler, struct vy_task **ptask)
 	if (*ptask != NULL)
 		return 0;
 
-	if (scheduler->worker_pool.idle_worker_count <= 1) {
-		/*
-		 * If all worker threads are busy doing compaction
-		 * when we run out of quota, ongoing transactions will
-		 * hang until one of the threads has finished, which
-		 * may take quite a while. To avoid unpredictably long
-		 * stalls, always keep one worker thread reserved for
-		 * dumps.
-		 */
-		return 0;
-	}
-
 	if (vy_scheduler_peek_compact(scheduler, ptask) != 0)
 		goto fail;
 	if (*ptask != NULL)
@@ -1949,7 +1965,8 @@ vy_scheduler_f(va_list va)
 	if (scheduler->scheduler_fiber == NULL)
 		return 0; /* destroyed */
 
-	vy_worker_pool_start(&scheduler->worker_pool);
+	vy_worker_pool_start(&scheduler->dump_pool);
+	vy_worker_pool_start(&scheduler->compact_pool);
 
 	while (scheduler->scheduler_fiber != NULL) {
 		struct stailq processed_tasks;
@@ -1967,8 +1984,7 @@ vy_scheduler_f(va_list va)
 				tasks_failed++;
 			else
 				tasks_done++;
-			vy_worker_pool_put(&scheduler->worker_pool,
-					   task->worker);
+			vy_worker_pool_put(task->worker);
 			vy_task_delete(task);
 		}
 		/*
diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
index 606f3b31..7014f953 100644
--- a/src/box/vy_scheduler.h
+++ b/src/box/vy_scheduler.h
@@ -57,6 +57,8 @@ typedef void
 				int64_t dump_generation, double dump_duration);
 
 struct vy_worker_pool {
+	/** Name of the pool. Used for naming threads. */
+	const char *name;
 	/** Number of worker threads in the pool. */
 	int size;
 	/** Array of all worker threads in the pool. */
@@ -72,11 +74,10 @@ struct vy_scheduler {
 	struct fiber *scheduler_fiber;
 	/** Used to wake up the scheduler fiber from TX. */
 	struct fiber_cond scheduler_cond;
-	/**
-	 * Pool of threads used for performing dump/compaction
-	 * tasks in the background.
-	 */
-	struct vy_worker_pool worker_pool;
+	/** Pool of threads for performing background dumps. */
+	struct vy_worker_pool dump_pool;
+	/** Pool of threads for performing background compactions. */
+	struct vy_worker_pool compact_pool;
 	/** Queue of processed tasks, linked by vy_task::in_processed. */
 	struct stailq processed_tasks;
 	/**
-- 
2.11.0

  parent reply	other threads:[~2018-09-04 17:23 UTC|newest]

Thread overview: 39+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-09-02 20:18 [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov
2018-09-02 20:18 ` [PATCH 1/7] vinyl: fix accounting of secondary index cache statements Vladimir Davydov
2018-09-02 22:26   ` [tarantool-patches] " Konstantin Osipov
2018-09-02 20:18 ` [PATCH 2/7] vinyl: add global memory stats Vladimir Davydov
2018-09-02 22:27   ` [tarantool-patches] " Konstantin Osipov
2018-09-02 22:27   ` Konstantin Osipov
2018-09-03  8:10     ` Vladimir Davydov
2018-09-02 20:18 ` [PATCH 3/7] vinyl: add global disk stats Vladimir Davydov
2018-09-02 22:30   ` [tarantool-patches] " Konstantin Osipov
2018-09-02 20:18 ` [PATCH 4/7] vinyl: fix force compaction logic Vladimir Davydov
2018-09-02 20:18 ` [PATCH 5/7] vinyl: update compact priority usual way on range split/coalesce Vladimir Davydov
2018-09-02 20:18 ` [PATCH 6/7] vinyl: keep track of compaction queue length and debt Vladimir Davydov
2018-09-02 20:19 ` [PATCH 7/7] vinyl: keep track of disk idle time Vladimir Davydov
2018-09-04 11:54   ` Vladimir Davydov
2018-09-04 17:23     ` Vladimir Davydov
2018-09-04 17:23       ` [PATCH 1/8] vinyl: add helper to check whether dump is in progress Vladimir Davydov
2018-09-06  7:33         ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 2/8] vinyl: don't use mempool for allocating background tasks Vladimir Davydov
2018-09-06  7:33         ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 3/8] vinyl: factor out worker pool from scheduler struct Vladimir Davydov
2018-09-06  7:34         ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 4/8] vinyl: move worker allocation closer to task creation Vladimir Davydov
2018-09-06  7:35         ` Konstantin Osipov
2018-09-04 17:23       ` Vladimir Davydov [this message]
2018-09-06  7:37         ` [PATCH 5/8] vinyl: use separate thread pools for dump and compaction tasks Konstantin Osipov
2018-09-06  9:48           ` Vladimir Davydov
2018-09-06 10:32             ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 6/8] vinyl: zap vy_worker_pool::idle_worker_count Vladimir Davydov
2018-09-06  7:38         ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 7/8] vinyl: don't start scheduler fiber until local recovery is complete Vladimir Davydov
2018-09-06  7:39         ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 8/8] vinyl: keep track of thread pool idle ratio Vladimir Davydov
2018-09-06  7:49         ` Konstantin Osipov
2018-09-06  8:18           ` Vladimir Davydov
2018-09-06 10:26             ` Konstantin Osipov
2018-09-06 10:52               ` Vladimir Davydov
2018-09-06 10:57                 ` Konstantin Osipov
2018-09-06 11:59                   ` Vladimir Davydov
2018-09-09 11:41 ` [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=e3b17a179bc25a4298801ac8edd32af0f76496bb.1536080993.git.vdavydov.dev@gmail.com \
    --to=vdavydov.dev@gmail.com \
    --cc=kostja@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [PATCH 5/8] vinyl: use separate thread pools for dump and compaction tasks' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox