[PATCH 0/7] vinyl: improve stats for throttling

Tarantool development patches archive
 help / color / mirror / Atom feed

* [PATCH 0/7] vinyl: improve stats for throttling
@ 2018-09-02 20:18 Vladimir Davydov
  2018-09-02 20:18 ` [PATCH 1/7] vinyl: fix accounting of secondary index cache statements Vladimir Davydov
                   ` (7 more replies)
  0 siblings, 8 replies; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-02 20:18 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

To implement transaction throttling, we need (at least) two additional
metrics:

 - Compaction debt, i.e. how much data have to be compacted to restore
   the target LSM tree shape. This is needed to make decisions about
   decreasing transaction rate.

 - Disk idle time, i.e. how much time worker threads spend doing
   nothing. This is needed to make decisions about loosening throttling.

This patch set adds those metrics. Along the way it also fixes a bug
in cache accounting (patch 1) and adds some useful global statistics
(patches 2 and 3).

https://github.com/tarantool/tarantool/issues/1862
https://github.com/tarantool/tarantool/commits/dv/vy-stats-for-throttling

Vladimir Davydov (7):
  vinyl: fix accounting of secondary index cache statements
  vinyl: add global memory stats
  vinyl: add global disk stats
  vinyl: fix force compaction logic
  vinyl: update compact priority usual way on range split/coalesce
  vinyl: keep track of compaction queue length and debt
  vinyl: keep track of disk idle time

 src/box/vinyl.c                 |  89 ++++++++++---------
 src/box/vy_cache.c              |  13 ++-
 src/box/vy_cache.h              |   5 +-
 src/box/vy_lsm.c                |  85 +++++++++++++-----
 src/box/vy_lsm.h                |  42 ++++++++-
 src/box/vy_range.c              |  42 ++++-----
 src/box/vy_range.h              |  17 ++--
 src/box/vy_scheduler.c          |  61 +++++++++++--
 src/box/vy_scheduler.h          |  12 +++
 src/box/vy_stat.h               |  39 +++++---
 src/box/vy_tx.c                 |  19 ++++
 src/box/vy_tx.h                 |   4 +
 src/errinj.h                    |   1 +
 test/box/errinj.result          |   4 +-
 test/unit/vy_iterators_helper.c |   2 +-
 test/unit/vy_point_lookup.c     |   2 +-
 test/vinyl/cache.result         |  55 +++++++++++-
 test/vinyl/cache.test.lua       |  26 +++++-
 test/vinyl/errinj.result        | 191 ++++++++++++++++++++++++++++++++++++++++
 test/vinyl/errinj.test.lua      |  56 ++++++++++++
 test/vinyl/info.result          | 159 ++++++++++++++++++++++++++++-----
 test/vinyl/info.test.lua        |  32 ++++++-
 22 files changed, 809 insertions(+), 147 deletions(-)

-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 1/7] vinyl: fix accounting of secondary index cache statements
  2018-09-02 20:18 [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov
@ 2018-09-02 20:18 ` Vladimir Davydov
  2018-09-02 22:26   ` [tarantool-patches] " Konstantin Osipov
  2018-09-02 20:18 ` [PATCH 2/7] vinyl: add global memory stats Vladimir Davydov
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-02 20:18 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

Since commit 0c5e6cc8c80b ("vinyl: store full tuples in secondary index
cache"), we store primary index tuples in secondary index cache, but we
still account them as separate tuples. Fix that.

Follow-up #3478
Closes #3655
---
 src/box/vy_cache.c              | 13 ++++++++++--
 src/box/vy_cache.h              |  5 ++++-
 src/box/vy_lsm.c                |  2 +-
 test/unit/vy_iterators_helper.c |  2 +-
 test/unit/vy_point_lookup.c     |  2 +-
 test/vinyl/cache.result         | 47 +++++++++++++++++++++++++++++++++++++++++
 test/vinyl/cache.test.lua       | 18 ++++++++++++++++
 7 files changed, 83 insertions(+), 6 deletions(-)

diff --git a/src/box/vy_cache.c b/src/box/vy_cache.c
index 4f6d0c7f..ead56e95 100644
--- a/src/box/vy_cache.c
+++ b/src/box/vy_cache.c
@@ -71,7 +71,15 @@ vy_cache_env_destroy(struct vy_cache_env *e)
 static inline size_t
 vy_cache_entry_size(const struct vy_cache_entry *entry)
 {
-	return sizeof(*entry) + tuple_size(entry->stmt);
+	size_t size = sizeof(*entry);
+	/*
+	 * Tuples are shared between primary and secondary index
+	 * cache so to avoid double accounting, we account only
+	 * primary index tuples.
+	 */
+	if (entry->cache->is_primary)
+		size += tuple_size(entry->stmt);
+	return size;
 }
 
 static struct vy_cache_entry *
@@ -128,10 +136,11 @@ vy_cache_tree_page_free(void *ctx, void *p)
 
 void
 vy_cache_create(struct vy_cache *cache, struct vy_cache_env *env,
-		struct key_def *cmp_def)
+		struct key_def *cmp_def, bool is_primary)
 {
 	cache->env = env;
 	cache->cmp_def = cmp_def;
+	cache->is_primary = is_primary;
 	cache->version = 1;
 	vy_cache_tree_create(&cache->cache_tree, cmp_def,
 			     vy_cache_tree_page_alloc,
diff --git a/src/box/vy_cache.h b/src/box/vy_cache.h
index d2545152..7cb93155 100644
--- a/src/box/vy_cache.h
+++ b/src/box/vy_cache.h
@@ -160,6 +160,8 @@ struct vy_cache {
 	 * key parts
 	 */
 	struct key_def *cmp_def;
+	/** Set if this cache is for a primary index. */
+	bool is_primary;
 	/* Tree of cache entries */
 	struct vy_cache_tree cache_tree;
 	/* The vesrion of state of cache_tree. Increments on every change */
@@ -174,10 +176,11 @@ struct vy_cache {
  * Allocate and initialize tuple cache.
  * @param env - pointer to common cache environment.
  * @param cmp_def - key definition for tuple comparison.
+ * @param is_primary - set if cache is for primary index
  */
 void
 vy_cache_create(struct vy_cache *cache, struct vy_cache_env *env,
-		struct key_def *cmp_def);
+		struct key_def *cmp_def, bool is_primary);
 
 /**
  * Destroy and deallocate tuple cache.
diff --git a/src/box/vy_lsm.c b/src/box/vy_lsm.c
index cb3c436f..1994525e 100644
--- a/src/box/vy_lsm.c
+++ b/src/box/vy_lsm.c
@@ -188,7 +188,7 @@ vy_lsm_new(struct vy_lsm_env *lsm_env, struct vy_cache_env *cache_env,
 	lsm->refs = 1;
 	lsm->dump_lsn = -1;
 	lsm->commit_lsn = -1;
-	vy_cache_create(&lsm->cache, cache_env, cmp_def);
+	vy_cache_create(&lsm->cache, cache_env, cmp_def, index_def->iid == 0);
 	rlist_create(&lsm->sealed);
 	vy_range_tree_new(lsm->tree);
 	vy_range_heap_create(&lsm->range_heap);
diff --git a/test/unit/vy_iterators_helper.c b/test/unit/vy_iterators_helper.c
index 89603376..40d8d6a1 100644
--- a/test/unit/vy_iterators_helper.c
+++ b/test/unit/vy_iterators_helper.c
@@ -232,7 +232,7 @@ create_test_cache(uint32_t *fields, uint32_t *types,
 {
 	*def = box_key_def_new(fields, types, key_cnt);
 	assert(*def != NULL);
-	vy_cache_create(cache, &cache_env, *def);
+	vy_cache_create(cache, &cache_env, *def, true);
 	*format = tuple_format_new(&vy_tuple_format_vtab, def, 1, 0, NULL, 0,
 				   NULL);
 	tuple_format_ref(*format);
diff --git a/test/unit/vy_point_lookup.c b/test/unit/vy_point_lookup.c
index eee25274..86877d7d 100644
--- a/test/unit/vy_point_lookup.c
+++ b/test/unit/vy_point_lookup.c
@@ -82,7 +82,7 @@ test_basic()
 	struct key_def *key_def = box_key_def_new(fields, types, 1);
 	isnt(key_def, NULL, "key_def is not NULL");
 
-	vy_cache_create(&cache, &cache_env, key_def);
+	vy_cache_create(&cache, &cache_env, key_def, true);
 	struct tuple_format *format = tuple_format_new(&vy_tuple_format_vtab,
 						       &key_def, 1, 0, NULL, 0,
 						       NULL);
diff --git a/test/vinyl/cache.result b/test/vinyl/cache.result
index 2043eda9..d75de263 100644
--- a/test/vinyl/cache.result
+++ b/test/vinyl/cache.result
@@ -1077,3 +1077,50 @@ s:drop()
 box.cfg{vinyl_cache = vinyl_cache}
 ---
 ...
+--
+-- gh-3655: statements are shared between primary and secondary
+-- index cache.
+--
+vinyl_cache = box.cfg.vinyl_cache
+---
+...
+box.cfg{vinyl_cache = 1024 * 1024}
+---
+...
+s = box.schema.space.create('test', {engine = 'vinyl'})
+---
+...
+_ = s:create_index('pk')
+---
+...
+_ = s:create_index('i1', {unique = false, parts = {2, 'string'}})
+---
+...
+_ = s:create_index('i2', {unique = false, parts = {3, 'string'}})
+---
+...
+for i = 1, 100 do pad = string.rep(i % 10, 1000) s:replace{i, pad, pad} end
+---
+...
+s.index.pk:count()
+---
+- 100
+...
+s.index.i1:count()
+---
+- 100
+...
+s.index.i2:count()
+---
+- 100
+...
+box.stat.vinyl().cache.used -- should be about 200 KB
+---
+- 216800
+...
+s:drop()
+---
+...
+box.cfg{vinyl_cache = vinyl_cache}
+---
+...
diff --git a/test/vinyl/cache.test.lua b/test/vinyl/cache.test.lua
index 6f414d5d..6d82249a 100644
--- a/test/vinyl/cache.test.lua
+++ b/test/vinyl/cache.test.lua
@@ -380,3 +380,21 @@ st2.put.rows - st1.put.rows
 box.stat.vinyl().cache.used
 s:drop()
 box.cfg{vinyl_cache = vinyl_cache}
+
+--
+-- gh-3655: statements are shared between primary and secondary
+-- index cache.
+--
+vinyl_cache = box.cfg.vinyl_cache
+box.cfg{vinyl_cache = 1024 * 1024}
+s = box.schema.space.create('test', {engine = 'vinyl'})
+_ = s:create_index('pk')
+_ = s:create_index('i1', {unique = false, parts = {2, 'string'}})
+_ = s:create_index('i2', {unique = false, parts = {3, 'string'}})
+for i = 1, 100 do pad = string.rep(i % 10, 1000) s:replace{i, pad, pad} end
+s.index.pk:count()
+s.index.i1:count()
+s.index.i2:count()
+box.stat.vinyl().cache.used -- should be about 200 KB
+s:drop()
+box.cfg{vinyl_cache = vinyl_cache}
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 2/7] vinyl: add global memory stats
  2018-09-02 20:18 [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov
  2018-09-02 20:18 ` [PATCH 1/7] vinyl: fix accounting of secondary index cache statements Vladimir Davydov
@ 2018-09-02 20:18 ` Vladimir Davydov
  2018-09-02 22:27   ` [tarantool-patches] " Konstantin Osipov
  2018-09-02 22:27   ` Konstantin Osipov
  2018-09-02 20:18 ` [PATCH 3/7] vinyl: add global disk stats Vladimir Davydov
                   ` (5 subsequent siblings)
  7 siblings, 2 replies; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-02 20:18 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

box.info.memory() gives you some insight on what memory is used for,
but it's very coarse. For vinyl we need finer grained global memory
statistics.

This patch adds such: they are reported under box.stat.vinyl().memory
and consist of the following entries:

 - level0: sum size of level-0 of all LSM trees.
 - tx: size of memory used by tx write and read sets.
 - tuple_cache: size of memory occupied by tuple cache.
 - page_index: size of memory used for storing page indexes.
 - bloom_filter: size of memory used for storing bloom filters.

It also removes box.stat.vinyl().cache, as the size of cache is now
reported under memory.tuple_cache.
---
 src/box/vinyl.c           | 42 ++++++++---------------
 src/box/vy_tx.c           | 19 +++++++++++
 src/box/vy_tx.h           |  4 +++
 test/vinyl/cache.result   | 10 +++---
 test/vinyl/cache.test.lua | 10 +++---
 test/vinyl/info.result    | 85 ++++++++++++++++++++++++++++++++++++-----------
 test/vinyl/info.test.lua  | 19 +++++++++--
 7 files changed, 129 insertions(+), 60 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 0b33b6f7..88cdbce9 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -263,23 +263,6 @@ vy_info_append_quota(struct vy_env *env, struct info_handler *h)
 }
 
 static void
-vy_info_append_cache(struct vy_env *env, struct info_handler *h)
-{
-	struct vy_cache_env *c = &env->cache_env;
-
-	info_table_begin(h, "cache");
-
-	info_append_int(h, "used", c->mem_used);
-	info_append_int(h, "limit", c->mem_quota);
-
-	struct mempool_stats mstats;
-	mempool_stats(&c->cache_entry_mempool, &mstats);
-	info_append_int(h, "tuples", mstats.objcount);
-
-	info_table_end(h);
-}
-
-static void
 vy_info_append_tx(struct vy_env *env, struct info_handler *h)
 {
 	struct tx_manager *xm = env->xm;
@@ -303,6 +286,18 @@ vy_info_append_tx(struct vy_env *env, struct info_handler *h)
 	info_table_end(h);
 }
 
+static void
+vy_info_append_memory(struct vy_env *env, struct info_handler *h)
+{
+	info_table_begin(h, "memory");
+	info_append_int(h, "tx", tx_manager_mem_used(env->xm));
+	info_append_int(h, "level0", lsregion_used(&env->mem_env.allocator));
+	info_append_int(h, "tuple_cache", env->cache_env.mem_used);
+	info_append_int(h, "page_index", env->lsm_env.page_index_size);
+	info_append_int(h, "bloom_filter", env->lsm_env.bloom_size);
+	info_table_end(h);
+}
+
 void
 vinyl_engine_stat(struct vinyl_engine *vinyl, struct info_handler *h)
 {
@@ -310,8 +305,8 @@ vinyl_engine_stat(struct vinyl_engine *vinyl, struct info_handler *h)
 
 	info_begin(h);
 	vy_info_append_quota(env, h);
-	vy_info_append_cache(env, h);
 	vy_info_append_tx(env, h);
+	vy_info_append_memory(env, h);
 	info_end(h);
 }
 
@@ -466,7 +461,6 @@ static void
 vinyl_engine_memory_stat(struct engine *engine, struct engine_memory_stat *stat)
 {
 	struct vy_env *env = vy_env(engine);
-	struct mempool_stats mstats;
 
 	stat->data += lsregion_used(&env->mem_env.allocator) -
 				env->mem_env.tree_extent_size;
@@ -474,15 +468,7 @@ vinyl_engine_memory_stat(struct engine *engine, struct engine_memory_stat *stat)
 	stat->index += env->lsm_env.bloom_size;
 	stat->index += env->lsm_env.page_index_size;
 	stat->cache += env->cache_env.mem_used;
-	stat->tx += env->xm->write_set_size + env->xm->read_set_size;
-	mempool_stats(&env->xm->tx_mempool, &mstats);
-	stat->tx += mstats.totals.used;
-	mempool_stats(&env->xm->txv_mempool, &mstats);
-	stat->tx += mstats.totals.used;
-	mempool_stats(&env->xm->read_interval_mempool, &mstats);
-	stat->tx += mstats.totals.used;
-	mempool_stats(&env->xm->read_view_mempool, &mstats);
-	stat->tx += mstats.totals.used;
+	stat->tx += tx_manager_mem_used(env->xm);
 }
 
 static void
diff --git a/src/box/vy_tx.c b/src/box/vy_tx.c
index 1e8775a0..957c87f0 100644
--- a/src/box/vy_tx.c
+++ b/src/box/vy_tx.c
@@ -137,6 +137,25 @@ tx_manager_delete(struct tx_manager *xm)
 	free(xm);
 }
 
+size_t
+tx_manager_mem_used(struct tx_manager *xm)
+{
+	struct mempool_stats mstats;
+	size_t ret = 0;
+
+	ret += xm->write_set_size + xm->read_set_size;
+	mempool_stats(&xm->tx_mempool, &mstats);
+	ret += mstats.totals.used;
+	mempool_stats(&xm->txv_mempool, &mstats);
+	ret += mstats.totals.used;
+	mempool_stats(&xm->read_interval_mempool, &mstats);
+	ret += mstats.totals.used;
+	mempool_stats(&xm->read_view_mempool, &mstats);
+	ret += mstats.totals.used;
+
+	return ret;
+}
+
 /** Create or reuse an instance of a read view. */
 static struct vy_read_view *
 tx_manager_read_view(struct tx_manager *xm)
diff --git a/src/box/vy_tx.h b/src/box/vy_tx.h
index 1d515c72..b201abd7 100644
--- a/src/box/vy_tx.h
+++ b/src/box/vy_tx.h
@@ -268,6 +268,10 @@ tx_manager_new(void);
 void
 tx_manager_delete(struct tx_manager *xm);
 
+/** Return total amount of memory used by active transactions. */
+size_t
+tx_manager_mem_used(struct tx_manager *xm);
+
 /**
  * Abort all rw transactions that affect the given LSM tree
  * and haven't reached WAL yet.
diff --git a/test/vinyl/cache.result b/test/vinyl/cache.result
index d75de263..85741604 100644
--- a/test/vinyl/cache.result
+++ b/test/vinyl/cache.result
@@ -1031,21 +1031,21 @@ for i = 1, 100 do s:replace{i, string.rep('x', 1000)} end
 for i = 1, 100 do s:get{i} end
 ---
 ...
-box.stat.vinyl().cache.used
+box.stat.vinyl().memory.tuple_cache
 ---
 - 107700
 ...
 box.cfg{vinyl_cache = 50 * 1000}
 ---
 ...
-box.stat.vinyl().cache.used
+box.stat.vinyl().memory.tuple_cache
 ---
 - 49542
 ...
 box.cfg{vinyl_cache = 0}
 ---
 ...
-box.stat.vinyl().cache.used
+box.stat.vinyl().memory.tuple_cache
 ---
 - 0
 ...
@@ -1067,7 +1067,7 @@ st2.put.rows - st1.put.rows
 ---
 - 0
 ...
-box.stat.vinyl().cache.used
+box.stat.vinyl().memory.tuple_cache
 ---
 - 0
 ...
@@ -1114,7 +1114,7 @@ s.index.i2:count()
 ---
 - 100
 ...
-box.stat.vinyl().cache.used -- should be about 200 KB
+box.stat.vinyl().memory.tuple_cache -- should be about 200 KB
 ---
 - 216800
 ...
diff --git a/test/vinyl/cache.test.lua b/test/vinyl/cache.test.lua
index 6d82249a..2fedf34b 100644
--- a/test/vinyl/cache.test.lua
+++ b/test/vinyl/cache.test.lua
@@ -366,18 +366,18 @@ s = box.schema.space.create('test', {engine = 'vinyl'})
 _ = s:create_index('pk')
 for i = 1, 100 do s:replace{i, string.rep('x', 1000)} end
 for i = 1, 100 do s:get{i} end
-box.stat.vinyl().cache.used
+box.stat.vinyl().memory.tuple_cache
 box.cfg{vinyl_cache = 50 * 1000}
-box.stat.vinyl().cache.used
+box.stat.vinyl().memory.tuple_cache
 box.cfg{vinyl_cache = 0}
-box.stat.vinyl().cache.used
+box.stat.vinyl().memory.tuple_cache
 -- Make sure cache is not populated if box.cfg.vinyl_cache is set to 0
 st1 = s.index.pk:stat().cache
 #s:select()
 for i = 1, 100 do s:get{i} end
 st2 = s.index.pk:stat().cache
 st2.put.rows - st1.put.rows
-box.stat.vinyl().cache.used
+box.stat.vinyl().memory.tuple_cache
 s:drop()
 box.cfg{vinyl_cache = vinyl_cache}
 
@@ -395,6 +395,6 @@ for i = 1, 100 do pad = string.rep(i % 10, 1000) s:replace{i, pad, pad} end
 s.index.pk:count()
 s.index.i1:count()
 s.index.i2:count()
-box.stat.vinyl().cache.used -- should be about 200 KB
+box.stat.vinyl().memory.tuple_cache -- should be about 200 KB
 s:drop()
 box.cfg{vinyl_cache = vinyl_cache}
diff --git a/test/vinyl/info.result b/test/vinyl/info.result
index 95e8cc60..a53ee3ae 100644
--- a/test/vinyl/info.result
+++ b/test/vinyl/info.result
@@ -210,10 +210,15 @@ istat()
 ...
 gstat()
 ---
-- cache:
-    limit: 15360
-    tuples: 0
+- quota:
+    limit: 134217728
     used: 0
+  memory:
+    tuple_cache: 0
+    tx: 0
+    level0: 0
+    page_index: 0
+    bloom_filter: 0
   tx:
     conflict: 0
     commit: 0
@@ -222,9 +227,6 @@ gstat()
     transactions: 0
     gap_locks: 0
     read_views: 0
-  quota:
-    limit: 134217728
-    used: 0
 ...
 --
 -- Index statistics.
@@ -665,16 +667,16 @@ box.rollback()
 --
 -- Global statistics.
 --
--- use quota
+-- use memory
 st = gstat()
 ---
 ...
 put(1)
 ---
 ...
-stat_diff(gstat(), st, 'quota')
+stat_diff(gstat(), st, 'memory.level0')
 ---
-- used: 1061
+- 1061
 ...
 -- use cache
 st = gstat()
@@ -683,10 +685,9 @@ st = gstat()
 _ = s:get(1)
 ---
 ...
-stat_diff(gstat(), st, 'cache')
+stat_diff(gstat(), st, 'memory.tuple_cache')
 ---
-- used: 1101
-  tuples: 1
+- 1101
 ...
 s:delete(1)
 ---
@@ -1007,10 +1008,15 @@ istat()
 ...
 gstat()
 ---
-- cache:
-    limit: 15360
-    tuples: 13
-    used: 14313
+- quota:
+    limit: 134217728
+    used: 262583
+  memory:
+    tuple_cache: 14313
+    tx: 0
+    level0: 262583
+    page_index: 1050
+    bloom_filter: 140
   tx:
     conflict: 0
     commit: 0
@@ -1019,9 +1025,6 @@ gstat()
     transactions: 0
     gap_locks: 0
     read_views: 0
-  quota:
-    limit: 134217728
-    used: 262583
 ...
 s:drop()
 ---
@@ -1059,6 +1062,9 @@ i1:bsize(), i2:bsize()
 for i = 1, 100, 2 do s:replace{i, i, pad()} end
 ---
 ...
+gst = gstat()
+---
+...
 st1 = i1:stat()
 ---
 ...
@@ -1099,10 +1105,21 @@ i2:bsize() == st2.memory.index_size
 ---
 - true
 ...
+gst.memory.page_index == 0
+---
+- true
+...
+gst.memory.bloom_filter == 0
+---
+- true
+...
 box.snapshot()
 ---
 - ok
 ...
+gst = gstat()
+---
+...
 st1 = i1:stat()
 ---
 ...
@@ -1143,6 +1160,14 @@ i2:bsize() == st2.disk.index_size + st2.disk.bloom_size + st2.disk.bytes
 ---
 - true
 ...
+gst.memory.page_index == st1.disk.index_size + st2.disk.index_size
+---
+- true
+...
+gst.memory.bloom_filter == st1.disk.bloom_size + st2.disk.bloom_size
+---
+- true
+...
 for i = 1, 100, 2 do s:delete(i) end
 ---
 ...
@@ -1211,6 +1236,9 @@ i2:compact()
 wait(function() return i2:stat() end, st2, 'disk.compact.count', 1)
 ---
 ...
+gst = gstat()
+---
+...
 st1 = i1:stat()
 ---
 ...
@@ -1251,9 +1279,28 @@ i2:bsize() == st2.disk.index_size + st2.disk.bloom_size + st2.disk.bytes
 ---
 - true
 ...
+gst.memory.page_index == st1.disk.index_size + st2.disk.index_size
+---
+- true
+...
+gst.memory.bloom_filter == st1.disk.bloom_size + st2.disk.bloom_size
+---
+- true
+...
 s:drop()
 ---
 ...
+gst = gstat()
+---
+...
+gst.memory.page_index == 0
+---
+- true
+...
+gst.memory.bloom_filter == 0
+---
+- true
+...
 test_run:cmd('switch default')
 ---
 - true
diff --git a/test/vinyl/info.test.lua b/test/vinyl/info.test.lua
index 5aebd0a8..230bac1e 100644
--- a/test/vinyl/info.test.lua
+++ b/test/vinyl/info.test.lua
@@ -206,15 +206,15 @@ box.rollback()
 -- Global statistics.
 --
 
--- use quota
+-- use memory
 st = gstat()
 put(1)
-stat_diff(gstat(), st, 'quota')
+stat_diff(gstat(), st, 'memory.level0')
 
 -- use cache
 st = gstat()
 _ = s:get(1)
-stat_diff(gstat(), st, 'cache')
+stat_diff(gstat(), st, 'memory.tuple_cache')
 
 s:delete(1)
 
@@ -329,6 +329,7 @@ i1:len(), i2:len()
 i1:bsize(), i2:bsize()
 
 for i = 1, 100, 2 do s:replace{i, i, pad()} end
+gst = gstat()
 st1 = i1:stat()
 st2 = i2:stat()
 s:bsize()
@@ -339,8 +340,11 @@ i1:len() == st1.memory.rows
 i2:len() == st2.memory.rows
 i1:bsize() == st1.memory.index_size
 i2:bsize() == st2.memory.index_size
+gst.memory.page_index == 0
+gst.memory.bloom_filter == 0
 
 box.snapshot()
+gst = gstat()
 st1 = i1:stat()
 st2 = i2:stat()
 s:bsize()
@@ -351,6 +355,8 @@ i1:len() == st1.disk.rows
 i2:len() == st2.disk.rows
 i1:bsize() == st1.disk.index_size + st1.disk.bloom_size
 i2:bsize() == st2.disk.index_size + st2.disk.bloom_size + st2.disk.bytes
+gst.memory.page_index == st1.disk.index_size + st2.disk.index_size
+gst.memory.bloom_filter == st1.disk.bloom_size + st2.disk.bloom_size
 
 for i = 1, 100, 2 do s:delete(i) end
 for i = 2, 100, 2 do s:replace{i, i, pad()} end
@@ -373,6 +379,7 @@ wait(function() return i1:stat() end, st1, 'disk.compact.count', 1)
 box.snapshot()
 i2:compact()
 wait(function() return i2:stat() end, st2, 'disk.compact.count', 1)
+gst = gstat()
 st1 = i1:stat()
 st2 = i2:stat()
 s:bsize()
@@ -383,9 +390,15 @@ i1:len() == st1.disk.rows
 i2:len() == st2.disk.rows
 i1:bsize() == st1.disk.index_size + st1.disk.bloom_size
 i2:bsize() == st2.disk.index_size + st2.disk.bloom_size + st2.disk.bytes
+gst.memory.page_index == st1.disk.index_size + st2.disk.index_size
+gst.memory.bloom_filter == st1.disk.bloom_size + st2.disk.bloom_size
 
 s:drop()
 
+gst = gstat()
+gst.memory.page_index == 0
+gst.memory.bloom_filter == 0
+
 test_run:cmd('switch default')
 test_run:cmd('stop server test')
 test_run:cmd('cleanup server test')
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 3/7] vinyl: add global disk stats
  2018-09-02 20:18 [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov
  2018-09-02 20:18 ` [PATCH 1/7] vinyl: fix accounting of secondary index cache statements Vladimir Davydov
  2018-09-02 20:18 ` [PATCH 2/7] vinyl: add global memory stats Vladimir Davydov
@ 2018-09-02 20:18 ` Vladimir Davydov
  2018-09-02 22:30   ` [tarantool-patches] " Konstantin Osipov
  2018-09-02 20:18 ` [PATCH 4/7] vinyl: fix force compaction logic Vladimir Davydov
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-02 20:18 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

This patch adds some essential disk statistics to box.stat.vinyl().
The new statistics are reported under the 'disk' section and currently
include the following fields:

 - data_files: number of open data files (*.run).
 - data_size: size of data stored on disk.
 - index_size: size of index stored on disk.
 - dump_total: number of bytes written by dump tasks.
 - compact_total: number of bytes writted by compaction tasks.

All sizes are in bytes, before compression. Dump and compaction counters
are reset by box.stat.reset().
---
 src/box/vinyl.c          | 19 ++++++++++++++++
 src/box/vy_lsm.c         | 44 ++++++++++++++++++++++++++++---------
 src/box/vy_lsm.h         | 16 ++++++++++++++
 src/box/vy_scheduler.c   |  2 ++
 test/vinyl/info.result   | 57 ++++++++++++++++++++++++++++++++++++++++++++++--
 test/vinyl/info.test.lua | 12 ++++++++++
 6 files changed, 138 insertions(+), 12 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 88cdbce9..edfaa824 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -298,6 +298,20 @@ vy_info_append_memory(struct vy_env *env, struct info_handler *h)
 	info_table_end(h);
 }
 
+static void
+vy_info_append_disk(struct vy_env *env, struct info_handler *h)
+{
+	struct vy_lsm_env *lsm_env = &env->lsm_env;
+
+	info_table_begin(h, "disk");
+	info_append_int(h, "data_files", lsm_env->data_file_count);
+	info_append_int(h, "data_size", lsm_env->disk_data_size);
+	info_append_int(h, "index_size", lsm_env->disk_index_size);
+	info_append_int(h, "dump_total", lsm_env->dump_total);
+	info_append_int(h, "compact_total", lsm_env->compact_total);
+	info_table_end(h);
+}
+
 void
 vinyl_engine_stat(struct vinyl_engine *vinyl, struct info_handler *h)
 {
@@ -307,6 +321,7 @@ vinyl_engine_stat(struct vinyl_engine *vinyl, struct info_handler *h)
 	vy_info_append_quota(env, h);
 	vy_info_append_tx(env, h);
 	vy_info_append_memory(env, h);
+	vy_info_append_disk(env, h);
 	info_end(h);
 }
 
@@ -476,8 +491,12 @@ vinyl_engine_reset_stat(struct engine *engine)
 {
 	struct vy_env *env = vy_env(engine);
 	struct tx_manager *xm = env->xm;
+	struct vy_lsm_env *lsm_env = &env->lsm_env;
 
 	memset(&xm->stat, 0, sizeof(xm->stat));
+
+	lsm_env->dump_total = 0;
+	lsm_env->compact_total = 0;
 }
 
 /** }}} Introspection */
diff --git a/src/box/vy_lsm.c b/src/box/vy_lsm.c
index 1994525e..15592fbf 100644
--- a/src/box/vy_lsm.c
+++ b/src/box/vy_lsm.c
@@ -685,32 +685,56 @@ vy_lsm_compact_priority(struct vy_lsm *lsm)
 void
 vy_lsm_add_run(struct vy_lsm *lsm, struct vy_run *run)
 {
+	struct vy_lsm_env *env = lsm->env;
+	size_t bloom_size = vy_run_bloom_size(run);
+	size_t page_index_size = run->page_index_size;
+
 	assert(rlist_empty(&run->in_lsm));
 	rlist_add_entry(&lsm->runs, run, in_lsm);
 	lsm->run_count++;
 	vy_disk_stmt_counter_add(&lsm->stat.disk.count, &run->count);
 
-	lsm->bloom_size += vy_run_bloom_size(run);
-	lsm->page_index_size += run->page_index_size;
-
-	lsm->env->bloom_size += vy_run_bloom_size(run);
-	lsm->env->page_index_size += run->page_index_size;
+	lsm->bloom_size += bloom_size;
+	lsm->page_index_size += page_index_size;
+
+	env->bloom_size += bloom_size;
+	env->page_index_size += page_index_size;
+	env->data_file_count++;
+	/* Data size is consistent with space.bsize. */
+	if (lsm->index_id == 0)
+		env->disk_data_size += run->count.bytes;
+	/* Index size is consistent with index.bsize. */
+	env->disk_index_size += bloom_size + page_index_size;
+	if (lsm->index_id > 0)
+		env->disk_index_size += run->count.bytes;
 }
 
 void
 vy_lsm_remove_run(struct vy_lsm *lsm, struct vy_run *run)
 {
+	struct vy_lsm_env *env = lsm->env;
+	size_t bloom_size = vy_run_bloom_size(run);
+	size_t page_index_size = run->page_index_size;
+
 	assert(lsm->run_count > 0);
 	assert(!rlist_empty(&run->in_lsm));
 	rlist_del_entry(run, in_lsm);
 	lsm->run_count--;
 	vy_disk_stmt_counter_sub(&lsm->stat.disk.count, &run->count);
 
-	lsm->bloom_size -= vy_run_bloom_size(run);
-	lsm->page_index_size -= run->page_index_size;
-
-	lsm->env->bloom_size -= vy_run_bloom_size(run);
-	lsm->env->page_index_size -= run->page_index_size;
+	lsm->bloom_size -= bloom_size;
+	lsm->page_index_size -= page_index_size;
+
+	env->bloom_size -= bloom_size;
+	env->page_index_size -= page_index_size;
+	env->data_file_count--;
+	/* Data size is consistent with space.bsize. */
+	if (lsm->index_id == 0)
+		env->disk_data_size -= run->count.bytes;
+	/* Index size is consistent with index.bsize. */
+	env->disk_index_size -= bloom_size + page_index_size;
+	if (lsm->index_id > 0)
+		env->disk_index_size -= run->count.bytes;
 }
 
 void
diff --git a/src/box/vy_lsm.h b/src/box/vy_lsm.h
index 6917d475..67a5c5d1 100644
--- a/src/box/vy_lsm.h
+++ b/src/box/vy_lsm.h
@@ -91,6 +91,22 @@ struct vy_lsm_env {
 	size_t bloom_size;
 	/** Size of memory used for page index. */
 	size_t page_index_size;
+	/** Total number of data files (*.run). */
+	size_t data_file_count;
+	/** Size of data stored on disk (uncompressed). */
+	size_t disk_data_size;
+	/** Size of index stored on disk (uncompressed). */
+	size_t disk_index_size;
+	/**
+	 * Total number of bytes written to disk by dump tasks
+	 * (uncompressed).
+	 */
+	int64_t dump_total;
+	/**
+	 * Total number of bytes written to disk by compaction
+	 * tasks (uncompressed).
+	 */
+	int64_t compact_total;
 	/** Memory pool for vy_history_node allocations. */
 	struct mempool history_node_pool;
 };
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index e41f1ffd..4959300e 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -1117,6 +1117,7 @@ vy_task_dump_complete(struct vy_task *task)
 	 */
 	vy_lsm_add_run(lsm, new_run);
 	vy_stmt_counter_add_disk(&lsm->stat.disk.dump.out, &new_run->count);
+	lsm->env->dump_total += new_run->count.bytes;
 
 	/* Drop the reference held by the task. */
 	vy_run_unref(new_run);
@@ -1457,6 +1458,7 @@ vy_task_compact_complete(struct vy_task *task)
 		vy_lsm_add_run(lsm, new_run);
 		vy_stmt_counter_add_disk(&lsm->stat.disk.compact.out,
 					 &new_run->count);
+		lsm->env->compact_total += new_run->count.bytes;
 		/* Drop the reference held by the task. */
 		vy_run_unref(new_run);
 	} else
diff --git a/test/vinyl/info.result b/test/vinyl/info.result
index a53ee3ae..36fa732c 100644
--- a/test/vinyl/info.result
+++ b/test/vinyl/info.result
@@ -210,7 +210,13 @@ istat()
 ...
 gstat()
 ---
-- quota:
+- disk:
+    dump_total: 0
+    data_size: 0
+    data_files: 0
+    compact_total: 0
+    index_size: 0
+  quota:
     limit: 134217728
     used: 0
   memory:
@@ -667,6 +673,15 @@ box.rollback()
 --
 -- Global statistics.
 --
+-- dump and compaction totals
+gstat().disk.dump_total == istat().disk.dump.out.bytes
+---
+- true
+...
+gstat().disk.compact_total == istat().disk.compact.out.bytes
+---
+- true
+...
 -- use memory
 st = gstat()
 ---
@@ -1008,7 +1023,13 @@ istat()
 ...
 gstat()
 ---
-- quota:
+- disk:
+    dump_total: 0
+    data_size: 104300
+    data_files: 2
+    compact_total: 0
+    index_size: 1190
+  quota:
     limit: 134217728
     used: 262583
   memory:
@@ -1113,6 +1134,14 @@ gst.memory.bloom_filter == 0
 ---
 - true
 ...
+gst.disk.data_size == 0
+---
+- true
+...
+gst.disk.index_size == 0
+---
+- true
+...
 box.snapshot()
 ---
 - ok
@@ -1168,6 +1197,14 @@ gst.memory.bloom_filter == st1.disk.bloom_size + st2.disk.bloom_size
 ---
 - true
 ...
+gst.disk.data_size == s:bsize()
+---
+- true
+...
+gst.disk.index_size == i1:bsize() + i2:bsize()
+---
+- true
+...
 for i = 1, 100, 2 do s:delete(i) end
 ---
 ...
@@ -1287,6 +1324,14 @@ gst.memory.bloom_filter == st1.disk.bloom_size + st2.disk.bloom_size
 ---
 - true
 ...
+gst.disk.data_size == s:bsize()
+---
+- true
+...
+gst.disk.index_size == i1:bsize() + i2:bsize()
+---
+- true
+...
 s:drop()
 ---
 ...
@@ -1301,6 +1346,14 @@ gst.memory.bloom_filter == 0
 ---
 - true
 ...
+gst.disk.data_size == 0
+---
+- true
+...
+gst.disk.index_size == 0
+---
+- true
+...
 test_run:cmd('switch default')
 ---
 - true
diff --git a/test/vinyl/info.test.lua b/test/vinyl/info.test.lua
index 230bac1e..919dde63 100644
--- a/test/vinyl/info.test.lua
+++ b/test/vinyl/info.test.lua
@@ -206,6 +206,10 @@ box.rollback()
 -- Global statistics.
 --
 
+-- dump and compaction totals
+gstat().disk.dump_total == istat().disk.dump.out.bytes
+gstat().disk.compact_total == istat().disk.compact.out.bytes
+
 -- use memory
 st = gstat()
 put(1)
@@ -342,6 +346,8 @@ i1:bsize() == st1.memory.index_size
 i2:bsize() == st2.memory.index_size
 gst.memory.page_index == 0
 gst.memory.bloom_filter == 0
+gst.disk.data_size == 0
+gst.disk.index_size == 0
 
 box.snapshot()
 gst = gstat()
@@ -357,6 +363,8 @@ i1:bsize() == st1.disk.index_size + st1.disk.bloom_size
 i2:bsize() == st2.disk.index_size + st2.disk.bloom_size + st2.disk.bytes
 gst.memory.page_index == st1.disk.index_size + st2.disk.index_size
 gst.memory.bloom_filter == st1.disk.bloom_size + st2.disk.bloom_size
+gst.disk.data_size == s:bsize()
+gst.disk.index_size == i1:bsize() + i2:bsize()
 
 for i = 1, 100, 2 do s:delete(i) end
 for i = 2, 100, 2 do s:replace{i, i, pad()} end
@@ -392,12 +400,16 @@ i1:bsize() == st1.disk.index_size + st1.disk.bloom_size
 i2:bsize() == st2.disk.index_size + st2.disk.bloom_size + st2.disk.bytes
 gst.memory.page_index == st1.disk.index_size + st2.disk.index_size
 gst.memory.bloom_filter == st1.disk.bloom_size + st2.disk.bloom_size
+gst.disk.data_size == s:bsize()
+gst.disk.index_size == i1:bsize() + i2:bsize()
 
 s:drop()
 
 gst = gstat()
 gst.memory.page_index == 0
 gst.memory.bloom_filter == 0
+gst.disk.data_size == 0
+gst.disk.index_size == 0
 
 test_run:cmd('switch default')
 test_run:cmd('stop server test')
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 4/7] vinyl: fix force compaction logic
  2018-09-02 20:18 [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov
                   ` (2 preceding siblings ...)
  2018-09-02 20:18 ` [PATCH 3/7] vinyl: add global disk stats Vladimir Davydov
@ 2018-09-02 20:18 ` Vladimir Davydov
  2018-09-02 20:18 ` [PATCH 5/7] vinyl: update compact priority usual way on range split/coalesce Vladimir Davydov
                   ` (3 subsequent siblings)
  7 siblings, 0 replies; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-02 20:18 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

This patch address a few problems index.compact() is suffering from,
namely:

 - When a range is split or coalesced, it should inherit the value of
   needs_compaction flag from the source ranges. Currently, the flag is
   cleared so that the resulting range may be not compacted.

 - If a range has no slices, we shouldn't set needs_compaction flag for
   it, because obviously it can't be compacted, but we do.

 - The needs_compaction flag should be cleared as soon as we schedule a
   range for compaction, not when all slices have been compacted into
   one, as we presently expect, because the latter may never happen
   under a write-intensive load.
---
 src/box/vy_lsm.c       |  9 +++++++--
 src/box/vy_range.c     | 16 ++--------------
 src/box/vy_range.h     |  8 ++------
 src/box/vy_scheduler.c |  2 ++
 4 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/src/box/vy_lsm.c b/src/box/vy_lsm.c
index 15592fbf..a0d211f8 100644
--- a/src/box/vy_lsm.c
+++ b/src/box/vy_lsm.c
@@ -1040,6 +1040,7 @@ vy_lsm_split_range(struct vy_lsm *lsm, struct vy_range *range)
 			if (new_slice != NULL)
 				vy_range_add_slice(part, new_slice);
 		}
+		part->needs_compaction = range->needs_compaction;
 		part->compact_priority = range->compact_priority;
 	}
 
@@ -1147,6 +1148,8 @@ vy_lsm_coalesce_range(struct vy_lsm *lsm, struct vy_range *range)
 		rlist_splice(&result->slices, &it->slices);
 		result->slice_count += it->slice_count;
 		vy_disk_stmt_counter_add(&result->count, &it->count);
+		if (it->needs_compaction)
+			result->needs_compaction = true;
 		vy_range_delete(it);
 		it = next;
 	}
@@ -1181,8 +1184,10 @@ vy_lsm_force_compaction(struct vy_lsm *lsm)
 	struct vy_range_tree_iterator it;
 
 	vy_range_tree_ifirst(lsm->tree, &it);
-	while ((range = vy_range_tree_inext(&it)) != NULL)
-		vy_range_force_compaction(range);
+	while ((range = vy_range_tree_inext(&it)) != NULL) {
+		range->needs_compaction = true;
+		vy_range_update_compact_priority(range, &lsm->opts);
+	}
 
 	vy_range_heap_update_all(&lsm->range_heap);
 }
diff --git a/src/box/vy_range.c b/src/box/vy_range.c
index 6a55a018..ddcd2ed3 100644
--- a/src/box/vy_range.c
+++ b/src/box/vy_range.c
@@ -262,18 +262,6 @@ vy_range_remove_slice(struct vy_range *range, struct vy_slice *slice)
 	vy_disk_stmt_counter_sub(&range->count, &slice->count);
 }
 
-void
-vy_range_force_compaction(struct vy_range *range)
-{
-	if (range->slice_count == 1) {
-		/* Already compacted. */
-		assert(!range->needs_compaction);
-		return;
-	}
-	range->needs_compaction = true;
-	range->compact_priority = range->slice_count;
-}
-
 /**
  * To reduce write amplification caused by compaction, we follow
  * the LSM tree design. Runs in each range are divided into groups
@@ -304,9 +292,9 @@ vy_range_update_compact_priority(struct vy_range *range,
 	assert(opts->run_count_per_level > 0);
 	assert(opts->run_size_ratio > 1);
 
-	if (range->slice_count == 1) {
+	if (range->slice_count <= 1) {
 		/* Nothing to compact. */
-		range->compact_priority = 1;
+		range->compact_priority = 0;
 		range->needs_compaction = false;
 		return;
 	}
diff --git a/src/box/vy_range.h b/src/box/vy_range.h
index d7031e70..2ca19a1c 100644
--- a/src/box/vy_range.h
+++ b/src/box/vy_range.h
@@ -110,8 +110,8 @@ struct vy_range {
 	 * If this flag is set, the range must be scheduled for
 	 * major compaction, i.e. its compact_priority must be
 	 * raised to max (slice_count). The flag is set by
-	 * vy_range_force_compaction() and cleared automatically
-	 * when all slices of the range have been compacted.
+	 * vy_lsm_force_compaction() and cleared when the range
+	 * is scheduled for compaction.
 	 */
 	bool needs_compaction;
 	/** Number of times the range was compacted. */
@@ -229,10 +229,6 @@ vy_range_add_slice_before(struct vy_range *range, struct vy_slice *slice,
 void
 vy_range_remove_slice(struct vy_range *range, struct vy_slice *slice);
 
-/** Mark a range for major compaction. */
-void
-vy_range_force_compaction(struct vy_range *range);
-
 /**
  * Update compaction priority of a range.
  *
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 4959300e..a1ae3f54 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -1604,6 +1604,8 @@ vy_task_compact_new(struct vy_scheduler *scheduler, struct vy_lsm *lsm,
 	assert(n == 0);
 	assert(new_run->dump_lsn >= 0);
 
+	range->needs_compaction = false;
+
 	task->range = range;
 	task->new_run = new_run;
 	task->wi = wi;
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 5/7] vinyl: update compact priority usual way on range split/coalesce
  2018-09-02 20:18 [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov
                   ` (3 preceding siblings ...)
  2018-09-02 20:18 ` [PATCH 4/7] vinyl: fix force compaction logic Vladimir Davydov
@ 2018-09-02 20:18 ` Vladimir Davydov
  2018-09-02 20:18 ` [PATCH 6/7] vinyl: keep track of compaction queue length and debt Vladimir Davydov
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-02 20:18 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

When a few ranges are coalesced, we "force" compaction of the resulting
range by raising its compaction priority to max (slice count). There's
actually no point in that, because as long as the shape of the resulting
LSM tree is OK, we don't need to do extra compaction work. Moreover, it
actually doesn't work if a new slice is added to the resulting range by
dump before it gets compacted, which is fairly likely, because then its
compaction priority will be recalculated as usual. So let's simply call
vy_range_update_compact_priority() for the resulting range.

When a range is split, the produced ranges will inherit its compaction
priority. This is actually incorrect, because range split may change the
shape of the tree so let's recalculate priority for each part the usual
way, i.e. by calling vy_range_update_compact_priority().

After this patch, there's this only place where we can update compaction
priority of a range - it's vy_range_update_compact_priority().
---
 src/box/vy_lsm.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/box/vy_lsm.c b/src/box/vy_lsm.c
index a0d211f8..bf359973 100644
--- a/src/box/vy_lsm.c
+++ b/src/box/vy_lsm.c
@@ -1041,7 +1041,7 @@ vy_lsm_split_range(struct vy_lsm *lsm, struct vy_range *range)
 				vy_range_add_slice(part, new_slice);
 		}
 		part->needs_compaction = range->needs_compaction;
-		part->compact_priority = range->compact_priority;
+		vy_range_update_compact_priority(part, &lsm->opts);
 	}

 	/*
@@ -1153,13 +1153,7 @@ vy_lsm_coalesce_range(struct vy_lsm *lsm, struct vy_range *range)
 		vy_range_delete(it);
 		it = next;
 	}
-	/*
-	 * Coalescing increases read amplification and breaks the log
-	 * structured layout of the run list, so, although we could
-	 * leave the resulting range as it is, we'd better compact it
-	 * as soon as we can.
-	 */
-	result->compact_priority = result->slice_count;
+	vy_range_update_compact_priority(result, &lsm->opts);
 	vy_lsm_acct_range(lsm, result);
 	vy_lsm_add_range(lsm, result);
 	lsm->range_tree_version++;
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 6/7] vinyl: keep track of compaction queue length and debt
  2018-09-02 20:18 [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov
                   ` (4 preceding siblings ...)
  2018-09-02 20:18 ` [PATCH 5/7] vinyl: update compact priority usual way on range split/coalesce Vladimir Davydov
@ 2018-09-02 20:18 ` Vladimir Davydov
  2018-09-02 20:19 ` [PATCH 7/7] vinyl: keep track of disk idle time Vladimir Davydov
  2018-09-09 11:41 ` [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov
  7 siblings, 0 replies; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-02 20:18 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

Currently, there's no way to figure out whether compaction keeps up
with dumps or not while this is essential for implementing transaction
throttling. This patch adds metrics that are supposed to help answer
this question. Those are size of compaction queue and compaction debt.

Size of compaction queue is the total size of runs awaiting compaction.
Compaction debt is how many runs must be compacted to restore the target
LSM tree shape. It can't be greater than the size of compaction queue.
Ideally, it should be zero.

We update both metrics along with compaction priority of a range.
Calculation of compaction queue size is trivial - we simply sum
statement counters of slices to be compacted. Compaction debt is a bit
more sophisticated. When walking down an LSM tree, we check whether the
number of runs at any level is greater than run_count_level * 2. If it
is, then we would reschedule compaction had the previous compaction task
completed by now, so we assume that the range is "in debt". If a range
is in debt, its compaction queue length contributes to compaction debt
metric.

Both metrics are accumulated and reported per index and globally.
Global metrics are reported under box.info.vinyl().disk.compact_queue
and disk.compact_debt, in bytes only. Index statistics are reported
under index.stat().disk.compact.queue and disk.compact.debt, both in
bytes and in rows. The metrics don't take into account disk compression.
---
 src/box/vinyl.c            |  27 +++++-----
 src/box/vy_lsm.c           |  20 +++++++
 src/box/vy_lsm.h           |  26 ++++++++-
 src/box/vy_range.c         |  28 +++++++---
 src/box/vy_range.h         |   9 ++++
 src/box/vy_scheduler.c     |   9 +++-
 src/box/vy_stat.h          |  39 ++++++++++----
 src/errinj.h               |   1 +
 test/box/errinj.result     |   4 +-
 test/vinyl/errinj.result   | 129 +++++++++++++++++++++++++++++++++++++++++++++
 test/vinyl/errinj.test.lua |  34 ++++++++++++
 test/vinyl/info.result     |  20 ++++++-
 12 files changed, 307 insertions(+), 39 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index edfaa824..416c9824 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -309,6 +309,8 @@ vy_info_append_disk(struct vy_env *env, struct info_handler *h)
 	info_append_int(h, "index_size", lsm_env->disk_index_size);
 	info_append_int(h, "dump_total", lsm_env->dump_total);
 	info_append_int(h, "compact_total", lsm_env->compact_total);
+	info_append_int(h, "compact_queue", lsm_env->compact_queue);
+	info_append_int(h, "compact_debt", lsm_env->compact_debt);
 	info_table_end(h);
 }
 
@@ -352,17 +354,6 @@ vy_info_append_disk_stmt_counter(struct info_handler *h, const char *name,
 }
 
 static void
-vy_info_append_compact_stat(struct info_handler *h, const char *name,
-			    const struct vy_compact_stat *stat)
-{
-	info_table_begin(h, name);
-	info_append_int(h, "count", stat->count);
-	vy_info_append_stmt_counter(h, "in", &stat->in);
-	vy_info_append_stmt_counter(h, "out", &stat->out);
-	info_table_end(h);
-}
-
-static void
 vinyl_index_stat(struct index *index, struct info_handler *h)
 {
 	char buf[1024];
@@ -413,8 +404,18 @@ vinyl_index_stat(struct index *index, struct info_handler *h)
 	info_append_int(h, "miss", stat->disk.iterator.bloom_miss);
 	info_table_end(h);
 	info_table_end(h);
-	vy_info_append_compact_stat(h, "dump", &stat->disk.dump);
-	vy_info_append_compact_stat(h, "compact", &stat->disk.compact);
+	info_table_begin(h, "dump");
+	info_append_int(h, "count", stat->disk.dump.count);
+	vy_info_append_stmt_counter(h, "in", &stat->disk.dump.in);
+	vy_info_append_stmt_counter(h, "out", &stat->disk.dump.out);
+	info_table_end(h);
+	info_table_begin(h, "compact");
+	info_append_int(h, "count", stat->disk.compact.count);
+	vy_info_append_stmt_counter(h, "in", &stat->disk.compact.in);
+	vy_info_append_stmt_counter(h, "out", &stat->disk.compact.out);
+	vy_info_append_stmt_counter(h, "queue", &stat->disk.compact.queue);
+	vy_info_append_stmt_counter(h, "debt", &stat->disk.compact.debt);
+	info_table_end(h);
 	info_append_int(h, "index_size", lsm->page_index_size);
 	info_append_int(h, "bloom_size", lsm->bloom_size);
 	info_table_end(h);
diff --git a/src/box/vy_lsm.c b/src/box/vy_lsm.c
index bf359973..f27634b9 100644
--- a/src/box/vy_lsm.c
+++ b/src/box/vy_lsm.c
@@ -252,6 +252,8 @@ vy_lsm_delete(struct vy_lsm *lsm)
 	assert(lsm->env->lsm_count > 0);
 
 	lsm->env->lsm_count--;
+	lsm->env->compact_queue -= lsm->stat.disk.compact.queue.bytes;
+	lsm->env->compact_debt -= lsm->stat.disk.compact.debt.bytes;
 
 	if (lsm->pk != NULL)
 		vy_lsm_unref(lsm->pk);
@@ -759,12 +761,28 @@ void
 vy_lsm_acct_range(struct vy_lsm *lsm, struct vy_range *range)
 {
 	histogram_collect(lsm->run_hist, range->slice_count);
+	vy_stmt_counter_add(&lsm->stat.disk.compact.queue,
+			    &range->compact_count);
+	lsm->env->compact_queue += range->compact_count.bytes;
+	if (range->in_compaction_debt) {
+		vy_stmt_counter_add(&lsm->stat.disk.compact.debt,
+				    &range->compact_count);
+		lsm->env->compact_debt += range->compact_count.bytes;
+	}
 }
 
 void
 vy_lsm_unacct_range(struct vy_lsm *lsm, struct vy_range *range)
 {
 	histogram_discard(lsm->run_hist, range->slice_count);
+	vy_stmt_counter_sub(&lsm->stat.disk.compact.queue,
+			    &range->compact_count);
+	lsm->env->compact_queue -= range->compact_count.bytes;
+	if (range->in_compaction_debt) {
+		vy_stmt_counter_sub(&lsm->stat.disk.compact.debt,
+				    &range->compact_count);
+		lsm->env->compact_debt -= range->compact_count.bytes;
+	}
 }
 
 int
@@ -1179,8 +1197,10 @@ vy_lsm_force_compaction(struct vy_lsm *lsm)
 
 	vy_range_tree_ifirst(lsm->tree, &it);
 	while ((range = vy_range_tree_inext(&it)) != NULL) {
+		vy_lsm_unacct_range(lsm, range);
 		range->needs_compaction = true;
 		vy_range_update_compact_priority(range, &lsm->opts);
+		vy_lsm_acct_range(lsm, range);
 	}
 
 	vy_range_heap_update_all(&lsm->range_heap);
diff --git a/src/box/vy_lsm.h b/src/box/vy_lsm.h
index 67a5c5d1..7e3e8430 100644
--- a/src/box/vy_lsm.h
+++ b/src/box/vy_lsm.h
@@ -107,6 +107,16 @@ struct vy_lsm_env {
 	 * tasks (uncompressed).
 	 */
 	int64_t compact_total;
+	/**
+	 * Sum size of runs awaiting compaction, in bytes
+	 * (uncompressed).
+	 */
+	int64_t compact_queue;
+	/**
+	 * Sum size of runs that must be compacted to restore
+	 * the target LSM tree shape, in bytes (uncompressed).
+	 */
+	int64_t compact_debt;
 	/** Memory pool for vy_history_node allocations. */
 	struct mempool history_node_pool;
 };
@@ -452,11 +462,23 @@ vy_lsm_add_range(struct vy_lsm *lsm, struct vy_range *range);
 void
 vy_lsm_remove_range(struct vy_lsm *lsm, struct vy_range *range);
 
-/** Account a range to the run histogram of an LSM tree. */
+/**
+ * Account a range in an LSM tree.
+ *
+ * This function updates the following LSM tree statistics:
+ *  - vy_lsm::run_hist after a slice is added to or removed from
+ *    a range of the LSM tree.
+ *  - vy_lsm::stat::disk::compact::queue and debt after compaction
+ *    priority of a range is updated.
+ */
 void
 vy_lsm_acct_range(struct vy_lsm *lsm, struct vy_range *range);
 
-/** Unaccount a range from the run histogram of an LSM tree. */
+/**
+ * Unaccount a range in an LSM tree.
+ *
+ * This function undoes the effect of vy_lsm_acct_range().
+ */
 void
 vy_lsm_unacct_range(struct vy_lsm *lsm, struct vy_range *range);
 
diff --git a/src/box/vy_range.c b/src/box/vy_range.c
index ddcd2ed3..1816e546 100644
--- a/src/box/vy_range.c
+++ b/src/box/vy_range.c
@@ -292,19 +292,19 @@ vy_range_update_compact_priority(struct vy_range *range,
 	assert(opts->run_count_per_level > 0);
 	assert(opts->run_size_ratio > 1);
 
+	range->compact_priority = 0;
+	range->in_compaction_debt = false;
+	vy_stmt_counter_reset(&range->compact_count);
+
 	if (range->slice_count <= 1) {
 		/* Nothing to compact. */
-		range->compact_priority = 0;
 		range->needs_compaction = false;
 		return;
 	}
-	if (range->needs_compaction) {
-		range->compact_priority = range->slice_count;
-		return;
-	}
-
-	range->compact_priority = 0;
 
+	/* Total number of statements in checked runs. */
+	struct vy_stmt_counter total_stmt_count;
+	vy_stmt_counter_reset(&total_stmt_count);
 	/* Total number of checked runs. */
 	uint32_t total_run_count = 0;
 	/* The total size of runs checked so far. */
@@ -333,6 +333,7 @@ vy_range_update_compact_priority(struct vy_range *range,
 		total_size += size;
 		level_run_count++;
 		total_run_count++;
+		vy_stmt_counter_add_disk(&total_stmt_count, &slice->count);
 		while (size > target_run_size) {
 			/*
 			 * The run size exceeds the threshold
@@ -362,7 +363,8 @@ vy_range_update_compact_priority(struct vy_range *range,
 			 * we find an appropriate level for it.
 			 */
 		}
-		if (level_run_count > opts->run_count_per_level) {
+		if (range->needs_compaction ||
+		    level_run_count > opts->run_count_per_level) {
 			/*
 			 * The number of runs at the current level
 			 * exceeds the configured maximum. Arrange
@@ -370,8 +372,18 @@ vy_range_update_compact_priority(struct vy_range *range,
 			 * this level and upper levels.
 			 */
 			range->compact_priority = total_run_count;
+			range->compact_count = total_stmt_count;
 			est_new_run_size = total_size;
 		}
+		/*
+		 * If the number of runs on any level is twice as
+		 * many as run_count_per_level, we would schedule
+		 * compaction for another time had the previously
+		 * scheduled compaction task completed. This means
+		 * that compaction doesn't keep up with dumps.
+		 */
+		if (level_run_count > opts->run_count_per_level * 2)
+			range->in_compaction_debt = true;
 	}
 }
 
diff --git a/src/box/vy_range.h b/src/box/vy_range.h
index 2ca19a1c..60ce8d96 100644
--- a/src/box/vy_range.h
+++ b/src/box/vy_range.h
@@ -106,6 +106,15 @@ struct vy_range {
 	 * how we  decide how many runs to compact next time.
 	 */
 	int compact_priority;
+	/** Number of statements that needs to be compacted. */
+	struct vy_stmt_counter compact_count;
+	/**
+	 * Set if compaction doesn't manage to keep this LSM tree
+	 * in a good shape, because there are too many dumps.
+	 * Updated by vy_range_update_compact_priority() along
+	 * with compaction priority.
+	 */
+	bool in_compaction_debt;
 	/**
 	 * If this flag is set, the range must be scheduled for
 	 * major compaction, i.e. its compact_priority must be
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index a1ae3f54..580c3129 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -1136,8 +1136,8 @@ vy_task_dump_complete(struct vy_task *task)
 		slice = new_slices[i];
 		vy_lsm_unacct_range(lsm, range);
 		vy_range_add_slice(range, slice);
-		vy_lsm_acct_range(lsm, range);
 		vy_range_update_compact_priority(range, &lsm->opts);
+		vy_lsm_acct_range(lsm, range);
 		if (!vy_range_is_scheduled(range))
 			vy_range_heap_update(&lsm->range_heap,
 					     &range->heap_node);
@@ -1363,6 +1363,11 @@ err:
 static int
 vy_task_compact_execute(struct vy_task *task)
 {
+	struct errinj *errinj = errinj(ERRINJ_VY_COMPACTION_DELAY, ERRINJ_BOOL);
+	if (errinj != NULL && errinj->bparam) {
+		while (errinj->bparam)
+			fiber_sleep(0.01);
+	}
 	return vy_task_write_run(task);
 }
 
@@ -1487,8 +1492,8 @@ vy_task_compact_complete(struct vy_task *task)
 	}
 	range->n_compactions++;
 	range->version++;
-	vy_lsm_acct_range(lsm, range);
 	vy_range_update_compact_priority(range, &lsm->opts);
+	vy_lsm_acct_range(lsm, range);
 	lsm->stat.disk.compact.count++;
 
 	/*
diff --git a/src/box/vy_stat.h b/src/box/vy_stat.h
index ca52c4d3..99159022 100644
--- a/src/box/vy_stat.h
+++ b/src/box/vy_stat.h
@@ -32,6 +32,7 @@
  */
 
 #include <stdint.h>
+#include <string.h>
 
 #include "latency.h"
 #include "tuple.h"
@@ -101,15 +102,6 @@ struct vy_txw_iterator_stat {
 	struct vy_stmt_counter get;
 };
 
-/** Dump/compaction statistics. */
-struct vy_compact_stat {
-	int32_t count;
-	/** Number of input statements. */
-	struct vy_stmt_counter in;
-	/** Number of output statements. */
-	struct vy_stmt_counter out;
-};
-
 /** LSM tree statistics. */
 struct vy_lsm_stat {
 	/** Number of lookups in the LSM tree. */
@@ -141,9 +133,28 @@ struct vy_lsm_stat {
 		/** Run iterator statistics. */
 		struct vy_run_iterator_stat iterator;
 		/** Dump statistics. */
-		struct vy_compact_stat dump;
+		struct {
+			int32_t count;
+			/** Number of input statements. */
+			struct vy_stmt_counter in;
+			/** Number of output statements. */
+			struct vy_stmt_counter out;
+		} dump;
 		/** Compaction statistics. */
-		struct vy_compact_stat compact;
+		struct {
+			int32_t count;
+			/** Number of input statements. */
+			struct vy_stmt_counter in;
+			/** Number of output statements. */
+			struct vy_stmt_counter out;
+			/** Number of statements awaiting compaction. */
+			struct vy_stmt_counter queue;
+			/**
+			 * Number of statements that must be compacted
+			 * to restore the LSM tree shape.
+			 */
+			struct vy_stmt_counter debt;
+		} compact;
 	} disk;
 	/** TX write set statistics. */
 	struct {
@@ -199,6 +210,12 @@ vy_lsm_stat_destroy(struct vy_lsm_stat *stat)
 }
 
 static inline void
+vy_stmt_counter_reset(struct vy_stmt_counter *c)
+{
+	memset(c, 0, sizeof(*c));
+}
+
+static inline void
 vy_stmt_counter_acct_tuple(struct vy_stmt_counter *c,
 			   const struct tuple *tuple)
 {
diff --git a/src/errinj.h b/src/errinj.h
index b6d4a4c9..84a1fbb5 100644
--- a/src/errinj.h
+++ b/src/errinj.h
@@ -120,6 +120,7 @@ struct errinj {
 	_(ERRINJ_VY_INDEX_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_RELAY_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
 	_(ERRINJ_WAL_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
+	_(ERRINJ_VY_COMPACTION_DELAY, ERRINJ_BOOL, {.bparam = false}) \
 
 ENUM0(errinj_id, ERRINJ_LIST);
 extern struct errinj errinjs[];
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 8dae7614..81087900 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -58,6 +58,8 @@ errinj.info()
     state: 0
   ERRINJ_XLOG_META:
     state: false
+  ERRINJ_SNAP_COMMIT_DELAY:
+    state: false
   ERRINJ_WAL_BREAK_LSN:
     state: -1
   ERRINJ_WAL_WRITE_DISK:
@@ -74,7 +76,7 @@ errinj.info()
     state: false
   ERRINJ_RELAY_FINAL_JOIN:
     state: false
-  ERRINJ_SNAP_COMMIT_DELAY:
+  ERRINJ_VY_COMPACTION_DELAY:
     state: false
   ERRINJ_RELAY_FINAL_SLEEP:
     state: false
diff --git a/test/vinyl/errinj.result b/test/vinyl/errinj.result
index 17e4dc8c..7b880030 100644
--- a/test/vinyl/errinj.result
+++ b/test/vinyl/errinj.result
@@ -2118,3 +2118,132 @@ s:select()
 s:drop()
 ---
 ...
+-- Check compact.queue and compact.debt statistics.
+s = box.schema.space.create('test', {engine = 'vinyl'})
+---
+...
+i = s:create_index('pk', {run_count_per_level = 2})
+---
+...
+function dump() for i = 1, 10 do s:replace{i} end box.snapshot() end
+---
+...
+dump()
+---
+...
+dump()
+---
+...
+i:stat().disk.compact.queue -- none
+---
+- rows: 0
+  bytes: 0
+...
+i:stat().disk.compact.debt -- none
+---
+- rows: 0
+  bytes: 0
+...
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+---
+- true
+...
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+---
+- true
+...
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', true)
+---
+- ok
+...
+dump()
+---
+...
+i:stat().disk.compact.queue -- 30 statements
+---
+- rows: 30
+  bytes: 471
+...
+i:stat().disk.compact.debt -- none
+---
+- rows: 0
+  bytes: 0
+...
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+---
+- true
+...
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+---
+- true
+...
+dump()
+---
+...
+i:stat().disk.compact.queue -- 40 statements
+---
+- rows: 40
+  bytes: 628
+...
+i:stat().disk.compact.debt -- none
+---
+- rows: 0
+  bytes: 0
+...
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+---
+- true
+...
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+---
+- true
+...
+dump()
+---
+...
+i:stat().disk.compact.queue -- 50 statements
+---
+- rows: 50
+  bytes: 785
+...
+i:stat().disk.compact.debt -- 50 statements
+---
+- rows: 50
+  bytes: 785
+...
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+---
+- true
+...
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+---
+- true
+...
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', false)
+---
+- ok
+...
+while i:stat().disk.compact.count < 2 do fiber.sleep(0.01) end
+---
+...
+i:stat().disk.compact.queue -- none
+---
+- rows: 0
+  bytes: 0
+...
+i:stat().disk.compact.debt -- none
+---
+- rows: 0
+  bytes: 0
+...
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+---
+- true
+...
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+---
+- true
+...
+s:drop()
+---
+...
diff --git a/test/vinyl/errinj.test.lua b/test/vinyl/errinj.test.lua
index 1b02c01c..9037bfad 100644
--- a/test/vinyl/errinj.test.lua
+++ b/test/vinyl/errinj.test.lua
@@ -850,3 +850,37 @@ fiber.sleep(0)
 s:create_index('sk', {parts = {2, 'unsigned'}})
 s:select()
 s:drop()
+
+-- Check compact.queue and compact.debt statistics.
+s = box.schema.space.create('test', {engine = 'vinyl'})
+i = s:create_index('pk', {run_count_per_level = 2})
+function dump() for i = 1, 10 do s:replace{i} end box.snapshot() end
+dump()
+dump()
+i:stat().disk.compact.queue -- none
+i:stat().disk.compact.debt -- none
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', true)
+dump()
+i:stat().disk.compact.queue -- 30 statements
+i:stat().disk.compact.debt -- none
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+dump()
+i:stat().disk.compact.queue -- 40 statements
+i:stat().disk.compact.debt -- none
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+dump()
+i:stat().disk.compact.queue -- 50 statements
+i:stat().disk.compact.debt -- 50 statements
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', false)
+while i:stat().disk.compact.count < 2 do fiber.sleep(0.01) end
+i:stat().disk.compact.queue -- none
+i:stat().disk.compact.debt -- none
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+s:drop()
diff --git a/test/vinyl/info.result b/test/vinyl/info.result
index 36fa732c..556f5eca 100644
--- a/test/vinyl/info.result
+++ b/test/vinyl/info.result
@@ -163,11 +163,17 @@ istat()
         rows: 0
         bytes: 0
     compact:
+      out:
+        rows: 0
+        bytes: 0
       in:
         rows: 0
         bytes: 0
       count: 0
-      out:
+      debt:
+        rows: 0
+        bytes: 0
+      queue:
         rows: 0
         bytes: 0
     iterator:
@@ -213,6 +219,8 @@ gstat()
 - disk:
     dump_total: 0
     data_size: 0
+    compact_debt: 0
+    compact_queue: 0
     data_files: 0
     compact_total: 0
     index_size: 0
@@ -976,11 +984,17 @@ istat()
         rows: 0
         bytes: 0
     compact:
+      out:
+        rows: 0
+        bytes: 0
       in:
         rows: 0
         bytes: 0
       count: 0
-      out:
+      debt:
+        rows: 0
+        bytes: 0
+      queue:
         rows: 0
         bytes: 0
     iterator:
@@ -1026,6 +1040,8 @@ gstat()
 - disk:
     dump_total: 0
     data_size: 104300
+    compact_debt: 0
+    compact_queue: 0
     data_files: 2
     compact_total: 0
     index_size: 1190
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 7/7] vinyl: keep track of disk idle time
  2018-09-02 20:18 [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov
                   ` (5 preceding siblings ...)
  2018-09-02 20:18 ` [PATCH 6/7] vinyl: keep track of compaction queue length and debt Vladimir Davydov
@ 2018-09-02 20:19 ` Vladimir Davydov
  2018-09-04 11:54   ` Vladimir Davydov
  2018-09-09 11:41 ` [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov
  7 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-02 20:19 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

To understand whether the disk is fully utilized or can still handle
more compaction load and make right decisions regarding transaction
throttling, we need a metric that would report how much time worker
threads spent being idle. So this patch adds a new metric to global
statistics, box.stat.vinyl().disk.idle_ratio. The metric is updated
on each dump using the following formula:

                       idle_time
  idle_ratio = --------------------------
               dump_period * worker_count

where idle_time is the total amount of time workers were idle between
the last two dumps, dump_period is the time that passed between the last
two dumps, worker_count is the number of workers.

The value of the new metric always lays between 0 inclusive and 1
exclusive. The closer it is to 1 the more busy the disk is.
---
 src/box/vinyl.c            |  1 +
 src/box/vy_scheduler.c     | 48 +++++++++++++++++++++++++++++++----
 src/box/vy_scheduler.h     | 12 +++++++++
 test/vinyl/errinj.result   | 62 ++++++++++++++++++++++++++++++++++++++++++++++
 test/vinyl/errinj.test.lua | 22 ++++++++++++++++
 test/vinyl/info.result     |  1 +
 test/vinyl/info.test.lua   |  1 +
 7 files changed, 142 insertions(+), 5 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 416c9824..e140f03c 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -311,6 +311,7 @@ vy_info_append_disk(struct vy_env *env, struct info_handler *h)
 	info_append_int(h, "compact_total", lsm_env->compact_total);
 	info_append_int(h, "compact_queue", lsm_env->compact_queue);
 	info_append_int(h, "compact_debt", lsm_env->compact_debt);
+	info_append_double(h, "idle_ratio", env->scheduler.idle_ratio);
 	info_table_end(h);
 }
 
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 580c3129..702f426c 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -96,6 +96,10 @@ struct vy_worker {
 	struct vy_task *task;
 	/** Link in vy_scheduler::idle_workers. */
 	struct stailq_entry in_idle;
+	/** Time when this worker became idle. */
+	double idle_start;
+	/** How much time this worker have been idle. */
+	double idle_time;
 	/** Route for sending deferred DELETEs back to tx. */
 	struct cmsg_hop deferred_delete_route[2];
 };
@@ -346,6 +350,7 @@ vy_scheduler_start_workers(struct vy_scheduler *scheduler)
 	if (scheduler->worker_pool == NULL)
 		panic("failed to allocate vinyl worker pool");
 
+	double now = ev_monotonic_now(loop());
 	for (int i = 0; i < scheduler->worker_pool_size; i++) {
 		char name[FIBER_NAME_MAX];
 		snprintf(name, sizeof(name), "vinyl.writer.%d", i);
@@ -355,6 +360,7 @@ vy_scheduler_start_workers(struct vy_scheduler *scheduler)
 		cpipe_create(&worker->worker_pipe, name);
 		stailq_add_tail_entry(&scheduler->idle_workers,
 				      worker, in_idle);
+		worker->idle_start = now;
 
 		struct cmsg_hop *route = worker->deferred_delete_route;
 		route[0].f = vy_deferred_delete_batch_process_f;
@@ -407,6 +413,7 @@ vy_scheduler_create(struct vy_scheduler *scheduler, int write_threads,
 
 	diag_create(&scheduler->diag);
 	fiber_cond_create(&scheduler->dump_cond);
+	scheduler->dump_end = ev_monotonic_now(loop());
 
 	fiber_start(scheduler->scheduler_fiber, scheduler);
 }
@@ -548,6 +555,27 @@ vy_scheduler_force_compaction(struct vy_scheduler *scheduler,
 }
 
 /**
+ * Return total time workers have spent idle.
+ */
+static double
+vy_scheduler_get_idle_time(struct vy_scheduler *scheduler)
+{
+	double idle_time = 0;
+	double now = ev_monotonic_now(loop());
+
+	struct vy_worker *worker;
+	for (int i = 0; i < scheduler->worker_pool_size; i++) {
+		worker = &scheduler->worker_pool[i];
+		idle_time += worker->idle_time;
+	}
+
+	stailq_foreach_entry(worker, &scheduler->idle_workers, in_idle)
+		idle_time += now - worker->idle_start;
+
+	return idle_time;
+}
+
+/**
  * Check whether the current dump round is complete.
  * If it is, free memory and proceed to the next dump round.
  */
@@ -585,7 +613,11 @@ vy_scheduler_complete_dump(struct vy_scheduler *scheduler)
 	 */
 	double now = ev_monotonic_now(loop());
 	double dump_duration = now - scheduler->dump_start;
-	scheduler->dump_start = now;
+	double idle_time = vy_scheduler_get_idle_time(scheduler);
+	scheduler->idle_ratio = (idle_time - scheduler->idle_time_at_dump) /
+		(now - scheduler->dump_end) / scheduler->worker_pool_size;
+	scheduler->idle_time_at_dump = idle_time;
+	scheduler->dump_start = scheduler->dump_end = now;
 	scheduler->dump_generation = min_generation;
 	scheduler->dump_complete_cb(scheduler,
 			min_generation - 1, dump_duration);
@@ -1900,7 +1932,9 @@ vy_scheduler_f(va_list va)
 	while (scheduler->scheduler_fiber != NULL) {
 		struct stailq processed_tasks;
 		struct vy_task *task, *next;
+		struct vy_worker *worker;
 		int tasks_failed = 0, tasks_done = 0;
+		double now = ev_monotonic_now(loop());
 
 		/* Get the list of processed tasks. */
 		stailq_create(&processed_tasks);
@@ -1913,8 +1947,10 @@ vy_scheduler_f(va_list va)
 				tasks_failed++;
 			else
 				tasks_done++;
+			worker = task->worker;
 			stailq_add_entry(&scheduler->idle_workers,
-					 task->worker, in_idle);
+					 worker, in_idle);
+			worker->idle_start = now;
 			vy_task_delete(task);
 			scheduler->idle_worker_count++;
 			assert(scheduler->idle_worker_count <=
@@ -1951,11 +1987,13 @@ vy_scheduler_f(va_list va)
 
 		/* Queue the task and notify workers if necessary. */
 		assert(!stailq_empty(&scheduler->idle_workers));
-		task->worker = stailq_shift_entry(&scheduler->idle_workers,
-						  struct vy_worker, in_idle);
+		worker = stailq_shift_entry(&scheduler->idle_workers,
+					    struct vy_worker, in_idle);
+		worker->idle_time += now - worker->idle_start;
 		scheduler->idle_worker_count--;
+		task->worker = worker;
 		cmsg_init(&task->cmsg, vy_task_execute_route);
-		cpipe_push(&task->worker->worker_pipe, &task->cmsg);
+		cpipe_push(&worker->worker_pipe, &task->cmsg);
 
 		fiber_reschedule();
 		continue;
diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
index deefacd7..5524ecce 100644
--- a/src/box/vy_scheduler.h
+++ b/src/box/vy_scheduler.h
@@ -136,6 +136,18 @@ struct vy_scheduler {
 	int dump_task_count;
 	/** Time when the current dump round started. */
 	double dump_start;
+	/** Time when the last dump round ended. */
+	double dump_end;
+	/**
+	 * Total amount of time worker threads have been idle,
+	 * taken at the time when the last dump round completed.
+	 */
+	double idle_time_at_dump;
+	/**
+	 * How much time worker threads were idle between the last
+	 * two dump, relative to the dump period.
+	 */
+	double idle_ratio;
 	/** Signaled on dump round completion. */
 	struct fiber_cond dump_cond;
 	/**
diff --git a/test/vinyl/errinj.result b/test/vinyl/errinj.result
index 7b880030..bb5377b4 100644
--- a/test/vinyl/errinj.result
+++ b/test/vinyl/errinj.result
@@ -2244,6 +2244,68 @@ i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
 ---
 - true
 ...
+s:truncate()
+---
+...
+box.stat.reset()
+---
+...
+-- Check disk.idle_ratio statistic.
+errinj.set('ERRINJ_VY_RUN_WRITE_TIMEOUT', 0.01)
+---
+- ok
+...
+start = fiber.time()
+---
+...
+dump()
+---
+...
+fiber.sleep(fiber.time() - start)
+---
+...
+dump()
+---
+...
+-- one worker is busy half of the time
+expected = 1 - 1 / (2 * box.cfg.vinyl_write_threads)
+---
+...
+math.abs(box.stat.vinyl().disk.idle_ratio - expected) < 0.1
+---
+- true
+...
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', true)
+---
+- ok
+...
+start = fiber.time()
+---
+...
+dump()
+---
+...
+fiber.sleep(fiber.time() - start)
+---
+...
+dump()
+---
+...
+-- one worker is busy all the time, plus one half of the time
+expected = 1 - 3 / (2 * box.cfg.vinyl_write_threads)
+---
+...
+math.abs(box.stat.vinyl().disk.idle_ratio - expected) < 0.1
+---
+- true
+...
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', false)
+---
+- ok
+...
+while i:stat().disk.compact.count < 1 do fiber.sleep(0.01) end
+---
+...
 s:drop()
 ---
 ...
diff --git a/test/vinyl/errinj.test.lua b/test/vinyl/errinj.test.lua
index 9037bfad..835f4540 100644
--- a/test/vinyl/errinj.test.lua
+++ b/test/vinyl/errinj.test.lua
@@ -883,4 +883,26 @@ i:stat().disk.compact.queue -- none
 i:stat().disk.compact.debt -- none
 i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
 i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+s:truncate()
+box.stat.reset()
+
+-- Check disk.idle_ratio statistic.
+errinj.set('ERRINJ_VY_RUN_WRITE_TIMEOUT', 0.01)
+start = fiber.time()
+dump()
+fiber.sleep(fiber.time() - start)
+dump()
+-- one worker is busy half of the time
+expected = 1 - 1 / (2 * box.cfg.vinyl_write_threads)
+math.abs(box.stat.vinyl().disk.idle_ratio - expected) < 0.1
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', true)
+start = fiber.time()
+dump()
+fiber.sleep(fiber.time() - start)
+dump()
+-- one worker is busy all the time, plus one half of the time
+expected = 1 - 3 / (2 * box.cfg.vinyl_write_threads)
+math.abs(box.stat.vinyl().disk.idle_ratio - expected) < 0.1
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', false)
+while i:stat().disk.compact.count < 1 do fiber.sleep(0.01) end
 s:drop()
diff --git a/test/vinyl/info.result b/test/vinyl/info.result
index 556f5eca..4340be91 100644
--- a/test/vinyl/info.result
+++ b/test/vinyl/info.result
@@ -102,6 +102,7 @@ function gstat()
     st.quota.use_rate = nil
     st.quota.dump_bandwidth = nil
     st.quota.watermark = nil
+    st.disk.idle_ratio = nil
     return st
 end;
 ---
diff --git a/test/vinyl/info.test.lua b/test/vinyl/info.test.lua
index 919dde63..fe070416 100644
--- a/test/vinyl/info.test.lua
+++ b/test/vinyl/info.test.lua
@@ -84,6 +84,7 @@ function gstat()
     st.quota.use_rate = nil
     st.quota.dump_bandwidth = nil
     st.quota.watermark = nil
+    st.disk.idle_ratio = nil
     return st
 end;
 
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [tarantool-patches] Re: [PATCH 1/7] vinyl: fix accounting of secondary index cache statements
  2018-09-02 20:18 ` [PATCH 1/7] vinyl: fix accounting of secondary index cache statements Vladimir Davydov
@ 2018-09-02 22:26   ` Konstantin Osipov
  0 siblings, 0 replies; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-02 22:26 UTC (permalink / raw)
  To: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/02 23:23]:
> Since commit 0c5e6cc8c80b ("vinyl: store full tuples in secondary index
> cache"), we store primary index tuples in secondary index cache, but we
> still account them as separate tuples. Fix that.

OK to push.

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [tarantool-patches] Re: [PATCH 2/7] vinyl: add global memory stats
  2018-09-02 20:18 ` [PATCH 2/7] vinyl: add global memory stats Vladimir Davydov
@ 2018-09-02 22:27   ` Konstantin Osipov
  2018-09-02 22:27   ` Konstantin Osipov
  1 sibling, 0 replies; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-02 22:27 UTC (permalink / raw)
  To: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/02 23:23]:
> box.info.memory() gives you some insight on what memory is used for,
> but it's very coarse. For vinyl we need finer grained global memory
> statistics.

OK to push.

 

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [tarantool-patches] Re: [PATCH 2/7] vinyl: add global memory stats
  2018-09-02 20:18 ` [PATCH 2/7] vinyl: add global memory stats Vladimir Davydov
  2018-09-02 22:27   ` [tarantool-patches] " Konstantin Osipov
@ 2018-09-02 22:27   ` Konstantin Osipov
  2018-09-03  8:10     ` Vladimir Davydov
  1 sibling, 1 reply; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-02 22:27 UTC (permalink / raw)
  To: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/02 23:23]:
> box.info.memory() gives you some insight on what memory is used for,
> but it's very coarse. For vinyl we need finer grained global memory
> statistics.
> 
> This patch adds such: they are reported under box.stat.vinyl().memory
> and consist of the following entries:
> 
>  - level0: sum size of level-0 of all LSM trees.
>  - tx: size of memory used by tx write and read sets.
>  - tuple_cache: size of memory occupied by tuple cache.
>  - page_index: size of memory used for storing page indexes.
>  - bloom_filter: size of memory used for storing bloom filters.
> 
> It also removes box.stat.vinyl().cache, as the size of cache is now
> reported under memory.tuple_cache.

Please add a plea to docbot.

 

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [tarantool-patches] Re: [PATCH 3/7] vinyl: add global disk stats
  2018-09-02 20:18 ` [PATCH 3/7] vinyl: add global disk stats Vladimir Davydov
@ 2018-09-02 22:30   ` Konstantin Osipov
  0 siblings, 0 replies; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-02 22:30 UTC (permalink / raw)
  To: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/02 23:23]:
> This patch adds some essential disk statistics to box.stat.vinyl().
> The new statistics are reported under the 'disk' section and currently
> include the following fields:
> 
>  - data_files: number of open data files (*.run).
>  - data_size: size of data stored on disk.
>  - index_size: size of index stored on disk.
>  - dump_total: number of bytes written by dump tasks.
>  - compact_total: number of bytes writted by compaction tasks.

How about also maintaining statistics about the amount of
garbage, i.e. runs removed from an LSM but maintained 'cause of
checkpoint or backup?

These stats would also be useful on per index basis.


The patch itself is OK to push.

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [tarantool-patches] Re: [PATCH 2/7] vinyl: add global memory stats
  2018-09-02 22:27   ` Konstantin Osipov
@ 2018-09-03  8:10     ` Vladimir Davydov
  0 siblings, 0 replies; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-03  8:10 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Mon, Sep 03, 2018 at 01:27:17AM +0300, Konstantin Osipov wrote:
> * Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/02 23:23]:
> > box.info.memory() gives you some insight on what memory is used for,
> > but it's very coarse. For vinyl we need finer grained global memory
> > statistics.
> > 
> > This patch adds such: they are reported under box.stat.vinyl().memory
> > and consist of the following entries:
> > 
> >  - level0: sum size of level-0 of all LSM trees.
> >  - tx: size of memory used by tx write and read sets.
> >  - tuple_cache: size of memory occupied by tuple cache.
> >  - page_index: size of memory used for storing page indexes.
> >  - bloom_filter: size of memory used for storing bloom filters.
> > 
> > It also removes box.stat.vinyl().cache, as the size of cache is now
> > reported under memory.tuple_cache.
> 
> Please add a plea to docbot.

IMO it's pointless to write a docbot request in each patch, because
other vinyl statistics (per index and global) aren't documented, either.
I'll open a separate ticket manually once we are done with throttling.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 7/7] vinyl: keep track of disk idle time
  2018-09-02 20:19 ` [PATCH 7/7] vinyl: keep track of disk idle time Vladimir Davydov
@ 2018-09-04 11:54   ` Vladimir Davydov
  2018-09-04 17:23     ` Vladimir Davydov
  0 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-04 11:54 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

Please ignore this patch, because it's incorrect: since we always
keep one worker reserved for dumps it will should that the disk is
1/box.cfg.vinyl_writers idle even if compaction never stops, which
is misleading. I'll rework and resend it separately.

On Sun, Sep 02, 2018 at 11:19:00PM +0300, Vladimir Davydov wrote:
> To understand whether the disk is fully utilized or can still handle
> more compaction load and make right decisions regarding transaction
> throttling, we need a metric that would report how much time worker
> threads spent being idle. So this patch adds a new metric to global
> statistics, box.stat.vinyl().disk.idle_ratio. The metric is updated
> on each dump using the following formula:
> 
>                        idle_time
>   idle_ratio = --------------------------
>                dump_period * worker_count
> 
> where idle_time is the total amount of time workers were idle between
> the last two dumps, dump_period is the time that passed between the last
> two dumps, worker_count is the number of workers.
> 
> The value of the new metric always lays between 0 inclusive and 1
> exclusive. The closer it is to 1 the more busy the disk is.
> ---
>  src/box/vinyl.c            |  1 +
>  src/box/vy_scheduler.c     | 48 +++++++++++++++++++++++++++++++----
>  src/box/vy_scheduler.h     | 12 +++++++++
>  test/vinyl/errinj.result   | 62 ++++++++++++++++++++++++++++++++++++++++++++++
>  test/vinyl/errinj.test.lua | 22 ++++++++++++++++
>  test/vinyl/info.result     |  1 +
>  test/vinyl/info.test.lua   |  1 +
>  7 files changed, 142 insertions(+), 5 deletions(-)

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 7/7] vinyl: keep track of disk idle time
  2018-09-04 11:54   ` Vladimir Davydov
@ 2018-09-04 17:23     ` Vladimir Davydov
  2018-09-04 17:23       ` [PATCH 1/8] vinyl: add helper to check whether dump is in progress Vladimir Davydov
                         ` (7 more replies)
  0 siblings, 8 replies; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-04 17:23 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

On Tue, Sep 04, 2018 at 02:54:04PM +0300, Vladimir Davydov wrote:
> Please ignore this patch, because it's incorrect: since we always
> keep one worker reserved for dumps it will should that the disk is
> 1/box.cfg.vinyl_writers idle even if compaction never stops, which
> is misleading. I'll rework and resend it separately.

Here goes the new version. The primary difference is that it reports
idle time separately for dump and compaction threads. To achieve that,
I had to introduce separate thread pools for dump and compaction tasks.
There's still one knob though, box.cfg.vinyl_write_threads I assign 25%
of threads (min 1) to dump tasks while the rest goes to compaction tasks
(this is similar to what RocksDB does).

Patches 1 and 2 do some trivial cleanup.

Patches 3 to 7 introduce separate thread pools for different kinds of
background tasks.

Patch 8 adds idle ratio to global statistics.

The branch is the same:

https://github.com/tarantool/tarantool/commits/dv/vy-stats-for-throttling

Vladimir Davydov (8):
  vinyl: add helper to check whether dump is in progress
  vinyl: don't use mempool for allocating background tasks
  vinyl: factor out worker pool from scheduler struct
  vinyl: move worker allocation closer to task creation
  vinyl: use separate thread pools for dump and compaction tasks
  vinyl: zap vy_worker_pool::idle_worker_count
  vinyl: don't start scheduler fiber until local recovery is complete
  vinyl: keep track of thread pool idle ratio

 src/box/vinyl.c            |  44 +++---
 src/box/vy_scheduler.c     | 334 ++++++++++++++++++++++++++++++---------------
 src/box/vy_scheduler.h     |  60 ++++++--
 test/vinyl/errinj.result   |  82 +++++++++++
 test/vinyl/errinj.test.lua |  37 +++++
 test/vinyl/info.result     |  14 +-
 test/vinyl/info.test.lua   |   2 +
 7 files changed, 423 insertions(+), 150 deletions(-)

-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 1/8] vinyl: add helper to check whether dump is in progress
  2018-09-04 17:23     ` Vladimir Davydov
@ 2018-09-04 17:23       ` Vladimir Davydov
  2018-09-06  7:33         ` Konstantin Osipov
  2018-09-04 17:23       ` [PATCH 2/8] vinyl: don't use mempool for allocating background tasks Vladimir Davydov
                         ` (6 subsequent siblings)
  7 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-04 17:23 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

Needed solely to improve code readability. No functional changes.
---
 src/box/vy_scheduler.c | 17 +++++++----------
 src/box/vy_scheduler.h | 13 +++++++++++++
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 580c3129..367c8a20 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -488,8 +488,7 @@ vy_scheduler_unpin_lsm(struct vy_scheduler *scheduler, struct vy_lsm *lsm)
 void
 vy_scheduler_trigger_dump(struct vy_scheduler *scheduler)
 {
-	assert(scheduler->dump_generation <= scheduler->generation);
-	if (scheduler->dump_generation < scheduler->generation) {
+	if (vy_scheduler_dump_in_progress(scheduler)) {
 		/* Dump is already in progress, nothing to do. */
 		return;
 	}
@@ -520,13 +519,13 @@ vy_scheduler_dump(struct vy_scheduler *scheduler)
 		fiber_cond_wait(&scheduler->dump_cond);
 
 	/* Trigger dump. */
-	if (scheduler->generation == scheduler->dump_generation)
+	if (!vy_scheduler_dump_in_progress(scheduler))
 		scheduler->dump_start = ev_monotonic_now(loop());
-	int64_t generation = ++scheduler->generation;
+	scheduler->generation++;
 	fiber_cond_signal(&scheduler->scheduler_cond);
 
 	/* Wait for dump to complete. */
-	while (scheduler->dump_generation < generation) {
+	while (vy_scheduler_dump_in_progress(scheduler)) {
 		if (scheduler->is_throttled) {
 			/* Dump error occurred. */
 			struct error *e = diag_last_error(&scheduler->diag);
@@ -611,8 +610,7 @@ vy_scheduler_begin_checkpoint(struct vy_scheduler *scheduler)
 		return -1;
 	}
 
-	assert(scheduler->dump_generation <= scheduler->generation);
-	if (scheduler->generation == scheduler->dump_generation) {
+	if (!vy_scheduler_dump_in_progress(scheduler)) {
 		/*
 		 * We are about to start a new dump round.
 		 * Remember the current time so that we can update
@@ -638,7 +636,7 @@ vy_scheduler_wait_checkpoint(struct vy_scheduler *scheduler)
 	 * Wait until all in-memory trees created before
 	 * checkpoint started have been dumped.
 	 */
-	while (scheduler->dump_generation < scheduler->generation) {
+	while (vy_scheduler_dump_in_progress(scheduler)) {
 		if (scheduler->is_throttled) {
 			/* A dump error occurred, abort checkpoint. */
 			struct error *e = diag_last_error(&scheduler->diag);
@@ -1730,8 +1728,7 @@ vy_scheduler_peek_dump(struct vy_scheduler *scheduler, struct vy_task **ptask)
 {
 retry:
 	*ptask = NULL;
-	assert(scheduler->dump_generation <= scheduler->generation);
-	if (scheduler->dump_generation == scheduler->generation) {
+	if (!vy_scheduler_dump_in_progress(scheduler)) {
 		/*
 		 * All memory trees of past generations have
 		 * been dumped, nothing to do.
diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
index deefacd7..89f8f170 100644
--- a/src/box/vy_scheduler.h
+++ b/src/box/vy_scheduler.h
@@ -31,6 +31,7 @@
  * SUCH DAMAGE.
  */
 
+#include <assert.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <small/mempool.h>
@@ -151,6 +152,18 @@ struct vy_scheduler {
 };
 
 /**
+ * Return true if memory dump is in progress, i.e. there are
+ * in-memory trees that are being dumped right now or should
+ * be scheduled for dump as soon as possible.
+ */
+static inline bool
+vy_scheduler_dump_in_progress(struct vy_scheduler *scheduler)
+{
+	assert(scheduler->dump_generation <= scheduler->generation);
+	return scheduler->dump_generation < scheduler->generation;
+}
+
+/**
  * Create a scheduler instance.
  */
 void
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 2/8] vinyl: don't use mempool for allocating background tasks
  2018-09-04 17:23     ` Vladimir Davydov
  2018-09-04 17:23       ` [PATCH 1/8] vinyl: add helper to check whether dump is in progress Vladimir Davydov
@ 2018-09-04 17:23       ` Vladimir Davydov
  2018-09-06  7:33         ` Konstantin Osipov
  2018-09-04 17:23       ` [PATCH 3/8] vinyl: factor out worker pool from scheduler struct Vladimir Davydov
                         ` (5 subsequent siblings)
  7 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-04 17:23 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

Background tasks are allocated infrequently, not more often than once
per several seconds, so using mempool for them is unnecessary and only
clutters vy_scheduler struct. Let's allocate them with malloc().
---
 src/box/vy_scheduler.c | 14 +++++---------
 src/box/vy_scheduler.h |  4 ----
 2 files changed, 5 insertions(+), 13 deletions(-)

diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 367c8a20..fc4fdf8c 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -36,7 +36,6 @@
 #include <stddef.h>
 #include <stdint.h>
 #include <stdlib.h>
-#include <small/mempool.h>
 #include <small/rlist.h>
 #include <tarantool_ev.h>
 
@@ -233,10 +232,10 @@ static struct vy_task *
 vy_task_new(struct vy_scheduler *scheduler, struct vy_lsm *lsm,
 	    const struct vy_task_ops *ops)
 {
-	struct vy_task *task = mempool_alloc(&scheduler->task_pool);
+	struct vy_task *task = calloc(1, sizeof(*task));
 	if (task == NULL) {
 		diag_set(OutOfMemory, sizeof(*task),
-			 "mempool", "struct vy_task");
+			 "malloc", "struct vy_task");
 		return NULL;
 	}
 	memset(task, 0, sizeof(*task));
@@ -245,13 +244,13 @@ vy_task_new(struct vy_scheduler *scheduler, struct vy_lsm *lsm,
 	task->lsm = lsm;
 	task->cmp_def = key_def_dup(lsm->cmp_def);
 	if (task->cmp_def == NULL) {
-		mempool_free(&scheduler->task_pool, task);
+		free(task);
 		return NULL;
 	}
 	task->key_def = key_def_dup(lsm->key_def);
 	if (task->key_def == NULL) {
 		key_def_delete(task->cmp_def);
-		mempool_free(&scheduler->task_pool, task);
+		free(task);
 		return NULL;
 	}
 	vy_lsm_ref(lsm);
@@ -270,7 +269,7 @@ vy_task_delete(struct vy_task *task)
 	key_def_delete(task->key_def);
 	vy_lsm_unref(task->lsm);
 	diag_destroy(&task->diag);
-	mempool_free(&task->scheduler->task_pool, task);
+	free(task);
 }
 
 static bool
@@ -397,8 +396,6 @@ vy_scheduler_create(struct vy_scheduler *scheduler, int write_threads,
 	fiber_cond_create(&scheduler->scheduler_cond);
 
 	scheduler->worker_pool_size = write_threads;
-	mempool_create(&scheduler->task_pool, cord_slab_cache(),
-		       sizeof(struct vy_task));
 	stailq_create(&scheduler->idle_workers);
 	stailq_create(&scheduler->processed_tasks);
 
@@ -424,7 +421,6 @@ vy_scheduler_destroy(struct vy_scheduler *scheduler)
 		vy_scheduler_stop_workers(scheduler);
 
 	diag_destroy(&scheduler->diag);
-	mempool_destroy(&scheduler->task_pool);
 	fiber_cond_destroy(&scheduler->dump_cond);
 	fiber_cond_destroy(&scheduler->scheduler_cond);
 	vy_dump_heap_destroy(&scheduler->dump_heap);
diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
index 89f8f170..db85711a 100644
--- a/src/box/vy_scheduler.h
+++ b/src/box/vy_scheduler.h
@@ -34,9 +34,7 @@
 #include <assert.h>
 #include <stdbool.h>
 #include <stdint.h>
-#include <small/mempool.h>
 #include <small/rlist.h>
-#include <tarantool_ev.h>
 
 #include "diag.h"
 #include "fiber_cond.h"
@@ -74,8 +72,6 @@ struct vy_scheduler {
 	int idle_worker_count;
 	/** List of idle workers, linked by vy_worker::in_idle. */
 	struct stailq idle_workers;
-	/** Memory pool used for allocating vy_task objects. */
-	struct mempool task_pool;
 	/** Queue of processed tasks, linked by vy_task::in_processed. */
 	struct stailq processed_tasks;
 	/**
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 3/8] vinyl: factor out worker pool from scheduler struct
  2018-09-04 17:23     ` Vladimir Davydov
  2018-09-04 17:23       ` [PATCH 1/8] vinyl: add helper to check whether dump is in progress Vladimir Davydov
  2018-09-04 17:23       ` [PATCH 2/8] vinyl: don't use mempool for allocating background tasks Vladimir Davydov
@ 2018-09-04 17:23       ` Vladimir Davydov
  2018-09-06  7:34         ` Konstantin Osipov
  2018-09-04 17:23       ` [PATCH 4/8] vinyl: move worker allocation closer to task creation Vladimir Davydov
                         ` (4 subsequent siblings)
  7 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-04 17:23 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

A worker pool is an independent entity that provides the scheduler with
worker threads on demand. Let's factor it out so that we can introduce
separate pools for dump and compaction tasks.
---
 src/box/vy_scheduler.c | 104 ++++++++++++++++++++++++++++++++-----------------
 src/box/vy_scheduler.h |  23 ++++++-----
 2 files changed, 82 insertions(+), 45 deletions(-)

diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index fc4fdf8c..03126e5e 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -93,7 +93,7 @@ struct vy_worker {
 	 * or NULL if the worker is idle.
 	 */
 	struct vy_task *task;
-	/** Link in vy_scheduler::idle_workers. */
+	/** Link in vy_worker_pool::idle_workers. */
 	struct stailq_entry in_idle;
 	/** Route for sending deferred DELETEs back to tx. */
 	struct cmsg_hop deferred_delete_route[2];
@@ -333,27 +333,25 @@ vy_compact_heap_less(struct heap_node *a, struct heap_node *b)
 #undef HEAP_NAME
 
 static void
-vy_scheduler_start_workers(struct vy_scheduler *scheduler)
+vy_worker_pool_start(struct vy_worker_pool *pool)
 {
-	assert(scheduler->worker_pool == NULL);
+	assert(pool->workers == NULL);
 	/* One thread is reserved for dumps, see vy_schedule(). */
-	assert(scheduler->worker_pool_size >= 2);
+	assert(pool->size >= 2);
 
-	scheduler->idle_worker_count = scheduler->worker_pool_size;
-	scheduler->worker_pool = calloc(scheduler->worker_pool_size,
-					sizeof(*scheduler->worker_pool));
-	if (scheduler->worker_pool == NULL)
+	pool->idle_worker_count = pool->size;
+	pool->workers = calloc(pool->size, sizeof(*pool->workers));
+	if (pool->workers == NULL)
 		panic("failed to allocate vinyl worker pool");
 
-	for (int i = 0; i < scheduler->worker_pool_size; i++) {
+	for (int i = 0; i < pool->size; i++) {
 		char name[FIBER_NAME_MAX];
 		snprintf(name, sizeof(name), "vinyl.writer.%d", i);
-		struct vy_worker *worker = &scheduler->worker_pool[i];
+		struct vy_worker *worker = &pool->workers[i];
 		if (cord_costart(&worker->cord, name, vy_worker_f, worker) != 0)
 			panic("failed to start vinyl worker thread");
 		cpipe_create(&worker->worker_pipe, name);
-		stailq_add_tail_entry(&scheduler->idle_workers,
-				      worker, in_idle);
+		stailq_add_tail_entry(&pool->idle_workers, worker, in_idle);
 
 		struct cmsg_hop *route = worker->deferred_delete_route;
 		route[0].f = vy_deferred_delete_batch_process_f;
@@ -364,17 +362,60 @@ vy_scheduler_start_workers(struct vy_scheduler *scheduler)
 }
 
 static void
-vy_scheduler_stop_workers(struct vy_scheduler *scheduler)
+vy_worker_pool_stop(struct vy_worker_pool *pool)
 {
-	assert(scheduler->worker_pool != NULL);
-	for (int i = 0; i < scheduler->worker_pool_size; i++) {
-		struct vy_worker *worker = &scheduler->worker_pool[i];
+	assert(pool->workers != NULL);
+	for (int i = 0; i < pool->size; i++) {
+		struct vy_worker *worker = &pool->workers[i];
 		cbus_stop_loop(&worker->worker_pipe);
 		cpipe_destroy(&worker->worker_pipe);
 		cord_join(&worker->cord);
 	}
-	free(scheduler->worker_pool);
-	scheduler->worker_pool = NULL;
+	free(pool->workers);
+	pool->workers = NULL;
+}
+
+static void
+vy_worker_pool_create(struct vy_worker_pool *pool, int size)
+{
+	pool->size = size;
+	pool->workers = NULL;
+	stailq_create(&pool->idle_workers);
+	pool->idle_worker_count = 0;
+}
+
+static void
+vy_worker_pool_destroy(struct vy_worker_pool *pool)
+{
+	if (pool->workers != NULL)
+		vy_worker_pool_stop(pool);
+}
+
+/**
+ * Get an idle worker from a pool.
+ */
+static struct vy_worker *
+vy_worker_pool_get(struct vy_worker_pool *pool)
+{
+	struct vy_worker *worker = NULL;
+	if (!stailq_empty(&pool->idle_workers)) {
+		assert(pool->idle_worker_count > 0);
+		pool->idle_worker_count--;
+		worker = stailq_shift_entry(&pool->idle_workers,
+					    struct vy_worker, in_idle);
+	}
+	return worker;
+}
+
+/**
+ * Put a worker back to a pool once it's done its job.
+ */
+static void
+vy_worker_pool_put(struct vy_worker_pool *pool, struct vy_worker *worker)
+{
+	assert(pool->idle_worker_count < pool->size);
+	pool->idle_worker_count++;
+	stailq_add_entry(&pool->idle_workers, worker, in_idle);
 }
 
 void
@@ -394,9 +435,7 @@ vy_scheduler_create(struct vy_scheduler *scheduler, int write_threads,
 		panic("failed to allocate vinyl scheduler fiber");
 
 	fiber_cond_create(&scheduler->scheduler_cond);
-
-	scheduler->worker_pool_size = write_threads;
-	stailq_create(&scheduler->idle_workers);
+	vy_worker_pool_create(&scheduler->worker_pool, write_threads);
 	stailq_create(&scheduler->processed_tasks);
 
 	vy_dump_heap_create(&scheduler->dump_heap);
@@ -417,9 +456,7 @@ vy_scheduler_destroy(struct vy_scheduler *scheduler)
 	fiber_cond_signal(&scheduler->dump_cond);
 	fiber_cond_signal(&scheduler->scheduler_cond);
 
-	if (scheduler->worker_pool != NULL)
-		vy_scheduler_stop_workers(scheduler);
-
+	vy_worker_pool_destroy(&scheduler->worker_pool);
 	diag_destroy(&scheduler->diag);
 	fiber_cond_destroy(&scheduler->dump_cond);
 	fiber_cond_destroy(&scheduler->scheduler_cond);
@@ -1812,7 +1849,7 @@ vy_schedule(struct vy_scheduler *scheduler, struct vy_task **ptask)
 	if (*ptask != NULL)
 		return 0;
 
-	if (scheduler->idle_worker_count <= 1) {
+	if (scheduler->worker_pool.idle_worker_count <= 1) {
 		/*
 		 * If all worker threads are busy doing compaction
 		 * when we run out of quota, ongoing transactions will
@@ -1888,7 +1925,7 @@ vy_scheduler_f(va_list va)
 	if (scheduler->scheduler_fiber == NULL)
 		return 0; /* destroyed */
 
-	vy_scheduler_start_workers(scheduler);
+	vy_worker_pool_start(&scheduler->worker_pool);
 
 	while (scheduler->scheduler_fiber != NULL) {
 		struct stailq processed_tasks;
@@ -1906,12 +1943,9 @@ vy_scheduler_f(va_list va)
 				tasks_failed++;
 			else
 				tasks_done++;
-			stailq_add_entry(&scheduler->idle_workers,
-					 task->worker, in_idle);
+			vy_worker_pool_put(&scheduler->worker_pool,
+					   task->worker);
 			vy_task_delete(task);
-			scheduler->idle_worker_count++;
-			assert(scheduler->idle_worker_count <=
-			       scheduler->worker_pool_size);
 		}
 		/*
 		 * Reset the timeout if we managed to successfully
@@ -1933,7 +1967,7 @@ vy_scheduler_f(va_list va)
 		if (tasks_failed > 0)
 			goto error;
 		/* All worker threads are busy. */
-		if (scheduler->idle_worker_count == 0)
+		if (scheduler->worker_pool.idle_worker_count == 0)
 			goto wait;
 		/* Get a task to schedule. */
 		if (vy_schedule(scheduler, &task) != 0)
@@ -1943,10 +1977,8 @@ vy_scheduler_f(va_list va)
 			goto wait;
 
 		/* Queue the task and notify workers if necessary. */
-		assert(!stailq_empty(&scheduler->idle_workers));
-		task->worker = stailq_shift_entry(&scheduler->idle_workers,
-						  struct vy_worker, in_idle);
-		scheduler->idle_worker_count--;
+		task->worker = vy_worker_pool_get(&scheduler->worker_pool);
+		assert(task->worker != NULL);
 		cmsg_init(&task->cmsg, vy_task_execute_route);
 		cpipe_push(&task->worker->worker_pipe, &task->cmsg);
 
diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
index db85711a..606f3b31 100644
--- a/src/box/vy_scheduler.h
+++ b/src/box/vy_scheduler.h
@@ -56,22 +56,27 @@ typedef void
 (*vy_scheduler_dump_complete_f)(struct vy_scheduler *scheduler,
 				int64_t dump_generation, double dump_duration);
 
+struct vy_worker_pool {
+	/** Number of worker threads in the pool. */
+	int size;
+	/** Array of all worker threads in the pool. */
+	struct vy_worker *workers;
+	/** List of workers that are currently idle. */
+	struct stailq idle_workers;
+	/** Length of @idle_workers list. */
+	int idle_worker_count;
+};
+
 struct vy_scheduler {
 	/** Scheduler fiber. */
 	struct fiber *scheduler_fiber;
 	/** Used to wake up the scheduler fiber from TX. */
 	struct fiber_cond scheduler_cond;
 	/**
-	 * Array of worker threads used for performing
-	 * dump/compaction tasks.
+	 * Pool of threads used for performing dump/compaction
+	 * tasks in the background.
 	 */
-	struct vy_worker *worker_pool;
-	/** Total number of worker threads. */
-	int worker_pool_size;
-	/** Number worker threads that are currently idle. */
-	int idle_worker_count;
-	/** List of idle workers, linked by vy_worker::in_idle. */
-	struct stailq idle_workers;
+	struct vy_worker_pool worker_pool;
 	/** Queue of processed tasks, linked by vy_task::in_processed. */
 	struct stailq processed_tasks;
 	/**
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 4/8] vinyl: move worker allocation closer to task creation
  2018-09-04 17:23     ` Vladimir Davydov
                         ` (2 preceding siblings ...)
  2018-09-04 17:23       ` [PATCH 3/8] vinyl: factor out worker pool from scheduler struct Vladimir Davydov
@ 2018-09-04 17:23       ` Vladimir Davydov
  2018-09-06  7:35         ` Konstantin Osipov
  2018-09-04 17:23       ` [PATCH 5/8] vinyl: use separate thread pools for dump and compaction tasks Vladimir Davydov
                         ` (3 subsequent siblings)
  7 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-04 17:23 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

Call vy_worker_pool_get() from vy_scheduler_peek_{dump,compaction} so
that we can use different worker pools for dump and compaction tasks.
---
 src/box/vy_scheduler.c | 80 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 49 insertions(+), 31 deletions(-)

diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 03126e5e..5d6960cd 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -229,8 +229,8 @@ vy_task_deferred_delete_iface;
  * does not free it from under us.
  */
 static struct vy_task *
-vy_task_new(struct vy_scheduler *scheduler, struct vy_lsm *lsm,
-	    const struct vy_task_ops *ops)
+vy_task_new(struct vy_scheduler *scheduler, struct vy_worker *worker,
+	    struct vy_lsm *lsm, const struct vy_task_ops *ops)
 {
 	struct vy_task *task = calloc(1, sizeof(*task));
 	if (task == NULL) {
@@ -241,6 +241,7 @@ vy_task_new(struct vy_scheduler *scheduler, struct vy_lsm *lsm,
 	memset(task, 0, sizeof(*task));
 	task->ops = ops;
 	task->scheduler = scheduler;
+	task->worker = worker;
 	task->lsm = lsm;
 	task->cmp_def = key_def_dup(lsm->cmp_def);
 	if (task->cmp_def == NULL) {
@@ -1269,8 +1270,8 @@ vy_task_dump_abort(struct vy_task *task)
  * trees created at @scheduler->dump_generation.
  */
 static int
-vy_task_dump_new(struct vy_scheduler *scheduler, struct vy_lsm *lsm,
-		 struct vy_task **p_task)
+vy_task_dump_new(struct vy_scheduler *scheduler, struct vy_worker *worker,
+		 struct vy_lsm *lsm, struct vy_task **p_task)
 {
 	static struct vy_task_ops dump_ops = {
 		.execute = vy_task_dump_execute,
@@ -1323,7 +1324,7 @@ vy_task_dump_new(struct vy_scheduler *scheduler, struct vy_lsm *lsm,
 		return 0;
 	}
 
-	struct vy_task *task = vy_task_new(scheduler, lsm, &dump_ops);
+	struct vy_task *task = vy_task_new(scheduler, worker, lsm, &dump_ops);
 	if (task == NULL)
 		goto err;
 
@@ -1579,8 +1580,8 @@ vy_task_compact_abort(struct vy_task *task)
 }
 
 static int
-vy_task_compact_new(struct vy_scheduler *scheduler, struct vy_lsm *lsm,
-		    struct vy_task **p_task)
+vy_task_compact_new(struct vy_scheduler *scheduler, struct vy_worker *worker,
+		    struct vy_lsm *lsm, struct vy_task **p_task)
 {
 	static struct vy_task_ops compact_ops = {
 		.execute = vy_task_compact_execute,
@@ -1604,7 +1605,7 @@ vy_task_compact_new(struct vy_scheduler *scheduler, struct vy_lsm *lsm,
 		return 0;
 	}
 
-	struct vy_task *task = vy_task_new(scheduler, lsm, &compact_ops);
+	struct vy_task *task = vy_task_new(scheduler, worker, lsm, &compact_ops);
 	if (task == NULL)
 		goto err_task;
 
@@ -1746,8 +1747,8 @@ vy_task_complete_f(struct cmsg *cmsg)
 
 /**
  * Create a task for dumping an LSM tree. The new task is returned
- * in @ptask. If there's no LSM tree that needs to be dumped @ptask
- * is set to NULL.
+ * in @ptask. If there's no LSM tree that needs to be dumped or all
+ * workers are currently busy, @ptask is set to NULL.
  *
  * We only dump an LSM tree if it needs to be snapshotted or the quota
  * on memory usage is exceeded. In either case, the oldest LSM tree
@@ -1759,6 +1760,7 @@ vy_task_complete_f(struct cmsg *cmsg)
 static int
 vy_scheduler_peek_dump(struct vy_scheduler *scheduler, struct vy_task **ptask)
 {
+	struct vy_worker *worker = NULL;
 retry:
 	*ptask = NULL;
 	if (!vy_scheduler_dump_in_progress(scheduler)) {
@@ -1766,7 +1768,7 @@ retry:
 		 * All memory trees of past generations have
 		 * been dumped, nothing to do.
 		 */
-		return 0;
+		goto no_task;
 	}
 	/*
 	 * Look up the oldest LSM tree eligible for dump.
@@ -1778,7 +1780,7 @@ retry:
 		 * Complete the current dump round.
 		 */
 		vy_scheduler_complete_dump(scheduler);
-		return 0;
+		goto no_task;
 	}
 	struct vy_lsm *lsm = container_of(pn, struct vy_lsm, in_dump);
 	if (!lsm->is_dumping && lsm->pin_count == 0 &&
@@ -1788,8 +1790,15 @@ retry:
 		 * contains data that must be dumped at the current
 		 * round. Try to create a task for it.
 		 */
-		if (vy_task_dump_new(scheduler, lsm, ptask) != 0)
+		if (worker == NULL) {
+			worker = vy_worker_pool_get(&scheduler->worker_pool);
+			if (worker == NULL)
+				return 0; /* all workers are busy */
+		}
+		if (vy_task_dump_new(scheduler, worker, lsm, ptask) != 0) {
+			vy_worker_pool_put(&scheduler->worker_pool, worker);
 			return -1;
+		}
 		if (*ptask != NULL)
 			return 0; /* new task */
 		/*
@@ -1805,13 +1814,16 @@ retry:
 	 * is complete.
 	 */
 	assert(scheduler->dump_task_count > 0);
+no_task:
+	if (worker != NULL)
+		vy_worker_pool_put(&scheduler->worker_pool, worker);
 	return 0;
 }
 
 /**
  * Create a task for compacting a range. The new task is returned
- * in @ptask. If there's no range that needs to be compacted @ptask
- * is set to NULL.
+ * in @ptask. If there's no range that needs to be compacted or all
+ * workers are currently busy, @ptask is set to NULL.
  *
  * We compact ranges that have more runs in a level than specified
  * by run_count_per_level configuration option. Among those runs we
@@ -1824,19 +1836,31 @@ static int
 vy_scheduler_peek_compact(struct vy_scheduler *scheduler,
 			  struct vy_task **ptask)
 {
+	struct vy_worker *worker = NULL;
 retry:
 	*ptask = NULL;
 	struct heap_node *pn = vy_compact_heap_top(&scheduler->compact_heap);
 	if (pn == NULL)
-		return 0; /* nothing to do */
+		goto no_task; /* nothing to do */
 	struct vy_lsm *lsm = container_of(pn, struct vy_lsm, in_compact);
 	if (vy_lsm_compact_priority(lsm) <= 1)
-		return 0; /* nothing to do */
-	if (vy_task_compact_new(scheduler, lsm, ptask) != 0)
+		goto no_task; /* nothing to do */
+	if (worker == NULL) {
+		worker = vy_worker_pool_get(&scheduler->worker_pool);
+		if (worker == NULL)
+			return 0; /* all workers are busy */
+	}
+	if (vy_task_compact_new(scheduler, worker, lsm, ptask) != 0) {
+		vy_worker_pool_put(&scheduler->worker_pool, worker);
 		return -1;
+	}
 	if (*ptask == NULL)
 		goto retry; /* LSM tree dropped or range split/coalesced */
 	return 0; /* new task */
+no_task:
+	if (worker != NULL)
+		vy_worker_pool_put(&scheduler->worker_pool, worker);
+	return 0;
 }
 
 static int
@@ -1966,19 +1990,17 @@ vy_scheduler_f(va_list va)
 		/* Throttle for a while if a task failed. */
 		if (tasks_failed > 0)
 			goto error;
-		/* All worker threads are busy. */
-		if (scheduler->worker_pool.idle_worker_count == 0)
-			goto wait;
 		/* Get a task to schedule. */
 		if (vy_schedule(scheduler, &task) != 0)
 			goto error;
-		/* Nothing to do. */
-		if (task == NULL)
-			goto wait;
+		/* Nothing to do or all workers are busy. */
+		if (task == NULL) {
+			/* Wait for changes. */
+			fiber_cond_wait(&scheduler->scheduler_cond);
+			continue;
+		}
 
-		/* Queue the task and notify workers if necessary. */
-		task->worker = vy_worker_pool_get(&scheduler->worker_pool);
-		assert(task->worker != NULL);
+		/* Queue the task for execution. */
 		cmsg_init(&task->cmsg, vy_task_execute_route);
 		cpipe_push(&task->worker->worker_pipe, &task->cmsg);
 
@@ -2008,10 +2030,6 @@ error:
 		scheduler->is_throttled = true;
 		fiber_sleep(scheduler->timeout);
 		scheduler->is_throttled = false;
-		continue;
-wait:
-		/* Wait for changes */
-		fiber_cond_wait(&scheduler->scheduler_cond);
 	}
 
 	return 0;
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 5/8] vinyl: use separate thread pools for dump and compaction tasks
  2018-09-04 17:23     ` Vladimir Davydov
                         ` (3 preceding siblings ...)
  2018-09-04 17:23       ` [PATCH 4/8] vinyl: move worker allocation closer to task creation Vladimir Davydov
@ 2018-09-04 17:23       ` Vladimir Davydov
  2018-09-06  7:37         ` Konstantin Osipov
  2018-09-04 17:23       ` [PATCH 6/8] vinyl: zap vy_worker_pool::idle_worker_count Vladimir Davydov
                         ` (2 subsequent siblings)
  7 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-04 17:23 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

Using the same thread pool for both dump and compaction tasks makes
estimation of dump bandwidth unstable. For instance, if we have four
worker threads, then the observed dump bandwidth may vary from X if
there's high compaction demand and all worker threads tend to be busy
with compaction tasks to 4 * X if there's no compaction demand. As a
result, we can overestimate the dump bandwidth and trigger dump when
it's too late, which will result in hitting the limit before dump is
complete and hence stalling write transactions, which is unacceptable.

To avoid that, let's separate thread pools used for dump and compaction
tasks. Since LSM tree based design typically implies high levels of
write amplification, let's allocate 1/4th of all threads for dump tasks
and use the rest exclusively for compaction.
---
 src/box/vy_scheduler.c | 74 ++++++++++++++++++++++++++++++--------------------
 src/box/vy_scheduler.h | 11 ++++----
 2 files changed, 51 insertions(+), 34 deletions(-)

diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 5d6960cd..dbc3ca69 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -88,6 +88,8 @@ struct vy_worker {
 	struct cpipe worker_pipe;
 	/** Pipe from the worker thread to tx. */
 	struct cpipe tx_pipe;
+	/** Pool this worker was allocated from. */
+	struct vy_worker_pool *pool;
 	/**
 	 * Task that is currently being executed by the worker
 	 * or NULL if the worker is idle.
@@ -337,8 +339,6 @@ static void
 vy_worker_pool_start(struct vy_worker_pool *pool)
 {
 	assert(pool->workers == NULL);
-	/* One thread is reserved for dumps, see vy_schedule(). */
-	assert(pool->size >= 2);
 
 	pool->idle_worker_count = pool->size;
 	pool->workers = calloc(pool->size, sizeof(*pool->workers));
@@ -347,10 +347,12 @@ vy_worker_pool_start(struct vy_worker_pool *pool)
 
 	for (int i = 0; i < pool->size; i++) {
 		char name[FIBER_NAME_MAX];
-		snprintf(name, sizeof(name), "vinyl.writer.%d", i);
+		snprintf(name, sizeof(name), "vinyl.%s.%d", pool->name, i);
 		struct vy_worker *worker = &pool->workers[i];
 		if (cord_costart(&worker->cord, name, vy_worker_f, worker) != 0)
 			panic("failed to start vinyl worker thread");
+
+		worker->pool = pool;
 		cpipe_create(&worker->worker_pipe, name);
 		stailq_add_tail_entry(&pool->idle_workers, worker, in_idle);
 
@@ -377,8 +379,9 @@ vy_worker_pool_stop(struct vy_worker_pool *pool)
 }
 
 static void
-vy_worker_pool_create(struct vy_worker_pool *pool, int size)
+vy_worker_pool_create(struct vy_worker_pool *pool, const char *name, int size)
 {
+	pool->name = name;
 	pool->size = size;
 	pool->workers = NULL;
 	stailq_create(&pool->idle_workers);
@@ -404,16 +407,19 @@ vy_worker_pool_get(struct vy_worker_pool *pool)
 		pool->idle_worker_count--;
 		worker = stailq_shift_entry(&pool->idle_workers,
 					    struct vy_worker, in_idle);
+		assert(worker->pool == pool);
 	}
 	return worker;
 }
 
 /**
- * Put a worker back to a pool once it's done its job.
+ * Put a worker back to the pool it was allocated from once
+ * it's done its job.
  */
 static void
-vy_worker_pool_put(struct vy_worker_pool *pool, struct vy_worker *worker)
+vy_worker_pool_put(struct vy_worker *worker)
 {
+	struct vy_worker_pool *pool = worker->pool;
 	assert(pool->idle_worker_count < pool->size);
 	pool->idle_worker_count++;
 	stailq_add_entry(&pool->idle_workers, worker, in_idle);
@@ -436,7 +442,28 @@ vy_scheduler_create(struct vy_scheduler *scheduler, int write_threads,
 		panic("failed to allocate vinyl scheduler fiber");
 
 	fiber_cond_create(&scheduler->scheduler_cond);
-	vy_worker_pool_create(&scheduler->worker_pool, write_threads);
+
+	/*
+	 * Dump tasks must be scheduled as soon as possible,
+	 * otherwise we may run out of memory quota and have
+	 * to stall transactions. To avoid unpredictably long
+	 * stalls caused by ongoing compaction tasks blocking
+	 * scheduling of dump tasks, we use separate thread
+	 * pools for dump and compaction tasks.
+	 *
+	 * Since a design based on LSM trees typically implies
+	 * high write amplification, we allocate only 1/4th of
+	 * all available threads to dump tasks while the rest
+	 * is used exclusively for compaction.
+	 */
+	assert(write_threads > 1);
+	int dump_threads = MAX(1, write_threads / 4);
+	int compact_threads = write_threads - dump_threads;
+	vy_worker_pool_create(&scheduler->dump_pool,
+			      "dump", dump_threads);
+	vy_worker_pool_create(&scheduler->compact_pool,
+			      "compact", compact_threads);
+
 	stailq_create(&scheduler->processed_tasks);
 
 	vy_dump_heap_create(&scheduler->dump_heap);
@@ -457,7 +484,8 @@ vy_scheduler_destroy(struct vy_scheduler *scheduler)
 	fiber_cond_signal(&scheduler->dump_cond);
 	fiber_cond_signal(&scheduler->scheduler_cond);
 
-	vy_worker_pool_destroy(&scheduler->worker_pool);
+	vy_worker_pool_destroy(&scheduler->dump_pool);
+	vy_worker_pool_destroy(&scheduler->compact_pool);
 	diag_destroy(&scheduler->diag);
 	fiber_cond_destroy(&scheduler->dump_cond);
 	fiber_cond_destroy(&scheduler->scheduler_cond);
@@ -1791,12 +1819,12 @@ retry:
 		 * round. Try to create a task for it.
 		 */
 		if (worker == NULL) {
-			worker = vy_worker_pool_get(&scheduler->worker_pool);
+			worker = vy_worker_pool_get(&scheduler->dump_pool);
 			if (worker == NULL)
 				return 0; /* all workers are busy */
 		}
 		if (vy_task_dump_new(scheduler, worker, lsm, ptask) != 0) {
-			vy_worker_pool_put(&scheduler->worker_pool, worker);
+			vy_worker_pool_put(worker);
 			return -1;
 		}
 		if (*ptask != NULL)
@@ -1816,7 +1844,7 @@ retry:
 	assert(scheduler->dump_task_count > 0);
 no_task:
 	if (worker != NULL)
-		vy_worker_pool_put(&scheduler->worker_pool, worker);
+		vy_worker_pool_put(worker);
 	return 0;
 }
 
@@ -1846,12 +1874,12 @@ retry:
 	if (vy_lsm_compact_priority(lsm) <= 1)
 		goto no_task; /* nothing to do */
 	if (worker == NULL) {
-		worker = vy_worker_pool_get(&scheduler->worker_pool);
+		worker = vy_worker_pool_get(&scheduler->compact_pool);
 		if (worker == NULL)
 			return 0; /* all workers are busy */
 	}
 	if (vy_task_compact_new(scheduler, worker, lsm, ptask) != 0) {
-		vy_worker_pool_put(&scheduler->worker_pool, worker);
+		vy_worker_pool_put(worker);
 		return -1;
 	}
 	if (*ptask == NULL)
@@ -1859,7 +1887,7 @@ retry:
 	return 0; /* new task */
 no_task:
 	if (worker != NULL)
-		vy_worker_pool_put(&scheduler->worker_pool, worker);
+		vy_worker_pool_put(worker);
 	return 0;
 }
 
@@ -1873,18 +1901,6 @@ vy_schedule(struct vy_scheduler *scheduler, struct vy_task **ptask)
 	if (*ptask != NULL)
 		return 0;
 
-	if (scheduler->worker_pool.idle_worker_count <= 1) {
-		/*
-		 * If all worker threads are busy doing compaction
-		 * when we run out of quota, ongoing transactions will
-		 * hang until one of the threads has finished, which
-		 * may take quite a while. To avoid unpredictably long
-		 * stalls, always keep one worker thread reserved for
-		 * dumps.
-		 */
-		return 0;
-	}
-
 	if (vy_scheduler_peek_compact(scheduler, ptask) != 0)
 		goto fail;
 	if (*ptask != NULL)
@@ -1949,7 +1965,8 @@ vy_scheduler_f(va_list va)
 	if (scheduler->scheduler_fiber == NULL)
 		return 0; /* destroyed */
 
-	vy_worker_pool_start(&scheduler->worker_pool);
+	vy_worker_pool_start(&scheduler->dump_pool);
+	vy_worker_pool_start(&scheduler->compact_pool);
 
 	while (scheduler->scheduler_fiber != NULL) {
 		struct stailq processed_tasks;
@@ -1967,8 +1984,7 @@ vy_scheduler_f(va_list va)
 				tasks_failed++;
 			else
 				tasks_done++;
-			vy_worker_pool_put(&scheduler->worker_pool,
-					   task->worker);
+			vy_worker_pool_put(task->worker);
 			vy_task_delete(task);
 		}
 		/*
diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
index 606f3b31..7014f953 100644
--- a/src/box/vy_scheduler.h
+++ b/src/box/vy_scheduler.h
@@ -57,6 +57,8 @@ typedef void
 				int64_t dump_generation, double dump_duration);
 
 struct vy_worker_pool {
+	/** Name of the pool. Used for naming threads. */
+	const char *name;
 	/** Number of worker threads in the pool. */
 	int size;
 	/** Array of all worker threads in the pool. */
@@ -72,11 +74,10 @@ struct vy_scheduler {
 	struct fiber *scheduler_fiber;
 	/** Used to wake up the scheduler fiber from TX. */
 	struct fiber_cond scheduler_cond;
-	/**
-	 * Pool of threads used for performing dump/compaction
-	 * tasks in the background.
-	 */
-	struct vy_worker_pool worker_pool;
+	/** Pool of threads for performing background dumps. */
+	struct vy_worker_pool dump_pool;
+	/** Pool of threads for performing background compactions. */
+	struct vy_worker_pool compact_pool;
 	/** Queue of processed tasks, linked by vy_task::in_processed. */
 	struct stailq processed_tasks;
 	/**
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 6/8] vinyl: zap vy_worker_pool::idle_worker_count
  2018-09-04 17:23     ` Vladimir Davydov
                         ` (4 preceding siblings ...)
  2018-09-04 17:23       ` [PATCH 5/8] vinyl: use separate thread pools for dump and compaction tasks Vladimir Davydov
@ 2018-09-04 17:23       ` Vladimir Davydov
  2018-09-06  7:38         ` Konstantin Osipov
  2018-09-04 17:23       ` [PATCH 7/8] vinyl: don't start scheduler fiber until local recovery is complete Vladimir Davydov
  2018-09-04 17:23       ` [PATCH 8/8] vinyl: keep track of thread pool idle ratio Vladimir Davydov
  7 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-04 17:23 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

It is not used anywhere anymore.
---
 src/box/vy_scheduler.c | 6 ------
 src/box/vy_scheduler.h | 2 --
 2 files changed, 8 deletions(-)

diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index dbc3ca69..4f57c0a0 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -340,7 +340,6 @@ vy_worker_pool_start(struct vy_worker_pool *pool)
 {
 	assert(pool->workers == NULL);
 
-	pool->idle_worker_count = pool->size;
 	pool->workers = calloc(pool->size, sizeof(*pool->workers));
 	if (pool->workers == NULL)
 		panic("failed to allocate vinyl worker pool");
@@ -385,7 +384,6 @@ vy_worker_pool_create(struct vy_worker_pool *pool, const char *name, int size)
 	pool->size = size;
 	pool->workers = NULL;
 	stailq_create(&pool->idle_workers);
-	pool->idle_worker_count = 0;
 }
 
 static void
@@ -403,8 +401,6 @@ vy_worker_pool_get(struct vy_worker_pool *pool)
 {
 	struct vy_worker *worker = NULL;
 	if (!stailq_empty(&pool->idle_workers)) {
-		assert(pool->idle_worker_count > 0);
-		pool->idle_worker_count--;
 		worker = stailq_shift_entry(&pool->idle_workers,
 					    struct vy_worker, in_idle);
 		assert(worker->pool == pool);
@@ -420,8 +416,6 @@ static void
 vy_worker_pool_put(struct vy_worker *worker)
 {
 	struct vy_worker_pool *pool = worker->pool;
-	assert(pool->idle_worker_count < pool->size);
-	pool->idle_worker_count++;
 	stailq_add_entry(&pool->idle_workers, worker, in_idle);
 }
 
diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
index 7014f953..841d5a53 100644
--- a/src/box/vy_scheduler.h
+++ b/src/box/vy_scheduler.h
@@ -65,8 +65,6 @@ struct vy_worker_pool {
 	struct vy_worker *workers;
 	/** List of workers that are currently idle. */
 	struct stailq idle_workers;
-	/** Length of @idle_workers list. */
-	int idle_worker_count;
 };
 
 struct vy_scheduler {
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 7/8] vinyl: don't start scheduler fiber until local recovery is complete
  2018-09-04 17:23     ` Vladimir Davydov
                         ` (5 preceding siblings ...)
  2018-09-04 17:23       ` [PATCH 6/8] vinyl: zap vy_worker_pool::idle_worker_count Vladimir Davydov
@ 2018-09-04 17:23       ` Vladimir Davydov
  2018-09-06  7:39         ` Konstantin Osipov
  2018-09-04 17:23       ` [PATCH 8/8] vinyl: keep track of thread pool idle ratio Vladimir Davydov
  7 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-04 17:23 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

We must not schedule any background jobs during local recovery, because
they may disrupt yet to be recovered data stored on disk. Since we start
the scheduler fiber as soon as the engine is initialized, we have to
pull some tricks to make sure it doesn't schedule any tasks: the
scheduler fiber function yields immediately upon startup; we assume
that it won't be woken up until local recovery is complete, because we
don't set the memory limit until then.

This looks rather flimsy, because the logic is spread among several
seemingly unrelated functions: the scheduler fiber (vy_scheduler_f),
the quota watermark callback (vy_env_quota_exceeded_cb), and the engine
recovery callback (vinyl_engine_begin_initial_recovery), where we leave
the memory limit unset until recovery is complete. The latter isn't even
mentioned in comments, which makes the code difficult to follow. Think
how everything would fall apart should we try to wake up the scheduler
fiber somewhere else for some reason.

This patch attempts to make the code more straightforward by postponing
startup of the scheduler fiber until recovery completion. It also moves
the comment explaining why we can't schedule tasks during local recovery
from vy_env_quota_exceeded_cb to vinyl_engine_begin_initial_recovery,
because this is where we actually omit the scheduler fiber startup.

Note, since now the scheduler fiber goes straight to business once
started, we can't start worker threads in the fiber function as we used
to, because then workers threads would be running even if vinyl was
unused. So we move this code to vy_worker_pool_get, which is called when
a worker is actually needed to run a task.
---
 src/box/vinyl.c        | 39 ++++++++++++++++++++-------------------
 src/box/vy_scheduler.c | 27 ++++++++++++---------------
 src/box/vy_scheduler.h |  6 ++++++
 3 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 416c9824..1cf9ad16 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -2420,21 +2420,6 @@ vy_env_quota_exceeded_cb(struct vy_quota *quota)
 {
 	struct vy_env *env = container_of(quota, struct vy_env, quota);
 
-	/*
-	 * The scheduler must be disabled during local recovery so as
-	 * not to distort data stored on disk. Not that we really need
-	 * it anyway, because the memory footprint is limited by the
-	 * memory limit from the previous run.
-	 *
-	 * On the contrary, remote recovery does require the scheduler
-	 * to be up and running, because the amount of data received
-	 * when bootstrapping from a remote master is only limited by
-	 * its disk size, which can exceed the size of available
-	 * memory by orders of magnitude.
-	 */
-	assert(env->status != VINYL_INITIAL_RECOVERY_LOCAL &&
-	       env->status != VINYL_FINAL_RECOVERY_LOCAL);
-
 	if (lsregion_used(&env->mem_env.allocator) == 0) {
 		/*
 		 * The memory limit has been exceeded, but there's
@@ -2710,13 +2695,15 @@ vy_set_deferred_delete_trigger(void)
 static int
 vinyl_engine_bootstrap(struct engine *engine)
 {
+	vy_set_deferred_delete_trigger();
+
 	struct vy_env *e = vy_env(engine);
 	assert(e->status == VINYL_OFFLINE);
 	if (vy_log_bootstrap() != 0)
 		return -1;
+	vy_scheduler_start(&e->scheduler);
 	vy_quota_set_limit(&e->quota, e->memory);
 	e->status = VINYL_ONLINE;
-	vy_set_deferred_delete_trigger();
 	return 0;
 }
 
@@ -2724,6 +2711,8 @@ static int
 vinyl_engine_begin_initial_recovery(struct engine *engine,
 				    const struct vclock *recovery_vclock)
 {
+	vy_set_deferred_delete_trigger();
+
 	struct vy_env *e = vy_env(engine);
 	assert(e->status == VINYL_OFFLINE);
 	if (recovery_vclock != NULL) {
@@ -2732,14 +2721,25 @@ vinyl_engine_begin_initial_recovery(struct engine *engine,
 		e->recovery = vy_log_begin_recovery(recovery_vclock);
 		if (e->recovery == NULL)
 			return -1;
+		/*
+		 * We can't schedule any background tasks until
+		 * local recovery is complete, because they would
+		 * disrupt yet to be recovered data stored on disk.
+		 * So we don't start the scheduler fiber or enable
+		 * memory limit until then.
+		 *
+		 * This is OK, because during recovery an instance
+		 * can't consume more memory than it used before
+		 * restart and hence memory dumps are not necessary.
+		 */
 		e->status = VINYL_INITIAL_RECOVERY_LOCAL;
 	} else {
 		if (vy_log_bootstrap() != 0)
 			return -1;
+		vy_scheduler_start(&e->scheduler);
 		vy_quota_set_limit(&e->quota, e->memory);
 		e->status = VINYL_INITIAL_RECOVERY_REMOTE;
 	}
-	vy_set_deferred_delete_trigger();
 	return 0;
 }
 
@@ -2780,11 +2780,10 @@ vinyl_engine_end_recovery(struct engine *engine)
 		vy_recovery_delete(e->recovery);
 		e->recovery = NULL;
 		e->recovery_vclock = NULL;
-		e->status = VINYL_ONLINE;
+		vy_scheduler_start(&e->scheduler);
 		vy_quota_set_limit(&e->quota, e->memory);
 		break;
 	case VINYL_FINAL_RECOVERY_REMOTE:
-		e->status = VINYL_ONLINE;
 		break;
 	default:
 		unreachable();
@@ -2796,6 +2795,8 @@ vinyl_engine_end_recovery(struct engine *engine)
 	 */
 	if (e->lsm_env.lsm_count > 0)
 		vy_run_env_enable_coio(&e->run_env, e->read_threads);
+
+	e->status = VINYL_ONLINE;
 	return 0;
 }
 
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 4f57c0a0..5daaf4f1 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -399,6 +399,14 @@ vy_worker_pool_destroy(struct vy_worker_pool *pool)
 static struct vy_worker *
 vy_worker_pool_get(struct vy_worker_pool *pool)
 {
+	/*
+	 * Start worker threads only when a task is scheduled
+	 * so that they are not dangling around if vinyl is
+	 * not used.
+	 */
+	if (pool->workers == NULL)
+		vy_worker_pool_start(pool);
+
 	struct vy_worker *worker = NULL;
 	if (!stailq_empty(&pool->idle_workers)) {
 		worker = stailq_shift_entry(&pool->idle_workers,
@@ -465,7 +473,11 @@ vy_scheduler_create(struct vy_scheduler *scheduler, int write_threads,
 
 	diag_create(&scheduler->diag);
 	fiber_cond_create(&scheduler->dump_cond);
+}
 
+void
+vy_scheduler_start(struct vy_scheduler *scheduler)
+{
 	fiber_start(scheduler->scheduler_fiber, scheduler);
 }
 
@@ -1947,21 +1959,6 @@ vy_scheduler_f(va_list va)
 {
 	struct vy_scheduler *scheduler = va_arg(va, struct vy_scheduler *);
 
-	/*
-	 * Yield immediately, until the quota watermark is reached
-	 * for the first time or a checkpoint is made.
-	 * Then start the worker threads: we know they will be
-	 * needed. If quota watermark is never reached, workers
-	 * are not started and the scheduler is idle until
-	 * shutdown or checkpoint.
-	 */
-	fiber_cond_wait(&scheduler->scheduler_cond);
-	if (scheduler->scheduler_fiber == NULL)
-		return 0; /* destroyed */
-
-	vy_worker_pool_start(&scheduler->dump_pool);
-	vy_worker_pool_start(&scheduler->compact_pool);
-
 	while (scheduler->scheduler_fiber != NULL) {
 		struct stailq processed_tasks;
 		struct vy_task *task, *next;
diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
index 841d5a53..96ce721b 100644
--- a/src/box/vy_scheduler.h
+++ b/src/box/vy_scheduler.h
@@ -172,6 +172,12 @@ vy_scheduler_create(struct vy_scheduler *scheduler, int write_threads,
 		    struct vy_run_env *run_env, struct rlist *read_views);
 
 /**
+ * Start a scheduler fiber.
+ */
+void
+vy_scheduler_start(struct vy_scheduler *scheduler);
+
+/**
  * Destroy a scheduler instance.
  */
 void
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* [PATCH 8/8] vinyl: keep track of thread pool idle ratio
  2018-09-04 17:23     ` Vladimir Davydov
                         ` (6 preceding siblings ...)
  2018-09-04 17:23       ` [PATCH 7/8] vinyl: don't start scheduler fiber until local recovery is complete Vladimir Davydov
@ 2018-09-04 17:23       ` Vladimir Davydov
  2018-09-06  7:49         ` Konstantin Osipov
  7 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-04 17:23 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

To understand whether the disk is fully utilized or can still handle
more compaction load and make right decisions regarding transaction
throttling, we need a metric that would report how much time worker
threads spent being idle. So this patch adds two new metrics to global
statistics, disk.dump_idle_ratio and compact_idle_ratio, which show how
much time dump threads and compaction threads were idle, respectively.
The metrics are updated using the following formula:

                       idle_time
  idle_ratio = --------------------------
               dump_period * worker_count

where idle_time is the total amount of time workers were idle between
the last two dumps, dump_period is the time that passed between the last
two dumps, worker_count is the number of workers in the pool.
---
 src/box/vinyl.c            |  5 +++
 src/box/vy_scheduler.c     | 64 ++++++++++++++++++++++++++++++++++++
 src/box/vy_scheduler.h     | 11 +++++++
 test/vinyl/errinj.result   | 82 ++++++++++++++++++++++++++++++++++++++++++++++
 test/vinyl/errinj.test.lua | 37 +++++++++++++++++++++
 test/vinyl/info.result     | 14 ++++----
 test/vinyl/info.test.lua   |  2 ++
 7 files changed, 209 insertions(+), 6 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 1cf9ad16..f6b23cd6 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -302,15 +302,20 @@ static void
 vy_info_append_disk(struct vy_env *env, struct info_handler *h)
 {
 	struct vy_lsm_env *lsm_env = &env->lsm_env;
+	struct vy_scheduler *scheduler = &env->scheduler;
 
 	info_table_begin(h, "disk");
 	info_append_int(h, "data_files", lsm_env->data_file_count);
 	info_append_int(h, "data_size", lsm_env->disk_data_size);
 	info_append_int(h, "index_size", lsm_env->disk_index_size);
 	info_append_int(h, "dump_total", lsm_env->dump_total);
+	info_append_double(h, "dump_idle_ratio",
+			   scheduler->dump_pool.idle_ratio);
 	info_append_int(h, "compact_total", lsm_env->compact_total);
 	info_append_int(h, "compact_queue", lsm_env->compact_queue);
 	info_append_int(h, "compact_debt", lsm_env->compact_debt);
+	info_append_double(h, "compact_idle_ratio",
+			   scheduler->compact_pool.idle_ratio);
 	info_table_end(h);
 }
 
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 5daaf4f1..178db2cc 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -97,6 +97,10 @@ struct vy_worker {
 	struct vy_task *task;
 	/** Link in vy_worker_pool::idle_workers. */
 	struct stailq_entry in_idle;
+	/** Time when this worker became idle. */
+	double idle_start;
+	/** How much time this worker have been idle. */
+	double idle_time;
 	/** Route for sending deferred DELETEs back to tx. */
 	struct cmsg_hop deferred_delete_route[2];
 };
@@ -355,6 +359,13 @@ vy_worker_pool_start(struct vy_worker_pool *pool)
 		cpipe_create(&worker->worker_pipe, name);
 		stailq_add_tail_entry(&pool->idle_workers, worker, in_idle);
 
+		/*
+		 * Distribute accumulated idle time amongst workers
+		 * as if they were running all the time.
+		 */
+		worker->idle_start = pool->last_idle_update;
+		worker->idle_time = pool->last_idle_time / pool->size;
+
 		struct cmsg_hop *route = worker->deferred_delete_route;
 		route[0].f = vy_deferred_delete_batch_process_f;
 		route[0].pipe = &worker->worker_pipe;
@@ -384,6 +395,9 @@ vy_worker_pool_create(struct vy_worker_pool *pool, const char *name, int size)
 	pool->size = size;
 	pool->workers = NULL;
 	stailq_create(&pool->idle_workers);
+	pool->idle_ratio = 1;
+	pool->last_idle_time = 0;
+	pool->last_idle_update = ev_monotonic_now(loop());
 }
 
 static void
@@ -412,6 +426,8 @@ vy_worker_pool_get(struct vy_worker_pool *pool)
 		worker = stailq_shift_entry(&pool->idle_workers,
 					    struct vy_worker, in_idle);
 		assert(worker->pool == pool);
+		worker->idle_time += ev_monotonic_now(loop()) -
+						worker->idle_start;
 	}
 	return worker;
 }
@@ -425,6 +441,51 @@ vy_worker_pool_put(struct vy_worker *worker)
 {
 	struct vy_worker_pool *pool = worker->pool;
 	stailq_add_entry(&pool->idle_workers, worker, in_idle);
+	worker->idle_start = ev_monotonic_now(loop());
+}
+
+/**
+ * Return total time workers have spent idle.
+ */
+static double
+vy_worker_pool_idle_time(struct vy_worker_pool *pool)
+{
+	double now = ev_monotonic_now(loop());
+
+	if (pool->workers == NULL) {
+		/*
+		 * Workers haven't been started yet so naturally
+		 * they all should be accounted as idle.
+		 */
+		return pool->last_idle_time +
+			(now - pool->last_idle_update) * pool->size;
+	}
+
+	double idle_time = 0;
+	struct vy_worker *worker;
+	for (int i = 0; i < pool->size; i++) {
+		worker = &pool->workers[i];
+		idle_time += worker->idle_time;
+	}
+
+	stailq_foreach_entry(worker, &pool->idle_workers, in_idle)
+		idle_time += now - worker->idle_start;
+
+	return idle_time;
+}
+
+/**
+ * Update idle ratio of a worker pool.
+ */
+static void
+vy_worker_pool_update_idle_ratio(struct vy_worker_pool *pool)
+{
+	double now = ev_monotonic_now(loop());
+	double idle_time = vy_worker_pool_idle_time(pool);
+	pool->idle_ratio = (idle_time - pool->last_idle_time) /
+			   (now - pool->last_idle_update) / pool->size;
+	pool->last_idle_time = idle_time;
+	pool->last_idle_update = now;
 }
 
 void
@@ -657,6 +718,9 @@ vy_scheduler_complete_dump(struct vy_scheduler *scheduler)
 	scheduler->dump_complete_cb(scheduler,
 			min_generation - 1, dump_duration);
 	fiber_cond_signal(&scheduler->dump_cond);
+
+	vy_worker_pool_update_idle_ratio(&scheduler->dump_pool);
+	vy_worker_pool_update_idle_ratio(&scheduler->compact_pool);
 }
 
 int
diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
index 96ce721b..d6553cc8 100644
--- a/src/box/vy_scheduler.h
+++ b/src/box/vy_scheduler.h
@@ -65,6 +65,17 @@ struct vy_worker_pool {
 	struct vy_worker *workers;
 	/** List of workers that are currently idle. */
 	struct stailq idle_workers;
+	/**
+	 * How much time worker threads were idle, relative to
+	 * the total time (0 <= @idle_ratio <= 1).
+	 *
+	 * Updated on memory dump completion.
+	 */
+	double idle_ratio;
+	/** Time of the last @idle_ratio update. */
+	double last_idle_update;
+	/** Total idle time at @last_idle_update. */
+	double last_idle_time;
 };
 
 struct vy_scheduler {
diff --git a/test/vinyl/errinj.result b/test/vinyl/errinj.result
index 7b880030..35769e66 100644
--- a/test/vinyl/errinj.result
+++ b/test/vinyl/errinj.result
@@ -2247,3 +2247,85 @@ i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
 s:drop()
 ---
 ...
+--
+-- Check idle_ratio metric.
+--
+dump_threads = math.max(1, math.ceil(box.cfg.vinyl_write_threads / 4))
+---
+...
+compact_threads = box.cfg.vinyl_write_threads - dump_threads
+---
+...
+s = box.schema.space.create('test', {engine = 'vinyl'})
+---
+...
+i = s:create_index('pk', {run_count_per_level = 2})
+---
+...
+function dump() for i = 1, 10 do s:replace{i} end box.snapshot() end
+---
+...
+dump()
+---
+...
+fiber.sleep(0.1)
+---
+...
+errinj.set('ERRINJ_VY_RUN_WRITE_TIMEOUT', 0.1)
+---
+- ok
+...
+dump()
+---
+...
+errinj.set('ERRINJ_VY_RUN_WRITE_TIMEOUT', 0)
+---
+- ok
+...
+-- one dump thread was busy half of the time
+idle = box.stat.vinyl().disk.dump_idle_ratio
+---
+...
+expected = 1 - 1 / (2 * dump_threads)
+---
+...
+math.abs(idle - expected) < 0.1 or idle
+---
+- true
+...
+-- all compaction threads were idle
+box.stat.vinyl().disk.compact_idle_ratio -- 1
+---
+- 1
+...
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', true)
+---
+- ok
+...
+dump()
+---
+...
+dump()
+---
+...
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', false)
+---
+- ok
+...
+-- one compaction thread was busy all the time
+idle = box.stat.vinyl().disk.compact_idle_ratio
+---
+...
+expected = 1 - 1 / compact_threads
+---
+...
+math.abs(idle - expected) < 0.1 or idle
+---
+- true
+...
+while i:stat().disk.compact.count < 1 do fiber.sleep(0.01) end
+---
+...
+s:drop()
+---
+...
diff --git a/test/vinyl/errinj.test.lua b/test/vinyl/errinj.test.lua
index 9037bfad..fc52dbee 100644
--- a/test/vinyl/errinj.test.lua
+++ b/test/vinyl/errinj.test.lua
@@ -884,3 +884,40 @@ i:stat().disk.compact.debt -- none
 i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
 i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
 s:drop()
+
+--
+-- Check idle_ratio metric.
+--
+dump_threads = math.max(1, math.ceil(box.cfg.vinyl_write_threads / 4))
+compact_threads = box.cfg.vinyl_write_threads - dump_threads
+
+s = box.schema.space.create('test', {engine = 'vinyl'})
+i = s:create_index('pk', {run_count_per_level = 2})
+function dump() for i = 1, 10 do s:replace{i} end box.snapshot() end
+
+dump()
+fiber.sleep(0.1)
+errinj.set('ERRINJ_VY_RUN_WRITE_TIMEOUT', 0.1)
+dump()
+errinj.set('ERRINJ_VY_RUN_WRITE_TIMEOUT', 0)
+
+-- one dump thread was busy half of the time
+idle = box.stat.vinyl().disk.dump_idle_ratio
+expected = 1 - 1 / (2 * dump_threads)
+math.abs(idle - expected) < 0.1 or idle
+
+-- all compaction threads were idle
+box.stat.vinyl().disk.compact_idle_ratio -- 1
+
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', true)
+dump()
+dump()
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', false)
+
+-- one compaction thread was busy all the time
+idle = box.stat.vinyl().disk.compact_idle_ratio
+expected = 1 - 1 / compact_threads
+math.abs(idle - expected) < 0.1 or idle
+
+while i:stat().disk.compact.count < 1 do fiber.sleep(0.01) end
+s:drop()
diff --git a/test/vinyl/info.result b/test/vinyl/info.result
index 556f5eca..6dcdc90f 100644
--- a/test/vinyl/info.result
+++ b/test/vinyl/info.result
@@ -102,6 +102,8 @@ function gstat()
     st.quota.use_rate = nil
     st.quota.dump_bandwidth = nil
     st.quota.watermark = nil
+    st.disk.dump_idle_ratio = nil
+    st.disk.compact_idle_ratio = nil
     return st
 end;
 ---
@@ -218,12 +220,12 @@ gstat()
 ---
 - disk:
     dump_total: 0
-    data_size: 0
+    index_size: 0
     compact_debt: 0
-    compact_queue: 0
     data_files: 0
+    compact_queue: 0
+    data_size: 0
     compact_total: 0
-    index_size: 0
   quota:
     limit: 134217728
     used: 0
@@ -1039,12 +1041,12 @@ gstat()
 ---
 - disk:
     dump_total: 0
-    data_size: 104300
+    index_size: 1190
     compact_debt: 0
-    compact_queue: 0
     data_files: 2
+    compact_queue: 0
+    data_size: 104300
     compact_total: 0
-    index_size: 1190
   quota:
     limit: 134217728
     used: 262583
diff --git a/test/vinyl/info.test.lua b/test/vinyl/info.test.lua
index 919dde63..637aa323 100644
--- a/test/vinyl/info.test.lua
+++ b/test/vinyl/info.test.lua
@@ -84,6 +84,8 @@ function gstat()
     st.quota.use_rate = nil
     st.quota.dump_bandwidth = nil
     st.quota.watermark = nil
+    st.disk.dump_idle_ratio = nil
+    st.disk.compact_idle_ratio = nil
     return st
 end;
 
-- 
2.11.0

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 1/8] vinyl: add helper to check whether dump is in progress
  2018-09-04 17:23       ` [PATCH 1/8] vinyl: add helper to check whether dump is in progress Vladimir Davydov
@ 2018-09-06  7:33         ` Konstantin Osipov
  0 siblings, 0 replies; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-06  7:33 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/04 20:59]:
>  src/box/vy_scheduler.c | 17 +++++++----------
>  src/box/vy_scheduler.h | 13 +++++++++++++
>  2 files changed, 20 insertions(+), 10 deletions(-)

OK to push.


-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 2/8] vinyl: don't use mempool for allocating background tasks
  2018-09-04 17:23       ` [PATCH 2/8] vinyl: don't use mempool for allocating background tasks Vladimir Davydov
@ 2018-09-06  7:33         ` Konstantin Osipov
  0 siblings, 0 replies; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-06  7:33 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/04 20:59]:
> Background tasks are allocated infrequently, not more often than once
> per several seconds, so using mempool for them is unnecessary and only
> clutters vy_scheduler struct. Let's allocate them with malloc().

OK to push.


-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 3/8] vinyl: factor out worker pool from scheduler struct
  2018-09-04 17:23       ` [PATCH 3/8] vinyl: factor out worker pool from scheduler struct Vladimir Davydov
@ 2018-09-06  7:34         ` Konstantin Osipov
  0 siblings, 0 replies; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-06  7:34 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/04 20:59]:
> A worker pool is an independent entity that provides the scheduler with
> worker threads on demand. Let's factor it out so that we can introduce
> separate pools for dump and compaction tasks.

OK to push.


-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 4/8] vinyl: move worker allocation closer to task creation
  2018-09-04 17:23       ` [PATCH 4/8] vinyl: move worker allocation closer to task creation Vladimir Davydov
@ 2018-09-06  7:35         ` Konstantin Osipov
  0 siblings, 0 replies; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-06  7:35 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/04 20:59]:
> Call vy_worker_pool_get() from vy_scheduler_peek_{dump,compaction} so
> that we can use different worker pools for dump and compaction tasks.

OK to push.


-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 5/8] vinyl: use separate thread pools for dump and compaction tasks
  2018-09-04 17:23       ` [PATCH 5/8] vinyl: use separate thread pools for dump and compaction tasks Vladimir Davydov
@ 2018-09-06  7:37         ` Konstantin Osipov
  2018-09-06  9:48           ` Vladimir Davydov
  0 siblings, 1 reply; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-06  7:37 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/04 20:59]:
> Using the same thread pool for both dump and compaction tasks makes
> estimation of dump bandwidth unstable. For instance, if we have four
> worker threads, then the observed dump bandwidth may vary from X if
> there's high compaction demand and all worker threads tend to be busy
> with compaction tasks to 4 * X if there's no compaction demand. As a
> result, we can overestimate the dump bandwidth and trigger dump when
> it's too late, which will result in hitting the limit before dump is
> complete and hence stalling write transactions, which is unacceptable.
> 
> To avoid that, let's separate thread pools used for dump and compaction
> tasks. Since LSM tree based design typically implies high levels of
> write amplification, let's allocate 1/4th of all threads for dump tasks
> and use the rest exclusively for compaction.

Please follow this up by changing the default number of write
threads to 4 and making 4 the lower bound for this configuration
setting.

Thanks.
> --
-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 6/8] vinyl: zap vy_worker_pool::idle_worker_count
  2018-09-04 17:23       ` [PATCH 6/8] vinyl: zap vy_worker_pool::idle_worker_count Vladimir Davydov
@ 2018-09-06  7:38         ` Konstantin Osipov
  0 siblings, 0 replies; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-06  7:38 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/04 20:59]:
> It is not used anywhere anymore.
> ---
>  src/box/vy_scheduler.c | 6 ------
>  src/box/vy_scheduler.h | 2 --
>  2 files changed, 8 deletions(-)

OK to push.


-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 7/8] vinyl: don't start scheduler fiber until local recovery is complete
  2018-09-04 17:23       ` [PATCH 7/8] vinyl: don't start scheduler fiber until local recovery is complete Vladimir Davydov
@ 2018-09-06  7:39         ` Konstantin Osipov
  0 siblings, 0 replies; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-06  7:39 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/04 20:59]:
> We must not schedule any background jobs during local recovery, because
> they may disrupt yet to be recovered data stored on disk. Since we start
> the scheduler fiber as soon as the engine is initialized, we have to
> pull some tricks to make sure it doesn't schedule any tasks: the
> scheduler fiber function yields immediately upon startup; we assume
> that it won't be woken up until local recovery is complete, because we
> don't set the memory limit until then.

OK to push. I esp. like this change.

 

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 8/8] vinyl: keep track of thread pool idle ratio
  2018-09-04 17:23       ` [PATCH 8/8] vinyl: keep track of thread pool idle ratio Vladimir Davydov
@ 2018-09-06  7:49         ` Konstantin Osipov
  2018-09-06  8:18           ` Vladimir Davydov
  0 siblings, 1 reply; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-06  7:49 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/04 20:59]:
> To understand whether the disk is fully utilized or can still handle
> more compaction load and make right decisions regarding transaction
> throttling, we need a metric that would report how much time worker
> threads spent being idle. So this patch adds two new metrics to global
> statistics, disk.dump_idle_ratio and compact_idle_ratio, which show how
> much time dump threads and compaction threads were idle, respectively.
> The metrics are updated using the following formula:
> 
>                        idle_time
>   idle_ratio = --------------------------
>                dump_period * worker_count

I don't understand the formula. There can be many workers.
Is idle time measured per worker or per entire pool? 

If it is measured per entire pool, how is idle time calculated if
some workers are busy and some not?


-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 8/8] vinyl: keep track of thread pool idle ratio
  2018-09-06  7:49         ` Konstantin Osipov
@ 2018-09-06  8:18           ` Vladimir Davydov
  2018-09-06 10:26             ` Konstantin Osipov
  0 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-06  8:18 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Thu, Sep 06, 2018 at 10:49:37AM +0300, Konstantin Osipov wrote:
> * Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/04 20:59]:
> > To understand whether the disk is fully utilized or can still handle
> > more compaction load and make right decisions regarding transaction
> > throttling, we need a metric that would report how much time worker
> > threads spent being idle. So this patch adds two new metrics to global
> > statistics, disk.dump_idle_ratio and compact_idle_ratio, which show how
> > much time dump threads and compaction threads were idle, respectively.
> > The metrics are updated using the following formula:
> > 
> >                        idle_time
> >   idle_ratio = --------------------------
> >                dump_period * worker_count
> 
> I don't understand the formula. There can be many workers.
> Is idle time measured per worker or per entire pool? 
> 
> If it is measured per entire pool, how is idle time calculated if
> some workers are busy and some not?

It is measured for entire pool - note that I divide the result by
worker_count. E.g. if there were two workers and one of them were
busy all the time between two last dumps while another were idle,
idle_ratio would be 0.5.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 5/8] vinyl: use separate thread pools for dump and compaction tasks
  2018-09-06  7:37         ` Konstantin Osipov
@ 2018-09-06  9:48           ` Vladimir Davydov
  2018-09-06 10:32             ` Konstantin Osipov
  0 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-06  9:48 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Thu, Sep 06, 2018 at 10:37:51AM +0300, Konstantin Osipov wrote:
> * Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/04 20:59]:
> > Using the same thread pool for both dump and compaction tasks makes
> > estimation of dump bandwidth unstable. For instance, if we have four
> > worker threads, then the observed dump bandwidth may vary from X if
> > there's high compaction demand and all worker threads tend to be busy
> > with compaction tasks to 4 * X if there's no compaction demand. As a
> > result, we can overestimate the dump bandwidth and trigger dump when
> > it's too late, which will result in hitting the limit before dump is
> > complete and hence stalling write transactions, which is unacceptable.
> > 
> > To avoid that, let's separate thread pools used for dump and compaction
> > tasks. Since LSM tree based design typically implies high levels of
> > write amplification, let's allocate 1/4th of all threads for dump tasks
> > and use the rest exclusively for compaction.
> 
> Please follow this up by changing the default number of write
> threads to 4 and making 4 the lower bound for this configuration
> setting.

I'm OK with setting vinyl_write_threads to 4 by default, but I don't
think it's a good idea to increase the lower bound from 2 to 4. First,
disabling parallel compaction may be useful for performance experiments.
Second, it's going to break our customers who use default settings.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 8/8] vinyl: keep track of thread pool idle ratio
  2018-09-06  8:18           ` Vladimir Davydov
@ 2018-09-06 10:26             ` Konstantin Osipov
  2018-09-06 10:52               ` Vladimir Davydov
  0 siblings, 1 reply; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-06 10:26 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/06 11:19]:

> > > To understand whether the disk is fully utilized or can still handle
> > > more compaction load and make right decisions regarding transaction
> > > throttling, we need a metric that would report how much time worker
> > > threads spent being idle. So this patch adds two new metrics to global
> > > statistics, disk.dump_idle_ratio and compact_idle_ratio, which show how
> > > much time dump threads and compaction threads were idle, respectively.
> > > The metrics are updated using the following formula:
> > > 
> > >                        idle_time
> > >   idle_ratio = --------------------------
> > >                dump_period * worker_count
> > 
> > I don't understand the formula. There can be many workers.
> > Is idle time measured per worker or per entire pool? 
> > 
> > If it is measured per entire pool, how is idle time calculated if
> > some workers are busy and some not?
> 
> It is measured for entire pool - note that I divide the result by
> worker_count. E.g. if there were two workers and one of them were
> busy all the time between two last dumps while another were idle,
> idle_ratio would be 0.5.

This looks imprecise. Why not measure idle time of each worker and
then even it out over the total number of workers? Besides, once
again, how do you define the window over which you measure?

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 5/8] vinyl: use separate thread pools for dump and compaction tasks
  2018-09-06  9:48           ` Vladimir Davydov
@ 2018-09-06 10:32             ` Konstantin Osipov
  0 siblings, 0 replies; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-06 10:32 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/06 12:51]:
> I'm OK with setting vinyl_write_threads to 4 by default, but I don't
> think it's a good idea to increase the lower bound from 2 to 4. First,
> disabling parallel compaction may be useful for performance experiments.
> Second, it's going to break our customers who use default settings.

OK

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 8/8] vinyl: keep track of thread pool idle ratio
  2018-09-06 10:26             ` Konstantin Osipov
@ 2018-09-06 10:52               ` Vladimir Davydov
  2018-09-06 10:57                 ` Konstantin Osipov
  0 siblings, 1 reply; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-06 10:52 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Thu, Sep 06, 2018 at 01:26:31PM +0300, Konstantin Osipov wrote:
> * Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/06 11:19]:
> 
> > > > To understand whether the disk is fully utilized or can still handle
> > > > more compaction load and make right decisions regarding transaction
> > > > throttling, we need a metric that would report how much time worker
> > > > threads spent being idle. So this patch adds two new metrics to global
> > > > statistics, disk.dump_idle_ratio and compact_idle_ratio, which show how
> > > > much time dump threads and compaction threads were idle, respectively.
> > > > The metrics are updated using the following formula:
> > > > 
> > > >                        idle_time
> > > >   idle_ratio = --------------------------
> > > >                dump_period * worker_count
> > > 
> > > I don't understand the formula. There can be many workers.
> > > Is idle time measured per worker or per entire pool? 
> > > 
> > > If it is measured per entire pool, how is idle time calculated if
> > > some workers are busy and some not?
> > 
> > It is measured for entire pool - note that I divide the result by
> > worker_count. E.g. if there were two workers and one of them were
> > busy all the time between two last dumps while another were idle,
> > idle_ratio would be 0.5.
> 
> This looks imprecise. Why not measure idle time of each worker and
> then even it out over the total number of workers?

That's exactly what I do. I maintain the idle time for each worker
thread and then divide it by the total number of workers.

> Besides, once again, how do you define the window over which you
> measure?

I use dump period for the window, i.e. time between two subsequent
memory dumps. Respectively, I update the idle_ratio after each memory
dump. I'm planning to update throttle rate limit there too.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 8/8] vinyl: keep track of thread pool idle ratio
  2018-09-06 10:52               ` Vladimir Davydov
@ 2018-09-06 10:57                 ` Konstantin Osipov
  2018-09-06 11:59                   ` Vladimir Davydov
  0 siblings, 1 reply; 39+ messages in thread
From: Konstantin Osipov @ 2018-09-06 10:57 UTC (permalink / raw)
  To: Vladimir Davydov; +Cc: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/06 13:53]:
> > > > > To understand whether the disk is fully utilized or can still handle
> > > > > more compaction load and make right decisions regarding transaction
> > > > > throttling, we need a metric that would report how much time worker
> > > > > threads spent being idle. So this patch adds two new metrics to global
> > > > > statistics, disk.dump_idle_ratio and compact_idle_ratio, which show how
> > > > > much time dump threads and compaction threads were idle, respectively.
> > > > > The metrics are updated using the following formula:
> > > > > 
> > > > >                        idle_time
> > > > >   idle_ratio = --------------------------
> > > > >                dump_period * worker_count
> > > > 
> > > > I don't understand the formula. There can be many workers.
> > > > Is idle time measured per worker or per entire pool? 
> > > > 
> > > > If it is measured per entire pool, how is idle time calculated if
> > > > some workers are busy and some not?
> > > 
> > > It is measured for entire pool - note that I divide the result by
> > > worker_count. E.g. if there were two workers and one of them were
> > > busy all the time between two last dumps while another were idle,
> > > idle_ratio would be 0.5.
> > 
> > This looks imprecise. Why not measure idle time of each worker and
> > then even it out over the total number of workers?
> 
> That's exactly what I do. I maintain the idle time for each worker
> thread and then divide it by the total number of workers.

This is not what you've written in the comment, though.

> > Besides, once again, how do you define the window over which you
> > measure?
> I use dump period for the window, i.e. time between two subsequent
> memory dumps. Respectively, I update the idle_ratio after each memory
> dump. I'm planning to update throttle rate limit there too.

This choice of time window looks arbitrary. Why do you think it's
a good choice? 


-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 8/8] vinyl: keep track of thread pool idle ratio
  2018-09-06 10:57                 ` Konstantin Osipov
@ 2018-09-06 11:59                   ` Vladimir Davydov
  0 siblings, 0 replies; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-06 11:59 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Thu, Sep 06, 2018 at 01:57:42PM +0300, Konstantin Osipov wrote:
> * Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/06 13:53]:
> > > > > > To understand whether the disk is fully utilized or can still handle
> > > > > > more compaction load and make right decisions regarding transaction
> > > > > > throttling, we need a metric that would report how much time worker
> > > > > > threads spent being idle. So this patch adds two new metrics to global
> > > > > > statistics, disk.dump_idle_ratio and compact_idle_ratio, which show how
> > > > > > much time dump threads and compaction threads were idle, respectively.
> > > > > > The metrics are updated using the following formula:
> > > > > > 
> > > > > >                        idle_time
> > > > > >   idle_ratio = --------------------------
> > > > > >                dump_period * worker_count
> > > > > 
> > > > > I don't understand the formula. There can be many workers.
> > > > > Is idle time measured per worker or per entire pool? 
> > > > > 
> > > > > If it is measured per entire pool, how is idle time calculated if
> > > > > some workers are busy and some not?
> > > > 
> > > > It is measured for entire pool - note that I divide the result by
> > > > worker_count. E.g. if there were two workers and one of them were
> > > > busy all the time between two last dumps while another were idle,
> > > > idle_ratio would be 0.5.
> > > 
> > > This looks imprecise. Why not measure idle time of each worker and
> > > then even it out over the total number of workers?
> > 
> > That's exactly what I do. I maintain the idle time for each worker
> > thread and then divide it by the total number of workers.
> 
> This is not what you've written in the comment, though.

It is. Quoting the commit message:

} The metrics are updated using the following formula:
} 
}                        idle_time
}   idle_ratio = --------------------------
}                dump_period * worker_count
} 
} where idle_time is the total amount of time workers were idle between
} the last two dumps, dump_period is the time that passed between the last
} two dumps, worker_count is the number of workers in the pool.


> > > Besides, once again, how do you define the window over which you
> > > measure?
> > I use dump period for the window, i.e. time between two subsequent
> > memory dumps. Respectively, I update the idle_ratio after each memory
> > dump. I'm planning to update throttle rate limit there too.
> 
> This choice of time window looks arbitrary. Why do you think it's
> a good choice? 

Dump period seems to be the only reasonable choice for a minimal time
window characterising a vinyl workload. If we choose a smaller window,
then dump_idle_ratio will jump from 0 when dump is inactive to 1 when
dump is in progress, which isn't very convenient.

At the same time, if we want to accumulate idle time statistics over a
longer period, we can still do that by averaging idle_ratio.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH 0/7] vinyl: improve stats for throttling
  2018-09-02 20:18 [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov
                   ` (6 preceding siblings ...)
  2018-09-02 20:19 ` [PATCH 7/7] vinyl: keep track of disk idle time Vladimir Davydov
@ 2018-09-09 11:41 ` Vladimir Davydov
  7 siblings, 0 replies; 39+ messages in thread
From: Vladimir Davydov @ 2018-09-09 11:41 UTC (permalink / raw)
  To: kostja; +Cc: tarantool-patches

I pushed the following approved patches plus the patch that sets the
default value of box.cfg.vinyl_write_threads to 4 to branch 1.10:

e78ebb77 vinyl: add global memory stats
16faada1 vinyl: fix accounting of secondary index cache statements
fe1e4694 vinyl: set box.cfg.vinyl_write_threads to 4 by default
7069eab5 vinyl: don't start scheduler fiber until local recovery is complete
0ff58856 vinyl: zap vy_worker_pool::idle_worker_count
3e76f7b9 vinyl: use separate thread pools for dump and compaction tasks
ba7abf6f vinyl: move worker allocation closer to task creation
49595189 vinyl: factor out worker pool from scheduler struct
661763ed vinyl: don't use mempool for allocating background tasks
04a735b2 vinyl: add helper to check whether dump is in progress

Here's what's left on branch dv/vy-stats-for-throttling:

a4e0694d vinyl: keep track of thread pool idle ratio
323161d9 vinyl: keep track of compaction queue length and debt
8468caa8 vinyl: update compact priority usual way on range split/coalesce
5d1b5db6 vinyl: fix force compaction logic
28d4f0f6 vinyl: add global disk stats

^ permalink raw reply	[flat|nested] 39+ messages in thread

end of thread, other threads:[~2018-09-09 11:41 UTC | newest]

Thread overview: 39+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-02 20:18 [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov
2018-09-02 20:18 ` [PATCH 1/7] vinyl: fix accounting of secondary index cache statements Vladimir Davydov
2018-09-02 22:26   ` [tarantool-patches] " Konstantin Osipov
2018-09-02 20:18 ` [PATCH 2/7] vinyl: add global memory stats Vladimir Davydov
2018-09-02 22:27   ` [tarantool-patches] " Konstantin Osipov
2018-09-02 22:27   ` Konstantin Osipov
2018-09-03  8:10     ` Vladimir Davydov
2018-09-02 20:18 ` [PATCH 3/7] vinyl: add global disk stats Vladimir Davydov
2018-09-02 22:30   ` [tarantool-patches] " Konstantin Osipov
2018-09-02 20:18 ` [PATCH 4/7] vinyl: fix force compaction logic Vladimir Davydov
2018-09-02 20:18 ` [PATCH 5/7] vinyl: update compact priority usual way on range split/coalesce Vladimir Davydov
2018-09-02 20:18 ` [PATCH 6/7] vinyl: keep track of compaction queue length and debt Vladimir Davydov
2018-09-02 20:19 ` [PATCH 7/7] vinyl: keep track of disk idle time Vladimir Davydov
2018-09-04 11:54   ` Vladimir Davydov
2018-09-04 17:23     ` Vladimir Davydov
2018-09-04 17:23       ` [PATCH 1/8] vinyl: add helper to check whether dump is in progress Vladimir Davydov
2018-09-06  7:33         ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 2/8] vinyl: don't use mempool for allocating background tasks Vladimir Davydov
2018-09-06  7:33         ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 3/8] vinyl: factor out worker pool from scheduler struct Vladimir Davydov
2018-09-06  7:34         ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 4/8] vinyl: move worker allocation closer to task creation Vladimir Davydov
2018-09-06  7:35         ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 5/8] vinyl: use separate thread pools for dump and compaction tasks Vladimir Davydov
2018-09-06  7:37         ` Konstantin Osipov
2018-09-06  9:48           ` Vladimir Davydov
2018-09-06 10:32             ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 6/8] vinyl: zap vy_worker_pool::idle_worker_count Vladimir Davydov
2018-09-06  7:38         ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 7/8] vinyl: don't start scheduler fiber until local recovery is complete Vladimir Davydov
2018-09-06  7:39         ` Konstantin Osipov
2018-09-04 17:23       ` [PATCH 8/8] vinyl: keep track of thread pool idle ratio Vladimir Davydov
2018-09-06  7:49         ` Konstantin Osipov
2018-09-06  8:18           ` Vladimir Davydov
2018-09-06 10:26             ` Konstantin Osipov
2018-09-06 10:52               ` Vladimir Davydov
2018-09-06 10:57                 ` Konstantin Osipov
2018-09-06 11:59                   ` Vladimir Davydov
2018-09-09 11:41 ` [PATCH 0/7] vinyl: improve stats for throttling Vladimir Davydov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox