[PATCH 6/7] vinyl: keep track of compaction queue length and debt

Vladimir Davydov vdavydov.dev at gmail.com
Sun Sep 2 23:18:59 MSK 2018


Currently, there's no way to figure out whether compaction keeps up
with dumps or not while this is essential for implementing transaction
throttling. This patch adds metrics that are supposed to help answer
this question. Those are size of compaction queue and compaction debt.

Size of compaction queue is the total size of runs awaiting compaction.
Compaction debt is how many runs must be compacted to restore the target
LSM tree shape. It can't be greater than the size of compaction queue.
Ideally, it should be zero.

We update both metrics along with compaction priority of a range.
Calculation of compaction queue size is trivial - we simply sum
statement counters of slices to be compacted. Compaction debt is a bit
more sophisticated. When walking down an LSM tree, we check whether the
number of runs at any level is greater than run_count_level * 2. If it
is, then we would reschedule compaction had the previous compaction task
completed by now, so we assume that the range is "in debt". If a range
is in debt, its compaction queue length contributes to compaction debt
metric.

Both metrics are accumulated and reported per index and globally.
Global metrics are reported under box.info.vinyl().disk.compact_queue
and disk.compact_debt, in bytes only. Index statistics are reported
under index.stat().disk.compact.queue and disk.compact.debt, both in
bytes and in rows. The metrics don't take into account disk compression.
---
 src/box/vinyl.c            |  27 +++++-----
 src/box/vy_lsm.c           |  20 +++++++
 src/box/vy_lsm.h           |  26 ++++++++-
 src/box/vy_range.c         |  28 +++++++---
 src/box/vy_range.h         |   9 ++++
 src/box/vy_scheduler.c     |   9 +++-
 src/box/vy_stat.h          |  39 ++++++++++----
 src/errinj.h               |   1 +
 test/box/errinj.result     |   4 +-
 test/vinyl/errinj.result   | 129 +++++++++++++++++++++++++++++++++++++++++++++
 test/vinyl/errinj.test.lua |  34 ++++++++++++
 test/vinyl/info.result     |  20 ++++++-
 12 files changed, 307 insertions(+), 39 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index edfaa824..416c9824 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -309,6 +309,8 @@ vy_info_append_disk(struct vy_env *env, struct info_handler *h)
 	info_append_int(h, "index_size", lsm_env->disk_index_size);
 	info_append_int(h, "dump_total", lsm_env->dump_total);
 	info_append_int(h, "compact_total", lsm_env->compact_total);
+	info_append_int(h, "compact_queue", lsm_env->compact_queue);
+	info_append_int(h, "compact_debt", lsm_env->compact_debt);
 	info_table_end(h);
 }
 
@@ -352,17 +354,6 @@ vy_info_append_disk_stmt_counter(struct info_handler *h, const char *name,
 }
 
 static void
-vy_info_append_compact_stat(struct info_handler *h, const char *name,
-			    const struct vy_compact_stat *stat)
-{
-	info_table_begin(h, name);
-	info_append_int(h, "count", stat->count);
-	vy_info_append_stmt_counter(h, "in", &stat->in);
-	vy_info_append_stmt_counter(h, "out", &stat->out);
-	info_table_end(h);
-}
-
-static void
 vinyl_index_stat(struct index *index, struct info_handler *h)
 {
 	char buf[1024];
@@ -413,8 +404,18 @@ vinyl_index_stat(struct index *index, struct info_handler *h)
 	info_append_int(h, "miss", stat->disk.iterator.bloom_miss);
 	info_table_end(h);
 	info_table_end(h);
-	vy_info_append_compact_stat(h, "dump", &stat->disk.dump);
-	vy_info_append_compact_stat(h, "compact", &stat->disk.compact);
+	info_table_begin(h, "dump");
+	info_append_int(h, "count", stat->disk.dump.count);
+	vy_info_append_stmt_counter(h, "in", &stat->disk.dump.in);
+	vy_info_append_stmt_counter(h, "out", &stat->disk.dump.out);
+	info_table_end(h);
+	info_table_begin(h, "compact");
+	info_append_int(h, "count", stat->disk.compact.count);
+	vy_info_append_stmt_counter(h, "in", &stat->disk.compact.in);
+	vy_info_append_stmt_counter(h, "out", &stat->disk.compact.out);
+	vy_info_append_stmt_counter(h, "queue", &stat->disk.compact.queue);
+	vy_info_append_stmt_counter(h, "debt", &stat->disk.compact.debt);
+	info_table_end(h);
 	info_append_int(h, "index_size", lsm->page_index_size);
 	info_append_int(h, "bloom_size", lsm->bloom_size);
 	info_table_end(h);
diff --git a/src/box/vy_lsm.c b/src/box/vy_lsm.c
index bf359973..f27634b9 100644
--- a/src/box/vy_lsm.c
+++ b/src/box/vy_lsm.c
@@ -252,6 +252,8 @@ vy_lsm_delete(struct vy_lsm *lsm)
 	assert(lsm->env->lsm_count > 0);
 
 	lsm->env->lsm_count--;
+	lsm->env->compact_queue -= lsm->stat.disk.compact.queue.bytes;
+	lsm->env->compact_debt -= lsm->stat.disk.compact.debt.bytes;
 
 	if (lsm->pk != NULL)
 		vy_lsm_unref(lsm->pk);
@@ -759,12 +761,28 @@ void
 vy_lsm_acct_range(struct vy_lsm *lsm, struct vy_range *range)
 {
 	histogram_collect(lsm->run_hist, range->slice_count);
+	vy_stmt_counter_add(&lsm->stat.disk.compact.queue,
+			    &range->compact_count);
+	lsm->env->compact_queue += range->compact_count.bytes;
+	if (range->in_compaction_debt) {
+		vy_stmt_counter_add(&lsm->stat.disk.compact.debt,
+				    &range->compact_count);
+		lsm->env->compact_debt += range->compact_count.bytes;
+	}
 }
 
 void
 vy_lsm_unacct_range(struct vy_lsm *lsm, struct vy_range *range)
 {
 	histogram_discard(lsm->run_hist, range->slice_count);
+	vy_stmt_counter_sub(&lsm->stat.disk.compact.queue,
+			    &range->compact_count);
+	lsm->env->compact_queue -= range->compact_count.bytes;
+	if (range->in_compaction_debt) {
+		vy_stmt_counter_sub(&lsm->stat.disk.compact.debt,
+				    &range->compact_count);
+		lsm->env->compact_debt -= range->compact_count.bytes;
+	}
 }
 
 int
@@ -1179,8 +1197,10 @@ vy_lsm_force_compaction(struct vy_lsm *lsm)
 
 	vy_range_tree_ifirst(lsm->tree, &it);
 	while ((range = vy_range_tree_inext(&it)) != NULL) {
+		vy_lsm_unacct_range(lsm, range);
 		range->needs_compaction = true;
 		vy_range_update_compact_priority(range, &lsm->opts);
+		vy_lsm_acct_range(lsm, range);
 	}
 
 	vy_range_heap_update_all(&lsm->range_heap);
diff --git a/src/box/vy_lsm.h b/src/box/vy_lsm.h
index 67a5c5d1..7e3e8430 100644
--- a/src/box/vy_lsm.h
+++ b/src/box/vy_lsm.h
@@ -107,6 +107,16 @@ struct vy_lsm_env {
 	 * tasks (uncompressed).
 	 */
 	int64_t compact_total;
+	/**
+	 * Sum size of runs awaiting compaction, in bytes
+	 * (uncompressed).
+	 */
+	int64_t compact_queue;
+	/**
+	 * Sum size of runs that must be compacted to restore
+	 * the target LSM tree shape, in bytes (uncompressed).
+	 */
+	int64_t compact_debt;
 	/** Memory pool for vy_history_node allocations. */
 	struct mempool history_node_pool;
 };
@@ -452,11 +462,23 @@ vy_lsm_add_range(struct vy_lsm *lsm, struct vy_range *range);
 void
 vy_lsm_remove_range(struct vy_lsm *lsm, struct vy_range *range);
 
-/** Account a range to the run histogram of an LSM tree. */
+/**
+ * Account a range in an LSM tree.
+ *
+ * This function updates the following LSM tree statistics:
+ *  - vy_lsm::run_hist after a slice is added to or removed from
+ *    a range of the LSM tree.
+ *  - vy_lsm::stat::disk::compact::queue and debt after compaction
+ *    priority of a range is updated.
+ */
 void
 vy_lsm_acct_range(struct vy_lsm *lsm, struct vy_range *range);
 
-/** Unaccount a range from the run histogram of an LSM tree. */
+/**
+ * Unaccount a range in an LSM tree.
+ *
+ * This function undoes the effect of vy_lsm_acct_range().
+ */
 void
 vy_lsm_unacct_range(struct vy_lsm *lsm, struct vy_range *range);
 
diff --git a/src/box/vy_range.c b/src/box/vy_range.c
index ddcd2ed3..1816e546 100644
--- a/src/box/vy_range.c
+++ b/src/box/vy_range.c
@@ -292,19 +292,19 @@ vy_range_update_compact_priority(struct vy_range *range,
 	assert(opts->run_count_per_level > 0);
 	assert(opts->run_size_ratio > 1);
 
+	range->compact_priority = 0;
+	range->in_compaction_debt = false;
+	vy_stmt_counter_reset(&range->compact_count);
+
 	if (range->slice_count <= 1) {
 		/* Nothing to compact. */
-		range->compact_priority = 0;
 		range->needs_compaction = false;
 		return;
 	}
-	if (range->needs_compaction) {
-		range->compact_priority = range->slice_count;
-		return;
-	}
-
-	range->compact_priority = 0;
 
+	/* Total number of statements in checked runs. */
+	struct vy_stmt_counter total_stmt_count;
+	vy_stmt_counter_reset(&total_stmt_count);
 	/* Total number of checked runs. */
 	uint32_t total_run_count = 0;
 	/* The total size of runs checked so far. */
@@ -333,6 +333,7 @@ vy_range_update_compact_priority(struct vy_range *range,
 		total_size += size;
 		level_run_count++;
 		total_run_count++;
+		vy_stmt_counter_add_disk(&total_stmt_count, &slice->count);
 		while (size > target_run_size) {
 			/*
 			 * The run size exceeds the threshold
@@ -362,7 +363,8 @@ vy_range_update_compact_priority(struct vy_range *range,
 			 * we find an appropriate level for it.
 			 */
 		}
-		if (level_run_count > opts->run_count_per_level) {
+		if (range->needs_compaction ||
+		    level_run_count > opts->run_count_per_level) {
 			/*
 			 * The number of runs at the current level
 			 * exceeds the configured maximum. Arrange
@@ -370,8 +372,18 @@ vy_range_update_compact_priority(struct vy_range *range,
 			 * this level and upper levels.
 			 */
 			range->compact_priority = total_run_count;
+			range->compact_count = total_stmt_count;
 			est_new_run_size = total_size;
 		}
+		/*
+		 * If the number of runs on any level is twice as
+		 * many as run_count_per_level, we would schedule
+		 * compaction for another time had the previously
+		 * scheduled compaction task completed. This means
+		 * that compaction doesn't keep up with dumps.
+		 */
+		if (level_run_count > opts->run_count_per_level * 2)
+			range->in_compaction_debt = true;
 	}
 }
 
diff --git a/src/box/vy_range.h b/src/box/vy_range.h
index 2ca19a1c..60ce8d96 100644
--- a/src/box/vy_range.h
+++ b/src/box/vy_range.h
@@ -106,6 +106,15 @@ struct vy_range {
 	 * how we  decide how many runs to compact next time.
 	 */
 	int compact_priority;
+	/** Number of statements that needs to be compacted. */
+	struct vy_stmt_counter compact_count;
+	/**
+	 * Set if compaction doesn't manage to keep this LSM tree
+	 * in a good shape, because there are too many dumps.
+	 * Updated by vy_range_update_compact_priority() along
+	 * with compaction priority.
+	 */
+	bool in_compaction_debt;
 	/**
 	 * If this flag is set, the range must be scheduled for
 	 * major compaction, i.e. its compact_priority must be
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index a1ae3f54..580c3129 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -1136,8 +1136,8 @@ vy_task_dump_complete(struct vy_task *task)
 		slice = new_slices[i];
 		vy_lsm_unacct_range(lsm, range);
 		vy_range_add_slice(range, slice);
-		vy_lsm_acct_range(lsm, range);
 		vy_range_update_compact_priority(range, &lsm->opts);
+		vy_lsm_acct_range(lsm, range);
 		if (!vy_range_is_scheduled(range))
 			vy_range_heap_update(&lsm->range_heap,
 					     &range->heap_node);
@@ -1363,6 +1363,11 @@ err:
 static int
 vy_task_compact_execute(struct vy_task *task)
 {
+	struct errinj *errinj = errinj(ERRINJ_VY_COMPACTION_DELAY, ERRINJ_BOOL);
+	if (errinj != NULL && errinj->bparam) {
+		while (errinj->bparam)
+			fiber_sleep(0.01);
+	}
 	return vy_task_write_run(task);
 }
 
@@ -1487,8 +1492,8 @@ vy_task_compact_complete(struct vy_task *task)
 	}
 	range->n_compactions++;
 	range->version++;
-	vy_lsm_acct_range(lsm, range);
 	vy_range_update_compact_priority(range, &lsm->opts);
+	vy_lsm_acct_range(lsm, range);
 	lsm->stat.disk.compact.count++;
 
 	/*
diff --git a/src/box/vy_stat.h b/src/box/vy_stat.h
index ca52c4d3..99159022 100644
--- a/src/box/vy_stat.h
+++ b/src/box/vy_stat.h
@@ -32,6 +32,7 @@
  */
 
 #include <stdint.h>
+#include <string.h>
 
 #include "latency.h"
 #include "tuple.h"
@@ -101,15 +102,6 @@ struct vy_txw_iterator_stat {
 	struct vy_stmt_counter get;
 };
 
-/** Dump/compaction statistics. */
-struct vy_compact_stat {
-	int32_t count;
-	/** Number of input statements. */
-	struct vy_stmt_counter in;
-	/** Number of output statements. */
-	struct vy_stmt_counter out;
-};
-
 /** LSM tree statistics. */
 struct vy_lsm_stat {
 	/** Number of lookups in the LSM tree. */
@@ -141,9 +133,28 @@ struct vy_lsm_stat {
 		/** Run iterator statistics. */
 		struct vy_run_iterator_stat iterator;
 		/** Dump statistics. */
-		struct vy_compact_stat dump;
+		struct {
+			int32_t count;
+			/** Number of input statements. */
+			struct vy_stmt_counter in;
+			/** Number of output statements. */
+			struct vy_stmt_counter out;
+		} dump;
 		/** Compaction statistics. */
-		struct vy_compact_stat compact;
+		struct {
+			int32_t count;
+			/** Number of input statements. */
+			struct vy_stmt_counter in;
+			/** Number of output statements. */
+			struct vy_stmt_counter out;
+			/** Number of statements awaiting compaction. */
+			struct vy_stmt_counter queue;
+			/**
+			 * Number of statements that must be compacted
+			 * to restore the LSM tree shape.
+			 */
+			struct vy_stmt_counter debt;
+		} compact;
 	} disk;
 	/** TX write set statistics. */
 	struct {
@@ -199,6 +210,12 @@ vy_lsm_stat_destroy(struct vy_lsm_stat *stat)
 }
 
 static inline void
+vy_stmt_counter_reset(struct vy_stmt_counter *c)
+{
+	memset(c, 0, sizeof(*c));
+}
+
+static inline void
 vy_stmt_counter_acct_tuple(struct vy_stmt_counter *c,
 			   const struct tuple *tuple)
 {
diff --git a/src/errinj.h b/src/errinj.h
index b6d4a4c9..84a1fbb5 100644
--- a/src/errinj.h
+++ b/src/errinj.h
@@ -120,6 +120,7 @@ struct errinj {
 	_(ERRINJ_VY_INDEX_FILE_RENAME, ERRINJ_BOOL, {.bparam = false}) \
 	_(ERRINJ_RELAY_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
 	_(ERRINJ_WAL_BREAK_LSN, ERRINJ_INT, {.iparam = -1}) \
+	_(ERRINJ_VY_COMPACTION_DELAY, ERRINJ_BOOL, {.bparam = false}) \
 
 ENUM0(errinj_id, ERRINJ_LIST);
 extern struct errinj errinjs[];
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 8dae7614..81087900 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -58,6 +58,8 @@ errinj.info()
     state: 0
   ERRINJ_XLOG_META:
     state: false
+  ERRINJ_SNAP_COMMIT_DELAY:
+    state: false
   ERRINJ_WAL_BREAK_LSN:
     state: -1
   ERRINJ_WAL_WRITE_DISK:
@@ -74,7 +76,7 @@ errinj.info()
     state: false
   ERRINJ_RELAY_FINAL_JOIN:
     state: false
-  ERRINJ_SNAP_COMMIT_DELAY:
+  ERRINJ_VY_COMPACTION_DELAY:
     state: false
   ERRINJ_RELAY_FINAL_SLEEP:
     state: false
diff --git a/test/vinyl/errinj.result b/test/vinyl/errinj.result
index 17e4dc8c..7b880030 100644
--- a/test/vinyl/errinj.result
+++ b/test/vinyl/errinj.result
@@ -2118,3 +2118,132 @@ s:select()
 s:drop()
 ---
 ...
+-- Check compact.queue and compact.debt statistics.
+s = box.schema.space.create('test', {engine = 'vinyl'})
+---
+...
+i = s:create_index('pk', {run_count_per_level = 2})
+---
+...
+function dump() for i = 1, 10 do s:replace{i} end box.snapshot() end
+---
+...
+dump()
+---
+...
+dump()
+---
+...
+i:stat().disk.compact.queue -- none
+---
+- rows: 0
+  bytes: 0
+...
+i:stat().disk.compact.debt -- none
+---
+- rows: 0
+  bytes: 0
+...
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+---
+- true
+...
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+---
+- true
+...
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', true)
+---
+- ok
+...
+dump()
+---
+...
+i:stat().disk.compact.queue -- 30 statements
+---
+- rows: 30
+  bytes: 471
+...
+i:stat().disk.compact.debt -- none
+---
+- rows: 0
+  bytes: 0
+...
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+---
+- true
+...
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+---
+- true
+...
+dump()
+---
+...
+i:stat().disk.compact.queue -- 40 statements
+---
+- rows: 40
+  bytes: 628
+...
+i:stat().disk.compact.debt -- none
+---
+- rows: 0
+  bytes: 0
+...
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+---
+- true
+...
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+---
+- true
+...
+dump()
+---
+...
+i:stat().disk.compact.queue -- 50 statements
+---
+- rows: 50
+  bytes: 785
+...
+i:stat().disk.compact.debt -- 50 statements
+---
+- rows: 50
+  bytes: 785
+...
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+---
+- true
+...
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+---
+- true
+...
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', false)
+---
+- ok
+...
+while i:stat().disk.compact.count < 2 do fiber.sleep(0.01) end
+---
+...
+i:stat().disk.compact.queue -- none
+---
+- rows: 0
+  bytes: 0
+...
+i:stat().disk.compact.debt -- none
+---
+- rows: 0
+  bytes: 0
+...
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+---
+- true
+...
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+---
+- true
+...
+s:drop()
+---
+...
diff --git a/test/vinyl/errinj.test.lua b/test/vinyl/errinj.test.lua
index 1b02c01c..9037bfad 100644
--- a/test/vinyl/errinj.test.lua
+++ b/test/vinyl/errinj.test.lua
@@ -850,3 +850,37 @@ fiber.sleep(0)
 s:create_index('sk', {parts = {2, 'unsigned'}})
 s:select()
 s:drop()
+
+-- Check compact.queue and compact.debt statistics.
+s = box.schema.space.create('test', {engine = 'vinyl'})
+i = s:create_index('pk', {run_count_per_level = 2})
+function dump() for i = 1, 10 do s:replace{i} end box.snapshot() end
+dump()
+dump()
+i:stat().disk.compact.queue -- none
+i:stat().disk.compact.debt -- none
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', true)
+dump()
+i:stat().disk.compact.queue -- 30 statements
+i:stat().disk.compact.debt -- none
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+dump()
+i:stat().disk.compact.queue -- 40 statements
+i:stat().disk.compact.debt -- none
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+dump()
+i:stat().disk.compact.queue -- 50 statements
+i:stat().disk.compact.debt -- 50 statements
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+errinj.set('ERRINJ_VY_COMPACTION_DELAY', false)
+while i:stat().disk.compact.count < 2 do fiber.sleep(0.01) end
+i:stat().disk.compact.queue -- none
+i:stat().disk.compact.debt -- none
+i:stat().disk.compact.queue.bytes == box.stat.vinyl().disk.compact_queue
+i:stat().disk.compact.debt.bytes == box.stat.vinyl().disk.compact_debt
+s:drop()
diff --git a/test/vinyl/info.result b/test/vinyl/info.result
index 36fa732c..556f5eca 100644
--- a/test/vinyl/info.result
+++ b/test/vinyl/info.result
@@ -163,11 +163,17 @@ istat()
         rows: 0
         bytes: 0
     compact:
+      out:
+        rows: 0
+        bytes: 0
       in:
         rows: 0
         bytes: 0
       count: 0
-      out:
+      debt:
+        rows: 0
+        bytes: 0
+      queue:
         rows: 0
         bytes: 0
     iterator:
@@ -213,6 +219,8 @@ gstat()
 - disk:
     dump_total: 0
     data_size: 0
+    compact_debt: 0
+    compact_queue: 0
     data_files: 0
     compact_total: 0
     index_size: 0
@@ -976,11 +984,17 @@ istat()
         rows: 0
         bytes: 0
     compact:
+      out:
+        rows: 0
+        bytes: 0
       in:
         rows: 0
         bytes: 0
       count: 0
-      out:
+      debt:
+        rows: 0
+        bytes: 0
+      queue:
         rows: 0
         bytes: 0
     iterator:
@@ -1026,6 +1040,8 @@ gstat()
 - disk:
     dump_total: 0
     data_size: 104300
+    compact_debt: 0
+    compact_queue: 0
     data_files: 2
     compact_total: 0
     index_size: 1190
-- 
2.11.0




More information about the Tarantool-patches mailing list