* [PATCH v3 1/4] vinyl: introduce quota consumer types
From: Vladimir Davydov @ 2019-02-12 17:26 UTC
To: kostja; +Cc: tarantool-patches
Currently, we only limit the quota consumption rate so that writers
won't hit the hard limit before a memory dump is complete. However,
this isn't enough, because we also need to take compaction into
account: if it doesn't keep up with dumps, read and space
amplification will grow uncontrollably.

The problem is that compaction may be a quota consumer itself, as it
may generate deferred DELETE statements for secondary indexes. We
can't ignore the quota there completely, because if we did, we might
hit the memory limit and stall all writers, which is unacceptable.
At the same time, compaction jobs must ignore the rate limit that is
imposed to make sure compaction keeps up with dumps; otherwise
compaction wouldn't benefit from such throttling.
To tackle this problem, this patch introduces the concept of quota
consumer types and resources. vy_quota now maintains one rate limit
per resource and one wait queue per consumer type. There are two
consumer types, compaction jobs and usual transactions, and two
resources managed by vy_quota, disk and memory. The memory-based
rate limit ensures that transactions won't hit the hard memory limit
and stall before a memory dump is complete. It is respected by all
consumer types. The disk-based rate limit is supposed to be set when
compaction doesn't keep up with dumps. It only applies to usual
transactions and is ignored by compaction jobs.
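For illustration, this is roughly how call sites are expected to tag
their quota requests after this patch (a condensed sketch; the real
call sites are in the vinyl.c hunks below):

    /* A usual transaction respects both the memory-based and the
     * disk-based rate limits. */
    if (vy_quota_use(&env->quota, VY_QUOTA_CONSUMER_TX,
                     tx->write_size, timeout) != 0)
            return -1;

    /* A compaction job generating deferred DELETEs is only subject
     * to memory-based throttling, never to the disk-based limit. */
    vy_quota_wait(&env->quota, VY_QUOTA_CONSUMER_COMPACTION);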
Since there are now two wait queues, we need to balance wakeups
between them when consumers in both queues are ready to proceed. To
prevent starvation, we maintain a monotonically growing counter and
assign its value (a ticket) to each consumer that is put to sleep.
When both queues are ready, we wake up the consumer with the lowest
ticket, i.e. the one that has waited the longest.
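The wakeup policy boils down to the following (a simplified sketch of
vy_quota_signal() from the patch below; first_waiter() here stands in
for rlist_first_entry() on a non-empty queue):

    struct vy_quota_wait_node *oldest = NULL;
    for (int i = 0; i < vy_quota_consumer_type_MAX; i++) {
            struct vy_quota_wait_node *n = first_waiter(&q->wait_queue[i]);
            /* Skip empty queues and waiters that would have to go
             * back to sleep right away. */
            if (n == NULL || !vy_quota_may_use(q, i, n->size))
                    continue;
            /* Prefer the waiter with the lowest ticket, i.e. the
             * one that has been waiting the longest. */
            if (oldest == NULL || n->ticket < oldest->ticket)
                    oldest = n;
    }
    if (oldest != NULL)
            fiber_wakeup(oldest->fiber);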
Note, this patch doesn't implement the disk-based throttling logic
in the regulator module. That is left for future work.
Needed for #3721
---
src/box/vinyl.c | 30 ++++++-----
src/box/vy_quota.c | 133 ++++++++++++++++++++++++++++++++++++++-----------
src/box/vy_quota.h | 91 +++++++++++++++++++++++++++------
src/box/vy_regulator.c | 12 +++--
4 files changed, 208 insertions(+), 58 deletions(-)
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 2815a7fd..22fe135b 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -2347,7 +2347,8 @@ vinyl_engine_prepare(struct engine *engine, struct txn *txn)
* the transaction to be sent to read view or aborted, we call
* it before checking for conflicts.
*/
- if (vy_quota_use(&env->quota, tx->write_size, timeout) != 0)
+ if (vy_quota_use(&env->quota, VY_QUOTA_CONSUMER_TX,
+ tx->write_size, timeout) != 0)
return -1;
size_t mem_used_before = lsregion_used(&env->mem_env.allocator);
@@ -2356,8 +2357,8 @@ vinyl_engine_prepare(struct engine *engine, struct txn *txn)
size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
assert(mem_used_after >= mem_used_before);
- vy_quota_adjust(&env->quota, tx->write_size,
- mem_used_after - mem_used_before);
+ vy_quota_adjust(&env->quota, VY_QUOTA_CONSUMER_TX,
+ tx->write_size, mem_used_after - mem_used_before);
vy_regulator_check_dump_watermark(&env->regulator);
return rc;
}
@@ -2382,7 +2383,8 @@ vinyl_engine_commit(struct engine *engine, struct txn *txn)
size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
assert(mem_used_after >= mem_used_before);
/* We can't abort the transaction at this point, use force. */
- vy_quota_force_use(&env->quota, mem_used_after - mem_used_before);
+ vy_quota_force_use(&env->quota, VY_QUOTA_CONSUMER_TX,
+ mem_used_after - mem_used_before);
vy_regulator_check_dump_watermark(&env->regulator);
txn->engine_tx = NULL;
@@ -3194,7 +3196,8 @@ vinyl_space_apply_initial_join_row(struct space *space, struct request *request)
* quota accounting.
*/
size_t reserved = tx->write_size;
- if (vy_quota_use(&env->quota, reserved, TIMEOUT_INFINITY) != 0)
+ if (vy_quota_use(&env->quota, VY_QUOTA_CONSUMER_TX,
+ reserved, TIMEOUT_INFINITY) != 0)
unreachable();
size_t mem_used_before = lsregion_used(&env->mem_env.allocator);
@@ -3213,7 +3216,7 @@ vinyl_space_apply_initial_join_row(struct space *space, struct request *request)
size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
assert(mem_used_after >= mem_used_before);
size_t used = mem_used_after - mem_used_before;
- vy_quota_adjust(&env->quota, reserved, used);
+ vy_quota_adjust(&env->quota, VY_QUOTA_CONSUMER_TX, reserved, used);
vy_regulator_check_dump_watermark(&env->regulator);
return rc;
}
@@ -3534,7 +3537,7 @@ vy_squash_process(struct vy_squash *squash)
* so there's no need in invalidating the cache.
*/
vy_mem_commit_stmt(mem, region_stmt);
- vy_quota_force_use(&env->quota,
+ vy_quota_force_use(&env->quota, VY_QUOTA_CONSUMER_TX,
mem_used_after - mem_used_before);
vy_regulator_check_dump_watermark(&env->regulator);
}
@@ -4010,9 +4013,10 @@ vy_build_insert_tuple(struct vy_env *env, struct vy_lsm *lsm,
/* Consume memory quota. Throttle if it is exceeded. */
size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
assert(mem_used_after >= mem_used_before);
- vy_quota_force_use(&env->quota, mem_used_after - mem_used_before);
+ vy_quota_force_use(&env->quota, VY_QUOTA_CONSUMER_TX,
+ mem_used_after - mem_used_before);
vy_regulator_check_dump_watermark(&env->regulator);
- vy_quota_wait(&env->quota);
+ vy_quota_wait(&env->quota, VY_QUOTA_CONSUMER_TX);
return rc;
}
@@ -4138,7 +4142,8 @@ vy_build_recover(struct vy_env *env, struct vy_lsm *lsm, struct vy_lsm *pk)
mem_used_after = lsregion_used(&env->mem_env.allocator);
assert(mem_used_after >= mem_used_before);
- vy_quota_force_use(&env->quota, mem_used_after - mem_used_before);
+ vy_quota_force_use(&env->quota, VY_QUOTA_CONSUMER_TX,
+ mem_used_after - mem_used_before);
return rc;
}
@@ -4354,7 +4359,7 @@ vy_deferred_delete_on_replace(struct trigger *trigger, void *event)
*/
struct vy_env *env = vy_env(space->engine);
if (is_first_statement)
- vy_quota_wait(&env->quota);
+ vy_quota_wait(&env->quota, VY_QUOTA_CONSUMER_COMPACTION);
/* Create the deferred DELETE statement. */
struct vy_lsm *pk = vy_lsm(space->index[0]);
@@ -4441,7 +4446,8 @@ vy_deferred_delete_on_replace(struct trigger *trigger, void *event)
}
size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
assert(mem_used_after >= mem_used_before);
- vy_quota_force_use(&env->quota, mem_used_after - mem_used_before);
+ vy_quota_force_use(&env->quota, VY_QUOTA_CONSUMER_COMPACTION,
+ mem_used_after - mem_used_before);
vy_regulator_check_dump_watermark(&env->regulator);
tuple_unref(delete);
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 07cd5856..dc452adb 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -52,6 +52,45 @@
static const double VY_QUOTA_TIMER_PERIOD = 0.1;
/**
+ * Bit mask of resources used by a particular consumer type.
+ */
+static unsigned
+vy_quota_consumer_resource_map[] = {
+ /**
+ * Transaction throttling pursues two goals. First, it is
+ * capping memory consumption rate so that the hard memory
+ * limit will not be hit before memory dump has completed
+ * (memory-based throttling). Second, we must make sure
+ * that compaction jobs keep up with dumps to keep read and
+ * space amplification within bounds (disk-based throttling).
+ * Transactions ought to respect them both.
+ */
+ [VY_QUOTA_CONSUMER_TX] = (1 << VY_QUOTA_RESOURCE_DISK) |
+ (1 << VY_QUOTA_RESOURCE_MEMORY),
+ /**
+ * Compaction jobs may need some quota too, because they
+ * may generate deferred DELETEs for secondary indexes.
+ * Apparently, we must not impose the rate limit that is
+ * supposed to speed up compaction on them (disk-based),
+ * however they still have to respect memory-based throttling
+ * to avoid long stalls.
+ */
+ [VY_QUOTA_CONSUMER_COMPACTION] = (1 << VY_QUOTA_RESOURCE_MEMORY),
+};
+
+/**
+ * Check if the rate limit corresponding to resource @resource_type
+ * should be applied to a consumer of type @consumer_type.
+ */
+static inline bool
+vy_rate_limit_is_applicable(enum vy_quota_consumer_type consumer_type,
+ enum vy_quota_resource_type resource_type)
+{
+ return (vy_quota_consumer_resource_map[consumer_type] &
+ (1 << resource_type)) != 0;
+}
+
+/**
* Return true if the requested amount of memory may be consumed
* right now, false if consumers have to wait.
*
@@ -60,7 +99,8 @@ static const double VY_QUOTA_TIMER_PERIOD = 0.1;
* it can start memory reclaim immediately.
*/
static inline bool
-vy_quota_may_use(struct vy_quota *q, size_t size)
+vy_quota_may_use(struct vy_quota *q, enum vy_quota_consumer_type type,
+ size_t size)
{
if (!q->is_enabled)
return true;
@@ -68,8 +108,12 @@ vy_quota_may_use(struct vy_quota *q, size_t size)
q->quota_exceeded_cb(q);
return false;
}
- if (!vy_rate_limit_may_use(&q->rate_limit))
- return false;
+ for (int i = 0; i < vy_quota_resource_type_MAX; i++) {
+ struct vy_rate_limit *rl = &q->rate_limit[i];
+ if (vy_rate_limit_is_applicable(type, i) &&
+ !vy_rate_limit_may_use(rl))
+ return false;
+ }
return true;
}
@@ -77,10 +121,15 @@ vy_quota_may_use(struct vy_quota *q, size_t size)
* Consume the given amount of memory without checking the limit.
*/
static inline void
-vy_quota_do_use(struct vy_quota *q, size_t size)
+vy_quota_do_use(struct vy_quota *q, enum vy_quota_consumer_type type,
+ size_t size)
{
q->used += size;
- vy_rate_limit_use(&q->rate_limit, size);
+ for (int i = 0; i < vy_quota_resource_type_MAX; i++) {
+ struct vy_rate_limit *rl = &q->rate_limit[i];
+ if (vy_rate_limit_is_applicable(type, i))
+ vy_rate_limit_use(rl, size);
+ }
}
/**
@@ -88,11 +137,16 @@ vy_quota_do_use(struct vy_quota *q, size_t size)
* This function is an exact opposite of vy_quota_do_use().
*/
static inline void
-vy_quota_do_unuse(struct vy_quota *q, size_t size)
+vy_quota_do_unuse(struct vy_quota *q, enum vy_quota_consumer_type type,
+ size_t size)
{
assert(q->used >= size);
q->used -= size;
- vy_rate_limit_unuse(&q->rate_limit, size);
+ for (int i = 0; i < vy_quota_resource_type_MAX; i++) {
+ struct vy_rate_limit *rl = &q->rate_limit[i];
+ if (vy_rate_limit_is_applicable(type, i))
+ vy_rate_limit_unuse(rl, size);
+ }
}
/**
@@ -112,17 +166,31 @@ vy_quota_check_limit(struct vy_quota *q)
static void
vy_quota_signal(struct vy_quota *q)
{
- if (!rlist_empty(&q->wait_queue)) {
+ /*
+ * To prevent starvation, wake up a consumer that has
+ * waited most irrespective of its type.
+ */
+ struct vy_quota_wait_node *oldest = NULL;
+ for (int i = 0; i < vy_quota_consumer_type_MAX; i++) {
+ struct rlist *wq = &q->wait_queue[i];
+ if (rlist_empty(wq))
+ continue;
+
struct vy_quota_wait_node *n;
- n = rlist_first_entry(&q->wait_queue,
- struct vy_quota_wait_node, in_wait_queue);
+ n = rlist_first_entry(wq, struct vy_quota_wait_node,
+ in_wait_queue);
/*
* No need in waking up a consumer if it will have
* to go back to sleep immediately.
*/
- if (vy_quota_may_use(q, n->size))
- fiber_wakeup(n->fiber);
+ if (!vy_quota_may_use(q, i, n->size))
+ continue;
+
+ if (oldest == NULL || oldest->ticket > n->ticket)
+ oldest = n;
}
+ if (oldest != NULL)
+ fiber_wakeup(oldest->fiber);
}
static void
@@ -133,7 +201,8 @@ vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
struct vy_quota *q = timer->data;
- vy_rate_limit_refill(&q->rate_limit, VY_QUOTA_TIMER_PERIOD);
+ for (int i = 0; i < vy_quota_resource_type_MAX; i++)
+ vy_rate_limit_refill(&q->rate_limit[i], VY_QUOTA_TIMER_PERIOD);
vy_quota_signal(q);
}
@@ -146,8 +215,11 @@ vy_quota_create(struct vy_quota *q, size_t limit,
q->used = 0;
q->too_long_threshold = TIMEOUT_INFINITY;
q->quota_exceeded_cb = quota_exceeded_cb;
- rlist_create(&q->wait_queue);
- vy_rate_limit_create(&q->rate_limit);
+ q->wait_ticket = 0;
+ for (int i = 0; i < vy_quota_consumer_type_MAX; i++)
+ rlist_create(&q->wait_queue[i]);
+ for (int i = 0; i < vy_quota_resource_type_MAX; i++)
+ vy_rate_limit_create(&q->rate_limit[i]);
ev_timer_init(&q->timer, vy_quota_timer_cb, 0, VY_QUOTA_TIMER_PERIOD);
q->timer.data = q;
}
@@ -176,15 +248,17 @@ vy_quota_set_limit(struct vy_quota *q, size_t limit)
}
void
-vy_quota_set_rate_limit(struct vy_quota *q, size_t rate)
+vy_quota_set_rate_limit(struct vy_quota *q, enum vy_quota_resource_type type,
+ size_t rate)
{
- vy_rate_limit_set(&q->rate_limit, rate);
+ vy_rate_limit_set(&q->rate_limit[type], rate);
}
void
-vy_quota_force_use(struct vy_quota *q, size_t size)
+vy_quota_force_use(struct vy_quota *q, enum vy_quota_consumer_type type,
+ size_t size)
{
- vy_quota_do_use(q, size);
+ vy_quota_do_use(q, type, size);
vy_quota_check_limit(q);
}
@@ -201,7 +275,8 @@ vy_quota_release(struct vy_quota *q, size_t size)
}
int
-vy_quota_use(struct vy_quota *q, size_t size, double timeout)
+vy_quota_use(struct vy_quota *q, enum vy_quota_consumer_type type,
+ size_t size, double timeout)
{
/*
* Fail early if the configured memory limit never allows
@@ -212,8 +287,8 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
return -1;
}
- if (vy_quota_may_use(q, size)) {
- vy_quota_do_use(q, size);
+ if (vy_quota_may_use(q, type, size)) {
+ vy_quota_do_use(q, type, size);
return 0;
}
@@ -224,8 +299,9 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
struct vy_quota_wait_node wait_node = {
.fiber = fiber(),
.size = size,
+ .ticket = ++q->wait_ticket,
};
- rlist_add_tail_entry(&q->wait_queue, &wait_node, in_wait_queue);
+ rlist_add_tail_entry(&q->wait_queue[type], &wait_node, in_wait_queue);
do {
double now = ev_monotonic_now(loop());
@@ -235,7 +311,7 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
diag_set(ClientError, ER_VY_QUOTA_TIMEOUT);
return -1;
}
- } while (!vy_quota_may_use(q, size));
+ } while (!vy_quota_may_use(q, type, size));
rlist_del_entry(&wait_node, in_wait_queue);
@@ -246,7 +322,7 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
wait_time);
}
- vy_quota_do_use(q, size);
+ vy_quota_do_use(q, type, size);
/*
* Blocked consumers are awaken one by one to preserve
* the order they were put to sleep. It's a responsibility
@@ -258,14 +334,15 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
}
void
-vy_quota_adjust(struct vy_quota *q, size_t reserved, size_t used)
+vy_quota_adjust(struct vy_quota *q, enum vy_quota_consumer_type type,
+ size_t reserved, size_t used)
{
if (reserved > used) {
- vy_quota_do_unuse(q, reserved - used);
+ vy_quota_do_unuse(q, type, reserved - used);
vy_quota_signal(q);
}
if (reserved < used) {
- vy_quota_do_use(q, used - reserved);
+ vy_quota_do_use(q, type, used - reserved);
vy_quota_check_limit(q);
}
}
diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h
index 79755e89..d90922b2 100644
--- a/src/box/vy_quota.h
+++ b/src/box/vy_quota.h
@@ -110,6 +110,50 @@ vy_rate_limit_refill(struct vy_rate_limit *rl, double time)
typedef void
(*vy_quota_exceeded_f)(struct vy_quota *quota);
+/**
+ * Apart from memory usage accounting and limiting, vy_quota is
+ * responsible for consumption rate limiting (aka throttling).
+ * There are multiple rate limits, each of which is associated
+ * with a particular resource type. Different kinds of consumers
+ * respect different limits. The following enumeration defines
+ * the resource types for which vy_quota enables throttling.
+ *
+ * See also vy_quota_consumer_resource_map.
+ */
+enum vy_quota_resource_type {
+ /**
+ * The goal of disk-based throttling is to keep LSM trees
+ * in a good shape so that read and space amplification
+ * stay within bounds. It is enabled when compaction does
+ * not keep up with dumps.
+ */
+ VY_QUOTA_RESOURCE_DISK = 0,
+ /**
+ * Memory-based throttling is needed to avoid long stalls
+ * caused by hitting the hard memory limit. It is set so
+ * that by the time the hard limit is hit, the last memory
+ * dump will have completed.
+ */
+ VY_QUOTA_RESOURCE_MEMORY = 1,
+
+ vy_quota_resource_type_MAX,
+};
+
+/**
+ * Quota consumer type determines how a quota consumer will be
+ * rate limited.
+ *
+ * See also vy_quota_consumer_resource_map.
+ */
+enum vy_quota_consumer_type {
+ /** Transaction processor. */
+ VY_QUOTA_CONSUMER_TX = 0,
+ /** Compaction job. */
+ VY_QUOTA_CONSUMER_COMPACTION = 1,
+
+ vy_quota_consumer_type_MAX,
+};
+
struct vy_quota_wait_node {
/** Link in vy_quota::wait_queue. */
struct rlist in_wait_queue;
@@ -117,6 +161,11 @@ struct vy_quota_wait_node {
struct fiber *fiber;
/** Amount of requested memory. */
size_t size;
+ /**
+ * Ticket assigned to this fiber when it was put to
+ * sleep, see vy_quota::wait_ticket for more details.
+ */
+ int64_t ticket;
};
/**
@@ -144,13 +193,23 @@ struct vy_quota {
*/
vy_quota_exceeded_f quota_exceeded_cb;
/**
- * Queue of consumers waiting for quota, linked by
- * vy_quota_wait_node::state. Newcomers are added
- * to the tail.
+ * Monotonically growing counter assigned to consumers
+ * waiting for quota. It is used for balancing wakeups
+ * among wait queues: if two fibers from different wait
+ * queues may proceed, the one with the lowest ticket
+ * will be picked.
+ *
+ * See also vy_quota_wait_node::ticket.
*/
- struct rlist wait_queue;
- /** Rate limit state. */
- struct vy_rate_limit rate_limit;
+ int64_t wait_ticket;
+ /**
+ * Queue of consumers waiting for quota, one per each
+ * consumer type, linked by vy_quota_wait_node::state.
+ * Newcomers are added to the tail.
+ */
+ struct rlist wait_queue[vy_quota_consumer_type_MAX];
+ /** Rate limit state, one per each resource type. */
+ struct vy_rate_limit rate_limit[vy_quota_resource_type_MAX];
/**
* Periodic timer that is used for refilling the rate
* limit value.
@@ -188,18 +247,20 @@ void
vy_quota_set_limit(struct vy_quota *q, size_t limit);
/**
- * Set the max rate at which quota may be consumed,
- * in bytes per second.
+ * Set the rate limit corresponding to the resource of the given
+ * type. The rate limit is given in bytes per second.
*/
void
-vy_quota_set_rate_limit(struct vy_quota *q, size_t rate);
+vy_quota_set_rate_limit(struct vy_quota *q, enum vy_quota_resource_type type,
+ size_t rate);
/**
* Consume @size bytes of memory. In contrast to vy_quota_use()
* this function does not throttle the caller.
*/
void
-vy_quota_force_use(struct vy_quota *q, size_t size);
+vy_quota_force_use(struct vy_quota *q, enum vy_quota_consumer_type type,
+ size_t size);
/**
* Release @size bytes of memory.
@@ -242,7 +303,8 @@ vy_quota_release(struct vy_quota *q, size_t size);
* account while estimating the size of a memory allocation.
*/
int
-vy_quota_use(struct vy_quota *q, size_t size, double timeout);
+vy_quota_use(struct vy_quota *q, enum vy_quota_consumer_type type,
+ size_t size, double timeout);
/**
* Adjust quota after allocating memory.
@@ -253,15 +315,16 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout);
* See also vy_quota_use().
*/
void
-vy_quota_adjust(struct vy_quota *q, size_t reserved, size_t used);
+vy_quota_adjust(struct vy_quota *q, enum vy_quota_consumer_type type,
+ size_t reserved, size_t used);
/**
* Block the caller until the quota is not exceeded.
*/
static inline void
-vy_quota_wait(struct vy_quota *q)
+vy_quota_wait(struct vy_quota *q, enum vy_quota_consumer_type type)
{
- vy_quota_use(q, 0, TIMEOUT_INFINITY);
+ vy_quota_use(q, type, 0, TIMEOUT_INFINITY);
}
#if defined(__cplusplus)
diff --git a/src/box/vy_regulator.c b/src/box/vy_regulator.c
index 2e09b93c..e14b01aa 100644
--- a/src/box/vy_regulator.c
+++ b/src/box/vy_regulator.c
@@ -101,7 +101,8 @@ vy_regulator_trigger_dump(struct vy_regulator *regulator)
size_t max_write_rate = (double)mem_left / (mem_used + 1) *
regulator->dump_bandwidth;
max_write_rate = MIN(max_write_rate, regulator->dump_bandwidth);
- vy_quota_set_rate_limit(quota, max_write_rate);
+ vy_quota_set_rate_limit(quota, VY_QUOTA_RESOURCE_MEMORY,
+ max_write_rate);
}
static void
@@ -202,7 +203,8 @@ void
vy_regulator_start(struct vy_regulator *regulator)
{
regulator->quota_used_last = regulator->quota->used;
- vy_quota_set_rate_limit(regulator->quota, regulator->dump_bandwidth);
+ vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
+ regulator->dump_bandwidth);
ev_timer_start(loop(), &regulator->timer);
}
@@ -253,7 +255,8 @@ vy_regulator_dump_complete(struct vy_regulator *regulator,
* limit to the dump bandwidth rather than disabling it
* completely.
*/
- vy_quota_set_rate_limit(regulator->quota, regulator->dump_bandwidth);
+ vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
+ regulator->dump_bandwidth);
}
void
@@ -263,5 +266,6 @@ vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max)
regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
if (max > 0 && regulator->dump_bandwidth > max)
regulator->dump_bandwidth = max;
- vy_quota_set_rate_limit(regulator->quota, regulator->dump_bandwidth);
+ vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
+ regulator->dump_bandwidth);
}
--
2.11.0
* [PATCH v3 4/4] vinyl: throttle tx to ensure compaction keeps up with dumps
From: Vladimir Davydov @ 2019-02-12 17:26 UTC
To: kostja; +Cc: tarantool-patches
Every byte of data written to a vinyl database eventually gets
compacted with data written to the database earlier. The ratio of the
amount of data actually written to disk to the amount of data written
to the database is called write amplification; for example, if
writing 1 GB to the database results in 5 GB written to disk, write
amplification is 5. Write amplification depends on the LSM tree
configuration and the workload parameters and varies in a wide range,
from 2-3 to 10-20 or even higher in some extreme cases. If the
database engine doesn't manage to write that extra data, the LSM tree
shape will get distorted, which will result in increased read and
space amplification, which, in turn, will slow down reads and waste
disk space. That's why it's so important to ensure the database
engine has enough compaction power.
One way to ensure that is to increase the number of compaction
threads by tuning the box.cfg.vinyl_write_threads configuration knob,
but one can't increase it beyond the capacity of the server running
the instance. So the database engine must throttle writes if it
detects that compaction threads are struggling to keep up. This patch
implements a very simple algorithm to achieve that: it keeps track of
recently observed write amplification and data compaction speed, uses
them to calculate the max transaction rate that the database engine
can handle while steadily maintaining the current level of write
amplification, and sets the rate limit to 0.75 of that so as to give
the engine enough room to increase write amplification if needed.
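In short, the limit is derived as (see the derivation in the comment
added to vy_regulator.c below):

    rate_limit = 0.75 * compaction_threads * dump_input / compaction_time

As a back-of-the-envelope illustration with made-up numbers: with 4
compaction threads, 1 GB of data dumped over the observation period
and 100 seconds of total compaction thread time spent over the same
period, the engine is assumed to sustain roughly 4 * 1 GB / 100 s =
40 MB/s of incoming transaction data, so the limit is set to
0.75 * 40 MB/s = 30 MB/s.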
The algorithm is obviously pessimistic: it undervalues the
transaction rate the database can handle after write amplification
has steadied. But this is compensated by its simplicity and stability
- there shouldn't be any abrupt drops or peaks in RPS due to its
decisions. Besides, it adapts fairly quickly to an increase in write
amplification as the database fills up. If one finds that the
algorithm is too cautious and undervalues the limit, it's easy to fix
by simply increasing the number of compaction threads - the rate
limit will scale proportionately if the system is underloaded.
The current value of the rate limit set by the algorithm is reported
by box.stat.vinyl() under the regulator.rate_limit section.
Thanks to @kostja for the great comment explaining the logic behind the
rate limiting algorithm.
Closes #3721
---
src/box/vinyl.c | 6 +++
src/box/vy_quota.c | 12 +++++
src/box/vy_quota.h | 6 +++
src/box/vy_regulator.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++--
src/box/vy_regulator.h | 27 ++++++++++
5 files changed, 191 insertions(+), 3 deletions(-)
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 22fe135b..947ca355 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -274,6 +274,8 @@ vy_info_append_regulator(struct vy_env *env, struct info_handler *h)
info_append_int(h, "write_rate", r->write_rate);
info_append_int(h, "dump_bandwidth", r->dump_bandwidth);
info_append_int(h, "dump_watermark", r->dump_watermark);
+ info_append_int(h, "rate_limit", vy_quota_get_rate_limit(r->quota,
+ VY_QUOTA_CONSUMER_TX));
info_table_end(h); /* regulator */
}
@@ -532,6 +534,7 @@ vinyl_engine_reset_stat(struct engine *engine)
memset(&xm->stat, 0, sizeof(xm->stat));
vy_scheduler_reset_stat(&env->scheduler);
+ vy_regulator_reset_stat(&env->regulator);
}
/** }}} Introspection */
@@ -2480,6 +2483,9 @@ vy_env_dump_complete_cb(struct vy_scheduler *scheduler,
*/
vy_regulator_dump_complete(&env->regulator, mem_dumped, dump_duration);
vy_quota_release(quota, mem_dumped);
+
+ vy_regulator_update_rate_limit(&env->regulator, &scheduler->stat,
+ scheduler->compaction_pool.size);
}
static struct vy_squash_queue *
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 075bfef1..035040a3 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -254,6 +254,18 @@ vy_quota_set_rate_limit(struct vy_quota *q, enum vy_quota_resource_type type,
vy_rate_limit_set(&q->rate_limit[type], rate);
}
+size_t
+vy_quota_get_rate_limit(struct vy_quota *q, enum vy_quota_consumer_type type)
+{
+ size_t rate = SIZE_MAX;
+ for (int i = 0; i < vy_quota_resource_type_MAX; i++) {
+ struct vy_rate_limit *rl = &q->rate_limit[i];
+ if (vy_rate_limit_is_applicable(type, i))
+ rate = MIN(rate, rl->rate);
+ }
+ return rate;
+}
+
void
vy_quota_force_use(struct vy_quota *q, enum vy_quota_consumer_type type,
size_t size)
diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h
index d90922b2..7ff98cc1 100644
--- a/src/box/vy_quota.h
+++ b/src/box/vy_quota.h
@@ -255,6 +255,12 @@ vy_quota_set_rate_limit(struct vy_quota *q, enum vy_quota_resource_type type,
size_t rate);
/**
+ * Return the rate limit applied to a consumer of the given type.
+ */
+size_t
+vy_quota_get_rate_limit(struct vy_quota *q, enum vy_quota_consumer_type type);
+
+/**
* Consume @size bytes of memory. In contrast to vy_quota_use()
* this function does not throttle the caller.
*/
diff --git a/src/box/vy_regulator.c b/src/box/vy_regulator.c
index e14b01aa..4288753f 100644
--- a/src/box/vy_regulator.c
+++ b/src/box/vy_regulator.c
@@ -34,6 +34,7 @@
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
+#include <string.h>
#include <tarantool_ev.h>
#include "fiber.h"
@@ -42,6 +43,7 @@
#include "trivia/util.h"
#include "vy_quota.h"
+#include "vy_stat.h"
/**
* Regulator timer period, in seconds.
@@ -73,6 +75,14 @@ static const size_t VY_DUMP_BANDWIDTH_DEFAULT = 10 * 1024 * 1024;
*/
static const size_t VY_DUMP_SIZE_ACCT_MIN = 1024 * 1024;
+/**
+ * Number of dumps to take into account for rate limit calculation.
+ * Shouldn't be too small to avoid uneven RPS. Shouldn't be too big
+ * either - otherwise the rate limit will adapt too slowly to workload
+ * changes. 100 feels like a good choice.
+ */
+static const int VY_RECENT_DUMP_COUNT = 100;
+
static void
vy_regulator_trigger_dump(struct vy_regulator *regulator)
{
@@ -182,6 +192,7 @@ vy_regulator_create(struct vy_regulator *regulator, struct vy_quota *quota,
100 * MB, 200 * MB, 300 * MB, 400 * MB, 500 * MB, 600 * MB,
700 * MB, 800 * MB, 900 * MB,
};
+ memset(regulator, 0, sizeof(*regulator));
regulator->dump_bandwidth_hist = histogram_new(dump_bandwidth_buckets,
lengthof(dump_bandwidth_buckets));
if (regulator->dump_bandwidth_hist == NULL)
@@ -192,11 +203,8 @@ vy_regulator_create(struct vy_regulator *regulator, struct vy_quota *quota,
ev_timer_init(&regulator->timer, vy_regulator_timer_cb, 0,
VY_REGULATOR_TIMER_PERIOD);
regulator->timer.data = regulator;
- regulator->write_rate = 0;
- regulator->quota_used_last = 0;
regulator->dump_bandwidth = VY_DUMP_BANDWIDTH_DEFAULT;
regulator->dump_watermark = SIZE_MAX;
- regulator->dump_in_progress = false;
}
void
@@ -269,3 +277,132 @@ vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max)
vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_MEMORY,
regulator->dump_bandwidth);
}
+
+void
+vy_regulator_reset_stat(struct vy_regulator *regulator)
+{
+ memset(&regulator->sched_stat_last, 0,
+ sizeof(regulator->sched_stat_last));
+}
+
+/*
+ * The goal of rate limiting is to ensure LSM trees stay close to
+ * their perfect shape, as defined by run_size_ratio. When dump rate
+ * is too high, we have to throttle database writes to ensure
+ * compaction can keep up with dumps. We can't deduce optimal dump
+ * bandwidth from LSM configuration, such as run_size_ratio or
+ * run_count_per_level, since different spaces or different indexes
+ * within a space can have different configuration settings. The
+ * workload can also vary significantly from space to space. So,
+ * when setting the limit, we have to consider dump and compaction
+ * activities of the database as a whole.
+ *
+ * To this end, we keep track of compaction bandwidth and write
+ * amplification of the entire database, across all LSM trees.
+ * The idea is simple: observe the current write amplification
+ * and compaction bandwidth, and set maximal write rate to a value
+ * somewhat below the implied limit, so as to make room for
+ * compaction to do more work if necessary.
+ *
+ * We use the following metrics to calculate the limit:
+ * - dump_output - number of bytes dumped to disk over the last
+ * observation period. The period itself is measured in dumps,
+ * not seconds, and is defined by constant VY_RECENT_DUMP_COUNT.
+ * - compaction_output - number of bytes produced by compaction
+ * over the same period.
+ * - compaction_rate - total compaction output, in bytes, divided
+ * by total time spent on doing compaction by compaction threads,
+ * both measured over the same observation period. This gives an
+ * estimate of the speed at which compaction can write output.
+ * In the real world this speed is dependent on the disk write
+ * throughput, number of dump threads, and actual dump rate, but
+ * given the goal of rate limiting is providing compaction with
+ * extra bandwidth, this metric is considered an accurate enough
+ * approximation of the disk bandwidth available to compaction.
+ *
+ * We calculate the compaction rate with the following formula:
+ *
+ * compaction_output
+ * compaction_rate = compaction_threads * -----------------
+ * compaction_time
+ *
+ * where compaction_threads represents the total number of available
+ * compaction threads and compaction_time is the total time, in
+ * seconds, spent by all threads doing compaction. You can look at
+ * the formula this way: compaction_output / compaction_time gives
+ * the average write speed of a single compaction thread, and by
+ * multiplying it by the number of compaction threads we get the
+ * compaction rate of the entire database.
+ *
+ * In an optimal system dump rate must be proportional to compaction
+ * rate and inverse to write amplification:
+ *
+ * dump_rate = compaction_rate / (write_amplification - 1)
+ *
+ * The latter can be obtained by dividing total output of compaction
+ * by total output of dumps over the observation period:
+ *
+ * dump_output + compaction_output
+ * write_amplification = ------------------------------- =
+ * dump_output
+ *
+ * = 1 + compaction_output / dump_output
+ *
+ * Putting this all together and taking into account data compaction
+ * during memory dump, we get for the max transaction rate:
+ *
+ * dump_input
+ * tx_rate = dump_rate * ----------- =
+ * dump_output
+ *
+ * compaction_output
+ * = compaction_threads * ----------------- *
+ * compaction_time
+ *
+ * dump_output dump_input
+ * * ----------------- * ----------- =
+ * compaction_output dump_output
+ *
+ * = compaction_threads * dump_input / compaction_time
+ *
+ * We set the rate limit to 0.75 of the approximated optimal to
+ * leave the database engine enough room needed to use more disk
+ * bandwidth for compaction if necessary. As soon as compaction gets
+ * enough disk bandwidth to keep LSM trees in optimal shape
+ * compaction speed becomes stable, as does write amplification.
+ */
+void
+vy_regulator_update_rate_limit(struct vy_regulator *regulator,
+ const struct vy_scheduler_stat *stat,
+ int compaction_threads)
+{
+ struct vy_scheduler_stat *last = &regulator->sched_stat_last;
+ struct vy_scheduler_stat *recent = &regulator->sched_stat_recent;
+
+ int32_t dump_count = stat->dump_count - last->dump_count;
+ int64_t dump_input = stat->dump_input - last->dump_input;
+ double compaction_time = stat->compaction_time - last->compaction_time;
+ *last = *stat;
+
+ if (dump_input < (ssize_t)VY_DUMP_SIZE_ACCT_MIN)
+ return;
+
+ recent->dump_count += dump_count;
+ recent->dump_input += dump_input;
+ recent->compaction_time += compaction_time;
+
+ double rate = 0.75 * compaction_threads * recent->dump_input /
+ recent->compaction_time;
+ vy_quota_set_rate_limit(regulator->quota, VY_QUOTA_RESOURCE_DISK,
+ MIN(rate, SIZE_MAX));
+
+ /*
+ * Periodically rotate statistics for quicker adaptation
+ * to workload changes.
+ */
+ if (recent->dump_count > VY_RECENT_DUMP_COUNT) {
+ recent->dump_count /= 2;
+ recent->dump_input /= 2;
+ recent->compaction_time /= 2;
+ }
+}
diff --git a/src/box/vy_regulator.h b/src/box/vy_regulator.h
index 0188da26..341f41df 100644
--- a/src/box/vy_regulator.h
+++ b/src/box/vy_regulator.h
@@ -35,6 +35,8 @@
#include <stddef.h>
#include <tarantool_ev.h>
+#include "vy_stat.h"
+
#if defined(__cplusplus)
extern "C" {
#endif /* defined(__cplusplus) */
@@ -107,6 +109,16 @@ struct vy_regulator {
* but vy_regulator_dump_complete() hasn't been called yet.
*/
bool dump_in_progress;
+ /**
+ * Snapshot of scheduler statistics taken at the time of
+ * the last rate limit update.
+ */
+ struct vy_scheduler_stat sched_stat_last;
+ /**
+ * Scheduler statistics for the most recent few dumps.
+ * Used for calculating the rate limit.
+ */
+ struct vy_scheduler_stat sched_stat_recent;
};
void
@@ -146,6 +158,21 @@ vy_regulator_dump_complete(struct vy_regulator *regulator,
void
vy_regulator_reset_dump_bandwidth(struct vy_regulator *regulator, size_t max);
+/**
+ * Called when global statistics are reset by box.stat.reset().
+ */
+void
+vy_regulator_reset_stat(struct vy_regulator *regulator);
+
+/**
+ * Set transaction rate limit so as to ensure that compaction
+ * will keep up with dumps.
+ */
+void
+vy_regulator_update_rate_limit(struct vy_regulator *regulator,
+ const struct vy_scheduler_stat *stat,
+ int compaction_threads);
+
#if defined(__cplusplus)
} /* extern "C" */
#endif /* defined(__cplusplus) */
--
2.11.0