* [PATCH 01/11] vinyl: merge vy_quota_release and vy_quota_update_dump_bandwidth
2018-09-20 9:34 [PATCH 00/11] vinyl: prepare for transaction throttling Vladimir Davydov
@ 2018-09-20 9:34 ` Vladimir Davydov
2018-09-20 9:34 ` [PATCH 02/11] vinyl: refactor quota use/unuse methods Vladimir Davydov
` (10 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Vladimir Davydov @ 2018-09-20 9:34 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
These functions are always called together when memory dump is complete.
Soon we will also have to update the throttle rate limit there. Let's
merge them now and call the resulting function vy_quota_dump to reflect
the fact that it's only called on dump by design.
While we are at it, remove vy_quota_dump_bandwidth() declaration as this
function isn't defined or used anywhere.
---
src/box/vinyl.c | 3 +--
src/box/vy_quota.c | 20 ++++++++++----------
src/box/vy_quota.h | 16 +++-------------
3 files changed, 14 insertions(+), 25 deletions(-)
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index b5904872..6cc5485b 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -2467,8 +2467,7 @@ vy_env_dump_complete_cb(struct vy_scheduler *scheduler,
size_t mem_used_after = lsregion_used(allocator);
assert(mem_used_after <= mem_used_before);
size_t mem_dumped = mem_used_before - mem_used_after;
- vy_quota_release(quota, mem_dumped);
- vy_quota_update_dump_bandwidth(quota, mem_dumped, dump_duration);
+ vy_quota_dump(quota, mem_dumped, dump_duration);
say_info("dumped %zu bytes in %.1f sec", mem_dumped, dump_duration);
}
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 51f0ba71..d3588b6e 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -170,9 +170,17 @@ vy_quota_set_limit(struct vy_quota *q, size_t limit)
}
void
-vy_quota_update_dump_bandwidth(struct vy_quota *q, size_t size,
- double duration)
+vy_quota_dump(struct vy_quota *q, size_t size, double duration)
{
+ /*
+ * Release quota and wake up the first fiber in
+ * the wait queue, if any.
+ */
+ assert(q->used >= size);
+ q->used -= size;
+ fiber_cond_signal(&q->cond);
+
+ /* Update dump bandwidth. */
if (duration > 0) {
histogram_collect(q->dump_bw_hist, size / duration);
/*
@@ -201,14 +209,6 @@ vy_quota_force_use(struct vy_quota *q, size_t size)
q->quota_exceeded_cb(q);
}
-void
-vy_quota_release(struct vy_quota *q, size_t size)
-{
- assert(q->used >= size);
- q->used -= size;
- fiber_cond_signal(&q->cond);
-}
-
int
vy_quota_use(struct vy_quota *q, size_t size, double timeout)
{
diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h
index 9ce53fc4..0fa852b1 100644
--- a/src/box/vy_quota.h
+++ b/src/box/vy_quota.h
@@ -120,19 +120,15 @@ vy_quota_destroy(struct vy_quota *q);
void
vy_quota_set_limit(struct vy_quota *q, size_t limit);
-/** Return dump bandwidth. */
-size_t
-vy_quota_dump_bandwidth(struct vy_quota *q);
-
/**
- * Update dump bandwidth.
+ * Function called on dump completion to release quota after
+ * freeing memory.
*
* @size: size of dumped memory.
* @duration: how long memory dump took.
*/
void
-vy_quota_update_dump_bandwidth(struct vy_quota *q, size_t size,
- double duration);
+vy_quota_dump(struct vy_quota *q, size_t size, double duration);
/**
* Reset dump bandwidth histogram and update initial estimate.
@@ -149,12 +145,6 @@ void
vy_quota_force_use(struct vy_quota *q, size_t size);
/**
- * Release @size bytes of memory.
- */
-void
-vy_quota_release(struct vy_quota *q, size_t size);
-
-/**
* Try to consume @size bytes of memory, throttle the caller
* if the limit is exceeded. @timeout specifies the maximal
* time to wait. Return 0 on success, -1 on timeout.
--
2.11.0
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 02/11] vinyl: refactor quota use/unuse methods
2018-09-20 9:34 [PATCH 00/11] vinyl: prepare for transaction throttling Vladimir Davydov
2018-09-20 9:34 ` [PATCH 01/11] vinyl: merge vy_quota_release and vy_quota_update_dump_bandwidth Vladimir Davydov
@ 2018-09-20 9:34 ` Vladimir Davydov
2018-09-20 9:34 ` [PATCH 03/11] vinyl: do not try to trigger dump if it is already in progress Vladimir Davydov
` (9 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Vladimir Davydov @ 2018-09-20 9:34 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
Some minor refactoring targeted at beautifying the code:
- Rename vy_quota_is_exceeded to vy_quota_may_use and pass the
requested amount of quota to it explicitly instead of incrementing
quota before calling this function and decrementing it after. Move the
definition closer to vy_quota_use, where this function is used.
- Implement quota consumption in vy_quota_use via vy_quota_force_use
so that there is only one function that increments quota.
- Factor vy_quota_unuse out of vy_quota_adjust. This function is
the opposite of vy_quota_force_use.
---
src/box/vy_quota.c | 55 +++++++++++++++++++++++++-----------------------------
1 file changed, 25 insertions(+), 30 deletions(-)
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index d3588b6e..8ead5c4b 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -68,16 +68,6 @@ static const size_t VY_DEFAULT_DUMP_BANDWIDTH = 10 * 1024 * 1024;
*/
enum { VY_DUMP_BANDWIDTH_PCT = 10 };
-/**
- * Returns true if the quota limit is exceeded and so consumers
- * have to wait.
- */
-static inline bool
-vy_quota_is_exceeded(struct vy_quota *q)
-{
- return q->used > q->limit;
-}
-
static void
vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
{
@@ -209,25 +199,29 @@ vy_quota_force_use(struct vy_quota *q, size_t size)
q->quota_exceeded_cb(q);
}
+/**
+ * Returns true if the requested amount of quota may be consumed,
+ * or false if consumers have to wait.
+ */
+static bool
+vy_quota_may_use(struct vy_quota *q, size_t size)
+{
+ return q->used + size <= q->limit;
+}
+
int
vy_quota_use(struct vy_quota *q, size_t size, double timeout)
{
- q->used += size;
- q->use_curr += size;
- if (vy_quota_is_exceeded(q)) {
+ if (!vy_quota_may_use(q, size)) {
/* Wait for quota. */
double start_time = ev_monotonic_now(loop());
double deadline = start_time + timeout;
do {
q->quota_exceeded_cb(q);
- q->used -= size;
- q->use_curr -= size;
if (fiber_cond_wait_deadline(&q->cond, deadline) != 0)
return -1; /* timed out */
- q->used += size;
- q->use_curr += size;
- } while (vy_quota_is_exceeded(q));
+ } while (!vy_quota_may_use(q, size));
double wait_time = ev_monotonic_now(loop()) - start_time;
if (wait_time > q->too_long_threshold) {
@@ -240,24 +234,25 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
*/
fiber_cond_signal(&q->cond);
}
- if (q->used >= q->watermark)
- q->quota_exceeded_cb(q);
+ vy_quota_force_use(q, size);
return 0;
}
+static void
+vy_quota_unuse(struct vy_quota *q, size_t size)
+{
+ assert(q->used >= size);
+ q->used -= size;
+ /* use_curr could have been reset on timeout. */
+ q->use_curr -= MIN(q->use_curr, size);
+ fiber_cond_signal(&q->cond);
+}
+
void
vy_quota_adjust(struct vy_quota *q, size_t reserved, size_t used)
{
- if (reserved > used) {
- size_t excess = reserved - used;
- assert(q->used >= excess);
- q->used -= excess;
- if (q->use_curr >= excess)
- q->use_curr -= excess;
- else /* was reset by timeout */
- q->use_curr = 0;
- fiber_cond_signal(&q->cond);
- }
+ if (reserved > used)
+ vy_quota_unuse(q, reserved - used);
if (reserved < used)
vy_quota_force_use(q, used - reserved);
}
--
2.11.0
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 03/11] vinyl: do not try to trigger dump if it is already in progress
2018-09-20 9:34 [PATCH 00/11] vinyl: prepare for transaction throttling Vladimir Davydov
2018-09-20 9:34 ` [PATCH 01/11] vinyl: merge vy_quota_release and vy_quota_update_dump_bandwidth Vladimir Davydov
2018-09-20 9:34 ` [PATCH 02/11] vinyl: refactor quota use/unuse methods Vladimir Davydov
@ 2018-09-20 9:34 ` Vladimir Davydov
2018-09-20 9:34 ` [PATCH 04/11] vinyl: don't start quota timer until local recovery is complete Vladimir Davydov
` (8 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Vladimir Davydov @ 2018-09-20 9:34 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
Currently, vy_quota_use() calls quota_exceeded_cb callback every time it
sees that the memory usage exceeds the watermark. Actually, this is
pointless, because the callback will return immediately if dump is
already in progress. Let's introduce flag vy_quota::dump_in_progress and
set it when dump is triggered so that we can skip callback invocation if
the flag is already set. The flag is cleared by vy_quota_dump(), which
is called when dump is complete. Since the callback is now used solely
to trigger dump, let's rename it to trigger_dump_cb.
---
src/box/vinyl.c | 9 +++++----
src/box/vy_quota.c | 46 ++++++++++++++++++++++++++++++++++------------
src/box/vy_quota.h | 24 ++++++++++++++++--------
3 files changed, 55 insertions(+), 24 deletions(-)
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 6cc5485b..d0262a03 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -2436,8 +2436,8 @@ vinyl_engine_rollback_statement(struct engine *engine, struct txn *txn,
/** {{{ Environment */
-static void
-vy_env_quota_exceeded_cb(struct vy_quota *quota)
+static int
+vy_env_trigger_dump_cb(struct vy_quota *quota)
{
struct vy_env *env = container_of(quota, struct vy_env, quota);
@@ -2448,9 +2448,10 @@ vy_env_quota_exceeded_cb(struct vy_quota *quota)
* quota has been consumed by pending transactions.
* There's nothing we can do about that.
*/
- return;
+ return -1;
}
vy_scheduler_trigger_dump(&env->scheduler);
+ return 0;
}
static void
@@ -2519,7 +2520,7 @@ vy_env_new(const char *path, size_t memory,
vy_squash_schedule, e) != 0)
goto error_lsm_env;
- if (vy_quota_create(&e->quota, vy_env_quota_exceeded_cb) != 0)
+ if (vy_quota_create(&e->quota, vy_env_trigger_dump_cb) != 0)
goto error_quota;
struct slab_cache *slab_cache = cord_slab_cache();
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 8ead5c4b..852f381b 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -68,6 +68,29 @@ static const size_t VY_DEFAULT_DUMP_BANDWIDTH = 10 * 1024 * 1024;
*/
enum { VY_DUMP_BANDWIDTH_PCT = 10 };
+/**
+ * Trigger memory dump unless it is already in progress.
+ */
+static void
+vy_quota_trigger_dump(struct vy_quota *q)
+{
+ if (q->dump_in_progress)
+ return;
+ if (q->trigger_dump_cb(q) != 0)
+ return;
+ q->dump_in_progress = true;
+}
+
+/**
+ * Trigger memory dump if usage is above the watermark.
+ */
+static void
+vy_quota_check_watermark(struct vy_quota *q)
+{
+ if (q->used >= q->watermark)
+ vy_quota_trigger_dump(q);
+}
+
static void
vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
{
@@ -98,13 +121,14 @@ vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
*/
q->watermark = ((double)q->limit * q->dump_bw /
(q->dump_bw + q->use_rate + 1));
- if (q->used >= q->watermark)
- q->quota_exceeded_cb(q);
+ vy_quota_check_watermark(q);
}
int
-vy_quota_create(struct vy_quota *q, vy_quota_exceeded_f quota_exceeded_cb)
+vy_quota_create(struct vy_quota *q, vy_quota_trigger_dump_f trigger_dump_cb)
{
+ memset(q, 0, sizeof(*q));
+
enum { KB = 1024, MB = KB * KB };
static int64_t dump_bandwidth_buckets[] = {
100 * KB, 200 * KB, 300 * KB, 400 * KB, 500 * KB, 600 * KB,
@@ -127,12 +151,9 @@ vy_quota_create(struct vy_quota *q, vy_quota_exceeded_f quota_exceeded_cb)
q->limit = SIZE_MAX;
q->watermark = SIZE_MAX;
- q->used = 0;
- q->use_curr = 0;
- q->use_rate = 0;
q->too_long_threshold = TIMEOUT_INFINITY;
q->dump_bw = VY_DEFAULT_DUMP_BANDWIDTH;
- q->quota_exceeded_cb = quota_exceeded_cb;
+ q->trigger_dump_cb = trigger_dump_cb;
fiber_cond_create(&q->cond);
ev_timer_init(&q->timer, vy_quota_timer_cb, 0,
VY_QUOTA_UPDATE_INTERVAL);
@@ -154,14 +175,15 @@ void
vy_quota_set_limit(struct vy_quota *q, size_t limit)
{
q->limit = q->watermark = limit;
- if (q->used >= limit)
- q->quota_exceeded_cb(q);
+ vy_quota_check_watermark(q);
fiber_cond_signal(&q->cond);
}
void
vy_quota_dump(struct vy_quota *q, size_t size, double duration)
{
+ q->dump_in_progress = false;
+
/*
* Release quota and wake up the first fiber in
* the wait queue, if any.
@@ -195,8 +217,7 @@ vy_quota_force_use(struct vy_quota *q, size_t size)
{
q->used += size;
q->use_curr += size;
- if (q->used >= q->watermark)
- q->quota_exceeded_cb(q);
+ vy_quota_check_watermark(q);
}
/**
@@ -218,7 +239,8 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
double deadline = start_time + timeout;
do {
- q->quota_exceeded_cb(q);
+ if (q->used + size > q->limit)
+ vy_quota_trigger_dump(q);
if (fiber_cond_wait_deadline(&q->cond, deadline) != 0)
return -1; /* timed out */
} while (!vy_quota_may_use(q, size));
diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h
index 0fa852b1..3b020829 100644
--- a/src/box/vy_quota.h
+++ b/src/box/vy_quota.h
@@ -31,6 +31,7 @@
* SUCH DAMAGE.
*/
+#include <stdbool.h>
#include <stddef.h>
#include <tarantool_ev.h>
#include "fiber_cond.h"
@@ -42,8 +43,8 @@ extern "C" {
struct vy_quota;
struct histogram;
-typedef void
-(*vy_quota_exceeded_f)(struct vy_quota *quota);
+typedef int
+(*vy_quota_trigger_dump_f)(struct vy_quota *quota);
/**
* Quota used for accounting and limiting memory consumption
@@ -73,11 +74,6 @@ struct vy_quota {
* there is no quota left.
*/
struct fiber_cond cond;
- /**
- * Called when quota is consumed if used >= watermark.
- * It is supposed to trigger memory reclaim.
- */
- vy_quota_exceeded_f quota_exceeded_cb;
/** Timer for updating quota watermark. */
ev_timer timer;
/**
@@ -91,6 +87,18 @@ struct vy_quota {
* moving average of use_curr.
*/
size_t use_rate;
+ /**
+ * Called when quota is consumed if used >= watermark.
+ * It is supposed to trigger memory dump and return 0
+ * on success or -1 on failure.
+ */
+ vy_quota_trigger_dump_f trigger_dump_cb;
+ /**
+ * Set if the last triggered memory dump hasn't completed
+ * yet, i.e. trigger_dump_cb() was successfully invoked,
+ * but quota hasn't been released yet.
+ */
+ bool dump_in_progress;
/** Current dump bandwidth estimate. */
size_t dump_bw;
/**
@@ -108,7 +116,7 @@ struct vy_quota {
};
int
-vy_quota_create(struct vy_quota *q, vy_quota_exceeded_f quota_exceeded_cb);
+vy_quota_create(struct vy_quota *q, vy_quota_trigger_dump_f trigger_dump_cb);
void
vy_quota_destroy(struct vy_quota *q);
--
2.11.0
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 04/11] vinyl: don't start quota timer until local recovery is complete
2018-09-20 9:34 [PATCH 00/11] vinyl: prepare for transaction throttling Vladimir Davydov
` (2 preceding siblings ...)
2018-09-20 9:34 ` [PATCH 03/11] vinyl: do not try to trigger dump if it is already in progress Vladimir Davydov
@ 2018-09-20 9:34 ` Vladimir Davydov
2018-09-20 9:34 ` [PATCH 05/11] vinyl: add helper to start scheduler and enable quota on startup Vladimir Davydov
` (7 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Vladimir Davydov @ 2018-09-20 9:34 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
Although we don't impose the limit during local recovery, the quota
timer, which keeps track of the write rate and recalculates the
watermark, is running. This is incorrect, because the write rate can
rocket sky high during recovery so that we can wind up with the
watermark set too low once recovery is complete. When we introduce
transaction throttling, the timer will also set a throttle limit based
on the write rate, which is pointless during local recovery too.
That said, we shouldn't start the quota timer until local recovery is
complete. To do that, let's introduce new function vy_quota_enable()
that enables the limit and starts the timer. The function is called
instead of vy_quota_set_limit() upon bootstrap and recovery completion.
---
src/box/vinyl.c | 8 ++++----
src/box/vy_quota.c | 25 ++++++++++++++++++++-----
src/box/vy_quota.h | 25 ++++++++++++++++++++++++-
3 files changed, 48 insertions(+), 10 deletions(-)
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index d0262a03..e1b35f3a 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -2520,7 +2520,7 @@ vy_env_new(const char *path, size_t memory,
vy_squash_schedule, e) != 0)
goto error_lsm_env;
- if (vy_quota_create(&e->quota, vy_env_trigger_dump_cb) != 0)
+ if (vy_quota_create(&e->quota, e->memory, vy_env_trigger_dump_cb) != 0)
goto error_quota;
struct slab_cache *slab_cache = cord_slab_cache();
@@ -2723,7 +2723,7 @@ vinyl_engine_bootstrap(struct engine *engine)
if (vy_log_bootstrap() != 0)
return -1;
vy_scheduler_start(&e->scheduler);
- vy_quota_set_limit(&e->quota, e->memory);
+ vy_quota_enable(&e->quota);
e->status = VINYL_ONLINE;
return 0;
}
@@ -2758,7 +2758,7 @@ vinyl_engine_begin_initial_recovery(struct engine *engine,
if (vy_log_bootstrap() != 0)
return -1;
vy_scheduler_start(&e->scheduler);
- vy_quota_set_limit(&e->quota, e->memory);
+ vy_quota_enable(&e->quota);
e->status = VINYL_INITIAL_RECOVERY_REMOTE;
}
return 0;
@@ -2802,7 +2802,7 @@ vinyl_engine_end_recovery(struct engine *engine)
e->recovery = NULL;
e->recovery_vclock = NULL;
vy_scheduler_start(&e->scheduler);
- vy_quota_set_limit(&e->quota, e->memory);
+ vy_quota_enable(&e->quota);
break;
case VINYL_FINAL_RECOVERY_REMOTE:
break;
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 852f381b..7cd64474 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -74,6 +74,7 @@ enum { VY_DUMP_BANDWIDTH_PCT = 10 };
static void
vy_quota_trigger_dump(struct vy_quota *q)
{
+ assert(q->is_enabled);
if (q->dump_in_progress)
return;
if (q->trigger_dump_cb(q) != 0)
@@ -87,7 +88,7 @@ vy_quota_trigger_dump(struct vy_quota *q)
static void
vy_quota_check_watermark(struct vy_quota *q)
{
- if (q->used >= q->watermark)
+ if (q->is_enabled && q->used >= q->watermark)
vy_quota_trigger_dump(q);
}
@@ -99,6 +100,8 @@ vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
struct vy_quota *q = timer->data;
+ assert(q->is_enabled);
+
/*
* Update the quota use rate with the new measurement.
*/
@@ -125,7 +128,8 @@ vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
}
int
-vy_quota_create(struct vy_quota *q, vy_quota_trigger_dump_f trigger_dump_cb)
+vy_quota_create(struct vy_quota *q, size_t limit,
+ vy_quota_trigger_dump_f trigger_dump_cb)
{
memset(q, 0, sizeof(*q));
@@ -149,8 +153,8 @@ vy_quota_create(struct vy_quota *q, vy_quota_trigger_dump_f trigger_dump_cb)
return -1;
}
- q->limit = SIZE_MAX;
- q->watermark = SIZE_MAX;
+ q->limit = limit;
+ q->watermark = limit;
q->too_long_threshold = TIMEOUT_INFINITY;
q->dump_bw = VY_DEFAULT_DUMP_BANDWIDTH;
q->trigger_dump_cb = trigger_dump_cb;
@@ -158,11 +162,20 @@ vy_quota_create(struct vy_quota *q, vy_quota_trigger_dump_f trigger_dump_cb)
ev_timer_init(&q->timer, vy_quota_timer_cb, 0,
VY_QUOTA_UPDATE_INTERVAL);
q->timer.data = q;
- ev_timer_start(loop(), &q->timer);
return 0;
}
void
+vy_quota_enable(struct vy_quota *q)
+{
+ assert(!q->is_enabled);
+ q->is_enabled = true;
+ q->use_curr = 0;
+ ev_timer_start(loop(), &q->timer);
+ vy_quota_check_watermark(q);
+}
+
+void
vy_quota_destroy(struct vy_quota *q)
{
ev_timer_stop(loop(), &q->timer);
@@ -227,6 +240,8 @@ vy_quota_force_use(struct vy_quota *q, size_t size)
static bool
vy_quota_may_use(struct vy_quota *q, size_t size)
{
+ if (!q->is_enabled)
+ return true;
return q->used + size <= q->limit;
}
diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h
index 3b020829..793a4430 100644
--- a/src/box/vy_quota.h
+++ b/src/box/vy_quota.h
@@ -51,6 +51,8 @@ typedef int
* in the vinyl engine. It is NOT multi-threading safe.
*/
struct vy_quota {
+ /** Set when the quota is enabled. */
+ bool is_enabled;
/**
* Memory limit. Once hit, new transactions are
* throttled until memory is reclaimed.
@@ -115,9 +117,30 @@ struct vy_quota {
struct histogram *dump_bw_hist;
};
+/**
+ * Initialize a quota object.
+ *
+ * @limit: max allowed memory usage.
+ * @trigger_dump_cb: callback invoked to trigger memory dump.
+ *
+ * Returns 0 on success, -1 on memory allocation error.
+ *
+ * Note, the limit won't be imposed until vy_quota_enable()
+ * is called.
+ */
int
-vy_quota_create(struct vy_quota *q, vy_quota_trigger_dump_f trigger_dump_cb);
+vy_quota_create(struct vy_quota *q, size_t limit,
+ vy_quota_trigger_dump_f trigger_dump_cb);
+/**
+ * Enable the configured limit for a quota object.
+ */
+void
+vy_quota_enable(struct vy_quota *q);
+
+/**
+ * Destroy a quota object.
+ */
void
vy_quota_destroy(struct vy_quota *q);
--
2.11.0
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 05/11] vinyl: add helper to start scheduler and enable quota on startup
2018-09-20 9:34 [PATCH 00/11] vinyl: prepare for transaction throttling Vladimir Davydov
` (3 preceding siblings ...)
2018-09-20 9:34 ` [PATCH 04/11] vinyl: don't start quota timer until local recovery is complete Vladimir Davydov
@ 2018-09-20 9:34 ` Vladimir Davydov
2018-09-25 23:22 ` [tarantool-patches] " Konstantin Osipov
2018-09-20 9:34 ` [PATCH 06/11] vinyl: zap vy_env::memory, read_threads, and write_threads Vladimir Davydov
` (6 subsequent siblings)
11 siblings, 1 reply; 16+ messages in thread
From: Vladimir Davydov @ 2018-09-20 9:34 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
There are three places where we start the scheduler fiber and enable
the quota limit: local bootstrap, remote bootstrap, and local recovery
completion. I'm planning to add more code there so let's factor it out
now.
---
src/box/vinyl.c | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index e1b35f3a..32143af0 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -2565,6 +2565,13 @@ vy_env_delete(struct vy_env *e)
free(e);
}
+static void
+vy_env_start_scheduler(struct vy_env *e)
+{
+ vy_scheduler_start(&e->scheduler);
+ vy_quota_enable(&e->quota);
+}
+
struct vinyl_engine *
vinyl_engine_new(const char *dir, size_t memory,
int read_threads, int write_threads, bool force_recovery)
@@ -2722,8 +2729,7 @@ vinyl_engine_bootstrap(struct engine *engine)
assert(e->status == VINYL_OFFLINE);
if (vy_log_bootstrap() != 0)
return -1;
- vy_scheduler_start(&e->scheduler);
- vy_quota_enable(&e->quota);
+ vy_env_start_scheduler(e);
e->status = VINYL_ONLINE;
return 0;
}
@@ -2757,8 +2763,7 @@ vinyl_engine_begin_initial_recovery(struct engine *engine,
} else {
if (vy_log_bootstrap() != 0)
return -1;
- vy_scheduler_start(&e->scheduler);
- vy_quota_enable(&e->quota);
+ vy_env_start_scheduler(e);
e->status = VINYL_INITIAL_RECOVERY_REMOTE;
}
return 0;
@@ -2801,8 +2806,7 @@ vinyl_engine_end_recovery(struct engine *engine)
vy_recovery_delete(e->recovery);
e->recovery = NULL;
e->recovery_vclock = NULL;
- vy_scheduler_start(&e->scheduler);
- vy_quota_enable(&e->quota);
+ vy_env_start_scheduler(e);
break;
case VINYL_FINAL_RECOVERY_REMOTE:
break;
--
2.11.0
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 06/11] vinyl: zap vy_env::memory, read_threads, and write_threads
2018-09-20 9:34 [PATCH 00/11] vinyl: prepare for transaction throttling Vladimir Davydov
` (4 preceding siblings ...)
2018-09-20 9:34 ` [PATCH 05/11] vinyl: add helper to start scheduler and enable quota on startup Vladimir Davydov
@ 2018-09-20 9:34 ` Vladimir Davydov
2018-09-25 23:23 ` [tarantool-patches] " Konstantin Osipov
2018-09-20 9:34 ` [PATCH 07/11] vinyl: do not account zero dump bandwidth Vladimir Davydov
` (5 subsequent siblings)
11 siblings, 1 reply; 16+ messages in thread
From: Vladimir Davydov @ 2018-09-20 9:34 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
They are only used to set corresponding members of vy_quota, vy_run_env,
and vy_scheduler when vy_env is created. No point in keeping them around
all the time.
---
src/box/vinyl.c | 22 ++++++----------------
src/box/vy_run.c | 12 ++++++------
src/box/vy_run.h | 17 ++++++++++++-----
test/unit/vy_point_lookup.c | 2 +-
4 files changed, 25 insertions(+), 28 deletions(-)
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 32143af0..4c1e860d 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -132,14 +132,8 @@ struct vy_env {
int64_t join_lsn;
/** Path to the data directory. */
char *path;
- /** Max size of the memory level. */
- size_t memory;
/** Max time a transaction may wait for memory. */
double timeout;
- /** Max number of threads used for reading. */
- int read_threads;
- /** Max number of threads used for writing. */
- int write_threads;
/** Try to recover corrupted data if set. */
bool force_recovery;
};
@@ -759,8 +753,7 @@ vinyl_index_open(struct index *index)
rc = vy_lsm_create(lsm);
if (rc == 0) {
/* Make sure reader threads are up and running. */
- vy_run_env_enable_coio(&env->run_env,
- env->read_threads);
+ vy_run_env_enable_coio(&env->run_env);
}
break;
case VINYL_INITIAL_RECOVERY_REMOTE:
@@ -2491,10 +2484,7 @@ vy_env_new(const char *path, size_t memory,
}
memset(e, 0, sizeof(*e));
e->status = VINYL_OFFLINE;
- e->memory = memory;
e->timeout = TIMEOUT_INFINITY;
- e->read_threads = read_threads;
- e->write_threads = write_threads;
e->force_recovery = force_recovery;
e->path = strdup(path);
if (e->path == NULL) {
@@ -2510,8 +2500,8 @@ vy_env_new(const char *path, size_t memory,
if (e->squash_queue == NULL)
goto error_squash_queue;
- vy_mem_env_create(&e->mem_env, e->memory);
- vy_scheduler_create(&e->scheduler, e->write_threads,
+ vy_mem_env_create(&e->mem_env, memory);
+ vy_scheduler_create(&e->scheduler, write_threads,
vy_env_dump_complete_cb,
&e->run_env, &e->xm->read_views);
@@ -2520,14 +2510,14 @@ vy_env_new(const char *path, size_t memory,
vy_squash_schedule, e) != 0)
goto error_lsm_env;
- if (vy_quota_create(&e->quota, e->memory, vy_env_trigger_dump_cb) != 0)
+ if (vy_quota_create(&e->quota, memory, vy_env_trigger_dump_cb) != 0)
goto error_quota;
struct slab_cache *slab_cache = cord_slab_cache();
mempool_create(&e->iterator_pool, slab_cache,
sizeof(struct vinyl_iterator));
vy_cache_env_create(&e->cache_env, slab_cache);
- vy_run_env_create(&e->run_env);
+ vy_run_env_create(&e->run_env, read_threads);
vy_log_init(e->path);
return e;
error_quota:
@@ -2819,7 +2809,7 @@ vinyl_engine_end_recovery(struct engine *engine)
* creation, see vinyl_index_open().
*/
if (e->lsm_env.lsm_count > 0)
- vy_run_env_enable_coio(&e->run_env, e->read_threads);
+ vy_run_env_enable_coio(&e->run_env);
e->status = VINYL_ONLINE;
return 0;
diff --git a/src/box/vy_run.c b/src/box/vy_run.c
index f107e3a9..975b6349 100644
--- a/src/box/vy_run.c
+++ b/src/box/vy_run.c
@@ -123,12 +123,11 @@ vy_run_reader_f(va_list ap)
/** Start run reader threads. */
static void
-vy_run_env_start_readers(struct vy_run_env *env, int threads)
+vy_run_env_start_readers(struct vy_run_env *env)
{
- assert(threads > 0);
assert(env->reader_pool == NULL);
+ assert(env->reader_pool_size > 0);
- env->reader_pool_size = threads;
env->reader_pool = calloc(env->reader_pool_size,
sizeof(*env->reader_pool));
if (env->reader_pool == NULL)
@@ -166,9 +165,10 @@ vy_run_env_stop_readers(struct vy_run_env *env)
* Initialize vinyl run environment
*/
void
-vy_run_env_create(struct vy_run_env *env)
+vy_run_env_create(struct vy_run_env *env, int read_threads)
{
memset(env, 0, sizeof(*env));
+ env->reader_pool_size = read_threads;
tt_pthread_key_create(&env->zdctx_key, vy_free_zdctx);
mempool_create(&env->read_task_pool, cord_slab_cache(),
sizeof(struct vy_page_read_task));
@@ -190,11 +190,11 @@ vy_run_env_destroy(struct vy_run_env *env)
* Enable coio reads for a vinyl run environment.
*/
void
-vy_run_env_enable_coio(struct vy_run_env *env, int threads)
+vy_run_env_enable_coio(struct vy_run_env *env)
{
if (env->reader_pool != NULL)
return; /* already enabled */
- vy_run_env_start_readers(env, threads);
+ vy_run_env_start_readers(env);
}
/**
diff --git a/src/box/vy_run.h b/src/box/vy_run.h
index 5030886c..5d9865e4 100644
--- a/src/box/vy_run.h
+++ b/src/box/vy_run.h
@@ -281,9 +281,13 @@ struct vy_page {
/**
* Initialize vinyl run environment
+ *
+ * @param read_threads - max number of background threads to
+ * use for disk reads; note background threads are not used
+ * until vy_run_env_enable_coio() is called.
*/
void
-vy_run_env_create(struct vy_run_env *env);
+vy_run_env_create(struct vy_run_env *env, int read_threads);
/**
* Destroy vinyl run environment
@@ -294,14 +298,17 @@ vy_run_env_destroy(struct vy_run_env *env);
/**
* Enable coio reads for a vinyl run environment.
*
- * This function starts @threads reader threads and makes
- * the run iterator hand disk reads over to them rather than
- * read run files directly blocking the current fiber.
+ * This function starts background reader threads and makes
+ * the run iterator hand disk reads over to them rather
+ * than read run files directly blocking the current fiber.
+ *
+ * The number of background reader threads is configured when
+ * the environment is created, see vy_run_env_create().
*
* Subsequent calls to this function will silently return.
*/
void
-vy_run_env_enable_coio(struct vy_run_env *env, int threads);
+vy_run_env_enable_coio(struct vy_run_env *env);
/**
* Return the size of a run bloom filter.
diff --git a/test/unit/vy_point_lookup.c b/test/unit/vy_point_lookup.c
index 86877d7d..dd33bbec 100644
--- a/test/unit/vy_point_lookup.c
+++ b/test/unit/vy_point_lookup.c
@@ -70,7 +70,7 @@ test_basic()
is(rc, 0, "vy_lsm_env_create");
struct vy_run_env run_env;
- vy_run_env_create(&run_env);
+ vy_run_env_create(&run_env, 0);
struct vy_cache_env cache_env;
vy_cache_env_create(&cache_env, slab_cache);
--
2.11.0
^ permalink raw reply [flat|nested] 16+ messages in thread
* [tarantool-patches] Re: [PATCH 06/11] vinyl: zap vy_env::memory, read_threads, and write_threads
2018-09-20 9:34 ` [PATCH 06/11] vinyl: zap vy_env::memory, read_threads, and write_threads Vladimir Davydov
@ 2018-09-25 23:23 ` Konstantin Osipov
0 siblings, 0 replies; 16+ messages in thread
From: Konstantin Osipov @ 2018-09-25 23:23 UTC (permalink / raw)
To: tarantool-patches
* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/20 12:39]:
> They are only used to set corresponding members of vy_quota, vy_run_env,
> and vy_scheduler when vy_env is created. No point in keeping them around
> all the time.
OK to push.
--
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 07/11] vinyl: do not account zero dump bandwidth
2018-09-20 9:34 [PATCH 00/11] vinyl: prepare for transaction throttling Vladimir Davydov
` (5 preceding siblings ...)
2018-09-20 9:34 ` [PATCH 06/11] vinyl: zap vy_env::memory, read_threads, and write_threads Vladimir Davydov
@ 2018-09-20 9:34 ` Vladimir Davydov
2018-09-25 23:24 ` [tarantool-patches] " Konstantin Osipov
2018-09-20 9:34 ` [PATCH 08/11] vinyl: set quota timer period to 100 ms Vladimir Davydov
` (4 subsequent siblings)
11 siblings, 1 reply; 16+ messages in thread
From: Vladimir Davydov @ 2018-09-20 9:34 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
Since we free memory in 16 MB blocks (see SLAB_SIZE), it may occur that
we dump almost all data stored in a block but still have to leave it be,
because it contains data of a newer generation. If the memory limit is
small (as it is typically in tests), this may result in dumping 0 bytes.
In order not to disrupt statistics and throttling transactions in vain,
let's simply ignore such results. Normally, the memory limit should be
large enough for such granularity not to affect the measurements
(hundreds of megabytes) so this problem isn't worth putting more effort
into.
---
src/box/vy_quota.c | 16 ++++++++++++++--
1 file changed, 14 insertions(+), 2 deletions(-)
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 7cd64474..5f8c0618 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -205,8 +205,20 @@ vy_quota_dump(struct vy_quota *q, size_t size, double duration)
q->used -= size;
fiber_cond_signal(&q->cond);
- /* Update dump bandwidth. */
- if (duration > 0) {
+ /*
+ * Update dump bandwidth.
+ *
+ * Note, since we free memory in 16 MB blocks (see SLAB_SIZE),
+ * it may occur that we dump almost all data stored in a block
+ * but still have to leave it be, because it contains data of
+ * a newer generation. If the memory limit is small, this may
+ * result in dumping 0 bytes. In order not to disrupt statistics
+ * let's simply ignore such results. Normally, the memory limit
+ * should be large enough for such granularity not to affect the
+ * measurements (hundreds of megabytes) so this problem isn't
+ * worth putting more efforts into.
+ */
+ if (size > 0 && duration > 0) {
histogram_collect(q->dump_bw_hist, size / duration);
/*
* To avoid unpredictably long stalls, we need to
--
2.11.0
^ permalink raw reply [flat|nested] 16+ messages in thread
* [tarantool-patches] Re: [PATCH 07/11] vinyl: do not account zero dump bandwidth
2018-09-20 9:34 ` [PATCH 07/11] vinyl: do not account zero dump bandwidth Vladimir Davydov
@ 2018-09-25 23:24 ` Konstantin Osipov
0 siblings, 0 replies; 16+ messages in thread
From: Konstantin Osipov @ 2018-09-25 23:24 UTC (permalink / raw)
To: tarantool-patches
* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/20 12:39]:
> Since we free memory in 16 MB blocks (see SLAB_SIZE), it may occur that
> we dump almost all data stored in a block but still have to leave it be,
> because it contains data of a newer generation. If the memory limit is
> small (as it is typically in tests), this may result in dumping 0 bytes.
> In order not to disrupt statistics and throttling transactions in vain,
> let's simply ignore such results. Normally, the memory limit should be
> large enough for such granularity not to affect the measurements
> (hundreds of megabytes) so this problem isn't worth putting more efforts
> into.
OK to push, obviously, except you have no place to apply this
patch since I haven't approved the set of patches it is based on.
--
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 08/11] vinyl: set quota timer period to 100 ms
2018-09-20 9:34 [PATCH 00/11] vinyl: prepare for transaction throttling Vladimir Davydov
` (6 preceding siblings ...)
2018-09-20 9:34 ` [PATCH 07/11] vinyl: do not account zero dump bandwidth Vladimir Davydov
@ 2018-09-20 9:34 ` Vladimir Davydov
2018-09-20 9:34 ` [PATCH 09/11] vinyl: implement basic transaction throttling Vladimir Davydov
` (3 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Vladimir Davydov @ 2018-09-20 9:34 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
Currently, it's 1 second, which is OK for calculating the watermark, but
too long for throttling (think of the latency of 1 second that would be
introduced by throttling if such a timeout were used).
---
src/box/vy_quota.c | 25 ++++++++++++-------------
1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 5f8c0618..def05aa2 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -43,18 +43,17 @@
#include "histogram.h"
#include "trivia/util.h"
-enum {
- /**
- * Time interval between successive updates of
- * quota watermark and use rate, in seconds.
- */
- VY_QUOTA_UPDATE_INTERVAL = 1,
- /**
- * Period of time over which the quota use rate
- * is averaged, in seconds.
- */
- VY_QUOTA_RATE_AVG_PERIOD = 5,
-};
+/**
+ * Time interval between successive updates of quota watermark
+ * and use rate, in seconds.
+ */
+static const double VY_QUOTA_UPDATE_INTERVAL = 0.1;
+
+/**
+ * Period of time over which the quota use rate is averaged,
+ * in seconds.
+ */
+static const double VY_QUOTA_RATE_AVG_PERIOD = 5;
/*
* Until we dump anything, assume bandwidth to be 10 MB/s,
@@ -106,7 +105,7 @@ vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
* Update the quota use rate with the new measurement.
*/
const double weight = 1 - exp(-VY_QUOTA_UPDATE_INTERVAL /
- (double)VY_QUOTA_RATE_AVG_PERIOD);
+ VY_QUOTA_RATE_AVG_PERIOD);
q->use_rate = (1 - weight) * q->use_rate +
weight * q->use_curr / VY_QUOTA_UPDATE_INTERVAL;
q->use_curr = 0;
--
2.11.0
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 09/11] vinyl: implement basic transaction throttling
2018-09-20 9:34 [PATCH 00/11] vinyl: prepare for transaction throttling Vladimir Davydov
` (7 preceding siblings ...)
2018-09-20 9:34 ` [PATCH 08/11] vinyl: set quota timer period to 100 ms Vladimir Davydov
@ 2018-09-20 9:34 ` Vladimir Davydov
2018-09-20 9:34 ` [PATCH 10/11] vinyl: implement quota wait queue without fiber_cond Vladimir Davydov
` (2 subsequent siblings)
11 siblings, 0 replies; 16+ messages in thread
From: Vladimir Davydov @ 2018-09-20 9:34 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
If the rate at which transactions are ready to write to the database is
greater than the dump bandwidth, memory will get depleted before the
last dump is complete and all newer transactions will have to wait,
which may take seconds or even minutes:
W> waited for 555 bytes of vinyl memory quota for too long: 15.750 sec
This patch implements basic transaction throttling that is supposed
to help avoid unpredictably long stalls. Now the transaction write rate
is always limited by the observed dump bandwidth, because it doesn't make
sense to consume memory at a greater rate than it can be freed. On top of
that, when a dump begins, we estimate the amount of time it is going to
take and limit the transaction write rate appropriately.
Note, this patch doesn't take into account compaction when setting the
rate limit so compaction threads may still fail to keep up with dumps,
increasing read amplification. It will be addressed later.
Part of #1862
---
src/box/vy_quota.c | 76 +++++++++++++++++++++++++++++++-
src/box/vy_quota.h | 21 ++++++++-
test/vinyl/suite.ini | 2 +-
test/vinyl/throttle.result | 102 +++++++++++++++++++++++++++++++++++++++++++
test/vinyl/throttle.test.lua | 54 +++++++++++++++++++++++
5 files changed, 252 insertions(+), 3 deletions(-)
create mode 100644 test/vinyl/throttle.result
create mode 100644 test/vinyl/throttle.test.lua
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index def05aa2..18cf4f35 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -67,6 +67,40 @@ static const size_t VY_DEFAULT_DUMP_BANDWIDTH = 10 * 1024 * 1024;
*/
enum { VY_DUMP_BANDWIDTH_PCT = 10 };
+static inline void
+vy_quota_rl_set(struct vy_quota_rl *rl, size_t rate)
+{
+ /* Sanity check: never limit rate below 100 KB/s. */
+ rl->rate = MAX(rate, 100U * 1024);
+}
+
+static inline bool
+vy_quota_rl_may_use(struct vy_quota_rl *rl)
+{
+ return rl->value > 0;
+}
+
+static inline void
+vy_quota_rl_use(struct vy_quota_rl *rl, size_t size)
+{
+ rl->value -= size;
+}
+
+static inline void
+vy_quota_rl_unuse(struct vy_quota_rl *rl, size_t size)
+{
+ rl->value += size;
+}
+
+static inline void
+vy_quota_rl_refill(struct vy_quota_rl *rl)
+{
+ ssize_t size = rl->rate * VY_QUOTA_UPDATE_INTERVAL;
+ rl->value += size;
+ /* Allow bursts up to 2x rate. */
+ rl->value = MIN(rl->value, size * 2);
+}
+
/**
* Trigger memory dump unless it is already in progress.
*/
@@ -78,6 +112,22 @@ vy_quota_trigger_dump(struct vy_quota *q)
return;
if (q->trigger_dump_cb(q) != 0)
return;
+
+ /*
+ * To avoid unpredictably long stalls, we must limit
+ * the write rate when a dump is in progress so that
+ * we don't hit the hard limit before the dump has
+ * completed, i.e.
+ *
+ * quota_left dump_size
+ * ---------- >= --------------
+ * use_rate dump_bandwidth
+ */
+ size_t quota_left = q->used < q->limit ? q->limit - q->used : 0;
+ size_t max_use_rate = quota_left * q->dump_bw / (q->used + 1);
+ if (q->rl.rate > max_use_rate)
+ vy_quota_rl_set(&q->rl, max_use_rate);
+
q->dump_in_progress = true;
}
@@ -124,6 +174,12 @@ vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
q->watermark = ((double)q->limit * q->dump_bw /
(q->dump_bw + q->use_rate + 1));
vy_quota_check_watermark(q);
+
+ /*
+ * Replenish quota and wake up throttled fibers, if any.
+ */
+ vy_quota_rl_refill(&q->rl);
+ fiber_cond_signal(&q->cond);
}
int
@@ -170,6 +226,8 @@ vy_quota_enable(struct vy_quota *q)
assert(!q->is_enabled);
q->is_enabled = true;
q->use_curr = 0;
+ vy_quota_rl_set(&q->rl, q->dump_bw);
+ vy_quota_rl_refill(&q->rl);
ev_timer_start(loop(), &q->timer);
vy_quota_check_watermark(q);
}
@@ -227,6 +285,16 @@ vy_quota_dump(struct vy_quota *q, size_t size, double duration)
q->dump_bw = histogram_percentile_lower(q->dump_bw_hist,
VY_DUMP_BANDWIDTH_PCT);
}
+
+ /*
+ * Reset the rate limit.
+ *
+ * It doesn't make sense to allow to consume memory at
+ * a higher rate than it can be dumped so we set the rate
+ * limit to the dump bandwidth rather than disabling it
+ * completely.
+ */
+ vy_quota_rl_set(&q->rl, q->dump_bw);
}
void
@@ -241,6 +309,7 @@ vy_quota_force_use(struct vy_quota *q, size_t size)
{
q->used += size;
q->use_curr += size;
+ vy_quota_rl_use(&q->rl, size);
vy_quota_check_watermark(q);
}
@@ -253,7 +322,11 @@ vy_quota_may_use(struct vy_quota *q, size_t size)
{
if (!q->is_enabled)
return true;
- return q->used + size <= q->limit;
+ if (q->used + size > q->limit)
+ return false;
+ if (!vy_quota_rl_may_use(&q->rl))
+ return false;
+ return true;
}
int
@@ -293,6 +366,7 @@ vy_quota_unuse(struct vy_quota *q, size_t size)
q->used -= size;
/* use_curr could have been reset on timeout. */
q->use_curr -= MIN(q->use_curr, size);
+ vy_quota_rl_unuse(&q->rl, size);
fiber_cond_signal(&q->cond);
}
diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h
index 793a4430..3070ff82 100644
--- a/src/box/vy_quota.h
+++ b/src/box/vy_quota.h
@@ -47,6 +47,20 @@ typedef int
(*vy_quota_trigger_dump_f)(struct vy_quota *quota);
/**
+ * Rate limiter state.
+ */
+struct vy_quota_rl {
+ /** Max allowed use rate, in bytes per second. */
+ size_t rate;
+ /**
+ * Amount of quota that can be used by transactions.
+ * Refilled by the timer callback in accordance with
+ * the configured rate.
+ */
+ ssize_t value;
+};
+
+/**
* Quota used for accounting and limiting memory consumption
* in the vinyl engine. It is NOT multi-threading safe.
*/
@@ -76,7 +90,10 @@ struct vy_quota {
* there is no quota left.
*/
struct fiber_cond cond;
- /** Timer for updating quota watermark. */
+ /**
+ * Timer used for updating average use rate, calculating
+ * quota watermark and refilling rate limit value.
+ */
ev_timer timer;
/**
* Amount of quota used since the last
@@ -89,6 +106,8 @@ struct vy_quota {
* moving average of use_curr.
*/
size_t use_rate;
+ /** Rate limiter. */
+ struct vy_quota_rl rl;
/**
* Called when quota is consumed if used >= watermark.
* It is supposed to trigger memory dump and return 0
diff --git a/test/vinyl/suite.ini b/test/vinyl/suite.ini
index b9dae380..785bc63d 100644
--- a/test/vinyl/suite.ini
+++ b/test/vinyl/suite.ini
@@ -6,5 +6,5 @@ release_disabled = errinj.test.lua errinj_gc.test.lua errinj_vylog.test.lua part
config = suite.cfg
lua_libs = suite.lua stress.lua large.lua txn_proxy.lua ../box/lua/utils.lua
use_unix_sockets = True
-long_run = stress.test.lua large.test.lua write_iterator_rand.test.lua dump_stress.test.lua select_consistency.test.lua
+long_run = stress.test.lua large.test.lua write_iterator_rand.test.lua dump_stress.test.lua select_consistency.test.lua throttle.test.lua
is_parallel = False
diff --git a/test/vinyl/throttle.result b/test/vinyl/throttle.result
new file mode 100644
index 00000000..7e84e496
--- /dev/null
+++ b/test/vinyl/throttle.result
@@ -0,0 +1,102 @@
+--
+-- Basic test for transaction throttling.
+--
+-- It checks that write transactions aren't stalled for long
+-- due to hitting the memory limit, but instead are throttled
+-- in advance.
+--
+test_run = require('test_run').new()
+---
+...
+test_run:cmd("create server test with script='vinyl/low_quota.lua'")
+---
+- true
+...
+test_run:cmd(string.format("start server test with args='%d'", 32 * 1024 * 1024))
+---
+- true
+...
+test_run:cmd('switch test')
+---
+- true
+...
+fiber = require('fiber')
+---
+...
+digest = require('digest')
+---
+...
+box.cfg{snap_io_rate_limit = 4}
+---
+...
+FIBER_COUNT = 5
+---
+...
+TUPLE_SIZE = 1000
+---
+...
+TX_TUPLE_COUNT = 10
+---
+...
+TX_SIZE = TUPLE_SIZE * TX_TUPLE_COUNT
+---
+...
+TX_COUNT = math.ceil(box.cfg.vinyl_memory / (TX_SIZE * FIBER_COUNT))
+---
+...
+s = box.schema.space.create('test', {engine = 'vinyl'})
+---
+...
+_ = s:create_index('primary', {parts = {1, 'unsigned', 2, 'unsigned', 3, 'unsigned'}})
+---
+...
+latency = 0
+---
+...
+c = fiber.channel(FIBER_COUNT)
+---
+...
+test_run:cmd("setopt delimiter ';'")
+---
+- true
+...
+for i = 1, FIBER_COUNT do
+ fiber.create(function()
+ for j = 1, TX_COUNT do
+ local t1 = fiber.time()
+ box.begin()
+ for k = 1, TX_TUPLE_COUNT do
+ s:replace{i, j, k, digest.urandom(TUPLE_SIZE)}
+ end
+ box.commit()
+ local t2 = fiber.time()
+ latency = math.max(latency, t2 - t1)
+ end
+ c:put(true)
+ end)
+end;
+---
+...
+test_run:cmd("setopt delimiter ''");
+---
+- true
+...
+for i = 1, FIBER_COUNT do c:get() end
+---
+...
+latency < 0.2 or latency
+---
+- true
+...
+test_run:cmd('switch default')
+---
+- true
+...
+test_run:cmd("stop server test")
+---
+- true
+...
+test_run:cmd("cleanup server test")
+---
+- true
+...
diff --git a/test/vinyl/throttle.test.lua b/test/vinyl/throttle.test.lua
new file mode 100644
index 00000000..130c8fcd
--- /dev/null
+++ b/test/vinyl/throttle.test.lua
@@ -0,0 +1,54 @@
+--
+-- Basic test for transaction throttling.
+--
+-- It checks that write transactions aren't stalled for long
+-- due to hitting the memory limit, but instead are throttled
+-- in advance.
+--
+test_run = require('test_run').new()
+test_run:cmd("create server test with script='vinyl/low_quota.lua'")
+test_run:cmd(string.format("start server test with args='%d'", 32 * 1024 * 1024))
+test_run:cmd('switch test')
+
+fiber = require('fiber')
+digest = require('digest')
+
+box.cfg{snap_io_rate_limit = 4}
+
+FIBER_COUNT = 5
+TUPLE_SIZE = 1000
+TX_TUPLE_COUNT = 10
+TX_SIZE = TUPLE_SIZE * TX_TUPLE_COUNT
+TX_COUNT = math.ceil(box.cfg.vinyl_memory / (TX_SIZE * FIBER_COUNT))
+
+s = box.schema.space.create('test', {engine = 'vinyl'})
+_ = s:create_index('primary', {parts = {1, 'unsigned', 2, 'unsigned', 3, 'unsigned'}})
+
+latency = 0
+c = fiber.channel(FIBER_COUNT)
+
+test_run:cmd("setopt delimiter ';'")
+for i = 1, FIBER_COUNT do
+ fiber.create(function()
+ for j = 1, TX_COUNT do
+ local t1 = fiber.time()
+ box.begin()
+ for k = 1, TX_TUPLE_COUNT do
+ s:replace{i, j, k, digest.urandom(TUPLE_SIZE)}
+ end
+ box.commit()
+ local t2 = fiber.time()
+ latency = math.max(latency, t2 - t1)
+ end
+ c:put(true)
+ end)
+end;
+test_run:cmd("setopt delimiter ''");
+
+for i = 1, FIBER_COUNT do c:get() end
+
+latency < 0.2 or latency
+
+test_run:cmd('switch default')
+test_run:cmd("stop server test")
+test_run:cmd("cleanup server test")
--
2.11.0
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 10/11] vinyl: implement quota wait queue without fiber_cond
2018-09-20 9:34 [PATCH 00/11] vinyl: prepare for transaction throttling Vladimir Davydov
` (8 preceding siblings ...)
2018-09-20 9:34 ` [PATCH 09/11] vinyl: implement basic transaction throttling Vladimir Davydov
@ 2018-09-20 9:34 ` Vladimir Davydov
2018-09-20 9:34 ` [PATCH 11/11] vinyl: split quota consumption rate limit into soft and hard Vladimir Davydov
2018-09-25 23:19 ` [tarantool-patches] Re: [PATCH 00/11] vinyl: prepare for transaction throttling Konstantin Osipov
11 siblings, 0 replies; 16+ messages in thread
From: Vladimir Davydov @ 2018-09-20 9:34 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
In the next patch we will need to split the queue of fibers waiting for
quota in two: one for high priority consumers and the other for low
priority ones. To balance wake-ups between the two queues, we'll need to
attach some additional info (a timestamp) to fibers waiting on the queue.
This is impossible to do with fiber_cond, so let's rewrite the quota wait
queue using plain rlist and fiber_yield.
---
src/box/vy_quota.c | 47 ++++++++++++++++++++++++++++++++++++-----------
src/box/vy_quota.h | 17 +++++++++++++----
2 files changed, 49 insertions(+), 15 deletions(-)
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 18cf4f35..3374aacc 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -34,11 +34,11 @@
#include <stddef.h>
#include <stdint.h>
#include <math.h>
+#include <small/rlist.h>
#include <tarantool_ev.h>
#include "diag.h"
#include "fiber.h"
-#include "fiber_cond.h"
#include "say.h"
#include "histogram.h"
#include "trivia/util.h"
@@ -141,6 +141,21 @@ vy_quota_check_watermark(struct vy_quota *q)
vy_quota_trigger_dump(q);
}
+/**
+ * Wake up the next consumer in the line waiting for quota.
+ */
+static void
+vy_quota_signal(struct vy_quota *q)
+{
+ if (!rlist_empty(&q->wait_queue)) {
+ struct vy_quota_wait_node *n;
+ n = rlist_first_entry(&q->wait_queue,
+ struct vy_quota_wait_node,
+ in_wait_queue);
+ fiber_wakeup(n->fiber);
+ }
+}
+
static void
vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
{
@@ -179,7 +194,7 @@ vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
* Replenish quota and wake up throttled fibers, if any.
*/
vy_quota_rl_refill(&q->rl);
- fiber_cond_signal(&q->cond);
+ vy_quota_signal(q);
}
int
@@ -213,7 +228,7 @@ vy_quota_create(struct vy_quota *q, size_t limit,
q->too_long_threshold = TIMEOUT_INFINITY;
q->dump_bw = VY_DEFAULT_DUMP_BANDWIDTH;
q->trigger_dump_cb = trigger_dump_cb;
- fiber_cond_create(&q->cond);
+ rlist_create(&q->wait_queue);
ev_timer_init(&q->timer, vy_quota_timer_cb, 0,
VY_QUOTA_UPDATE_INTERVAL);
q->timer.data = q;
@@ -237,8 +252,6 @@ vy_quota_destroy(struct vy_quota *q)
{
ev_timer_stop(loop(), &q->timer);
histogram_delete(q->dump_bw_hist);
- fiber_cond_broadcast(&q->cond);
- fiber_cond_destroy(&q->cond);
}
void
@@ -246,7 +259,7 @@ vy_quota_set_limit(struct vy_quota *q, size_t limit)
{
q->limit = q->watermark = limit;
vy_quota_check_watermark(q);
- fiber_cond_signal(&q->cond);
+ vy_quota_signal(q);
}
void
@@ -260,7 +273,7 @@ vy_quota_dump(struct vy_quota *q, size_t size, double duration)
*/
assert(q->used >= size);
q->used -= size;
- fiber_cond_signal(&q->cond);
+ vy_quota_signal(q);
/*
* Update dump bandwidth.
@@ -340,8 +353,20 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
do {
if (q->used + size > q->limit)
vy_quota_trigger_dump(q);
- if (fiber_cond_wait_deadline(&q->cond, deadline) != 0)
- return -1; /* timed out */
+
+ double now = ev_monotonic_now(loop());
+ if (now >= deadline) {
+ diag_set(TimedOut);
+ return -1;
+ }
+
+ struct vy_quota_wait_node node = {
+ .fiber = fiber(),
+ };
+ rlist_add_tail_entry(&q->wait_queue,
+ &node, in_wait_queue);
+ fiber_yield_timeout(deadline - now);
+ rlist_del_entry(&node, in_wait_queue);
} while (!vy_quota_may_use(q, size));
double wait_time = ev_monotonic_now(loop()) - start_time;
@@ -353,7 +378,7 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
* Wake up the next fiber in the line waiting
* for quota.
*/
- fiber_cond_signal(&q->cond);
+ vy_quota_signal(q);
}
vy_quota_force_use(q, size);
return 0;
@@ -367,7 +392,7 @@ vy_quota_unuse(struct vy_quota *q, size_t size)
/* use_curr could have been reset on timeout. */
q->use_curr -= MIN(q->use_curr, size);
vy_quota_rl_unuse(&q->rl, size);
- fiber_cond_signal(&q->cond);
+ vy_quota_signal(q);
}
void
diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h
index 3070ff82..f53bb8a9 100644
--- a/src/box/vy_quota.h
+++ b/src/box/vy_quota.h
@@ -33,8 +33,8 @@
#include <stdbool.h>
#include <stddef.h>
+#include <small/rlist.h>
#include <tarantool_ev.h>
-#include "fiber_cond.h"
#if defined(__cplusplus)
extern "C" {
@@ -42,6 +42,7 @@ extern "C" {
struct vy_quota;
struct histogram;
+struct fiber;
typedef int
(*vy_quota_trigger_dump_f)(struct vy_quota *quota);
@@ -60,6 +61,13 @@ struct vy_quota_rl {
ssize_t value;
};
+struct vy_quota_wait_node {
+ /** Link in vy_quota::wait_queue. */
+ struct rlist in_wait_queue;
+ /** Fiber waiting for quota. */
+ struct fiber *fiber;
+};
+
/**
* Quota used for accounting and limiting memory consumption
* in the vinyl engine. It is NOT multi-threading safe.
@@ -86,10 +94,11 @@ struct vy_quota {
*/
double too_long_threshold;
/**
- * Condition variable used for throttling consumers when
- * there is no quota left.
+ * Queue of consumers waiting for quota, linked by
+ * vy_quota_wait_node::in_wait_queue. Newcomers are
+ * added to the tail.
*/
- struct fiber_cond cond;
+ struct rlist wait_queue;
/**
* Timer used for updating average use rate, calculating
* quota watermark and refilling rate limit value.
--
2.11.0
^ permalink raw reply [flat|nested] 16+ messages in thread
* [PATCH 11/11] vinyl: split quota consumption rate limit into soft and hard
2018-09-20 9:34 [PATCH 00/11] vinyl: prepare for transaction throttling Vladimir Davydov
` (9 preceding siblings ...)
2018-09-20 9:34 ` [PATCH 10/11] vinyl: implement quota wait queue without fiber_cond Vladimir Davydov
@ 2018-09-20 9:34 ` Vladimir Davydov
2018-09-25 23:19 ` [tarantool-patches] Re: [PATCH 00/11] vinyl: prepare for transaction throttling Konstantin Osipov
11 siblings, 0 replies; 16+ messages in thread
From: Vladimir Davydov @ 2018-09-20 9:34 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
Currently, we only limit quota consumption rate so that writers won't
hit the hard limit before memory dump is complete. However, it isn't
enough, because we also need to consider compaction: if it doesn't keep
up with dumps, read and space amplification will grow uncontrollably.
The problem is that compaction may itself be a quota consumer, as it may
generate deferred DELETE statements for secondary indexes. We can't
ignore the quota completely there, because if we did, we might hit the
memory limit and stall all writers, which is unacceptable. However, we do
want compaction to ignore the rate limit that is imposed to make sure
compaction keeps up with dumps; otherwise compaction wouldn't benefit
from such throttling.
So this patch splits the rate limit in two, soft and hard, and
introduces the concept of quota consumer priorities. There are two
priorities, low and high. High priority consumers (compaction) respect
only the hard limit while low priority consumers (the rest of the world)
observe both limits. Currently, both limits are always equal, but when
compaction-aware throttling is introduced, the hard limit will be set
solely to make sure that writers won't stall due to pending dump while
the soft limit will also depend on the compaction debt.
The tricky part here is waking up throttled consumers. The problem is
that we have to maintain two wait queues to be able to wake up high
priority consumers even when the soft rate limit is exceeded and hence
all low priority consumers have to wait. At the same time, we must
preserve fairness between the two queues to make sure that high priority
consumers don't uncontrollably throttle low priority ones; in other
words, if the soft rate limit isn't breached, then the two queues should
work like one. To resolve this issue, we assign monotonically growing
timestamps to throttled fibers and always wake up the one with the
lowest timestamp (i.e. the one that has waited the longest), no matter
if it's low or high priority.
---
src/box/vinyl.c | 31 ++++++++++------
src/box/vy_quota.c | 106 +++++++++++++++++++++++++++++++++++++++--------------
src/box/vy_quota.h | 53 +++++++++++++++++++++++----
3 files changed, 145 insertions(+), 45 deletions(-)
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 4c1e860d..2c74df89 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -2346,7 +2346,8 @@ vinyl_engine_prepare(struct engine *engine, struct txn *txn)
* the transaction to be sent to read view or aborted, we call
* it before checking for conflicts.
*/
- if (vy_quota_use(&env->quota, tx->write_size, timeout) != 0) {
+ if (vy_quota_use(&env->quota, tx->write_size,
+ VY_QUOTA_CONSUMER_DEFAULT, timeout) != 0) {
diag_set(ClientError, ER_VY_QUOTA_TIMEOUT);
return -1;
}
@@ -2358,7 +2359,8 @@ vinyl_engine_prepare(struct engine *engine, struct txn *txn)
size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
assert(mem_used_after >= mem_used_before);
vy_quota_adjust(&env->quota, tx->write_size,
- mem_used_after - mem_used_before);
+ mem_used_after - mem_used_before,
+ VY_QUOTA_CONSUMER_DEFAULT);
return rc;
}
@@ -2382,7 +2384,8 @@ vinyl_engine_commit(struct engine *engine, struct txn *txn)
size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
assert(mem_used_after >= mem_used_before);
/* We can't abort the transaction at this point, use force. */
- vy_quota_force_use(&env->quota, mem_used_after - mem_used_before);
+ vy_quota_force_use(&env->quota, mem_used_after - mem_used_before,
+ VY_QUOTA_CONSUMER_DEFAULT);
txn->engine_tx = NULL;
if (!txn->is_autocommit)
@@ -3160,7 +3163,8 @@ vinyl_space_apply_initial_join_row(struct space *space, struct request *request)
* quota accounting.
*/
size_t reserved = tx->write_size;
- if (vy_quota_use(&env->quota, reserved, TIMEOUT_INFINITY) != 0)
+ if (vy_quota_use(&env->quota, reserved, VY_QUOTA_CONSUMER_DEFAULT,
+ TIMEOUT_INFINITY) != 0)
unreachable();
size_t mem_used_before = lsregion_used(&env->mem_env.allocator);
@@ -3179,7 +3183,8 @@ vinyl_space_apply_initial_join_row(struct space *space, struct request *request)
size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
assert(mem_used_after >= mem_used_before);
size_t used = mem_used_after - mem_used_before;
- vy_quota_adjust(&env->quota, reserved, used);
+ vy_quota_adjust(&env->quota, reserved, used,
+ VY_QUOTA_CONSUMER_DEFAULT);
return rc;
}
@@ -3500,7 +3505,8 @@ vy_squash_process(struct vy_squash *squash)
*/
vy_mem_commit_stmt(mem, region_stmt);
vy_quota_force_use(&env->quota,
- mem_used_after - mem_used_before);
+ mem_used_after - mem_used_before,
+ VY_QUOTA_CONSUMER_DEFAULT);
}
return rc;
}
@@ -3974,8 +3980,9 @@ vy_build_insert_tuple(struct vy_env *env, struct vy_lsm *lsm,
/* Consume memory quota. Throttle if it is exceeded. */
size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
assert(mem_used_after >= mem_used_before);
- vy_quota_force_use(&env->quota, mem_used_after - mem_used_before);
- vy_quota_wait(&env->quota);
+ vy_quota_force_use(&env->quota, mem_used_after - mem_used_before,
+ VY_QUOTA_CONSUMER_DEFAULT);
+ vy_quota_wait(&env->quota, VY_QUOTA_CONSUMER_DEFAULT);
return rc;
}
@@ -4101,7 +4108,8 @@ vy_build_recover(struct vy_env *env, struct vy_lsm *lsm, struct vy_lsm *pk)
mem_used_after = lsregion_used(&env->mem_env.allocator);
assert(mem_used_after >= mem_used_before);
- vy_quota_force_use(&env->quota, mem_used_after - mem_used_before);
+ vy_quota_force_use(&env->quota, mem_used_after - mem_used_before,
+ VY_QUOTA_CONSUMER_DEFAULT);
return rc;
}
@@ -4317,7 +4325,7 @@ vy_deferred_delete_on_replace(struct trigger *trigger, void *event)
*/
struct vy_env *env = vy_env(space->engine);
if (is_first_statement)
- vy_quota_wait(&env->quota);
+ vy_quota_wait(&env->quota, VY_QUOTA_CONSUMER_DEFAULT);
/* Create the deferred DELETE statement. */
struct vy_lsm *pk = vy_lsm(space->index[0]);
@@ -4404,7 +4412,8 @@ vy_deferred_delete_on_replace(struct trigger *trigger, void *event)
}
size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
assert(mem_used_after >= mem_used_before);
- vy_quota_force_use(&env->quota, mem_used_after - mem_used_before);
+ vy_quota_force_use(&env->quota, mem_used_after - mem_used_before,
+ VY_QUOTA_CONSUMER_DEFAULT);
tuple_unref(delete);
if (rc != 0)
diff --git a/src/box/vy_quota.c b/src/box/vy_quota.c
index 3374aacc..8e40ab88 100644
--- a/src/box/vy_quota.c
+++ b/src/box/vy_quota.c
@@ -125,8 +125,10 @@ vy_quota_trigger_dump(struct vy_quota *q)
*/
size_t quota_left = q->used < q->limit ? q->limit - q->used : 0;
size_t max_use_rate = quota_left * q->dump_bw / (q->used + 1);
- if (q->rl.rate > max_use_rate)
- vy_quota_rl_set(&q->rl, max_use_rate);
+ if (q->rl.soft.rate > max_use_rate)
+ vy_quota_rl_set(&q->rl.soft, max_use_rate);
+ if (q->rl.hard.rate > max_use_rate)
+ vy_quota_rl_set(&q->rl.hard, max_use_rate);
q->dump_in_progress = true;
}
@@ -147,13 +149,45 @@ vy_quota_check_watermark(struct vy_quota *q)
static void
vy_quota_signal(struct vy_quota *q)
{
- if (!rlist_empty(&q->wait_queue)) {
- struct vy_quota_wait_node *n;
- n = rlist_first_entry(&q->wait_queue,
- struct vy_quota_wait_node,
- in_wait_queue);
- fiber_wakeup(n->fiber);
+ /*
+ * If hard rate limit is exceeded, neither low nor high
+ * priority consumers may proceed so no point in waking
+ * anyone.
+ */
+ if (!vy_quota_rl_may_use(&q->rl.hard))
+ return;
+
+ struct vy_quota_wait_node *n1 = NULL;
+ struct vy_quota_wait_node *n2 = NULL;
+
+ /*
+ * Wake up low priority consumers only if soft rate limit
+ * is not exceeded.
+ */
+ if (vy_quota_rl_may_use(&q->rl.soft) &&
+ !rlist_empty(&q->wait_queue[VY_QUOTA_CONSUMER_LO])) {
+ n1 = rlist_first_entry(&q->wait_queue[VY_QUOTA_CONSUMER_LO],
+ struct vy_quota_wait_node,
+ in_wait_queue);
}
+ if (!rlist_empty(&q->wait_queue[VY_QUOTA_CONSUMER_HI])) {
+ n2 = rlist_first_entry(&q->wait_queue[VY_QUOTA_CONSUMER_HI],
+ struct vy_quota_wait_node,
+ in_wait_queue);
+ }
+
+ int64_t ts1 = n1 != NULL ? n1->timestamp : INT64_MAX;
+ int64_t ts2 = n2 != NULL ? n2->timestamp : INT64_MAX;
+
+ /*
+ * Wake up the consumer that has waited the longest, no
+ * matter whether it's high or low priority. This ensures
+ * that high priority consumers don't uncontrollably
+ * throttle low priority ones.
+ */
+ struct vy_quota_wait_node *n = ts1 < ts2 ? n1 : n2;
+ if (n != NULL)
+ fiber_wakeup(n->fiber);
}
static void
@@ -193,7 +227,8 @@ vy_quota_timer_cb(ev_loop *loop, ev_timer *timer, int events)
/*
* Replenish quota and wake up throttled fibers, if any.
*/
- vy_quota_rl_refill(&q->rl);
+ vy_quota_rl_refill(&q->rl.soft);
+ vy_quota_rl_refill(&q->rl.hard);
vy_quota_signal(q);
}
@@ -228,7 +263,8 @@ vy_quota_create(struct vy_quota *q, size_t limit,
q->too_long_threshold = TIMEOUT_INFINITY;
q->dump_bw = VY_DEFAULT_DUMP_BANDWIDTH;
q->trigger_dump_cb = trigger_dump_cb;
- rlist_create(&q->wait_queue);
+ rlist_create(&q->wait_queue[VY_QUOTA_CONSUMER_LO]);
+ rlist_create(&q->wait_queue[VY_QUOTA_CONSUMER_HI]);
ev_timer_init(&q->timer, vy_quota_timer_cb, 0,
VY_QUOTA_UPDATE_INTERVAL);
q->timer.data = q;
@@ -241,8 +277,10 @@ vy_quota_enable(struct vy_quota *q)
assert(!q->is_enabled);
q->is_enabled = true;
q->use_curr = 0;
- vy_quota_rl_set(&q->rl, q->dump_bw);
- vy_quota_rl_refill(&q->rl);
+ vy_quota_rl_set(&q->rl.soft, q->dump_bw);
+ vy_quota_rl_set(&q->rl.hard, q->dump_bw);
+ vy_quota_rl_refill(&q->rl.soft);
+ vy_quota_rl_refill(&q->rl.hard);
ev_timer_start(loop(), &q->timer);
vy_quota_check_watermark(q);
}
@@ -307,7 +345,8 @@ vy_quota_dump(struct vy_quota *q, size_t size, double duration)
* limit to the dump bandwidth rather than disabling it
* completely.
*/
- vy_quota_rl_set(&q->rl, q->dump_bw);
+ vy_quota_rl_set(&q->rl.soft, q->dump_bw);
+ vy_quota_rl_set(&q->rl.hard, q->dump_bw);
}
void
@@ -318,11 +357,14 @@ vy_quota_reset_dump_bandwidth(struct vy_quota *q, size_t max)
}
void
-vy_quota_force_use(struct vy_quota *q, size_t size)
+vy_quota_force_use(struct vy_quota *q, size_t size,
+ enum vy_quota_consumer_prio prio)
{
q->used += size;
q->use_curr += size;
- vy_quota_rl_use(&q->rl, size);
+ vy_quota_rl_use(&q->rl.hard, size);
+ if (prio == VY_QUOTA_CONSUMER_LO)
+ vy_quota_rl_use(&q->rl.soft, size);
vy_quota_check_watermark(q);
}
@@ -331,21 +373,26 @@ vy_quota_force_use(struct vy_quota *q, size_t size)
* or false if consumers have to wait.
*/
static bool
-vy_quota_may_use(struct vy_quota *q, size_t size)
+vy_quota_may_use(struct vy_quota *q, size_t size,
+ enum vy_quota_consumer_prio prio)
{
if (!q->is_enabled)
return true;
if (q->used + size > q->limit)
return false;
- if (!vy_quota_rl_may_use(&q->rl))
+ if (!vy_quota_rl_may_use(&q->rl.hard))
+ return false;
+ if (prio == VY_QUOTA_CONSUMER_LO &&
+ !vy_quota_rl_may_use(&q->rl.soft))
return false;
return true;
}
int
-vy_quota_use(struct vy_quota *q, size_t size, double timeout)
+vy_quota_use(struct vy_quota *q, size_t size,
+ enum vy_quota_consumer_prio prio, double timeout)
{
- if (!vy_quota_may_use(q, size)) {
+ if (!vy_quota_may_use(q, size, prio)) {
/* Wait for quota. */
double start_time = ev_monotonic_now(loop());
double deadline = start_time + timeout;
@@ -362,12 +409,13 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
struct vy_quota_wait_node node = {
.fiber = fiber(),
+ .timestamp = ++q->wait_timestamp,
};
- rlist_add_tail_entry(&q->wait_queue,
+ rlist_add_tail_entry(&q->wait_queue[prio],
&node, in_wait_queue);
fiber_yield_timeout(deadline - now);
rlist_del_entry(&node, in_wait_queue);
- } while (!vy_quota_may_use(q, size));
+ } while (!vy_quota_may_use(q, size, prio));
double wait_time = ev_monotonic_now(loop()) - start_time;
if (wait_time > q->too_long_threshold) {
@@ -380,26 +428,30 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout)
*/
vy_quota_signal(q);
}
- vy_quota_force_use(q, size);
+ vy_quota_force_use(q, size, prio);
return 0;
}
static void
-vy_quota_unuse(struct vy_quota *q, size_t size)
+vy_quota_unuse(struct vy_quota *q, size_t size,
+ enum vy_quota_consumer_prio prio)
{
assert(q->used >= size);
q->used -= size;
/* use_curr could have been reset on timeout. */
q->use_curr -= MIN(q->use_curr, size);
- vy_quota_rl_unuse(&q->rl, size);
+ vy_quota_rl_unuse(&q->rl.hard, size);
+ if (prio == VY_QUOTA_CONSUMER_LO)
+ vy_quota_rl_unuse(&q->rl.soft, size);
vy_quota_signal(q);
}
void
-vy_quota_adjust(struct vy_quota *q, size_t reserved, size_t used)
+vy_quota_adjust(struct vy_quota *q, size_t reserved, size_t used,
+ enum vy_quota_consumer_prio prio)
{
if (reserved > used)
- vy_quota_unuse(q, reserved - used);
+ vy_quota_unuse(q, reserved - used, prio);
if (reserved < used)
- vy_quota_force_use(q, used - reserved);
+ vy_quota_force_use(q, used - reserved, prio);
}
diff --git a/src/box/vy_quota.h b/src/box/vy_quota.h
index f53bb8a9..31fd0d54 100644
--- a/src/box/vy_quota.h
+++ b/src/box/vy_quota.h
@@ -33,6 +33,7 @@
#include <stdbool.h>
#include <stddef.h>
+#include <stdint.h>
#include <small/rlist.h>
#include <tarantool_ev.h>
@@ -44,6 +45,18 @@ struct vy_quota;
struct histogram;
struct fiber;
+/**
+ * Quota consumer priority. Determines how a consumer will be
+ * rate limited, see vy_quota::rl.
+ */
+enum vy_quota_consumer_prio {
+ VY_QUOTA_CONSUMER_LO = 0,
+ VY_QUOTA_CONSUMER_HI = 1,
+ vy_quota_consumer_prio_MAX,
+
+ VY_QUOTA_CONSUMER_DEFAULT = VY_QUOTA_CONSUMER_LO,
+};
+
typedef int
(*vy_quota_trigger_dump_f)(struct vy_quota *quota);
@@ -66,6 +79,13 @@ struct vy_quota_wait_node {
struct rlist in_wait_queue;
/** Fiber waiting for quota. */
struct fiber *fiber;
+ /**
+ * Timestamp assigned to this fiber when it was
+ * put to sleep (see vy_quota::wait_timestamp).
+ * It is used to balance wake-ups between low and
+ * high priority queues.
+ */
+ int64_t timestamp;
};
/**
@@ -98,7 +118,12 @@ struct vy_quota {
* vy_quota_wait_node::state. Newcomers are added
* to the tail.
*/
- struct rlist wait_queue;
+ struct rlist wait_queue[vy_quota_consumer_prio_MAX];
+ /**
+ * Monotonically growing timestamp assigned to fibers
+ * waiting for quota, see vy_quota_wait_node::timestamp.
+ */
+ int64_t wait_timestamp;
/**
* Timer used for updating average use rate, calculating
* quota watermark and refilling rate limit value.
@@ -116,7 +141,18 @@ struct vy_quota {
*/
size_t use_rate;
/** Rate limiter. */
- struct vy_quota_rl rl;
+ struct {
+ /**
+ * Soft rate limit, bypassed by high priority
+ * consumers.
+ */
+ struct vy_quota_rl soft;
+ /**
+ * Hard rate limit, observed by both low and
+ * high priority consumers.
+ */
+ struct vy_quota_rl hard;
+ } rl;
/**
* Called when quota is consumed if used >= watermark.
* It is supposed to trigger memory dump and return 0
@@ -201,7 +237,8 @@ vy_quota_reset_dump_bandwidth(struct vy_quota *q, size_t max);
* this function does not throttle the caller.
*/
void
-vy_quota_force_use(struct vy_quota *q, size_t size);
+vy_quota_force_use(struct vy_quota *q, size_t size,
+ enum vy_quota_consumer_prio prio);
/**
* Try to consume @size bytes of memory, throttle the caller
@@ -238,7 +275,8 @@ vy_quota_force_use(struct vy_quota *q, size_t size);
* account while estimating the size of a memory allocation.
*/
int
-vy_quota_use(struct vy_quota *q, size_t size, double timeout);
+vy_quota_use(struct vy_quota *q, size_t size,
+ enum vy_quota_consumer_prio prio, double timeout);
/**
* Adjust quota after allocating memory.
@@ -249,15 +287,16 @@ vy_quota_use(struct vy_quota *q, size_t size, double timeout);
* See also vy_quota_use().
*/
void
-vy_quota_adjust(struct vy_quota *q, size_t reserved, size_t used);
+vy_quota_adjust(struct vy_quota *q, size_t reserved, size_t used,
+ enum vy_quota_consumer_prio prio);
/**
* Block the caller until the quota is not exceeded.
*/
static inline void
-vy_quota_wait(struct vy_quota *q)
+vy_quota_wait(struct vy_quota *q, enum vy_quota_consumer_prio prio)
{
- vy_quota_use(q, 0, TIMEOUT_INFINITY);
+ vy_quota_use(q, 0, prio, TIMEOUT_INFINITY);
}
#if defined(__cplusplus)
--
2.11.0
^ permalink raw reply [flat|nested] 16+ messages in thread
* [tarantool-patches] Re: [PATCH 00/11] vinyl: prepare for transaction throttling
2018-09-20 9:34 [PATCH 00/11] vinyl: prepare for transaction throttling Vladimir Davydov
` (10 preceding siblings ...)
2018-09-20 9:34 ` [PATCH 11/11] vinyl: split quota consumption rate limit into soft and hard Vladimir Davydov
@ 2018-09-25 23:19 ` Konstantin Osipov
11 siblings, 0 replies; 16+ messages in thread
From: Konstantin Osipov @ 2018-09-25 23:19 UTC (permalink / raw)
To: tarantool-patches
* Vladimir Davydov <vdavydov.dev@gmail.com> [18/09/20 12:34]:
> This patch set contains preparatory patches necessary for implementing
> transaction throttling. It does some refactoring, introduces basic
> throttling based on dump bandwidth, and splits the rate limit in soft
> and hard, which is necessary for compaction based throttling. The only
> thing left is invent such a throttling policy that guarantees that
> compaction always keeps up with dumps.
Re refactoring of vy_quota: I continue to believe that there is an
underlying low-level quota object, encapsulating the logic of a
limit and the current consumption rate.
Before throttling, we had a single resource we consume:
vinyl.memory; there was no logical limit on disk bandwidth.
Throttling is essentially the introduction and enforcement of a
logical limit on disk bandwidth.
I disagree that, with the addition of this new logical limit, the
quota API should be changed. The current API is conceptually very
simple. A quota is two variables: a limit and the current
consumption rate. So there are two sets of methods: to
change/adjust the limit, and to consume/release the use rate, in
other words, use/release the associated resource.
Introduction of a new resource type changes nothing about the way
we work with the quota. We simply get a new pair of limit and use
rate, and begin working with it in addition to the pair related to
memory.
So I believe that the introduction of a new resource does not
mandate shifting the focus of the quota API from working with the
internal state of the quota (limit and use rate) to essentially
the life cycle of the LSM tree, i.e. handling of events such
as a dump or a compaction. We may have one set of events which
lead to consumption and release of the quota today, and another
tomorrow.
Basically, the throttling task has two subtasks:
- introduction of disk bandwidth as a separate resource whose
usage is accounted for across the code base
This is this patch set's concern. This should have nothing
to do with LSM details. The main consumer of the bandwidth
is DML, since it essentially leads to level 0 dump.
- introduction of a separate entity, part of our LSM
implementation, which tracks events in the LSM
tree life cycle, such as compaction and dump, and dynamically
adjusts the limit based on predicted implications of these
events on the available disk bandwidth (or actually the
remainder of it, since we assume that compaction's use
of disk bandwidth is not limited by the quota).
We still have no simple solution for the second part, "the
predictor". It's therefore best, I agree, to separate the two
stacks of patches, since we may want to change the predictive
model in the future without changing the rate limiting logic
itself.
>
> https://github.com/tarantool/tarantool/issues/1862
> https://github.com/tarantool/tarantool/tree/dv/gh-1862-vy-throttling
>
> Vladimir Davydov (11):
> vinyl: merge vy_quota_release and vy_quota_update_dump_bandwidth
> vinyl: refactor quota use/unuse methods
> vinyl: do not try to trigger dump if it is already in progress
> vinyl: don't start quota timer until local recovery is complete
> vinyl: add helper to start scheduler and enable quota on startup
> vinyl: zap vy_env::memory, read_threads, and write_threads
> vinyl: do not account zero dump bandwidth
> vinyl: set quota timer period to 100 ms
> vinyl: implement basic transaction throttling
> vinyl: implement quota wait queue without fiber_cond
> vinyl: split quota consumption rate limit into soft and hard
>
> src/box/vinyl.c | 79 +++++-----
> src/box/vy_quota.c | 340 +++++++++++++++++++++++++++++++++----------
> src/box/vy_quota.h | 146 +++++++++++++++----
> src/box/vy_run.c | 12 +-
> src/box/vy_run.h | 17 ++-
> test/unit/vy_point_lookup.c | 2 +-
> test/vinyl/suite.ini | 2 +-
> test/vinyl/throttle.result | 102 +++++++++++++
> test/vinyl/throttle.test.lua | 54 +++++++
> 9 files changed, 601 insertions(+), 153 deletions(-)
> create mode 100644 test/vinyl/throttle.result
> create mode 100644 test/vinyl/throttle.test.lua
>
> --
> 2.11.0
>
--
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov
^ permalink raw reply [flat|nested] 16+ messages in thread