[tarantool-patches] [PATCH] vinyl: allow to build secondary index for non-empty space
Vladislav Shpilevoy
v.shpilevoy at tarantool.org
Mon May 21 18:18:18 MSK 2018
Hello. Thanks for the patch! First of all, look at my review fixes that I
pushed to the branch in a separate commit. I will not comment here on places
that are already fixed. See my 9 other comments below. All of them are just
recommendations, and you can skip them if you want.
> diff --git a/src/box/vinyl.c b/src/box/vinyl.c
> index ff4c2831..1f88be08 100644
> --- a/src/box/vinyl.c
> +++ b/src/box/vinyl.c> +
> +/**
> + * Insert a tuple fetched from the space into the LSM tree that
> + * is currently being built.
> + */
> +static int
> +vy_build_insert_tuple(struct vy_env *env, struct vy_lsm *lsm,
> + const char *space_name, const char *index_name,
> + struct tuple_format *new_format, struct tuple *tuple)
> +{
> /*
> - * During local recovery we are loading existing indexes
> - * from disk, not building new ones.
> - */
> - if (env->status != VINYL_INITIAL_RECOVERY_LOCAL &&
> - env->status != VINYL_FINAL_RECOVERY_LOCAL) {
> - if (pk->stat.disk.count.rows != 0 ||
> - pk->stat.memory.count.rows != 0) {
> - diag_set(ClientError, ER_UNSUPPORTED, "Vinyl",
> - "building an index for a non-empty space");
> + * Use the last LSN to the time, because read iterator
> + * guarantees that this is the newest tuple version.
> + */
> + int64_t lsn = env->xm->lsn;
> + struct vy_mem *mem = lsm->mem;
> +
> + /* Check the tuple against the new space format. */
> + if (tuple_validate(new_format, tuple) != 0)
> + return -1;
> +
> + /* Check unique constraint if necessary. */
> + if (lsm->check_is_unique &&
> + (!lsm->key_def->is_nullable ||
> + !vy_tuple_key_contains_null(tuple, lsm->key_def))) {
> + struct tuple *key = vy_stmt_extract_key(tuple, lsm->key_def,
> + lsm->env->key_format);
> + if (key == NULL)
> + return -1;
> + /*
> + * Make sure the in-memory index won't go away
> + * while we are reading disk.
> + */
> + vy_mem_pin(mem);
> +
> + struct tuple *found;
> + int rc = vy_lsm_get(lsm, NULL, &env->xm->p_committed_read_view,
> + key, &found);
> + vy_mem_unpin(mem);
> + tuple_unref(key);
> +
> + if (rc != 0)
> + return -1;
> +
> + if (found != NULL &&
> + vy_stmt_compare(tuple, found, lsm->cmp_def) == 0) {
> + tuple_unref(found);
> + return 0;
> + }
> + if (found != NULL) {
> + tuple_unref(found);
> + diag_set(ClientError, ER_TUPLE_FOUND,
> + index_name, space_name);
> return -1;
> }
1. This code is identical to vy_secondary_check_is_unique() introduced by
me in the review-fixes commit except for one thing - it uses the committed
read view and does not have a transaction. Maybe you can think of a way to
reuse vy_secondary_check_is_unique() here.
> }
>
> + /* Reallocate the new tuple using the new space format. */
> + uint32_t data_len;
> + const char *data = tuple_data_range(tuple, &data_len);
> + struct tuple *stmt = vy_stmt_new_replace(new_format, data,
> + data + data_len);
> + if (stmt == NULL)
> + return -1;
> +
> + /* Insert the new tuple into the in-memory index. */
> + size_t mem_used_before = lsregion_used(&env->mem_env.allocator);
> + const struct tuple *region_stmt = vy_stmt_dup_lsregion(stmt,
> + &mem->env->allocator, mem->generation);
> + tuple_unref(stmt);
2. I do not like this temporary stmt needed only to reuse
vy_stmt_dup_lsregion(). Here you can create the region tuple without an
intermediate tuple allocated with malloc.
3. This code is very similar to vy_lsm_set - it too calls
vy_stmt_dup_lsregion and vy_mem_insert. Maybe there is a way to reuse
vy_lsm_set?
> + if (region_stmt == NULL)
> + return -1;
> + vy_stmt_set_lsn((struct tuple *)region_stmt, lsn);
> + if (vy_mem_insert(mem, region_stmt) != 0)
> + return -1;
> +
> + mem->min_lsn = MIN(mem->min_lsn, lsn);
> + mem->max_lsn = MAX(mem->max_lsn, lsn);
4. How about move this into vy_mem_insert? It is the second place, where
min/max_lsn are updated right after vy_mem_insert.
> + vy_stmt_counter_acct_tuple(&lsm->stat.memory.count, region_stmt);
> +
> + size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
> + assert(mem_used_after >= mem_used_before);
> + vy_quota_force_use(&env->quota, mem_used_after - mem_used_before);
> + vy_quota_wait(&env->quota);
> + return 0;
> +}
> +
> +/**
> + * Recover a single statement that was inserted into the space
> + * while the newly built index was dumped to disk.
> + */
> +static int
> +vy_build_recover_mem_stmt(struct vy_lsm *lsm, struct vy_lsm *pk,
> + const struct tuple *mem_stmt)
> +{
> + int64_t lsn = vy_stmt_lsn(mem_stmt);
> + if (lsn <= lsm->dump_lsn)
> + return 0; /* statement was dumped, nothing to do */
> +
> + /* Lookup the tuple that was affected by the statement. */
> + const struct vy_read_view rv = { .vlsn = lsn - 1 };
> + const struct vy_read_view *p_rv = &rv;
> + struct tuple *old_tuple;
> + if (vy_point_lookup(pk, NULL, &p_rv, (struct tuple *)mem_stmt,
> + &old_tuple) != 0)
> + return -1;
> +
> + int rc = -1;
> + uint32_t data_len;
> + const char *data;
> + struct tuple *new_tuple;
> + struct tuple *delete = NULL;
> + struct tuple *insert = NULL;
> +
> + /*
> + * Create DELETE + REPLACE statements corresponding to
> + * the given statement in the secondary index.
> + */
> + switch (vy_stmt_type(mem_stmt)) {
> + case IPROTO_INSERT:
> + case IPROTO_REPLACE:
> + data = tuple_data_range(mem_stmt, &data_len);
> + insert = vy_stmt_new_insert(lsm->mem_format,
> + data, data + data_len);
> + if (insert == NULL)
> + goto out;
> + goto make_delete;
> +
> + case IPROTO_UPSERT:
> + new_tuple = vy_apply_upsert(mem_stmt, old_tuple, pk->cmp_def,
> + pk->mem_format, true);
> + if (new_tuple == NULL)
> + goto out;
> + data = tuple_data_range(new_tuple, &data_len);
> + insert = vy_stmt_new_insert(lsm->mem_format,
> + data, data + data_len);
> + tuple_unref(new_tuple);
> + if (insert == NULL)
> + goto out;
> + goto make_delete;
> +
> + case IPROTO_DELETE: make_delete:
> + if (old_tuple != NULL) {
> + delete = vy_stmt_new_surrogate_delete(lsm->mem_format,
> + old_tuple);
> + if (delete == NULL)
> + goto out;
> + }
> + break;
> + default:
> + unreachable();
> + }
> +
> + /* Insert DELETE + REPLACE into the LSM tree. */
> + if (delete != NULL) {
> + struct tuple *region_stmt = vy_stmt_dup_lsregion(delete,
> + &lsm->mem->env->allocator,
> + lsm->mem->generation);
> + if (region_stmt == NULL)
> + goto out;
> + vy_stmt_set_lsn((struct tuple *)region_stmt, lsn);
> + if (vy_mem_insert(lsm->mem, region_stmt) != 0)
> + goto out;
> + vy_stmt_counter_acct_tuple(&lsm->stat.memory.count, delete);
> + }
> + if (insert != NULL) {
> + struct tuple *region_stmt = vy_stmt_dup_lsregion(insert,
> + &lsm->mem->env->allocator,
> + lsm->mem->generation);
> + if (region_stmt == NULL)
> + goto out;
> + vy_stmt_set_lsn((struct tuple *)region_stmt, lsn);
> + if (vy_mem_insert(lsm->mem, region_stmt) != 0)
> + goto out;
> + vy_stmt_counter_acct_tuple(&lsm->stat.memory.count, insert);
> + }
> + lsm->mem->min_lsn = MIN(lsm->mem->min_lsn, lsn);
> + lsm->mem->max_lsn = MAX(lsm->mem->max_lsn, lsn);
> +
> + rc = 0;
> +out:
> + if (old_tuple != NULL)
> + tuple_unref(old_tuple);
> + if (delete != NULL)
> + tuple_unref(delete);
> + if (insert != NULL)
> + tuple_unref(insert);
> + return rc;
> +
> +}
5. I propose this diff (I did not check that it works, but it looks more
readable and compact, in my opinion):
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index ea9787195..9a10f6ad3 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -1531,9 +1531,10 @@ vy_build_recover_mem_stmt(struct vy_lsm *lsm, struct vy_lsm *pk,
int rc = -1;
uint32_t data_len;
const char *data;
- struct tuple *new_tuple;
- struct tuple *delete = NULL;
- struct tuple *insert = NULL;
+ struct tuple *new_tuple, *region_stmt;
+ struct tuple *insert, *delete;
+ struct lsregion *region = &lsm->mem->env->allocator;
+ int64_t gen = lsm->mem->generation;
/*
* Create DELETE + REPLACE statements corresponding to
@@ -1547,6 +1548,8 @@ vy_build_recover_mem_stmt(struct vy_lsm *lsm, struct vy_lsm *pk,
data, data + data_len);
if (insert == NULL)
goto out;
+ if (old_tuple == NULL)
+ break;
goto make_delete;
case IPROTO_UPSERT:
@@ -1560,43 +1563,38 @@ vy_build_recover_mem_stmt(struct vy_lsm *lsm, struct vy_lsm *pk,
tuple_unref(new_tuple);
if (insert == NULL)
goto out;
- goto make_delete;
-
- case IPROTO_DELETE: make_delete:
- if (old_tuple != NULL) {
- delete = vy_stmt_new_surrogate_delete(lsm->mem_format,
- old_tuple);
- if (delete == NULL)
- goto out;
- }
- break;
- default:
- unreachable();
- }
+ if (old_tuple == NULL)
+ break;
+ FALLTHROUGH;
- /* Insert DELETE + REPLACE into the LSM tree. */
- if (delete != NULL) {
- struct tuple *region_stmt = vy_stmt_dup_lsregion(delete,
- &lsm->mem->env->allocator,
- lsm->mem->generation);
- if (region_stmt == NULL)
- goto out;
- vy_stmt_set_lsn((struct tuple *)region_stmt, lsn);
- if (vy_mem_insert(lsm->mem, region_stmt) != 0)
+ make_delete:
+ delete = vy_stmt_new_surrogate_delete(lsm->mem_format,
+ old_tuple);
+ if (delete == NULL)
goto out;
- vy_stmt_counter_acct_tuple(&lsm->stat.memory.count, delete);
- }
- if (insert != NULL) {
- struct tuple *region_stmt = vy_stmt_dup_lsregion(insert,
- &lsm->mem->env->allocator,
- lsm->mem->generation);
+ /* Insert DELETE + REPLACE into the LSM tree. */
+ region_stmt = vy_stmt_dup_lsregion(delete, region, gen);
+ tuple_unref(delete);
if (region_stmt == NULL)
goto out;
vy_stmt_set_lsn((struct tuple *)region_stmt, lsn);
if (vy_mem_insert(lsm->mem, region_stmt) != 0)
goto out;
- vy_stmt_counter_acct_tuple(&lsm->stat.memory.count, insert);
+ vy_stmt_counter_acct_tuple(&lsm->stat.memory.count,
+ region_stmt);
+ break;
+ default:
+ unreachable();
}
+
+ region_stmt = vy_stmt_dup_lsregion(insert, region, gen);
+ tuple_unref(insert);
+ if (region_stmt == NULL)
+ goto out;
+ vy_stmt_set_lsn((struct tuple *)region_stmt, lsn);
+ if (vy_mem_insert(lsm->mem, region_stmt) != 0)
+ goto out;
+ vy_stmt_counter_acct_tuple(&lsm->stat.memory.count, region_stmt);
lsm->mem->min_lsn = MIN(lsm->mem->min_lsn, lsn);
lsm->mem->max_lsn = MAX(lsm->mem->max_lsn, lsn);
@@ -1604,10 +1602,6 @@ vy_build_recover_mem_stmt(struct vy_lsm *lsm, struct vy_lsm *pk,
out:
if (old_tuple != NULL)
tuple_unref(old_tuple);
- if (delete != NULL)
- tuple_unref(delete);
- if (insert != NULL)
- tuple_unref(insert);
return rc;
}
> +
> +static int
> +vinyl_space_build_index(struct space *src_space, struct index *new_index,
> + struct tuple_format *new_format)
> +{
> + struct vy_env *env = vy_env(src_space->engine);
> + struct vy_lsm *pk = vy_lsm(src_space->index[0]);
> + bool is_empty = (pk->stat.disk.count.rows == 0 &&
> + pk->stat.memory.count.rows == 0);
> +
> + if (new_index->def->iid == 0 && !is_empty) {
> + diag_set(ClientError, ER_UNSUPPORTED, "Vinyl",
> + "rebuilding the primary index of a non-empty space");
> + return -1;
> + }
> +
> if (vinyl_index_open(new_index) != 0)
> return -1;
>
> /* Set pointer to the primary key for the new index. */
> - vy_lsm_update_pk(vy_lsm(new_index), pk);
> - return 0;
> + struct vy_lsm *new_lsm = vy_lsm(new_index);
> + vy_lsm_update_pk(new_lsm, pk);
> +
> + if (env->status == VINYL_INITIAL_RECOVERY_LOCAL ||
> + env->status == VINYL_FINAL_RECOVERY_LOCAL)
> + return vy_build_recover(env, new_lsm, pk);
> +
> + if (is_empty)
> + return 0;
> +
> + if (vy_build_prepare(env, new_lsm) != 0)
> + return -1;
> +
> + /*
> + * Iterate over all tuples stored in the space and insert
> + * each of them into the new LSM tree. Since read iterator
> + * may yield, we install an on_replace trigger to forward
> + * DML requests issued during the build.
> + */
> + struct tuple *key = vy_stmt_new_select(pk->env->key_format, NULL, 0);
> + if (key == NULL)
> + return -1;
> +
> + struct trigger on_replace;
> + struct vy_build_ctx ctx;
> + ctx.lsm = new_lsm;
> + ctx.format = new_format;
> + ctx.space_name = space_name(src_space);
> + ctx.index_name = new_index->def->name;
> + ctx.is_failed = false;
> + diag_create(&ctx.diag);
> + trigger_create(&on_replace, vy_build_on_replace, &ctx, NULL);
> + trigger_add(&src_space->on_replace, &on_replace);
> +
> + struct vy_read_iterator itr;
> + vy_read_iterator_open(&itr, pk, NULL, ITER_ALL, key,
> + &env->xm->p_committed_read_view);
> + int rc;
> + int loops = 0;
> + struct tuple *tuple;
> + while ((rc = vy_read_iterator_next(&itr, &tuple)) == 0) {
> + if (tuple == NULL)
> + break;
> + /*
> + * Note, yield is not allowed here. If we yielded,
> + * the tuple could be overwritten by a concurrent
> + * transaction, in which case we would insert an
> + * outdated tuple to the new index.
> + */
> + rc = vy_build_insert_tuple(env, new_lsm, space_name(src_space),
> + new_index->def->name, new_format, tuple);
> + if (rc != 0)
> + break;
> + /*
> + * Read iterator yields only when it reads runs.
> + * Yield periodically in order not to stall the
> + * tx thread in case there are a lot of tuples in
> + * mems or cache.
> + */
> + if (++loops % VY_YIELD_LOOPS == 0)
> + fiber_sleep(0);
6. Why not fiber_yield()?
> + if (ctx.is_failed) {
> + diag_move(&ctx.diag, diag_get());
> + rc = -1;
> + break;
> + }
> + }
> + vy_read_iterator_close(&itr);
> + tuple_unref(key);
> +
> + /*
> + * Dump the new index upon build completion so that we don't
> + * have to rebuild it on recovery.
> + */
> + if (rc == 0)
> + rc = vy_scheduler_dump(&env->scheduler);
> +
> + if (rc == 0 && ctx.is_failed) {
7. How is it possible? If ctx.is_failed, then rc is set to -1 above.
> + diag_move(&ctx.diag, diag_get());
> + rc = -1;
> + }
> +
> + diag_destroy(&ctx.diag);
> + trigger_clear(&on_replace);
> + return rc;
> }
>
> static size_t
> diff --git a/src/box/vy_lsm.c b/src/box/vy_lsm.c
> index 4dfc0a25..63e18594 100644
> --- a/src/box/vy_lsm.c
> +++ b/src/box/vy_lsm.c
> @@ -743,11 +751,20 @@ int
> vy_lsm_set(struct vy_lsm *lsm, struct vy_mem *mem,
> const struct tuple *stmt, const struct tuple **region_stmt)
> {
> + uint32_t format_id = stmt->format_id;
> +
> assert(vy_stmt_is_refable(stmt));
> assert(*region_stmt == NULL || !vy_stmt_is_refable(*region_stmt));
>
> - /* Allocate region_stmt on demand. */
> - if (*region_stmt == NULL) {
> + /*
> + * Allocate region_stmt on demand.
> + *
> + * Also, reallocate region_stmt if it uses a different tuple
> + * format. This may happen during ALTER, when the LSM tree
> + * that is currently being built uses the new space format
> + * while other LSM trees still use the old space format.
> + */
> + if (*region_stmt == NULL || (*region_stmt)->format_id != format_id) {
8. Do you really need to reset region_stmt in such a case? See this case:
vy_lsm_set(pk), vy_lsm_set(old_sk1), vy_lsm_set(new_sk), vy_lsm_set(old_sk2)
Here you create 3 region statements: one for pk and old_sk1; a new one for
new_sk with the new format; and yet another one for old_sk2, because
*region_stmt now has the new format, but old_sk2 still uses the old one.
You can dup stmt on the region when a format mismatch is detected, but do not
reset *region_stmt. That allows creating only 2 region statements for the
case above. Am I right?
> *region_stmt = vy_stmt_dup_lsregion(stmt, &mem->env->allocator,
> mem->generation);
> if (*region_stmt == NULL)
> diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
> index 4db86152..7dbd6a96 100644
> --- a/src/box/vy_scheduler.h
> +++ b/src/box/vy_scheduler.h
> @@ -202,6 +202,13 @@ void
> vy_scheduler_trigger_dump(struct vy_scheduler *scheduler);
>
> /**
> + * Trigger dump of all currently existing in-memory trees
> + * and wait until it is complete. Returns 0 on success.
> + */
> +int
> +vy_scheduler_dump(struct vy_scheduler *scheduler);
9. Do you really need this at the end of sk building? Why can you not leave
the mems as they are? They will be dumped eventually anyway.
More information about the Tarantool-patches
mailing list