Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
To: tarantool-patches@freelists.org,
	Vladimir Davydov <vdavydov.dev@gmail.com>,
	kostja@tarantool.org
Subject: Re: [tarantool-patches] [PATCH] vinyl: allow to build secondary index for non-empty space
Date: Mon, 21 May 2018 18:18:18 +0300	[thread overview]
Message-ID: <c09c9df9-09ac-5fa9-bea7-997660e5aee9@tarantool.org> (raw)
In-Reply-To: <f24a0484bbf7b112ae2fa8673db9e7a189f4443b.1524318337.git.vdavydov.dev@gmail.com>

Hello. Thanks for the patch! First of all, look at my review fixes that I
pushed on the branch in a separate commit. I will not comment here already
fixed places. See my 9 other comments below. All of them are just
recommendations, and you can skip them if you want.

> diff --git a/src/box/vinyl.c b/src/box/vinyl.c
> index ff4c2831..1f88be08 100644
> --- a/src/box/vinyl.c
> +++ b/src/box/vinyl.c> +
> +/**
> + * Insert a tuple fetched from the space into the LSM tree that
> + * is currently being built.
> + */
> +static int
> +vy_build_insert_tuple(struct vy_env *env, struct vy_lsm *lsm,
> +		      const char *space_name, const char *index_name,
> +		      struct tuple_format *new_format, struct tuple *tuple)
> +{
>   	/*
> -	 * During local recovery we are loading existing indexes
> -	 * from disk, not building new ones.
> -	 */
> -	if (env->status != VINYL_INITIAL_RECOVERY_LOCAL &&
> -	    env->status != VINYL_FINAL_RECOVERY_LOCAL) {
> -		if (pk->stat.disk.count.rows != 0 ||
> -		    pk->stat.memory.count.rows != 0) {
> -			diag_set(ClientError, ER_UNSUPPORTED, "Vinyl",
> -				 "building an index for a non-empty space");
> +	 * Use the last LSN to the time, because read iterator
> +	 * guarantees that this is the newest tuple version.
> +	 */
> +	int64_t lsn = env->xm->lsn;
> +	struct vy_mem *mem = lsm->mem;
> +
> +	/* Check the tuple against the new space format. */
> +	if (tuple_validate(new_format, tuple) != 0)
> +		return -1;
> +
> +	/* Check unique constraint if necessary. */
> +	if (lsm->check_is_unique &&
> +	    (!lsm->key_def->is_nullable ||
> +	     !vy_tuple_key_contains_null(tuple, lsm->key_def))) {
> +		struct tuple *key = vy_stmt_extract_key(tuple, lsm->key_def,
> +							lsm->env->key_format);
> +		if (key == NULL)
> +			return -1;
> +		/*
> +		 * Make sure the in-memory index won't go away
> +		 * while we are reading disk.
> +		 */
> +		vy_mem_pin(mem);
> +
> +		struct tuple *found;
> +		int rc = vy_lsm_get(lsm, NULL, &env->xm->p_committed_read_view,
> +				    key, &found);
> +		vy_mem_unpin(mem);
> +		tuple_unref(key);
> +
> +		if (rc != 0)
> +			return -1;
> +
> +		if (found != NULL &&
> +		    vy_stmt_compare(tuple, found, lsm->cmp_def) == 0) {
> +			tuple_unref(found);
> +			return 0;
> +		}
> +		if (found != NULL) {
> +			tuple_unref(found);
> +			diag_set(ClientError, ER_TUPLE_FOUND,
> +				 index_name, space_name);
>   			return -1;
>   		}

1. This code is identical to vy_secondary_check_is_unique() introduced by
me in the review-fixes commit except one thing - it uses committed read
view, and does not have a transaction. Maybe you can think up how to
reuse vy_secondary_check_is_unique() here.

>   	}
>   
> +	/* Reallocate the new tuple using the new space format. */
> +	uint32_t data_len;
> +	const char *data = tuple_data_range(tuple, &data_len);
> +	struct tuple *stmt = vy_stmt_new_replace(new_format, data,
> +						 data + data_len);
> +	if (stmt == NULL)
> +		return -1;
> +
> +	/* Insert the new tuple into the in-memory index. */
> +	size_t mem_used_before = lsregion_used(&env->mem_env.allocator);
> +	const struct tuple *region_stmt = vy_stmt_dup_lsregion(stmt,
> +				&mem->env->allocator, mem->generation);
> +	tuple_unref(stmt);

2. I do not like this temporary stmt needed only to reuse
vy_stmt_dup_lsregion(). Here you can create the region tuple with no
intermediate tuple on malloc.

3. This code is very similar to vy_lsm_set - it too makes
vy_stmt_dup_lsregion, vy_mem_insert. Maybe there is a way to reuse
vy_lsm_set?

> +	if (region_stmt == NULL)
> +		return -1;
> +	vy_stmt_set_lsn((struct tuple *)region_stmt, lsn);
> +	if (vy_mem_insert(mem, region_stmt) != 0)
> +		return -1;
> +
> +	mem->min_lsn = MIN(mem->min_lsn, lsn);
> +	mem->max_lsn = MAX(mem->max_lsn, lsn);

4. How about move this into vy_mem_insert? It is the second place, where
min/max_lsn are updated right after vy_mem_insert.

> +	vy_stmt_counter_acct_tuple(&lsm->stat.memory.count, region_stmt);
> +
> +	size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
> +	assert(mem_used_after >= mem_used_before);
> +	vy_quota_force_use(&env->quota, mem_used_after - mem_used_before);
> +	vy_quota_wait(&env->quota);
> +	return 0;
> +}
> +
> +/**
> + * Recover a single statement that was inserted into the space
> + * while the newly built index was dumped to disk.
> + */
> +static int
> +vy_build_recover_mem_stmt(struct vy_lsm *lsm, struct vy_lsm *pk,
> +			  const struct tuple *mem_stmt)
> +{
> +	int64_t lsn = vy_stmt_lsn(mem_stmt);
> +	if (lsn <= lsm->dump_lsn)
> +		return 0; /* statement was dumped, nothing to do */
> +
> +	/* Lookup the tuple that was affected by the statement. */
> +	const struct vy_read_view rv = { .vlsn = lsn - 1 };
> +	const struct vy_read_view *p_rv = &rv;
> +	struct tuple *old_tuple;
> +	if (vy_point_lookup(pk, NULL, &p_rv, (struct tuple *)mem_stmt,
> +			    &old_tuple) != 0)
> +		return -1;
> +
> +	int rc = -1;
> +	uint32_t data_len;
> +	const char *data;
> +	struct tuple *new_tuple;
> +	struct tuple *delete = NULL;
> +	struct tuple *insert = NULL;
> +
> +	/*
> +	 * Create DELETE + REPLACE statements corresponding to
> +	 * the given statement in the secondary index.
> +	 */
> +	switch (vy_stmt_type(mem_stmt)) {
> +	case IPROTO_INSERT:
> +	case IPROTO_REPLACE:
> +		data = tuple_data_range(mem_stmt, &data_len);
> +		insert = vy_stmt_new_insert(lsm->mem_format,
> +					    data, data + data_len);
> +		if (insert == NULL)
> +			goto out;
> +		goto make_delete;
> +
> +	case IPROTO_UPSERT:
> +		new_tuple = vy_apply_upsert(mem_stmt, old_tuple, pk->cmp_def,
> +					    pk->mem_format, true);
> +		if (new_tuple == NULL)
> +			goto out;
> +		data = tuple_data_range(new_tuple, &data_len);
> +		insert = vy_stmt_new_insert(lsm->mem_format,
> +					    data, data + data_len);
> +		tuple_unref(new_tuple);
> +		if (insert == NULL)
> +			goto out;
> +		goto make_delete;
> +
> +	case IPROTO_DELETE: make_delete:
> +		if (old_tuple != NULL) {
> +			delete = vy_stmt_new_surrogate_delete(lsm->mem_format,
> +							      old_tuple);
> +			if (delete == NULL)
> +				goto out;
> +		}
> +		break;
> +	default:
> +		unreachable();
> +	}
> +
> +	/* Insert DELETE + REPLACE into the LSM tree. */
> +	if (delete != NULL) {
> +		struct tuple *region_stmt = vy_stmt_dup_lsregion(delete,
> +						&lsm->mem->env->allocator,
> +						lsm->mem->generation);
> +		if (region_stmt == NULL)
> +			goto out;
> +		vy_stmt_set_lsn((struct tuple *)region_stmt, lsn);
> +		if (vy_mem_insert(lsm->mem, region_stmt) != 0)
> +			goto out;
> +		vy_stmt_counter_acct_tuple(&lsm->stat.memory.count, delete);
> +	}
> +	if (insert != NULL) {
> +		struct tuple *region_stmt = vy_stmt_dup_lsregion(insert,
> +						&lsm->mem->env->allocator,
> +						lsm->mem->generation);
> +		if (region_stmt == NULL)
> +			goto out;
> +		vy_stmt_set_lsn((struct tuple *)region_stmt, lsn);
> +		if (vy_mem_insert(lsm->mem, region_stmt) != 0)
> +			goto out;
> +		vy_stmt_counter_acct_tuple(&lsm->stat.memory.count, insert);
> +	}
> +	lsm->mem->min_lsn = MIN(lsm->mem->min_lsn, lsn);
> +	lsm->mem->max_lsn = MAX(lsm->mem->max_lsn, lsn);
> +
> +	rc = 0;
> +out:
> +	if (old_tuple != NULL)
> +		tuple_unref(old_tuple);
> +	if (delete != NULL)
> +		tuple_unref(delete);
> +	if (insert != NULL)
> +		tuple_unref(insert);
> +	return rc;
> +
> +}

5. I propose this diff (I did not check it works, but looks more readable
and compact, in my opinion):


diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index ea9787195..9a10f6ad3 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -1531,9 +1531,10 @@ vy_build_recover_mem_stmt(struct vy_lsm *lsm, struct vy_lsm *pk,
  	int rc = -1;
  	uint32_t data_len;
  	const char *data;
-	struct tuple *new_tuple;
-	struct tuple *delete = NULL;
-	struct tuple *insert = NULL;
+	struct tuple *new_tuple, *region_stmt;
+	struct tuple *insert, *delete;
+	struct lsregion *region = &lsm->mem->env->allocator;
+	int64_t gen = lsm->mem->generation;
  
  	/*
  	 * Create DELETE + REPLACE statements corresponding to
@@ -1547,6 +1548,8 @@ vy_build_recover_mem_stmt(struct vy_lsm *lsm, struct vy_lsm *pk,
  					    data, data + data_len);
  		if (insert == NULL)
  			goto out;
+		if (old_tuple == NULL)
+			break;
  		goto make_delete;
  
  	case IPROTO_UPSERT:
@@ -1560,43 +1563,38 @@ vy_build_recover_mem_stmt(struct vy_lsm *lsm, struct vy_lsm *pk,
  		tuple_unref(new_tuple);
  		if (insert == NULL)
  			goto out;
-		goto make_delete;
-
-	case IPROTO_DELETE: make_delete:
-		if (old_tuple != NULL) {
-			delete = vy_stmt_new_surrogate_delete(lsm->mem_format,
-							      old_tuple);
-			if (delete == NULL)
-				goto out;
-		}
-		break;
-	default:
-		unreachable();
-	}
+		if (old_tuple == NULL)
+			break;
+		FALLTHROUGH;
  
-	/* Insert DELETE + REPLACE into the LSM tree. */
-	if (delete != NULL) {
-		struct tuple *region_stmt = vy_stmt_dup_lsregion(delete,
-						&lsm->mem->env->allocator,
-						lsm->mem->generation);
-		if (region_stmt == NULL)
-			goto out;
-		vy_stmt_set_lsn((struct tuple *)region_stmt, lsn);
-		if (vy_mem_insert(lsm->mem, region_stmt) != 0)
+	make_delete:
+		delete = vy_stmt_new_surrogate_delete(lsm->mem_format,
+						      old_tuple);
+		if (delete == NULL)
  			goto out;
-		vy_stmt_counter_acct_tuple(&lsm->stat.memory.count, delete);
-	}
-	if (insert != NULL) {
-		struct tuple *region_stmt = vy_stmt_dup_lsregion(insert,
-						&lsm->mem->env->allocator,
-						lsm->mem->generation);
+		/* Insert DELETE + REPLACE into the LSM tree. */
+		region_stmt = vy_stmt_dup_lsregion(delete, region, gen);
+		tuple_unref(delete);
  		if (region_stmt == NULL)
  			goto out;
  		vy_stmt_set_lsn((struct tuple *)region_stmt, lsn);
  		if (vy_mem_insert(lsm->mem, region_stmt) != 0)
  			goto out;
-		vy_stmt_counter_acct_tuple(&lsm->stat.memory.count, insert);
+		vy_stmt_counter_acct_tuple(&lsm->stat.memory.count,
+					   region_stmt);
+		break;
+	default:
+		unreachable();
  	}
+
+	region_stmt = vy_stmt_dup_lsregion(insert, region, gen);
+	tuple_unref(insert);
+	if (region_stmt == NULL)
+		goto out;
+	vy_stmt_set_lsn((struct tuple *)region_stmt, lsn);
+	if (vy_mem_insert(lsm->mem, region_stmt) != 0)
+		goto out;
+	vy_stmt_counter_acct_tuple(&lsm->stat.memory.count, region_stmt);
  	lsm->mem->min_lsn = MIN(lsm->mem->min_lsn, lsn);
  	lsm->mem->max_lsn = MAX(lsm->mem->max_lsn, lsn);
  
@@ -1604,10 +1602,6 @@ vy_build_recover_mem_stmt(struct vy_lsm *lsm, struct vy_lsm *pk,
  out:
  	if (old_tuple != NULL)
  		tuple_unref(old_tuple);
-	if (delete != NULL)
-		tuple_unref(delete);
-	if (insert != NULL)
-		tuple_unref(insert);
  	return rc;
  
  }

> +
> +static int
> +vinyl_space_build_index(struct space *src_space, struct index *new_index,
> +			struct tuple_format *new_format)
> +{
> +	struct vy_env *env = vy_env(src_space->engine);
> +	struct vy_lsm *pk = vy_lsm(src_space->index[0]);
> +	bool is_empty = (pk->stat.disk.count.rows == 0 &&
> +			 pk->stat.memory.count.rows == 0);
> +
> +	if (new_index->def->iid == 0 && !is_empty) {
> +		diag_set(ClientError, ER_UNSUPPORTED, "Vinyl",
> +			 "rebuilding the primary index of a non-empty space");
> +		return -1;
> +	}
> +
>   	if (vinyl_index_open(new_index) != 0)
>   		return -1;
>   
>   	/* Set pointer to the primary key for the new index. */
> -	vy_lsm_update_pk(vy_lsm(new_index), pk);
> -	return 0;
> +	struct vy_lsm *new_lsm = vy_lsm(new_index);
> +	vy_lsm_update_pk(new_lsm, pk);
> +
> +	if (env->status == VINYL_INITIAL_RECOVERY_LOCAL ||
> +	    env->status == VINYL_FINAL_RECOVERY_LOCAL)
> +		return vy_build_recover(env, new_lsm, pk);
> +
> +	if (is_empty)
> +		return 0;
> +
> +	if (vy_build_prepare(env, new_lsm) != 0)
> +		return -1;
> +
> +	/*
> +	 * Iterate over all tuples stored in the space and insert
> +	 * each of them into the new LSM tree. Since read iterator
> +	 * may yield, we install an on_replace trigger to forward
> +	 * DML requests issued during the build.
> +	 */
> +	struct tuple *key = vy_stmt_new_select(pk->env->key_format, NULL, 0);
> +	if (key == NULL)
> +		return -1;
> +
> +	struct trigger on_replace;
> +	struct vy_build_ctx ctx;
> +	ctx.lsm = new_lsm;
> +	ctx.format = new_format;
> +	ctx.space_name = space_name(src_space);
> +	ctx.index_name = new_index->def->name;
> +	ctx.is_failed = false;
> +	diag_create(&ctx.diag);
> +	trigger_create(&on_replace, vy_build_on_replace, &ctx, NULL);
> +	trigger_add(&src_space->on_replace, &on_replace);
> +
> +	struct vy_read_iterator itr;
> +	vy_read_iterator_open(&itr, pk, NULL, ITER_ALL, key,
> +			      &env->xm->p_committed_read_view);
> +	int rc;
> +	int loops = 0;
> +	struct tuple *tuple;
> +	while ((rc = vy_read_iterator_next(&itr, &tuple)) == 0) {
> +		if (tuple == NULL)
> +			break;
> +		/*
> +		 * Note, yield is not allowed here. If we yielded,
> +		 * the tuple could be overwritten by a concurrent
> +		 * transaction, in which case we would insert an
> +		 * outdated tuple to the new index.
> +		 */
> +		rc = vy_build_insert_tuple(env, new_lsm, space_name(src_space),
> +				new_index->def->name, new_format, tuple);
> +		if (rc != 0)
> +			break;
> +		/*
> +		 * Read iterator yields only when it reads runs.
> +		 * Yield periodically in order not to stall the
> +		 * tx thread in case there are a lot of tuples in
> +		 * mems or cache.
> +		 */
> +		if (++loops % VY_YIELD_LOOPS == 0)
> +			fiber_sleep(0);

6. Why not fiber_yield() ?

> +		if (ctx.is_failed) {
> +			diag_move(&ctx.diag, diag_get());
> +			rc = -1;
> +			break;
> +		}
> +	}
> +	vy_read_iterator_close(&itr);
> +	tuple_unref(key);
> +
> +	/*
> +	 * Dump the new index upon build completion so that we don't
> +	 * have to rebuild it on recovery.
> +	 */
> +	if (rc == 0)
> +		rc = vy_scheduler_dump(&env->scheduler);
> +
> +	if (rc == 0 && ctx.is_failed) {

7. How is it possible? If ctx.is_failed, then rc is set to -1 above.

> +		diag_move(&ctx.diag, diag_get());
> +		rc = -1;
> +	}
> +
> +	diag_destroy(&ctx.diag);
> +	trigger_clear(&on_replace);
> +	return rc;
>   }
>   
>   static size_t
> diff --git a/src/box/vy_lsm.c b/src/box/vy_lsm.c
> index 4dfc0a25..63e18594 100644
> --- a/src/box/vy_lsm.c
> +++ b/src/box/vy_lsm.c
> @@ -743,11 +751,20 @@ int
>   vy_lsm_set(struct vy_lsm *lsm, struct vy_mem *mem,
>   	   const struct tuple *stmt, const struct tuple **region_stmt)
>   {
> +	uint32_t format_id = stmt->format_id;
> +
>   	assert(vy_stmt_is_refable(stmt));
>   	assert(*region_stmt == NULL || !vy_stmt_is_refable(*region_stmt));
>   
> -	/* Allocate region_stmt on demand. */
> -	if (*region_stmt == NULL) {
> +	/*
> +	 * Allocate region_stmt on demand.
> +	 *
> +	 * Also, reallocate region_stmt if it uses a different tuple
> +	 * format. This may happen during ALTER, when the LSM tree
> +	 * that is currently being built uses the new space format
> +	 * while other LSM trees still use the old space format.
> +	 */
> +	if (*region_stmt == NULL || (*region_stmt)->format_id != format_id) {

8. Do you really need reset region_stmt in such a case? See this case:
vy_lsm_set(pk), vy_lsm_set(old_sk1), vy_lsm_set(new_sk), vy_lsm_set(old_sk2)

Here you create 3 region statements: one for pk, old_sk1; new for new_sk with a
new format, and again new for old_sk2, because *region_stmt is now has new format,
but old_sk2 still has old one.

You can dup stmt on region when format mismatch is detected, but do not reset
*region_stmt. It allows to create only 2 region statements for the case above.
Am I right?

>   		*region_stmt = vy_stmt_dup_lsregion(stmt, &mem->env->allocator,
>   						    mem->generation);
>   		if (*region_stmt == NULL)
> diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
> index 4db86152..7dbd6a96 100644
> --- a/src/box/vy_scheduler.h
> +++ b/src/box/vy_scheduler.h
> @@ -202,6 +202,13 @@ void
>   vy_scheduler_trigger_dump(struct vy_scheduler *scheduler);
>   
>   /**
> + * Trigger dump of all currently existing in-memory trees
> + * and wait until it is complete. Returns 0 on success.
> + */
> +int
> +vy_scheduler_dump(struct vy_scheduler *scheduler);

9. Do you really need this at the end of sk building? Why you can not leave
mems as they are? They will be dumped anyway sometimes.

  parent reply	other threads:[~2018-05-21 15:18 UTC|newest]

Thread overview: 4+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-04-21 13:46 Vladimir Davydov
2018-05-20 22:36 ` Konstantin Osipov
2018-05-21 15:18 ` Vladislav Shpilevoy [this message]
2018-05-24  7:46   ` [tarantool-patches] " Vladimir Davydov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=c09c9df9-09ac-5fa9-bea7-997660e5aee9@tarantool.org \
    --to=v.shpilevoy@tarantool.org \
    --cc=kostja@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --cc=vdavydov.dev@gmail.com \
    --subject='Re: [tarantool-patches] [PATCH] vinyl: allow to build secondary index for non-empty space' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox