[PATCH v2 3/8] vinyl: log new index before WAL write on DDL

Vladimir Davydov vdavydov.dev at gmail.com
Sun May 27 22:05:51 MSK 2018


Currently, we write new indexes to vylog only after successful WAL
write (see vinyl_index_commit_create). This is incompatible with space
ALTER: during ALTER vinyl may need to create new run files, which we
need to track in order not to leave garbage if ALTER fails or tarantool
exits before ALTER is complete.

So this patch splits index creation in two stages, prepare and commit.
The 'commit' stage is represented by existing VY_LOG_CREATE_LSM record,
which is written from index_vtab::commit_create callback, just like
before. For the 'prepare' stage we introduce a new record type,
VY_LOG_PREPARE_LSM, written from index_vtab::add_primary_key and
index_vtab::build_index callbacks, i.e. before WAL write. For now, we
don't write anything to prepared but uncommitted indexes (this will be
done later); however, we do add prepared indexes to the scheduler so
that they can be dumped and compacted. If ALTER fails, we drop prepared
indexes in index_vtab::abort_create callback. Prepared but uncommitted
indexes are ignored by backup and replication and cleaned up from vylog
on restart.

Note, we have to rework vinyl/errinj_vylog test in this patch, because
index creation (and hence space truncation) commands now fail on vylog
error, i.e. a situation when the same index is dropped and recreated
multiple times in xlog without having corresponding records in vylog is
now impossible.

Also, space truncation is not linearizable for vinyl anymore as it may
yield before WAL write, while trying to prepare an index in vylog. This
is OK - we never promised it is. Remove the corresponding test case.

Needed for #1653
---
 src/box/vinyl.c                  |  75 +++++++++++------
 src/box/vy_log.c                 | 171 +++++++++++++++++++++++++++++++++++----
 src/box/vy_log.h                 |  51 ++++++++++--
 src/box/vy_lsm.c                 |  80 ++++++++++++------
 src/box/vy_lsm.h                 |   4 +-
 test/engine/truncate.result      |  87 --------------------
 test/engine/truncate.test.lua    |  48 -----------
 test/vinyl/errinj_vylog.result   | 169 ++++++++++++++++++--------------------
 test/vinyl/errinj_vylog.test.lua | 112 ++++++++++++-------------
 9 files changed, 438 insertions(+), 359 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index f0d26874..4d670328 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -803,6 +803,8 @@ vinyl_index_open(struct index *index)
 	default:
 		unreachable();
 	}
+	if (rc == 0)
+		vy_scheduler_add_lsm(&env->scheduler, lsm);
 	return rc;
 }
 
@@ -824,10 +826,8 @@ vinyl_index_commit_create(struct index *index, int64_t lsn)
 		 * the index isn't in the recovery context and we
 		 * need to retry to log it now.
 		 */
-		if (lsm->commit_lsn >= 0) {
-			vy_scheduler_add_lsm(&env->scheduler, lsm);
+		if (lsm->commit_lsn >= 0)
 			return;
-		}
 	}
 
 	if (env->status == VINYL_INITIAL_RECOVERY_REMOTE) {
@@ -852,9 +852,6 @@ vinyl_index_commit_create(struct index *index, int64_t lsn)
 	assert(lsm->commit_lsn < 0);
 	lsm->commit_lsn = lsn;
 
-	assert(lsm->range_count == 1);
-	struct vy_range *range = vy_range_tree_first(lsm->tree);
-
 	/*
 	 * Since it's too late to fail now, in case of vylog write
 	 * failure we leave the records we attempted to write in
@@ -864,15 +861,36 @@ vinyl_index_commit_create(struct index *index, int64_t lsn)
 	 * recovery.
 	 */
 	vy_log_tx_begin();
-	vy_log_create_lsm(lsm->id, lsm->space_id, lsm->index_id,
-			  lsm->key_def, lsn);
-	vy_log_insert_range(lsm->id, range->id, NULL, NULL);
+	vy_log_create_lsm(lsm->id, lsn);
+	vy_log_tx_try_commit();
+}
+
+static void
+vinyl_index_abort_create(struct index *index)
+{
+	struct vy_env *env = vy_env(index->engine);
+	struct vy_lsm *lsm = vy_lsm(index);
+
+	if (env->status != VINYL_ONLINE) {
+		/* Failure during recovery. Nothing to do. */
+		return;
+	}
+	if (lsm->id < 0) {
+		/*
+		 * ALTER failed before we wrote information about
+		 * the new LSM tree to vylog, see vy_lsm_create().
+		 * Nothing to do.
+		 */
+		return;
+	}
+
+	vy_scheduler_remove_lsm(&env->scheduler, lsm);
+
+	lsm->is_dropped = true;
+
+	vy_log_tx_begin();
+	vy_log_drop_lsm(lsm->id, 0);
 	vy_log_tx_try_commit();
-	/*
-	 * After we committed the index in the log, we can schedule
-	 * a task for it.
-	 */
-	vy_scheduler_add_lsm(&env->scheduler, lsm);
 }
 
 static void
@@ -3111,8 +3129,10 @@ vy_send_lsm(struct vy_join_ctx *ctx, struct vy_lsm_recovery_info *lsm_info)
 {
 	int rc = -1;
 
-	if (lsm_info->drop_lsn >= 0)
+	if (lsm_info->drop_lsn >= 0 || lsm_info->create_lsn < 0) {
+		/* Dropped or not yet built LSM tree. */
 		return 0;
+	}
 
 	/*
 	 * We are only interested in the primary index LSM tree.
@@ -3326,16 +3346,21 @@ vy_gc_run(struct vy_env *env,
 }
 
 /**
- * Given a dropped LSM tree, delete all its ranges and slices and
- * mark all its runs as dropped. Forget the LSM tree if it has no
- * associated objects.
+ * Given a dropped or not fully built LSM tree, delete all its
+ * ranges and slices and mark all its runs as dropped. Forget
+ * the LSM tree if it has no associated objects.
  */
 static void
 vy_gc_lsm(struct vy_lsm_recovery_info *lsm_info)
 {
-	assert(lsm_info->drop_lsn >= 0);
+	assert(lsm_info->drop_lsn >= 0 ||
+	       lsm_info->create_lsn < 0);
 
 	vy_log_tx_begin();
+	if (lsm_info->drop_lsn < 0) {
+		lsm_info->drop_lsn = 0;
+		vy_log_drop_lsm(lsm_info->id, 0);
+	}
 	struct vy_range_recovery_info *range_info;
 	rlist_foreach_entry(range_info, &lsm_info->ranges, in_lsm) {
 		struct vy_slice_recovery_info *slice_info;
@@ -3371,8 +3396,10 @@ vy_gc(struct vy_env *env, struct vy_recovery *recovery,
 	int loops = 0;
 	struct vy_lsm_recovery_info *lsm_info;
 	rlist_foreach_entry(lsm_info, &recovery->lsms, in_recovery) {
-		if ((gc_mask & VY_GC_DROPPED) != 0 &&
-		    lsm_info->drop_lsn >= 0)
+		if ((lsm_info->drop_lsn >= 0 &&
+		     (gc_mask & VY_GC_DROPPED) != 0) ||
+		    (lsm_info->create_lsn < 0 &&
+		     (gc_mask & VY_GC_INCOMPLETE) != 0))
 			vy_gc_lsm(lsm_info);
 
 		struct vy_run_recovery_info *run_info;
@@ -3438,8 +3465,10 @@ vinyl_engine_backup(struct engine *engine, struct vclock *vclock,
 	int loops = 0;
 	struct vy_lsm_recovery_info *lsm_info;
 	rlist_foreach_entry(lsm_info, &recovery->lsms, in_recovery) {
-		if (lsm_info->drop_lsn >= 0)
+		if (lsm_info->drop_lsn >= 0 || lsm_info->create_lsn < 0) {
+			/* Dropped or not yet built LSM tree. */
 			continue;
+		}
 		struct vy_run_recovery_info *run_info;
 		rlist_foreach_entry(run_info, &lsm_info->runs, in_lsm) {
 			if (run_info->is_dropped || run_info->is_incomplete)
@@ -4059,7 +4088,7 @@ static const struct space_vtab vinyl_space_vtab = {
 static const struct index_vtab vinyl_index_vtab = {
 	/* .destroy = */ vinyl_index_destroy,
 	/* .commit_create = */ vinyl_index_commit_create,
-	/* .abort_create = */ generic_index_abort_create,
+	/* .abort_create = */ vinyl_index_abort_create,
 	/* .commit_modify = */ vinyl_index_commit_modify,
 	/* .commit_drop = */ vinyl_index_commit_drop,
 	/* .update_def = */ generic_index_update_def,
diff --git a/src/box/vy_log.c b/src/box/vy_log.c
index 069cd528..e98b95e5 100644
--- a/src/box/vy_log.c
+++ b/src/box/vy_log.c
@@ -121,6 +121,7 @@ static const char *vy_log_type_name[] = {
 	[VY_LOG_TRUNCATE_LSM]		= "truncate_lsm",
 	[VY_LOG_MODIFY_LSM]		= "modify_lsm",
 	[VY_LOG_FORGET_LSM]		= "forget_lsm",
+	[VY_LOG_PREPARE_LSM]		= "prepare_lsm",
 };
 
 /** Metadata log object. */
@@ -1308,6 +1309,7 @@ vy_recovery_do_create_lsm(struct vy_recovery *recovery, int64_t id,
 	lsm->modify_lsn = -1;
 	lsm->drop_lsn = -1;
 	lsm->dump_lsn = -1;
+	lsm->prepared = NULL;
 	rlist_create(&lsm->ranges);
 	rlist_create(&lsm->runs);
 	/*
@@ -1322,17 +1324,27 @@ vy_recovery_do_create_lsm(struct vy_recovery *recovery, int64_t id,
 }
 
 /**
- * Handle a VY_LOG_CREATE_LSM log record.
- * This function allocates a new vinyl LSM tree with ID @id
- * and inserts it to the hash.
- * Return 0 on success, -1 on failure (ID collision or OOM).
+ * Handle a VY_LOG_PREPARE_LSM log record.
+ *
+ * This function allocates a new LSM tree with the given ID and
+ * either associates it with the existing LSM tree hashed under
+ * the same space_id/index_id or inserts it into the hash if
+ * there's none.
+ *
+ * Note, we link incomplete LSM trees to index_id_hash (either
+ * directly or indirectly via vy_lsm_recovery_info::prepared),
+ * because an LSM tree may have been fully built and logged in
+ * WAL, but not committed to vylog. We need to be able to identify
+ * such LSM trees during local recovery so that instead of
+ * rebuilding them we can simply retry vylog write.
+ *
+ * Returns 0 on success, -1 on error.
  */
 static int
-vy_recovery_create_lsm(struct vy_recovery *recovery, int64_t id,
-		       uint32_t space_id, uint32_t index_id,
-		       const struct key_part_def *key_parts,
-		       uint32_t key_part_count, int64_t create_lsn,
-		       int64_t modify_lsn, int64_t dump_lsn)
+vy_recovery_prepare_lsm(struct vy_recovery *recovery, int64_t id,
+			uint32_t space_id, uint32_t index_id,
+			const struct key_part_def *key_parts,
+			uint32_t key_part_count)
 {
 	if (vy_recovery_lookup_lsm(recovery, id) != NULL) {
 		diag_set(ClientError, ER_INVALID_VYLOG_FILE,
@@ -1340,23 +1352,113 @@ vy_recovery_create_lsm(struct vy_recovery *recovery, int64_t id,
 				    (long long)id));
 		return -1;
 	}
+	struct vy_lsm_recovery_info *new_lsm;
+	new_lsm = vy_recovery_do_create_lsm(recovery, id, space_id, index_id,
+					    key_parts, key_part_count);
+	if (new_lsm == NULL)
+		return -1;
+
 	struct vy_lsm_recovery_info *lsm;
 	lsm = vy_recovery_lsm_by_index_id(recovery, space_id, index_id);
-	if (lsm != NULL && lsm->drop_lsn < 0) {
+	if (lsm == NULL) {
+		/*
+		 * There's no LSM tree for these space_id/index_id
+		 * in the recovery context. Insert the new LSM tree
+		 * into the index_id_hash.
+		 */
+		return vy_recovery_hash_index_id(recovery, new_lsm);
+	}
+
+	/*
+	 * If there's an LSM tree for the given space_id/index_id,
+	 * it can't be incomplete (i.e. it must be committed albeit
+	 * it may be dropped), neither can it have a prepared LSM
+	 * tree associated with it.
+	 */
+	if (lsm->create_lsn < 0 || lsm->prepared != NULL) {
 		diag_set(ClientError, ER_INVALID_VYLOG_FILE,
-			 tt_sprintf("LSM tree %u/%u created twice",
+			 tt_sprintf("LSM tree %u/%u prepared twice",
 				    (unsigned)space_id, (unsigned)index_id));
 		return -1;
 	}
-	lsm = vy_recovery_do_create_lsm(recovery, id, space_id, index_id,
-					key_parts, key_part_count);
-	if (lsm == NULL)
-		return -1;
 
+	/* Link the new LSM tree to the existing one. */
+	lsm->prepared = new_lsm;
+	return 0;
+}
+
+/**
+ * Handle a VY_LOG_CREATE_LSM log record.
+ *
+ * Depending on whether the LSM tree was previously prepared,
+ * this function either commits it or allocates a new one and
+ * inserts it into the recovery hash.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int
+vy_recovery_create_lsm(struct vy_recovery *recovery, int64_t id,
+		       uint32_t space_id, uint32_t index_id,
+		       const struct key_part_def *key_parts,
+		       uint32_t key_part_count, int64_t create_lsn,
+		       int64_t modify_lsn, int64_t dump_lsn)
+{
+	struct vy_lsm_recovery_info *lsm;
+	lsm = vy_recovery_lookup_lsm(recovery, id);
+	if (lsm != NULL) {
+		/*
+		 * If the LSM tree already exists, it must be in
+		 * the prepared state (i.e. not committed or dropped).
+		 */
+		if (lsm->create_lsn >= 0 || lsm->drop_lsn >= 0) {
+			diag_set(ClientError, ER_INVALID_VYLOG_FILE,
+				 tt_sprintf("Duplicate LSM tree id %lld",
+					    (long long)id));
+			return -1;
+		}
+	} else {
+		lsm = vy_recovery_do_create_lsm(recovery, id, space_id, index_id,
+						key_parts, key_part_count);
+		if (lsm == NULL)
+			return -1;
+		lsm->dump_lsn = dump_lsn;
+	}
+
+	/* Mark the LSM tree committed by assigning LSN. */
 	lsm->create_lsn = create_lsn;
 	lsm->modify_lsn = modify_lsn;
-	lsm->dump_lsn = dump_lsn;
 
+	/*
+	 * Hash the new LSM tree under the given space_id/index_id.
+	 * First, look up the LSM tree that is presently in the hash.
+	 */
+	struct vy_lsm_recovery_info *old_lsm;
+	old_lsm = vy_recovery_lsm_by_index_id(recovery, space_id, index_id);
+	if (old_lsm == lsm) {
+		/*
+		 * The new LSM tree is already hashed, nothing to do
+		 * (it was hashed on prepare).
+		 */
+		return 0;
+	}
+
+	/* Unlink the new LSM tree from the old one, if any. */
+	if (old_lsm != NULL) {
+		assert(old_lsm->create_lsn >= 0);
+		if (old_lsm->drop_lsn < 0) {
+			diag_set(ClientError, ER_INVALID_VYLOG_FILE,
+				 tt_sprintf("LSM tree %u/%u created twice",
+					    (unsigned)space_id,
+					    (unsigned)index_id));
+			return -1;
+		}
+		if (old_lsm->prepared != NULL) {
+			assert(old_lsm->prepared == lsm);
+			old_lsm->prepared = NULL;
+		}
+	}
+
+	/* Update the hash with the new LSM tree. */
 	return vy_recovery_hash_index_id(recovery, lsm);
 }
 
@@ -1421,6 +1523,35 @@ vy_recovery_drop_lsm(struct vy_recovery *recovery, int64_t id, int64_t drop_lsn)
 	}
 	assert(drop_lsn >= 0);
 	lsm->drop_lsn = drop_lsn;
+
+	if (lsm->create_lsn >= 0)
+		return 0;
+	/*
+	 * If the dropped LSM tree has never been committed,
+	 * it means that ALTER for the corresponding index was
+	 * aborted, hence we don't need to keep it in the
+	 * index_id_hash, because the LSM tree is pure garbage
+	 * and will never be recovered. Unlink it now.
+	 */
+	struct vy_lsm_recovery_info *hashed_lsm;
+	hashed_lsm = vy_recovery_lsm_by_index_id(recovery,
+				lsm->space_id, lsm->index_id);
+	if (hashed_lsm == lsm) {
+		/*
+		 * The LSM tree is linked to the index_id_hash
+		 * directly. Remove the corresponding hash entry.
+		 */
+		vy_recovery_unhash_index_id(recovery,
+				lsm->space_id, lsm->index_id);
+	} else {
+		/*
+		 * The LSM tree was linked to an existing LSM
+		 * tree via vy_lsm_recovery_info::prepared.
+		 * Clear the reference.
+		 */
+		assert(hashed_lsm->prepared == lsm);
+		hashed_lsm->prepared = NULL;
+	}
 	return 0;
 }
 
@@ -1873,6 +2004,11 @@ vy_recovery_process_record(struct vy_recovery *recovery,
 {
 	int rc;
 	switch (record->type) {
+	case VY_LOG_PREPARE_LSM:
+		rc = vy_recovery_prepare_lsm(recovery, record->lsm_id,
+				record->space_id, record->index_id,
+				record->key_parts, record->key_part_count);
+		break;
 	case VY_LOG_CREATE_LSM:
 		rc = vy_recovery_create_lsm(recovery, record->lsm_id,
 				record->space_id, record->index_id,
@@ -2130,7 +2266,8 @@ vy_log_append_lsm(struct xlog *xlog, struct vy_lsm_recovery_info *lsm)
 	struct vy_log_record record;
 
 	vy_log_record_init(&record);
-	record.type = VY_LOG_CREATE_LSM;
+	record.type = lsm->create_lsn < 0 ?
+		VY_LOG_PREPARE_LSM : VY_LOG_CREATE_LSM;
 	record.lsm_id = lsm->id;
 	record.index_id = lsm->index_id;
 	record.space_id = lsm->space_id;
diff --git a/src/box/vy_log.h b/src/box/vy_log.h
index d77cb298..7672b8f9 100644
--- a/src/box/vy_log.h
+++ b/src/box/vy_log.h
@@ -65,8 +65,9 @@ struct mh_i64ptr_t;
 enum vy_log_record_type {
 	/**
 	 * Create a new LSM tree.
-	 * Requires vy_log_record::lsm_id, index_id, space_id,
-	 * key_def, create_lsn.
+	 * Requires vy_log_record::lsm_id, create_lsn.
+	 * After rotation, it also stores space_id, index_id, key_def,
+	 * create_lsn, modify_lsn, dump_lsn.
 	 */
 	VY_LOG_CREATE_LSM		= 0,
 	/**
@@ -179,6 +180,21 @@ enum vy_log_record_type {
 	 * from vylog on the next rotation.
 	 */
 	VY_LOG_FORGET_LSM		= 14,
+	/**
+	 * Prepare a new LSM tree for building.
+	 * Requires vy_log_record::lsm_id, index_id, space_id, key_def.
+	 *
+	 * Index ALTER operation consists of two stages. First, we
+	 * build a new LSM tree, checking constraints if necessary.
+	 * This is done before writing the operation to WAL. Then,
+	 * provided the first stage succeeded, we commit the LSM
+	 * tree to the metadata log.
+	 *
+	 * The following record is used to prepare a new LSM tree
+	 * for building. Once the index has been built, we write
+	 * a VY_LOG_CREATE_LSM record to commit it.
+	 */
+	VY_LOG_PREPARE_LSM		= 15,
 
 	vy_log_record_type_MAX
 };
@@ -273,7 +289,12 @@ struct vy_lsm_recovery_info {
 	struct key_part_def *key_parts;
 	/** Number of key parts. */
 	uint32_t key_part_count;
-	/** LSN of the WAL row that created the LSM tree. */
+	/**
+	 * LSN of the WAL row that created the LSM tree,
+	 * or -1 if the LSM tree was not committed to WAL
+	 * (that is there was a VY_LOG_PREPARE_LSM record
+	 * but no VY_LOG_CREATE_LSM).
+	 */
 	int64_t create_lsn;
 	/** LSN of the WAL row that last modified the LSM tree. */
 	int64_t modify_lsn;
@@ -295,6 +316,11 @@ struct vy_lsm_recovery_info {
 	 * vy_run_recovery_info::in_lsm.
 	 */
 	struct rlist runs;
+	/**
+	 * Pointer to an LSM tree that is going to replace
+	 * this one after successful ALTER.
+	 */
+	struct vy_lsm_recovery_info *prepared;
 };
 
 /** Vinyl range info stored in a recovery context. */
@@ -527,18 +553,29 @@ vy_log_record_init(struct vy_log_record *record)
 	memset(record, 0, sizeof(*record));
 }
 
-/** Helper to log a vinyl LSM tree creation. */
+/** Helper to log a vinyl LSM tree preparation. */
 static inline void
-vy_log_create_lsm(int64_t id, uint32_t space_id, uint32_t index_id,
-		  const struct key_def *key_def, int64_t create_lsn)
+vy_log_prepare_lsm(int64_t id, uint32_t space_id, uint32_t index_id,
+		   const struct key_def *key_def)
 {
 	struct vy_log_record record;
 	vy_log_record_init(&record);
-	record.type = VY_LOG_CREATE_LSM;
+	record.type = VY_LOG_PREPARE_LSM;
 	record.lsm_id = id;
 	record.space_id = space_id;
 	record.index_id = index_id;
 	record.key_def = key_def;
+	vy_log_write(&record);
+}
+
+/** Helper to log a vinyl LSM tree creation. */
+static inline void
+vy_log_create_lsm(int64_t id, int64_t create_lsn)
+{
+	struct vy_log_record record;
+	vy_log_record_init(&record);
+	record.type = VY_LOG_CREATE_LSM;
+	record.lsm_id = id;
 	record.create_lsn = create_lsn;
 	vy_log_write(&record);
 }
diff --git a/src/box/vy_lsm.c b/src/box/vy_lsm.c
index 289d5c40..fd352c8a 100644
--- a/src/box/vy_lsm.c
+++ b/src/box/vy_lsm.c
@@ -279,21 +279,6 @@ vy_lsm_delete(struct vy_lsm *lsm)
 	free(lsm);
 }
 
-/** Initialize the range tree of a new LSM tree. */
-static int
-vy_lsm_init_range_tree(struct vy_lsm *lsm)
-{
-	struct vy_range *range = vy_range_new(vy_log_next_id(), NULL, NULL,
-					      lsm->cmp_def);
-	if (range == NULL)
-		return -1;
-
-	assert(lsm->range_count == 0);
-	vy_lsm_add_range(lsm, range);
-	vy_lsm_acct_range(lsm, range);
-	return 0;
-}
-
 int
 vy_lsm_create(struct vy_lsm *lsm)
 {
@@ -327,12 +312,34 @@ vy_lsm_create(struct vy_lsm *lsm)
 		return -1;
 	}
 
-	/* Assign unique id. */
-	assert(lsm->id < 0);
-	lsm->id = vy_log_next_id();
+	/*
+	 * Allocate a unique id for the new LSM tree, but don't assign
+	 * it until information about the new LSM tree is successfully
+	 * written to vylog as vinyl_index_abort_create() uses id to
+	 * decide whether it needs to clean up.
+	 */
+	int64_t id = vy_log_next_id();
+
+	/* Create the initial range. */
+	struct vy_range *range = vy_range_new(vy_log_next_id(), NULL, NULL,
+					      lsm->cmp_def);
+	if (range == NULL)
+		return -1;
+	assert(lsm->range_count == 0);
+	vy_lsm_add_range(lsm, range);
+	vy_lsm_acct_range(lsm, range);
+
+	/* Write the new LSM tree record to vylog. */
+	vy_log_tx_begin();
+	vy_log_prepare_lsm(id, lsm->space_id, lsm->index_id, lsm->key_def);
+	vy_log_insert_range(id, range->id, NULL, NULL);
+	if (vy_log_tx_commit() < 0)
+		return -1;
 
-	/* Allocate initial range. */
-	return vy_lsm_init_range_tree(lsm);
+	/* Assign the id. */
+	assert(lsm->id < 0);
+	lsm->id = id;
+	return 0;
 }
 
 static struct vy_run *
@@ -505,7 +512,7 @@ vy_lsm_recover(struct vy_lsm *lsm, struct vy_recovery *recovery,
 	lsm_info = vy_recovery_lsm_by_index_id(recovery,
 			lsm->space_id, lsm->index_id);
 	if (is_checkpoint_recovery) {
-		if (lsm_info == NULL) {
+		if (lsm_info == NULL || lsm_info->create_lsn < 0) {
 			/*
 			 * All LSM trees created from snapshot rows must
 			 * be present in vylog, because snapshot can
@@ -528,16 +535,33 @@ vy_lsm_recover(struct vy_lsm *lsm, struct vy_recovery *recovery,
 		}
 	}
 
-	if (lsm_info == NULL || lsn > lsm_info->create_lsn) {
+	if (lsm_info == NULL || (lsm_info->prepared == NULL &&
+				 lsm_info->create_lsn >= 0 &&
+				 lsn > lsm_info->create_lsn)) {
 		/*
 		 * If we failed to log LSM tree creation before restart,
 		 * we won't find it in the log on recovery. This is OK as
 		 * the LSM tree doesn't have any runs in this case. We will
 		 * retry to log LSM tree in vinyl_index_commit_create().
 		 * For now, just create the initial range and assign id.
+		 *
+		 * Note, this is needed only for backward compatibility
+		 * since now we write VY_LOG_PREPARE_LSM before WAL write
+		 * and hence if the index was committed to WAL, it must be
+		 * present in vylog as well.
 		 */
-		lsm->id = vy_log_next_id();
-		return vy_lsm_init_range_tree(lsm);
+		return vy_lsm_create(lsm);
+	}
+
+	if (lsm_info->create_lsn >= 0 && lsn > lsm_info->create_lsn) {
+		/*
+		 * The index we are recovering was prepared, successfully
+		 * built, and committed to WAL, but it was not marked as
+		 * created in vylog. Recover the prepared LSM tree. We will
+		 * retry vylog write in vinyl_index_commit_create().
+		 */
+		lsm_info = lsm_info->prepared;
+		assert(lsm_info != NULL);
 	}
 
 	lsm->id = lsm_info->id;
@@ -554,7 +578,13 @@ vy_lsm_recover(struct vy_lsm *lsm, struct vy_recovery *recovery,
 		 * We need range tree initialized for all LSM trees,
 		 * even for dropped ones.
 		 */
-		return vy_lsm_init_range_tree(lsm);
+		struct vy_range *range = vy_range_new(vy_log_next_id(),
+						      NULL, NULL, lsm->cmp_def);
+		if (range == NULL)
+			return -1;
+		vy_lsm_add_range(lsm, range);
+		vy_lsm_acct_range(lsm, range);
+		return 0;
 	}
 
 	/*
diff --git a/src/box/vy_lsm.h b/src/box/vy_lsm.h
index 3f820fac..2e99e4d0 100644
--- a/src/box/vy_lsm.h
+++ b/src/box/vy_lsm.h
@@ -355,8 +355,8 @@ vy_lsm_update_pk(struct vy_lsm *lsm, struct vy_lsm *pk)
  *
  * This function is called when an LSM tree is created
  * after recovery is complete or during remote recovery.
- * It initializes the range tree and makes the LSM tree
- * directory.
+ * It initializes the range tree, makes the LSM tree
+ * directory, and writes the LSM tree record to vylog.
  */
 int
 vy_lsm_create(struct vy_lsm *lsm);
diff --git a/test/engine/truncate.result b/test/engine/truncate.result
index 3ad400e2..b4de787f 100644
--- a/test/engine/truncate.result
+++ b/test/engine/truncate.result
@@ -195,93 +195,6 @@ s:drop()
 ---
 ...
 --
--- Check that space truncation is linearizable.
---
--- Create a space with several indexes and start three fibers:
--- 1st and 3rd update the space, 2nd truncates it. Then wait
--- until all fibers are done. The space should contain data
--- inserted by the 3rd fiber.
---
--- Note, this is guaranteed to be true only if space updates
--- don't yield, which is always true for memtx and is true
--- for vinyl in case there's no data on disk, as in this case.
---
-s = box.schema.create_space('test', {engine = engine})
----
-...
-_ = s:create_index('i1', {parts = {1, 'unsigned'}})
----
-...
-_ = s:create_index('i2', {parts = {2, 'unsigned'}})
----
-...
-_ = s:create_index('i3', {parts = {3, 'string'}})
----
-...
-_ = s:insert{1, 1, 'a'}
----
-...
-_ = s:insert{2, 2, 'b'}
----
-...
-_ = s:insert{3, 3, 'c'}
----
-...
-c = fiber.channel(3)
----
-...
-test_run:cmd("setopt delimiter ';'")
----
-- true
-...
-fiber.create(function()
-    box.begin()
-    s:replace{1, 10, 'aa'}
-    s:replace{2, 20, 'bb'}
-    s:replace{3, 30, 'cc'}
-    box.commit()
-    c:put(true)
-end)
-fiber.create(function()
-    s:truncate()
-    c:put(true)
-end)
-fiber.create(function()
-    box.begin()
-    s:replace{1, 100, 'aaa'}
-    s:replace{2, 200, 'bbb'}
-    s:replace{3, 300, 'ccc'}
-    box.commit()
-    c:put(true)
-end)
-test_run:cmd("setopt delimiter ''");
----
-...
-for i = 1, 3 do c:get() end
----
-...
-s.index.i1:select()
----
-- - [1, 100, 'aaa']
-  - [2, 200, 'bbb']
-  - [3, 300, 'ccc']
-...
-s.index.i2:select()
----
-- - [1, 100, 'aaa']
-  - [2, 200, 'bbb']
-  - [3, 300, 'ccc']
-...
-s.index.i3:select()
----
-- - [1, 100, 'aaa']
-  - [2, 200, 'bbb']
-  - [3, 300, 'ccc']
-...
-s:drop()
----
-...
---
 -- Calling space.truncate concurrently.
 --
 s = box.schema.create_space('test', {engine = engine})
diff --git a/test/engine/truncate.test.lua b/test/engine/truncate.test.lua
index df2797a1..74fdd52b 100644
--- a/test/engine/truncate.test.lua
+++ b/test/engine/truncate.test.lua
@@ -82,54 +82,6 @@ s.index.i3:select()
 s:drop()
 
 --
--- Check that space truncation is linearizable.
---
--- Create a space with several indexes and start three fibers:
--- 1st and 3rd update the space, 2nd truncates it. Then wait
--- until all fibers are done. The space should contain data
--- inserted by the 3rd fiber.
---
--- Note, this is guaranteed to be true only if space updates
--- don't yield, which is always true for memtx and is true
--- for vinyl in case there's no data on disk, as in this case.
---
-s = box.schema.create_space('test', {engine = engine})
-_ = s:create_index('i1', {parts = {1, 'unsigned'}})
-_ = s:create_index('i2', {parts = {2, 'unsigned'}})
-_ = s:create_index('i3', {parts = {3, 'string'}})
-_ = s:insert{1, 1, 'a'}
-_ = s:insert{2, 2, 'b'}
-_ = s:insert{3, 3, 'c'}
-c = fiber.channel(3)
-test_run:cmd("setopt delimiter ';'")
-fiber.create(function()
-    box.begin()
-    s:replace{1, 10, 'aa'}
-    s:replace{2, 20, 'bb'}
-    s:replace{3, 30, 'cc'}
-    box.commit()
-    c:put(true)
-end)
-fiber.create(function()
-    s:truncate()
-    c:put(true)
-end)
-fiber.create(function()
-    box.begin()
-    s:replace{1, 100, 'aaa'}
-    s:replace{2, 200, 'bbb'}
-    s:replace{3, 300, 'ccc'}
-    box.commit()
-    c:put(true)
-end)
-test_run:cmd("setopt delimiter ''");
-for i = 1, 3 do c:get() end
-s.index.i1:select()
-s.index.i2:select()
-s.index.i3:select()
-s:drop()
-
---
 -- Calling space.truncate concurrently.
 --
 s = box.schema.create_space('test', {engine = engine})
diff --git a/test/vinyl/errinj_vylog.result b/test/vinyl/errinj_vylog.result
index f78201c9..ca23cb45 100644
--- a/test/vinyl/errinj_vylog.result
+++ b/test/vinyl/errinj_vylog.result
@@ -67,7 +67,7 @@ s:drop()
 ---
 ...
 --
--- Check that an index drop/truncate/create record we failed to
+-- Check that an index drop/create record we failed to
 -- write to vylog is flushed along with the next record.
 --
 fiber = require 'fiber'
@@ -76,62 +76,57 @@ fiber = require 'fiber'
 s1 = box.schema.space.create('test1', {engine = 'vinyl'})
 ---
 ...
-_ = s1:create_index('pk')
----
-...
-_ = s1:insert{1, 'a'}
----
-...
 s2 = box.schema.space.create('test2', {engine = 'vinyl'})
 ---
 ...
 _ = s2:create_index('pk')
 ---
 ...
-_ = s2:insert{2, 'b'}
+_ = s2:insert{1, 'a'}
 ---
 ...
 box.snapshot()
 ---
 - ok
 ...
-_ = s1:insert{3, 'c'}
----
-...
-_ = s2:insert{4, 'd'}
+_ = s2:insert{2, 'b'}
 ---
 ...
-SCHED_TIMEOUT = 0.01
+box.error.injection.set('ERRINJ_WAL_DELAY', true)
 ---
+- ok
 ...
-box.error.injection.set('ERRINJ_VY_SCHED_TIMEOUT', SCHED_TIMEOUT)
+-- VY_LOG_PREPARE_LSM written, but VY_LOG_CREATE_LSM missing
+ch = fiber.channel(1)
 ---
-- ok
 ...
-box.error.injection.set('ERRINJ_VY_LOG_FLUSH', true);
+_ = fiber.create(function() s1:create_index('pk') ch:put(true) end)
 ---
-- ok
 ...
-s1:drop()
+fiber.sleep(0.001)
 ---
 ...
-s2:truncate()
+box.error.injection.set('ERRINJ_VY_LOG_FLUSH', true)
 ---
+- ok
 ...
-_ = s2:insert{5, 'e'}
+box.error.injection.set('ERRINJ_WAL_DELAY', false)
 ---
+- ok
 ...
-s3 = box.schema.space.create('test3', {engine = 'vinyl'})
+ch:get()
 ---
+- true
 ...
-_ = s3:create_index('pk')
+_ = s1:insert{3, 'c'}
 ---
 ...
-_ = s3:insert{6, 'f'}
+-- VY_LOG_DROP_LSM missing
+s2.index.pk:drop()
 ---
 ...
 -- pending records must not be rolled back on error
-box.snapshot()
+s2:create_index('pk') -- error
 ---
 - error: Error injection 'vinyl log flush'
 ...
@@ -139,65 +134,46 @@ box.error.injection.set('ERRINJ_VY_LOG_FLUSH', false);
 ---
 - ok
 ...
-fiber.sleep(2 * SCHED_TIMEOUT) -- wait for scheduler to unthrottle
+_ = s1:insert{4, 'd'}
 ---
 ...
-box.error.injection.set('ERRINJ_VY_SCHED_TIMEOUT', 0)
----
-- ok
-...
-box.snapshot()
----
-- ok
-...
-_ = s2:insert{7, 'g'}
+_ = s2:create_index('pk')
 ---
 ...
-_ = s3:insert{8, 'h'}
+_ = s2:insert{5, 'e'}
 ---
 ...
 test_run:cmd('restart server default')
 s1 = box.space.test1
 ---
 ...
-s1 == nil
+s2 = box.space.test2
 ---
-- true
 ...
-s2 = box.space.test2
+s1:select()
 ---
+- - [3, 'c']
+  - [4, 'd']
 ...
 s2:select()
 ---
 - - [5, 'e']
-  - [7, 'g']
-...
-s2:drop()
----
-...
-s3 = box.space.test3
----
 ...
-s3:select()
+s1:drop()
 ---
-- - [6, 'f']
-  - [8, 'h']
 ...
-s3:drop()
+s2:drop()
 ---
 ...
 --
--- Check that if a buffered index drop/truncate/create record
--- does not make it to the vylog before restart, it will be
--- replayed on recovery.
+-- Check that if a buffered index drop/create record does not
+-- make it to the vylog before restart, it will be replayed on
+-- recovery.
 --
-s1 = box.schema.space.create('test1', {engine = 'vinyl'})
----
-...
-_ = s1:create_index('pk')
+fiber = require 'fiber'
 ---
 ...
-_ = s1:insert{111, 'aaa'}
+s1 = box.schema.space.create('test1', {engine = 'vinyl'})
 ---
 ...
 s2 = box.schema.space.create('test2', {engine = 'vinyl'})
@@ -206,82 +182,97 @@ s2 = box.schema.space.create('test2', {engine = 'vinyl'})
 _ = s2:create_index('pk')
 ---
 ...
-_ = s2:insert{222, 'bbb'}
+_ = s2:insert{111, 'aaa'}
 ---
 ...
 box.snapshot()
 ---
 - ok
 ...
-_ = s1:insert{333, 'ccc'}
----
-...
-_ = s2:insert{444, 'ddd'}
+_ = s2:insert{222, 'bbb'}
 ---
 ...
-box.error.injection.set('ERRINJ_VY_LOG_FLUSH', true);
+box.error.injection.set('ERRINJ_WAL_DELAY', true)
 ---
 - ok
 ...
-s1:drop()
+-- VY_LOG_PREPARE_LSM written, but VY_LOG_CREATE_LSM missing
+ch = fiber.channel(1)
 ---
 ...
-s2:truncate()
+_ = fiber.create(function() s1:create_index('pk') ch:put(true) end)
 ---
 ...
-_ = s2:insert{555, 'eee'}
+fiber.sleep(0.001)
 ---
 ...
-s3 = box.schema.space.create('test3', {engine = 'vinyl'})
+box.error.injection.set('ERRINJ_VY_LOG_FLUSH', true)
 ---
+- ok
 ...
-_ = s3:create_index('pk')
+box.error.injection.set('ERRINJ_WAL_DELAY', false)
 ---
+- ok
 ...
-_ = s3:insert{666, 'fff'}
+ch:get()
 ---
+- true
 ...
--- gh-2532: replaying create/drop from xlog crashes tarantool
-test_run:cmd("setopt delimiter ';'")
+_ = s1:insert{333, 'ccc'}
 ---
-- true
 ...
-for i = 1, 10 do
-    s = box.schema.space.create('test', {engine = 'vinyl'})
-    s:create_index('primary')
-    s:create_index('secondary', {unique = false, parts = {2, 'string'}})
-    s:insert{i, 'test' .. i}
-    s:truncate()
-    s:drop()
-end
-test_run:cmd("setopt delimiter ''");
+-- VY_LOG_DROP_LSM missing
+s2.index.pk:drop()
 ---
 ...
 test_run:cmd('restart server default')
 s1 = box.space.test1
 ---
 ...
-s1 == nil
+s2 = box.space.test2
 ---
-- true
 ...
-s2 = box.space.test2
+_ = s1:insert{444, 'ddd'}
+---
+...
+_ = s2:create_index('pk')
 ---
 ...
+_ = s2:insert{555, 'eee'}
+---
+...
+s1:select()
+---
+- - [333, 'ccc']
+  - [444, 'ddd']
+...
 s2:select()
 ---
 - - [555, 'eee']
 ...
-s2:drop()
+box.snapshot()
 ---
+- ok
 ...
-s3 = box.space.test3
+test_run:cmd('restart server default')
+s1 = box.space.test1
 ---
 ...
-s3:select()
+s2 = box.space.test2
 ---
-- - [666, 'fff']
 ...
-s3:drop()
+s1:select()
+---
+- - [333, 'ccc']
+  - [444, 'ddd']
+...
+s2:select()
+---
+- - [555, 'eee']
+...
+s1:drop()
+---
+...
+s2:drop()
 ---
 ...
diff --git a/test/vinyl/errinj_vylog.test.lua b/test/vinyl/errinj_vylog.test.lua
index 36b3659d..3d90755d 100644
--- a/test/vinyl/errinj_vylog.test.lua
+++ b/test/vinyl/errinj_vylog.test.lua
@@ -32,111 +32,101 @@ s:select()
 s:drop()
 
 --
--- Check that an index drop/truncate/create record we failed to
+-- Check that an index drop/create record we failed to
 -- write to vylog is flushed along with the next record.
 --
 fiber = require 'fiber'
 
 s1 = box.schema.space.create('test1', {engine = 'vinyl'})
-_ = s1:create_index('pk')
-_ = s1:insert{1, 'a'}
-
 s2 = box.schema.space.create('test2', {engine = 'vinyl'})
 _ = s2:create_index('pk')
+_ = s2:insert{1, 'a'}
+box.snapshot()
 _ = s2:insert{2, 'b'}
 
-box.snapshot()
+box.error.injection.set('ERRINJ_WAL_DELAY', true)
 
-_ = s1:insert{3, 'c'}
-_ = s2:insert{4, 'd'}
+-- VY_LOG_PREPARE_LSM written, but VY_LOG_CREATE_LSM missing
+ch = fiber.channel(1)
+_ = fiber.create(function() s1:create_index('pk') ch:put(true) end)
+fiber.sleep(0.001)
 
-SCHED_TIMEOUT = 0.01
-box.error.injection.set('ERRINJ_VY_SCHED_TIMEOUT', SCHED_TIMEOUT)
-box.error.injection.set('ERRINJ_VY_LOG_FLUSH', true);
+box.error.injection.set('ERRINJ_VY_LOG_FLUSH', true)
+box.error.injection.set('ERRINJ_WAL_DELAY', false)
 
-s1:drop()
-s2:truncate()
-_ = s2:insert{5, 'e'}
+ch:get()
+_ = s1:insert{3, 'c'}
 
-s3 = box.schema.space.create('test3', {engine = 'vinyl'})
-_ = s3:create_index('pk')
-_ = s3:insert{6, 'f'}
+-- VY_LOG_DROP_LSM missing
+s2.index.pk:drop()
 
 -- pending records must not be rolled back on error
-box.snapshot()
+s2:create_index('pk') -- error
 
 box.error.injection.set('ERRINJ_VY_LOG_FLUSH', false);
-fiber.sleep(2 * SCHED_TIMEOUT) -- wait for scheduler to unthrottle
-box.error.injection.set('ERRINJ_VY_SCHED_TIMEOUT', 0)
-
-box.snapshot()
 
-_ = s2:insert{7, 'g'}
-_ = s3:insert{8, 'h'}
+_ = s1:insert{4, 'd'}
+_ = s2:create_index('pk')
+_ = s2:insert{5, 'e'}
 
 test_run:cmd('restart server default')
 
 s1 = box.space.test1
-s1 == nil
-
 s2 = box.space.test2
+s1:select()
 s2:select()
+s1:drop()
 s2:drop()
 
-s3 = box.space.test3
-s3:select()
-s3:drop()
-
 --
--- Check that if a buffered index drop/truncate/create record
--- does not make it to the vylog before restart, it will be
--- replayed on recovery.
+-- Check that if a buffered index drop/create record does not
+-- make it to the vylog before restart, it will be replayed on
+-- recovery.
 --
+fiber = require 'fiber'
 
 s1 = box.schema.space.create('test1', {engine = 'vinyl'})
-_ = s1:create_index('pk')
-_ = s1:insert{111, 'aaa'}
-
 s2 = box.schema.space.create('test2', {engine = 'vinyl'})
 _ = s2:create_index('pk')
+_ = s2:insert{111, 'aaa'}
+box.snapshot()
 _ = s2:insert{222, 'bbb'}
 
-box.snapshot()
+box.error.injection.set('ERRINJ_WAL_DELAY', true)
+
+-- VY_LOG_PREPARE_LSM written, but VY_LOG_CREATE_LSM missing
+ch = fiber.channel(1)
+_ = fiber.create(function() s1:create_index('pk') ch:put(true) end)
+fiber.sleep(0.001)
+
+box.error.injection.set('ERRINJ_VY_LOG_FLUSH', true)
+box.error.injection.set('ERRINJ_WAL_DELAY', false)
 
+ch:get()
 _ = s1:insert{333, 'ccc'}
-_ = s2:insert{444, 'ddd'}
 
-box.error.injection.set('ERRINJ_VY_LOG_FLUSH', true);
+-- VY_LOG_DROP_LSM missing
+s2.index.pk:drop()
 
-s1:drop()
-s2:truncate()
+test_run:cmd('restart server default')
+
+s1 = box.space.test1
+s2 = box.space.test2
+
+_ = s1:insert{444, 'ddd'}
+_ = s2:create_index('pk')
 _ = s2:insert{555, 'eee'}
 
-s3 = box.schema.space.create('test3', {engine = 'vinyl'})
-_ = s3:create_index('pk')
-_ = s3:insert{666, 'fff'}
-
--- gh-2532: replaying create/drop from xlog crashes tarantool
-test_run:cmd("setopt delimiter ';'")
-for i = 1, 10 do
-    s = box.schema.space.create('test', {engine = 'vinyl'})
-    s:create_index('primary')
-    s:create_index('secondary', {unique = false, parts = {2, 'string'}})
-    s:insert{i, 'test' .. i}
-    s:truncate()
-    s:drop()
-end
-test_run:cmd("setopt delimiter ''");
+s1:select()
+s2:select()
+
+box.snapshot()
 
 test_run:cmd('restart server default')
 
 s1 = box.space.test1
-s1 == nil
-
 s2 = box.space.test2
+s1:select()
 s2:select()
+s1:drop()
 s2:drop()
-
-s3 = box.space.test3
-s3:select()
-s3:drop()
-- 
2.11.0




More information about the Tarantool-patches mailing list