[PATCH v2 11/14] vinyl: do not fill secondary tuples with nulls when decoded

Vladimir Davydov vdavydov.dev at gmail.com
Wed Mar 13 11:52:57 MSK 2019


In contrast to a primary index, which stores full tuples, secondary
indexes only store extended (secondary + primary) keys on disk. To make
them look like tuples, we fill missing fields with nulls (aka tuple
surrogate). This isn't going to work nicely with multikey indexes
though: how would you make a surrogate array from a key? We could
special-case multikey index handling, but that would look cumbersome.
So this patch removes nulls from secondary tuples restored from disk
altogether. To achieve that, it's enough to use key_format for them -
then the comparators will detect that it's actually a key, not a tuple
and use the appropriate primitive.
---
 src/box/key_def.c           | 36 +++++++++++++++++++++++++++++++++++
 src/box/key_def.h           | 14 ++++++++++++++
 src/box/vinyl.c             | 46 +++++++++++++++++++++++++++++++++------------
 src/box/vy_lsm.c            | 30 ++++++++++++++++++++---------
 src/box/vy_lsm.h            | 13 +++++++++----
 src/box/vy_run.c            | 31 ++++++++++++++++++++----------
 src/box/vy_stmt.c           | 36 +++++++++++++++++------------------
 test/unit/vy_point_lookup.c |  2 +-
 8 files changed, 154 insertions(+), 54 deletions(-)

diff --git a/src/box/key_def.c b/src/box/key_def.c
index d4c979aa..c8993a06 100644
--- a/src/box/key_def.c
+++ b/src/box/key_def.c
@@ -709,6 +709,42 @@ key_def_merge(const struct key_def *first, const struct key_def *second)
 	return new_def;
 }
 
+struct key_def *
+key_def_extract(const struct key_def *cmp_def, const struct key_def *pk_def,
+		struct region *region)
+{
+	struct key_def *extracted_def = NULL;
+	size_t region_svp = region_used(region);
+
+	/* First, dump primary key parts as is. */
+	struct key_part_def *parts = region_alloc(region,
+			pk_def->part_count * sizeof(*parts));
+	if (parts == NULL) {
+		diag_set(OutOfMemory, pk_def->part_count * sizeof(*parts),
+			 "region", "key def parts");
+		goto out;
+	}
+	if (key_def_dump_parts(pk_def, parts, region) != 0)
+		goto out;
+	/*
+	 * Second, update field numbers to match the primary key
+	 * parts in a secondary key.
+	 */
+	for (uint32_t i = 0; i < pk_def->part_count; i++) {
+		const struct key_part *part = key_def_find(cmp_def,
+							   &pk_def->parts[i]);
+		assert(part != NULL);
+		parts[i].fieldno = part - cmp_def->parts;
+		parts[i].path = NULL;
+	}
+
+	/* Finally, allocate the new key definition. */
+	extracted_def = key_def_new(parts, pk_def->part_count);
+out:
+	region_truncate(region, region_svp);
+	return extracted_def;
+}
+
 int
 key_validate_parts(const struct key_def *key_def, const char *key,
 		   uint32_t part_count, bool allow_nullable)
diff --git a/src/box/key_def.h b/src/box/key_def.h
index e3a34ce1..21119a83 100644
--- a/src/box/key_def.h
+++ b/src/box/key_def.h
@@ -375,6 +375,20 @@ key_def_contains(const struct key_def *first, const struct key_def *second);
 struct key_def *
 key_def_merge(const struct key_def *first, const struct key_def *second);
 
+/**
+ * Create a key definition suitable for extracting primary key
+ * parts from an extended secondary key.
+ * @param cmp_def   Extended secondary key definition
+ *                  (must include primary key parts).
+ * @param pk_def    Primary key definition.
+ * @param region    Region used for temporary allocations.
+ * @retval not NULL Pointer to the extracted key definition.
+ * @retval NULL     Memory allocation error.
+ */
+struct key_def *
+key_def_extract(const struct key_def *cmp_def, const struct key_def *pk_def,
+		struct region *region);
+
 /*
  * Check that parts of the key match with the key definition.
  * @param key_def Key definition.
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index ced0efe5..e9a4ae0d 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -719,10 +719,9 @@ vinyl_space_create_index(struct space *space, struct index_def *index_def)
 		pk = vy_lsm(space_index(space, 0));
 		assert(pk != NULL);
 	}
-	struct vy_lsm *lsm = vy_lsm_new(&env->lsm_env, &env->stmt_env,
-					&env->cache_env, &env->mem_env,
-					index_def, space->format, pk,
-					space_group_id(space));
+	struct vy_lsm *lsm = vy_lsm_new(&env->lsm_env, &env->cache_env,
+					&env->mem_env, index_def, space->format,
+					pk, space_group_id(space));
 	if (lsm == NULL) {
 		free(index);
 		return NULL;
@@ -1301,13 +1300,34 @@ vy_get_by_secondary_tuple(struct vy_lsm *lsm, struct vy_tx *tx,
 			  const struct vy_read_view **rv,
 			  struct tuple *tuple, struct tuple **result)
 {
+	int rc = 0;
 	assert(lsm->index_id > 0);
 
-	if (vy_point_lookup(lsm->pk, tx, rv, tuple, result) != 0)
-		return -1;
+	/*
+	 * Lookup the full tuple by a secondary statement.
+	 * There are two cases: the secondary statement may be
+	 * a key, in which case we need to extract the primary
+	 * key parts from it; or it may be a full tuple, which
+	 * we may pass it immediately to the iterator.
+	 */
+	struct tuple *key;
+	if (vy_stmt_is_key(tuple)) {
+		key = vy_stmt_extract_key(tuple, lsm->pk_in_cmp_def,
+					  lsm->env->key_format);
+		if (key == NULL)
+			return -1;
+	} else {
+		key = tuple;
+		tuple_ref(key);
+	}
+
+	if (vy_point_lookup(lsm->pk, tx, rv, key, result) != 0) {
+		rc = -1;
+		goto out;
+	}
 
 	if (*result == NULL ||
-	    vy_stmt_compare(*result, tuple, lsm->key_def) != 0) {
+	    vy_stmt_compare(*result, tuple, lsm->cmp_def) != 0) {
 		/*
 		 * If a tuple read from a secondary index doesn't
 		 * match the tuple corresponding to it in the
@@ -1327,7 +1347,7 @@ vy_get_by_secondary_tuple(struct vy_lsm *lsm, struct vy_tx *tx,
 		 * the tuple cache implementation.
 		 */
 		vy_cache_on_write(&lsm->cache, tuple, NULL);
-		return 0;
+		goto out;
 	}
 
 	/*
@@ -1340,13 +1360,15 @@ vy_get_by_secondary_tuple(struct vy_lsm *lsm, struct vy_tx *tx,
 	 */
 	if (tx != NULL && vy_tx_track_point(tx, lsm->pk, *result) != 0) {
 		tuple_unref(*result);
-		return -1;
+		rc = -1;
+		goto out;
 	}
 
 	if ((*rv)->vlsn == INT64_MAX)
-		vy_cache_add(&lsm->pk->cache, *result, NULL, tuple, ITER_EQ);
-
-	return 0;
+		vy_cache_add(&lsm->pk->cache, *result, NULL, key, ITER_EQ);
+out:
+	tuple_unref(key);
+	return rc;
 }
 
 /**
diff --git a/src/box/vy_lsm.c b/src/box/vy_lsm.c
index 2dc8f639..28cd487c 100644
--- a/src/box/vy_lsm.c
+++ b/src/box/vy_lsm.c
@@ -118,10 +118,9 @@ vy_lsm_mem_tree_size(struct vy_lsm *lsm)
 }
 
 struct vy_lsm *
-vy_lsm_new(struct vy_lsm_env *lsm_env, struct vy_stmt_env *stmt_env,
-	   struct vy_cache_env *cache_env, struct vy_mem_env *mem_env,
-	   struct index_def *index_def, struct tuple_format *format,
-	   struct vy_lsm *pk, uint32_t group_id)
+vy_lsm_new(struct vy_lsm_env *lsm_env, struct vy_cache_env *cache_env,
+	   struct vy_mem_env *mem_env, struct index_def *index_def,
+	   struct tuple_format *format, struct vy_lsm *pk, uint32_t group_id)
 {
 	static int64_t run_buckets[] = {
 		0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 50, 100,
@@ -156,10 +155,19 @@ vy_lsm_new(struct vy_lsm_env *lsm_env, struct vy_stmt_env *stmt_env,
 		 */
 		lsm->disk_format = format;
 	} else {
-		lsm->disk_format = vy_stmt_format_new(stmt_env, &cmp_def, 1,
-						      NULL, 0, 0, NULL);
-		if (lsm->disk_format == NULL)
-			goto fail_format;
+		/*
+		 * To save disk space, we do not store full tuples
+		 * in secondary index runs. Instead we only store
+		 * extended keys (i.e. keys consisting of secondary
+		 * and primary index parts). This is enough to look
+		 * up a full tuple in the primary index.
+		 */
+		lsm->disk_format = lsm_env->key_format;
+
+		lsm->pk_in_cmp_def = key_def_extract(lsm->cmp_def, pk->key_def,
+						     &fiber()->gc);
+		if (lsm->pk_in_cmp_def == NULL)
+			goto fail_pk_in_cmp_def;
 	}
 	tuple_format_ref(lsm->disk_format);
 
@@ -208,7 +216,9 @@ fail_run_hist:
 	vy_lsm_stat_destroy(&lsm->stat);
 fail_stat:
 	tuple_format_unref(lsm->disk_format);
-fail_format:
+	if (lsm->pk_in_cmp_def != NULL)
+		key_def_delete(lsm->pk_in_cmp_def);
+fail_pk_in_cmp_def:
 	key_def_delete(cmp_def);
 fail_cmp_def:
 	key_def_delete(key_def);
@@ -262,6 +272,8 @@ vy_lsm_delete(struct vy_lsm *lsm)
 	tuple_format_unref(lsm->disk_format);
 	key_def_delete(lsm->cmp_def);
 	key_def_delete(lsm->key_def);
+	if (lsm->pk_in_cmp_def != NULL)
+		key_def_delete(lsm->pk_in_cmp_def);
 	histogram_delete(lsm->run_hist);
 	vy_lsm_stat_destroy(&lsm->stat);
 	vy_cache_destroy(&lsm->cache);
diff --git a/src/box/vy_lsm.h b/src/box/vy_lsm.h
index b94e7a43..e969e45c 100644
--- a/src/box/vy_lsm.h
+++ b/src/box/vy_lsm.h
@@ -200,6 +200,12 @@ struct vy_lsm {
 	/** Key definition passed by the user. */
 	struct key_def *key_def;
 	/**
+	 * Key definition to extract primary key parts from
+	 * a secondary key. NULL if this LSM tree corresponds
+	 * to a primary index.
+	 */
+	struct key_def *pk_in_cmp_def;
+	/**
 	 * If the following flag is set, the index this LSM tree
 	 * is created for is unique and it must be checked for
 	 * duplicates on INSERT. Otherwise, the check can be skipped,
@@ -333,10 +339,9 @@ vy_lsm_mem_tree_size(struct vy_lsm *lsm);
 
 /** Allocate a new LSM tree object. */
 struct vy_lsm *
-vy_lsm_new(struct vy_lsm_env *lsm_env, struct vy_stmt_env *stmt_env,
-	   struct vy_cache_env *cache_env, struct vy_mem_env *mem_env,
-	   struct index_def *index_def, struct tuple_format *format,
-	   struct vy_lsm *pk, uint32_t group_id);
+vy_lsm_new(struct vy_lsm_env *lsm_env, struct vy_cache_env *cache_env,
+	   struct vy_mem_env *mem_env, struct index_def *index_def,
+	   struct tuple_format *format, struct vy_lsm *pk, uint32_t group_id);
 
 /** Free an LSM tree object. */
 void
diff --git a/src/box/vy_run.c b/src/box/vy_run.c
index 76b2f94a..928ca57f 100644
--- a/src/box/vy_run.c
+++ b/src/box/vy_run.c
@@ -2139,7 +2139,8 @@ vy_run_writer_start_page(struct vy_run_writer *writer,
 	if (run->info.page_count >= writer->page_info_capacity &&
 	    vy_run_alloc_page_info(run, &writer->page_info_capacity) != 0)
 		return -1;
-	const char *key = tuple_extract_key(first_stmt, writer->cmp_def, NULL);
+	const char *key = vy_stmt_is_key(first_stmt) ? tuple_data(first_stmt) :
+			  tuple_extract_key(first_stmt, writer->cmp_def, NULL);
 	if (key == NULL)
 		return -1;
 	if (run->info.page_count == 0) {
@@ -2289,7 +2290,9 @@ vy_run_writer_commit(struct vy_run_writer *writer)
 	}
 
 	assert(writer->last_stmt != NULL);
-	const char *key = tuple_extract_key(writer->last_stmt,
+	const char *key = vy_stmt_is_key(writer->last_stmt) ?
+		          tuple_data(writer->last_stmt) :
+			  tuple_extract_key(writer->last_stmt,
 					    writer->cmp_def, NULL);
 	if (key == NULL)
 		goto out;
@@ -2360,6 +2363,7 @@ vy_run_rebuild_index(struct vy_run *run, const char *dir,
 	int64_t max_lsn = 0;
 	int64_t min_lsn = INT64_MAX;
 	struct tuple *prev_tuple = NULL;
+	char *page_min_key = NULL;
 
 	struct tuple_bloom_builder *bloom_builder = NULL;
 	if (opts->bloom_fpr < 1) {
@@ -2377,7 +2381,6 @@ vy_run_rebuild_index(struct vy_run *run, const char *dir,
 		if (run->info.page_count == page_info_capacity &&
 		    vy_run_alloc_page_info(run, &page_info_capacity) != 0)
 			goto close_err;
-		const char *page_min_key = NULL;
 		uint32_t page_row_count = 0;
 		uint64_t page_row_index_offset = 0;
 		uint64_t row_offset = xlog_cursor_tx_pos(&cursor);
@@ -2400,7 +2403,8 @@ vy_run_rebuild_index(struct vy_run *run, const char *dir,
 				tuple_unref(tuple);
 				goto close_err;
 			}
-			key = tuple_extract_key(tuple, cmp_def, NULL);
+			key = vy_stmt_is_key(tuple) ? tuple_data(tuple) :
+			      tuple_extract_key(tuple, cmp_def, NULL);
 			if (prev_tuple != NULL)
 				tuple_unref(prev_tuple);
 			prev_tuple = tuple;
@@ -2411,8 +2415,11 @@ vy_run_rebuild_index(struct vy_run *run, const char *dir,
 				if (run->info.min_key == NULL)
 					goto close_err;
 			}
-			if (page_min_key == NULL)
-				page_min_key = key;
+			if (page_min_key == NULL) {
+				page_min_key = vy_key_dup(key);
+				if (page_min_key == NULL)
+					goto close_err;
+			}
 			if (xrow.lsn > max_lsn)
 				max_lsn = xrow.lsn;
 			if (xrow.lsn < min_lsn)
@@ -2429,11 +2436,9 @@ vy_run_rebuild_index(struct vy_run *run, const char *dir,
 		info->row_index_offset = page_row_index_offset;
 		++run->info.page_count;
 		vy_run_acct_page(run, info);
-	}
 
-	if (prev_tuple != NULL) {
-		tuple_unref(prev_tuple);
-		prev_tuple = NULL;
+		free(page_min_key);
+		page_min_key = NULL;
 	}
 
 	if (key != NULL) {
@@ -2444,6 +2449,10 @@ vy_run_rebuild_index(struct vy_run *run, const char *dir,
 	run->info.max_lsn = max_lsn;
 	run->info.min_lsn = min_lsn;
 
+	if (prev_tuple != NULL) {
+		tuple_unref(prev_tuple);
+		prev_tuple = NULL;
+	}
 	region_truncate(region, mem_used);
 	run->fd = cursor.fd;
 	xlog_cursor_close(&cursor, true);
@@ -2473,6 +2482,8 @@ close_err:
 	region_truncate(region, mem_used);
 	if (prev_tuple != NULL)
 		tuple_unref(prev_tuple);
+	if (page_min_key != NULL)
+		free(page_min_key);
 	if (bloom_builder != NULL)
 		tuple_bloom_builder_delete(bloom_builder);
 	if (xlog_cursor_is_open(&cursor))
diff --git a/src/box/vy_stmt.c b/src/box/vy_stmt.c
index 229f8b41..fd7850cc 100644
--- a/src/box/vy_stmt.c
+++ b/src/box/vy_stmt.c
@@ -748,6 +748,8 @@ int
 vy_stmt_encode_primary(const struct tuple *value, struct key_def *key_def,
 		       uint32_t space_id, struct xrow_header *xrow)
 {
+	assert(!vy_stmt_is_key(value));
+
 	memset(xrow, 0, sizeof(*xrow));
 	enum iproto_type type = vy_stmt_type(value);
 	xrow->type = type;
@@ -804,7 +806,9 @@ vy_stmt_encode_secondary(const struct tuple *value, struct key_def *cmp_def,
 	memset(&request, 0, sizeof(request));
 	request.type = type;
 	uint32_t size;
-	const char *extracted = tuple_extract_key(value, cmp_def, &size);
+	const char *extracted = vy_stmt_is_key(value) ?
+				tuple_data_range(value, &size) :
+				tuple_extract_key(value, cmp_def, &size);
 	if (extracted == NULL)
 		return -1;
 	if (type == IPROTO_REPLACE || type == IPROTO_INSERT) {
@@ -834,27 +838,23 @@ vy_stmt_decode(struct xrow_header *xrow, const struct key_def *key_def,
 	if (xrow_decode_dml(xrow, &request, key_map) != 0)
 		return NULL;
 	struct tuple *stmt = NULL;
-	const char *key;
-	(void) key;
 	struct iovec ops;
 	switch (request.type) {
 	case IPROTO_DELETE:
-		/* extract key */
-		stmt = vy_stmt_new_surrogate_from_key(request.key,
-						      IPROTO_DELETE,
-						      key_def, format);
-		break;
-	case IPROTO_INSERT:
-	case IPROTO_REPLACE:
-		if (is_primary) {
-			stmt = vy_stmt_new_with_ops(format, request.tuple,
-						    request.tuple_end,
-						    NULL, 0, request.type);
-		} else {
-			stmt = vy_stmt_new_surrogate_from_key(request.tuple,
-							      request.type,
+		if (is_primary)
+			stmt = vy_stmt_new_surrogate_from_key(request.key,
+							      IPROTO_DELETE,
 							      key_def, format);
-		}
+		else
+			stmt = vy_stmt_new_with_ops(format, request.key,
+						    request.key_end,
+						    NULL, 0, IPROTO_DELETE);
+		break;
+	case IPROTO_INSERT:
+	case IPROTO_REPLACE:
+		stmt = vy_stmt_new_with_ops(format, request.tuple,
+					    request.tuple_end,
+					    NULL, 0, request.type);
 		break;
 	case IPROTO_UPSERT:
 		ops.iov_base = (char *)request.ops;
diff --git a/test/unit/vy_point_lookup.c b/test/unit/vy_point_lookup.c
index 38e9f403..f3cd84d4 100644
--- a/test/unit/vy_point_lookup.c
+++ b/test/unit/vy_point_lookup.c
@@ -94,7 +94,7 @@ test_basic()
 		index_def_new(512, 0, "primary", sizeof("primary") - 1, TREE,
 			      &index_opts, key_def, NULL);
 
-	struct vy_lsm *pk = vy_lsm_new(&lsm_env, &stmt_env, &cache_env, &mem_env,
+	struct vy_lsm *pk = vy_lsm_new(&lsm_env, &cache_env, &mem_env,
 				       index_def, format, NULL, 0);
 	isnt(pk, NULL, "lsm is not NULL")
 
-- 
2.11.0




More information about the Tarantool-patches mailing list