[PATCH v4 3/3] box: introduce multikey indexes in memtx

Vladimir Davydov vdavydov.dev at gmail.com
Mon Apr 29 19:06:15 MSK 2019


On Fri, Apr 19, 2019 at 05:14:25PM +0300, Kirill Shcherbatov wrote:
> diff --git a/src/box/field_map.h b/src/box/field_map.h
> index f6c653030..11733baa4 100644
> --- a/src/box/field_map.h
> +++ b/src/box/field_map.h
> @@ -32,8 +32,10 @@
>   */
>  #include <assert.h>
>  #include <stdint.h>
> +#include <stdlib.h>

Unnecessary inclusion?

> diff --git a/src/box/key_def.c b/src/box/key_def.c
> index d07bbe8bc..1366c46b8 100644
> --- a/src/box/key_def.c
> +++ b/src/box/key_def.c
> @@ -104,6 +104,10 @@ key_def_dup(const struct key_def *src)
>  		size_t path_offset = src->parts[i].path - (char *)src;
>  		res->parts[i].path = (char *)res + path_offset;
>  	}
> +	if (src->multikey_path != NULL) {
> +		size_t path_offset = src->multikey_path - (char *)src;
> +		res->multikey_path = (char *)res + path_offset;
> +	}
>  	return res;
>  }
>  
> @@ -138,7 +142,69 @@ key_def_set_func(struct key_def *def)
>  	key_def_set_extract_func(def);
>  }
>  
> -static void
> +static int
> +key_def_set_part_path(struct key_def *def, uint32_t part_no, const char *path,
> +		      uint32_t path_len, char **path_pool)
> +{
> +	struct key_part *part = &def->parts[part_no];
> +	if (path == NULL) {
> +		part->path = NULL;
> +		part->path_len = 0;
> +		return 0;
> +	}
> +	assert(path_pool != NULL);
> +	part->path = *path_pool;
> +	*path_pool += path_len;
> +	memcpy(part->path, path, path_len);
> +	part->path_len = path_len;
> +
> +	/*
> +	 * Test whether this key_part has array index
> +	 * placeholder [*] (i.e. is a part of multikey index
> +	 * definition).
> +	 */
> +	int multikey_path_len =
> +		json_path_multikey_offset(path, path_len, TUPLE_INDEX_BASE);
> +	if ((uint32_t) multikey_path_len == path_len)
> +		return 0;
> +
> +	/*
> +	 * In case of multikey index key_parts must have the
> +	 * same JSON prefix.
> +	 */
> +	if (def->multikey_path == NULL) {
> +		/*
> +		 * Keep the index of the first multikey key_part
> +		 * and the length of JSON path substring to the
> +		 * array index placeholder sign [*].
> +		 */
> +		def->multikey_path = part->path;
> +		def->multikey_fieldno = part->fieldno;
> +		def->multikey_path_len = (uint32_t) multikey_path_len;
> +	} else if (json_path_cmp(path, multikey_path_len, def->multikey_path,
> +				 def->multikey_path_len,
> +				 TUPLE_INDEX_BASE) != 0) {
> +		diag_set(ClientError, ER_WRONG_INDEX_OPTIONS,
> +			 part_no + TUPLE_INDEX_BASE,
> +			 "incompatable multikey index path");
> +		return -1;
> +	}
> +	int multikey_path_suffix_len =
> +		path_len - multikey_path_len - strlen("[*]");
> +	if (json_path_multikey_offset(path + multikey_path_len + strlen("[*]"),
> +				      multikey_path_suffix_len,
> +				      TUPLE_INDEX_BASE) !=

Please don't assume that JSON_TOKEN_ANY is represented by "[*]" outside
the json library: if we ever allow the user to add spaces ("[ * ]"),
this code would silently break.

Either use json_lexer to skip the multikey token or add a helper
function to the json lib.

> +	    multikey_path_suffix_len) {
> +		diag_set(ClientError, ER_WRONG_INDEX_OPTIONS,
> +			 part_no + TUPLE_INDEX_BASE,
> +			 "no more than one array index placeholder [*] is "
> +			 "allowed in JSON path");
> +		return -1;
> +	}
> +	return 0;
> +}
> +
> +static int
>  key_def_set_part(struct key_def *def, uint32_t part_no, uint32_t fieldno,
>  		 enum field_type type, enum on_conflict_action nullable_action,
>  		 struct coll *coll, uint32_t coll_id,
> diff --git a/src/box/memtx_tree.c b/src/box/memtx_tree.c
> index fe037c54a..9a0834994 100644
> --- a/src/box/memtx_tree.c
> +++ b/src/box/memtx_tree.c
> @@ -584,6 +584,146 @@ memtx_tree_index_replace(struct index *base, struct tuple *old_tuple,
>  	return 0;
>  }
>  
> +/**
> + * Perform tuple insertion by given multikey index.
> + * In case of replacement, all old tuple entries are deleted
> + * by all it's multikey indexes.
> + */
> +static int
> +memtx_tree_index_insert_multikey(struct memtx_tree_index *index,

Bad name: we have replace_multikey and insert_multikey - one would think
the only difference between them is the operation type (REPLACE/INSERT),
but that would be wrong: one is actually used as a helper by the other.

Please come up with a better name. What about

  memtx_tree_index_replace_multikey_one

?

That would emphasize that it inserts a single multikey entry.

> +			struct tuple *old_tuple, struct tuple *new_tuple,
> +			enum dup_replace_mode mode, int multikey_idx,
> +			struct tuple **replaced_tuple)
> +{
> +	struct memtx_tree_data new_data, dup_data;
> +	new_data.tuple = new_tuple;
> +	new_data.hint = multikey_idx;
> +	dup_data.tuple = NULL;
> +	if (memtx_tree_insert(&index->tree, new_data, &dup_data) != 0) {
> +		diag_set(OutOfMemory, MEMTX_EXTENT_SIZE, "memtx_tree_index",
> +			 "replace");
> +		return -1;
> +	}
> +	int errcode = 0;
> +	if (dup_data.tuple == new_tuple) {
> +		/*
> +		 * When tuple contains the same key multiple
> +		 * times, the previous key occurrence is pushed
> +		 * out of the index.
> +		 */
> +		dup_data.tuple = NULL;
> +	} else if ((errcode = replace_check_dup(old_tuple, dup_data.tuple,
> +					        mode)) != 0) {
> +		/* Rollback replace. */
> +		memtx_tree_delete(&index->tree, new_data);
> +		if (dup_data.tuple != NULL)
> +			memtx_tree_insert(&index->tree, dup_data, NULL);
> +		struct space *sp = space_cache_find(index->base.def->space_id);
> +		if (sp != NULL) {
> +			diag_set(ClientError, errcode, index->base.def->name,
> +				 space_name(sp));
> +		}
> +		return -1;
> +	}
> +	*replaced_tuple = dup_data.tuple;
> +	if (dup_data.tuple == NULL)
> +		return 0;
> +
> +	/*
> +	 * Propagate dup_data.tuple deletion for all multikey
> +	 * index keys extracted by dup_data.tuple.
> +	 */
> +	int conflict_idx = (int)dup_data.hint;
> +	struct key_def *cmp_def = memtx_tree_cmp_def(&index->tree);
> +	uint32_t multikey_count = tuple_multikey_count(dup_data.tuple, cmp_def);
> +	for (int i = 0; (uint32_t) i < multikey_count; i++) {
> +		if (conflict_idx == i)
> +			continue;
> +		dup_data.hint = i;
> +		memtx_tree_delete(&index->tree, dup_data);
> +	}
> +	/*
> +	 * The new_tuple multikey index key (by conflict_idx) may
> +	 * be deleted from index when old_tuple has another key by
> +	 * some other multikey index != conflict_idx. Restore it
> +	 * as a precaution.
> +	 */
> +	memtx_tree_insert(&index->tree, new_data, NULL);

It would be better if we didn't delete the tuple in this case at all.
Or, at least, memtx_tree_delete could return the deleted tuple so that
we wouldn't have to call memtx_tree_insert when it isn't necessary.
Not a show stopper though.

> +	return 0;
> +}
> +
> +static int
> +memtx_tree_index_replace_multikey(struct index *base, struct tuple *old_tuple,
> +			struct tuple *new_tuple, enum dup_replace_mode mode,
> +			struct tuple **result)
> +{
> +	struct memtx_tree_index *index = (struct memtx_tree_index *)base;
> +	struct key_def *cmp_def = memtx_tree_cmp_def(&index->tree);
> +	*result = NULL;
> +	if (new_tuple != NULL) {
> +		int multikey_idx = 0, err;
> +		uint32_t multikey_count =
> +			tuple_multikey_count(new_tuple, cmp_def);
> +		for (; (uint32_t) multikey_idx < multikey_count;
> +		     multikey_idx++) {
> +			struct tuple *replaced_tuple;
> +			err = memtx_tree_index_insert_multikey(index, old_tuple,
> +						new_tuple, mode, multikey_idx,
> +						&replaced_tuple);
> +			if (err != 0)
> +				break;
> +			if (replaced_tuple != NULL) {
> +				assert(*result == NULL ||
> +				       *result == replaced_tuple);
> +				*result = replaced_tuple;
> +			}
> +		}
> +		if (err != 0) {
> +			/*
> +			 * In case of error, rollback new_tuple
> +			 * insertion by multikey index [0, multikey_idx).
> +			 */
> +			struct memtx_tree_data data;
> +			data.tuple = new_tuple;
> +			for (int i = 0; i < multikey_idx; i++) {
> +				data.hint = i;
> +				memtx_tree_delete(&index->tree, data);
> +			}
> +			if (*result != NULL) {
> +				/*
> +				 * Restore replaced tuple index
> +				 * occurrences.
> +				 */
> +				data.tuple = *result;
> +				multikey_count =
> +					tuple_multikey_count(*result, cmp_def);
> +				for (int i = 0;
> +				     (uint32_t) i < multikey_count; i++) {
> +					data.hint = i;
> +					memtx_tree_insert(&index->tree, data,
> +							  NULL);
> +				}
> +			}
> +			return -1;
> +		}
> +		if (*result != NULL)
> +			return 0;
> +	}
> +	if (old_tuple != NULL) {
> +		*result = old_tuple;
> +		struct memtx_tree_data data;
> +		data.tuple = old_tuple;
> +		uint32_t multikey_count =
> +			tuple_multikey_count(old_tuple, cmp_def);
> +		for (int multikey_idx = 0;
> +		     (uint32_t) multikey_idx < multikey_count; multikey_idx++) {
> +			data.hint = multikey_idx;
> +			memtx_tree_delete(&index->tree, data);
> +		}
> +	}
> +	return 0;
> +}
> +
>  static struct iterator *
>  memtx_tree_index_create_iterator(struct index *base, enum iterator_type type,
>  				 const char *key, uint32_t part_count)
> @@ -655,31 +795,32 @@ memtx_tree_index_reserve(struct index *base, uint32_t size_hint)
>  	return 0;
>  }
>  
> +/* Initialize the next element of the index build_array. */
>  static int
> -memtx_tree_index_build_next(struct index *base, struct tuple *tuple)
> +memtx_tree_index_build_array_append(struct memtx_tree_index *index,
> +				    struct tuple *tuple, hint_t hint)
>  {
> -	struct memtx_tree_index *index = (struct memtx_tree_index *)base;
> -	struct key_def *cmp_def = memtx_tree_cmp_def(&index->tree);
> +	bool need_realloc = false;
>  	if (index->build_array == NULL) {
> -		index->build_array = malloc(MEMTX_EXTENT_SIZE);
> -		if (index->build_array == NULL) {
> -			diag_set(OutOfMemory, MEMTX_EXTENT_SIZE,
> -				 "memtx_tree_index", "build_next");
> -			return -1;
> -		}
>  		index->build_array_alloc_size =
>  			MEMTX_EXTENT_SIZE / sizeof(index->build_array[0]);
> +		need_realloc = true;
> +	}
> +	if (index->build_array_size >= index->build_array_alloc_size) {
> +		index->build_array_alloc_size +=
> +			MAX(index->build_array_alloc_size / 2,
> +			    index->build_array_size + 1 -
> +			    index->build_array_alloc_size);
> +		need_realloc = true;

Please avoid unnecessary refactoring - it complicates review.
The function looks fine to me as it is - no need to merge malloc
and realloc paths IMO.

>  	}
> -	assert(index->build_array_size <= index->build_array_alloc_size);
> -	if (index->build_array_size == index->build_array_alloc_size) {
> -		index->build_array_alloc_size = index->build_array_alloc_size +
> -					index->build_array_alloc_size / 2;
> +	if (need_realloc) {
>  		struct memtx_tree_data *tmp =
>  			realloc(index->build_array,
>  				index->build_array_alloc_size * sizeof(*tmp));
>  		if (tmp == NULL) {
>  			diag_set(OutOfMemory, index->build_array_alloc_size *
> -				 sizeof(*tmp), "memtx_tree_index", "build_next");
> +				 sizeof(*tmp), "memtx_tree_index",
> +				 "build_array");
>  			return -1;
>  		}
>  		index->build_array = tmp;
> @@ -687,10 +828,66 @@ memtx_tree_index_build_next(struct index *base, struct tuple *tuple)
>  	struct memtx_tree_data *elem =
>  		&index->build_array[index->build_array_size++];
>  	elem->tuple = tuple;
> -	elem->hint = tuple_hint(tuple, cmp_def);
> +	elem->hint = hint;
> +	return 0;
> +}
> +
> +static int
> +memtx_tree_index_build_next(struct index *base, struct tuple *tuple)
> +{
> +	struct memtx_tree_index *index = (struct memtx_tree_index *)base;
> +	struct key_def *cmp_def = memtx_tree_cmp_def(&index->tree);
> +	return memtx_tree_index_build_array_append(index, tuple,
> +						   tuple_hint(tuple, cmp_def));
> +}
> +
> +static int
> +memtx_tree_index_build_next_multikey(struct index *base, struct tuple *tuple)
> +{
> +	struct memtx_tree_index *index = (struct memtx_tree_index *)base;
> +	struct key_def *cmp_def = memtx_tree_cmp_def(&index->tree);
> +	uint32_t multikey_count = tuple_multikey_count(tuple, cmp_def);
> +	for (uint32_t multikey_idx = 0; multikey_idx < multikey_count;
> +	     multikey_idx++) {
> +		if (memtx_tree_index_build_array_append(index, tuple,
> +							multikey_idx) != 0)
> +			return -1;
> +	}
>  	return 0;
>  }
>  
> +/**
> + * Process build_array of specified index and remove duplicates
> + * of equal tuples (in terms of index's cmp_def and have same
> + * tuple pointer). The build_array is expected to be sorted.
> + */
> +static void
> +memtx_tree_index_build_array_deduplicate(struct memtx_tree_index *index)
> +{
> +	struct key_def *cmp_def = memtx_tree_cmp_def(&index->tree);
> +	size_t i = 0;
> +	while (i < index->build_array_size) {
> +		size_t next_key = ++i;
> +		while (next_key < index->build_array_size &&
> +		       index->build_array[next_key].tuple ==
> +		       index->build_array[next_key - 1].tuple &&
> +		       tuple_compare_hinted(index->build_array[next_key].tuple,
> +					index->build_array[next_key].hint,
> +					index->build_array[next_key - 1].tuple,
> +					index->build_array[next_key - 1].hint,
> +					cmp_def) == 0)
> +			next_key++;
> +		size_t equal_keys = next_key - i;
> +		if (equal_keys == 0)
> +			continue;
> +		memmove(&index->build_array[i - 1],
> +			&index->build_array[next_key - 1],
> +			(index->build_array_size - next_key + 1) *
> +			sizeof(index->build_array[0]));

This smells O(N^2): the memmove inside the loop can shift the tail of
the array once per group of duplicates. Duplicates can be removed from
a sorted array in a single O(N) pass (classic two-pointer compaction).
Please fix.

> +		index->build_array_size -= equal_keys;
> +	}
> +}
> +
>  static void
>  memtx_tree_index_end_build(struct index *base)
>  {
> diff --git a/src/box/tuple_format.c b/src/box/tuple_format.c
> index a7c8e872f..585f2680f 100644
> --- a/src/box/tuple_format.c
> +++ b/src/box/tuple_format.c
> @@ -194,6 +196,50 @@ tuple_format_field_by_id(struct tuple_format *format, uint32_t id)
>  	return NULL;
>  }
>  
> +/**
> + * Check if child_field may be attached to parent_field,
> + * update the parent_field container type if required.
> + */
> +static int
> +tuple_field_ensure_child_compatibility(struct tuple_field *parent,
> +				       struct tuple_field *child)

Wouldn't tuple_format_check_field be a better name?

> +{
> +	enum field_type expected_type =
> +		child->token.type == JSON_TOKEN_STR ?
> +		FIELD_TYPE_MAP : FIELD_TYPE_ARRAY;
> +	if (field_type1_contains_type2(parent->type, expected_type)) {
> +		parent->type = expected_type;
> +	} else {
> +		diag_set(ClientError, ER_INDEX_PART_TYPE_MISMATCH,
> +			 tuple_field_path(parent),
> +			 field_type_strs[parent->type],
> +			 field_type_strs[expected_type]);
> +		return -1;
> +	}
> +	/*
> +	 * Attempt to append JSON_TOKEN_ANY leaf to parent that
> +	 * has other child records already i.e. is a intermediate
> +	 * field of non-multikey JSON index.
> +	 */
> +	bool is_not_multikey_parent_conflict =
> +		child->token.type == JSON_TOKEN_ANY &&
> +		!json_token_is_multikey(&parent->token) &&
> +		!json_token_is_leaf(&parent->token);
> +	/*
> +	 * Attempt to append not JSON_TOKEN_ANY child record to
> +	 * the parent defined as multikey index root.
> +	 */
> +	bool is_multikey_parent_conflict =
> +		json_token_is_multikey(&parent->token) &&
> +		child->token.type != JSON_TOKEN_ANY;
> +	if (is_not_multikey_parent_conflict || is_multikey_parent_conflict) {

The condition reads like `if (!A || A)` - it looks like a logical
tautology at first glance (even though it isn't), because of the
double-negative flag names.

Please rewrite without these confusing helper flags. You could simply
set diag twice - it'd be okay:

	if (child->token.type == JSON_TOKEN_ANY ...) {
		diag_set(...)
		return -1;
	}
	if (json_token_is_multikey(&parent->token) ...) {
		diag_set(...)
		return -1;
	}

> +		diag_set(ClientError, ER_MULTIKEY_INDEX_MISMATCH,
> +			 tuple_field_path(parent));
> +		return -1;
> +	}
> +	return 0;
> +}
> +
>  /**
>   * Given a field number and a path, add the corresponding field
>   * to the tuple format, allocating intermediate fields if
> @@ -454,6 +497,34 @@ tuple_format_create(struct tuple_format *format, struct key_def * const *keys,
>  		if (json_token_is_leaf(&field->token) &&
>  		    !tuple_field_is_nullable(field))
>  			bit_set(format->required_fields, field->id);
> +		/*
> +		 * When the field represents array index
> +		 * placeholder [*] (JSON_TOKEN_ANY), it requires
> +		 * own multikey_required_fields allocation, that
> +		 * must be initialized by bypassing the JSON
> +		 * subtree of this field.
> +		 */
> +		if (field->token.type == JSON_TOKEN_ANY) {
> +			assert(field->multikey_required_fields == NULL);
> +			void *multikey_required_fields =
> +				calloc(1, required_fields_sz);
> +			if (multikey_required_fields == NULL) {
> +				diag_set(OutOfMemory, required_fields_sz,
> +					"malloc", "required field bitmap");
> +				return -1;
> +			}
> +			struct tuple_field *child;
> +			json_tree_foreach_entry_preorder(child, &field->token,
> +					struct tuple_field, token) {
> +				if (json_token_is_leaf(&child->token) &&
> +				    !tuple_field_is_nullable(child)) {
> +					bit_set(multikey_required_fields,
> +						child->id);
> +				}

json_tree_foreach_entry_preorder inside another one. O(N^2) again.
AFAIU you could fill the map in the outer loop without hurting
readability.

Also, I don't quite like that you use two bitmaps simultaneously here
and in tuple_field_map_create. I think we could make the root bitmap
only store fields not present in any multikey bitmap (those are checked
anyway). The code would look cleaner IMHO.

> +			}
> +			field->multikey_required_fields =
> +				multikey_required_fields;
> +		}
>  	}
>  	format->hash = tuple_format_hash(format);
>  	return 0;



More information about the Tarantool-patches mailing list