[tarantool-patches] Re: [PATCH v2 3/5] box: introduce path field in key_part

Wed Aug 22 03:26:57 MSK 2018

Thanks for the patch! See 17 comments below.

On 15/08/2018 15:15, Kirill Shcherbatov wrote:
> As we need to store user-defined JSON path in key_part
> and key_part_def, we have introduced path and path_len
> fields. JSON path is verified and transformed to canonical
> form on index msgpack unpack.
> Path string stored as a part of the key_def allocation:
> 
> +-------+---------+-------+---------+-------+-------+-------+
> |key_def|key_part1|  ...  |key_partN| path1 | pathK | pathN |
> +-------+---------+-------+---------+-------+-------+-------+
>            |                         ^
>            |-> path _________________|
> 
> Because of field names specified as format could be changed
> key_part path persisted in Tarantool should be always started
> with first-level field access via array index(not by name).
> 
> Part of #1012.
> ---
>   src/box/index_def.c          |  10 +-
>   src/box/key_def.c            | 276 ++++++++++++++++++++++++++++++++++++++-----
>   src/box/key_def.h            |  21 +++-
>   src/box/lua/space.cc         |   5 +
>   src/box/memtx_engine.c       |   5 +
>   src/box/schema.cc            |  12 +-
>   src/box/tuple_compare.cc     |   2 +
>   src/box/tuple_extract_key.cc |   1 +
>   src/box/tuple_hash.cc        |   1 +
>   src/box/vinyl.c              |   5 +
>   src/box/vy_log.c             |   3 +-
>   11 files changed, 300 insertions(+), 41 deletions(-)
> 
> diff --git a/src/box/key_def.c b/src/box/key_def.c
> index 8a4262b..b00e46d 100644
> --- a/src/box/key_def.c
> +++ b/src/box/key_def.c
> @@ -131,9 +155,9 @@ key_def_set_cmp(struct key_def *def)
>   }
>   
>   struct key_def *
> -key_def_new(uint32_t part_count)
> +key_def_new(uint32_t part_count, size_t extra_size)

1. Please, make extra_size be also an argument of key_def_sizeof
and name it literally paths_size.

>   {
> -	size_t sz = key_def_sizeof(part_count);
> +	size_t sz = key_def_sizeof(part_count) + extra_size;
>   	/** Use calloc() to zero comparator function pointers. */
>   	struct key_def *key_def = (struct key_def *) calloc(1, sz);
>   	if (key_def == NULL) {
> @@ -241,6 +289,11 @@ key_part_cmp(const struct key_part *parts1, uint32_t part_count1,
>   		if (part1->is_nullable != part2->is_nullable)
>   			return part1->is_nullable <
>   			       part2->is_nullable ? -1 : 1;
> +		/* Lexicographic strings order. */
> +		uint32_t len = MIN(part1->path_len, part2->path_len);
> +		int rc = 0;
> +		if ((rc = memcmp(part1->path, part2->path, len)) != 0)

2. Same as strncmp and has the same bug: if a path is prefix of
another path, then you declare them as equal, but it is wrong.

Please, write a test on it. It should alter an index, extending
path of one of its parts. In such a case it should be rebuilt,
but your patch will not do it.

> +			return rc;
>   	}
>   	return part_count1 < part_count2 ? -1 : part_count1 > part_count2;
>   }
> @@ -248,11 +301,12 @@ key_part_cmp(const struct key_part *parts1, uint32_t part_count1,
>   void
>   key_def_set_part(struct key_def *def, uint32_t part_no, uint32_t fieldno,
>   		 enum field_type type, bool is_nullable, struct coll *coll,
> -		 uint32_t coll_id)
> +		 uint32_t coll_id, char *path, uint32_t path_len)

3. Why not const char *path?

>   {
>   	assert(part_no < def->part_count);
>   	assert(type < field_type_MAX);
>   	def->is_nullable |= is_nullable;
> +	def->has_json_paths |= (path != NULL);

4. Please, be consistent in names: you already use 'is_flat'
in the previous patch, not 'has_json_paths'.

5. Why do you need additional parentheses?

>   	def->parts[part_no].is_nullable = is_nullable;
>   	def->parts[part_no].fieldno = fieldno;
>   	def->parts[part_no].type = type;
> @@ -432,10 +516,111 @@ key_def_decode_parts_166(struct key_part_def *parts, uint32_t part_count,
>   				     fields[part->fieldno].is_nullable :
>   				     key_part_def_default.is_nullable);
>   		part->coll_id = COLL_NONE;
> +		part->path = NULL;
>   	}
>   	return 0;
>   }
>   
> +/**
> + * Verify key_part JSON path and convert to canonical form.
> + *
> + * @param region to make allocations.
> + * @param part with path to update.
> + * @param path_extra alloated space to reuse if possible.

6. As usual. Start sentences with a capital letter, finish with the
dot.

7. Typo: 'alloated'.

> + * @param path_extra_size @path_extra size.
> + *
> + * @retval -1 on error.
> + * @retval 0 on success.
> + */
> +static int
> +key_def_normalize_json_path(struct region *region, struct key_part_def *part,
> +			    char **path_extra, uint32_t *path_extra_size)
> +{
> +	const char *err_msg = NULL;
> +
> +	uint32_t allocated_size = *path_extra_size;
> +	char *path = *path_extra;
> +
> +	uint32_t path_len = strlen(part->path);
> +	struct json_path_parser parser;
> +	struct json_path_node node;
> +	json_path_parser_create(&parser, part->path, path_len);
> +	/*
> +	 * A worst-case scenario is .a -> ["a"]
> +	 * i.e. 3*path_len + 1 is enough.

8. Your calculations are diligent, but they are unreadable, sorry.
I tried to calculate again, and found that 2.5*len is enough.
A path in the worst case looks like '.a'*. So the biggest
possible number of path parts is len(path)/len('.a') = len / 2.
A part is either in the correct format already, or requires
3 additional symbols. So the worst new len is:
(len / 2) * 3 + len = 2.5 * len
\____________/
   number of
   additional
   characters

It is not? I failed to find a counter-example.

> +	 */
> +	uint32_t new_path_size = 3 * path_len + 1;
> +	if (new_path_size >= allocated_size) {
> +		path = region_alloc(region, new_path_size);
> +		if (path == NULL) {
> +			diag_set(OutOfMemory, new_path_size,
> +				 "region_alloc", "path");
> +			return -1;
> +		}
> +		allocated_size = new_path_size;
> +	}
> +	assert(path != NULL);
> +	part->path = path;
> +	int rc = json_path_next(&parser, &node);
> +	if (rc != 0)
> +		goto error_invalid_json;
> +	if (node.type != JSON_PATH_NUM) {
> +		diag_set(ClientError, ER_WRONG_INDEX_OPTIONS,
> +			 part->fieldno,
> +			 "invalid JSON path: first part should "
> +			 "be defined as array index");
> +		return -1;
> +	}
> +	if (node.num - TUPLE_INDEX_BASE != part->fieldno) {
> +		diag_set(ClientError, ER_WRONG_INDEX_OPTIONS,
> +			 part->fieldno,
> +			 "invalid JSON path: first part refers "
> +			 "to invalid field");
> +		return -1;
> +	}
> +	uint32_t lexemes = 0;
> +	do {
> +		if (node.type == JSON_PATH_NUM) {
> +			path += sprintf(path, "[%u]", (uint32_t) node.num);

9. As I remember, we have decided to do not store
first path part in the string, because we have it in
part->fieldno already decoded. Please, cut the first
path part off. Also, it is faster when we need to
follow the path along tuple field tree in tuple_format,
because we do not need useless decoding of the first
part.

> +		} else if (node.type == JSON_PATH_STR) {
> +			path += sprintf(path, "[\"%.*s\"]", node.len, node.str);
> +		} else {
> +			unreachable();
> +		}
> +		lexemes++;
> +	} while ((rc = json_path_next(&parser, &node)) == 0 &&
> +		 node.type != JSON_PATH_END);
> +	if (rc != 0 || node.type != JSON_PATH_END)
> +		goto error_invalid_json;
> +	if (lexemes == 1) {
> +		/* JSON index is useless. */
> +		path = part->path;
> +		part->path = NULL;
> +	} else {
> +		/* Skip terminating zero. */
> +		path++;
> +		/* Account constructed string size. */
> +		allocated_size -= path - part->path;
> +	}
> +	/* Going to try to reuse extra allocation next time. */
> +	if ((uint32_t)parser.src_len > path_len) {

10. Here parser.src_len is always equal to path_len - it was
initialized by this value at the beginning of the function.

11. What is more, I do not understand, how can it work. When
you have two key_part_defs, first takes the region memory,
and the second can take exactly the same memory and rewrite it,
if its path len < than the former's. It was not clear on the
previous implementation, but now it is evident. So all
key_part_defs has the same path, if they are in descending
order of their path lens.

> +		*path_extra = path;
> +		*path_extra_size = allocated_size;
> +	} else {
> +		*path_extra = (char *)parser.src;
> +		*path_extra_size = parser.src_len;
> +	}
> +	return 0;
> +
> +error_invalid_json:
> +	err_msg = tt_sprintf("invalid JSON path '%.*s': path has invalid "
> +			     "structure (error at position %d)", parser.src_len,
> +			     parser.src, parser.symbol_count);

12. Symbol_count is not the same as error position. Please, use
json_path_next result as the position.

> +	diag_set(ClientError, ER_WRONG_INDEX_OPTIONS,
> +		 part->fieldno + TUPLE_INDEX_BASE, err_msg);
> +	return -1;
> +}
> +
>   int
>   key_def_decode_parts(struct key_part_def *parts, uint32_t part_count,
>   		     const char **data, const struct field_def *fields,
> @@ -533,18 +726,29 @@ key_def_merge(const struct key_def *first, const struct key_def *second)
>   	 * Find and remove part duplicates, i.e. parts counted
>   	 * twice since they are present in both key defs.
>   	 */
> -	const struct key_part *part = second->parts;
> -	const struct key_part *end = part + second->part_count;
> +	size_t sz = 0;
> +	const struct key_part *part = first->parts;
> +	const struct key_part *end = part + first->part_count;
> +	for (; part != end; part++) {
> +		if (part->path != NULL)
> +			sz += part->path_len + 1;
> +	}
> +	part = second->parts;
> +	end = part + second->part_count;
>   	for (; part != end; part++) {
> -		if (key_def_find(first, part->fieldno))
> +		const struct key_part *duplicate =
> +			key_def_find(first, part->fieldno);

13. Because key_def_find does not care about paths yourself,
now key_def_contains() is broken, and Vinyl is too as a consequence.
Please, write a test on it. Now it is possible to insert duplicates
into unique Vinyl index, if a secondary index contains field numbers
of the primary one, but with different paths. It is wrong.

> +		if (duplicate != NULL &&
> +		    part->path_len == duplicate->path_len &&
> +		    memcmp(part->path, duplicate->path, part->path_len) == 0)
>   			--new_part_count;
> +		else if (part->path != NULL)
> +			sz += part->path_len + 1;
>   	}
> diff --git a/src/box/key_def.h b/src/box/key_def.h
> index 42c054c..b6d6259 100644
> --- a/src/box/key_def.h
> +++ b/src/box/key_def.h
> @@ -54,6 +54,8 @@ struct key_part_def {
>   	uint32_t coll_id;
>   	/** True if a key part can store NULLs. */
>   	bool is_nullable;
> +	/** JSON path to data. */

14. Please, note that it does not include the first
path part, that is decoded already into fieldno.

> +	char *path;
>   };
>   
>   /**
> @@ -78,6 +80,10 @@ struct key_part {
>   	uint64_t format_epoch;
>   	/** Cache for corresponding tuple_format slot_offset. */
>   	int32_t slot_cache;
> +	/** JSON path to data. */
> +	char *path;
> +	/** JSON path length. */
> +	uint32_t path_len;

15. The same.

>   };
>   
>   struct key_def;
> diff --git a/src/box/tuple_extract_key.cc b/src/box/tuple_extract_key.cc
> index d95ee8d..0301186 100644
> --- a/src/box/tuple_extract_key.cc
> +++ b/src/box/tuple_extract_key.cc
> @@ -96,6 +96,7 @@ static char *
>   tuple_extract_key_slowpath(const struct tuple *tuple,
>   			   const struct key_def *key_def, uint32_t *key_size)
>   {
> +	assert(!is_flat == key_def->has_json_paths);
>   	assert(!has_optional_parts || key_def->is_nullable);
>   	assert(has_optional_parts == key_def->has_optional_parts);
>   	assert(contains_sequential_parts ==

16. As I see, you did update neither key_def_contains_sequential_parts
nor points of its result usage (if (! contains_sequential_parts) ... ).
So in your implementation such parts as '[1][2][3]' and '[2][4][5]' are
considered to be sequential, but it is wrong obviously. Their MessagePack's
do not follow each other. After I reviewed the whole patchset I see that
you updated these functions in the next commits, but please, do it in
this commit.

17. Why this patch has no tests? You already now can test paths
normalization and the first path part cut into fieldno.