From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <tarantool-patches-bounce@freelists.org>
Received: from localhost (localhost [127.0.0.1])
	by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id AC4DC28C2F
	for <tarantool-patches@freelists.org>; Wed,  8 Aug 2018 18:03:17 -0400 (EDT)
Received: from turing.freelists.org ([127.0.0.1])
	by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024)
	with ESMTP id lYei69ImTLQK for <tarantool-patches@freelists.org>;
	Wed,  8 Aug 2018 18:03:17 -0400 (EDT)
Received: from smtp37.i.mail.ru (smtp37.i.mail.ru [94.100.177.97])
	(using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits))
	(No client certificate requested)
	by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 49D6628C22
	for <tarantool-patches@freelists.org>; Wed,  8 Aug 2018 18:03:17 -0400 (EDT)
Subject: [tarantool-patches] Re: [PATCH v1 3/5] box: introduce path field in
 key_part
References: <cover.1533558332.git.kshcherbatov@tarantool.org>
 <63cb256a0206e222ee10199e7d671700e44ce2aa.1533558332.git.kshcherbatov@tarantool.org>
From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
Message-ID: <295310f0-1590-0b32-47f7-3231f925e19a@tarantool.org>
Date: Thu, 9 Aug 2018 01:03:13 +0300
MIME-Version: 1.0
In-Reply-To: <63cb256a0206e222ee10199e7d671700e44ce2aa.1533558332.git.kshcherbatov@tarantool.org>
Content-Type: text/plain; charset=utf-8; format=flowed
Content-Language: en-US
Content-Transfer-Encoding: 7bit
Sender: tarantool-patches-bounce@freelists.org
Errors-to: tarantool-patches-bounce@freelists.org
Reply-To: tarantool-patches@freelists.org
List-help: <mailto:ecartis@freelists.org?Subject=help>
List-unsubscribe: <tarantool-patches-request@freelists.org?Subject=unsubscribe>
List-software: Ecartis version 1.0.0
List-Id: tarantool-patches <tarantool-patches.freelists.org>
List-subscribe: <tarantool-patches-request@freelists.org?Subject=subscribe>
List-owner: <mailto:>
List-post: <mailto:tarantool-patches@freelists.org>
List-archive: <http://www.freelists.org/archives/tarantool-patches>
To: tarantool-patches@freelists.org, Kirill Shcherbatov <kshcherbatov@tarantool.org>

Thanks for the patch! See 8 comments below.

On 06/08/2018 15:27, Kirill Shcherbatov wrote:
> As we need to store user-defined JSON path in key_part
> and key_part_def, we have introduced path and path_len
> fields. JSON path is verified and transformed to canonical
> form on index msgpack unpack.
> Because of field names specified as format could be changed
> key_part path persisted in Tarantool should be always started
> with first-level field access via array index(not by name).
> 
> Part of #1012.
> ---
>   src/box/key_def.c    | 197 +++++++++++++++++++++++++++++++++++++++++++++++----
>   src/box/key_def.h    |  13 +++-
>   src/box/lua/space.cc |   5 ++
>   src/box/schema.cc    |   8 +--
>   src/box/vy_log.c     |   3 +-
>   5 files changed, 205 insertions(+), 21 deletions(-)
> 
> diff --git a/src/box/key_def.c b/src/box/key_def.c
> index 8a4262b..79e07f8 100644
> --- a/src/box/key_def.c
> +++ b/src/box/key_def.c
> @@ -103,7 +108,27 @@ key_def_dup(const struct key_def *src)
>   		return NULL;
>   	}
>   	memcpy(res, src, sz);
> +	uint32_t i = 0;
> +	for (; i < src->part_count; i++) {
> +		if (src->parts[i].path == NULL) {
> +			res->parts[i].path = NULL;
> +			continue;
> +		}
> +		char *path = strdup(src->parts[i].path);

1. Please, keep monolithic key_def memory layout.

> +		if (path == NULL) {
> +			diag_set(OutOfMemory, src->parts[i].path_len + 1,
> +				"strdup", "path");
> +			goto error;
> +		}
> +		res->parts[i].path = path;
> +	}
>   	return res;
> +
> +error:
> +	for (uint32_t j = 0; j < i; j++)
> +		free((void *)res->parts[j].path);
> +	free(res);
> +	return NULL;
>   }
>   
>   void
> @@ -241,6 +296,11 @@ key_part_cmp(const struct key_part *parts1, uint32_t part_count1,
>   		if (part1->is_nullable != part2->is_nullable)
>   			return part1->is_nullable <
>   			       part2->is_nullable ? -1 : 1;
> +		/* Lexicographic strings order. */
> +		uint32_t len = MIN(part1->path_len, part2->path_len);
> +		int rc = 0;
> +		if ((rc = strncmp(part1->path, part2->path, len)) != 0)

2. As I said you at least two times, strncmp treats non-equal strings
as the same one when the one is prefix of another. So this comparison
is not correct.

> +			return rc;
>   	}
>   	return part_count1 < part_count2 ? -1 : part_count1 > part_count2;
>   }
> @@ -445,8 +529,9 @@ key_def_decode_parts(struct key_part_def *parts, uint32_t part_count,
>   		return key_def_decode_parts_166(parts, part_count, data,
>   						fields, field_count);
>   	}
> +	struct key_part_def *part;
>   	for (uint32_t i = 0; i < part_count; i++) {
> -		struct key_part_def *part = &parts[i];

3. Just do ++part on the previous line.

> +		part = &parts[i];
>   		if (mp_typeof(**data) != MP_MAP) {
>   			diag_set(ClientError, ER_WRONG_INDEX_OPTIONS,
>   				 i + TUPLE_INDEX_BASE,
> @@ -473,8 +558,75 @@ key_def_decode_parts(struct key_part_def *parts, uint32_t part_count,
>   				 "string and scalar parts");
>   			return -1;
>   		}
> +		if (part->path != NULL) {
> +			struct region *region = &fiber()->gc;
> +			size_t path_len = strlen(part->path);
> +			struct json_path_parser parser;
> +			struct json_path_node node;
> +			json_path_parser_create(&parser, part->path, path_len);
> +			/*
> +			 * A worst-case scenario is .a -> ["a"]
> +			 * i.e. 3*path_len + 1 is enough.

4. As I understand, len of ".a" is 2, len of "['a']" is 5. So
it is actually 2*len + 1. Not 3*len + 1. It is not?

> +			 */
> +			uint32_t size = region_used(region);

5. You do not need to truncate the region yourself. This is done
later by commit/rollback. But you can reuse previous the region
allocations if store the region memory pointer out of this cycle
together size. And when a new path len * 2 + 1 > than the memory
size you have, you allocate more. Else you can reuse it. In the
best case it reduces number of allocations in part_count times,
in the worst one it is not worse than your way.

> +			char *path =
> +				region_alloc(region, 3 * path_len + 1);
> +			if (path == NULL) {
> +				diag_set(OutOfMemory, 3 * path_len + 1,
> +					"region_alloc", "path");
> +				return -1;
> +			}
> +			part->path = path;
> +			int rc = json_path_next(&parser, &node);
> +			if (rc != 0)
> +				goto error_invalid_json;
> +			if (node.type != JSON_PATH_NUM) {
> +				diag_set(ClientError, ER_WRONG_INDEX_OPTIONS,
> +					part->fieldno,
> +					"invalid JSON path: first part should "
> +					"be defined as array");

6. As array index. Not array.

> +				return -1;
> +			}
> +			if (node.num - TUPLE_INDEX_BASE != part->fieldno) {
> +				diag_set(ClientError, ER_WRONG_INDEX_OPTIONS,
> +					part->fieldno,
> +					"invalid JSON path: first part refers "
> +					"to invalid field");
> +				return -1;
> +			}
> +			uint32_t lexemes = 0;
> +			do {
> +				if (node.type == JSON_PATH_NUM) {
> +					path += sprintf(path, "[%u]",
> +						(unsigned)node.num);
> +				} else if (node.type == JSON_PATH_STR) {
> +					path += sprintf(path, "[\"%.*s\"]",
> +							node.len, node.str);
> +				} else {
> +					unreachable();
> +				}
> +				lexemes++;
> +			} while ((rc = json_path_next(&parser, &node)) == 0 &&
> +				 node.type != JSON_PATH_END);
> +			if (rc != 0 || node.type != JSON_PATH_END)
> +				goto error_invalid_json;
> +			/* JSON index is useless. */
> +			if (lexemes == 1) {
> +				region_truncate(region, size);
> +				part->path = NULL;
> +			} else {
> +				region_truncate(region,
> +						size + (path - part->path + 1));
> +			}
> +		}

7. Please, reduce indentation level. You can check
'if part->path == NULL then continue' and move this
code block 1 tab left. Even now you can join some of
lines into one.

>   	}
>   	return 0;
> +
> +error_invalid_json:
> +	diag_set(ClientError, ER_WRONG_INDEX_OPTIONS,
> +		 part->fieldno + TUPLE_INDEX_BASE,
> +		 "invalid JSON path: path has invalid structure");

8. Please, write on which position the error occurred.

> +	return -1;
>   }
>   
>   in