[PATCH v1 1/1] lib: introduce a new JSON_TOKEN_ANY json token
Vladimir Davydov
vdavydov.dev at gmail.com
Tue Feb 26 19:58:32 MSK 2019
On Tue, Feb 19, 2019 at 01:31:43PM +0300, Kirill Shcherbatov wrote:
> Introduced a new JSON_TOKEN_ANY json token that makes possible to
> perform anonymous lookup in marked tree nodes. This feature is
> required to implement multikey indexes.
> Since the token entered into the parser becomes available to user,
> additional server-side check is introduced so that an error
> occurs when trying to create a multikey index.
>
> Needed for #1260
>
> With the introduction of multikey indexes, the key_parts_are_compatible
> function will not change significantly: here
> https://gist.github.com/kshcherbatov/c0e48138fc16678ad9a82646f00c1881
> you can see how the check is made that the multikey indexes are compatible
> with each other.
>
> https://github.com/tarantool/tarantool/tree/kshch/gh-1257-multikey-index-json-any-token
> https://github.com/tarantool/tarantool/issues/1257
> ---
> src/box/index_def.c | 60 +++++++++++++++++++++++++++++++++------
> src/lib/json/json.c | 38 +++++++++++++++++++------
> src/lib/json/json.h | 21 ++++++++++++--
> test/engine/json.result | 21 ++++++++++++++
> test/engine/json.test.lua | 8 ++++++
> test/unit/json.c | 52 +++++++++++++++++++++++++++++----
> test/unit/json.result | 10 +++++--
> 7 files changed, 184 insertions(+), 26 deletions(-)
>
> diff --git a/src/box/index_def.c b/src/box/index_def.c
> index 6c37f9f1d..0f99f1759 100644
> --- a/src/box/index_def.c
> +++ b/src/box/index_def.c
> @@ -244,6 +244,56 @@ index_def_cmp(const struct index_def *key1, const struct index_def *key2)
> key2->key_def->parts, key2->key_def->part_count);
> }
>
> +/**
> + * Test whether key_parts a and b are compatible:
> + * + field numbers are differ OR
> + * + json paths are differ
> + * Also perform integrity check: parts must not define multikey
> + * index.
> + */
> +static bool
> +key_parts_are_compatible(struct key_part *a, struct key_part *b,
> + struct index_def *index_def, const char *space_name)
> +{
> + if (a->fieldno != b->fieldno)
> + return true;
> + struct json_lexer lexer_a, lexer_b;
> + json_lexer_create(&lexer_a, a->path, a->path_len, TUPLE_INDEX_BASE);
> + json_lexer_create(&lexer_b, b->path, b->path_len, TUPLE_INDEX_BASE);
> + struct json_token token_a, token_b;
> + /* For the sake of json_token_cmp(). */
> + token_a.parent = NULL;
> + token_b.parent = NULL;
Violates encapsulation :-/
> + int a_rc, b_rc;
> + int token_idx = 0, differ_token_idx = 0;
> + int a_multikey_rank = 0, b_multikey_rank = 0;
> + while ((a_rc = json_lexer_next_token(&lexer_a, &token_a)) == 0 &&
> + (b_rc = json_lexer_next_token(&lexer_b, &token_b)) == 0 &&
> + token_a.type != JSON_TOKEN_END &&
> + token_b.type != JSON_TOKEN_END) {
What if a->path is longer than b->path?
> + token_idx++;
> + if (differ_token_idx == 0) {
> + if (json_token_cmp(&token_a, &token_b) != 0)
> + differ_token_idx = token_idx;
> + }
> + if (token_a.type == JSON_TOKEN_ANY)
> + a_multikey_rank = token_idx;
> + if (token_b.type == JSON_TOKEN_ANY)
> + b_multikey_rank = token_idx;
> + }
> + if (a_multikey_rank > 0 || b_multikey_rank > 0) {
> + diag_set(ClientError, ER_MODIFY_INDEX,
> + index_def->name, space_name,
> + "multikey index feature is not supported yet");
Should be ER_UNSUPPORTED and should live in check_index_def engine
callbacks. This implies that we should set has_multikey somewhere in
key_def_new. Probably, this check should be moved there as well - after
all, it's about key def, not index def.
Anyway, I don't think that this should be a business of this particular
patch. Let's start with patching json library only and add a simple stub
that raises error on any * in key_def.
> + return false;
> + }
> + if (differ_token_idx > 0 || token_b.type != token_a.type)
> + return true;
> + diag_set(ClientError, ER_MODIFY_INDEX, index_def->name, space_name,
> + "same key part is indexed twice");
> + return false;
> +}
> +
> bool
> index_def_is_valid(struct index_def *index_def, const char *space_name)
>
> @@ -282,15 +332,9 @@ index_def_is_valid(struct index_def *index_def, const char *space_name)
> */
The comment right above this line is obsoleted by this patch.
> struct key_part *part_a = &index_def->key_def->parts[i];
> struct key_part *part_b = &index_def->key_def->parts[j];
> - if (part_a->fieldno == part_b->fieldno &&
> - json_path_cmp(part_a->path, part_a->path_len,
> - part_b->path, part_b->path_len,
> - TUPLE_INDEX_BASE) == 0) {
> - diag_set(ClientError, ER_MODIFY_INDEX,
> - index_def->name, space_name,
> - "same key part is indexed twice");
> + if (!key_parts_are_compatible(part_a, part_b, index_def,
> + space_name))
> return false;
> - }
> }
> }
> return true;
> diff --git a/src/lib/json/json.c b/src/lib/json/json.c
> index 1b1a3ec2c..019917825 100644
> --- a/src/lib/json/json.c
> +++ b/src/lib/json/json.c
> @@ -146,18 +146,25 @@ json_parse_integer(struct json_lexer *lexer, struct json_token *token)
> int len = 0;
> int value = 0;
> char c = *pos;
> - if (! isdigit(c))
> - return lexer->symbol_count + 1;
> + if (!isdigit(c)) {
> + if (c != '*')
> + return lexer->symbol_count + 1;
> + token->type = JSON_TOKEN_ANY;
> + ++len;
> + ++pos;
> + goto end;
> + }
This should live in json_lexer_next_token, not in json_parse_integer,
because * is not an integer.
> do {
> value = value * 10 + c - (int)'0';
> ++len;
> } while (++pos < end && isdigit((c = *pos)));
> if (value < lexer->index_base)
> return lexer->symbol_count + 1;
> - lexer->offset += len;
> - lexer->symbol_count += len;
> token->type = JSON_TOKEN_NUM;
> token->num = value - lexer->index_base;
> +end:
> + lexer->offset += len;
> + lexer->symbol_count += len;
> return 0;
> }
>
> diff --git a/src/lib/json/json.h b/src/lib/json/json.h
> index 66cddd026..ff4ab7d91 100644
> --- a/src/lib/json/json.h
> +++ b/src/lib/json/json.h
> @@ -66,6 +66,7 @@ struct json_lexer {
> enum json_token_type {
> JSON_TOKEN_NUM,
> JSON_TOKEN_STR,
> + JSON_TOKEN_ANY,
> /** Lexer reached end of path. */
> JSON_TOKEN_END,
> };
> @@ -113,6 +114,10 @@ struct json_token {
> * a JSON tree root.
> */
> int sibling_idx;
> + /**
> + * True when it has the only child token JSON_TOKEN_ANY.
> + */
> + bool is_multikey;
I don't think that we really need this flag, because we can introduce
a simple helper function instead:
bool json_token_is_multikey(token)
{
return token->max_child_idx == 0 &&
token->children[0].type == JSON_TOKEN_ANY;
}
Also, please update the comment to children array to point out what it
stores in case of a multikey token.
> /**
> * Hash value of the token. Used for lookups in a JSON tree.
> * For more details, see the comment to json_tree::hash.
> @@ -236,6 +241,12 @@ json_lexer_create(struct json_lexer *lexer, const char *src, int src_len,
> int
> json_lexer_next_token(struct json_lexer *lexer, struct json_token *token);
>
> +/**
> + * Compare JSON token keys.
> + */
> +int
> +json_token_cmp(const struct json_token *a, const struct json_token *b);
> +
> /**
> * Compare two JSON paths using Lexer class.
> * - in case of paths that have same token-sequence prefix,
> @@ -307,10 +318,16 @@ json_tree_lookup(struct json_tree *tree, struct json_token *parent,
> const struct json_token *token)
> {
> struct json_token *ret = NULL;
> + if (unlikely(parent->is_multikey &&
> + (token->type == JSON_TOKEN_NUM ||
> + token->type == JSON_TOKEN_ANY))) {
I don't think there's much point in checking token->type here.
We can return ANY for JSON_TOKEN_STR as well. Moreover, this would
look more logical IMO.
> + assert(parent->max_child_idx == 0);
> + return parent->children[0];
> + }
> if (likely(token->type == JSON_TOKEN_NUM)) {
> - ret = (int)token->num < parent->children_capacity ?
> + ret = token->num <= parent->max_child_idx ?
> parent->children[token->num] : NULL;
> - } else {
> + } else if (token->type == JSON_TOKEN_STR) {
> ret = json_tree_lookup_slowpath(tree, parent, token);
> }
> return ret;
If we look up ANY, shouldn't we be given any token rather than NULL?
More information about the Tarantool-patches
mailing list