From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Date: Tue, 26 Feb 2019 19:58:32 +0300 From: Vladimir Davydov Subject: Re: [PATCH v1 1/1] lib: introduce a new JSON_TOKEN_ANY json token Message-ID: <20190226165832.jvwc55ctxj5zhpgt@esperanza> References: <51b08c09ebe40d52bc3b6b6b63755930040500f6.1550571905.git.kshcherbatov@tarantool.org> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <51b08c09ebe40d52bc3b6b6b63755930040500f6.1550571905.git.kshcherbatov@tarantool.org> To: Kirill Shcherbatov Cc: tarantool-patches@freelists.org List-ID: On Tue, Feb 19, 2019 at 01:31:43PM +0300, Kirill Shcherbatov wrote: > Introduced a new JSON_TOKEN_ANY json token that makes possible to > perform anonymous lookup in marked tree nodes. This feature is > required to implement multikey indexes. > Since the token entered into the parser becomes available to user, > additional server-side check is introduced so that an error > occurs when trying to create a multikey index. > > Needed for #1260 > > With the introduction of multikey indexes, the key_parts_are_compatible > function will not change significantly: here > https://gist.github.com/kshcherbatov/c0e48138fc16678ad9a82646f00c1881 > you can see how the check is made that the multikey indexes are compatible > with each other. > > https://github.com/tarantool/tarantool/tree/kshch/gh-1257-multikey-index-json-any-token > https://github.com/tarantool/tarantool/issues/1257 > --- > src/box/index_def.c | 60 +++++++++++++++++++++++++++++++++------ > src/lib/json/json.c | 38 +++++++++++++++++++------ > src/lib/json/json.h | 21 ++++++++++++-- > test/engine/json.result | 21 ++++++++++++++ > test/engine/json.test.lua | 8 ++++++ > test/unit/json.c | 52 +++++++++++++++++++++++++++++---- > test/unit/json.result | 10 +++++-- > 7 files changed, 184 insertions(+), 26 deletions(-) > > diff --git a/src/box/index_def.c b/src/box/index_def.c > index 6c37f9f1d..0f99f1759 100644 > --- a/src/box/index_def.c > +++ b/src/box/index_def.c > @@ -244,6 +244,56 @@ index_def_cmp(const struct index_def *key1, const struct index_def *key2) > key2->key_def->parts, key2->key_def->part_count); > } > > +/** > + * Test whether key_parts a and b are compatible: > + * + field numbers are differ OR > + * + json paths are differ > + * Also perform integrity check: parts must not define multikey > + * index. > + */ > +static bool > +key_parts_are_compatible(struct key_part *a, struct key_part *b, > + struct index_def *index_def, const char *space_name) > +{ > + if (a->fieldno != b->fieldno) > + return true; > + struct json_lexer lexer_a, lexer_b; > + json_lexer_create(&lexer_a, a->path, a->path_len, TUPLE_INDEX_BASE); > + json_lexer_create(&lexer_b, b->path, b->path_len, TUPLE_INDEX_BASE); > + struct json_token token_a, token_b; > + /* For the sake of json_token_cmp(). */ > + token_a.parent = NULL; > + token_b.parent = NULL; Violates encapsulation :-/ > + int a_rc, b_rc; > + int token_idx = 0, differ_token_idx = 0; > + int a_multikey_rank = 0, b_multikey_rank = 0; > + while ((a_rc = json_lexer_next_token(&lexer_a, &token_a)) == 0 && > + (b_rc = json_lexer_next_token(&lexer_b, &token_b)) == 0 && > + token_a.type != JSON_TOKEN_END && > + token_b.type != JSON_TOKEN_END) { What if a->path is longer than b->path? > + token_idx++; > + if (differ_token_idx == 0) { > + if (json_token_cmp(&token_a, &token_b) != 0) > + differ_token_idx = token_idx; > + } > + if (token_a.type == JSON_TOKEN_ANY) > + a_multikey_rank = token_idx; > + if (token_b.type == JSON_TOKEN_ANY) > + b_multikey_rank = token_idx; > + } > + if (a_multikey_rank > 0 || b_multikey_rank > 0) { > + diag_set(ClientError, ER_MODIFY_INDEX, > + index_def->name, space_name, > + "multikey index feature is not supported yet"); Should be ER_UNSUPPORTED and should live in check_index_def engine callbacks. This implies that we should set has_multikey somewhere in key_def_new. Probably, this check should be moved there as well - after all, it's about key def, not index def. Anyway, I don't think that this should be a business of this particular patch. Let's start with patching json library only and add a simple stub that raises error on any * in key_def. > + return false; > + } > + if (differ_token_idx > 0 || token_b.type != token_a.type) > + return true; > + diag_set(ClientError, ER_MODIFY_INDEX, index_def->name, space_name, > + "same key part is indexed twice"); > + return false; > +} > + > bool > index_def_is_valid(struct index_def *index_def, const char *space_name) > > @@ -282,15 +332,9 @@ index_def_is_valid(struct index_def *index_def, const char *space_name) > */ The comment right above this line is obsoleted by this patch. > struct key_part *part_a = &index_def->key_def->parts[i]; > struct key_part *part_b = &index_def->key_def->parts[j]; > - if (part_a->fieldno == part_b->fieldno && > - json_path_cmp(part_a->path, part_a->path_len, > - part_b->path, part_b->path_len, > - TUPLE_INDEX_BASE) == 0) { > - diag_set(ClientError, ER_MODIFY_INDEX, > - index_def->name, space_name, > - "same key part is indexed twice"); > + if (!key_parts_are_compatible(part_a, part_b, index_def, > + space_name)) > return false; > - } > } > } > return true; > diff --git a/src/lib/json/json.c b/src/lib/json/json.c > index 1b1a3ec2c..019917825 100644 > --- a/src/lib/json/json.c > +++ b/src/lib/json/json.c > @@ -146,18 +146,25 @@ json_parse_integer(struct json_lexer *lexer, struct json_token *token) > int len = 0; > int value = 0; > char c = *pos; > - if (! isdigit(c)) > - return lexer->symbol_count + 1; > + if (!isdigit(c)) { > + if (c != '*') > + return lexer->symbol_count + 1; > + token->type = JSON_TOKEN_ANY; > + ++len; > + ++pos; > + goto end; > + } This should live in json_lexer_next_token, not in json_parse_integer, because * is not an integer. > do { > value = value * 10 + c - (int)'0'; > ++len; > } while (++pos < end && isdigit((c = *pos))); > if (value < lexer->index_base) > return lexer->symbol_count + 1; > - lexer->offset += len; > - lexer->symbol_count += len; > token->type = JSON_TOKEN_NUM; > token->num = value - lexer->index_base; > +end: > + lexer->offset += len; > + lexer->symbol_count += len; > return 0; > } > > diff --git a/src/lib/json/json.h b/src/lib/json/json.h > index 66cddd026..ff4ab7d91 100644 > --- a/src/lib/json/json.h > +++ b/src/lib/json/json.h > @@ -66,6 +66,7 @@ struct json_lexer { > enum json_token_type { > JSON_TOKEN_NUM, > JSON_TOKEN_STR, > + JSON_TOKEN_ANY, > /** Lexer reached end of path. */ > JSON_TOKEN_END, > }; > @@ -113,6 +114,10 @@ struct json_token { > * a JSON tree root. > */ > int sibling_idx; > + /** > + * True when it has the only child token JSON_TOKEN_ANY. > + */ > + bool is_multikey; I don't think that we really need this flag, because we can introduce a simple helper function instead: bool json_token_is_multikey(token) { return token->max_child_idx == 0 && token->children[0].type == JSON_TOKEN_ANY; } Also, please update the comment to children array to point out what it stores in case of a multikey token. > /** > * Hash value of the token. Used for lookups in a JSON tree. > * For more details, see the comment to json_tree::hash. > @@ -236,6 +241,12 @@ json_lexer_create(struct json_lexer *lexer, const char *src, int src_len, > int > json_lexer_next_token(struct json_lexer *lexer, struct json_token *token); > > +/** > + * Compare JSON token keys. > + */ > +int > +json_token_cmp(const struct json_token *a, const struct json_token *b); > + > /** > * Compare two JSON paths using Lexer class. > * - in case of paths that have same token-sequence prefix, > @@ -307,10 +318,16 @@ json_tree_lookup(struct json_tree *tree, struct json_token *parent, > const struct json_token *token) > { > struct json_token *ret = NULL; > + if (unlikely(parent->is_multikey && > + (token->type == JSON_TOKEN_NUM || > + token->type == JSON_TOKEN_ANY))) { I don't think there's much point in checking token->type here. We can return ANY for JSON_TOKEN_STR as well. Moreover, this would look more logical IMO. > + assert(parent->max_child_idx == 0); > + return parent->children[0]; > + } > if (likely(token->type == JSON_TOKEN_NUM)) { > - ret = (int)token->num < parent->children_capacity ? > + ret = token->num <= parent->max_child_idx ? > parent->children[token->num] : NULL; > - } else { > + } else if (token->type == JSON_TOKEN_STR) { > ret = json_tree_lookup_slowpath(tree, parent, token); > } > return ret; If we look up ANY, shouldn't we be given any token rather than NULL?