From: Vladimir Davydov <vdavydov.dev@gmail.com> To: Kirill Shcherbatov <kshcherbatov@tarantool.org> Cc: tarantool-patches@freelists.org, kostja@tarantool.org Subject: Re: [PATCH v8 3/5] box: introduce JSON Indexes Date: Wed, 23 Jan 2019 16:49:08 +0300 [thread overview] Message-ID: <20190123134908.tfpakeamx7d2q7iy@esperanza> (raw) In-Reply-To: <4c8bd7e8682e577ed8d49f92e231077924f4fe53.1547645795.git.kshcherbatov@tarantool.org> On Wed, Jan 16, 2019 at 04:44:41PM +0300, Kirill Shcherbatov wrote: > diff --git a/src/box/key_def.c b/src/box/key_def.c > index dae3580e2..0ed497dfc 100644 > --- a/src/box/key_def.c > +++ b/src/box/key_def.c > @@ -485,6 +568,14 @@ key_def_decode_parts(struct key_part_def *parts, uint32_t part_count, > "index part: unknown sort order"); > return -1; > } > + if (part->path != NULL && > + json_path_validate(part->path, strlen(part->path), > + TUPLE_INDEX_BASE) != 0) { > + diag_set(ClientError, ER_WRONG_INDEX_OPTIONS, > + part->fieldno + TUPLE_INDEX_BASE, The code above passes (i + TUPLE_INDEX_BASE) to diag_set. Please fix this place to conform. > + "invalid path"); > + return -1; > + } > } > return 0; > } > diff --git a/src/box/key_def.h b/src/box/key_def.h > index d1866303b..fe4acffb5 100644 > --- a/src/box/key_def.h > +++ b/src/box/key_def.h > @@ -299,11 +320,12 @@ key_def_encode_parts(char *data, const struct key_part_def *parts, > * [NUM, STR, ..][NUM, STR, ..].., > * OR > * {field=NUM, type=STR, ..}{field=NUM, type=STR, ..}.., > + * The region is used for allocating JSON paths, if any. Extra space before "The region is used ...". Makes the comment difficult to read. > */ > int > key_def_decode_parts(struct key_part_def *parts, uint32_t part_count, > const char **data, const struct field_def *fields, > - uint32_t field_count); > + uint32_t field_count, struct region *region); > > /** > * Returns the part in index_def->parts for the specified fieldno. > diff --git a/src/box/tuple_format.c b/src/box/tuple_format.c > index 61b4c9734..9af9c307a 100644 > --- a/src/box/tuple_format.c > +++ b/src/box/tuple_format.c > @@ -69,9 +69,86 @@ static const char * > tuple_field_path(const struct tuple_field *field) > { > assert(field->token.parent != NULL); > - assert(field->token.parent->parent == NULL); > - assert(field->token.type == JSON_TOKEN_NUM); > - return int2str(field->token.num + TUPLE_INDEX_BASE); > + if (field->token.parent->parent == NULL) { > + /* Top-level field, no need to format the path. */ > + return int2str(field->token.num + TUPLE_INDEX_BASE); > + } > + char *path = tt_static_buf(); > + json_tree_snprint_path(path, TT_STATIC_BUF_LEN, &field->token, > + TUPLE_INDEX_BASE); > + return path; > +} > + > +/** > + * Given a field number and a path, add the corresponding tuple > + * field to the tuple format, allocating intermediate fields if > + * necessary. > + * Return a pointer to the leaf field on success, NULL on memory They usually add an empty line as a paragraph delimiter - text looks more readable that way. > + * allocation error or type/nullability mistmatch error, diag > + * message is set. > + */ > +static struct tuple_field * > +tuple_format_add_field(struct tuple_format *format, uint32_t fieldno, > + const char *path, uint32_t path_len) > +{ > + struct tuple_field *field = NULL; > + struct tuple_field *parent = tuple_format_field(format, fieldno); assert(parent != NULL); would be useful here to show the reader that top-level fields have already been created someplace else. > + if (path_len == 0) > + goto end; Better 'return parent;' - we don't need to call tuple_field_delete() in this case. > + field = tuple_field_new(); > + if (field == NULL) > + goto fail; > + > + int rc = 0; > + uint32_t token_count = 0; > + struct json_tree *tree = &format->fields; > + struct json_lexer lexer; > + json_lexer_create(&lexer, path, path_len, TUPLE_INDEX_BASE); > + while ((rc = json_lexer_next_token(&lexer, &field->token)) == 0 && > + field->token.type != JSON_TOKEN_END) { > + enum field_type expected_type = > + field->token.type == JSON_TOKEN_STR ? > + FIELD_TYPE_MAP : FIELD_TYPE_ARRAY; > + if (field_type1_contains_type2(parent->type, expected_type)) { > + parent->type = expected_type; > + } else { > + diag_set(ClientError, ER_INDEX_PART_TYPE_MISMATCH, > + tuple_field_path(parent), > + field_type_strs[parent->type], > + field_type_strs[expected_type]); > + goto fail; > + } > + struct tuple_field *next = > + json_tree_lookup_entry(tree, &parent->token, > + &field->token, > + struct tuple_field, token); > + if (next == NULL) { > + field->id = format->total_field_count++; > + rc = json_tree_add(tree, &parent->token, &field->token); > + if (rc != 0) { > + diag_set(OutOfMemory, sizeof(struct json_token), > + "json_tree_add", "tree"); > + goto fail; > + } > + next = field; > + field = tuple_field_new(); > + if (field == NULL) > + goto fail; > + } > + parent = next; > + token_count++; > + } > + /* Path has been verified by key_def_decode_parts. */ > + assert(rc == 0 && field->token.type == JSON_TOKEN_END); > + assert(parent != NULL); > + /* Update tree depth information. */ > + format->fields_depth = MAX(format->fields_depth, token_count + 1); > +end: > + tuple_field_delete(field); > + return parent; > +fail: > + parent = NULL; > + goto end; > } > > /** > @@ -95,15 +172,26 @@ tuple_format_field_by_id(struct tuple_format *format, uint32_t id) > static int > tuple_format_use_key_part(struct tuple_format *format, uint32_t field_count, > const struct key_part *part, bool is_sequential, > - int *current_slot) > + int *current_slot, char **path_pool) > { > assert(part->fieldno < tuple_format_field_count(format)); > - struct tuple_field *field = tuple_format_field(format, part->fieldno); > + const char *path = *path_pool; > + if (part->path != NULL) { > + /** > + * Copy JSON path data to reserved area at the > + * end of format allocation. > + */ > + memcpy(*path_pool, part->path, part->path_len); > + *path_pool += part->path_len; > + } This piece should be moved inside tuple_format_add_field(). > + struct tuple_field *field = tuple_format_add_field(format, part->fieldno, > + path, part->path_len); > + if (field == NULL) > + return -1; > /* > - * If a field is not present in the space format, > - * inherit nullable action of the first key part > - * referencing it. > - */ > + * If a field is not present in the space format, inherit > + * nullable action of the first key part referencing it. > + */ > if (part->fieldno >= field_count && !field->is_key_part) > field->nullable_action = part->nullable_action; > /* > @@ -512,49 +659,79 @@ tuple_init_field_map(struct tuple_format *format, uint32_t *field_map, > /* Empty tuple, nothing to do. */ > goto skip; > } > - /* first field is simply accessible, so we do not store offset to it */ > - struct tuple_field *field = tuple_format_field(format, 0); > - if (validate && > - !field_mp_type_is_compatible(field->type, mp_typeof(*pos), > - tuple_field_is_nullable(field))) { > - diag_set(ClientError, ER_FIELD_TYPE, tuple_field_path(field), > - field_type_strs[field->type]); > - goto error; > - } > - if (required_fields != NULL) > - bit_clear(required_fields, field->id); > - mp_next(&pos); > - /* other fields...*/ > - uint32_t i = 1; > uint32_t defined_field_count = MIN(field_count, validate ? > tuple_format_field_count(format) : > format->index_field_count); > - if (field_count < format->index_field_count) { > - /* > - * Nullify field map to be able to detect by 0, > - * which key fields are absent in tuple_field(). > - */ > - memset((char *)field_map - format->field_map_size, 0, > - format->field_map_size); > - } > - for (; i < defined_field_count; ++i) { > - field = tuple_format_field(format, i); > - if (validate && > - !field_mp_type_is_compatible(field->type, mp_typeof(*pos), > - tuple_field_is_nullable(field))) { > - diag_set(ClientError, ER_FIELD_TYPE, > - tuple_field_path(field), > - field_type_strs[field->type]); > - goto error; > + /* > + * Nullify field map to be able to detect by 0, > + * which key fields are absent in tuple_field(). > + */ > + memset((char *)field_map - format->field_map_size, 0, > + format->field_map_size); > + uint32_t mp_frames_sz = format->fields_depth * sizeof(struct mp_frame); > + struct mp_frame *mp_frames = region_alloc(region, mp_frames_sz); > + if (mp_frames == NULL) { > + diag_set(OutOfMemory, mp_frames_sz, "region", "frames"); > + goto error; > + } > + struct mp_stack mp_stack; > + mp_stack_init(&mp_stack, format->fields_depth, mp_frames); Call them simply 'stack' and 'frames'. > + mp_stack_push(&mp_stack, MP_ARRAY, defined_field_count); > + struct tuple_field *field; > + struct json_token *parent = &format->fields.root; > + while (1) { > + while (mp_stack_advance(&mp_stack)) { > + mp_stack_pop(&mp_stack); > + if (mp_stack_is_empty(&mp_stack)) > + goto skip; > + parent = parent->parent; > + } > + struct json_token token; > + token.type = mp_stack_top(&mp_stack)->type == MP_ARRAY ? > + JSON_TOKEN_NUM : JSON_TOKEN_STR; > + if (token.type == JSON_TOKEN_NUM) { > + token.num = mp_stack_top(&mp_stack)->curr - 1; I don't like that frame->curr index is in fact 1-based. Let's initialize frame->curr with -1 (or UINT32_MAX) in mp_stack_push() so that the first mp_stack_advance() called after mp_stack_push() would set it to 0. Also, what about changing the API slightly? mp_stack_advance() -> item index (>= 0) on success -1 if all items have been processed. Then we wouldn't really need to access mp_frame to get 'curr' so we could replace mp_stack_top() with mp_stack_type() => MP_MAP/MP_ARRAY. Not sure if it's a really good idea though. The code would look more concise, but need to check. > + } else if (token.type == JSON_TOKEN_STR) { > + if (mp_typeof(*pos) != MP_STR) { > + /* Skip key. */ > + mp_next(&pos); > + mp_next(&pos); > + continue; > + } > + token.str = mp_decode_str(&pos, (uint32_t *)&token.len); > + } else { > + unreachable(); > + } The code would look better IMO if you rewrote it without double-if (first check frame->type to initialize token.type, then check token->type to initialize token.key): switch (mp_stack_top(&stack)->type) { case MP_MAP: token.type = JSON_TOKEN_STR; token.str = ... break; case MP_ARRAY: token.type = JSON_TOKEN_NUM; token.num = ... default: unreachable(); } > + enum mp_type type = mp_typeof(*pos); > + assert(parent != NULL); > + field = json_tree_lookup_entry(&format->fields, parent, &token, > + struct tuple_field, token); > + if (field != NULL) { > + bool is_nullable = tuple_field_is_nullable(field); > + if (validate && > + !field_mp_type_is_compatible(field->type, type, > + is_nullable) != 0) { > + diag_set(ClientError, ER_FIELD_TYPE, > + tuple_field_path(field), > + field_type_strs[field->type]); > + goto error; > + } > + if (field->offset_slot != TUPLE_OFFSET_SLOT_NIL) > + field_map[field->offset_slot] = pos - tuple; > + if (required_fields != NULL) > + bit_clear(required_fields, field->id); > } > - if (field->offset_slot != TUPLE_OFFSET_SLOT_NIL) { > - field_map[field->offset_slot] = > - (uint32_t) (pos - tuple); > + if ((type == MP_ARRAY || type == MP_MAP) && > + !mp_stack_is_full(&mp_stack) && field != NULL) { > + uint32_t size = type == MP_ARRAY ? > + mp_decode_array(&pos) : > + mp_decode_map(&pos); > + mp_stack_push(&mp_stack, type, size); > + parent = &field->token; > + } else { > + mp_next(&pos); > } > - if (required_fields != NULL) > - bit_clear(required_fields, field->id); > - mp_next(&pos); > - } > + }; The semicolon (;) isn't needed. > diff --git a/src/box/vy_point_lookup.c b/src/box/vy_point_lookup.c > index ddbc2d46f..23b7a5869 100644 > --- a/src/box/vy_point_lookup.c > +++ b/src/box/vy_point_lookup.c > @@ -196,7 +196,8 @@ vy_point_lookup(struct vy_lsm *lsm, struct vy_tx *tx, > const struct vy_read_view **rv, > struct tuple *key, struct tuple **ret) > { > - assert(tuple_field_count(key) >= lsm->cmp_def->part_count); > + assert(tuple_field_count(key) >= lsm->cmp_def->part_count || > + lsm->cmp_def->has_json_paths); This isn't how this assertion should be fixed - instead we should check tuple_field_count for keys only (i.e. SELECT statements): /* All key parts must be set for a point lookup. */ assert(vy_stmt_type(key) != IPROTO_SELECT || tuple_field_count(key) >= lsm->cmp_def->part_count); > > diff --git a/test/engine/json.test.lua b/test/engine/json.test.lua > new file mode 100644 > index 000000000..a876ff1ed > --- /dev/null > +++ b/test/engine/json.test.lua > @@ -0,0 +1,131 @@ > +test_run = require('test_run').new() > +engine = test_run:get_cfg('engine') > +-- > +-- gh-1012: Indexes for JSON-defined paths. > +-- > +s = box.schema.space.create('withdata', {engine = engine}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 'FIO["fname"]'}, {3, 'str', path = '["FIO"].fname'}}}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 666}, {3, 'str', path = '["FIO"]["fname"]'}}}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'map', path = 'FIO'}}}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'array', path = '[1]'}}}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 'FIO'}, {3, 'str', path = 'FIO.fname'}}}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '[1].sname'}, {3, 'str', path = '["FIO"].fname'}}}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 'FIO....fname'}}}) > +idx = s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 'FIO.fname', is_nullable = false}, {3, 'str', path = '["FIO"]["sname"]'}}}) > +idx ~= nil > +idx.parts[2].path == 'FIO.fname' > +format = {{'id', 'unsigned'}, {'meta', 'unsigned'}, {'data', 'array'}, {'age', 'unsigned'}, {'level', 'unsigned'}} > +s:format(format) > +format = {{'id', 'unsigned'}, {'meta', 'unsigned'}, {'data', 'map'}, {'age', 'unsigned'}, {'level', 'unsigned'}} > +s:format(format) > +s:create_index('test2', {parts = {{2, 'number'}, {3, 'number', path = 'FIO.fname'}, {3, 'str', path = '["FIO"]["sname"]'}}}) > +s:insert{7, 7, {town = 'London', FIO = 666}, 4, 5} > +s:insert{7, 7, {town = 'London', FIO = {fname = 666, sname = 'Bond'}}, 4, 5} > +s:insert{7, 7, {town = 'London', FIO = {fname = "James"}}, 4, 5} > +s:insert{7, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond'}}, 4, 5} > +s:insert{7, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond'}}, 4, 5} > +s:insert{7, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond', data = "extra"}}, 4, 5} > +s:insert{7, 7, {town = 'Moscow', FIO = {fname = 'Max', sname = 'Isaev', data = "extra"}}, 4, 5} > +idx:select() > +idx:min() > +idx:max() > +s:drop() > + > +s = box.schema.create_space('withdata', {engine = engine}) > +parts = {} > +parts[1] = {1, 'unsigned', path='[2]'} > +pk = s:create_index('pk', {parts = parts}) > +s:insert{{1, 2}, 3} > +s:upsert({{box.null, 2}}, {{'+', 2, 5}}) > +s:get(2) > +s:drop() > + > +-- Create index on space with data > +s = box.schema.space.create('withdata', {engine = engine}) > +pk = s:create_index('primary', { type = 'tree' }) > +s:insert{1, 7, {town = 'London', FIO = 1234}, 4, 5} > +s:insert{2, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond'}}, 4, 5} > +s:insert{3, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond'}}, 4, 5} > +s:insert{4, 7, {town = 'London', FIO = {1,2,3}}, 4, 5} > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]'}, {3, 'str', path = '["FIO"]["sname"]'}}}) > +_ = s:delete(1) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]'}, {3, 'str', path = '["FIO"]["sname"]'}}}) > +_ = s:delete(2) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]'}, {3, 'str', path = '["FIO"]["sname"]'}}}) > +_ = s:delete(4) > +idx = s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]', is_nullable = true}, {3, 'str', path = '["FIO"]["sname"]'}, {3, 'str', path = '["FIO"]["extra"]', is_nullable = true}}}) > +idx ~= nil > +s:create_index('test2', {parts = {{2, 'number'}, {3, 'number', path = '["FIO"]["fname"]'}}}) > +idx2 = s:create_index('test2', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]'}}}) > +idx2 ~= nil > +t = s:insert{5, 7, {town = 'Matrix', FIO = {fname = 'Agent', sname = 'Smith'}}, 4, 5} > +-- Test field_map in tuple speed-up access by indexed path. > +t["[3][\"FIO\"][\"fname\"]"] > +idx:select() > +idx:min() > +idx:max() > +idx:drop() > +s:drop() > + > +-- Test complex JSON indexes > +s = box.schema.space.create('withdata', {engine = engine}) > +parts = {} > +parts[1] = {1, 'str', path='[3][2].a'} > +parts[2] = {1, 'unsigned', path = '[3][1]'} > +parts[3] = {2, 'str', path = '[2].d[1]'} > +pk = s:create_index('primary', { type = 'tree', parts = parts}) > +s:insert{{1, 2, {3, {3, a = 'str', b = 5}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6, {1, 2, 3}} > +s:insert{{1, 2, {3, {a = 'str', b = 1}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6} > +parts = {} > +parts[1] = {4, 'unsigned', path='[1]', is_nullable = false} > +parts[2] = {4, 'unsigned', path='[2]', is_nullable = true} > +parts[3] = {4, 'unsigned', path='[4]', is_nullable = true} > +trap_idx = s:create_index('trap', { type = 'tree', parts = parts}) > +s:insert{{1, 2, {3, {3, a = 'str2', b = 5}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6, {}} > +parts = {} > +parts[1] = {1, 'unsigned', path='[3][2].b' } > +parts[2] = {3, 'unsigned'} > +crosspart_idx = s:create_index('crosspart', { parts = parts}) > +s:insert{{1, 2, {3, {a = 'str2', b = 2}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6, {9, 2, 3}} > +parts = {} > +parts[1] = {1, 'unsigned', path='[3][2].b'} > +num_idx = s:create_index('numeric', {parts = parts}) > +s:insert{{1, 2, {3, {a = 'str3', b = 9}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6, {0}} > +num_idx:get(2) > +num_idx:select() > +num_idx:max() > +num_idx:min() > +crosspart_idx:max() == num_idx:max() > +crosspart_idx:min() == num_idx:min() > +trap_idx:max() > +trap_idx:min() > +s:drop() > + > +s = box.schema.space.create('withdata', {engine = engine}) > +pk_simplified = s:create_index('primary', { type = 'tree', parts = {{1, 'unsigned'}}}) > +pk_simplified.path == box.NULL > +idx = s:create_index('idx', {parts = {{2, 'integer', path = 'a'}}}) > +s:insert{31, {a = 1, aa = -1}} > +s:insert{22, {a = 2, aa = -2}} > +s:insert{13, {a = 3, aa = -3}} > +idx:select() > +idx:alter({parts = {{2, 'integer', path = 'aa'}}}) > +idx:select() > +s:drop() > + > +-- incompatible format change > +s = box.schema.space.create('test') > +i = s:create_index('pk', {parts = {{1, 'integer', path = '[1]'}}}) > +s:insert{{-1}} > +i:alter{parts = {{1, 'string', path = '[1]'}}} > +s:insert{{'a'}} > +i:drop() > +i = s:create_index('pk', {parts = {{1, 'integer', path = '[1].FIO'}}}) > +s:insert{{{FIO=-1}}} > +i:alter{parts = {{1, 'integer', path = '[1][1]'}}} > +i:alter{parts = {{1, 'integer', path = '[1].FIO[1]'}}} > +s:drop() > + > +engine = nil > +test_run = nil Needless assignments. > + Extra new line. Please also check recovery, snapshotting, and select by multipart JSON key (both full and partial). All both for secondary and primary indexes.
next prev parent reply other threads:[~2019-01-23 13:49 UTC|newest] Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top 2019-01-16 13:44 [PATCH v8 0/5] box: Indexes by JSON path Kirill Shcherbatov 2019-01-16 13:44 ` [PATCH v8 1/5] lib: updated msgpuck library version Kirill Shcherbatov 2019-01-23 12:53 ` Vladimir Davydov 2019-01-16 13:44 ` [PATCH v8 2/5] box: refactor tuple_field_raw_by_path routine Kirill Shcherbatov 2019-01-23 13:15 ` Vladimir Davydov 2019-01-16 13:44 ` [PATCH v8 3/5] box: introduce JSON Indexes Kirill Shcherbatov 2019-01-23 13:49 ` Vladimir Davydov [this message] 2019-01-16 13:44 ` [PATCH v8 4/5] box: introduce offset_slot cache in key_part Kirill Shcherbatov 2019-01-23 14:23 ` Vladimir Davydov 2019-01-16 13:44 ` [PATCH v8 5/5] box: specify indexes in user-friendly form Kirill Shcherbatov 2019-01-23 15:29 ` Vladimir Davydov 2019-01-16 15:35 ` [tarantool-patches] [PATCH v8 6/6] box: introduce has_json_paths flag in templates Kirill Shcherbatov 2019-01-23 14:15 ` Vladimir Davydov
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20190123134908.tfpakeamx7d2q7iy@esperanza \ --to=vdavydov.dev@gmail.com \ --cc=kostja@tarantool.org \ --cc=kshcherbatov@tarantool.org \ --cc=tarantool-patches@freelists.org \ --subject='Re: [PATCH v8 3/5] box: introduce JSON Indexes' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox