From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Date: Wed, 23 Jan 2019 16:49:08 +0300 From: Vladimir Davydov Subject: Re: [PATCH v8 3/5] box: introduce JSON Indexes Message-ID: <20190123134908.tfpakeamx7d2q7iy@esperanza> References: <4c8bd7e8682e577ed8d49f92e231077924f4fe53.1547645795.git.kshcherbatov@tarantool.org> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <4c8bd7e8682e577ed8d49f92e231077924f4fe53.1547645795.git.kshcherbatov@tarantool.org> To: Kirill Shcherbatov Cc: tarantool-patches@freelists.org, kostja@tarantool.org List-ID: On Wed, Jan 16, 2019 at 04:44:41PM +0300, Kirill Shcherbatov wrote: > diff --git a/src/box/key_def.c b/src/box/key_def.c > index dae3580e2..0ed497dfc 100644 > --- a/src/box/key_def.c > +++ b/src/box/key_def.c > @@ -485,6 +568,14 @@ key_def_decode_parts(struct key_part_def *parts, uint32_t part_count, > "index part: unknown sort order"); > return -1; > } > + if (part->path != NULL && > + json_path_validate(part->path, strlen(part->path), > + TUPLE_INDEX_BASE) != 0) { > + diag_set(ClientError, ER_WRONG_INDEX_OPTIONS, > + part->fieldno + TUPLE_INDEX_BASE, The code above passes (i + TUPLE_INDEX_BASE) to diag_set. Please fix this place to conform. > + "invalid path"); > + return -1; > + } > } > return 0; > } > diff --git a/src/box/key_def.h b/src/box/key_def.h > index d1866303b..fe4acffb5 100644 > --- a/src/box/key_def.h > +++ b/src/box/key_def.h > @@ -299,11 +320,12 @@ key_def_encode_parts(char *data, const struct key_part_def *parts, > * [NUM, STR, ..][NUM, STR, ..].., > * OR > * {field=NUM, type=STR, ..}{field=NUM, type=STR, ..}.., > + * The region is used for allocating JSON paths, if any. Extra space before "The region is used ...". Makes the comment difficult to read. > */ > int > key_def_decode_parts(struct key_part_def *parts, uint32_t part_count, > const char **data, const struct field_def *fields, > - uint32_t field_count); > + uint32_t field_count, struct region *region); > > /** > * Returns the part in index_def->parts for the specified fieldno. > diff --git a/src/box/tuple_format.c b/src/box/tuple_format.c > index 61b4c9734..9af9c307a 100644 > --- a/src/box/tuple_format.c > +++ b/src/box/tuple_format.c > @@ -69,9 +69,86 @@ static const char * > tuple_field_path(const struct tuple_field *field) > { > assert(field->token.parent != NULL); > - assert(field->token.parent->parent == NULL); > - assert(field->token.type == JSON_TOKEN_NUM); > - return int2str(field->token.num + TUPLE_INDEX_BASE); > + if (field->token.parent->parent == NULL) { > + /* Top-level field, no need to format the path. */ > + return int2str(field->token.num + TUPLE_INDEX_BASE); > + } > + char *path = tt_static_buf(); > + json_tree_snprint_path(path, TT_STATIC_BUF_LEN, &field->token, > + TUPLE_INDEX_BASE); > + return path; > +} > + > +/** > + * Given a field number and a path, add the corresponding tuple > + * field to the tuple format, allocating intermediate fields if > + * necessary. > + * Return a pointer to the leaf field on success, NULL on memory They usually add an empty line as a paragraph delimiter - text looks more readable that way. > + * allocation error or type/nullability mistmatch error, diag > + * message is set. > + */ > +static struct tuple_field * > +tuple_format_add_field(struct tuple_format *format, uint32_t fieldno, > + const char *path, uint32_t path_len) > +{ > + struct tuple_field *field = NULL; > + struct tuple_field *parent = tuple_format_field(format, fieldno); assert(parent != NULL); would be useful here to show the reader that top-level fields have already been created someplace else. > + if (path_len == 0) > + goto end; Better 'return parent;' - we don't need to call tuple_field_delete() in this case. > + field = tuple_field_new(); > + if (field == NULL) > + goto fail; > + > + int rc = 0; > + uint32_t token_count = 0; > + struct json_tree *tree = &format->fields; > + struct json_lexer lexer; > + json_lexer_create(&lexer, path, path_len, TUPLE_INDEX_BASE); > + while ((rc = json_lexer_next_token(&lexer, &field->token)) == 0 && > + field->token.type != JSON_TOKEN_END) { > + enum field_type expected_type = > + field->token.type == JSON_TOKEN_STR ? > + FIELD_TYPE_MAP : FIELD_TYPE_ARRAY; > + if (field_type1_contains_type2(parent->type, expected_type)) { > + parent->type = expected_type; > + } else { > + diag_set(ClientError, ER_INDEX_PART_TYPE_MISMATCH, > + tuple_field_path(parent), > + field_type_strs[parent->type], > + field_type_strs[expected_type]); > + goto fail; > + } > + struct tuple_field *next = > + json_tree_lookup_entry(tree, &parent->token, > + &field->token, > + struct tuple_field, token); > + if (next == NULL) { > + field->id = format->total_field_count++; > + rc = json_tree_add(tree, &parent->token, &field->token); > + if (rc != 0) { > + diag_set(OutOfMemory, sizeof(struct json_token), > + "json_tree_add", "tree"); > + goto fail; > + } > + next = field; > + field = tuple_field_new(); > + if (field == NULL) > + goto fail; > + } > + parent = next; > + token_count++; > + } > + /* Path has been verified by key_def_decode_parts. */ > + assert(rc == 0 && field->token.type == JSON_TOKEN_END); > + assert(parent != NULL); > + /* Update tree depth information. */ > + format->fields_depth = MAX(format->fields_depth, token_count + 1); > +end: > + tuple_field_delete(field); > + return parent; > +fail: > + parent = NULL; > + goto end; > } > > /** > @@ -95,15 +172,26 @@ tuple_format_field_by_id(struct tuple_format *format, uint32_t id) > static int > tuple_format_use_key_part(struct tuple_format *format, uint32_t field_count, > const struct key_part *part, bool is_sequential, > - int *current_slot) > + int *current_slot, char **path_pool) > { > assert(part->fieldno < tuple_format_field_count(format)); > - struct tuple_field *field = tuple_format_field(format, part->fieldno); > + const char *path = *path_pool; > + if (part->path != NULL) { > + /** > + * Copy JSON path data to reserved area at the > + * end of format allocation. > + */ > + memcpy(*path_pool, part->path, part->path_len); > + *path_pool += part->path_len; > + } This piece should be moved inside tuple_format_add_field(). > + struct tuple_field *field = tuple_format_add_field(format, part->fieldno, > + path, part->path_len); > + if (field == NULL) > + return -1; > /* > - * If a field is not present in the space format, > - * inherit nullable action of the first key part > - * referencing it. > - */ > + * If a field is not present in the space format, inherit > + * nullable action of the first key part referencing it. > + */ > if (part->fieldno >= field_count && !field->is_key_part) > field->nullable_action = part->nullable_action; > /* > @@ -512,49 +659,79 @@ tuple_init_field_map(struct tuple_format *format, uint32_t *field_map, > /* Empty tuple, nothing to do. */ > goto skip; > } > - /* first field is simply accessible, so we do not store offset to it */ > - struct tuple_field *field = tuple_format_field(format, 0); > - if (validate && > - !field_mp_type_is_compatible(field->type, mp_typeof(*pos), > - tuple_field_is_nullable(field))) { > - diag_set(ClientError, ER_FIELD_TYPE, tuple_field_path(field), > - field_type_strs[field->type]); > - goto error; > - } > - if (required_fields != NULL) > - bit_clear(required_fields, field->id); > - mp_next(&pos); > - /* other fields...*/ > - uint32_t i = 1; > uint32_t defined_field_count = MIN(field_count, validate ? > tuple_format_field_count(format) : > format->index_field_count); > - if (field_count < format->index_field_count) { > - /* > - * Nullify field map to be able to detect by 0, > - * which key fields are absent in tuple_field(). > - */ > - memset((char *)field_map - format->field_map_size, 0, > - format->field_map_size); > - } > - for (; i < defined_field_count; ++i) { > - field = tuple_format_field(format, i); > - if (validate && > - !field_mp_type_is_compatible(field->type, mp_typeof(*pos), > - tuple_field_is_nullable(field))) { > - diag_set(ClientError, ER_FIELD_TYPE, > - tuple_field_path(field), > - field_type_strs[field->type]); > - goto error; > + /* > + * Nullify field map to be able to detect by 0, > + * which key fields are absent in tuple_field(). > + */ > + memset((char *)field_map - format->field_map_size, 0, > + format->field_map_size); > + uint32_t mp_frames_sz = format->fields_depth * sizeof(struct mp_frame); > + struct mp_frame *mp_frames = region_alloc(region, mp_frames_sz); > + if (mp_frames == NULL) { > + diag_set(OutOfMemory, mp_frames_sz, "region", "frames"); > + goto error; > + } > + struct mp_stack mp_stack; > + mp_stack_init(&mp_stack, format->fields_depth, mp_frames); Call them simply 'stack' and 'frames'. > + mp_stack_push(&mp_stack, MP_ARRAY, defined_field_count); > + struct tuple_field *field; > + struct json_token *parent = &format->fields.root; > + while (1) { > + while (mp_stack_advance(&mp_stack)) { > + mp_stack_pop(&mp_stack); > + if (mp_stack_is_empty(&mp_stack)) > + goto skip; > + parent = parent->parent; > + } > + struct json_token token; > + token.type = mp_stack_top(&mp_stack)->type == MP_ARRAY ? > + JSON_TOKEN_NUM : JSON_TOKEN_STR; > + if (token.type == JSON_TOKEN_NUM) { > + token.num = mp_stack_top(&mp_stack)->curr - 1; I don't like that frame->curr index is in fact 1-based. Let's initialize frame->curr with -1 (or UINT32_MAX) in mp_stack_push() so that the first mp_stack_advance() called after mp_stack_push() would set it to 0. Also, what about changing the API slightly? mp_stack_advance() -> item index (>= 0) on success -1 if all items have been processed. Then we wouldn't really need to access mp_frame to get 'curr' so we could replace mp_stack_top() with mp_stack_type() => MP_MAP/MP_ARRAY. Not sure if it's a really good idea though. The code would look more concise, but need to check. > + } else if (token.type == JSON_TOKEN_STR) { > + if (mp_typeof(*pos) != MP_STR) { > + /* Skip key. */ > + mp_next(&pos); > + mp_next(&pos); > + continue; > + } > + token.str = mp_decode_str(&pos, (uint32_t *)&token.len); > + } else { > + unreachable(); > + } The code would look better IMO if you rewrote it without double-if (first check frame->type to initialize token.type, then check token->type to initialize token.key): switch (mp_stack_top(&stack)->type) { case MP_MAP: token.type = JSON_TOKEN_STR; token.str = ... break; case MP_ARRAY: token.type = JSON_TOKEN_NUM; token.num = ... default: unreachable(); } > + enum mp_type type = mp_typeof(*pos); > + assert(parent != NULL); > + field = json_tree_lookup_entry(&format->fields, parent, &token, > + struct tuple_field, token); > + if (field != NULL) { > + bool is_nullable = tuple_field_is_nullable(field); > + if (validate && > + !field_mp_type_is_compatible(field->type, type, > + is_nullable) != 0) { > + diag_set(ClientError, ER_FIELD_TYPE, > + tuple_field_path(field), > + field_type_strs[field->type]); > + goto error; > + } > + if (field->offset_slot != TUPLE_OFFSET_SLOT_NIL) > + field_map[field->offset_slot] = pos - tuple; > + if (required_fields != NULL) > + bit_clear(required_fields, field->id); > } > - if (field->offset_slot != TUPLE_OFFSET_SLOT_NIL) { > - field_map[field->offset_slot] = > - (uint32_t) (pos - tuple); > + if ((type == MP_ARRAY || type == MP_MAP) && > + !mp_stack_is_full(&mp_stack) && field != NULL) { > + uint32_t size = type == MP_ARRAY ? > + mp_decode_array(&pos) : > + mp_decode_map(&pos); > + mp_stack_push(&mp_stack, type, size); > + parent = &field->token; > + } else { > + mp_next(&pos); > } > - if (required_fields != NULL) > - bit_clear(required_fields, field->id); > - mp_next(&pos); > - } > + }; The semicolon (;) isn't needed. > diff --git a/src/box/vy_point_lookup.c b/src/box/vy_point_lookup.c > index ddbc2d46f..23b7a5869 100644 > --- a/src/box/vy_point_lookup.c > +++ b/src/box/vy_point_lookup.c > @@ -196,7 +196,8 @@ vy_point_lookup(struct vy_lsm *lsm, struct vy_tx *tx, > const struct vy_read_view **rv, > struct tuple *key, struct tuple **ret) > { > - assert(tuple_field_count(key) >= lsm->cmp_def->part_count); > + assert(tuple_field_count(key) >= lsm->cmp_def->part_count || > + lsm->cmp_def->has_json_paths); This isn't how this assertion should be fixed - instead we should check tuple_field_count for keys only (i.e. SELECT statements): /* All key parts must be set for a point lookup. */ assert(vy_stmt_type(key) != IPROTO_SELECT || tuple_field_count(key) >= lsm->cmp_def->part_count); > > diff --git a/test/engine/json.test.lua b/test/engine/json.test.lua > new file mode 100644 > index 000000000..a876ff1ed > --- /dev/null > +++ b/test/engine/json.test.lua > @@ -0,0 +1,131 @@ > +test_run = require('test_run').new() > +engine = test_run:get_cfg('engine') > +-- > +-- gh-1012: Indexes for JSON-defined paths. > +-- > +s = box.schema.space.create('withdata', {engine = engine}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 'FIO["fname"]'}, {3, 'str', path = '["FIO"].fname'}}}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 666}, {3, 'str', path = '["FIO"]["fname"]'}}}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'map', path = 'FIO'}}}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'array', path = '[1]'}}}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 'FIO'}, {3, 'str', path = 'FIO.fname'}}}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '[1].sname'}, {3, 'str', path = '["FIO"].fname'}}}) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 'FIO....fname'}}}) > +idx = s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 'FIO.fname', is_nullable = false}, {3, 'str', path = '["FIO"]["sname"]'}}}) > +idx ~= nil > +idx.parts[2].path == 'FIO.fname' > +format = {{'id', 'unsigned'}, {'meta', 'unsigned'}, {'data', 'array'}, {'age', 'unsigned'}, {'level', 'unsigned'}} > +s:format(format) > +format = {{'id', 'unsigned'}, {'meta', 'unsigned'}, {'data', 'map'}, {'age', 'unsigned'}, {'level', 'unsigned'}} > +s:format(format) > +s:create_index('test2', {parts = {{2, 'number'}, {3, 'number', path = 'FIO.fname'}, {3, 'str', path = '["FIO"]["sname"]'}}}) > +s:insert{7, 7, {town = 'London', FIO = 666}, 4, 5} > +s:insert{7, 7, {town = 'London', FIO = {fname = 666, sname = 'Bond'}}, 4, 5} > +s:insert{7, 7, {town = 'London', FIO = {fname = "James"}}, 4, 5} > +s:insert{7, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond'}}, 4, 5} > +s:insert{7, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond'}}, 4, 5} > +s:insert{7, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond', data = "extra"}}, 4, 5} > +s:insert{7, 7, {town = 'Moscow', FIO = {fname = 'Max', sname = 'Isaev', data = "extra"}}, 4, 5} > +idx:select() > +idx:min() > +idx:max() > +s:drop() > + > +s = box.schema.create_space('withdata', {engine = engine}) > +parts = {} > +parts[1] = {1, 'unsigned', path='[2]'} > +pk = s:create_index('pk', {parts = parts}) > +s:insert{{1, 2}, 3} > +s:upsert({{box.null, 2}}, {{'+', 2, 5}}) > +s:get(2) > +s:drop() > + > +-- Create index on space with data > +s = box.schema.space.create('withdata', {engine = engine}) > +pk = s:create_index('primary', { type = 'tree' }) > +s:insert{1, 7, {town = 'London', FIO = 1234}, 4, 5} > +s:insert{2, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond'}}, 4, 5} > +s:insert{3, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond'}}, 4, 5} > +s:insert{4, 7, {town = 'London', FIO = {1,2,3}}, 4, 5} > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]'}, {3, 'str', path = '["FIO"]["sname"]'}}}) > +_ = s:delete(1) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]'}, {3, 'str', path = '["FIO"]["sname"]'}}}) > +_ = s:delete(2) > +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]'}, {3, 'str', path = '["FIO"]["sname"]'}}}) > +_ = s:delete(4) > +idx = s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]', is_nullable = true}, {3, 'str', path = '["FIO"]["sname"]'}, {3, 'str', path = '["FIO"]["extra"]', is_nullable = true}}}) > +idx ~= nil > +s:create_index('test2', {parts = {{2, 'number'}, {3, 'number', path = '["FIO"]["fname"]'}}}) > +idx2 = s:create_index('test2', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]'}}}) > +idx2 ~= nil > +t = s:insert{5, 7, {town = 'Matrix', FIO = {fname = 'Agent', sname = 'Smith'}}, 4, 5} > +-- Test field_map in tuple speed-up access by indexed path. > +t["[3][\"FIO\"][\"fname\"]"] > +idx:select() > +idx:min() > +idx:max() > +idx:drop() > +s:drop() > + > +-- Test complex JSON indexes > +s = box.schema.space.create('withdata', {engine = engine}) > +parts = {} > +parts[1] = {1, 'str', path='[3][2].a'} > +parts[2] = {1, 'unsigned', path = '[3][1]'} > +parts[3] = {2, 'str', path = '[2].d[1]'} > +pk = s:create_index('primary', { type = 'tree', parts = parts}) > +s:insert{{1, 2, {3, {3, a = 'str', b = 5}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6, {1, 2, 3}} > +s:insert{{1, 2, {3, {a = 'str', b = 1}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6} > +parts = {} > +parts[1] = {4, 'unsigned', path='[1]', is_nullable = false} > +parts[2] = {4, 'unsigned', path='[2]', is_nullable = true} > +parts[3] = {4, 'unsigned', path='[4]', is_nullable = true} > +trap_idx = s:create_index('trap', { type = 'tree', parts = parts}) > +s:insert{{1, 2, {3, {3, a = 'str2', b = 5}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6, {}} > +parts = {} > +parts[1] = {1, 'unsigned', path='[3][2].b' } > +parts[2] = {3, 'unsigned'} > +crosspart_idx = s:create_index('crosspart', { parts = parts}) > +s:insert{{1, 2, {3, {a = 'str2', b = 2}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6, {9, 2, 3}} > +parts = {} > +parts[1] = {1, 'unsigned', path='[3][2].b'} > +num_idx = s:create_index('numeric', {parts = parts}) > +s:insert{{1, 2, {3, {a = 'str3', b = 9}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6, {0}} > +num_idx:get(2) > +num_idx:select() > +num_idx:max() > +num_idx:min() > +crosspart_idx:max() == num_idx:max() > +crosspart_idx:min() == num_idx:min() > +trap_idx:max() > +trap_idx:min() > +s:drop() > + > +s = box.schema.space.create('withdata', {engine = engine}) > +pk_simplified = s:create_index('primary', { type = 'tree', parts = {{1, 'unsigned'}}}) > +pk_simplified.path == box.NULL > +idx = s:create_index('idx', {parts = {{2, 'integer', path = 'a'}}}) > +s:insert{31, {a = 1, aa = -1}} > +s:insert{22, {a = 2, aa = -2}} > +s:insert{13, {a = 3, aa = -3}} > +idx:select() > +idx:alter({parts = {{2, 'integer', path = 'aa'}}}) > +idx:select() > +s:drop() > + > +-- incompatible format change > +s = box.schema.space.create('test') > +i = s:create_index('pk', {parts = {{1, 'integer', path = '[1]'}}}) > +s:insert{{-1}} > +i:alter{parts = {{1, 'string', path = '[1]'}}} > +s:insert{{'a'}} > +i:drop() > +i = s:create_index('pk', {parts = {{1, 'integer', path = '[1].FIO'}}}) > +s:insert{{{FIO=-1}}} > +i:alter{parts = {{1, 'integer', path = '[1][1]'}}} > +i:alter{parts = {{1, 'integer', path = '[1].FIO[1]'}}} > +s:drop() > + > +engine = nil > +test_run = nil Needless assignments. > + Extra new line. Please also check recovery, snapshotting, and select by multipart JSON key (both full and partial). All both for secondary and primary indexes.