[PATCH v8 3/5] box: introduce JSON Indexes
Vladimir Davydov
vdavydov.dev at gmail.com
Wed Jan 23 16:49:08 MSK 2019
On Wed, Jan 16, 2019 at 04:44:41PM +0300, Kirill Shcherbatov wrote:
> diff --git a/src/box/key_def.c b/src/box/key_def.c
> index dae3580e2..0ed497dfc 100644
> --- a/src/box/key_def.c
> +++ b/src/box/key_def.c
> @@ -485,6 +568,14 @@ key_def_decode_parts(struct key_part_def *parts, uint32_t part_count,
> "index part: unknown sort order");
> return -1;
> }
> + if (part->path != NULL &&
> + json_path_validate(part->path, strlen(part->path),
> + TUPLE_INDEX_BASE) != 0) {
> + diag_set(ClientError, ER_WRONG_INDEX_OPTIONS,
> + part->fieldno + TUPLE_INDEX_BASE,
The code above passes (i + TUPLE_INDEX_BASE) to diag_set. Please fix
this place to conform.
> + "invalid path");
> + return -1;
> + }
> }
> return 0;
> }
> diff --git a/src/box/key_def.h b/src/box/key_def.h
> index d1866303b..fe4acffb5 100644
> --- a/src/box/key_def.h
> +++ b/src/box/key_def.h
> @@ -299,11 +320,12 @@ key_def_encode_parts(char *data, const struct key_part_def *parts,
> * [NUM, STR, ..][NUM, STR, ..]..,
> * OR
> * {field=NUM, type=STR, ..}{field=NUM, type=STR, ..}..,
> + * The region is used for allocating JSON paths, if any.
Extra space before "The region is used ...". Makes the comment difficult
to read.
> */
> int
> key_def_decode_parts(struct key_part_def *parts, uint32_t part_count,
> const char **data, const struct field_def *fields,
> - uint32_t field_count);
> + uint32_t field_count, struct region *region);
>
> /**
> * Returns the part in index_def->parts for the specified fieldno.
> diff --git a/src/box/tuple_format.c b/src/box/tuple_format.c
> index 61b4c9734..9af9c307a 100644
> --- a/src/box/tuple_format.c
> +++ b/src/box/tuple_format.c
> @@ -69,9 +69,86 @@ static const char *
> tuple_field_path(const struct tuple_field *field)
> {
> assert(field->token.parent != NULL);
> - assert(field->token.parent->parent == NULL);
> - assert(field->token.type == JSON_TOKEN_NUM);
> - return int2str(field->token.num + TUPLE_INDEX_BASE);
> + if (field->token.parent->parent == NULL) {
> + /* Top-level field, no need to format the path. */
> + return int2str(field->token.num + TUPLE_INDEX_BASE);
> + }
> + char *path = tt_static_buf();
> + json_tree_snprint_path(path, TT_STATIC_BUF_LEN, &field->token,
> + TUPLE_INDEX_BASE);
> + return path;
> +}
> +
> +/**
> + * Given a field number and a path, add the corresponding tuple
> + * field to the tuple format, allocating intermediate fields if
> + * necessary.
> + * Return a pointer to the leaf field on success, NULL on memory
An empty line is usually added as a paragraph delimiter - the text looks
more readable that way.
> + * allocation error or type/nullability mistmatch error, diag
> + * message is set.
> + */
> +static struct tuple_field *
> +tuple_format_add_field(struct tuple_format *format, uint32_t fieldno,
> + const char *path, uint32_t path_len)
> +{
> + struct tuple_field *field = NULL;
> + struct tuple_field *parent = tuple_format_field(format, fieldno);
assert(parent != NULL);
would be useful here to show the reader that top-level fields have
already been created elsewhere.
> + if (path_len == 0)
> + goto end;
Better 'return parent;' - we don't need to call tuple_field_delete() in
this case.
> + field = tuple_field_new();
> + if (field == NULL)
> + goto fail;
> +
> + int rc = 0;
> + uint32_t token_count = 0;
> + struct json_tree *tree = &format->fields;
> + struct json_lexer lexer;
> + json_lexer_create(&lexer, path, path_len, TUPLE_INDEX_BASE);
> + while ((rc = json_lexer_next_token(&lexer, &field->token)) == 0 &&
> + field->token.type != JSON_TOKEN_END) {
> + enum field_type expected_type =
> + field->token.type == JSON_TOKEN_STR ?
> + FIELD_TYPE_MAP : FIELD_TYPE_ARRAY;
> + if (field_type1_contains_type2(parent->type, expected_type)) {
> + parent->type = expected_type;
> + } else {
> + diag_set(ClientError, ER_INDEX_PART_TYPE_MISMATCH,
> + tuple_field_path(parent),
> + field_type_strs[parent->type],
> + field_type_strs[expected_type]);
> + goto fail;
> + }
> + struct tuple_field *next =
> + json_tree_lookup_entry(tree, &parent->token,
> + &field->token,
> + struct tuple_field, token);
> + if (next == NULL) {
> + field->id = format->total_field_count++;
> + rc = json_tree_add(tree, &parent->token, &field->token);
> + if (rc != 0) {
> + diag_set(OutOfMemory, sizeof(struct json_token),
> + "json_tree_add", "tree");
> + goto fail;
> + }
> + next = field;
> + field = tuple_field_new();
> + if (field == NULL)
> + goto fail;
> + }
> + parent = next;
> + token_count++;
> + }
> + /* Path has been verified by key_def_decode_parts. */
> + assert(rc == 0 && field->token.type == JSON_TOKEN_END);
> + assert(parent != NULL);
> + /* Update tree depth information. */
> + format->fields_depth = MAX(format->fields_depth, token_count + 1);
> +end:
> + tuple_field_delete(field);
> + return parent;
> +fail:
> + parent = NULL;
> + goto end;
> }
>
> /**
> @@ -95,15 +172,26 @@ tuple_format_field_by_id(struct tuple_format *format, uint32_t id)
> static int
> tuple_format_use_key_part(struct tuple_format *format, uint32_t field_count,
> const struct key_part *part, bool is_sequential,
> - int *current_slot)
> + int *current_slot, char **path_pool)
> {
> assert(part->fieldno < tuple_format_field_count(format));
> - struct tuple_field *field = tuple_format_field(format, part->fieldno);
> + const char *path = *path_pool;
> + if (part->path != NULL) {
> + /**
> + * Copy JSON path data to reserved area at the
> + * end of format allocation.
> + */
> + memcpy(*path_pool, part->path, part->path_len);
> + *path_pool += part->path_len;
> + }
This piece should be moved inside tuple_format_add_field().
> + struct tuple_field *field = tuple_format_add_field(format, part->fieldno,
> + path, part->path_len);
> + if (field == NULL)
> + return -1;
> /*
> - * If a field is not present in the space format,
> - * inherit nullable action of the first key part
> - * referencing it.
> - */
> + * If a field is not present in the space format, inherit
> + * nullable action of the first key part referencing it.
> + */
> if (part->fieldno >= field_count && !field->is_key_part)
> field->nullable_action = part->nullable_action;
> /*
> @@ -512,49 +659,79 @@ tuple_init_field_map(struct tuple_format *format, uint32_t *field_map,
> /* Empty tuple, nothing to do. */
> goto skip;
> }
> - /* first field is simply accessible, so we do not store offset to it */
> - struct tuple_field *field = tuple_format_field(format, 0);
> - if (validate &&
> - !field_mp_type_is_compatible(field->type, mp_typeof(*pos),
> - tuple_field_is_nullable(field))) {
> - diag_set(ClientError, ER_FIELD_TYPE, tuple_field_path(field),
> - field_type_strs[field->type]);
> - goto error;
> - }
> - if (required_fields != NULL)
> - bit_clear(required_fields, field->id);
> - mp_next(&pos);
> - /* other fields...*/
> - uint32_t i = 1;
> uint32_t defined_field_count = MIN(field_count, validate ?
> tuple_format_field_count(format) :
> format->index_field_count);
> - if (field_count < format->index_field_count) {
> - /*
> - * Nullify field map to be able to detect by 0,
> - * which key fields are absent in tuple_field().
> - */
> - memset((char *)field_map - format->field_map_size, 0,
> - format->field_map_size);
> - }
> - for (; i < defined_field_count; ++i) {
> - field = tuple_format_field(format, i);
> - if (validate &&
> - !field_mp_type_is_compatible(field->type, mp_typeof(*pos),
> - tuple_field_is_nullable(field))) {
> - diag_set(ClientError, ER_FIELD_TYPE,
> - tuple_field_path(field),
> - field_type_strs[field->type]);
> - goto error;
> + /*
> + * Nullify field map to be able to detect by 0,
> + * which key fields are absent in tuple_field().
> + */
> + memset((char *)field_map - format->field_map_size, 0,
> + format->field_map_size);
> + uint32_t mp_frames_sz = format->fields_depth * sizeof(struct mp_frame);
> + struct mp_frame *mp_frames = region_alloc(region, mp_frames_sz);
> + if (mp_frames == NULL) {
> + diag_set(OutOfMemory, mp_frames_sz, "region", "frames");
> + goto error;
> + }
> + struct mp_stack mp_stack;
> + mp_stack_init(&mp_stack, format->fields_depth, mp_frames);
Call them simply 'stack' and 'frames'.
> + mp_stack_push(&mp_stack, MP_ARRAY, defined_field_count);
> + struct tuple_field *field;
> + struct json_token *parent = &format->fields.root;
> + while (1) {
> + while (mp_stack_advance(&mp_stack)) {
> + mp_stack_pop(&mp_stack);
> + if (mp_stack_is_empty(&mp_stack))
> + goto skip;
> + parent = parent->parent;
> + }
> + struct json_token token;
> + token.type = mp_stack_top(&mp_stack)->type == MP_ARRAY ?
> + JSON_TOKEN_NUM : JSON_TOKEN_STR;
> + if (token.type == JSON_TOKEN_NUM) {
> + token.num = mp_stack_top(&mp_stack)->curr - 1;
I don't like that frame->curr index is in fact 1-based. Let's initialize
frame->curr with -1 (or UINT32_MAX) in mp_stack_push() so that the first
mp_stack_advance() called after mp_stack_push() would set it to 0.
Also, what about changing the API slightly?
mp_stack_advance() -> item index (>= 0) on success
-1 if all items have been processed.
Then we wouldn't really need to access mp_frame to get 'curr' so we
could replace mp_stack_top() with mp_stack_type() => MP_MAP/MP_ARRAY.
Not sure if it's a really good idea though. The code would look more
concise, but need to check.
> + } else if (token.type == JSON_TOKEN_STR) {
> + if (mp_typeof(*pos) != MP_STR) {
> + /* Skip key. */
> + mp_next(&pos);
> + mp_next(&pos);
> + continue;
> + }
> + token.str = mp_decode_str(&pos, (uint32_t *)&token.len);
> + } else {
> + unreachable();
> + }
The code would look better IMO if you rewrote it without the double-if
(first check frame->type to initialize token.type, then check token.type
to initialize the token value, i.e. token.str or token.num):
	switch (mp_stack_top(&stack)->type) {
	case MP_MAP:
		token.type = JSON_TOKEN_STR;
		token.str = ...
		break;
	case MP_ARRAY:
		token.type = JSON_TOKEN_NUM;
		token.num = ...
		break;
	default:
		unreachable();
	}
> + enum mp_type type = mp_typeof(*pos);
> + assert(parent != NULL);
> + field = json_tree_lookup_entry(&format->fields, parent, &token,
> + struct tuple_field, token);
> + if (field != NULL) {
> + bool is_nullable = tuple_field_is_nullable(field);
> + if (validate &&
> + !field_mp_type_is_compatible(field->type, type,
> + is_nullable) != 0) {
> + diag_set(ClientError, ER_FIELD_TYPE,
> + tuple_field_path(field),
> + field_type_strs[field->type]);
> + goto error;
> + }
> + if (field->offset_slot != TUPLE_OFFSET_SLOT_NIL)
> + field_map[field->offset_slot] = pos - tuple;
> + if (required_fields != NULL)
> + bit_clear(required_fields, field->id);
> }
> - if (field->offset_slot != TUPLE_OFFSET_SLOT_NIL) {
> - field_map[field->offset_slot] =
> - (uint32_t) (pos - tuple);
> + if ((type == MP_ARRAY || type == MP_MAP) &&
> + !mp_stack_is_full(&mp_stack) && field != NULL) {
> + uint32_t size = type == MP_ARRAY ?
> + mp_decode_array(&pos) :
> + mp_decode_map(&pos);
> + mp_stack_push(&mp_stack, type, size);
> + parent = &field->token;
> + } else {
> + mp_next(&pos);
> }
> - if (required_fields != NULL)
> - bit_clear(required_fields, field->id);
> - mp_next(&pos);
> - }
> + };
The semicolon (;) isn't needed.
> diff --git a/src/box/vy_point_lookup.c b/src/box/vy_point_lookup.c
> index ddbc2d46f..23b7a5869 100644
> --- a/src/box/vy_point_lookup.c
> +++ b/src/box/vy_point_lookup.c
> @@ -196,7 +196,8 @@ vy_point_lookup(struct vy_lsm *lsm, struct vy_tx *tx,
> const struct vy_read_view **rv,
> struct tuple *key, struct tuple **ret)
> {
> - assert(tuple_field_count(key) >= lsm->cmp_def->part_count);
> + assert(tuple_field_count(key) >= lsm->cmp_def->part_count ||
> + lsm->cmp_def->has_json_paths);
This isn't how this assertion should be fixed - instead we should check
tuple_field_count for keys only (i.e. SELECT statements):
/* All key parts must be set for a point lookup. */
assert(vy_stmt_type(key) != IPROTO_SELECT ||
tuple_field_count(key) >= lsm->cmp_def->part_count);
>
> diff --git a/test/engine/json.test.lua b/test/engine/json.test.lua
> new file mode 100644
> index 000000000..a876ff1ed
> --- /dev/null
> +++ b/test/engine/json.test.lua
> @@ -0,0 +1,131 @@
> +test_run = require('test_run').new()
> +engine = test_run:get_cfg('engine')
> +--
> +-- gh-1012: Indexes for JSON-defined paths.
> +--
> +s = box.schema.space.create('withdata', {engine = engine})
> +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 'FIO["fname"]'}, {3, 'str', path = '["FIO"].fname'}}})
> +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 666}, {3, 'str', path = '["FIO"]["fname"]'}}})
> +s:create_index('test1', {parts = {{2, 'number'}, {3, 'map', path = 'FIO'}}})
> +s:create_index('test1', {parts = {{2, 'number'}, {3, 'array', path = '[1]'}}})
> +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 'FIO'}, {3, 'str', path = 'FIO.fname'}}})
> +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '[1].sname'}, {3, 'str', path = '["FIO"].fname'}}})
> +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 'FIO....fname'}}})
> +idx = s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = 'FIO.fname', is_nullable = false}, {3, 'str', path = '["FIO"]["sname"]'}}})
> +idx ~= nil
> +idx.parts[2].path == 'FIO.fname'
> +format = {{'id', 'unsigned'}, {'meta', 'unsigned'}, {'data', 'array'}, {'age', 'unsigned'}, {'level', 'unsigned'}}
> +s:format(format)
> +format = {{'id', 'unsigned'}, {'meta', 'unsigned'}, {'data', 'map'}, {'age', 'unsigned'}, {'level', 'unsigned'}}
> +s:format(format)
> +s:create_index('test2', {parts = {{2, 'number'}, {3, 'number', path = 'FIO.fname'}, {3, 'str', path = '["FIO"]["sname"]'}}})
> +s:insert{7, 7, {town = 'London', FIO = 666}, 4, 5}
> +s:insert{7, 7, {town = 'London', FIO = {fname = 666, sname = 'Bond'}}, 4, 5}
> +s:insert{7, 7, {town = 'London', FIO = {fname = "James"}}, 4, 5}
> +s:insert{7, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond'}}, 4, 5}
> +s:insert{7, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond'}}, 4, 5}
> +s:insert{7, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond', data = "extra"}}, 4, 5}
> +s:insert{7, 7, {town = 'Moscow', FIO = {fname = 'Max', sname = 'Isaev', data = "extra"}}, 4, 5}
> +idx:select()
> +idx:min()
> +idx:max()
> +s:drop()
> +
> +s = box.schema.create_space('withdata', {engine = engine})
> +parts = {}
> +parts[1] = {1, 'unsigned', path='[2]'}
> +pk = s:create_index('pk', {parts = parts})
> +s:insert{{1, 2}, 3}
> +s:upsert({{box.null, 2}}, {{'+', 2, 5}})
> +s:get(2)
> +s:drop()
> +
> +-- Create index on space with data
> +s = box.schema.space.create('withdata', {engine = engine})
> +pk = s:create_index('primary', { type = 'tree' })
> +s:insert{1, 7, {town = 'London', FIO = 1234}, 4, 5}
> +s:insert{2, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond'}}, 4, 5}
> +s:insert{3, 7, {town = 'London', FIO = {fname = 'James', sname = 'Bond'}}, 4, 5}
> +s:insert{4, 7, {town = 'London', FIO = {1,2,3}}, 4, 5}
> +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]'}, {3, 'str', path = '["FIO"]["sname"]'}}})
> +_ = s:delete(1)
> +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]'}, {3, 'str', path = '["FIO"]["sname"]'}}})
> +_ = s:delete(2)
> +s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]'}, {3, 'str', path = '["FIO"]["sname"]'}}})
> +_ = s:delete(4)
> +idx = s:create_index('test1', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]', is_nullable = true}, {3, 'str', path = '["FIO"]["sname"]'}, {3, 'str', path = '["FIO"]["extra"]', is_nullable = true}}})
> +idx ~= nil
> +s:create_index('test2', {parts = {{2, 'number'}, {3, 'number', path = '["FIO"]["fname"]'}}})
> +idx2 = s:create_index('test2', {parts = {{2, 'number'}, {3, 'str', path = '["FIO"]["fname"]'}}})
> +idx2 ~= nil
> +t = s:insert{5, 7, {town = 'Matrix', FIO = {fname = 'Agent', sname = 'Smith'}}, 4, 5}
> +-- Test field_map in tuple speed-up access by indexed path.
> +t["[3][\"FIO\"][\"fname\"]"]
> +idx:select()
> +idx:min()
> +idx:max()
> +idx:drop()
> +s:drop()
> +
> +-- Test complex JSON indexes
> +s = box.schema.space.create('withdata', {engine = engine})
> +parts = {}
> +parts[1] = {1, 'str', path='[3][2].a'}
> +parts[2] = {1, 'unsigned', path = '[3][1]'}
> +parts[3] = {2, 'str', path = '[2].d[1]'}
> +pk = s:create_index('primary', { type = 'tree', parts = parts})
> +s:insert{{1, 2, {3, {3, a = 'str', b = 5}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6, {1, 2, 3}}
> +s:insert{{1, 2, {3, {a = 'str', b = 1}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6}
> +parts = {}
> +parts[1] = {4, 'unsigned', path='[1]', is_nullable = false}
> +parts[2] = {4, 'unsigned', path='[2]', is_nullable = true}
> +parts[3] = {4, 'unsigned', path='[4]', is_nullable = true}
> +trap_idx = s:create_index('trap', { type = 'tree', parts = parts})
> +s:insert{{1, 2, {3, {3, a = 'str2', b = 5}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6, {}}
> +parts = {}
> +parts[1] = {1, 'unsigned', path='[3][2].b' }
> +parts[2] = {3, 'unsigned'}
> +crosspart_idx = s:create_index('crosspart', { parts = parts})
> +s:insert{{1, 2, {3, {a = 'str2', b = 2}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6, {9, 2, 3}}
> +parts = {}
> +parts[1] = {1, 'unsigned', path='[3][2].b'}
> +num_idx = s:create_index('numeric', {parts = parts})
> +s:insert{{1, 2, {3, {a = 'str3', b = 9}}}, {'c', {d = {'e', 'f'}, e = 'g'}}, 6, {0}}
> +num_idx:get(2)
> +num_idx:select()
> +num_idx:max()
> +num_idx:min()
> +crosspart_idx:max() == num_idx:max()
> +crosspart_idx:min() == num_idx:min()
> +trap_idx:max()
> +trap_idx:min()
> +s:drop()
> +
> +s = box.schema.space.create('withdata', {engine = engine})
> +pk_simplified = s:create_index('primary', { type = 'tree', parts = {{1, 'unsigned'}}})
> +pk_simplified.path == box.NULL
> +idx = s:create_index('idx', {parts = {{2, 'integer', path = 'a'}}})
> +s:insert{31, {a = 1, aa = -1}}
> +s:insert{22, {a = 2, aa = -2}}
> +s:insert{13, {a = 3, aa = -3}}
> +idx:select()
> +idx:alter({parts = {{2, 'integer', path = 'aa'}}})
> +idx:select()
> +s:drop()
> +
> +-- incompatible format change
> +s = box.schema.space.create('test')
> +i = s:create_index('pk', {parts = {{1, 'integer', path = '[1]'}}})
> +s:insert{{-1}}
> +i:alter{parts = {{1, 'string', path = '[1]'}}}
> +s:insert{{'a'}}
> +i:drop()
> +i = s:create_index('pk', {parts = {{1, 'integer', path = '[1].FIO'}}})
> +s:insert{{{FIO=-1}}}
> +i:alter{parts = {{1, 'integer', path = '[1][1]'}}}
> +i:alter{parts = {{1, 'integer', path = '[1].FIO[1]'}}}
> +s:drop()
> +
> +engine = nil
> +test_run = nil
Needless assignments.
> +
Extra trailing newline at the end of the file.
Please also check recovery, snapshotting, and select by a multipart JSON
key (both full and partial) - all of the above for both primary and
secondary indexes.
More information about the Tarantool-patches
mailing list