From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Date: Thu, 29 Nov 2018 20:38:16 +0300 From: Vladimir Davydov Subject: Re: [tarantool-patches] [PATCH v5 2/9] lib: implement JSON tree class for json library Message-ID: <20181129173816.kprfjhki5o7ytfbl@esperanza> References: <02671a3d0a2236ecd6e12c0bc51b7f5e39272a2f.1543229303.git.kshcherbatov@tarantool.org> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: To: Kirill Shcherbatov Cc: tarantool-patches@freelists.org, Kostya Osipov List-ID: On Mon, Nov 26, 2018 at 03:53:03PM +0300, Kirill Shcherbatov wrote: > New JSON tree class would store JSON paths for tuple fields > for registered non-plain indexes. It is a hierarchical data > structure that organize JSON nodes produced by parser. > Class provides API to lookup node by path and iterate over the > tree. > JSON Indexes patch require such functionality to make lookup > for tuple_fields by path, make initialization of field map and > build vynyl_stmt msgpack for secondary index via JSON tree > iteration. > > Need for #1012 In general looks OK, please see a few comments below. > diff --git a/src/lib/json/json.c b/src/lib/json/json.c > index eb80e4bbc..6bc74507f 100644 > --- a/src/lib/json/json.c > +++ b/src/lib/json/json.c > @@ -241,3 +242,268 @@ json_lexer_next_token(struct json_lexer *lexer, struct json_token *token) > return json_parse_identifier(lexer, token); > } > } > + > +/** Compare JSON tokens keys. */ > +static int > +json_token_key_cmp(const struct json_token *a, const struct json_token *b) > +{ > + if (a->key.type != b->key.type) > + return a->key.type - b->key.type; > + int ret = 0; > + if (a->key.type == JSON_TOKEN_STR) { > + if (a->key.len != b->key.len) > + return a->key.len - b->key.len; > + ret = memcmp(a->key.str, b->key.str, a->key.len); > + } else if (a->key.type == JSON_TOKEN_NUM) { > + ret = a->key.num - b->key.num; > + } else { > + unreachable(); > + } > + return ret; > +} > + > +/** > + * Compare hash records of two json tree nodes. Return 0 if equal. > + */ > +static inline int > +mh_json_cmp(const struct json_token *a, const struct json_token *b) > +{ > + if (a->parent != b->parent) > + return a->parent - b->parent; > + return json_token_key_cmp(a, b); > +} Please merge the two functions - the more functions we have for doing roughly the same thing (token comparison in this case), the more difficult it gets to follow the code. I understand that you intend to use json_token_key_cmp (without parent comparison) to compare paths in a future patch, but IMO you'd better nullify token->parent in json_path_cmp before token comparison instead of having two functions. The resulting function should be called json_token_cmp. > + > +#define MH_SOURCE 1 > +#define mh_name _json > +#define mh_key_t struct json_token ** > +#define mh_node_t struct json_token * > +#define mh_arg_t void * > +#define mh_hash(a, arg) ((*a)->rolling_hash) > +#define mh_hash_key(a, arg) ((*a)->rolling_hash) > +#define mh_cmp(a, b, arg) (mh_json_cmp((*a), (*b))) > +#define mh_cmp_key(a, b, arg) mh_cmp(a, b, arg) > +#include "salad/mhash.h" > + > +static const uint32_t hash_seed = 13U; > + > +/** Compute the hash value of a JSON token. */ > +static uint32_t > +json_token_hash(struct json_token *token, uint32_t seed) You always pass token->parent->hash to this function so the 'seed' argument is not really needed. > +{ > + uint32_t h = seed; > + uint32_t carry = 0; > + const void *data; > + uint32_t data_size; > + if (token->key.type == JSON_TOKEN_STR) { > + data = token->key.str; > + data_size = token->key.len; > + } else if (token->key.type == JSON_TOKEN_NUM) { > + data = &token->key.num; > + data_size = sizeof(token->key.num); > + } else { > + unreachable(); > + } > + PMurHash32_Process(&h, &carry, data, data_size); > + return PMurHash32_Result(h, carry, data_size); > +} > + > +int > +json_tree_create(struct json_tree *tree) > +{ > + memset(tree, 0, sizeof(struct json_tree)); > + tree->root.rolling_hash = hash_seed; > + tree->root.key.type = JSON_TOKEN_END; > + tree->hash = mh_json_new(); > + return tree->hash == NULL ? -1 : 0; > +} > + > +static void > +json_token_destroy(struct json_token *token) > +{ > + free(token->children); > +} > + > +void > +json_tree_destroy(struct json_tree *tree) > +{ > + assert(tree->hash != NULL); This is a rather pointless assertion as mh_json_delete will crash anyway if tree->hash is NULL. Besides, how would anyone ever step on it? By passing an uninitialized json_tree to this function? This will hardly ever happen. OTOH it would be good to ensure that the tree is empty here, because this is a subtle part of the function protocol. > + json_token_destroy(&tree->root); > + mh_json_delete(tree->hash); > +} > + > +struct json_token * > +json_tree_lookup(struct json_tree *tree, struct json_token *parent, > + struct json_token *token) 'token' should be marked const here. > +{ > + if (parent == NULL) > + parent = &tree->root; I'd rather require the caller to pass tree->root explicitly. This would save us a comparison. > + struct json_token *ret = NULL; > + if (token->key.type == JSON_TOKEN_STR) { > + struct json_token key = *token; Please don't assign a whole object when you need just a few fields: struct json_token key; key.str = token->str; key.len = token->len; key.parent = parent; key.hash = json_token_hash(token); This is a warm path after all. > + key.rolling_hash = json_token_hash(token, parent->rolling_hash); > + key.parent = parent; > + token = &key; > + mh_int_t id = mh_json_find(tree->hash, &token, NULL); > + if (unlikely(id == mh_end(tree->hash))) unlikely() isn't needed here - there are plenty of function calls so it will hardly make any difference. Besides, why would one expect that the key is likely to be present in the tree? > + return NULL; > + struct json_token **entry = mh_json_node(tree->hash, id); > + assert(entry == NULL || (*entry)->parent == parent); > + return entry != NULL ? *entry : NULL; > + } else if (token->key.type == JSON_TOKEN_NUM) { > + uint32_t idx = token->key.num - 1; > + ret = idx < parent->child_size ? parent->children[idx] : NULL; May be, it's worth making this case (JSON_TOKEN_NUM) inline, to save a function call in case of a top-level tuple field lookup? BTW likely() would be appropriate there. > + } else { > + unreachable(); > + } > + return ret; > +} > + > +int > +json_tree_add(struct json_tree *tree, struct json_token *parent, > + struct json_token *token) > +{ > + if (parent == NULL) > + parent = &tree->root; > + uint32_t rolling_hash = > + json_token_hash(token, parent->rolling_hash); > + assert(json_tree_lookup(tree, parent, token) == NULL); > + uint32_t insert_idx = (token->key.type == JSON_TOKEN_NUM) ? > + (uint32_t)token->key.num - 1 : > + parent->child_size; > + if (insert_idx >= parent->child_size) { > + uint32_t new_size = > + parent->child_size == 0 ? 1 : 2 * parent->child_size; > + while (insert_idx >= new_size) > + new_size *= 2; > + struct json_token **children = > + realloc(parent->children, new_size*sizeof(void *)); > + if (unlikely(children == NULL)) Useless unlikely(). > + return -1; > + memset(children + parent->child_size, 0, > + (new_size - parent->child_size)*sizeof(void *)); > + parent->children = children; > + parent->child_size = new_size; Ouch, this looks much more complex than it was in the previous version. Please revert. Quoting myself: } > + if (insert_idx >= parent->children_count) { } > + uint32_t new_size = insert_idx + 1; } } We usually double the size with each allocation. If this is intentional, } please add a comment. I didn't push you to change that. I just wanted you to add a comment saying that we can afford quadratic algorithmic complexity here, because this function is far from a hot path and we expect the tree to be rather small. > + } > + assert(parent->children[insert_idx] == NULL); > + parent->children[insert_idx] = token; > + parent->child_count = MAX(parent->child_count, insert_idx + 1); > + token->sibling_idx = insert_idx; > + token->rolling_hash = rolling_hash; > + token->parent = parent; > + > + const struct json_token **key = > + (const struct json_token **)&token; > + mh_int_t rc = mh_json_put(tree->hash, key, NULL, NULL); > + if (unlikely(rc == mh_end(tree->hash))) { Again, unlikely() is pointless here. > + parent->children[insert_idx] = NULL; > + return -1; > + } > + assert(json_tree_lookup(tree, parent, token) == token); > + return 0; > +} > + > +void > +json_tree_del(struct json_tree *tree, struct json_token *token) > +{ > + struct json_token *parent = token->parent; > + assert(json_tree_lookup(tree, parent, token) == token); > + struct json_token **child_slot = NULL; > + if (token->key.type == JSON_TOKEN_NUM) { > + child_slot = &parent->children[token->key.num - 1]; > + } else { > + uint32_t idx = 0; > + while (idx < parent->child_size && > + parent->children[idx] != token) { idx++; } What's this loop for? Can't you just use sibling_idx here? > + if (idx < parent->child_size && > + parent->children[idx] == token) Indentation. > + child_slot = &parent->children[idx]; > + } > + assert(child_slot != NULL && *child_slot == token); > + *child_slot = NULL; > + > + mh_int_t id = mh_json_find(tree->hash, &token, NULL); > + assert(id != mh_end(tree->hash)); > + mh_json_del(tree->hash, id, NULL); > + json_token_destroy(token); > + assert(json_tree_lookup(tree, parent, token) == NULL); > +} > + > +struct json_token * > +json_tree_lookup_path(struct json_tree *tree, struct json_token *parent, > + const char *path, uint32_t path_len) > +{ > + int rc; > + struct json_lexer lexer; > + struct json_token token; > + struct json_token *ret = parent != NULL ? parent : &tree->root; > + json_lexer_create(&lexer, path, path_len); > + while ((rc = json_lexer_next_token(&lexer, &token)) == 0 && > + token.key.type != JSON_TOKEN_END && ret != NULL) { > + ret = json_tree_lookup(tree, ret, &token); > + } > + if (rc != 0 || token.key.type != JSON_TOKEN_END) > + return NULL; > + return ret; > +} > + > +static struct json_token * > +json_tree_child_next(struct json_token *parent, struct json_token *pos) > +{ > + assert(pos == NULL || pos->parent == parent); > + struct json_token **arr = parent->children; > + uint32_t arr_size = parent->child_size; > + if (arr == NULL) > + return NULL; > + uint32_t idx = pos != NULL ? pos->sibling_idx + 1 : 0; > + while (idx < arr_size && arr[idx] == NULL) > + idx++; This function may be called many times from json_tree_foreach helper. The latter is invoked at each tuple creation so we must strive to make it as fast as we can. Now, suppose you have JSON arrays of 100 elements with the last one indexed. You'll be iterating over 99 elements for nothing. Is it really that bad? I'm not sure, because the iteration should finish pretty quickly, but it looks rather ugly to me. A suggestion: may be, we could use rlist, as you did it before, for iteration and the children array for lookup in an array? That is the children array wouldn't be allocated at all for JSON_TOKEN_STR, only for JSON_TOKEN_NUM, and it would be used solely for quick access, not for iteration. BTW sibling_idx wouldn't be needed then. Would it make sense? > + if (idx >= arr_size) > + return NULL; > + return arr[idx]; > +} > + > +static struct json_token * > +json_tree_leftmost(struct json_token *pos) > +{ > + struct json_token *last; > + do { > + last = pos; > + pos = json_tree_child_next(pos, NULL); > + } while (pos != NULL); > + return last; > +} > + > +struct json_token * > +json_tree_preorder_next(struct json_token *root, struct json_token *pos) > +{ > + if (pos == NULL) > + pos = root; > + struct json_token *next = json_tree_child_next(pos, NULL); > + if (next != NULL) > + return next; > + while (pos != root) { > + next = json_tree_child_next(pos->parent, pos); > + if (next != NULL) > + return next; > + pos = pos->parent; > + } > + return NULL; > +} > + > +struct json_token * > +json_tree_postorder_next(struct json_token *root, struct json_token *pos) > +{ > + struct json_token *next; > + if (pos == NULL) { > + next = json_tree_leftmost(root); > + return next != root ? next : NULL; > + } > + if (pos == root) > + return NULL; > + next = json_tree_child_next(pos->parent, pos); > + if (next != NULL) { > + next = json_tree_leftmost(next); > + return next != root ? next : NULL; > + } > + return pos->parent != root ? pos->parent : NULL; > +} > diff --git a/src/lib/json/json.h b/src/lib/json/json.h > index ead446878..2bc159ff8 100644 > --- a/src/lib/json/json.h > +++ b/src/lib/json/json.h > @@ -61,20 +61,53 @@ enum json_token_type { > /** > * Element of a JSON path. It can be either string or number. > * String idenfiers are in ["..."] and between dots. Numbers are > - * indexes in [...]. > + * indexes in [...]. May be organized in a JSON tree. This is such a lame comment :-( Can we say a few words here about the json_token lifetime: how it is first filled by lexer, how it then can be passed to json_tree_add to form a tree, how the tree is organized... May be, even add a code snippet to the comment. > */ > struct json_token { > - enum json_token_type type; > - union { > - struct { > - /** String identifier. */ > - const char *str; > - /** Length of @a str. */ > - int len; > + struct { > + enum json_token_type type; > + union { > + struct { > + /** String identifier. */ > + const char *str; > + /** Length of @a str. */ > + int len; > + }; > + /** Index value. */ > + uint64_t num; > }; > - /** Index value. */ > - uint64_t num; > - }; > + } key; After some consideration, I don't think that it's worth moving the token identifier to anonymous struct 'key' - it doesn't conflict with other members and using 'num' or 'str/len' directly is shorter than 'key.num' and 'key.str/len'. Also, the 'type' still has type json_token_type, not json_token_key_type, which suggests that it should be under token.type, not under token.key.type. > + /** Rolling hash for node used to lookup in json_tree. */ Quoting myself: } My another concern is comments to the code and function members. They } look better than they used to be, but still not perfect. E.g. take this } comment: } } > + /** } > + * Rolling hash for node calculated with } > + * json_path_node_hash(key, parent). } > + */ } > + uint32_t rolling_hash; } } It took me a while to realize how you calculate rolling hash - I had to } dive deep into the implementation. Alas, the new comment isn't a bit better than the previous one. > + uint32_t rolling_hash; Let's call it simply 'hash', short and clear. The rolling nature of the hash should be explained in the comment. > + /** > + * Array of child records. Indexes in this array > + * follows array indexe-1 for JSON_TOKEN_NUM token type typo: indexe -> index BTW, json array start indexing from 0, not 1 AFAIK. Starting indexing from 1 looks weird to me. > + * and are allocated sequently for JSON_TOKEN_NUM child typo: sequently -> sequentially Please enable spell checker. > + * tokens. NULLs initializations are performed with new > + * entries allocation. > + */ > + struct json_token **children; > + /** Allocation size of children array. */ > + uint32_t child_size; > + /** > + * Count of defined children array items. Equal the > + * maximum index of inserted item. > + */ > + uint32_t child_count; > + /** Index of node in parent children array. */ > + uint32_t sibling_idx; > + /** Pointer to parent node. */ > + struct json_token *parent; > +}; > + > +struct mh_json_t; > + > +/** JSON tree object to manage tokens relations. */ > +struct json_tree { > + /** JSON tree root node. */ > + struct json_token root; > + /** Hashtable af all tree nodes. */ How does it work? In a year nobody will remember, and to understand how this thing operates we will have to dive deep into the code... > + struct mh_json_t *hash; > }; > > /** > @@ -104,6 +137,147 @@ json_lexer_create(struct json_lexer *lexer, const char *src, int src_len) > int > json_lexer_next_token(struct json_lexer *lexer, struct json_token *token); > > +/** Create a JSON tree object to manage data relations. */ > +int > +json_tree_create(struct json_tree *tree); > + > +/** > + * Destroy JSON tree object. This routine doesn't destroy attached > + * subtree so it should be called at the end of manual destroy. > + */ > +void > +json_tree_destroy(struct json_tree *tree); > + > +/** > + * Make child lookup in JSON tree by token at position specified > + * with parent. The parent may be set NULL to use tree root > + * record. > + */ > +struct json_token * > +json_tree_lookup(struct json_tree *tree, struct json_token *parent, > + struct json_token *token); > + > +/** > + * Append token to the given parent position in a JSON tree. The > + * parent mustn't have a child with such content. The parent may > + * be set NULL to use tree root record. > + */ > +int > +json_tree_add(struct json_tree *tree, struct json_token *parent, > + struct json_token *token); > + > +/** > + * Delete a JSON tree token at the given parent position in JSON > + * tree. Token entry shouldn't have subtree. > + */ > +void > +json_tree_del(struct json_tree *tree, struct json_token *token); > + > +/** Make child lookup by path in JSON tree. */ > +struct json_token * > +json_tree_lookup_path(struct json_tree *tree, struct json_token *parent, > + const char *path, uint32_t path_len); > + > +/** Make pre order traversal in JSON tree. */ > +struct json_token * > +json_tree_preorder_next(struct json_token *root, struct json_token *pos); > + > +/** Make post order traversal in JSON tree. */ > +struct json_token * > +json_tree_postorder_next(struct json_token *root, struct json_token *pos); It's definitely worth mentioning that these functions don't visit the root node. > + > +/** > + * Make safe post-order traversal in JSON tree. > + * May be used for destructors. > + */ > +#define json_tree_foreach_safe(node, root) \ > +for (struct json_token *__next = json_tree_postorder_next((root), NULL); \ > + (((node) = __next) && \ > + (__next = json_tree_postorder_next((root), (node))), (node) != NULL);) IMHO it's rather difficult for understanding. Can we rewrite it so that it uses for() loop as expected, i.e. initializes the loop variable in the first part, checks for a loop condition in the second part, and advances the loop variable in the third part? Something like: #define json_tree_foreach_safe(curr, root, next) for (curr = json_tree_postorder_next(root, NULL), next = curr ? json_tree_postorder_next(root, curr) : NULL; curr != NULL; curr = next, next = curr ? json_tree_postorder_next(root, curr) : NULL) > + > +#ifndef typeof > +/* TODO: 'typeof' is a GNU extension */ > +#define typeof __typeof__ > +#endif > + > +/** Return container entry by json_tree_node node. */ > +#define json_tree_entry(node, type, member) ({ \ > + const typeof( ((type *)0)->member ) *__mptr = (node); \ > + (type *)( (char *)__mptr - ((size_t) &((type *)0)->member) ); \ Use container_of please. > +}) > + > +/** > + * Return container entry by json_tree_node or NULL if > + * node is NULL. > + */ > +#define json_tree_entry_safe(node, type, member) ({ \ > + (node) != NULL ? json_tree_entry((node), type, member) : NULL; \ > +}) > + > +/** Make entry pre order traversal in JSON tree */ > +#define json_tree_preorder_next_entry(node, root, type, member) ({ \ > + struct json_token *__next = \ > + json_tree_preorder_next((root), (node)); \ > + json_tree_entry_safe(__next, type, member); \ > +}) > + > +/** Make entry post order traversal in JSON tree */ > +#define json_tree_postorder_next_entry(node, root, type, member) ({ \ > + struct json_token *__next = \ > + json_tree_postorder_next((root), (node)); \ > + json_tree_entry_safe(__next, type, member); \ > +}) > + > +/** Make lookup in tree by path and return entry. */ > +#define json_tree_lookup_path_entry(tree, parent, path, path_len, type, \ > + member) \ > +({struct json_token *__node = \ > + json_tree_lookup_path((tree), (parent), path, path_len); \ > + json_tree_entry_safe(__node, type, member); }) > + > +/** Make lookup in tree by token and return entry. */ > +#define json_tree_lookup_entry(tree, parent, token, type, member) \ > +({struct json_token *__node = \ > + json_tree_lookup((tree), (parent), token); \ Sometimes you wrap macro arguments in parentheses (tree, parent), sometimes you don't (token, path, path_len). Please fix. > + json_tree_entry_safe(__node, type, member); \ > +}) > + > +/** Make pre-order traversal in JSON tree. */ > +#define json_tree_foreach_preorder(node, root) \ > +for ((node) = json_tree_preorder_next((root), NULL); (node) != NULL; \ > + (node) = json_tree_preorder_next((root), (node))) > + > +/** Make post-order traversal in JSON tree. */ > +#define json_tree_foreach_postorder(node, root) \ > +for ((node) = json_tree_postorder_next((root), NULL); (node) != NULL; \ > + (node) = json_tree_postorder_next((root), (node))) > + > +/** Make post-order traversal in JSON tree and return entry. */ > +#define json_tree_foreach_entry_preorder(node, root, type, member) \ > +for ((node) = json_tree_preorder_next_entry(NULL, (root), type, member); \ > + (node) != NULL; \ > + (node) = json_tree_preorder_next_entry(&(node)->member, (root), type, \ > + member)) > + > +/** Make pre-order traversal in JSON tree and return entry. */ > +#define json_tree_foreach_entry_postorder(node, root, type, member) \ > +for ((node) = json_tree_postorder_next_entry(NULL, (root), type, member); \ > + (node) != NULL; \ > + (node) = json_tree_postorder_next_entry(&(node)->member, (root), type, \ > + member)) > + > +/** > + * Make secure post-order traversal in JSON tree and return entry. > + */ > +#define json_tree_foreach_entry_safe(node, root, type, member) \ > +for (type *__next = json_tree_postorder_next_entry(NULL, (root), type, \ > + member); \ > + (((node) = __next) && \ > + (__next = json_tree_postorder_next_entry(&(node)->member, (root), type, \ > + member)), \ > + (node) != NULL);) > + > + > #ifdef __cplusplus > } > #endif > diff --git a/test/unit/json_path.c b/test/unit/json_path.c > index a5f90ad98..f6b0472f4 100644 > --- a/test/unit/json_path.c > +++ b/test/unit/json_path.c > @@ -159,14 +160,207 @@ test_errors() > footer(); > } > > +struct test_struct { > + int value; > + struct json_token node; > +}; > + > +struct test_struct * > +test_struct_alloc(struct test_struct *records_pool, int *pool_idx) > +{ > + struct test_struct *ret = &records_pool[*pool_idx]; > + *pool_idx = *pool_idx + 1; > + memset(&ret->node, 0, sizeof(ret->node)); > + return ret; > +} > + > +struct test_struct * > +test_add_path(struct json_tree *tree, const char *path, uint32_t path_len, > + struct test_struct *records_pool, int *pool_idx) > +{ > + int rc; > + struct json_lexer lexer; > + struct json_token *parent = NULL; > + json_lexer_create(&lexer, path, path_len); > + struct test_struct *field = test_struct_alloc(records_pool, pool_idx); > + while ((rc = json_lexer_next_token(&lexer, &field->node)) == 0 && > + field->node.key.type != JSON_TOKEN_END) { > + struct json_token *next = > + json_tree_lookup(tree, parent, &field->node); > + if (next == NULL) { > + rc = json_tree_add(tree, parent, &field->node); > + fail_if(rc != 0); > + next = &field->node; > + field = test_struct_alloc(records_pool, pool_idx); > + } > + parent = next; > + } > + fail_if(rc != 0 || field->node.key.type != JSON_TOKEN_END); > + *pool_idx = *pool_idx - 1; > + /* release field */ > + return json_tree_entry(parent, struct test_struct, node); > +} > + > +void > +test_tree() > +{ > + header(); > + plan(35); > + > + struct json_tree tree; > + int rc = json_tree_create(&tree); > + fail_if(rc != 0); > + > + struct test_struct records[6]; > + for (int i = 0; i < 6; i++) > + records[i].value = i; > + > + const char *path1 = "[1][10]"; > + const char *path2 = "[1][20].file"; > + const char *path_unregistered = "[1][3]"; Please add more paths. I'd like to see the following cases covered as well: - JSON_TOKEN_STR as an intermediate node - JSON_TOKEN_STR as a common node for two paths - coinciding paths - one path is a sub-path of another