[PATCH v5 05/12] lib: implement JSON tree class for json library

Vladimir Davydov vdavydov.dev at gmail.com
Tue Nov 20 19:43:16 MSK 2018


On Mon, Oct 29, 2018 at 09:56:37AM +0300, Kirill Shcherbatov wrote:
> New JSON tree class would store JSON paths for tuple fields
> for registered non-plain indexes. It is a hierarchical data
> structure that organize JSON nodes produced by parser.
> Class provides API to lookup node by path and iterate over the
> tree.
> JSON Indexes patch require such functionality to make lookup
> for tuple_fields by path, make initialization of field map and
> build vynyl_stmt msgpack for secondary index via JSON tree
> iteration.
> 
> Need for #1012
> ---
>  src/lib/json/CMakeLists.txt |   2 +
>  src/lib/json/tree.c         | 327 ++++++++++++++++++++++++++++++++++++++++++++
>  src/lib/json/tree.h         | 224 ++++++++++++++++++++++++++++++
>  test/unit/json_path.c       | 211 +++++++++++++++++++++++++++-
>  test/unit/json_path.result  |  42 +++++-
>  5 files changed, 804 insertions(+), 2 deletions(-)
>  create mode 100644 src/lib/json/tree.c
>  create mode 100644 src/lib/json/tree.h

First, I agree with Kostja that naming looks confusing. Please try to
come up with better names and send them to us for review before
reworking the patch so that we could agree first.

Also, once we agree, don't forget to fix comments, function and local
variable names. For example, if you decide to rename json_path_node to
json_path_token and json_path_parser to json_path_tokenizer, you should
also rename all local variables referring to them respectively, e.g.
node => token, parser => tokenizer and so forth. Please avoid calling
local variable 'entry' if the struct is called json_path_node or vice
versa, in other words be consistent.

My another concern is comments to the code and function members. They
look better than they used to be, but still not perfect. E.g. take this
comment:

> +	/**
> +	 * Rolling hash for node calculated with
> +	 * json_path_node_hash(key, parent).
> +	 */
> +	uint32_t rolling_hash;

It took me a while to realize how you calculate rolling hash - I had to
dive deep into the implementation.

Another example:

> +	/** Array of child records. Match indexes. */
> +	struct json_tree_node **children;

What is the order of entries in this array for a json array/map?

Please be more thorough when writing comments. Try to explain things
that might not be obvious to the reader. There are quite a few typos
and grammar mistakes, too. I don't want to go over every place in the
code - please try to do some self-review.

Plus, see a few comments inline.

> +/**
> + * Compare hash records of two json tree nodes. Return 0 if equal.
> + */
> +static inline int
> +mh_json_tree_node_cmp(const struct mh_json_tree_node *a,
> +		      const struct mh_json_tree_node *b)
> +{
> +	if (a->key.type != b->key.type)
> +		return a->key.type - b->key.type;
> +	if (a->parent != b->parent)
> +		return a->parent - b->parent;
> +	if (a->key.type == JSON_PATH_STR) {
> +		if (a->key.len != b->key.len)
> +			return a->key.len - b->key.len;
> +		return memcmp(a->key.str, b->key.str, a->key.len);
> +	} else if (a->key.type == JSON_PATH_NUM) {
> +		return a->key_hash - b->key_hash;
> +	}
> +	unreachable();

unreachable() may turn into nothing in release mode, in which case the
compiler will probably complain that the return value is missing.

> +}
> +
> +#define MH_SOURCE 1
> +#define mh_name _json_tree_node
> +#define mh_key_t struct mh_json_tree_node *
> +#define mh_node_t struct mh_json_tree_node
> +#define mh_arg_t void *
> +#define mh_hash(a, arg) ((a)->key_hash)
> +#define mh_hash_key(a, arg) ((a)->key_hash)
> +#define mh_cmp(a, b, arg) (mh_json_tree_node_cmp((a), (b)))
> +#define mh_cmp_key(a, b, arg) mh_cmp(a, b, arg)
> +#include "salad/mhash.h"
> +
> +static const uint32_t json_path_node_hash_seed = 13U;
> +
> +uint32_t
> +json_path_node_hash(struct json_path_node *key, uint32_t seed)
> +{
> +	uint32_t h = seed;
> +	uint32_t carry = 0;
> +	const void *data;
> +	uint32_t data_size;
> +	if (key->type == JSON_PATH_STR) {
> +		data = key->str;
> +		data_size = key->len;
> +	} else if (key->type == JSON_PATH_NUM) {
> +		data = &key->num;
> +		data_size = sizeof(key->num);
> +	} else {
> +		unreachable();
> +	}
> +	PMurHash32_Process(&h, &carry, data, data_size);
> +	return PMurHash32_Result(h, carry, data_size);
> +}
> +
> +int
> +json_tree_create(struct json_tree *tree)
> +{
> +	memset(tree, 0, sizeof(struct json_tree));
> +	tree->root.rolling_hash = json_path_node_hash_seed;
> +	tree->root.key.type = JSON_PATH_END;
> +	tree->hash = mh_json_tree_node_new();

mh_json_tree_node_new() - but it's not a node, it's a hash table.
Please fix the name.

May be, it's worth allocating the hash table on demand? It'd simplify
the API.

> +	if (unlikely(tree->hash == NULL))

Please don't use 'unlikely' on cold paths - it is meaningless.

> +		return -1;
> +	return 0;
> +}
> +
> +void
> +json_tree_destroy(struct json_tree *tree)
> +{
> +	assert(tree->hash != NULL);
> +	json_tree_node_destroy(&tree->root);
> +	mh_json_tree_node_delete(tree->hash);
> +}
> +
> +void
> +json_tree_node_create(struct json_tree_node *node,
> +		      struct json_path_node *path_node)
> +{
> +	memset(node, 0, sizeof(struct json_tree_node));
> +	node->key = *path_node;
> +}
> +
> +void
> +json_tree_node_destroy(struct json_tree_node *node)
> +{
> +	free(node->children);
> +}
> +
> +struct json_tree_node *
> +json_tree_lookup_by_path_node(struct json_tree *tree,
> +			      struct json_tree_node *parent,
> +			      struct json_path_node *path_node,
> +			      uint32_t rolling_hash)
> +{
> +	assert(parent != NULL);
> +	assert(rolling_hash == json_path_node_hash(path_node,
> +						   parent->rolling_hash));
> +	struct mh_json_tree_node info;
> +	info.key = *path_node;
> +	info.key_hash = rolling_hash;
> +	info.parent = parent;
> +	mh_int_t id = mh_json_tree_node_find(tree->hash, &info, NULL);
> +	if (unlikely(id == mh_end(tree->hash)))
> +		return NULL;

Wouldn't it be worth looking up in the children array directly in case
of JSON_PATH_NUM? BTW, then we wouldn't have to access 'children'
directly in tuple_format_field() (next patch) thus violating
encapsulation - instead we could use this helper there as well.

> +	struct mh_json_tree_node *ht_node =
> +		mh_json_tree_node_node(tree->hash, id);
> +	assert(ht_node == NULL || ht_node->node != NULL);
> +	struct json_tree_node *ret = ht_node != NULL ? ht_node->node : NULL;
> +	assert(ret == NULL || ret->parent == parent);
> +	return ret;
> +}
> +
> +
> +int
> +json_tree_add(struct json_tree *tree, struct json_tree_node *parent,
> +	      struct json_tree_node *node, uint32_t rolling_hash)
> +{
> +	assert(parent != NULL);
> +	assert(node->parent == NULL);
> +	assert(rolling_hash ==
> +	       json_path_node_hash(&node->key, parent->rolling_hash));
> +	assert(json_tree_lookup_by_path_node(tree, parent, &node->key,
> +					     rolling_hash) == NULL);
> +	uint32_t insert_idx = (node->key.type == JSON_PATH_NUM) ?
> +			      (uint32_t)node->key.num - 1 :
> +			      parent->children_count;
> +	if (insert_idx >= parent->children_count) {
> +		uint32_t new_size = insert_idx + 1;

We usually double the size with each allocation. If this is intentional,
please add a comment.

> +		struct json_tree_node **children =
> +			realloc(parent->children, new_size*sizeof(void *));

Coding style: spaces missing.

> +		if (unlikely(children == NULL))
> +			return -1;
> +		memset(children + parent->children_count, 0,
> +		       (new_size - parent->children_count)*sizeof(void *));
> +		parent->children = children;
> +		parent->children_count = new_size;
> +	}
> +	assert(parent->children[insert_idx] == NULL);
> +	parent->children[insert_idx] = node;
> +	node->sibling_idx = insert_idx;
> +	node->rolling_hash = rolling_hash;
> +
> +	struct mh_json_tree_node ht_node;
> +	ht_node.key = node->key;
> +	ht_node.key_hash = rolling_hash;
> +	ht_node.node = node;
> +	ht_node.parent = parent;
> +	mh_int_t rc = mh_json_tree_node_put(tree->hash, &ht_node, NULL, NULL);
> +	if (unlikely(rc == mh_end(tree->hash))) {
> +		parent->children[insert_idx] = NULL;
> +		return -1;
> +	}
> +	node->parent = parent;
> +	assert(json_tree_lookup_by_path_node(tree, parent, &node->key,
> +					     rolling_hash) == node);
> +	return 0;
> +}
> +
> +void
> +json_tree_remove(struct json_tree *tree, struct json_tree_node *parent,
> +		 struct json_tree_node *node, uint32_t rolling_hash)

I don't think you really need this function - it's not used anywhere in
the following patches.

> diff --git a/src/lib/json/tree.h b/src/lib/json/tree.h
> new file mode 100644
> index 0000000..6166024
> --- /dev/null
> +++ b/src/lib/json/tree.h
> @@ -0,0 +1,224 @@
> +#ifndef TARANTOOL_JSON_TREE_H_INCLUDED
> +#define TARANTOOL_JSON_TREE_H_INCLUDED
> +/*
> + * Copyright 2010-2018 Tarantool AUTHORS: please see AUTHORS file.
> + *
> + * Redistribution and use in source and binary forms, with or
> + * without modification, are permitted provided that the following
> + * conditions are met:
> + *
> + * 1. Redistributions of source code must retain the above
> + *    copyright notice, this list of conditions and the
> + *    following disclaimer.
> + *
> + * 2. Redistributions in binary form must reproduce the above
> + *    copyright notice, this list of conditions and the following
> + *    disclaimer in the documentation and/or other materials
> + *    provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
> + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
> + * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
> + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
> + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + */
> +#include <stdbool.h>
> +#include <stdint.h>
> +#include "small/rlist.h"

rlist isn't used anymore

> +#include "path.h"
> +#include <stdio.h>
> +#include <inttypes.h>

why do you need inttypes?

> +#include <stdint.h>

stdint is included twice

Please cleanup the headers.

> +#include <stdlib.h>
> +
> +#ifdef __cplusplus
> +extern "C"
> +{
> +#endif
> +
> +struct mh_json_tree_node_t;
> +
> +/**
> + * JSON tree is a hierarchical data structure that organize JSON
> + * nodes produced by parser. Key record point to source strings
> + * memory.
> + */
> +struct json_tree_node {
> +	/** JSON path node produced by json_next_token. */
> +	struct json_path_node key;
> +	/**
> +	 * Rolling hash for node calculated with
> +	 * json_path_node_hash(key, parent).
> +	 */
> +	uint32_t rolling_hash;
> +	/** Array of child records. Match indexes. */
> +	struct json_tree_node **children;
> +	/** Size of children array. */
> +	uint32_t children_count;
> +	/** Index of node in parent children array. */
> +	uint32_t sibling_idx;
> +	/** Pointer to parent node. */
> +	struct json_tree_node *parent;
> +};
> +
> +/** JSON tree object managing data relations. */
> +struct json_tree {
> +	/** JSON tree root node. */
> +	struct json_tree_node root;
> +	/** Hashtable af all tree nodes. */
> +	struct mh_json_tree_node_t *hash;
> +};
> +
> +/** Create a JSON tree object to manage data relations. */
> +int
> +json_tree_create(struct json_tree *tree);
> +
> +/**
> + * Destroy JSON tree object. This routine doesn't  subtree so
> + * should be called at the end of it's manual destruction.
> + */
> +void
> +json_tree_destroy(struct json_tree *tree);
> +
> +/** Compute the hash value of a JSON path component. */
> +uint32_t
> +json_path_node_hash(struct json_path_node *key, uint32_t seed);
> +
> +/** Init a JSON tree node. */
> +void
> +json_tree_node_create(struct json_tree_node *node,
> +		      struct json_path_node *path_node);
> +
> +/** Destroy a JSON tree node. */
> +void
> +json_tree_node_destroy(struct json_tree_node *node);
> +
> +/**
> + * Make child lookup in tree by path_node at position specified
> + * with parent. Rolling hash should be calculated for path_node
> + * and parent with json_path_node_hash.
> + */
> +struct json_tree_node *
> +json_tree_lookup_by_path_node(struct json_tree *tree,
> +			      struct json_tree_node *parent,
> +			      struct json_path_node *path_node,
> +			      uint32_t rolling_hash);

I really don't like that the caller has to pass rolling_hash explicitly
as this obfuscates the API. Judging by the following patches, you always
compute it as json_path_node_hash(path_node, parent->rolling_hash) so
why not just hide it under the hood? Same goes for json_tree_add() and
json_tree_remove().

Also, I'd rather call this function simply json_tree_lookup().

> +
> +/**
> + * Append node to the given parent position in JSON tree. The
> + * parent mustn't have a child with such content. Rolling
> + * hash should be calculated for path_node and parent with
> + * json_path_node_hash.
> + */
> +int
> +json_tree_add(struct json_tree *tree, struct json_tree_node *parent,
> +	      struct json_tree_node *node, uint32_t rolling_hash);
> +
> +/**
> + * Delete a JSON tree node at the given parent position in JSON
> + * tree. The parent node must have such child. Rolling hash should
> + * be calculated for path_node and parent with
> + * json_path_node_hash.
> + */
> +void
> +json_tree_remove(struct json_tree *tree, struct json_tree_node *parent,
> +		 struct json_tree_node *node, uint32_t rolling_hash);
> +
> +/** Make child lookup by path in JSON tree. */
> +struct json_tree_node *
> +json_tree_lookup_by_path(struct json_tree *tree, struct json_tree_node *parent,
> +			 const char *path, uint32_t path_len);
> +
> +/** Make pre order traversal in JSON tree. */
> +struct json_tree_node *
> +json_tree_next_pre(struct json_tree_node *parent,
> +		   struct json_tree_node *pos);
> +
> +/** Make post order traversal in JSON tree. */
> +struct json_tree_node *
> +json_tree_next_post(struct json_tree_node *parent,
> +		    struct json_tree_node *pos);
> +
> +/** Return entry by json_tree_node item. */
> +#define json_tree_entry(item, type, member) ({				     \
> +	const typeof( ((type *)0)->member ) *__mptr = (item);		     \
> +	(type *)( (char *)__mptr - ((size_t) &((type *)0)->member) ); })
> +
> +/** Return entry by json_tree_node item or NULL if item is NULL.*/
> +#define json_tree_entry_safe(item, type, member) ({			     \
> +	(item) != NULL ? json_tree_entry((item), type, member) : NULL; })
> +
> +/** Make lookup in tree by path and return entry. */
> +#define json_tree_lookup_entry_by_path(tree, parent, path, path_len, type,   \
> +				       member)				     \
> +({struct json_tree_node *__tree_node =					     \
> +	json_tree_lookup_by_path((tree), (parent), path, path_len);	     \
> + __tree_node != NULL ? json_tree_entry(__tree_node, type, member) : NULL; })
> +
> +/** Make lookup in tree by path_node and return entry. */
> +#define json_tree_lookup_entry_by_path_node(tree, parent, path_node,	     \
> +					    path_node_hash, type, member)    \
> +({struct json_tree_node *__tree_node =					     \
> +	json_tree_lookup_by_path_node((tree), (parent), path_node,	     \
> +				      path_node_hash);			     \
> + __tree_node != NULL ? json_tree_entry(__tree_node, type, member) :	     \
> +		       NULL; })
> +
> +/** Make pre-order traversal in JSON tree. */
> +#define json_tree_foreach_pre(item, root)				     \
> +for ((item) = json_tree_next_pre((root), NULL); (item) != NULL;		     \
> +     (item) = json_tree_next_pre((root), (item)))
> +
> +/** Make post-order traversal in JSON tree. */
> +#define json_tree_foreach_post(item, root)				     \
> +for ((item) = json_tree_next_post((root), NULL); (item) != NULL;	     \
> +     (item) = json_tree_next_post((root), (item)))
> +
> +/**
> + * Make safe post-order traversal in JSON tree.
> + * Safe for building destructors.
> + */
> +#define json_tree_foreach_safe(item, root)				     \
> +for (struct json_tree_node *__iter = json_tree_next_post((root), NULL);      \
> +     (((item) = __iter) && (__iter = json_tree_next_post((root), (item))),   \
> +     (item) != NULL);)
> +
> +/** Make post-order traversal in JSON tree and return entry. */
> +#define json_tree_foreach_entry_pre(item, root, type, member)		     \
> +for (struct json_tree_node *__iter =					     \
> +     json_tree_next_pre((root), NULL);					     \
> +     __iter != NULL && ((item) = json_tree_entry(__iter, type, member));     \
> +     __iter = json_tree_next_pre((root), __iter))
> +
> +/** Make pre-order traversal in JSON tree and return entry. */
> +#define json_tree_foreach_entry_post(item, root, type, member)		     \
> +for (struct json_tree_node *__iter =					     \
> +     json_tree_next_post((root), NULL		);			     \
> +     __iter != NULL && ((item) = json_tree_entry(__iter, type, member));     \
> +     __iter = json_tree_next_post((root), __iter))
> +
> +/**
> + * Make secure post-order traversal in JSON tree and return entry.
> + */
> +#define json_tree_foreach_entry_safe(item, root, type, member)		     \
> +for (struct json_tree_node *__tmp_iter, *__iter =			     \
> +     json_tree_next_post((root), NULL);					     \
> +     __iter != NULL && ((item) = json_tree_entry(__iter, type, member)) &&   \
> +     (__iter != NULL && (__tmp_iter =					     \
> +			 json_tree_next_post((root), __iter))),		     \
> +     (__iter != NULL && ((item) = json_tree_entry(__iter, type, member)));   \
> +     __iter = __tmp_iter)

This is barely understandable. Please rewrite without introducing extra
loop variable, similarly to how rlist_foreach_entry_safe is implemented.
Also, we use tabs, not spaces for indentation.



More information about the Tarantool-patches mailing list