Tarantool development patches archive
 help / color / mirror / Atom feed
* [PATCH v1 1/1] lib: implement JSON tree class for json library
@ 2018-10-01 12:27 Kirill Shcherbatov
  2018-10-01 21:34 ` Vladimir Davydov
  2018-10-04 22:54 ` [tarantool-patches] " Konstantin Osipov
  0 siblings, 2 replies; 8+ messages in thread
From: Kirill Shcherbatov @ 2018-10-01 12:27 UTC (permalink / raw)
  To: tarantool-patches; +Cc: vdavydov.dev, Kirill Shcherbatov

New JSON tree class would store JSON paths for tuple fields
for registered non-plain indexes.

Part of #1012.
---
Branch: http://github.com/tarantool/tarantool/tree/kshch/gh-1012-json-tree-class
Issue: https://github.com/tarantool/tarantool/issues/1012

 src/lib/json/CMakeLists.txt |   2 +
 src/lib/json/tree.c         | 272 ++++++++++++++++++++++++++++++++++++++++++++
 src/lib/json/tree.h         | 207 +++++++++++++++++++++++++++++++++
 test/unit/json_path.c       | 210 +++++++++++++++++++++++++++++++++-
 4 files changed, 690 insertions(+), 1 deletion(-)
 create mode 100644 src/lib/json/tree.c
 create mode 100644 src/lib/json/tree.h

diff --git a/src/lib/json/CMakeLists.txt b/src/lib/json/CMakeLists.txt
index 203fe6f..9702b79 100644
--- a/src/lib/json/CMakeLists.txt
+++ b/src/lib/json/CMakeLists.txt
@@ -1,6 +1,8 @@
 set(lib_sources
     path.c
+    tree.c
 )
 
 set_source_files_compile_flags(${lib_sources})
 add_library(json_path STATIC ${lib_sources})
+target_link_libraries(json_path misc salad small)
diff --git a/src/lib/json/tree.c b/src/lib/json/tree.c
new file mode 100644
index 0000000..a393d5b
--- /dev/null
+++ b/src/lib/json/tree.c
@@ -0,0 +1,272 @@
+
+/*
+ * Copyright 2010-2018 Tarantool AUTHORS: please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the
+ *    following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdbool.h>
+#include <unicode/uchar.h>
+#include <unicode/utf8.h>
+
+#include "tree.h"
+#include "path.h"
+#include "third_party/PMurHash.h"
+
+/** Hash record representing json_tree_node. */
+struct mh_json_tree_node {
+	struct json_path_node key;
+	uint32_t key_hash;
+	struct json_tree_node *node;
+};
+
+/**
+ * Compare json_tree_node hash bindings @a and @b.
+ * @param a Hash record to compare.
+ * @param b Hash record to compare.
+ * @retval 0 On equal, not 0 otherwise.
+ */
+bool
+mh_json_tree_node_cmp(const struct mh_json_tree_node *a,
+		      const struct mh_json_tree_node *b)
+{
+	return (a->key.type != b->key.type) ||
+		((a->key.type != JSON_PATH_STR ||
+		a->key.len != b->key.len ||
+		memcmp(a->key.str, b->key.str,
+		       a->key.len) != 0) &&
+		(a->key.type != JSON_PATH_NUM ||
+		a->key_hash != b->key_hash));
+}
+
+/** Define tree record hashtable. */
+#define MH_SOURCE 1
+#define mh_name _json_tree_node
+#define mh_key_t struct mh_json_tree_node *
+#define mh_node_t struct mh_json_tree_node
+#define mh_arg_t void *
+#define mh_hash(a, arg) ((a)->key_hash)
+#define mh_hash_key(a, arg) ((a)->key_hash)
+#define mh_cmp(a, b, arg) (mh_json_tree_node_cmp((a), (b)))
+#define mh_cmp_key(a, b, arg) mh_cmp(a, b, arg)
+#include "salad/mhash.h"
+
+static const uint32_t json_path_node_hash_seed = 13U;
+
+uint32_t
+json_path_node_hash(struct json_path_node *key)
+{
+	uint32_t h = json_path_node_hash_seed;
+	uint32_t carry = 0;
+	uint8_t *data;
+	uint32_t data_size;
+	if (key->type == JSON_PATH_STR) {
+		data = (uint8_t *)key->str;
+		data_size = key->len;
+	} else if (key->type == JSON_PATH_NUM) {
+		data = (uint8_t *)&key->num;
+		data_size = sizeof(key->num);
+	} else {
+		assert(0);
+	}
+	PMurHash32_Process(&h, &carry, data, data_size);
+	return PMurHash32_Result(h, carry, data_size);
+}
+
+int
+json_tree_node_create(struct json_tree_node *node,
+		      struct json_path_node *path_node, uint32_t path_node_hash)
+{
+	assert(path_node == NULL ||
+	       path_node_hash == json_path_node_hash(path_node));
+	memset(node, 0, sizeof(struct json_tree_node));
+	node->children_ht = mh_json_tree_node_new();
+	if (node->children_ht == NULL)
+		return -1;
+	if (path_node != NULL) {
+		memcpy(&node->key, path_node, sizeof(struct json_path_node));
+		node->key_hash = path_node_hash;
+	} else {
+		node->key.type = JSON_PATH_END;
+	}
+	rlist_create(&node->siblings);
+	rlist_create(&node->children);
+	return 0;
+}
+
+void
+json_tree_node_destroy(struct json_tree_node *node)
+{
+	mh_json_tree_node_delete(node->children_ht);
+}
+
+struct json_tree_node *
+json_tree_lookup(struct json_tree_node *root, struct json_path_node *path_node,
+		 uint32_t path_node_hash)
+{
+	struct mh_json_tree_node info;
+	memcpy(&info.key, path_node, sizeof(struct json_path_node));
+	info.key_hash = path_node_hash;
+	mh_int_t id = mh_json_tree_node_find(root->children_ht, &info, NULL);
+	if (id == mh_end(root->children_ht))
+		return NULL;
+	struct mh_json_tree_node *ht_node =
+		mh_json_tree_node_node(root->children_ht, id);
+	assert(ht_node == NULL || ht_node->node != NULL);
+	struct json_tree_node *ret = ht_node != NULL ? ht_node->node : NULL;
+	assert(ret == NULL || ret->parent == root);
+	return ret;
+}
+
+int
+json_tree_add(struct json_tree_node *root, struct json_tree_node *node)
+{
+	assert(node->parent == NULL);
+	assert(json_tree_lookup(root, &node->key, node->key_hash) == NULL);
+	struct mh_json_tree_node ht_node;
+	memcpy(&ht_node.key, &node->key, sizeof(struct json_path_node));
+	ht_node.key_hash = node->key_hash;
+	ht_node.node = node;
+	mh_int_t rc = mh_json_tree_node_put(root->children_ht, &ht_node, NULL,
+					    NULL);
+	if (rc == mh_end(node->children_ht))
+		return -1;
+	node->parent = root;
+	rlist_add(&root->children, &node->siblings);
+	return 0;
+}
+
+void
+json_tree_remove(struct json_tree_node *root, struct json_tree_node *node)
+{
+	assert(node->parent == root);
+	assert(json_tree_lookup(root, &node->key, node->key_hash) == node);
+	rlist_del_entry(node, siblings);
+	struct mh_json_tree_node info;
+	memcpy(&info.key, &node->key,
+	       sizeof(struct json_path_node));
+	info.key_hash = node->key_hash;
+	mh_int_t id = mh_json_tree_node_find(root->children_ht, &info, NULL);
+	assert(id != mh_end(root->children_ht));
+	mh_json_tree_node_del(root->children_ht, id, NULL);
+	assert(json_tree_lookup(root, &node->key, node->key_hash) == NULL);
+}
+
+struct json_tree_node *
+json_tree_lookup_path(struct json_tree_node *root, const char *path,
+		      uint32_t path_len)
+{
+	int rc;
+	struct json_path_parser parser;
+	struct json_path_node path_node;
+	json_path_parser_create(&parser, path, path_len);
+	while ((rc = json_path_next(&parser, &path_node)) == 0 &&
+	       path_node.type != JSON_PATH_END && root != NULL) {
+		uint32_t path_node_hash = json_path_node_hash(&path_node);
+		struct json_tree_node *node =
+			json_tree_lookup(root, &path_node, path_node_hash);
+		assert(node == NULL || node->parent == root);
+		root = node;
+	}
+	if (rc != 0 || path_node.type != JSON_PATH_END)
+		return NULL;
+	return root;
+}
+
+/**
+ * Get JSON tree node next child.
+ * @param pos JSON tree node iterator.
+ * @param parent Parent of @pos record.
+ * @retval not NULL pointer to next item if exists.
+ */
+static struct json_tree_node *
+json_tree_next_child(struct json_tree_node *pos,
+		     struct json_tree_node *parent)
+{
+	struct json_tree_node *next;
+	if (pos == NULL) {
+		next = rlist_first_entry(&parent->children,
+					 struct json_tree_node, siblings);
+	} else {
+		next = rlist_next_entry(pos, siblings);
+	}
+	if (&next->siblings != &parent->children)
+		return next;
+	return NULL;
+}
+
+/**
+ * Find leftmost child rellative for iterator @pos.
+ * @param pos JSON tree node iterator.
+ * @retval not NULL pointer to next item if exists.
+ */
+static struct json_tree_node *
+json_tree_leftmost_descendant(struct json_tree_node *pos)
+{
+	struct json_tree_node *last;
+	do {
+		last = pos;
+		pos = json_tree_next_child(NULL, pos);
+	} while (pos);
+	return last;
+}
+
+struct json_tree_node *
+json_tree_preorder_next(struct json_tree_node *pos, struct json_tree_node *root)
+{
+	if (pos == NULL)
+		return root;
+	struct json_tree_node *next = json_tree_next_child(NULL, pos);
+	if (next != NULL)
+		return next;
+	while (pos != root) {
+		next = json_tree_next_child(pos, pos->parent);
+		if (next != NULL)
+			return next;
+		pos = pos->parent;
+	}
+	return NULL;
+}
+
+struct json_tree_node *
+json_tree_postorder_next(struct json_tree_node *pos,
+			 struct json_tree_node *root)
+{
+	struct json_tree_node *next;
+	if (pos == NULL)
+		return json_tree_leftmost_descendant(root);
+	if (pos == root)
+		return NULL;
+	next = json_tree_next_child(pos, pos->parent);
+	if (next != NULL)
+		return json_tree_leftmost_descendant(next);
+	return pos->parent;
+}
diff --git a/src/lib/json/tree.h b/src/lib/json/tree.h
new file mode 100644
index 0000000..aa77b51
--- /dev/null
+++ b/src/lib/json/tree.h
@@ -0,0 +1,207 @@
+#ifndef TARANTOOL_JSON_TREE_H_INCLUDED
+#define TARANTOOL_JSON_TREE_H_INCLUDED
+/*
+ * Copyright 2010-2018 Tarantool AUTHORS: please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the
+ *    following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <stdint.h>
+#include "small/rlist.h"
+#include "path.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/** JSON path tree hash structure. */
+struct mh_json_tree_node_t;
+
+/** JSON path tree record structure. */
+struct json_tree_node {
+	/** JSON parser node path. */
+	struct json_path_node key;
+	/** Hash calculated for key field. */
+	uint32_t key_hash;
+	/** Childs hashtable: json_path_node -> json_tree_node. */
+	struct mh_json_tree_node_t *children_ht;
+	/** Siblings nodes list. */
+	struct rlist siblings;
+	/** Child nodes list. */
+	struct rlist children;
+	/** Pointer to parent json_tree_node record. */
+	struct json_tree_node *parent;
+};
+
+/**
+ * Calculate json_path_node hash.
+ * @param key JSON parser node to calculate hash.
+ * @retval Hash value.
+ */
+uint32_t
+json_path_node_hash(struct json_path_node *key);
+
+/**
+ * Init JSON tree node.
+ * @param node JSON tree node to initialize.
+ * @param path_node Source JSON parser node to use on initialize.
+ * @param path_node_hash Hash of @path_node.
+ * @retval 0 On success, -1 otherwise.
+ */
+int
+json_tree_node_create(struct json_tree_node *node,
+		      struct json_path_node *path_node,
+		      uint32_t path_node_hash);
+
+/**
+ * Destroy JSON tree node.
+ * @param node JSON tree node to destruct.
+ * @retval 0 On success, -1 on error.
+ */
+void
+json_tree_node_destroy(struct json_tree_node *node);
+
+/**
+ * Lookup record with data @path_node having hash @path_node_hash
+ * in JSON tree @node childs.
+ * @param node JSON tree root node.
+ * @param path_node JSON parser node with data.
+ * @param path_node_hash Hash of @path_node.
+ * @retval not NULL pointer to JSON tree node if any.
+ */
+struct json_tree_node *
+json_tree_lookup(struct json_tree_node *root,
+		 struct json_path_node *path_node,
+		 uint32_t path_node_hash);
+
+/**
+ * Append JSON tree node @node to @root. Root JSON tree node @root
+ * shouldn't have childs with same content.
+ * @param root JSON tree root node.
+ * @param node JSON tree node to append.
+ * @retval 0 On success, -1 on memory allocation error.
+ */
+int
+json_tree_add(struct json_tree_node *root,
+	      struct json_tree_node *node);
+
+/**
+ * Detach JSON tree node @node by @root. Root JSON tree node @root
+ * should have such child.
+ * @param root JSON tree root node.
+ * @param node JSON tree node to add.
+ */
+void
+json_tree_remove(struct json_tree_node *root,
+		 struct json_tree_node *node);
+
+/**
+ * Lookup JSON path @path having length @path_len in JSON
+ * tree @root.
+ * @param root JSON tree root node.
+ * @param path JSON path string.
+ * @param path_len Length of @path.
+ * @retval not NULL pointer to leaf record if any.
+ */
+struct json_tree_node *
+json_tree_lookup_path(struct json_tree_node *root,
+		      const char *path, uint32_t path_len);
+
+/**
+ * Make pre order traversal iterator @pos step.
+ * @param pos Iterator pointer, NULL for initialization.
+ * @param root The JSON path tree to walk root.
+ * @retval not NULL pointer to next node. NULL if nothing found.
+ */
+struct json_tree_node *
+json_tree_preorder_next(struct json_tree_node *pos,
+			struct json_tree_node *root);
+
+/**
+ * Make post order traversal iterator @pos step.
+ * @param pos Iterator pointer, NULL for initialization.
+ * @param root The JSON path tree to walk root.
+ * @retval not NULL pointer to next node. NULL if nothing found.
+ */
+struct json_tree_node *
+json_tree_postorder_next(struct json_tree_node *pos,
+			 struct json_tree_node *root);
+
+
+/**
+ * Define macros to operate with containers.
+ */
+
+/** Return entry by json_tree_node item. */
+#define json_tree_node_entry(item, type, member) ({			     \
+	const typeof( ((type *)0)->member ) *__mptr = (item);		     \
+	(type *)( (char *)__mptr - ((size_t) &((type *)0)->member) ); })
+
+/** Make lookup in tree by path and return entry on found. */
+#define json_tree_lookup_path_entry(root, path, path_len, type, member)      \
+({struct json_tree_node *__tree_node =					     \
+	json_tree_lookup_path(&(root)->member, path, path_len);		     \
+ __tree_node != NULL ? json_tree_node_entry(__tree_node, type, member) :     \
+		       NULL; })
+
+/** Make lookup in tree by path_node and return entry on found. */
+#define json_tree_lookup_entry(root, path_node, path_node_hash, type, member)\
+({struct json_tree_node *__tree_node =					     \
+	json_tree_lookup(&(root)->member, path_node, path_node_hash);	     \
+ __tree_node != NULL ? json_tree_node_entry(__tree_node, type, member) :     \
+		       NULL; })
+
+/** Make traversal in JSON tree. */
+#define json_tree_foreach_entry(order, item, root, member)		     \
+for (struct json_tree_node *__iter =					     \
+     json_tree_##order##_next(NULL, &(root)->member);			     \
+     __iter != NULL && (item = json_tree_node_entry(__iter, typeof(*root),   \
+						    member));		     \
+     __iter = json_tree_##order##_next(__iter, &(root)->member))
+
+/**
+ * Make secure traversal in JSON tree that works even on
+ * destruction under iterator.
+ */
+#define json_tree_foreach_entry_safe(order, item, root, member)		     \
+for (struct json_tree_node *__tmp_iter, *__iter =			     \
+     json_tree_##order##_next(NULL, &(root)->member);			     \
+     __iter != NULL && ((item) = json_tree_node_entry(__iter, typeof(*root), \
+						      member)) &&	     \
+     (__iter != NULL && (__tmp_iter =					     \
+			 json_tree_##order##_next(__iter,		     \
+						  &(root)->member))),	     \
+     (__iter != NULL && ((item) = json_tree_node_entry(__iter, typeof(*root),\
+						       member)));	     \
+     __iter = __tmp_iter)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* TARANTOOL_JSON_TREE_H_INCLUDED */
diff --git a/test/unit/json_path.c b/test/unit/json_path.c
index 1d7e7d3..fb16162 100644
--- a/test/unit/json_path.c
+++ b/test/unit/json_path.c
@@ -1,7 +1,9 @@
 #include "json/path.h"
+#include "json/tree.h"
 #include "unit.h"
 #include "trivia/util.h"
 #include <string.h>
+#include <stdbool.h>
 
 #define reset_to_new_path(value) \
 	path = value; \
@@ -159,14 +161,220 @@ test_errors()
 	footer();
 }
 
+enum tuple_field_type {
+	TUPLE_FIELD_TYPE_ANY,
+	TUPLE_FIELD_TYPE_MAP,
+	TUPLE_FIELD_TYPE_ARRAY,
+	TUPLE_FIELD_TYPE_OTHER,
+};
+
+const char *field_type_strs[] = {
+	/** TUPLE_FIELD_TYPE_ANY */	"any",
+	/** TUPLE_FIELD_TYPE_MAP */	"map",
+	/** TUPLE_FIELD_TYPE_ARRAY */	"array",
+	/** TUPLE_FIELD_TYPE_OTHER */	"other"
+};
+
+struct tuple_field {
+	enum tuple_field_type type;
+	struct json_tree_node tree_node;
+};
+
+static inline enum tuple_field_type
+tuple_field_by_leaf_node_type(enum json_path_type path_type)
+{
+	if (path_type == JSON_PATH_STR)
+		return TUPLE_FIELD_TYPE_MAP;
+	if (path_type == JSON_PATH_NUM)
+		return TUPLE_FIELD_TYPE_ARRAY;
+	return TUPLE_FIELD_TYPE_OTHER;
+}
+
+struct tuple_field *
+tuple_field_new(struct json_path_node *node, uint32_t node_hash)
+{
+	struct tuple_field *ret = calloc(1, sizeof(struct tuple_field));
+	assert(ret != NULL);
+	if (json_tree_node_create(&ret->tree_node, node, node_hash) != 0) {
+		free(ret);
+		return NULL;
+	}
+	ret->type = TUPLE_FIELD_TYPE_ANY;
+	return ret;
+}
+
+void
+tuple_field_delete(struct tuple_field *field)
+{
+	json_tree_node_destroy(&field->tree_node);
+	free(field);
+}
+
+int
+tuple_field_add_path(struct tuple_field *root, const char *path,
+		     uint32_t path_len, struct tuple_field **leaf)
+{
+	int rc;
+	struct json_path_parser parser;
+	struct json_path_node path_node;
+	struct tuple_field *field = root;
+	json_path_parser_create(&parser, path, path_len);
+	while ((rc = json_path_next(&parser, &path_node)) == 0 &&
+		path_node.type != JSON_PATH_END) {
+		enum tuple_field_type type =
+				tuple_field_by_leaf_node_type(path_node.type);
+		if (field->type != TUPLE_FIELD_TYPE_ANY && field->type != type)
+			return -1;
+		uint32_t path_node_hash = json_path_node_hash(&path_node);
+		struct tuple_field *new_field =
+			json_tree_lookup_entry(field, &path_node,
+					       path_node_hash,
+					       struct tuple_field, tree_node);
+		if (new_field == NULL) {
+			new_field =
+				tuple_field_new(&path_node, path_node_hash);
+			assert(new_field != NULL);
+			rc = json_tree_node_create(&new_field->tree_node,
+						   &path_node, path_node_hash);
+			assert(rc == 0);
+			rc = json_tree_add(&field->tree_node,
+					   &new_field->tree_node);
+			assert(rc == 0);
+			assert(memcmp(&path_node, &new_field->tree_node.key,
+					sizeof(path_node)) == 0);
+			assert(json_tree_lookup_entry(field, &path_node,
+						      path_node_hash,
+						      struct tuple_field,
+						      tree_node) != NULL);
+		}
+		field->type = type;
+		field = new_field;
+	}
+	assert(rc == 0 && path_node.type == JSON_PATH_END);
+	*leaf = field;
+	return 0;
+}
+
+void
+test_tree()
+{
+	header();
+	plan(19);
+
+	struct tuple_field *root_field = tuple_field_new(NULL, 0);
+	assert(root_field != NULL);
+
+	/* Add paths to tree. */
+	int rc = 0;
+	struct tuple_field *node;
+	const char *path1 = "[1][10]";
+	const char *path2 = "[1].aaa";
+	const char *path3 = "[1][100]";
+	const char *path4 = "[1][100].a";
+	const char *path5 = "[1000].map";
+	rc = tuple_field_add_path(root_field, path1, strlen(path1), &node);
+	is(rc, 0, "add valid path '%s' - normal ret", path1);
+	isnt(node, NULL, "add valid path '%s' - not NULL pointer", path1);
+	node->type = TUPLE_FIELD_TYPE_OTHER;
+
+	rc = tuple_field_add_path(root_field, path2, strlen(path2), &node);
+	isnt(rc, 0, "add path with conflicting interm. node '%s' - error ret",
+	     path2);
+	assert(rc != 0);
+
+	rc = tuple_field_add_path(root_field, path3, strlen(path3), &node);
+	is(rc, 0, "add valid path '%s' - normal ret", path3);
+	isnt(node, NULL, "add valid path '%s' - not NULL pointer", path3);
+	node->type = TUPLE_FIELD_TYPE_OTHER;
+
+	rc = tuple_field_add_path(root_field, path4, strlen(path4), &node);
+	isnt(rc, 0, "add path with conflicting leaf node '%s' - error ret",
+	     path4);
+
+	rc = tuple_field_add_path(root_field, path5, strlen(path5), &node);
+	is(rc, 0, "add path with map container '%s' - normal ret", path5);
+	isnt(node, NULL, "add path with map container '%s' - not NULL pointer",
+	     path5);
+	node->type = TUPLE_FIELD_TYPE_OTHER;
+
+	/* Test registered paths. */
+	node = json_tree_lookup_path_entry(root_field, path2, strlen(path2),
+					   struct tuple_field, tree_node);
+	is(node, NULL, "lookup by unregistered path '%s'", path2);
+
+	node = json_tree_lookup_path_entry(root_field, path1, strlen(path1),
+					   struct tuple_field, tree_node);
+	isnt(node, NULL, "lookup by registered path '%s'", path1);
+	/* Iterate over tree. */
+	int sum = 0;
+	struct tuple_field *field;
+	json_tree_foreach_entry(preorder, field, root_field, tree_node) {
+		if (field->tree_node.key.type == JSON_PATH_NUM)
+			sum += field->tree_node.key.num;
+	}
+	is(sum, 1111, "iterate over the tree and calculate sum");
+	if (node != NULL) {
+		json_tree_remove(node->tree_node.parent, &node->tree_node);
+		tuple_field_delete(node);
+		node = json_tree_lookup_path_entry(root_field, path1, strlen(path1),
+					   struct tuple_field, tree_node);
+		is(node, NULL, "lookup after registered path '%s' deletion",
+		   path1);
+	}
+	sum = 0;
+	json_tree_foreach_entry(postorder, field, root_field, tree_node) {
+		if (field->tree_node.key.type == JSON_PATH_NUM)
+			sum += field->tree_node.key.num;
+	}
+	is(sum, 1101, "iterate over the tree and calculate sum after leaf "
+		      "deletion");
+
+	/* Destroy tree. */
+	/**
+	 *       [1]
+	 *        |
+	 *      [  ...  10  ...  100  ...  1000]
+	 *               |        |          |
+	 *               *        #       {"map"}
+	 *                                   |
+	 *                                   *
+	 *
+	 * Item marked with # should be already deleted.
+	 */
+	enum tuple_field_type destruction_order[] =
+		{TUPLE_FIELD_TYPE_OTHER, TUPLE_FIELD_TYPE_MAP,
+		 TUPLE_FIELD_TYPE_OTHER, TUPLE_FIELD_TYPE_ARRAY,
+		 TUPLE_FIELD_TYPE_ARRAY};
+	int destruction_order_size =
+		sizeof(destruction_order)/sizeof(destruction_order[0]);
+	int idx = 0;
+	json_tree_foreach_entry_safe(postorder, field, root_field, tree_node) {
+		if (idx > destruction_order_size)
+			break;
+		is(field->type, destruction_order[idx],
+		   "destruct valid order - delete record %s, expected %s",
+		   field_type_strs[field->type],
+		   field_type_strs[destruction_order[idx]]);
+		tuple_field_delete(field);
+		idx++;
+	}
+	is(idx, destruction_order_size,
+	   "valid count of destructed items - have %d, expected %d",
+	   idx, destruction_order_size);
+
+	check_plan();
+	footer();
+}
+
 int
 main()
 {
 	header();
-	plan(2);
+	plan(3);
 
 	test_basic();
 	test_errors();
+	test_tree();
 
 	int rc = check_plan();
 	footer();
-- 
2.7.4

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v1 1/1] lib: implement JSON tree class for json library
  2018-10-01 12:27 [PATCH v1 1/1] lib: implement JSON tree class for json library Kirill Shcherbatov
@ 2018-10-01 21:34 ` Vladimir Davydov
  2018-10-03  7:15   ` [tarantool-patches] " Kirill Shcherbatov
  2018-10-04 22:54 ` [tarantool-patches] " Konstantin Osipov
  1 sibling, 1 reply; 8+ messages in thread
From: Vladimir Davydov @ 2018-10-01 21:34 UTC (permalink / raw)
  To: Kirill Shcherbatov; +Cc: tarantool-patches

See my comments inline. The most annoying part is comments to the code.
They are either confusing or useless or both. Plus grammar mistakes,
which are driving me mad. Please apply yourself when writing comments.

On Mon, Oct 01, 2018 at 03:27:02PM +0300, Kirill Shcherbatov wrote:
> New JSON tree class would store JSON paths for tuple fields
> for registered non-plain indexes.

Bad comment. Please elaborate what a json tree is and what it can be
used for regardless of the json path index feature. Then mention why
it's needed for json indexes.

> 
> Part of #1012.

I'd rather say 'Needed for', because it's an independent class.

> ---
> Branch: http://github.com/tarantool/tarantool/tree/kshch/gh-1012-json-tree-class
> Issue: https://github.com/tarantool/tarantool/issues/1012

Please don't prefix the branch and issue names with 'Branch:' and
'Issue:' - it's apparent what is what without it.

> 
>  src/lib/json/CMakeLists.txt |   2 +
>  src/lib/json/tree.c         | 272 ++++++++++++++++++++++++++++++++++++++++++++
>  src/lib/json/tree.h         | 207 +++++++++++++++++++++++++++++++++
>  test/unit/json_path.c       | 210 +++++++++++++++++++++++++++++++++-
>  4 files changed, 690 insertions(+), 1 deletion(-)
>  create mode 100644 src/lib/json/tree.c
>  create mode 100644 src/lib/json/tree.h
> 
> diff --git a/src/lib/json/CMakeLists.txt b/src/lib/json/CMakeLists.txt
> index 203fe6f..9702b79 100644
> --- a/src/lib/json/CMakeLists.txt
> +++ b/src/lib/json/CMakeLists.txt
> @@ -1,6 +1,8 @@
>  set(lib_sources
>      path.c
> +    tree.c
>  )
>  
>  set_source_files_compile_flags(${lib_sources})
>  add_library(json_path STATIC ${lib_sources})
> +target_link_libraries(json_path misc salad small)

Do you really need all these dependencies? AFAICS you only use header
files (rlist.h and mhash.h) so no new dependencies should be introduced.

> diff --git a/src/lib/json/tree.c b/src/lib/json/tree.c
> new file mode 100644
> index 0000000..a393d5b
> --- /dev/null
> +++ b/src/lib/json/tree.c
> @@ -0,0 +1,272 @@
> +
> +/*
> + * Copyright 2010-2018 Tarantool AUTHORS: please see AUTHORS file.
> + *
> + * Redistribution and use in source and binary forms, with or
> + * without modification, are permitted provided that the following
> + * conditions are met:
> + *
> + * 1. Redistributions of source code must retain the above
> + *    copyright notice, this list of conditions and the
> + *    following disclaimer.
> + *
> + * 2. Redistributions in binary form must reproduce the above
> + *    copyright notice, this list of conditions and the following
> + *    disclaimer in the documentation and/or other materials
> + *    provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
> + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
> + * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
> + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
> + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + */
> +
> +#include <assert.h>
> +#include <ctype.h>
> +#include <string.h>
> +#include <stdbool.h>
> +#include <unicode/uchar.h>
> +#include <unicode/utf8.h>

Why do you include ctype and unicode?

The following headers are missing:

stddef.h
stdint.h
small/rlist.h

(you ought to include all headers that are used in the file, even those
that are included indirectly)

> +
> +#include "tree.h"
> +#include "path.h"
> +#include "third_party/PMurHash.h"
> +
> +/** Hash record representing json_tree_node. */

A simple comment:

/**
 * Hash table: json_path_node => json_tree_node.
 */

would be much more helpful here.

> +struct mh_json_tree_node {
> +	struct json_path_node key;
> +	uint32_t key_hash;
> +	struct json_tree_node *node;
> +};
> +
> +/**
> + * Compare json_tree_node hash bindings @a and @b.
> + * @param a Hash record to compare.
> + * @param b Hash record to compare.
> + * @retval 0 On equal, not 0 otherwise.

if equal

No need to write doxygen comments for all functions. The most important
thing is a comment should be helpful and understandable. Sometimes,
annotating each argument is simply pointless and would only make the
comment look bulky, like in this case. I'd rewrite this as

/**
 * Compare hash records of two json tree nodes. Return 0 if equal.
 */

Or even delete it at all, because it's obvious that this is a comparator
function needed for the hash table defined right below.

Personally, I prefer to avoid writing doxygen comments at all and only
write them if
 - other comments in the same file are formatted as doxygen
 - the function has a lot of arguments and it makes sense annotating
   them all
 - this is a public API of a library

> + */
> +bool

static inline

> +mh_json_tree_node_cmp(const struct mh_json_tree_node *a,
> +		      const struct mh_json_tree_node *b)
> +{
> +	return (a->key.type != b->key.type) ||
> +		((a->key.type != JSON_PATH_STR ||
> +		a->key.len != b->key.len ||
> +		memcmp(a->key.str, b->key.str,
> +		       a->key.len) != 0) &&
> +		(a->key.type != JSON_PATH_NUM ||
> +		a->key_hash != b->key_hash));

This is very difficult for understanding. Please rewrite with ifs.

> +}
> +
> +/** Define tree record hashtable. */

This comment is rather useless. A comment to mh_json_tree_node would be
enough.

> +#define MH_SOURCE 1
> +#define mh_name _json_tree_node
> +#define mh_key_t struct mh_json_tree_node *

Shouldn't we use json_path_node for a key?

> +#define mh_node_t struct mh_json_tree_node
> +#define mh_arg_t void *
> +#define mh_hash(a, arg) ((a)->key_hash)
> +#define mh_hash_key(a, arg) ((a)->key_hash)
> +#define mh_cmp(a, b, arg) (mh_json_tree_node_cmp((a), (b)))
> +#define mh_cmp_key(a, b, arg) mh_cmp(a, b, arg)
> +#include "salad/mhash.h"
> +
> +static const uint32_t json_path_node_hash_seed = 13U;
> +
> +uint32_t
> +json_path_node_hash(struct json_path_node *key)
> +{
> +	uint32_t h = json_path_node_hash_seed;
> +	uint32_t carry = 0;
> +	uint8_t *data;

Should you define it as void *, you wouldn't need type conversions
below.

> +	uint32_t data_size;
> +	if (key->type == JSON_PATH_STR) {
> +		data = (uint8_t *)key->str;
> +		data_size = key->len;
> +	} else if (key->type == JSON_PATH_NUM) {
> +		data = (uint8_t *)&key->num;
> +		data_size = sizeof(key->num);
> +	} else {
> +		assert(0);

unreachable()

> +	}
> +	PMurHash32_Process(&h, &carry, data, data_size);
> +	return PMurHash32_Result(h, carry, data_size);
> +}
> +
> +int
> +json_tree_node_create(struct json_tree_node *node,
> +		      struct json_path_node *path_node, uint32_t path_node_hash)
> +{
> +	assert(path_node == NULL ||
> +	       path_node_hash == json_path_node_hash(path_node));
> +	memset(node, 0, sizeof(struct json_tree_node));
> +	node->children_ht = mh_json_tree_node_new();
> +	if (node->children_ht == NULL)
> +		return -1;
> +	if (path_node != NULL) {
> +		memcpy(&node->key, path_node, sizeof(struct json_path_node));

I'd rather you used

	node->key = *path_node;

here and everywhere else in the patch (it's easier to read the code that
way IMO).

> +		node->key_hash = path_node_hash;
> +	} else {
> +		node->key.type = JSON_PATH_END;
> +	}
> +	rlist_create(&node->siblings);
> +	rlist_create(&node->children);
> +	return 0;
> +}
> +
> +void
> +json_tree_node_destroy(struct json_tree_node *node)
> +{
> +	mh_json_tree_node_delete(node->children_ht);
> +}
> +
> +struct json_tree_node *
> +json_tree_lookup(struct json_tree_node *root, struct json_path_node *path_node,
> +		 uint32_t path_node_hash)
> +{
> +	struct mh_json_tree_node info;
> +	memcpy(&info.key, path_node, sizeof(struct json_path_node));
> +	info.key_hash = path_node_hash;
> +	mh_int_t id = mh_json_tree_node_find(root->children_ht, &info, NULL);
> +	if (id == mh_end(root->children_ht))
> +		return NULL;
> +	struct mh_json_tree_node *ht_node =
> +		mh_json_tree_node_node(root->children_ht, id);
> +	assert(ht_node == NULL || ht_node->node != NULL);
> +	struct json_tree_node *ret = ht_node != NULL ? ht_node->node : NULL;
> +	assert(ret == NULL || ret->parent == root);
> +	return ret;
> +}
> +
> +int
> +json_tree_add(struct json_tree_node *root, struct json_tree_node *node)
> +{
> +	assert(node->parent == NULL);
> +	assert(json_tree_lookup(root, &node->key, node->key_hash) == NULL);
> +	struct mh_json_tree_node ht_node;
> +	memcpy(&ht_node.key, &node->key, sizeof(struct json_path_node));
> +	ht_node.key_hash = node->key_hash;
> +	ht_node.node = node;
> +	mh_int_t rc = mh_json_tree_node_put(root->children_ht, &ht_node, NULL,
> +					    NULL);
> +	if (rc == mh_end(node->children_ht))
> +		return -1;
> +	node->parent = root;
> +	rlist_add(&root->children, &node->siblings);
> +	return 0;
> +}
> +
> +void
> +json_tree_remove(struct json_tree_node *root, struct json_tree_node *node)
> +{
> +	assert(node->parent == root);
> +	assert(json_tree_lookup(root, &node->key, node->key_hash) == node);
> +	rlist_del_entry(node, siblings);

Please, use rlist_del() here for consistency with rlist_add() above.

> +	struct mh_json_tree_node info;
> +	memcpy(&info.key, &node->key,
> +	       sizeof(struct json_path_node));
> +	info.key_hash = node->key_hash;
> +	mh_int_t id = mh_json_tree_node_find(root->children_ht, &info, NULL);
> +	assert(id != mh_end(root->children_ht));
> +	mh_json_tree_node_del(root->children_ht, id, NULL);
> +	assert(json_tree_lookup(root, &node->key, node->key_hash) == NULL);
> +}
> +
> +struct json_tree_node *
> +json_tree_lookup_path(struct json_tree_node *root, const char *path,
> +		      uint32_t path_len)
> +{
> +	int rc;
> +	struct json_path_parser parser;
> +	struct json_path_node path_node;
> +	json_path_parser_create(&parser, path, path_len);
> +	while ((rc = json_path_next(&parser, &path_node)) == 0 &&
> +	       path_node.type != JSON_PATH_END && root != NULL) {
> +		uint32_t path_node_hash = json_path_node_hash(&path_node);
> +		struct json_tree_node *node =
> +			json_tree_lookup(root, &path_node, path_node_hash);
> +		assert(node == NULL || node->parent == root);
> +		root = node;

It looks kinda weird the way you use 'root' as a loop variable.
May be, it's worth using some local variables instead.

> +	}
> +	if (rc != 0 || path_node.type != JSON_PATH_END)
> +		return NULL;
> +	return root;
> +}
> +
> +/**
> + * Get JSON tree node next child.

This is grammatically incorrect. What about

  Return the next child of a JSON tree node.

> + * @param pos JSON tree node iterator.

There's no such entity as "JSON tree node iterator" in the code so this
comment is confusing. Please fix.

> + * @param parent Parent of @pos record.
> + * @retval not NULL pointer to next item if exists.
> + */
> +static struct json_tree_node *
> +json_tree_next_child(struct json_tree_node *pos,
> +		     struct json_tree_node *parent)

I'd swap 'parent' and 'pos' here, because

  json_tree_next_child(NULL, node)

looks unnatural to me.

> +{
> +	struct json_tree_node *next;
> +	if (pos == NULL) {
> +		next = rlist_first_entry(&parent->children,
> +					 struct json_tree_node, siblings);
> +	} else {
> +		next = rlist_next_entry(pos, siblings);

I wouldn't use _entry variants here, because 'next' may be not a list
entry, but a list head, which you check below.

> +	}
> +	if (&next->siblings != &parent->children)
> +		return next;
> +	return NULL;
> +}
> +
> +/**
> + * Find leftmost child rellative for iterator @pos.

rellative => relative, please enable spell checking in your editor.

Also, "relative for" is a solecism, should be "relative to".
Please always make sure that the preposition matches the verb.

Also, this function finds not a child, but a descendant so I'd rewrite
the comment as "Find the leftmost descendant of a JSON tree node".

> + * @param pos JSON tree node iterator.

Again, there's no such thing as iterator. What this function does is
find the leftmost descendant of a tree node. Please rewrite the comment
accordingly.

> + * @retval not NULL pointer to next item if exists.
> + */
> +static struct json_tree_node *
> +json_tree_leftmost_descendant(struct json_tree_node *pos)

I don't like that you use 'descendant' in some function names, but not
in others, e.g. json_tree_preorder_next. Please be consistent. I'd drop
'descendant' from all function names so that this function would turn
into json_tree_leftmost.

> +{
> +	struct json_tree_node *last;
> +	do {
> +		last = pos;
> +		pos = json_tree_next_child(NULL, pos);
> +	} while (pos);

This is against our coding style, should be (pos != NULL).

> +	return last;
> +}
> +
> +struct json_tree_node *
> +json_tree_preorder_next(struct json_tree_node *pos, struct json_tree_node *root)
> +{
> +	if (pos == NULL)
> +		return root;
> +	struct json_tree_node *next = json_tree_next_child(NULL, pos);
> +	if (next != NULL)
> +		return next;
> +	while (pos != root) {
> +		next = json_tree_next_child(pos, pos->parent);
> +		if (next != NULL)
> +			return next;
> +		pos = pos->parent;
> +	}
> +	return NULL;
> +}
> +
> +struct json_tree_node *
> +json_tree_postorder_next(struct json_tree_node *pos,
> +			 struct json_tree_node *root)

Again, I'd swap 'root' and 'pos' for these two functions.

Also, I don't quite like the names. What about json_tree_next_pre and
json_tree_next_post?

> +{
> +	struct json_tree_node *next;
> +	if (pos == NULL)
> +		return json_tree_leftmost_descendant(root);
> +	if (pos == root)
> +		return NULL;
> +	next = json_tree_next_child(pos, pos->parent);
> +	if (next != NULL)
> +		return json_tree_leftmost_descendant(next);
> +	return pos->parent;
> +}
> diff --git a/src/lib/json/tree.h b/src/lib/json/tree.h
> new file mode 100644
> index 0000000..aa77b51
> --- /dev/null
> +++ b/src/lib/json/tree.h
> @@ -0,0 +1,207 @@
> +#ifndef TARANTOOL_JSON_TREE_H_INCLUDED
> +#define TARANTOOL_JSON_TREE_H_INCLUDED
> +/*
> + * Copyright 2010-2018 Tarantool AUTHORS: please see AUTHORS file.
> + *
> + * Redistribution and use in source and binary forms, with or
> + * without modification, are permitted provided that the following
> + * conditions are met:
> + *
> + * 1. Redistributions of source code must retain the above
> + *    copyright notice, this list of conditions and the
> + *    following disclaimer.
> + *
> + * 2. Redistributions in binary form must reproduce the above
> + *    copyright notice, this list of conditions and the following
> + *    disclaimer in the documentation and/or other materials
> + *    provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
> + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
> + * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
> + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
> + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
> + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
> + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> + * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> + */
> +#include <stdint.h>
> +#include "small/rlist.h"
> +#include "path.h"
> +
> +#ifdef __cplusplus
> +extern "C"
> +{

extern "C" {

> +#endif
> +
> +/** JSON path tree hash structure. */

We usually don't add comments to type declarations. This one is useless
anyway, so please remove.

> +struct mh_json_tree_node_t;
> +
> +/** JSON path tree record structure. */

This comment is useless, because it simply repeats the structure name.
Please try to elaborate what a json tree is and what it can be used for.

> +struct json_tree_node {
> +	/** JSON parser node path. */

Again, repeating function/type names in comments looks bad. At least,
you could say that this is a path component represented by this node.

> +	struct json_path_node key;
> +	/** Hash calculated for key field. */
> +	uint32_t key_hash;
> +	/** Childs hashtable: json_path_node -> json_tree_node. */

There's no word 'childs', should be 'children'.

> +	struct mh_json_tree_node_t *children_ht;

'ht' is confusing, better call it child_by_path, child_by_path_node, or
child_by_name IMO.

> +	/** Siblings nodes list. */
> +	struct rlist siblings;

IMO, better call it 'sibling' (singular) to emphasize that it's a link
in a list, not a list head.

> +	/** Child nodes list. */
> +	struct rlist children;

It'd be helpful to mention how the list is linked, e.g.

/** Link in json_tree_node::children of the parent node. */
struct rlist sibling;
/** List of child nodes, linked by json_tree_node::sibling. */
struct rlist children;

> +	/** Pointer to parent json_tree_node record. */

/** Pointer to the parent node. */

> +	struct json_tree_node *parent;
> +};
> +
> +/**
> + * Calculate json_path_node hash.

I don't like when type names are referred in comments like this.
What about "Compute the hash value of a JSON path component"?

> + * @param key JSON parser node to calculate hash.

It's not a parser node. It's a path component. Also, the preposition is
missing, should be

  JSON path component to compute the hash for.

> + * @retval Hash value.
> + */
> +uint32_t
> +json_path_node_hash(struct json_path_node *key);

May be, this function should be defined in json/path.h?

> +
> +/**
> + * Init JSON tree node.

Init *a* JSON tree node.

> + * @param node JSON tree node to initialize.
> + * @param path_node Source JSON parser node to use on initialize.

What is "Source JSON parser"? Unclear. May be, simply rewrite it as

  JSON path component the new node will represent

Also, you should mention what happens if it's NULL.

> + * @param path_node_hash Hash of @path_node.

You should mention that the hash must be computed with
json_path_node_hash.

> + * @retval 0 On success, -1 otherwise.

What does 'otherwise' mean here? An error? What kind of error?
Memory allocation error I assume. Should be mentioned in the comment.

> + */
> +int
> +json_tree_node_create(struct json_tree_node *node,
> +		      struct json_path_node *path_node,
> +		      uint32_t path_node_hash);
> +
> +/**
> + * Destroy JSON tree node.

Destroy *a* JSON tree node.

> + * @param node JSON tree node to destruct.
> + * @retval 0 On success, -1 on error.

What retval? It returns void.

> + */
> +void
> +json_tree_node_destroy(struct json_tree_node *node);
> +
> +/**
> + * Lookup record with data @path_node having hash @path_node_hash
> + * in JSON tree @node childs.

This is barely understandable...

Why not simply write something like this:

  Look up a child of a JSON tree node by a path component.

> + * @param node JSON tree root node.

The variable is called 'root' while the comment is for 'node'. Anyway,
it's not necessarily a root node, it may be any node.

> + * @param path_node JSON parser node with data.
> + * @param path_node_hash Hash of @path_node.

Should be mentioned that the hash must be computed with
json_path_node_hash.

> + * @retval not NULL pointer to JSON tree node if any.

Can it return NULL? I suppose it can. Hence it should be in the comment.

> + */
> +struct json_tree_node *
> +json_tree_lookup(struct json_tree_node *root,
> +		 struct json_path_node *path_node,
> +		 uint32_t path_node_hash);
> +
> +/**
> + * Append JSON tree node @node to @root. Root JSON tree node @root
> + * shouldn't have childs with same content.

Please try not to use @ in the summary, especially when it's not
necessary:

  Add a new JSON tree node at the given position in the tree.
  The parent node must not have a child representing the same
  path component.

> + * @param root JSON tree root node.

Not necessarily a root. Should be called 'parent', I guess.

I stop reviewing comments at this point, but you should now feel how
ugly they look to me. Please self-review them and make them look good.
Sorry for being so meticulous.

> + * @param node JSON tree node to append.
> + * @retval 0 On success, -1 on memory allocation error.
> + */
> +int
> +json_tree_add(struct json_tree_node *root,
> +	      struct json_tree_node *node);
> +
> +/**
> + * Detach JSON tree node @node by @root. Root JSON tree node @root
> + * should have such child.
> + * @param root JSON tree root node.
> + * @param node JSON tree node to add.
> + */
> +void
> +json_tree_remove(struct json_tree_node *root,
> +		 struct json_tree_node *node);
> +
> +/**
> + * Lookup JSON path @path having length @path_len in JSON
> + * tree @root.
> + * @param root JSON tree root node.
> + * @param path JSON path string.
> + * @param path_len Length of @path.
> + * @retval not NULL pointer to leaf record if any.
> + */
> +struct json_tree_node *
> +json_tree_lookup_path(struct json_tree_node *root,
> +		      const char *path, uint32_t path_len);

This function doesn't look up a path, it looks up a JSON tree node *by*
path. May be, call it json_tree_lookup_by_path then? Not sure. Don't
really like how the lookup function names play along.

  json_tree_lookup - by path component
  json_tree_lookup_by_path - by full path

  json_tree_lookup_child
  json_tree_lookup

  json_tree_lookup_by_path_node
  json_tree_lookup_by_path

Which one is better? Don't know...

> +
> +/**
> + * Make pre order traversal iterator @pos step.
> + * @param pos Iterator pointer, NULL for initialization.
> + * @param root The JSON path tree to walk root.
> + * @retval not NULL pointer to next node. NULL if nothing found.
> + */
> +struct json_tree_node *
> +json_tree_preorder_next(struct json_tree_node *pos,
> +			struct json_tree_node *root);
> +
> +/**
> + * Make post order traversal iterator @pos step.
> + * @param pos Iterator pointer, NULL for initialization.
> + * @param root The JSON path tree to walk root.
> + * @retval not NULL pointer to next node. NULL if nothing found.
> + */
> +struct json_tree_node *
> +json_tree_postorder_next(struct json_tree_node *pos,
> +			 struct json_tree_node *root);
> +
> +
> +/**
> + * Define macros to operate with containers.
> + */

Useless comment.

> +
> +/** Return entry by json_tree_node item. */
> +#define json_tree_node_entry(item, type, member) ({			     \

json_tree_entry?

> +	const typeof( ((type *)0)->member ) *__mptr = (item);		     \
> +	(type *)( (char *)__mptr - ((size_t) &((type *)0)->member) ); })
> +
> +/** Make lookup in tree by path and return entry on found. */
> +#define json_tree_lookup_path_entry(root, path, path_len, type, member)      \

json_tree_lookup_entry_by_path?

> +({struct json_tree_node *__tree_node =					     \
> +	json_tree_lookup_path(&(root)->member, path, path_len);		     \
> + __tree_node != NULL ? json_tree_node_entry(__tree_node, type, member) :     \
> +		       NULL; })
> +
> +/** Make lookup in tree by path_node and return entry on found. */
> +#define json_tree_lookup_entry(root, path_node, path_node_hash, type, member)\
> +({struct json_tree_node *__tree_node =					     \
> +	json_tree_lookup(&(root)->member, path_node, path_node_hash);	     \
> + __tree_node != NULL ? json_tree_node_entry(__tree_node, type, member) :     \
> +		       NULL; })
> +
> +/** Make traversal in JSON tree. */
> +#define json_tree_foreach_entry(order, item, root, member)		     \

Apart from _entry variants, there should be json_tree_foreach_pre,
json_tree_foreach_post, and json_tree_foreach_safe. The latter must use
post-order traversal, because pre-order traversal is unsafe against node
removal.

IMO _entry variants should be called

  json_tree_foreach_entry_pre
  json_tree_foreach_entry_post
  json_tree_foreach_entry_safe

> +for (struct json_tree_node *__iter =					     \
> +     json_tree_##order##_next(NULL, &(root)->member);			     \
> +     __iter != NULL && (item = json_tree_node_entry(__iter, typeof(*root),   \
> +						    member));		     \
> +     __iter = json_tree_##order##_next(__iter, &(root)->member))
> +
> +/**
> + * Make secure traversal in JSON tree that works even on
> + * destruction under iterator.
> + */
> +#define json_tree_foreach_entry_safe(order, item, root, member)		     \
> +for (struct json_tree_node *__tmp_iter, *__iter =			     \
> +     json_tree_##order##_next(NULL, &(root)->member);			     \
> +     __iter != NULL && ((item) = json_tree_node_entry(__iter, typeof(*root), \
> +						      member)) &&	     \
> +     (__iter != NULL && (__tmp_iter =					     \
> +			 json_tree_##order##_next(__iter,		     \
> +						  &(root)->member))),	     \
> +     (__iter != NULL && ((item) = json_tree_node_entry(__iter, typeof(*root),\
> +						       member)));	     \
> +     __iter = __tmp_iter)
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* TARANTOOL_JSON_TREE_H_INCLUDED */
> diff --git a/test/unit/json_path.c b/test/unit/json_path.c
> index 1d7e7d3..fb16162 100644
> --- a/test/unit/json_path.c
> +++ b/test/unit/json_path.c
> @@ -1,7 +1,9 @@
>  #include "json/path.h"
> +#include "json/tree.h"
>  #include "unit.h"
>  #include "trivia/util.h"
>  #include <string.h>
> +#include <stdbool.h>
>  
>  #define reset_to_new_path(value) \
>  	path = value; \
> @@ -159,14 +161,220 @@ test_errors()
>  	footer();
>  }
>  
> +enum tuple_field_type {
> +	TUPLE_FIELD_TYPE_ANY,
> +	TUPLE_FIELD_TYPE_MAP,
> +	TUPLE_FIELD_TYPE_ARRAY,
> +	TUPLE_FIELD_TYPE_OTHER,
> +};

Please rewrite the test without any reference to tuple fields.
All you need is check all functions so I expect the test to be
simple and clear.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [tarantool-patches] Re: [PATCH v1 1/1] lib: implement JSON tree class for json library
  2018-10-01 21:34 ` Vladimir Davydov
@ 2018-10-03  7:15   ` Kirill Shcherbatov
  2018-10-04 22:59     ` Konstantin Osipov
  0 siblings, 1 reply; 8+ messages in thread
From: Kirill Shcherbatov @ 2018-10-03  7:15 UTC (permalink / raw)
  To: tarantool-patches, Vladimir Davydov

Hi thank you for review!
I've fixed most your suggestions, and tried to use class in main patch.
Thus I've implemented lazy-initialization for hashtable.

> Shouldn't we use json_path_node for a key?
No, we also need to pass hash that lacks in key re

>   json_tree_next_child(NULL, node)
> 
> looks unnatural to me.
Ok for here, but I've kept such order for foreach defines
foreach(iterator, object) is looking good for me.


> I wouldn't use _entry variants here, because 'next' may be not a list
> entry, but a list head, which you check below.
Sorry, I don't understand, how should I implement the other way.
I've adopted this code from cgroups.

> Please rewrite the test without any reference to tuple fields.
> All you need is check all functions so I expect the test to be
> simple and clear.
Ok, done.

> Do you really need all these dependencies? AFAICS you only use header
> files (rlist.h and mhash.h) so no new dependencies should be introduced
Yes, I do. We need a hash functions that are the part of misc.
target_link_libraries(json_path misc)

=================================================

New JSON tree class would store JSON paths for tuple fields
for registered non-plain indexes. It is a hierarchical data
structure that organize JSON nodes produced by parser.
Class provides API to lookup node by path and iterate over the
tree.
JSON Indexes patch require such functionality to make lookup
for tuple_fields by path, make initialization of field map and
build vynyl_stmt msgpack for secondary index via JSON tree
iteration.

Need for #1012.
---
 src/lib/json/CMakeLists.txt |   2 +
 src/lib/json/tree.c         | 288 ++++++++++++++++++++++++++++++++++++++++++++
 src/lib/json/tree.h         | 241 ++++++++++++++++++++++++++++++++++++
 test/unit/json_path.c       | 205 ++++++++++++++++++++++++++++++-
 4 files changed, 735 insertions(+), 1 deletion(-)
 create mode 100644 src/lib/json/tree.c
 create mode 100644 src/lib/json/tree.h

diff --git a/src/lib/json/CMakeLists.txt b/src/lib/json/CMakeLists.txt
index 203fe6f..9eaba37 100644
--- a/src/lib/json/CMakeLists.txt
+++ b/src/lib/json/CMakeLists.txt
@@ -1,6 +1,8 @@
 set(lib_sources
     path.c
+    tree.c
 )
 
 set_source_files_compile_flags(${lib_sources})
 add_library(json_path STATIC ${lib_sources})
+target_link_libraries(json_path misc)
diff --git a/src/lib/json/tree.c b/src/lib/json/tree.c
new file mode 100644
index 0000000..505118d
--- /dev/null
+++ b/src/lib/json/tree.c
@@ -0,0 +1,288 @@
+
+/*
+ * Copyright 2010-2018 Tarantool AUTHORS: please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the
+ *    following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <assert.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#include "small/rlist.h"
+#include "module.h"
+#include "tree.h"
+#include "path.h"
+#include "third_party/PMurHash.h"
+
+
+/**
+ * Hash table: json_path_node => json_tree_node.
+ */
+struct mh_json_tree_node {
+	struct json_path_node key;
+	uint32_t key_hash;
+	struct json_tree_node *node;
+};
+
+/**
+ * Compare hash records of two json tree nodes. Return 0 if equal.
+ */
+static inline int
+mh_json_tree_node_cmp(const struct mh_json_tree_node *a,
+		      const struct mh_json_tree_node *b)
+{
+	if (a->key.type != b->key.type)
+		return a->key.type - b->key.type;
+	if (a->key.type == JSON_PATH_STR) {
+		if (a->key.len != b->key.len)
+			return a->key.len - b->key.len;
+		return memcmp(a->key.str, b->key.str, a->key.len);
+	} else if (a->key.type == JSON_PATH_NUM) {
+		return a->key_hash - b->key_hash;
+}
+	unreachable();
+}
+
+#define MH_SOURCE 1
+#define mh_name _json_tree_node
+#define mh_key_t struct mh_json_tree_node *
+#define mh_node_t struct mh_json_tree_node
+#define mh_arg_t void *
+#define mh_hash(a, arg) ((a)->key_hash)
+#define mh_hash_key(a, arg) ((a)->key_hash)
+#define mh_cmp(a, b, arg) (mh_json_tree_node_cmp((a), (b)))
+#define mh_cmp_key(a, b, arg) mh_cmp(a, b, arg)
+#include "salad/mhash.h"
+
+static const uint32_t json_path_node_hash_seed = 13U;
+
+uint32_t
+json_path_node_hash(struct json_path_node *key)
+{
+	uint32_t h = json_path_node_hash_seed;
+	uint32_t carry = 0;
+	const void *data;
+	uint32_t data_size;
+	if (key->type == JSON_PATH_STR) {
+		data = key->str;
+		data_size = key->len;
+	} else if (key->type == JSON_PATH_NUM) {
+		data = &key->num;
+		data_size = sizeof(key->num);
+	} else {
+		unreachable();
+	}
+	PMurHash32_Process(&h, &carry, data, data_size);
+	return PMurHash32_Result(h, carry, data_size);
+}
+
+void
+json_tree_node_create(struct json_tree_node *node,
+		      struct json_path_node *path_node, uint32_t path_node_hash)
+{
+	assert(path_node == NULL ||
+	       path_node_hash == json_path_node_hash(path_node));
+	memset(node, 0, sizeof(struct json_tree_node));
+	if (path_node != NULL) {
+		node->key = *path_node;
+		node->key_hash = path_node_hash;
+	} else {
+		node->key.type = JSON_PATH_END;
+	}
+	rlist_create(&node->sibling);
+	rlist_create(&node->children);
+}
+
+void
+json_tree_node_destroy(struct json_tree_node *node)
+{
+	if (node->child_by_path_node != NULL)
+		mh_json_tree_node_delete(node->child_by_path_node);
+}
+
+struct json_tree_node *
+json_tree_lookup_by_path_node(struct json_tree_node *parent,
+			      struct json_path_node *path_node,
+			      uint32_t path_node_hash)
+{
+	if (unlikely(parent->child_by_path_node == NULL))
+		return NULL;
+	struct mh_json_tree_node info;
+	info.key = *path_node;
+	info.key_hash = path_node_hash;
+	mh_int_t id =
+		mh_json_tree_node_find(parent->child_by_path_node, &info, NULL);
+	if (id == mh_end(parent->child_by_path_node))
+		return NULL;
+	struct mh_json_tree_node *ht_node =
+		mh_json_tree_node_node(parent->child_by_path_node, id);
+	assert(ht_node == NULL || ht_node->node != NULL);
+	struct json_tree_node *ret = ht_node != NULL ? ht_node->node : NULL;
+	assert(ret == NULL || ret->parent == parent);
+	return ret;
+}
+
+int
+json_tree_add(struct json_tree_node *parent, struct json_tree_node *node)
+{
+	assert(node->parent == NULL);
+	if (unlikely(parent->child_by_path_node == NULL)) {
+		parent->child_by_path_node = mh_json_tree_node_new();
+		if (parent->child_by_path_node == NULL)
+			return -1;
+	}
+	assert(json_tree_lookup_by_path_node(parent, &node->key,
+					     node->key_hash) == NULL);
+	struct mh_json_tree_node ht_node;
+	ht_node.key = node->key;
+	ht_node.key_hash = node->key_hash;
+	ht_node.node = node;
+	mh_int_t rc = mh_json_tree_node_put(parent->child_by_path_node,
+					    &ht_node, NULL, NULL);
+	if (rc == mh_end(parent->child_by_path_node))
+		return -1;
+	node->parent = parent;
+	rlist_add(&parent->children, &node->sibling);
+	return 0;
+}
+
+void
+json_tree_remove(struct json_tree_node *parent, struct json_tree_node *node)
+{
+	assert(node->parent == parent);
+	assert(parent->child_by_path_node != NULL);
+	assert(json_tree_lookup_by_path_node(parent, &node->key,
+					     node->key_hash) == node);
+	rlist_del(&node->sibling);
+	struct mh_json_tree_node info;
+	info.key = node->key;
+	info.key_hash = node->key_hash;
+	mh_int_t id =
+		mh_json_tree_node_find(parent->child_by_path_node, &info, NULL);
+	assert(id != mh_end(parent->child_by_path_node));
+	mh_json_tree_node_del(parent->child_by_path_node, id, NULL);
+	assert(json_tree_lookup_by_path_node(parent, &node->key,
+					     node->key_hash) == NULL);
+}
+
+struct json_tree_node *
+json_tree_lookup_by_path(struct json_tree_node *parent, const char *path,
+			 uint32_t path_len)
+{
+	if (unlikely(parent->child_by_path_node == NULL))
+		return NULL;
+	int rc;
+	struct json_path_parser parser;
+	struct json_path_node path_node;
+	struct json_tree_node *ret = parent;
+	json_path_parser_create(&parser, path, path_len);
+	while ((rc = json_path_next(&parser, &path_node)) == 0 &&
+	       path_node.type != JSON_PATH_END && ret != NULL) {
+		uint32_t path_node_hash = json_path_node_hash(&path_node);
+		struct json_tree_node *node =
+			json_tree_lookup_by_path_node(ret, &path_node,
+						      path_node_hash);
+		assert(node == NULL || node->parent == ret);
+		ret = node;
+	}
+	if (rc != 0 || path_node.type != JSON_PATH_END)
+		return NULL;
+	return ret;
+}
+
+/**
+ * Return the next child of a JSON tree node.
+ * @param parent Parent of @pos record.
+ * @param pos Last visited JSON tree node record.
+ * @retval not NULL pointer to next item if exists.
+ */
+static struct json_tree_node *
+json_tree_next_child(struct json_tree_node *parent, struct json_tree_node *pos)
+{
+	struct json_tree_node *next;
+	if (pos == NULL) {
+		next = rlist_first_entry(&parent->children,
+					 struct json_tree_node, sibling);
+	} else {
+		next = rlist_next_entry(pos, sibling);
+	}
+	if (&next->sibling != &parent->children)
+		return next;
+	return NULL;
+}
+
+/**
+ * Find leftmost record relative for node.
+ * @param pos JSON tree node.
+ * @retval not NULL pointer to next item if exists.
+ */
+static struct json_tree_node *
+json_tree_leftmost(struct json_tree_node *pos)
+{
+	struct json_tree_node *last;
+	do {
+		last = pos;
+		pos = json_tree_next_child(pos, NULL);
+	} while (pos != NULL);
+	return last;
+}
+
+struct json_tree_node *
+json_tree_next_pre(struct json_tree_node *parent, struct json_tree_node *pos)
+{
+	if (pos == NULL)
+		return parent;
+	struct json_tree_node *next = json_tree_next_child(pos, NULL);
+	if (next != NULL)
+		return next;
+	while (pos != parent) {
+		next = json_tree_next_child(pos->parent, pos);
+		if (next != NULL)
+			return next;
+		pos = pos->parent;
+	}
+	return NULL;
+}
+
+struct json_tree_node *
+json_tree_next_post(struct json_tree_node *parent, struct json_tree_node *pos)
+{
+	struct json_tree_node *next;
+	if (pos == NULL)
+		return json_tree_leftmost(parent);
+	if (pos == parent)
+		return NULL;
+	next = json_tree_next_child(pos->parent, pos);
+	if (next != NULL)
+		return json_tree_leftmost(next);
+	return pos->parent;
+}
diff --git a/src/lib/json/tree.h b/src/lib/json/tree.h
new file mode 100644
index 0000000..7a9b309
--- /dev/null
+++ b/src/lib/json/tree.h
@@ -0,0 +1,241 @@
+#ifndef TARANTOOL_JSON_TREE_H_INCLUDED
+#define TARANTOOL_JSON_TREE_H_INCLUDED
+/*
+ * Copyright 2010-2018 Tarantool AUTHORS: please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the
+ *    following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <stdint.h>
+#include "small/rlist.h"
+#include "path.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+struct mh_json_tree_node_t;
+
+/**
+ * JSON tree is a hierarchical data structure that organize JSON
+ * nodes produced by parser.
+ * Structure key records point to source strings memory.
+ */
+struct json_tree_node {
+	/** Path component represented by this node. */
+	struct json_path_node key;
+	/** Hash calculated for key field. */
+	uint32_t key_hash;
+	/**
+	 * Children hashtable:
+	 * json_path_node -> json_tree_node.
+	 */
+	struct mh_json_tree_node_t *child_by_path_node;
+	/**
+	 * Link in json_tree_node::children of the parent node.
+	 */
+	struct rlist sibling;
+	/**
+	 * List of child nodes, linked by
+	 * json_tree_node::sibling.
+	 */
+	struct rlist children;
+	/** Pointer to parent json_tree_node record. */
+	struct json_tree_node *parent;
+};
+
+/**
+ * Compute the hash value of a JSON path component.
+ * @param key JSON path component to compute the hash for.
+ * @retval Hash value.
+ */
+uint32_t
+json_path_node_hash(struct json_path_node *key);
+
+/**
+ * Init a JSON tree node.
+ * @param node JSON tree node to initialize.
+ * @param path_node JSON path component the new node will
+ *                  represent. May be NULL, in this case
+ *                  @node initialized as root record.
+ * @param path_node_hash Hash of @path_node, should be calculated
+ *                       with json_path_node_hash.
+ */
+void
+json_tree_node_create(struct json_tree_node *node,
+		      struct json_path_node *path_node,
+		      uint32_t path_node_hash);
+
+/**
+ * Destroy a JSON tree node.
+ * @param node JSON tree node to destruct.
+ */
+void
+json_tree_node_destroy(struct json_tree_node *node);
+
+/**
+ * Look up a child of a JSON tree node by a path component.
+ * @param parent JSON tree node to start lookup with.
+ * @param path_node JSON parser node with data.
+ * @param path_node_hash Hash of @path_node.
+ * @retval not NULL pointer to JSON tree node if any.
+ * @retval NULL on nothing found.
+ */
+struct json_tree_node *
+json_tree_lookup_by_path_node(struct json_tree_node *parent,
+			      struct json_path_node *path_node,
+			      uint32_t path_node_hash);
+
+/**
+ * Add a new JSON tree node at the given position in the tree.
+ * The parent node must not have a child representing the same
+ * path component.
+ * @param parent JSON tree parent node.
+ * @param node JSON tree node to append.
+ * @retval 0 On success, -1 on memory allocation error.
+ */
+int
+json_tree_add(struct json_tree_node *parent,
+	      struct json_tree_node *node);
+
+/**
+ * Delete a JSON tree node at the given position in the tree.
+ * The parent node must have such child.
+ * @param parent JSON tree parent node.
+ * @param node JSON tree node to add.
+ */
+void
+json_tree_remove(struct json_tree_node *parent,
+		 struct json_tree_node *node);
+
+/**
+ * Lookup a field by path in given position in the JSON tree.
+ * @param parent JSON tree parent node.
+ * @param path JSON path string.
+ * @param path_len Length of @path.
+ * @retval not NULL pointer to leaf record if any.
+ */
+struct json_tree_node *
+json_tree_lookup_by_path(struct json_tree_node *parent,
+			 const char *path, uint32_t path_len);
+
+/**
+ * Make pre order traversal in JSON tree.
+ * @param parent The JSON path tree to walk parent.
+ * @param pos Iterator pointer, NULL for initialization.
+ * @retval not NULL pointer to next node. NULL if nothing found.
+ */
+struct json_tree_node *
+json_tree_next_pre(struct json_tree_node *parent,
+		   struct json_tree_node *pos);
+
+/**
+ * Make post order traversal in JSON tree.
+ * @param parent The JSON path tree to walk parent.
+ * @param pos Iterator pointer, NULL for initialization.
+ * @retval not NULL pointer to next node. NULL if nothing found.
+ */
+struct json_tree_node *
+json_tree_next_post(struct json_tree_node *parent,
+		    struct json_tree_node *pos);
+
+/** Return entry by json_tree_node item. */
+#define json_tree_entry(item, type, member) ({				     \
+	const typeof( ((type *)0)->member ) *__mptr = (item);		     \
+	(type *)( (char *)__mptr - ((size_t) &((type *)0)->member) ); })
+
+/** Make lookup in tree by path and return entry. */
+#define json_tree_lookup_entry_by_path(root, path, path_len, type, member)   \
+({struct json_tree_node *__tree_node =					     \
+	json_tree_lookup_by_path(&(root)->member, path, path_len);	     \
+ __tree_node != NULL ? json_tree_entry(__tree_node, type, member) :	     \
+		       NULL; })
+
+/** Make lookup in tree by path_node and return entry. */
+#define json_tree_lookup_entry_by_path_node(root, path_node, path_node_hash, \
+					    type, member)		     \
+({struct json_tree_node *__tree_node =					     \
+	json_tree_lookup_by_path_node(&(root)->member, path_node,	     \
+				      path_node_hash);			     \
+ __tree_node != NULL ? json_tree_entry(__tree_node, type, member) :	     \
+		       NULL; })
+
+/** Make pre-order traversal in JSON tree. */
+#define json_tree_foreach_pre(item, root)				     \
+for ((item) = json_tree_next_pre((root), NULL); (item) != NULL;		     \
+     (item) = json_tree_next_pre((root), (item)))
+
+/** Make post-order traversal in JSON tree. */
+#define json_tree_foreach_post(item, root)				     \
+for ((item) = json_tree_next_post((root), NULL); (item) != NULL;	     \
+     (item) = json_tree_next_post((root), (item)))
+
+/**
+ * Make safe post-order traversal in JSON tree.
+ * Safe for building destructors.
+ */
+#define json_tree_foreach_safe(item, root)				     \
+for (struct json_tree_node *__iter = json_tree_next_post((root), NULL);      \
+     (((item) = __iter) && (__iter = json_tree_next_post((root), (item))),   \
+     (item) != NULL);)
+
+/** Make post-order traversal in JSON tree and return entry. */
+#define json_tree_foreach_entry_pre(item, root, member)			     \
+for (struct json_tree_node *__iter =					     \
+     json_tree_next_pre(&(root)->member, NULL);				     \
+     __iter != NULL && ((item) = json_tree_entry(__iter, typeof(*(root)),    \
+					       member));		     \
+     __iter = json_tree_next_pre(&(root)->member, __iter))
+
+/** Make pre-order traversal in JSON tree and return entry. */
+#define json_tree_foreach_entry_post(item, root, member)		     \
+for (struct json_tree_node *__iter =					     \
+     json_tree_next_post(&(root)->member, NULL);			     \
+     __iter != NULL && ((item) = json_tree_entry(__iter, typeof(*(root)),    \
+					       member));		     \
+     __iter = json_tree_next_post(&(root)->member, __iter))
+
+/**
+ * Make secure post-order traversal in JSON tree and return entry.
+ */
+#define json_tree_foreach_entry_safe(item, root, member)		     \
+for (struct json_tree_node *__tmp_iter, *__iter =			     \
+     json_tree_next_post(NULL, &(root)->member);			     \
+     __iter != NULL && ((item) = json_tree_entry(__iter, typeof(*(root)),    \
+						      member)) &&	     \
+     (__iter != NULL && (__tmp_iter =					     \
+			 json_tree_next_post(&(root)->member, __iter))),     \
+     (__iter != NULL && ((item) = json_tree_entry(__iter, typeof(*(root)),   \
+						  member)));		     \
+     __iter = __tmp_iter)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* TARANTOOL_JSON_TREE_H_INCLUDED */
diff --git a/test/unit/json_path.c b/test/unit/json_path.c
index 1d7e7d3..369a02b 100644
--- a/test/unit/json_path.c
+++ b/test/unit/json_path.c
@@ -1,7 +1,9 @@
 #include "json/path.h"
+#include "json/tree.h"
 #include "unit.h"
 #include "trivia/util.h"
 #include <string.h>
+#include <stdbool.h>
 
 #define reset_to_new_path(value) \
 	path = value; \
@@ -159,14 +161,215 @@ test_errors()
 	footer();
 }
 
+struct test_struct {
+	int value;
+	struct json_tree_node tree_node;
+};
+
+struct test_struct *
+test_add_path(struct test_struct *root, const char *path, uint32_t path_len,
+	      struct test_struct *records_pool, int *pool_idx)
+{
+	int rc;
+	struct json_path_parser parser;
+	struct json_path_node path_node;
+	struct test_struct *parent = root;
+	json_path_parser_create(&parser, path, path_len);
+	while ((rc = json_path_next(&parser, &path_node)) == 0 &&
+		path_node.type != JSON_PATH_END) {
+		uint32_t path_node_hash = json_path_node_hash(&path_node);
+		struct test_struct *new_node =
+			json_tree_lookup_entry_by_path_node(parent, &path_node,
+							    path_node_hash,
+							    struct test_struct,
+							    tree_node);
+		if (new_node == NULL) {
+			new_node = &records_pool[*pool_idx];
+			*pool_idx = *pool_idx + 1;
+			json_tree_node_create(&new_node->tree_node, &path_node,
+					      path_node_hash);
+			rc = json_tree_add(&parent->tree_node,
+					   &new_node->tree_node);
+			fail_if(rc != 0);
+		}
+		parent = new_node;
+	}
+	fail_if(rc != 0 || path_node.type != JSON_PATH_END);
+	return parent;
+}
+
+void
+test_tree()
+{
+	header();
+	plan(42);
+
+	struct test_struct records[6];
+	for (int i = 0; i < 6; i++)
+		records[i].value = i;
+	json_tree_node_create(&records[0].tree_node, NULL, 0);
+	fail_if(&records[0] != json_tree_entry(&records[0].tree_node,
+					       struct test_struct, tree_node));
+
+	const char *path1 = "[1][10]";
+	const char *path2 = "[1][20].file";
+	const char *path3_to_remove = "[2]";
+	const char *path_unregistered = "[1][3]";
+
+	int records_idx = 1;
+	struct test_struct *node;
+	node = test_add_path(&records[0], path1, strlen(path1), records,
+			     &records_idx);
+	is(node, &records[2], "add path '%s'", path1);
+
+	node = test_add_path(&records[0], path2, strlen(path2), records,
+			     &records_idx);
+	is(node, &records[4], "add path '%s'", path2);
+
+	node = json_tree_lookup_entry_by_path(&records[0], path1, strlen(path1),
+					      struct test_struct, tree_node);
+	is(node, &records[2], "lookup path '%s'", path1);
+
+	node = json_tree_lookup_entry_by_path(&records[0], path2, strlen(path2),
+					      struct test_struct, tree_node);
+	is(node, &records[4], "lookup path '%s'", path2);
+
+	node = json_tree_lookup_entry_by_path(&records[0], path_unregistered,
+					      strlen(path_unregistered),
+					      struct test_struct, tree_node);
+	is(node, NULL, "lookup unregistered path '%s'", path_unregistered);
+
+	node = test_add_path(&records[0], path3_to_remove,
+			     strlen(path3_to_remove), records, &records_idx);
+	is(node, &records[5], "add path to remove '%s'", path3_to_remove);
+	if (node != NULL) {
+		json_tree_remove(&records[0].tree_node, &node->tree_node);
+		json_tree_node_destroy(&node->tree_node);
+	} else {
+		isnt(node, NULL, "can't remove empty node!");
+	}
+
+	/* Test iterators. */
+	struct json_tree_node *tree_record = NULL;
+	const struct json_tree_node *tree_nodes_preorder[] =
+		{&records[0].tree_node, &records[1].tree_node,
+		 &records[3].tree_node, &records[4].tree_node,
+		 &records[2].tree_node};
+	int cnt = sizeof(tree_nodes_preorder)/sizeof(tree_nodes_preorder[0]);
+	int idx = 0;
+	json_tree_foreach_pre(tree_record, &records[0].tree_node) {
+		if (idx >= cnt)
+			break;
+		struct test_struct *t1 =
+			json_tree_entry(tree_record, struct test_struct,
+					tree_node);
+		struct test_struct *t2 =
+			json_tree_entry(tree_nodes_preorder[idx],
+					struct test_struct, tree_node);
+		is(tree_record, tree_nodes_preorder[idx],
+		   "test foreach pre order %d: have %d expected %d",
+		   idx, t1->value, t2->value);
+		++idx;
+	}
+	is(idx, 5, "records iterated count %d of %d", idx, cnt);
+
+
+	const struct json_tree_node *tree_nodes_postorder[] =
+		{&records[4].tree_node, &records[3].tree_node,
+		 &records[2].tree_node, &records[1].tree_node,
+		 &records[0].tree_node};
+	cnt = sizeof(tree_nodes_postorder)/sizeof(tree_nodes_postorder[0]);
+	idx = 0;
+	json_tree_foreach_post(tree_record, &records[0].tree_node) {
+		if (idx >= cnt)
+			break;
+		struct test_struct *t1 =
+			json_tree_entry(tree_record, struct test_struct,
+					tree_node);
+		struct test_struct *t2 =
+			json_tree_entry(tree_nodes_postorder[idx],
+					struct test_struct, tree_node);
+		is(tree_record, tree_nodes_postorder[idx],
+		   "test foreach post order %d: have %d expected of %d",
+		   idx, t1->value, t2->value);
+		++idx;
+	}
+	is(idx, 5, "records iterated count %d of %d", idx, cnt);
+
+	idx = 0;
+	json_tree_foreach_safe(tree_record, &records[0].tree_node) {
+		if (idx >= cnt)
+			break;
+		struct test_struct *t1 =
+			json_tree_entry(tree_record, struct test_struct,
+					tree_node);
+		struct test_struct *t2 =
+			json_tree_entry(tree_nodes_postorder[idx],
+					struct test_struct, tree_node);
+		is(tree_record, tree_nodes_postorder[idx],
+		   "test foreach safe order %d: have %d expected of %d",
+		   idx, t1->value, t2->value);
+		++idx;
+	}
+	is(idx, 5, "records iterated count %d of %d", idx, cnt);
+
+	idx = 0;
+	json_tree_foreach_entry_pre(node, &records[0], tree_node) {
+		if (idx >= cnt)
+			break;
+		struct test_struct *t =
+			json_tree_entry(tree_nodes_preorder[idx],
+					struct test_struct, tree_node);
+		is(&node->tree_node, tree_nodes_preorder[idx],
+		   "test foreach entry pre order %d: have %d expected of %d",
+		   idx, node->value, t->value);
+		idx++;
+	}
+	is(idx, 5, "records iterated count %d of %d", idx, cnt);
+
+	idx = 0;
+	json_tree_foreach_entry_post(node, &records[0], tree_node) {
+		if (idx >= cnt)
+			break;
+		struct test_struct *t =
+			json_tree_entry(tree_nodes_postorder[idx],
+					struct test_struct, tree_node);
+		is(&node->tree_node, tree_nodes_postorder[idx],
+		   "test foreach entry post order %d: have %d expected of %d",
+		   idx, node->value, t->value);
+		idx++;
+	}
+	is(idx, 5, "records iterated count %d of %d", idx, cnt);
+
+	idx = 0;
+	json_tree_foreach_entry_safe(node, &records[0], tree_node) {
+		if (idx >= cnt)
+			break;
+		struct test_struct *t =
+			json_tree_entry(tree_nodes_postorder[idx],
+					struct test_struct, tree_node);
+		is(&node->tree_node, tree_nodes_postorder[idx],
+		   "test foreach entry safe order %d: have %d expected of %d",
+		   idx, node->value, t->value);
+		json_tree_node_destroy(&node->tree_node);
+		idx++;
+	}
+	is(idx, 5, "records iterated count %d of %d", idx, cnt);
+
+
+	check_plan();
+	footer();
+}
+
 int
 main()
 {
 	header();
-	plan(2);
+	plan(3);
 
 	test_basic();
 	test_errors();
+	test_tree();
 
 	int rc = check_plan();
 	footer();
-- 
2.7.4

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [tarantool-patches] [PATCH v1 1/1] lib: implement JSON tree class for json library
  2018-10-01 12:27 [PATCH v1 1/1] lib: implement JSON tree class for json library Kirill Shcherbatov
  2018-10-01 21:34 ` Vladimir Davydov
@ 2018-10-04 22:54 ` Konstantin Osipov
  1 sibling, 0 replies; 8+ messages in thread
From: Konstantin Osipov @ 2018-10-04 22:54 UTC (permalink / raw)
  To: tarantool-patches; +Cc: vdavydov.dev, Kirill Shcherbatov

* Kirill Shcherbatov <kshcherbatov@tarantool.org> [18/10/01 18:05]:
> +/** Hash record representing json_tree_node. */
> +struct mh_json_tree_node {
> +	struct json_path_node key;

A trivial question: does this data structure represent an entire
json path or a fragment of a json path? 
> +	uint32_t key_hash;
> +	struct json_tree_node *node;
> +};

Same here. Are we hashing an entire json path or a fragment of the
path? 

I of course could look up in the implementation. My point is that
the comment could be much more informative with just a few words
related to substance, and not formal repetition of what is already
evident. This review remark is relevant about the rest of the
comments. You did not think about the reader, all the comments are
purely formal.

> +
> +/**
> + * Compare json_tree_node hash bindings @a and @b.
> + * @param a Hash record to compare.
> + * @param b Hash record to compare.
> + * @retval 0 On equal, not 0 otherwise.
> + */
> +bool
> +mh_json_tree_node_cmp(const struct mh_json_tree_node *a,
> +		      const struct mh_json_tree_node *b)
> +{
> +	return (a->key.type != b->key.type) ||
> +		((a->key.type != JSON_PATH_STR ||
> +		a->key.len != b->key.len ||
> +		memcmp(a->key.str, b->key.str,
> +		       a->key.len) != 0) &&
> +		(a->key.type != JSON_PATH_NUM ||
> +		a->key_hash != b->key_hash));
> +}

> +
> +/** Define tree record hashtable. */
> +#define MH_SOURCE 1
> +#define mh_name _json_tree_node
> +#define mh_key_t struct mh_json_tree_node *
> +#define mh_node_t struct mh_json_tree_node
> +#define mh_arg_t void *
> +#define mh_hash(a, arg) ((a)->key_hash)
> +#define mh_hash_key(a, arg) ((a)->key_hash)
> +#define mh_cmp(a, b, arg) (mh_json_tree_node_cmp((a), (b)))
> +#define mh_cmp_key(a, b, arg) mh_cmp(a, b, arg)
> +#include "salad/mhash.h"
> +
> +static const uint32_t json_path_node_hash_seed = 13U;
> +
> +uint32_t
> +json_path_node_hash(struct json_path_node *key)
> +{
> +	uint32_t h = json_path_node_hash_seed;
> +	uint32_t carry = 0;
> +	uint8_t *data;
> +	uint32_t data_size;
> +	if (key->type == JSON_PATH_STR) {
> +		data = (uint8_t *)key->str;
> +		data_size = key->len;
> +	} else if (key->type == JSON_PATH_NUM) {
> +		data = (uint8_t *)&key->num;
> +		data_size = sizeof(key->num);
> +	} else {
> +		assert(0);
> +	}
> +	PMurHash32_Process(&h, &carry, data, data_size);
> +	return PMurHash32_Result(h, carry, data_size);

Why did you choose murmur and not luajit hash?

> +	memset(node, 0, sizeof(struct json_tree_node));
> +	node->children_ht = mh_json_tree_node_new();
> +	if (node->children_ht == NULL)
> +		return -1;
> +	if (path_node != NULL) {
> +		memcpy(&node->key, path_node, sizeof(struct json_path_node));
> +		node->key_hash = path_node_hash;
> +	} else {
> +		node->key.type = JSON_PATH_END;
> +	}
> +	rlist_create(&node->siblings);
> +	rlist_create(&node->children);
> +	return 0;

There should be a comment describing what API this data structure
provides and how it works. Why did you choose to create a nested
hash in every node?  What is the complexity of the json path
lookup? There should be a fastpath for exact match which only
evaluates a path hash once.

> +struct json_tree_node *
> +json_tree_lookup_path(struct json_tree_node *root, const char *path,
> +		      uint32_t path_len)
> +{
> +	int rc;
> +	struct json_path_parser parser;
> +	struct json_path_node path_node;
> +	json_path_parser_create(&parser, path, path_len);
> +	while ((rc = json_path_next(&parser, &path_node)) == 0 &&
> +	       path_node.type != JSON_PATH_END && root != NULL) {
> +		uint32_t path_node_hash = json_path_node_hash(&path_node);
> +		struct json_tree_node *node =
> +			json_tree_lookup(root, &path_node, path_node_hash);
> +		assert(node == NULL || node->parent == root);
> +		root = node;
> +	}
> +	if (rc != 0 || path_node.type != JSON_PATH_END)
> +		return NULL;
> +	return root;
> +}

> +/**
> + * Destroy JSON tree node.
> + * @param node JSON tree node to destruct.
> + * @retval 0 On success, -1 on error.
> + */

> +/**
> + * Lookup record with data @path_node having hash @path_node_hash
> + * in JSON tree @node childs.
> + * @param node JSON tree root node.
> + * @param path_node JSON parser node with data.
> + * @param path_node_hash Hash of @path_node.
> + * @retval not NULL pointer to JSON tree node if any.
> + */

I literally hate your comments. 

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [tarantool-patches] Re: [PATCH v1 1/1] lib: implement JSON tree class for json library
  2018-10-03  7:15   ` [tarantool-patches] " Kirill Shcherbatov
@ 2018-10-04 22:59     ` Konstantin Osipov
  2018-10-05  9:50       ` Vladimir Davydov
  0 siblings, 1 reply; 8+ messages in thread
From: Konstantin Osipov @ 2018-10-04 22:59 UTC (permalink / raw)
  To: tarantool-patches; +Cc: Vladimir Davydov

* Kirill Shcherbatov <kshcherbatov@tarantool.org> [18/10/03 15:28]:

Before we proceed with everything else, please, why do you need to
parse a path to look it up?


-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [tarantool-patches] Re: [PATCH v1 1/1] lib: implement JSON tree class for json library
  2018-10-04 22:59     ` Konstantin Osipov
@ 2018-10-05  9:50       ` Vladimir Davydov
  2018-10-06 12:37         ` Konstantin Osipov
  0 siblings, 1 reply; 8+ messages in thread
From: Vladimir Davydov @ 2018-10-05  9:50 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Fri, Oct 05, 2018 at 01:59:08AM +0300, Konstantin Osipov wrote:
> * Kirill Shcherbatov <kshcherbatov@tarantool.org> [18/10/03 15:28]:
> 
> Before we proceed with everything else, please, why do you need to
> parse a path to look it up?

Let me try to explain, because I'm backing this implementation.

How else can you look up a json path in a tree without splitting it in
nodes?

The only way I see is maintaining a hash table mapping full paths to
tree nodes.

There will be such a hash table in each root tuple_field object (it's
not a part of the generic json tree infrastructure). It will be used to
speed up data indexing. Why would we need to bother implementing lookup
in a tree then? Because I assume we want to speed up lookups over all
sub-paths, e.g. suppose we have a tuple

t = {
     {
      customer = {
       name = {
        first = ...,
        last = ...,
       },
       ...
      }
     },
     ...
    }

Then we can lookup 'customer.name.first' in Lua using any of the
following ways:

  t['[1].customer.name.first']
  t[1]['customer.name.first']
  t[1].customer['name.first']
  t[1].customer.name.first

  etc

I assume we want to use tuple field map so as to avoid msgpack decode
whenever possible, not just for cases 1 or 2. Moreover, I guess
t[1].customer.name.first will be more commonly used.

We could of course store a hash table over all full paths in each
intermediate node, but that would result in maintaining way too many
hash entries, e.g. for a tuple field corresponding to 'customer', we
would have to store all the following strings in the hash: 'name',
'name.first' and so on for other descendants. So I think that we only
want to use the hash table for data indexing, i.e. when a field is
looked up by full path like t[1]['customer.name.first']. For all other
references we will split the path into nodes and descend down the tree
to find the appropriate tuple field map slot.

Note, we need the tree in any case, even if we decide not to use it for
lookups at all - it's necessary for building a tuple field map without
unpacking the same prefix multiple times, e.g. if we have two indexed
paths

  a.b.c.d
  a.b.c.e

Then we want to decode 'a.b.c' only once, not twice. The tree helps
achieve that.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [tarantool-patches] Re: [PATCH v1 1/1] lib: implement JSON tree class for json library
  2018-10-05  9:50       ` Vladimir Davydov
@ 2018-10-06 12:37         ` Konstantin Osipov
  2018-10-08 10:07           ` Vladimir Davydov
  0 siblings, 1 reply; 8+ messages in thread
From: Konstantin Osipov @ 2018-10-06 12:37 UTC (permalink / raw)
  To: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [18/10/05 12:52]:
> There will be such a hash table in each root tuple_field object (it's
> not a part of the generic json tree infrastructure). It will be used to
> speed up data indexing. Why would we need to bother implementing lookup
> in a tree then? Because I assume we want to speed up lookups over all
> sub-paths, e.g. suppose we have a tuple

Why do we maintain a separate hash for tuple field names and json
paths within a tuple?

json paths are just a continuation of existing ways to access a
tuple field.

Looks like we need to rebuild existing implementation of field
names support, not add something on a side.

> 
> t = {
>      {
>       customer = {
>        name = {
>         first = ...,
>         last = ...,
>        },
>        ...
>       }
>      },
>      ...
>     }
> 
> Then we can lookup 'customer.name.first' in Lua using any of the
> following ways:
> 
>   t['[1].customer.name.first']
>   t[1]['customer.name.first']
>   t[1].customer['name.first']
>   t[1].customer.name.first
> 
>   etc
> 
> I assume we want to use tuple field map so as to avoid msgpack decode
> whenever possible, not just for cases 1 or 2. Moreover, I guess
> t[1].customer.name.first will be more commonly used.


> 
> We could of course store a hash table over all full paths in each
> intermediate node, but that would result in maintaining way too many
> hash entries, e.g. for a tuple field corresponding to 'customer', we
> would have to store all the following strings in the hash: 'name',
> 'name.first' and so on for other descendants. So I think that we only
> want to use the hash table for data indexing, i.e. when a field is
> looked up by full path like t[1]['customer.name.first']. For all other
> references we will split the path into nodes and descend down the tree
> to find the appropriate tuple field map slot.

I would try to create a single hash table for all possible paths,
including intermediate paths, and store it in the tuple format.

The hash table would store pointers to json tree nodes.

And I also think we need json tree nodes for top level fields -
they are just the same as intermediate fields.

I would also keep in mind that in the future we will automatically
flatten tuples according to a schema before storing in a space.

Flattening, from path access perspective, is simply a replacement
of one path with another  (foo.bar.baz -> [1][1][3]).

> Note, we need the tree in any case, even if we decide not to use it for
> lookups at all - it's necessary for building a tuple field map without
> unpacking the same prefix multiple times, e.g. if we have two indexed
> paths
> 
>   a.b.c.d
>   a.b.c.e
> 
> Then we want to decode 'a.b.c' only once, not twice. The tree helps
> achieve that.

OK, this is clear.

-- 
Konstantin Osipov, Moscow, Russia, +7 903 626 22 32
http://tarantool.io - www.twitter.com/kostja_osipov

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [tarantool-patches] Re: [PATCH v1 1/1] lib: implement JSON tree class for json library
  2018-10-06 12:37         ` Konstantin Osipov
@ 2018-10-08 10:07           ` Vladimir Davydov
  0 siblings, 0 replies; 8+ messages in thread
From: Vladimir Davydov @ 2018-10-08 10:07 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches

On Sat, Oct 06, 2018 at 03:37:14PM +0300, Konstantin Osipov wrote:
> * Vladimir Davydov <vdavydov.dev@gmail.com> [18/10/05 12:52]:
> > There will be such a hash table in each root tuple_field object (it's
> > not a part of the generic json tree infrastructure). It will be used to
> > speed up data indexing. Why would we need to bother implementing lookup
> > in a tree then? Because I assume we want to speed up lookups over all
> > sub-paths, e.g. suppose we have a tuple
> 
> Why do we maintain a separate hash for tuple field names and json
> paths within a tuple?
> 
> json paths are just a continuation of existing ways to access a
> tuple field.
> 
> Looks like we need to rebuild existing implementation of field
> names support, not add something on a side.

Field names are not quite the same as json paths within a field.
Basically, they are just aliases to field indexes, i.e. the same tuple
field can be referenced both by name and by index. With json paths, this
is impossible. Handling this properly would complicate the code IMO.
Also, I believe that incorporating json paths in tuple_dictionary can
slow down users that don't use json paths.

That said, I don't think it'd be good idea to mix the two notions, but
we can discuss it you wish.

> 
> > 
> > t = {
> >      {
> >       customer = {
> >        name = {
> >         first = ...,
> >         last = ...,
> >        },
> >        ...
> >       }
> >      },
> >      ...
> >     }
> > 
> > Then we can lookup 'customer.name.first' in Lua using any of the
> > following ways:
> > 
> >   t['[1].customer.name.first']
> >   t[1]['customer.name.first']
> >   t[1].customer['name.first']
> >   t[1].customer.name.first
> > 
> >   etc
> > 
> > I assume we want to use tuple field map so as to avoid msgpack decode
> > whenever possible, not just for cases 1 or 2. Moreover, I guess
> > t[1].customer.name.first will be more commonly used.
> 
> 
> > 
> > We could of course store a hash table over all full paths in each
> > intermediate node, but that would result in maintaining way too many
> > hash entries, e.g. for a tuple field corresponding to 'customer', we
> > would have to store all the following strings in the hash: 'name',
> > 'name.first' and so on for other descendants. So I think that we only
> > want to use the hash table for data indexing, i.e. when a field is
> > looked up by full path like t[1]['customer.name.first']. For all other
> > references we will split the path into nodes and descend down the tree
> > to find the appropriate tuple field map slot.
> 
> I would try to create a single hash table for all possible paths,
> including intermediate paths, and store it in the tuple format.

This means that the number of hashed paths would grow quadratically with
the number of nodes, e.g. for

  a.b.c.d
  a.b.e.f

we'd have to store

  a, a.b, a.b.c, a.b.c.d, a.b.e, a.b.e.f

in root fields and

  b, b.c, b.c.d, b.e, b.e.f, c, c.d, e, e.f, d, f

in intermediate fields.

I don't think it's worthwhile, because to compute the hash of a json
path, we have to parse it (at least, I don't see any other way), so we
could descend the json path tree as we go looking up path nodes one by
one, instead of looking up a full path in a hash table.

> 
> The hash table would store pointers to json tree nodes.
> 
> And I also think we need json tree nodes for top level fields -
> they are just the same as intermediate fields.
> 
> I would also keep in mind that in the future we will automatically
> flatten tuples according to a schema before storing in a space.
> 
> Flattening, from path access perspective, is simply a replacement
> of one path with another  (foo.bar.baz -> [1][1][3]).

I think tuple flattening is about turning a tree into an array, i.e.

foo.bar.baz -> [N]

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2018-10-08 10:07 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-01 12:27 [PATCH v1 1/1] lib: implement JSON tree class for json library Kirill Shcherbatov
2018-10-01 21:34 ` Vladimir Davydov
2018-10-03  7:15   ` [tarantool-patches] " Kirill Shcherbatov
2018-10-04 22:59     ` Konstantin Osipov
2018-10-05  9:50       ` Vladimir Davydov
2018-10-06 12:37         ` Konstantin Osipov
2018-10-08 10:07           ` Vladimir Davydov
2018-10-04 22:54 ` [tarantool-patches] " Konstantin Osipov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox