[tarantool-patches] [PATCH v2 1/3] Introduce json_path_parser

Kirill Shcherbatov kshcherbatov at tarantool.org
Thu Mar 29 17:22:02 MSK 2018


From: Vladislav Shpilevoy <v.shpilevoy at tarantool.org>

Needed for #1285 and for #1261
---
 src/lib/CMakeLists.txt      |   1 +
 src/lib/json/CMakeLists.txt |   6 ++
 src/lib/json/path.c         | 188 ++++++++++++++++++++++++++++++++++++++++++++
 src/lib/json/path.h         | 111 ++++++++++++++++++++++++++
 test/unit/CMakeLists.txt    |   3 +
 test/unit/json_path.c       | 162 ++++++++++++++++++++++++++++++++++++++
 test/unit/json_path.result  |  82 +++++++++++++++++++
 7 files changed, 553 insertions(+)
 create mode 100644 src/lib/json/CMakeLists.txt
 create mode 100644 src/lib/json/path.c
 create mode 100644 src/lib/json/path.h
 create mode 100644 test/unit/json_path.c
 create mode 100644 test/unit/json_path.result

diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 0b274ca..98ff19b 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -4,6 +4,7 @@ set(SMALL_EMBEDDED ON)
 add_subdirectory(small)
 add_subdirectory(salad)
 add_subdirectory(csv)
+add_subdirectory(json)
 if(ENABLE_BUNDLED_MSGPUCK)
     add_subdirectory(msgpuck EXCLUDE_FROM_ALL)
 endif()
diff --git a/src/lib/json/CMakeLists.txt b/src/lib/json/CMakeLists.txt
new file mode 100644
index 0000000..203fe6f
--- /dev/null
+++ b/src/lib/json/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(lib_sources
+    path.c
+)
+
+set_source_files_compile_flags(${lib_sources})
+add_library(json_path STATIC ${lib_sources})
diff --git a/src/lib/json/path.c b/src/lib/json/path.c
new file mode 100644
index 0000000..4a6174e
--- /dev/null
+++ b/src/lib/json/path.c
@@ -0,0 +1,188 @@
+/*
+ * Copyright 2010-2016 Tarantool AUTHORS: please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the
+ *    following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "path.h"
+#include <ctype.h>
+#include "trivia/util.h"
+
+/** Decimal strtoull() over exactly @a len digits; no overflow check. */
+static inline uint64_t
+strntoull(const char *src, int len) {
+	uint64_t value = 0;
+	for (const char *end = src + len; src < end; ++src) {
+		assert(isdigit((unsigned char)*src));
+		value = value * 10 + (uint64_t)(*src - '0');
+	}
+	return value;
+}
+
+/**
+ * Parse a non-empty string identifier in quotes. Parser either
+ * stops right after the closing quote, or returns an error position.
+ * @param parser JSON path parser.
+ * @param[out] node JSON node to store result.
+ *
+ * @retval     0 Success.
+ * @retval not 0 1-based position of a syntax error.
+ */
+static inline int
+json_parse_string(struct json_path_parser *parser, struct json_path_node *node)
+{
+	const char *end = parser->src + parser->src_len;
+	const char *pos = parser->pos;
+	assert(pos < end);
+	char quote_type = *pos;
+	assert(quote_type == '\'' || quote_type == '"');
+	/* Skip first quote. */
+	const char *str = ++pos;
+	/* Test the bound before reading: src may lack a terminator. */
+	while (pos < end && *pos != quote_type)
+		++pos;
+	int len = pos - str;
+	/* A string must be non-empty and terminated with quote. */
+	if (pos == end || len == 0)
+		return pos - parser->src + 1;
+	/* Skip the closing quote. */
+	parser->pos = pos + 1;
+	node->type = JSON_PATH_STR;
+	node->str = str;
+	node->len = len;
+	return 0;
+}
+
+/**
+ * Parse digit sequence into integer until non-digit is met.
+ * Parser stops right after the last digit.
+ * @param parser JSON parser.
+ * @param[out] node JSON node to store result.
+ *
+ * @retval     0 Success.
+ * @retval not 0 1-based position of a syntax error.
+ */
+static inline int
+json_parse_integer(struct json_path_parser *parser, struct json_path_node *node)
+{
+	const char *end = parser->src + parser->src_len;
+	const char *pos = parser->pos;
+	assert(pos < end);
+	const char *str = pos;
+	/* Test the bound before reading a symbol. */
+	while (pos < end && isdigit((unsigned char)*pos))
+		++pos;
+	if (pos == str)
+		return pos - parser->src + 1;
+	parser->pos = pos;
+	node->type = JSON_PATH_NUM;
+	node->num = strntoull(str, pos - str);
+	return 0;
+}
+
+/**
+ * Parse identifier out of quotes. It can contain only alphas,
+ * digits and underscores. And can not contain digit at the first
+ * position. Parser is stopped right after the last alpha, digit
+ * or underscore symbol of the identifier.
+ * @param parser JSON parser.
+ * @param[out] node JSON node to store result.
+ *
+ * @retval     0 Success.
+ * @retval not 0 1-based position of a syntax error.
+ */
+static inline int
+json_parse_identifier(struct json_path_parser *parser,
+		      struct json_path_node *node)
+{
+	const char *end = parser->src + parser->src_len;
+	const char *pos = parser->pos;
+	assert(pos < end);
+	const char *str = pos;
+	/* First symbol can not be digit. */
+	if (!isalpha((unsigned char)*pos) && *pos != '_')
+		return pos - parser->src + 1;
+	/* Test the bound before reading the next symbol. */
+	for (++pos; pos < end; ++pos) {
+		if (!isalnum((unsigned char)*pos) && *pos != '_')
+			break;
+	}
+	/* pos now points at the first symbol past the identifier. */
+	parser->pos = pos;
+	node->type = JSON_PATH_STR;
+	node->str = str;
+	node->len = pos - str;
+	return 0;
+}
+
+int
+json_path_next(struct json_path_parser *parser, struct json_path_node *node)
+{
+	const char *end = parser->src + parser->src_len;
+	if (end == parser->pos) {
+		node->type = JSON_PATH_END;
+		return 0;
+	}
+	char c = *parser->pos;
+	int rc;
+	switch(c) {
+	case '[':
+		++parser->pos;
+		/* Error if the path ends right after '['. */
+		if (parser->pos == end)
+			return parser->pos - parser->src + 1;
+		c = *parser->pos;
+		if (c == '"' || c == '\'')
+			rc = json_parse_string(parser, node);
+		else
+			rc = json_parse_integer(parser, node);
+		if (rc != 0)
+			return rc;
+		/*
+		 * Expression, started from [ must be finished
+		 * with ] regardless of its type.
+		 */
+		if (parser->pos == end || *parser->pos != ']')
+			return parser->pos - parser->src + 1;
+		/* Skip ]. */
+		++parser->pos;
+		break;
+	case '.':
+		/* Skip dot; an identifier must follow it. */
+		++parser->pos;
+		if (parser->pos == end)
+			return parser->pos - parser->src + 1;
+		FALLTHROUGH
+	default:
+		rc = json_parse_identifier(parser, node);
+		if (rc != 0)
+			return rc;
+		break;
+	}
+	return 0;
+}
diff --git a/src/lib/json/path.h b/src/lib/json/path.h
new file mode 100644
index 0000000..6e8db4c
--- /dev/null
+++ b/src/lib/json/path.h
@@ -0,0 +1,111 @@
+#ifndef TARANTOOL_JSON_PATH_H_INCLUDED
+#define TARANTOOL_JSON_PATH_H_INCLUDED
+/*
+ * Copyright 2010-2016 Tarantool AUTHORS: please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the
+ *    following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Parser for JSON paths:
+ * <field>, <.field>, <[123]>, <['field']> and their combinations.
+ */
+struct json_path_parser {
+	/** Source string. */
+	const char *src;
+	/** Length of src. */
+	int src_len;
+	/** Current parser's position. */
+	const char *pos;
+};
+
+enum json_path_type {
+	JSON_PATH_NUM,
+	JSON_PATH_STR,
+	/** Parser reached end of path. */
+	JSON_PATH_END,
+};
+
+/**
+ * Element of a JSON path. It can be either string or number.
+ * String identifiers are in ["..."] and between dots. Numbers are
+ * indexes in [...].
+ */
+struct json_path_node {
+	enum json_path_type type;
+	union {
+		struct {
+			/** String identifier. */
+			const char *str;
+			/** Length of @a str. */
+			int len;
+		};
+		/** Index value. */
+		uint64_t num;
+	};
+};
+
+/**
+ * Create @a parser.
+ * @param[out] parser Parser to create.
+ * @param src Source string.
+ * @param src_len Length of @a src.
+ */
+static inline void
+json_path_parser_create(struct json_path_parser *parser, const char *src,
+			int src_len)
+{
+	parser->src = src;
+	parser->src_len = src_len;
+	parser->pos = src;
+}
+
+/**
+ * Get a next path node.
+ * @param parser Parser.
+ * @param[out] node Node to store parsed result.
+ * @retval 0   Success. For result see @a node.str, node.len,
+ *             node.num.
+ * @retval > 0 Position of a syntax error. A position is 1-based
+ *             and starts from a beginning of a source string.
+ */
+int
+json_path_next(struct json_path_parser *parser, struct json_path_node *node);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* TARANTOOL_JSON_PATH_H_INCLUDED */
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 943788b..fe8b2d2 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -129,6 +129,9 @@ add_executable(reflection_cxx.test reflection_cxx.cc unit.c
 add_executable(csv.test csv.c)
 target_link_libraries(csv.test csv)
 
+add_executable(json_path.test json_path.c)
+target_link_libraries(json_path.test json_path unit)
+
 add_executable(rmean.test rmean.cc)
 target_link_libraries(rmean.test stat unit)
 add_executable(histogram.test histogram.c)
diff --git a/test/unit/json_path.c b/test/unit/json_path.c
new file mode 100644
index 0000000..599658b
--- /dev/null
+++ b/test/unit/json_path.c
@@ -0,0 +1,162 @@
+#include "json/path.h"
+#include "unit.h"
+#include "trivia/util.h"
+#include <string.h>
+
+#define reset_to_new_path(value) \
+	path = value; \
+	len = strlen(value); \
+	json_path_parser_create(&parser, path, len);
+
+#define is_next_index(value_len, value) \
+	path = parser.pos; \
+	is(json_path_next(&parser, &node), 0, "parse <%." #value_len "s>", \
+	   path); \
+	is(node.type, JSON_PATH_NUM, "<%." #value_len "s> is num", path); \
+	is(node.num, value, "<%." #value_len "s> is " #value, path);
+
+#define is_next_key(value) \
+	len = strlen(value); \
+	is(json_path_next(&parser, &node), 0, "parse <" value ">"); \
+	is(node.type, JSON_PATH_STR, "<" value "> is str"); \
+	is(node.len, len, "len is %d", len); \
+	is(strncmp(node.str, value, len), 0, "str is " value);
+
+void
+test_basic()
+{
+	header();
+	plan(53);
+	const char *path;
+	int len;
+	struct json_path_parser parser;
+	struct json_path_node node;
+
+	reset_to_new_path("[0].field1.field2['field3'][5]");
+	is_next_index(3, 0);
+	is_next_key("field1");
+	is_next_key("field2");
+	is_next_key("field3");
+	is_next_index(3, 5);
+
+	reset_to_new_path("[3].field[2].field")
+	is_next_index(3, 3);
+	is_next_key("field");
+	is_next_index(3, 2);
+	is_next_key("field");
+
+	reset_to_new_path("[\"f1\"][\"f2'3'\"]");
+	is_next_key("f1");
+	is_next_key("f2'3'");
+
+	/* Support both '.field1...' and 'field1...'. */
+	reset_to_new_path(".field1");
+	is_next_key("field1");
+
+	/* Long number. */
+	reset_to_new_path("[1234]");
+	is_next_index(6, 1234);
+
+	/* Empty path. */
+	reset_to_new_path("");
+	is(json_path_next(&parser, &node), 0, "parse empty path");
+	is(node.type, JSON_PATH_END, "is str");
+
+	/* Path with no '.' at the beginning. */
+	reset_to_new_path("field1.field2");
+	is_next_key("field1");
+
+	check_plan();
+	footer();
+}
+
+#define check_new_path_on_error(value, errpos) \
+	reset_to_new_path(value); \
+	struct json_path_node node; \
+	is(json_path_next(&parser, &node), errpos, "error on position %d" \
+	   " for <%s>", errpos, path);
+
+struct path_and_errpos {
+	const char *path;
+	int errpos;
+};
+
+void
+test_errors()
+{
+	header();
+	plan(18);
+	const char *path;
+	int len;
+	struct json_path_parser parser;
+	const struct path_and_errpos errors[] = {
+		/* Double [[. */
+		{"[[", 2},
+		/* Not string inside []. */
+		{"[field]", 2},
+		/* String outside of []. */
+		{"'field1'.field2", 1},
+		/* Empty brackets. */
+		{"[]", 2},
+		/* Empty string. */
+		{"''", 1},
+		/* Spaces between identifiers. */
+		{" field1", 1},
+		/* Start from digit. */
+		{"1field", 1},
+		{".1field", 2},
+		/* Unfinished identifiers. */
+		{"['field", 8},
+		{"['field'", 9},
+		{"[123", 5},
+		{"['']", 3},
+		/*
+		 * Not trivial error: can not write
+		 * '[]' after '.'.
+		 */
+		{".[123]", 2},
+		/* Misc. */
+		{"[.]", 2},
+	};
+	for (size_t i = 0; i < lengthof(errors); ++i) {
+		reset_to_new_path(errors[i].path);
+		int errpos = errors[i].errpos;
+		struct json_path_node node;
+		is(json_path_next(&parser, &node), errpos,
+		   "error on position %d for <%s>", errpos, path);
+	}
+
+	reset_to_new_path("f.[2]")
+	struct json_path_node node;
+	json_path_next(&parser, &node);
+	is(json_path_next(&parser, &node), 3, "can not write <field.[index]>")
+
+	reset_to_new_path("f.")
+	json_path_next(&parser, &node);
+	is(json_path_next(&parser, &node), 3, "error in leading <.>");
+
+	reset_to_new_path("fiel d1")
+	json_path_next(&parser, &node);
+	is(json_path_next(&parser, &node), 5, "space inside identifier");
+
+	reset_to_new_path("field\t1")
+	json_path_next(&parser, &node);
+	is(json_path_next(&parser, &node), 6, "tab inside identifier");
+
+	check_plan();
+	footer();
+}
+
+int
+main()
+{
+	header();
+	plan(2);
+
+	test_basic();
+	test_errors();
+
+	int rc = check_plan();
+	footer();
+	return rc;
+}
diff --git a/test/unit/json_path.result b/test/unit/json_path.result
new file mode 100644
index 0000000..6d28113
--- /dev/null
+++ b/test/unit/json_path.result
@@ -0,0 +1,82 @@
+	*** main ***
+1..2
+	*** test_basic ***
+    1..53
+    ok 1 - parse <[0]>
+    ok 2 - <[0]> is num
+    ok 3 - <[0]> is 0
+    ok 4 - parse <field1>
+    ok 5 - <field1> is str
+    ok 6 - len is 6
+    ok 7 - str is field1
+    ok 8 - parse <field2>
+    ok 9 - <field2> is str
+    ok 10 - len is 6
+    ok 11 - str is field2
+    ok 12 - parse <field3>
+    ok 13 - <field3> is str
+    ok 14 - len is 6
+    ok 15 - str is field3
+    ok 16 - parse <[5]>
+    ok 17 - <[5]> is num
+    ok 18 - <[5]> is 5
+    ok 19 - parse <[3]>
+    ok 20 - <[3]> is num
+    ok 21 - <[3]> is 3
+    ok 22 - parse <field>
+    ok 23 - <field> is str
+    ok 24 - len is 5
+    ok 25 - str is field
+    ok 26 - parse <[2]>
+    ok 27 - <[2]> is num
+    ok 28 - <[2]> is 2
+    ok 29 - parse <field>
+    ok 30 - <field> is str
+    ok 31 - len is 5
+    ok 32 - str is field
+    ok 33 - parse <f1>
+    ok 34 - <f1> is str
+    ok 35 - len is 2
+    ok 36 - str is f1
+    ok 37 - parse <f2'3'>
+    ok 38 - <f2'3'> is str
+    ok 39 - len is 5
+    ok 40 - str is f2'3'
+    ok 41 - parse <field1>
+    ok 42 - <field1> is str
+    ok 43 - len is 6
+    ok 44 - str is field1
+    ok 45 - parse <[1234]>
+    ok 46 - <[1234]> is num
+    ok 47 - <[1234]> is 1234
+    ok 48 - parse empty path
+    ok 49 - is str
+    ok 50 - parse <field1>
+    ok 51 - <field1> is str
+    ok 52 - len is 6
+    ok 53 - str is field1
+ok 1 - subtests
+	*** test_basic: done ***
+	*** test_errors ***
+    1..18
+    ok 1 - error on position 2 for <[[>
+    ok 2 - error on position 2 for <[field]>
+    ok 3 - error on position 1 for <'field1'.field2>
+    ok 4 - error on position 2 for <[]>
+    ok 5 - error on position 1 for <''>
+    ok 6 - error on position 1 for < field1>
+    ok 7 - error on position 1 for <1field>
+    ok 8 - error on position 2 for <.1field>
+    ok 9 - error on position 8 for <['field>
+    ok 10 - error on position 9 for <['field'>
+    ok 11 - error on position 5 for <[123>
+    ok 12 - error on position 3 for <['']>
+    ok 13 - error on position 2 for <.[123]>
+    ok 14 - error on position 2 for <[.]>
+    ok 15 - can not write <field.[index]>
+    ok 16 - error in leading <.>
+    ok 17 - space inside identifier
+    ok 18 - tab inside identifier
+ok 2 - subtests
+	*** test_errors: done ***
+	*** main: done ***
-- 
2.7.4





More information about the Tarantool-patches mailing list