[tarantool-patches] [PATCH v2 2/3] Introduce json_path_parser with Unicode support.
Kirill Shcherbatov
kshcherbatov at tarantool.org
Fri Apr 6 14:08:56 MSK 2018
Needed for #1285 and for #1261
---
src/lib/CMakeLists.txt | 1 +
src/lib/json/CMakeLists.txt | 6 ++
src/lib/json/path.c | 242 ++++++++++++++++++++++++++++++++++++++++++++
src/lib/json/path.h | 114 +++++++++++++++++++++
test/unit/CMakeLists.txt | 3 +
test/unit/json_path.c | 165 ++++++++++++++++++++++++++++++
test/unit/json_path.result | 84 +++++++++++++++
7 files changed, 615 insertions(+)
create mode 100644 src/lib/json/CMakeLists.txt
create mode 100644 src/lib/json/path.c
create mode 100644 src/lib/json/path.h
create mode 100644 test/unit/json_path.c
create mode 100644 test/unit/json_path.result
diff --git a/src/lib/CMakeLists.txt b/src/lib/CMakeLists.txt
index 0b274ca..98ff19b 100644
--- a/src/lib/CMakeLists.txt
+++ b/src/lib/CMakeLists.txt
@@ -4,6 +4,7 @@ set(SMALL_EMBEDDED ON)
add_subdirectory(small)
add_subdirectory(salad)
add_subdirectory(csv)
+add_subdirectory(json)
if(ENABLE_BUNDLED_MSGPUCK)
add_subdirectory(msgpuck EXCLUDE_FROM_ALL)
endif()
diff --git a/src/lib/json/CMakeLists.txt b/src/lib/json/CMakeLists.txt
new file mode 100644
index 0000000..203fe6f
--- /dev/null
+++ b/src/lib/json/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Sources of the JSON path parser library.
+set(lib_sources
+ path.c
+)
+
+# NOTE(review): set_source_files_compile_flags() is defined elsewhere in
+# the project; presumably it applies the common compiler flags - confirm.
+set_source_files_compile_flags(${lib_sources})
+# Build the parser as the static library target `json_path`.
+add_library(json_path STATIC ${lib_sources})
diff --git a/src/lib/json/path.c b/src/lib/json/path.c
new file mode 100644
index 0000000..9f9cec3
--- /dev/null
+++ b/src/lib/json/path.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright 2010-2016 Tarantool AUTHORS: please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the
+ * following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "path.h"
+#include <ctype.h>
+#include <unicode/uchar.h>
+#include "trivia/util.h"
+
+/**
+ * Decode one UTF-8 symbol at the parser's offset and step over it.
+ * @param parser JSON path parser. On success its offset and
+ *        symbol_count are advanced; on failure both are left
+ *        exactly as they were before the call.
+ * @param[out] out Decoded code point. Meaningful only when 0 is
+ *             returned.
+ *
+ * @retval 0 Success.
+ * @retval > 0 1-based position of a syntax error: either the end
+ *         of the source is reached, or the symbol decodes to
+ *         U+FFFD (that is how an invalid UTF-8 sequence is
+ *         reported, so a literal U+FFFD is rejected as well).
+ */
+static inline int
+json_read_symbol(struct json_path_parser *parser, UChar32 *out)
+{
+	if (parser->offset == parser->src_len)
+		return parser->symbol_count + 1;
+	int start_offset = parser->offset;
+	U8_NEXT_OR_FFFD(parser->src, parser->offset, parser->src_len, *out);
+	if (*out == 0xFFFD) {
+		/*
+		 * The macro has already moved the offset forward.
+		 * Step back, so that a rejected symbol is never
+		 * skipped and a broken tail of the path can not
+		 * make the offset equal to src_len, which callers
+		 * treat as a normal end of the path.
+		 */
+		parser->offset = start_offset;
+		return parser->symbol_count + 1;
+	}
+	++parser->symbol_count;
+	return 0;
+}
+
+/**
+ * Roll the parser back by exactly one symbol.
+ * @param parser JSON path parser.
+ * @param offset Byte offset to restore; the caller passes the
+ *        offset the parser had before the symbol was read.
+ */
+static inline void
+json_revert_symbol(struct json_path_parser *parser, int offset)
+{
+	parser->symbol_count -= 1;
+	parser->offset = offset;
+}
+
+/**
+ * Step over the current symbol without decoding it. Correct only
+ * when the caller already knows that the symbol takes one byte.
+ */
+static inline void
+json_propagate_char(struct json_path_parser *parser)
+{
+	parser->symbol_count += 1;
+	parser->offset += 1;
+}
+
+/** Peek at the byte under the parser's offset; nothing is moved. */
+static inline char
+json_current_char(struct json_path_parser *parser)
+{
+	return parser->src[parser->offset];
+}
+
+/**
+ * Parse a quoted string key. The parser must stand on the opening
+ * quote. The string ends at the first occurrence of the same
+ * quote symbol. On success the parser stands right after the
+ * closing quote.
+ * @param parser JSON path parser.
+ * @param[out] node Filled with JSON_PATH_STR and the bytes found
+ *             between the quotes (pointing into the source).
+ * @param quote_type Quote symbol that terminates the string.
+ *
+ * @retval 0 Success.
+ * @retval > 0 1-based position of a syntax error: an unreadable
+ *         symbol, a missing closing quote, or an empty string.
+ */
+static inline int
+json_parse_string(struct json_path_parser *parser, struct json_path_node *node,
+		  UChar32 quote_type)
+{
+	assert(parser->offset < parser->src_len);
+	assert(quote_type == json_current_char(parser));
+	/* The opening quote is a one byte symbol - ' or ". */
+	json_propagate_char(parser);
+	const int body_start = parser->offset;
+	for (;;) {
+		UChar32 sym;
+		int rc = json_read_symbol(parser, &sym);
+		if (rc != 0)
+			return rc;
+		if (sym != quote_type)
+			continue;
+		/* The closing quote is already consumed: drop it. */
+		int body_len = parser->offset - body_start - 1;
+		if (body_len == 0)
+			return parser->symbol_count;
+		node->type = JSON_PATH_STR;
+		node->str = parser->src + body_start;
+		node->len = body_len;
+		return 0;
+	}
+}
+
+/**
+ * Parse a run of ASCII decimal digits into an unsigned integer.
+ * On success the parser stands right after the last digit.
+ * @param parser JSON path parser. On success its offset and
+ *        symbol_count are advanced by the number of digits; on
+ *        failure it is not modified.
+ * @param[out] node Filled with JSON_PATH_NUM and the parsed value.
+ *
+ * @retval 0 Success.
+ * @retval > 0 1-based position of a syntax error: the first
+ *         symbol is not a digit, or the digit at that position
+ *         makes the value exceed the uint64_t range.
+ */
+static inline int
+json_parse_integer(struct json_path_parser *parser, struct json_path_node *node)
+{
+	const char *end = parser->src + parser->src_len;
+	const char *pos = parser->src + parser->offset;
+	assert(pos < end);
+	/*
+	 * <ctype.h> functions are defined only for unsigned char
+	 * values and EOF, while bytes of multi-byte UTF-8 symbols
+	 * are negative when char is signed.
+	 */
+	if (! isdigit((unsigned char)*pos))
+		return parser->symbol_count + 1;
+	int len = 0;
+	uint64_t value = 0;
+	do {
+		uint64_t digit = (uint64_t)(*pos - '0');
+		/* Report an error instead of a silent wrap around. */
+		if (value > (UINT64_MAX - digit) / 10)
+			return parser->symbol_count + len + 1;
+		value = value * 10 + digit;
+		++len;
+	} while (++pos < end && isdigit((unsigned char)*pos));
+	parser->offset += len;
+	parser->symbol_count += len;
+	node->type = JSON_PATH_NUM;
+	node->num = value;
+	return 0;
+}
+
+/**
+ * Tell whether a symbol may appear in an unquoted identifier,
+ * i.e. in a path part that is not wrapped into ["..."]: it must
+ * be an underscore, an alphabetic symbol or a digit.
+ */
+static inline bool
+json_is_valid_identifier_sym(UChar32 c)
+{
+	if (c == (UChar32)'_')
+		return true;
+	return u_isUAlphabetic(c) || u_isdigit(c);
+}
+
+/**
+ * Parse an identifier written without quotes. It may consist of
+ * alphabetic symbols, digits and underscores, and its first
+ * symbol can not be a digit. On success the parser stands right
+ * after the last symbol of the identifier: the symbol that
+ * terminated it is not consumed.
+ * @param parser JSON path parser. Updates offset and
+ *        symbol_count.
+ * @param[out] node Filled with JSON_PATH_STR and the identifier
+ *             bytes (pointing into the source).
+ *
+ * @retval 0 Success.
+ * @retval > 0 1-based position of a syntax error: the first
+ *         symbol is unreadable or can not start an identifier.
+ */
+static inline int
+json_parse_identifier(struct json_path_parser *parser,
+		      struct json_path_node *node)
+{
+	assert(parser->offset < parser->src_len);
+	int str_offset = parser->offset;
+	UChar32 c;
+	int rc = json_read_symbol(parser, &c);
+	if (rc != 0)
+		return rc;
+	/* First symbol can not be digit. */
+	if (!u_isalpha(c) && c != (UChar32)'_')
+		return parser->symbol_count;
+	/* Offset right after the last accepted symbol. */
+	int last_offset = parser->offset;
+	while (json_read_symbol(parser, &c) == 0) {
+		if (! json_is_valid_identifier_sym(c)) {
+			json_revert_symbol(parser, last_offset);
+			break;
+		}
+		last_offset = parser->offset;
+	}
+	/*
+	 * The loop may also stop on an unreadable symbol, and the
+	 * failed read could have moved the offset over its bytes.
+	 * Return to the end of the identifier, so the bytes are
+	 * neither counted in the identifier length nor skipped:
+	 * the next read will report them as an error.
+	 */
+	parser->offset = last_offset;
+	node->type = JSON_PATH_STR;
+	node->str = parser->src + str_offset;
+	node->len = last_offset - str_offset;
+	return 0;
+}
+
+/**
+ * Parse the next node of the path: [number], ["string"],
+ * ['string'], .identifier or a bare identifier.
+ * @param parser JSON path parser.
+ * @param[out] node Parsed node. Its type is JSON_PATH_END when
+ *             the whole path is consumed.
+ *
+ * @retval 0 Success.
+ * @retval > 0 1-based position of a syntax error.
+ */
+int
+json_path_next(struct json_path_parser *parser, struct json_path_node *node)
+{
+	/*
+	 * Detect the end of the path before reading anything. A
+	 * failed read of an invalid last symbol can leave the
+	 * offset at src_len too, so checking it after the read
+	 * would report a broken tail as a successful end.
+	 */
+	if (parser->offset == parser->src_len) {
+		node->type = JSON_PATH_END;
+		return 0;
+	}
+	UChar32 c;
+	int last_offset = parser->offset;
+	int rc = json_read_symbol(parser, &c);
+	if (rc != 0)
+		return rc;
+	switch(c) {
+	case (UChar32)'[':
+		/* Error for '['. */
+		if (parser->offset == parser->src_len)
+			return parser->symbol_count;
+		c = json_current_char(parser);
+		if (c == '"' || c == '\'')
+			rc = json_parse_string(parser, node, c);
+		else
+			rc = json_parse_integer(parser, node);
+		if (rc != 0)
+			return rc;
+		/*
+		 * Expression, started from [ must be finished
+		 * with ] regardless of its type.
+		 */
+		if (parser->offset == parser->src_len ||
+		    json_current_char(parser) != ']')
+			return parser->symbol_count + 1;
+		/* Skip ] - one byte char. */
+		json_propagate_char(parser);
+		return 0;
+	case (UChar32)'.':
+		/* A dot must be followed by an identifier. */
+		if (parser->offset == parser->src_len)
+			return parser->symbol_count + 1;
+		return json_parse_identifier(parser, node);
+	default:
+		/* The symbol read is the first one of an identifier. */
+		json_revert_symbol(parser, last_offset);
+		return json_parse_identifier(parser, node);
+	}
+}
diff --git a/src/lib/json/path.h b/src/lib/json/path.h
new file mode 100644
index 0000000..1775507
--- /dev/null
+++ b/src/lib/json/path.h
@@ -0,0 +1,114 @@
+#ifndef TARANTOOL_JSON_PATH_H_INCLUDED
+#define TARANTOOL_JSON_PATH_H_INCLUDED
+/*
+ * Copyright 2010-2016 Tarantool AUTHORS: please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the
+ * following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * State of a parser for JSON paths:
+ * <field>, <.field>, <[123]>, <['field']> and their combinations.
+ * Initialize it with json_path_parser_create() and then call
+ * json_path_next() to get the path nodes one by one.
+ */
+struct json_path_parser {
+ /** Source string. The parser only stores the pointer. */
+ const char *src;
+ /** Length of @a src in bytes. */
+ int src_len;
+ /** Current parser's offset in @a src, in bytes. */
+ int offset;
+ /**
+  * Count of successfully parsed symbols. It is the base for
+  * the 1-based error positions returned by the parser.
+  */
+ int symbol_count;
+};
+
+/** Kind of a node returned by json_path_next(). */
+enum json_path_type {
+ /** Numeric index, stored in json_path_node.num. */
+ JSON_PATH_NUM,
+ /** String key, stored in json_path_node.str and .len. */
+ JSON_PATH_STR,
+ /** Parser reached end of path. */
+ JSON_PATH_END,
+};
+
+/**
+ * Element of a JSON path. It can be either string or number.
+ * String identifiers are in ["..."] and between dots. Numbers are
+ * indexes in [...]. Which member of the union is set is told by
+ * @a type.
+ */
+struct json_path_node {
+ enum json_path_type type;
+ union {
+ struct {
+ /**
+  * String identifier. Points into the parser's source
+  * string and is delimited by @a len, not by a
+  * terminating zero.
+  */
+ const char *str;
+ /** Length of @a str in bytes. */
+ int len;
+ };
+ /** Index value. */
+ uint64_t num;
+ };
+};
+
+/**
+ * Prepare @a parser to read a path from its very beginning.
+ * @param[out] parser Parser to initialize.
+ * @param src Path string. Only the pointer is stored.
+ * @param src_len Length of @a src.
+ */
+static inline void
+json_path_parser_create(struct json_path_parser *parser, const char *src,
+			int src_len)
+{
+	parser->symbol_count = 0;
+	parser->offset = 0;
+	parser->src_len = src_len;
+	parser->src = src;
+}
+
+/**
+ * Get a next path node.
+ * @param parser Parser.
+ * @param[out] node Node to store parsed result.
+ * @retval 0 Success. For result see @a node.type and, depending
+ *         on it, @a node.str and node.len, or node.num. The
+ *         type JSON_PATH_END means that the path is over.
+ * @retval > 0 Position of a syntax error. A position is 1-based
+ *         and starts from a beginning of a source string.
+ */
+int
+json_path_next(struct json_path_parser *parser, struct json_path_node *node);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* TARANTOOL_JSON_PATH_H_INCLUDED */
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 943788b..667194c 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -129,6 +129,9 @@ add_executable(reflection_cxx.test reflection_cxx.cc unit.c
add_executable(csv.test csv.c)
target_link_libraries(csv.test csv)
+add_executable(json_path.test json_path.c)
+target_link_libraries(json_path.test json_path unit ${ICU_LIBRARIES})
+
add_executable(rmean.test rmean.cc)
target_link_libraries(rmean.test stat unit)
add_executable(histogram.test histogram.c)
diff --git a/test/unit/json_path.c b/test/unit/json_path.c
new file mode 100644
index 0000000..336885b
--- /dev/null
+++ b/test/unit/json_path.c
@@ -0,0 +1,165 @@
+#include "json/path.h"
+#include "unit.h"
+#include "trivia/util.h"
+#include <string.h>
+
+/*
+ * Start parsing a new path. Expands to several statements which
+ * assign the caller's local variables `path`, `len` and `parser`.
+ * `value` is expanded twice.
+ */
+#define reset_to_new_path(value) \
+ path = value; \
+ len = strlen(value); \
+ json_path_parser_create(&parser, path, len);
+
+/*
+ * Parse a next node and check that it is the number `value`.
+ * `value_len` is pasted into the message formats as a precision
+ * of %s, so it limits how many bytes of the rest of the path are
+ * printed. Uses the caller's `path`, `parser` and `node`.
+ */
+#define is_next_index(value_len, value) \
+ path = parser.src + parser.offset; \
+ is(json_path_next(&parser, &node), 0, "parse <%." #value_len "s>", \
+ path); \
+ is(node.type, JSON_PATH_NUM, "<%." #value_len "s> is num", path); \
+ is(node.num, value, "<%." #value_len "s> is " #value, path);
+
+/*
+ * Parse a next node and check that it is the string `value`.
+ * `value` must be a string literal: it is concatenated with the
+ * message formats. Uses the caller's `len`, `parser` and `node`.
+ */
+#define is_next_key(value) \
+ len = strlen(value); \
+ is(json_path_next(&parser, &node), 0, "parse <" value ">"); \
+ is(node.type, JSON_PATH_STR, "<" value "> is str"); \
+ is(node.len, len, "len is %d", len); \
+ is(strncmp(node.str, value, len), 0, "str is " value);
+
+/*
+ * Check that valid paths are split into the expected sequence of
+ * nodes. The number and the text of the checks are fixed by
+ * plan(53) and by json_path.result.
+ */
+void
+test_basic()
+{
+ header();
+ plan(53);
+ const char *path;
+ int len;
+ struct json_path_parser parser;
+ struct json_path_node node;
+
+ reset_to_new_path("[0].field1.field2['field3'][5]");
+ is_next_index(3, 0);
+ is_next_key("field1");
+ is_next_key("field2");
+ is_next_key("field3");
+ is_next_index(3, 5);
+
+ reset_to_new_path("[3].field[2].field")
+ is_next_index(3, 3);
+ is_next_key("field");
+ is_next_index(3, 2);
+ is_next_key("field");
+
+ reset_to_new_path("[\"f1\"][\"f2'3'\"]");
+ is_next_key("f1");
+ is_next_key("f2'3'");
+
+ /* Support both '.field1...' and 'field1...'. */
+ reset_to_new_path(".field1");
+ is_next_key("field1");
+
+ /* Long number. */
+ reset_to_new_path("[1234]");
+ is_next_index(6, 1234);
+
+ /*
+  * Empty path: the very first call reports JSON_PATH_END. The
+  * "is str" message does not describe the check, but it is
+  * the text recorded in json_path.result.
+  */
+ reset_to_new_path("");
+ is(json_path_next(&parser, &node), 0, "parse empty path");
+ is(node.type, JSON_PATH_END, "is str");
+
+ /* Path with no '.' at the beginning. */
+ reset_to_new_path("field1.field2");
+ is_next_key("field1");
+
+ check_plan();
+ footer();
+}
+
+/*
+ * Reset the parser to the path `value` and check that the first
+ * json_path_next() call returns `errpos`. The macro declares a
+ * variable `node`, so it fits a scope only once.
+ * NOTE(review): not invoked anywhere in the visible part of this
+ * file - confirm it is still needed.
+ */
+#define check_new_path_on_error(value, errpos) \
+ reset_to_new_path(value); \
+ struct json_path_node node; \
+ is(json_path_next(&parser, &node), errpos, "error on position %d" \
+ " for <%s>", errpos, path);
+
+/** A path together with the error position expected for it. */
+struct path_and_errpos {
+ /** Path to parse. */
+ const char *path;
+ /** Expected result of the first json_path_next() call. */
+ int errpos;
+};
+
+/*
+ * Check the error positions reported for invalid paths. The
+ * table covers errors found by the first json_path_next() call;
+ * the checks after the loop cover errors found by the second
+ * call, after a valid first node. The number and the text of the
+ * checks are fixed by plan(20) and by json_path.result.
+ */
+void
+test_errors()
+{
+ header();
+ plan(20);
+ const char *path;
+ int len;
+ struct json_path_parser parser;
+ const struct path_and_errpos errors[] = {
+ /* Double [[. */
+ {"[[", 2},
+ /* Not string inside []. */
+ {"[field]", 2},
+ /* String outside of []. */
+ {"'field1'.field2", 1},
+ /* Empty brackets. */
+ {"[]", 2},
+ /* Empty string. */
+ {"''", 1},
+ /* Spaces between identifiers. */
+ {" field1", 1},
+ /* Start from digit. */
+ {"1field", 1},
+ {".1field", 2},
+ /* Unfinished identifiers. */
+ {"['field", 8},
+ {"['field'", 9},
+ {"[123", 5},
+ {"['']", 3},
+ /*
+ * Not trivial error: can not write
+ * '[]' after '.'.
+ */
+ {".[123]", 2},
+ /* Misc. */
+ {"[.]", 2},
+ /* Invalid UNICODE */
+ {"['aaa\xc2\xc2']", 6},
+ {".\xc2\xc2", 2},
+ };
+ for (size_t i = 0; i < lengthof(errors); ++i) {
+ reset_to_new_path(errors[i].path);
+ int errpos = errors[i].errpos;
+ struct json_path_node node;
+ is(json_path_next(&parser, &node), errpos,
+ "error on position %d for <%s>", errpos, path);
+ }
+
+ reset_to_new_path("f.[2]")
+ struct json_path_node node;
+ json_path_next(&parser, &node);
+ is(json_path_next(&parser, &node), 3, "can not write <field.[index]>")
+
+ reset_to_new_path("f.")
+ json_path_next(&parser, &node);
+ is(json_path_next(&parser, &node), 3, "error in leading <.>");
+
+ reset_to_new_path("fiel d1")
+ json_path_next(&parser, &node);
+ is(json_path_next(&parser, &node), 5, "space inside identifier");
+
+ reset_to_new_path("field\t1")
+ json_path_next(&parser, &node);
+ is(json_path_next(&parser, &node), 6, "tab inside identifier");
+
+ check_plan();
+ footer();
+}
+
+/*
+ * Run both test groups as two planned subtests and return the
+ * result of check_plan() as the exit code.
+ */
+int
+main()
+{
+ header();
+ plan(2);
+
+ test_basic();
+ test_errors();
+
+ int rc = check_plan();
+ footer();
+ return rc;
+}
diff --git a/test/unit/json_path.result b/test/unit/json_path.result
new file mode 100644
index 0000000..197316d
--- /dev/null
+++ b/test/unit/json_path.result
@@ -0,0 +1,84 @@
+ *** main ***
+1..2
+ *** test_basic ***
+ 1..53
+ ok 1 - parse <[0]>
+ ok 2 - <[0]> is num
+ ok 3 - <[0]> is 0
+ ok 4 - parse <field1>
+ ok 5 - <field1> is str
+ ok 6 - len is 6
+ ok 7 - str is field1
+ ok 8 - parse <field2>
+ ok 9 - <field2> is str
+ ok 10 - len is 6
+ ok 11 - str is field2
+ ok 12 - parse <field3>
+ ok 13 - <field3> is str
+ ok 14 - len is 6
+ ok 15 - str is field3
+ ok 16 - parse <[5]>
+ ok 17 - <[5]> is num
+ ok 18 - <[5]> is 5
+ ok 19 - parse <[3]>
+ ok 20 - <[3]> is num
+ ok 21 - <[3]> is 3
+ ok 22 - parse <field>
+ ok 23 - <field> is str
+ ok 24 - len is 5
+ ok 25 - str is field
+ ok 26 - parse <[2]>
+ ok 27 - <[2]> is num
+ ok 28 - <[2]> is 2
+ ok 29 - parse <field>
+ ok 30 - <field> is str
+ ok 31 - len is 5
+ ok 32 - str is field
+ ok 33 - parse <f1>
+ ok 34 - <f1> is str
+ ok 35 - len is 2
+ ok 36 - str is f1
+ ok 37 - parse <f2'3'>
+ ok 38 - <f2'3'> is str
+ ok 39 - len is 5
+ ok 40 - str is f2'3'
+ ok 41 - parse <field1>
+ ok 42 - <field1> is str
+ ok 43 - len is 6
+ ok 44 - str is field1
+ ok 45 - parse <[1234]>
+ ok 46 - <[1234]> is num
+ ok 47 - <[1234]> is 1234
+ ok 48 - parse empty path
+ ok 49 - is str
+ ok 50 - parse <field1>
+ ok 51 - <field1> is str
+ ok 52 - len is 6
+ ok 53 - str is field1
+ok 1 - subtests
+ *** test_basic: done ***
+ *** test_errors ***
+ 1..20
+ ok 1 - error on position 2 for <[[>
+ ok 2 - error on position 2 for <[field]>
+ ok 3 - error on position 1 for <'field1'.field2>
+ ok 4 - error on position 2 for <[]>
+ ok 5 - error on position 1 for <''>
+ ok 6 - error on position 1 for < field1>
+ ok 7 - error on position 1 for <1field>
+ ok 8 - error on position 2 for <.1field>
+ ok 9 - error on position 8 for <['field>
+ ok 10 - error on position 9 for <['field'>
+ ok 11 - error on position 5 for <[123>
+ ok 12 - error on position 3 for <['']>
+ ok 13 - error on position 2 for <.[123]>
+ ok 14 - error on position 2 for <[.]>
+ ok 15 - error on position 6 for <['aaaÂÂ']>
+ ok 16 - error on position 2 for <.ÂÂ>
+ ok 17 - can not write <field.[index]>
+ ok 18 - error in leading <.>
+ ok 19 - space inside identifier
+ ok 20 - tab inside identifier
+ok 2 - subtests
+ *** test_errors: done ***
+ *** main: done ***
--
2.7.4
More information about the Tarantool-patches
mailing list