[PATCH v3 2/3] Introduce json_path_parser with Unicode support.
Vladimir Davydov
vdavydov.dev at gmail.com
Tue Apr 10 19:36:19 MSK 2018
On Tue, Apr 10, 2018 at 11:31:45AM +0300, Vladislav Shpilevoy wrote:
> diff --git a/src/lib/json/path.c b/src/lib/json/path.c
> new file mode 100644
> index 000000000..681419f90
> --- /dev/null
> +++ b/src/lib/json/path.c
> +/**
> + * Read a single symbol from a string starting from an offset.
> + * @param parser JSON path parser.
> + * @param[out] UChar32 Read symbol.
> + *
> + * @retval 0 Success.
> + * @retval > 0 1-based position of a syntax error.
> + */
> +static inline int
> +json_read_symbol(struct json_path_parser *parser, UChar32 *out)
> +{
> + if (parser->offset == parser->src_len) {
> + *out = U_SENTINEL;
This is not necessary AFAICS.
> + return parser->symbol_count + 1;
> + }
> + U8_NEXT(parser->src, parser->offset, parser->src_len, *out);
> + if (*out == U_SENTINEL)
> + return parser->symbol_count + 1;
> + ++parser->symbol_count;
> + return 0;
> +}
> +
> +/**
> + * Rollback one symbol offset.
> + * @param parser JSON path parser.
> + * @param offset Offset to the previous symbol.
> + */
> +static inline void
> +json_revert_symbol(struct json_path_parser *parser, int offset)
Not so good name IMO. Can't think of a better one though.
> +{
> + parser->offset = offset;
> + --parser->symbol_count;
> +}
> +
> +/** Fast forward when it is known that a symbol is 1-byte char. */
> +static inline void
> +json_propagate_char(struct json_path_parser *parser)
Bad name IMO, json_skip_char would be better.
> +{
> + ++parser->offset;
> + ++parser->symbol_count;
> +}
> +
> +/** Get a current symbol as a 1-byte char. */
> +static inline char
> +json_current_char(struct json_path_parser *parser)
> +{
> + return *(parser->src + parser->offset);
> +}
> +
> +/**
> + * Parse string identifier in quotes. Parser either stops right
> + * after the closing quote, or returns an error position.
> + * @param parser JSON path parser.
> + * @param[out] node JSON node to store result.
> + * @param quote_type Quote by that a string must be terminated.
> + *
> + * @retval 0 Success.
> + * @retval > 0 1-based position of a syntax error.
> + */
> +static inline int
> +json_parse_string(struct json_path_parser *parser, struct json_path_node *node,
> + UChar32 quote_type)
> +{
> + assert(parser->offset < parser->src_len);
> + assert(quote_type == json_current_char(parser));
Why pass quote_type if you know that it equals json_current_char?
> + /* The first symbol is always char - ' or ". */
> + json_propagate_char(parser);
> + int str_offset = parser->offset;
> + UChar32 c;
> + int rc;
> + while ((rc = json_read_symbol(parser, &c)) == 0) {
> + if (c == quote_type) {
> + int len = parser->offset - str_offset - 1;
> + if (len == 0)
> + return parser->symbol_count;
> + node->type = JSON_PATH_STR;
> + node->str = parser->src + str_offset;
> + node->len = len;
> + return 0;
> + }
> + }
> + return rc;
> +}
> +
> +/**
> + * Parse digit sequence into integer until non-digit is met.
> + * Parser stops right after the last digit.
> + * @param parser JSON parser.
> + * @param[out] node JSON node to store result.
> + *
> + * @retval 0 Success.
> + * @retval > 0 1-based position of a syntax error.
> + */
> +static inline int
> +json_parse_integer(struct json_path_parser *parser, struct json_path_node *node)
> +{
> + const char *end = parser->src + parser->src_len;
> + const char *pos = parser->src + parser->offset;
> + assert(pos < end);
> + int len = 0;
> + uint64_t value = 0;
> + char c = *pos;
> + if (! isdigit(c))
> + return parser->symbol_count + 1;
> + do {
> + value = value * 10 + c - (int)'0';
> + ++len;
> + } while (++pos < end && isdigit((c = *pos)));
> + parser->offset += len;
> + parser->symbol_count += len;
> + node->type = JSON_PATH_NUM;
> + node->num = value;
strtol?
> + return 0;
> +}
> +
> +/**
> + * Check that a symbol can be part of a JSON path not inside
> + * ["..."].
> + */
> +static inline bool
> +json_is_valid_identifier_symbol(UChar32 c)
> +{
> + return u_isUAlphabetic(c) || c == (UChar32)'_' || u_isdigit(c);
> +}
> +
> +/**
> + * Parse identifier out of quotes. It can contain only alphas,
> + * digits and underscores. And can not contain digit at the first
> + * position. Parser is stoped right after the last non-digit,
> + * non-alpha and non-underscore symbol.
> + * @param parser JSON parser. Updates signs_read field.
What is 'signs_read'?
> + * @param[out] node JSON node to store result.
> + *
> + * @retval 0 Success.
> + * @retval > 0 1-based position of a syntax error.
> + */
> +static inline int
> +json_parse_identifier(struct json_path_parser *parser,
> + struct json_path_node *node)
> +{
> + assert(parser->offset < parser->src_len);
> + int str_offset = parser->offset;
> + UChar32 c;
> + int rc = json_read_symbol(parser, &c);
> + if (rc != 0)
> + return rc;
> + /* First symbol can not be digit. */
> + if (!u_isalpha(c) && c != (UChar32)'_')
> + return parser->symbol_count;
> + int last_offset = parser->offset;
> + while ((rc = json_read_symbol(parser, &c)) == 0) {
> + if (! json_is_valid_identifier_symbol(c)) {
I would fold this function, because we have to use u_isalpha right above
anyway.
> + json_revert_symbol(parser, last_offset);
> + break;
> + }
> + last_offset = parser->offset;
> + }
> + node->type = JSON_PATH_STR;
> + node->str = parser->src + str_offset;
> + node->len = parser->offset - str_offset;
> + return 0;
> +}
> +
> +int
> +json_path_next(struct json_path_parser *parser, struct json_path_node *node)
> +{
> + UChar32 c;
> + int last_offset = parser->offset;
> + int rc = json_read_symbol(parser, &c);
> + if (rc != 0) {
> + if (parser->offset == parser->src_len) {
> + node->type = JSON_PATH_END;
> + return 0;
> + }
IMO it would be more straightforward to first check EOF and only then
proceed to reading the string, not vice versa.
> + return rc;
> + }
> + switch(c) {
> + case (UChar32)'[':
> + /* Error for '[\0'. */
> + if (parser->offset == parser->src_len)
> + return parser->symbol_count;
> + c = json_current_char(parser);
> + if (c == '"' || c == '\'')
> + rc = json_parse_string(parser, node, c);
> + else
> + rc = json_parse_integer(parser, node);
> + if (rc != 0)
> + return rc;
> + /*
> + * Expression, started from [ must be finished
> + * with ] regardless of its type.
> + */
> + if (parser->offset == parser->src_len ||
> + json_current_char(parser) != ']')
> + return parser->symbol_count + 1;
> + /* Skip ] - one byte char. */
> + json_propagate_char(parser);
> + return 0;
> + case (UChar32)'.':
> + if (parser->offset == parser->src_len)
> + return parser->symbol_count + 1;
> + return json_parse_identifier(parser, node);
> + default:
> + json_revert_symbol(parser, last_offset);
> + return json_parse_identifier(parser, node);
> + }
> +}
> diff --git a/test/unit/json_path.c b/test/unit/json_path.c
> new file mode 100644
> index 000000000..fb35bf898
> --- /dev/null
> +++ b/test/unit/json_path.c
> @@ -0,0 +1,172 @@
> +#include "json/path.h"
> +#include "unit.h"
> +#include "trivia/util.h"
> +#include <string.h>
> +
> +#define reset_to_new_path(value) \
> + path = value; \
> + len = strlen(value); \
> + json_path_parser_create(&parser, path, len);
> +
> +#define is_next_index(value_len, value) \
> + path = parser.src + parser.offset; \
> + is(json_path_next(&parser, &node), 0, "parse <%." #value_len "s>", \
> + path); \
> + is(node.type, JSON_PATH_NUM, "<%." #value_len "s> is num", path); \
> + is(node.num, value, "<%." #value_len "s> is " #value, path);
> +
> +#define is_next_key(value) \
> + len = strlen(value); \
> + is(json_path_next(&parser, &node), 0, "parse <" value ">"); \
> + is(node.type, JSON_PATH_STR, "<" value "> is str"); \
> + is(node.len, len, "len is %d", len); \
> + is(strncmp(node.str, value, len), 0, "str is " value);
> +
> +void
> +test_basic()
> +{
> + header();
> + plan(67);
> + const char *path;
> + int len;
> + struct json_path_parser parser;
> + struct json_path_node node;
> +
> + reset_to_new_path("[0].field1.field2['field3'][5]");
> + is_next_index(3, 0);
> + is_next_key("field1");
> + is_next_key("field2");
> + is_next_key("field3");
> + is_next_index(3, 5);
> +
> + reset_to_new_path("[3].field[2].field")
> + is_next_index(3, 3);
> + is_next_key("field");
> + is_next_index(3, 2);
> + is_next_key("field");
> +
> + reset_to_new_path("[\"f1\"][\"f2'3'\"]");
> + is_next_key("f1");
> + is_next_key("f2'3'");
> +
> + /* Support both '.field1...' and 'field1...'. */
The comment is misleading - there's no test case for 'field1...'
> + reset_to_new_path(".field1");
> + is_next_key("field1");
Shouldn't it return an empty string before 'field1' for such an input?
More information about the Tarantool-patches
mailing list