From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Date: Tue, 10 Apr 2018 19:36:19 +0300 From: Vladimir Davydov Subject: Re: [PATCH v3 2/3] Introduce json_path_parser with Unicode support. Message-ID: <20180410163619.egndfoanpazswmba@esperanza> References: MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: To: kshcherbatov@tarantool.org Cc: tarantool-patches@freelists.org, v.shpilevoy@tarantool.org List-ID: On Tue, Apr 10, 2018 at 11:31:45AM +0300, Vladislav Shpilevoy wrote: > diff --git a/src/lib/json/path.c b/src/lib/json/path.c > new file mode 100644 > index 000000000..681419f90 > --- /dev/null > +++ b/src/lib/json/path.c > +/** > + * Read a single symbol from a string starting from an offset. > + * @param parser JSON path parser. > + * @param[out] UChar32 Read symbol. > + * > + * @retval 0 Success. > + * @retval > 0 1-based position of a syntax error. > + */ > +static inline int > +json_read_symbol(struct json_path_parser *parser, UChar32 *out) > +{ > + if (parser->offset == parser->src_len) { > + *out = U_SENTINEL; This is not necessary AFAICS. > + return parser->symbol_count + 1; > + } > + U8_NEXT(parser->src, parser->offset, parser->src_len, *out); > + if (*out == U_SENTINEL) > + return parser->symbol_count + 1; > + ++parser->symbol_count; > + return 0; > +} > + > +/** > + * Rollback one symbol offset. > + * @param parser JSON path parser. > + * @param offset Offset to the previous symbol. > + */ > +static inline void > +json_revert_symbol(struct json_path_parser *parser, int offset) Not so good name IMO. Can't think of a better one though. > +{ > + parser->offset = offset; > + --parser->symbol_count; > +} > + > +/** Fast forward when it is known that a symbol is 1-byte char. */ > +static inline void > +json_propagate_char(struct json_path_parser *parser) Bad name IMO, json_skip_char would be better. > +{ > + ++parser->offset; > + ++parser->symbol_count; > +} > + > +/** Get a current symbol as a 1-byte char. */ > +static inline char > +json_current_char(struct json_path_parser *parser) > +{ > + return *(parser->src + parser->offset); > +} > + > +/** > + * Parse string identifier in quotes. Parser either stops right > + * after the closing quote, or returns an error position. > + * @param parser JSON path parser. > + * @param[out] node JSON node to store result. > + * @param quote_type Quote by that a string must be terminated. > + * > + * @retval 0 Success. > + * @retval > 0 1-based position of a syntax error. > + */ > +static inline int > +json_parse_string(struct json_path_parser *parser, struct json_path_node *node, > + UChar32 quote_type) > +{ > + assert(parser->offset < parser->src_len); > + assert(quote_type == json_current_char(parser)); Why pass quote_type if you know that it equals json_current_char? > + /* The first symbol is always char - ' or ". */ > + json_propagate_char(parser); > + int str_offset = parser->offset; > + UChar32 c; > + int rc; > + while ((rc = json_read_symbol(parser, &c)) == 0) { > + if (c == quote_type) { > + int len = parser->offset - str_offset - 1; > + if (len == 0) > + return parser->symbol_count; > + node->type = JSON_PATH_STR; > + node->str = parser->src + str_offset; > + node->len = len; > + return 0; > + } > + } > + return rc; > +} > + > +/** > + * Parse digit sequence into integer until non-digit is met. > + * Parser stops right after the last digit. > + * @param parser JSON parser. > + * @param[out] node JSON node to store result. > + * > + * @retval 0 Success. > + * @retval > 0 1-based position of a syntax error. > + */ > +static inline int > +json_parse_integer(struct json_path_parser *parser, struct json_path_node *node) > +{ > + const char *end = parser->src + parser->src_len; > + const char *pos = parser->src + parser->offset; > + assert(pos < end); > + int len = 0; > + uint64_t value = 0; > + char c = *pos; > + if (! isdigit(c)) > + return parser->symbol_count + 1; > + do { > + value = value * 10 + c - (int)'0'; > + ++len; > + } while (++pos < end && isdigit((c = *pos))); > + parser->offset += len; > + parser->symbol_count += len; > + node->type = JSON_PATH_NUM; > + node->num = value; strtol? > + return 0; > +} > + > +/** > + * Check that a symbol can be part of a JSON path not inside > + * ["..."]. > + */ > +static inline bool > +json_is_valid_identifier_symbol(UChar32 c) > +{ > + return u_isUAlphabetic(c) || c == (UChar32)'_' || u_isdigit(c); > +} > + > +/** > + * Parse identifier out of quotes. It can contain only alphas, > + * digits and underscores. And can not contain digit at the first > + * position. Parser is stoped right after the last non-digit, > + * non-alpha and non-underscore symbol. > + * @param parser JSON parser. Updates signs_read field. What is 'signs_read'? > + * @param[out] node JSON node to store result. > + * > + * @retval 0 Success. > + * @retval > 0 1-based position of a syntax error. > + */ > +static inline int > +json_parse_identifier(struct json_path_parser *parser, > + struct json_path_node *node) > +{ > + assert(parser->offset < parser->src_len); > + int str_offset = parser->offset; > + UChar32 c; > + int rc = json_read_symbol(parser, &c); > + if (rc != 0) > + return rc; > + /* First symbol can not be digit. */ > + if (!u_isalpha(c) && c != (UChar32)'_') > + return parser->symbol_count; > + int last_offset = parser->offset; > + while ((rc = json_read_symbol(parser, &c)) == 0) { > + if (! json_is_valid_identifier_symbol(c)) { I would fold this function, because we have to use u_isalpha right above anyway. > + json_revert_symbol(parser, last_offset); > + break; > + } > + last_offset = parser->offset; > + } > + node->type = JSON_PATH_STR; > + node->str = parser->src + str_offset; > + node->len = parser->offset - str_offset; > + return 0; > +} > + > +int > +json_path_next(struct json_path_parser *parser, struct json_path_node *node) > +{ > + UChar32 c; > + int last_offset = parser->offset; > + int rc = json_read_symbol(parser, &c); > + if (rc != 0) { > + if (parser->offset == parser->src_len) { > + node->type = JSON_PATH_END; > + return 0; > + } IMO it would be more straightforward to first check EOF and only then proceed to reading the string, not vice versa. > + return rc; > + } > + switch(c) { > + case (UChar32)'[': > + /* Error for '[\0'. */ > + if (parser->offset == parser->src_len) > + return parser->symbol_count; > + c = json_current_char(parser); > + if (c == '"' || c == '\'') > + rc = json_parse_string(parser, node, c); > + else > + rc = json_parse_integer(parser, node); > + if (rc != 0) > + return rc; > + /* > + * Expression, started from [ must be finished > + * with ] regardless of its type. > + */ > + if (parser->offset == parser->src_len || > + json_current_char(parser) != ']') > + return parser->symbol_count + 1; > + /* Skip ] - one byte char. */ > + json_propagate_char(parser); > + return 0; > + case (UChar32)'.': > + if (parser->offset == parser->src_len) > + return parser->symbol_count + 1; > + return json_parse_identifier(parser, node); > + default: > + json_revert_symbol(parser, last_offset); > + return json_parse_identifier(parser, node); > + } > +} > diff --git a/test/unit/json_path.c b/test/unit/json_path.c > new file mode 100644 > index 000000000..fb35bf898 > --- /dev/null > +++ b/test/unit/json_path.c > @@ -0,0 +1,172 @@ > +#include "json/path.h" > +#include "unit.h" > +#include "trivia/util.h" > +#include > + > +#define reset_to_new_path(value) \ > + path = value; \ > + len = strlen(value); \ > + json_path_parser_create(&parser, path, len); > + > +#define is_next_index(value_len, value) \ > + path = parser.src + parser.offset; \ > + is(json_path_next(&parser, &node), 0, "parse <%." #value_len "s>", \ > + path); \ > + is(node.type, JSON_PATH_NUM, "<%." #value_len "s> is num", path); \ > + is(node.num, value, "<%." #value_len "s> is " #value, path); > + > +#define is_next_key(value) \ > + len = strlen(value); \ > + is(json_path_next(&parser, &node), 0, "parse <" value ">"); \ > + is(node.type, JSON_PATH_STR, "<" value "> is str"); \ > + is(node.len, len, "len is %d", len); \ > + is(strncmp(node.str, value, len), 0, "str is " value); > + > +void > +test_basic() > +{ > + header(); > + plan(67); > + const char *path; > + int len; > + struct json_path_parser parser; > + struct json_path_node node; > + > + reset_to_new_path("[0].field1.field2['field3'][5]"); > + is_next_index(3, 0); > + is_next_key("field1"); > + is_next_key("field2"); > + is_next_key("field3"); > + is_next_index(3, 5); > + > + reset_to_new_path("[3].field[2].field") > + is_next_index(3, 3); > + is_next_key("field"); > + is_next_index(3, 2); > + is_next_key("field"); > + > + reset_to_new_path("[\"f1\"][\"f2'3'\"]"); > + is_next_key("f1"); > + is_next_key("f2'3'"); > + > + /* Support both '.field1...' and 'field1...'. */ The comment is misleading - there's no test case for 'field1...' > + reset_to_new_path(".field1"); > + is_next_key("field1"); Shouldn't it return an empty string before 'field1' for such an input?