[PATCH v3 2/3] Introduce json_path_parser with Unicode support.

Tue Apr 10 19:36:19 MSK 2018

On Tue, Apr 10, 2018 at 11:31:45AM +0300, Vladislav Shpilevoy wrote:
> diff --git a/src/lib/json/path.c b/src/lib/json/path.c
> new file mode 100644
> index 000000000..681419f90
> --- /dev/null
> +++ b/src/lib/json/path.c

> +/**
> + * Read a single symbol from a string starting from an offset.
> + * @param parser JSON path parser.
> + * @param[out] UChar32 Read symbol.
> + *
> + * @retval   0 Success.
> + * @retval > 0 1-based position of a syntax error.
> + */
> +static inline int
> +json_read_symbol(struct json_path_parser *parser, UChar32 *out)
> +{
> +	if (parser->offset == parser->src_len) {

> +		*out = U_SENTINEL;

This is not necessary AFAICS.

> +		return parser->symbol_count + 1;
> +	}
> +	U8_NEXT(parser->src, parser->offset, parser->src_len, *out);
> +	if (*out == U_SENTINEL)
> +		return parser->symbol_count + 1;
> +	++parser->symbol_count;
> +	return 0;
> +}
> +
> +/**
> + * Rollback one symbol offset.
> + * @param parser JSON path parser.
> + * @param offset Offset to the previous symbol.
> + */
> +static inline void
> +json_revert_symbol(struct json_path_parser *parser, int offset)

Not so good name IMO. Can't think of a better one though.

> +{
> +	parser->offset = offset;
> +	--parser->symbol_count;
> +}
> +
> +/** Fast forward when it is known that a symbol is 1-byte char. */
> +static inline void
> +json_propagate_char(struct json_path_parser *parser)

Bad name IMO, json_skip_char would be better.

> +{
> +	++parser->offset;
> +	++parser->symbol_count;
> +}
> +
> +/** Get a current symbol as a 1-byte char. */
> +static inline char
> +json_current_char(struct json_path_parser *parser)
> +{
> +	return *(parser->src + parser->offset);
> +}
> +
> +/**
> + * Parse string identifier in quotes. Parser either stops right
> + * after the closing quote, or returns an error position.
> + * @param parser JSON path parser.
> + * @param[out] node JSON node to store result.
> + * @param quote_type Quote by that a string must be terminated.
> + *
> + * @retval   0 Success.
> + * @retval > 0 1-based position of a syntax error.
> + */
> +static inline int
> +json_parse_string(struct json_path_parser *parser, struct json_path_node *node,
> +		  UChar32 quote_type)
> +{
> +	assert(parser->offset < parser->src_len);

> +	assert(quote_type == json_current_char(parser));

Why pass quote_type if you know that it equals json_current_char?

> +	/* The first symbol is always char  - ' or ". */
> +	json_propagate_char(parser);
> +	int str_offset = parser->offset;
> +	UChar32 c;
> +	int rc;
> +	while ((rc = json_read_symbol(parser, &c)) == 0) {
> +		if (c == quote_type) {
> +			int len = parser->offset - str_offset - 1;
> +			if (len == 0)
> +				return parser->symbol_count;
> +			node->type = JSON_PATH_STR;
> +			node->str = parser->src + str_offset;
> +			node->len = len;
> +			return 0;
> +		}
> +	}
> +	return rc;
> +}
> +
> +/**
> + * Parse digit sequence into integer until non-digit is met.
> + * Parser stops right after the last digit.
> + * @param parser JSON parser.
> + * @param[out] node JSON node to store result.
> + *
> + * @retval   0 Success.
> + * @retval > 0 1-based position of a syntax error.
> + */
> +static inline int
> +json_parse_integer(struct json_path_parser *parser, struct json_path_node *node)
> +{
> +	const char *end = parser->src + parser->src_len;
> +	const char *pos = parser->src + parser->offset;
> +	assert(pos < end);
> +	int len = 0;
> +	uint64_t value = 0;
> +	char c = *pos;
> +	if (! isdigit(c))
> +		return parser->symbol_count + 1;
> +	do {
> +		value = value * 10 + c - (int)'0';
> +		++len;
> +	} while (++pos < end && isdigit((c = *pos)));
> +	parser->offset += len;
> +	parser->symbol_count += len;
> +	node->type = JSON_PATH_NUM;
> +	node->num = value;

strtol?

> +	return 0;
> +}
> +
> +/**
> + * Check that a symbol can be part of a JSON path not inside
> + * ["..."].
> + */
> +static inline bool
> +json_is_valid_identifier_symbol(UChar32 c)
> +{
> +	return u_isUAlphabetic(c) || c == (UChar32)'_' || u_isdigit(c);
> +}
> +
> +/**
> + * Parse identifier out of quotes. It can contain only alphas,
> + * digits and underscores. And can not contain digit at the first
> + * position. Parser is stoped right after the last non-digit,
> + * non-alpha and non-underscore symbol.
> + * @param parser JSON parser. Updates signs_read field.

What is 'signs_read'?

> + * @param[out] node JSON node to store result.
> + *
> + * @retval   0 Success.
> + * @retval > 0 1-based position of a syntax error.
> + */
> +static inline int
> +json_parse_identifier(struct json_path_parser *parser,
> +		      struct json_path_node *node)
> +{
> +	assert(parser->offset < parser->src_len);
> +	int str_offset = parser->offset;
> +	UChar32 c;
> +	int rc = json_read_symbol(parser, &c);
> +	if (rc != 0)
> +		return rc;
> +	/* First symbol can not be digit. */
> +	if (!u_isalpha(c) && c != (UChar32)'_')
> +		return parser->symbol_count;
> +	int last_offset = parser->offset;
> +	while ((rc = json_read_symbol(parser, &c)) == 0) {

> +		if (! json_is_valid_identifier_symbol(c)) {

I would fold this function, because we have to use u_isalpha right above
anyway.

> +			json_revert_symbol(parser, last_offset);
> +			break;
> +		}
> +		last_offset = parser->offset;
> +	}
> +	node->type = JSON_PATH_STR;
> +	node->str = parser->src + str_offset;
> +	node->len = parser->offset - str_offset;
> +	return 0;
> +}
> +
> +int
> +json_path_next(struct json_path_parser *parser, struct json_path_node *node)
> +{
> +	UChar32 c;
> +	int last_offset = parser->offset;
> +	int rc = json_read_symbol(parser, &c);
> +	if (rc != 0) {
> +		if (parser->offset == parser->src_len) {
> +			node->type = JSON_PATH_END;
> +			return 0;
> +		}

IMO it would be more straightforward to first check EOF and only then
proceed to reading the string, not vice versa.

> +		return rc;
> +	}
> +	switch(c) {
> +	case (UChar32)'[':
> +		/* Error for '[\0'. */
> +		if (parser->offset == parser->src_len)
> +			return parser->symbol_count;
> +		c = json_current_char(parser);
> +		if (c == '"' || c == '\'')
> +			rc = json_parse_string(parser, node, c);
> +		else
> +			rc = json_parse_integer(parser, node);
> +		if (rc != 0)
> +			return rc;
> +		/*
> +		 * Expression, started from [ must be finished
> +		 * with ] regardless of its type.
> +		 */
> +		if (parser->offset == parser->src_len ||
> +		    json_current_char(parser) != ']')
> +			return parser->symbol_count + 1;
> +		/* Skip ] - one byte char. */
> +		json_propagate_char(parser);
> +		return 0;
> +	case (UChar32)'.':
> +		if (parser->offset == parser->src_len)
> +			return parser->symbol_count + 1;
> +		return json_parse_identifier(parser, node);
> +	default:
> +		json_revert_symbol(parser, last_offset);
> +		return json_parse_identifier(parser, node);
> +	}
> +}

> diff --git a/test/unit/json_path.c b/test/unit/json_path.c
> new file mode 100644
> index 000000000..fb35bf898
> --- /dev/null
> +++ b/test/unit/json_path.c
> @@ -0,0 +1,172 @@
> +#include "json/path.h"
> +#include "unit.h"
> +#include "trivia/util.h"
> +#include <string.h>
> +
> +#define reset_to_new_path(value) \
> +	path = value; \
> +	len = strlen(value); \
> +	json_path_parser_create(&parser, path, len);
> +
> +#define is_next_index(value_len, value) \
> +	path = parser.src + parser.offset; \
> +	is(json_path_next(&parser, &node), 0, "parse <%." #value_len "s>", \
> +	   path); \
> +	is(node.type, JSON_PATH_NUM, "<%." #value_len "s> is num", path); \
> +	is(node.num, value, "<%." #value_len "s> is " #value, path);
> +
> +#define is_next_key(value) \
> +	len = strlen(value); \
> +	is(json_path_next(&parser, &node), 0, "parse <" value ">"); \
> +	is(node.type, JSON_PATH_STR, "<" value "> is str"); \
> +	is(node.len, len, "len is %d", len); \
> +	is(strncmp(node.str, value, len), 0, "str is " value);
> +
> +void
> +test_basic()
> +{
> +	header();
> +	plan(67);
> +	const char *path;
> +	int len;
> +	struct json_path_parser parser;
> +	struct json_path_node node;
> +
> +	reset_to_new_path("[0].field1.field2['field3'][5]");
> +	is_next_index(3, 0);
> +	is_next_key("field1");
> +	is_next_key("field2");
> +	is_next_key("field3");
> +	is_next_index(3, 5);
> +
> +	reset_to_new_path("[3].field[2].field")
> +	is_next_index(3, 3);
> +	is_next_key("field");
> +	is_next_index(3, 2);
> +	is_next_key("field");
> +
> +	reset_to_new_path("[\"f1\"][\"f2'3'\"]");
> +	is_next_key("f1");
> +	is_next_key("f2'3'");
> +
> +	/* Support both '.field1...' and 'field1...'. */

The comment is misleading - there's no test case for 'field1...'

> +	reset_to_new_path(".field1");
> +	is_next_key("field1");

Shouldn't it return an empty string before 'field1' for such an input?