[tarantool-patches] Re: [PATCH v3 2/3] Introduce json_path_parser with Unicode support.

Tue Apr 10 21:42:08 MSK 2018

On 10/04/2018 19:36, Vladimir Davydov wrote:
> On Tue, Apr 10, 2018 at 11:31:45AM +0300, Vladislav Shpilevoy wrote:
>> diff --git a/src/lib/json/path.c b/src/lib/json/path.c
>> new file mode 100644
>> index 000000000..681419f90
>> --- /dev/null
>> +++ b/src/lib/json/path.c
> 
>> +/**
>> + * Read a single symbol from a string starting from an offset.
>> + * @param parser JSON path parser.
>> + * @param[out] UChar32 Read symbol.
>> + *
>> + * @retval   0 Success.
>> + * @retval > 0 1-based position of a syntax error.
>> + */
>> +static inline int
>> +json_read_symbol(struct json_path_parser *parser, UChar32 *out)
>> +{
>> +	if (parser->offset == parser->src_len) {
> 
>> +		*out = U_SENTINEL;
> 
> This is not necessary AFAICS.

One of compilers prints warning, that 'UChar32 c' is not initialized, 
where 'c' is argument of json_read_symbol.
>> +/** Fast forward when it is known that a symbol is 1-byte char. */
>> +static inline void
>> +json_propagate_char(struct json_path_parser *parser)
> 
> Bad name IMO, json_skip_char would be better.

As you wish.

diff --git a/src/lib/json/path.c b/src/lib/json/path.c
index 681419f90..1fccf82d0 100644
--- a/src/lib/json/path.c
+++ b/src/lib/json/path.c
@@ -72,7 +72,7 @@ json_revert_symbol(struct json_path_parser *parser, 
int offset)

  /** Fast forward when it is known that a symbol is 1-byte char. */
  static inline void
-json_propagate_char(struct json_path_parser *parser)
+json_skip_char(struct json_path_parser *parser)
  {
         ++parser->offset;
         ++parser->symbol_count;

>> +/**
>> + * Parse string identifier in quotes. Parser either stops right
>> + * after the closing quote, or returns an error position.
>> + * @param parser JSON path parser.
>> + * @param[out] node JSON node to store result.
>> + * @param quote_type Quote by that a string must be terminated.
>> + *
>> + * @retval   0 Success.
>> + * @retval > 0 1-based position of a syntax error.
>> + */
>> +static inline int
>> +json_parse_string(struct json_path_parser *parser, struct json_path_node *node,
>> +		  UChar32 quote_type)
>> +{
>> +	assert(parser->offset < parser->src_len);
>> +	assert(quote_type == json_current_char(parser));
> 
> Why pass quote_type if you know that it equals json_current_char?

I do not want to calculate *(src + offset) if it is already done in the 
caller function. Besides, I do not like json_current_char(), but now I 
can not find how to drop it with no simple inline.
>> +static inline int
>> +json_parse_integer(struct json_path_parser *parser, struct json_path_node *node)
>> +{
>> +	const char *end = parser->src + parser->src_len;
>> +	const char *pos = parser->src + parser->offset;
>> +	assert(pos < end);
>> +	int len = 0;
>> +	uint64_t value = 0;
>> +	char c = *pos;
>> +	if (! isdigit(c))
>> +		return parser->symbol_count + 1;
>> +	do {
>> +		value = value * 10 + c - (int)'0';
>> +		++len;
>> +	} while (++pos < end && isdigit((c = *pos)));
>> +	parser->offset += len;
>> +	parser->symbol_count += len;
>> +	node->type = JSON_PATH_NUM;
>> +	node->num = value;
> 
> strtol?

Now number value is calculated simultaneously with reading and offsets 
propagation in exactly N symbols, where N - is length of a number.

You propose to read N symbols and then read them again to build a number 
- it is 2N readings. Seems to be slower.

Moreover, strtol will check each symbol on is digit, but we already know 
that all len symbols are digits. And strtol does not have strntol 
version. So I can input invalid JSON like this: '[1234', and strtol can 
do not stop after 4, if next memory area byte looks like a digit.

First version of JSON path parser had strntoll() implementation and 2N 
readings, but the bench showed, then building a number on fly is faster.

> 
>> +	return 0;
>> +}
>> +
>> +/**
>> + * Check that a symbol can be part of a JSON path not inside
>> + * ["..."].
>> + */
>> +static inline bool
>> +json_is_valid_identifier_symbol(UChar32 c)
>> +{
>> +	return u_isUAlphabetic(c) || c == (UChar32)'_' || u_isdigit(c);
>> +}
>> +
>> +/**
>> + * Parse identifier out of quotes. It can contain only alphas,
>> + * digits and underscores. And can not contain digit at the first
>> + * position. Parser is stoped right after the last non-digit,
>> + * non-alpha and non-underscore symbol.
>> + * @param parser JSON parser. Updates signs_read field.
> 
> What is 'signs_read'?

Did not notice that on self-review(

Fixed.

@@ -166,7 +166,7 @@ json_is_valid_identifier_symbol(UChar32 c)
   * digits and underscores. And can not contain digit at the first
   * position. Parser is stoped right after the last non-digit,
   * non-alpha and non-underscore symbol.
- * @param parser JSON parser. Updates signs_read field.
+ * @param parser JSON parser.
   * @param[out] node JSON node to store result.
   *
   * @retval   0 Success.

>> +	/* First symbol can not be digit. */
>> +	if (!u_isalpha(c) && c != (UChar32)'_')
>> +		return parser->symbol_count;
>> +	int last_offset = parser->offset;
>> +	while ((rc = json_read_symbol(parser, &c)) == 0) {
> 
>> +		if (! json_is_valid_identifier_symbol(c)) {
> 
> I would fold this function, because we have to use u_isalpha right above
> anyway.

is_valid_identifier checks not only on alpha and '_' - it checks on 
digits. An identifier can contain digits on non-first place.

>> +}
>> +
>> +int
>> +json_path_next(struct json_path_parser *parser, struct json_path_node *node)
>> +{
>> +	UChar32 c;
>> +	int last_offset = parser->offset;
>> +	int rc = json_read_symbol(parser, &c);
>> +	if (rc != 0) {
>> +		if (parser->offset == parser->src_len) {
>> +			node->type = JSON_PATH_END;
>> +			return 0;
>> +		}
> 
> IMO it would be more straightforward to first check EOF and only then
> proceed to reading the string, not vice versa.

As you wish.

@@ -202,16 +202,15 @@ json_parse_identifier(struct json_path_parser *parser,
  int
  json_path_next(struct json_path_parser *parser, struct json_path_node 
*node)
  {
+       if (parser->offset == parser->src_len) {
+               node->type = JSON_PATH_END;
+               return 0;
+       }
         UChar32 c;
         int last_offset = parser->offset;
         int rc = json_read_symbol(parser, &c);
-       if (rc != 0) {
-               if (parser->offset == parser->src_len) {
-                       node->type = JSON_PATH_END;
-                       return 0;
-               }
+       if (rc != 0)
                 return rc;
-       }
         switch(c) {

>> +void
>> +test_basic()
>> +{
>> +	header();
>> +	plan(67);
>> +	const char *path;
>> +	int len;
>> +	struct json_path_parser parser;
>> +	struct json_path_node node;
>> +
>> +	reset_to_new_path("[0].field1.field2['field3'][5]");
>> +	is_next_index(3, 0);
>> +	is_next_key("field1");
>> +	is_next_key("field2");
>> +	is_next_key("field3");
>> +	is_next_index(3, 5);
>> +
>> +	reset_to_new_path("[3].field[2].field")
>> +	is_next_index(3, 3);
>> +	is_next_key("field");
>> +	is_next_index(3, 2);
>> +	is_next_key("field");
>> +
>> +	reset_to_new_path("[\"f1\"][\"f2'3'\"]");
>> +	is_next_key("f1");
>> +	is_next_key("f2'3'");
>> +
>> +	/* Support both '.field1...' and 'field1...'. */
> 
> The comment is misleading - there's no test case for 'field1...'

Thanks for the catch.

diff --git a/test/unit/json_path.c b/test/unit/json_path.c
index fb35bf898..1d7e7d372 100644
--- a/test/unit/json_path.c
+++ b/test/unit/json_path.c
@@ -26,7 +26,7 @@ void
  test_basic()
  {
         header();
-       plan(67);
+       plan(71);
         const char *path;
         int len;
         struct json_path_parser parser;
@@ -52,6 +52,8 @@ test_basic()
         /* Support both '.field1...' and 'field1...'. */
         reset_to_new_path(".field1");
         is_next_key("field1");
+       reset_to_new_path("field1");
+       is_next_key("field1");

> 
>> +	reset_to_new_path(".field1");
>> +	is_next_key("field1");
> 
> Shouldn't it return an empty string before 'field1' for such an input?

No, here everything is ok. My JSON path parser allows to start a path 
with '.' and it is not considered as an empty string + '.'. Exception is 
'.[...]' - it is an error (see tests in test_errors()).

I allowed '.identifier' syntax to lay emphasis that it is actually 
'tuple.identifier'. The JSON path is just a suffix for a tuple.