[tarantool-patches] Re: [PATCH v2 3/3] Multibyte characters support

Kirill Shcherbatov kshcherbatov at tarantool.org
Mon Apr 2 22:19:27 MSK 2018


diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c
index 99b9ff2..ec6f7cd 100644
--- a/src/box/lua/tuple.c
+++ b/src/box/lua/tuple.c
@@ -413,7 +413,6 @@ lbox_tuple_transform(struct lua_State *L)
  static inline int
  tuple_field_go_to_index(const char **field, uint64_t index)
  {
-	assert(index >= 0);
  	enum mp_type type = mp_typeof(**field);
  	if (type == MP_ARRAY) {
  		if (index == 0)
@@ -497,7 +496,9 @@ tuple_field_go_to_key(const char **field, const char 
*key, int len)
  static int
  lbox_tuple_field_by_path(struct lua_State *L)
  {
+	int mark = 0;
  	const char *field;
+	const char *path = NULL;
  	struct tuple *tuple = luaT_istuple(L, 1);
  	/* Is checked in Lua wrapper. */
  	assert(tuple != NULL);
@@ -506,6 +507,17 @@ lbox_tuple_field_by_path(struct lua_State *L)
  		index -= TUPLE_INDEX_BASE;
  		if (index < 0) {
  not_found:
+		if (!path)
+			goto exit_not_found;
+		uint32_t path_len = strlen(path);
+		uint32_t path_hash = lua_hash(path, path_len);
+		field = tuple_field_by_name(tuple, path,
+		                            path_len, path_hash);
+		if (field)
+			goto push_value;
+		if (mark || path_len == 0)
+			luaL_error(L, "Error in path on position %d", mark);
+exit_not_found:
  			lua_pushinteger(L, -1);
  			lua_pushnil(L);
  			return 2;
@@ -520,13 +532,15 @@ push_value:
  	}
  	assert(lua_isstring(L, 2));
  	size_t path_len;
-	const char *path = lua_tolstring(L, 2, &path_len);
+	path = lua_tolstring(L, 2, &path_len);
  	struct json_path_parser parser;
  	struct json_path_node node;
  	json_path_parser_create(&parser, path, path_len);
  	int rc = json_path_next(&parser, &node);
-	if (rc != 0 || node.type == JSON_PATH_END)
-		luaL_error(L, "Error in path on position %d", rc);
+	if (rc != 0 || node.type == JSON_PATH_END) {
+		mark = rc;
+		goto not_found;
+	}
  	if (node.type == JSON_PATH_NUM) {
  		int index = node.num;
  		if (index == 0)
@@ -553,8 +567,10 @@ push_value:
  			name_hash = lua_hash(name, name_len);
  		}
  		field = tuple_field_by_name(tuple, name, name_len, name_hash);
-		if (field == NULL)
+		if (field == NULL) {
+			rc = 1;
  			goto not_found;
+		}
  	}
  	while ((rc = json_path_next(&parser, &node)) == 0 &&
  	       node.type != JSON_PATH_END) {
diff --git a/src/lib/json/path.c b/src/lib/json/path.c
index 4a6174e..d9473be 100644
--- a/src/lib/json/path.c
+++ b/src/lib/json/path.c
@@ -31,6 +31,8 @@

  #include "path.h"
  #include <ctype.h>
+#include <wchar.h>
+#include <wctype.h>
  #include "trivia/util.h"

  /** Same as strtoull(), but with limited length. */
@@ -45,6 +47,44 @@ strntoull(const char *src, int len) {
  }

  /**
+ * Checks is multibyte character whose first byte
+ * is pointed to by mb_str is alphabetic.
+ * NOTE: You have to clean global context
+ * with mbtowc(NULL, 0, 0); before first call
+ * @param mb_str
+ * @param mb_str_size
+ * @return multebyte character bytes count Success
+ * @retval 0 on non-alphabetic character or parse failure
+ */
+static inline size_t
+mbtowc_check_alpha(const char *mb_str, size_t mb_str_size)
+{
+	wchar_t wc;
+	ssize_t ret = mbtowc(&wc, mb_str, mb_str_size);
+	if (ret <= 0 || !iswalpha((wint_t)wc))
+		return 0;
+	return (size_t)ret;
+}
+
+
+/**
+ * Counts user-string sign count in mb_str_size bytes
+ * @param mb_str
+ * @param mb_str_size
+ * @return sign count
+ */
+static inline int
+mbtowc_count(struct json_path_parser *parser, const char *mb_str, 
size_t mb_str_len)
+{
+	char src[mb_str_len+1];
+	memcpy(src, mb_str, mb_str_len);
+	src[mb_str_len] = '\0';
+	mbtowc(NULL, 0, 0);
+	int rc = (int)mbstowcs(NULL, (const char *)src, 0);
+	return (rc >= 0) ? rc + (parser->src_len < (int)mb_str_len) : 1;
+}
+
+/**
   * Parse string identifier in quotes. Parser either stops right
   * after the closing quote, or returns an error position.
   * @param parser JSON path parser.
@@ -58,22 +98,26 @@ json_parse_string(struct json_path_parser *parser, 
struct json_path_node *node)
  {
  	const char *end = parser->src + parser->src_len;
  	const char *pos = parser->pos;
+	const char *str = pos;
  	assert(pos < end);
  	char quote_type = *pos;
  	assert(quote_type == '\'' || quote_type == '"');
-	/* Skip first quote. */
-	int len = 0;
-	++pos;
-	const char *str = pos;
-	for (char c = *pos; pos < end && quote_type != c; c = *++pos)
-		++len;
+	/* Skip first quote. */++pos;
+	mbtowc(NULL, 0, 0);
  	/* A string must be terminated with quote. */
+	for (; *pos != quote_type && pos < end;) {
+		int wc_sz = mbtowc(NULL, pos, end - pos);
+		if (wc_sz <= 0)
+			return pos - parser->src + 1;
+		pos += wc_sz;
+	}
+	int len = (int)(pos - str - 1);
  	if (*pos != quote_type || len == 0)
  		return pos - parser->src + 1;
  	/* Skip the closing quote. */
  	parser->pos = pos + 1;
  	node->type = JSON_PATH_STR;
-	node->str = str;
+	node->str = str + 1;
  	node->len = len;
  	return 0;
  }
@@ -124,19 +168,21 @@ json_parse_identifier(struct json_path_parser *parser,
  	const char *pos = parser->pos;
  	assert(pos < end);
  	const char *str = pos;
-	char c = *pos;
-	/* First symbol can not be digit. */
-	if (!isalpha(c) && c != '_')
+	size_t mb_size = 0;
+	/* clean global mb to wc decode context */
+	mbtowc(NULL, 0, 0);
+	/* First symbol also can not be digit. */
+	if (!(mb_size = mbtowc_check_alpha(pos, end - pos)) && (*pos != '_'))
  		return pos - parser->src + 1;
-	int len = 1;
-	for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c));
-	     c = *++pos)
-		++len;
-	assert(len > 0);
+	while (mb_size) {
+		pos += mb_size;
+		mb_size = mbtowc_check_alpha(pos, end - pos);
+		mb_size += (!mb_size && (isdigit(*pos) || *pos == '_'));
+	}
  	parser->pos = pos;
  	node->type = JSON_PATH_STR;
  	node->str = str;
-	node->len = len;
+	node->len = (int)(pos - str);
  	return 0;
  }

@@ -155,20 +201,22 @@ json_path_next(struct json_path_parser *parser, 
struct json_path_node *node)
  		++parser->pos;
  		/* Error for []. */
  		if (parser->pos == end)
-			return parser->pos - parser->src + 1;
+			return mbtowc_count(parser, parser->src,
+			                    parser->pos - parser->src + 1);
  		c = *parser->pos;
  		if (c == '"' || c == '\'')
  			rc = json_parse_string(parser, node);
  		else
  			rc = json_parse_integer(parser, node);
  		if (rc != 0)
-			return rc;
+			return mbtowc_count(parser, parser->src, rc);
  		/*
  		 * Expression, started from [ must be finished
  		 * with ] regardless of its type.
  		 */
  		if (parser->pos == end || *parser->pos != ']')
-			return parser->pos - parser->src + 1;
+			return mbtowc_count(parser, parser->src,
+			                    parser->pos - parser->src + 1);
  		/* Skip ]. */
  		++parser->pos;
  		break;
@@ -176,12 +224,13 @@ json_path_next(struct json_path_parser *parser, 
struct json_path_node *node)
  		/* Skip dot. */
  		++parser->pos;
  		if (parser->pos == end)
-			return parser->pos - parser->src + 1;
+			return mbtowc_count(parser, parser->src,
+			                    parser->pos - parser->src + 1);
  		FALLTHROUGH
  	default:
  		rc = json_parse_identifier(parser, node);
  		if (rc != 0)
-			return rc;
+			return mbtowc_count(parser, parser->src, rc);
  		break;
  	}
  	return 0;
diff --git a/src/lib/json/path.h b/src/lib/json/path.h
index 6e8db4c..b1028f6 100644
--- a/src/lib/json/path.h
+++ b/src/lib/json/path.h
@@ -33,6 +33,8 @@

  #include <stdbool.h>
  #include <stdint.h>
+#include <string.h>
+#include <malloc.h>

  #ifdef __cplusplus
  extern "C" {
diff --git a/test/engine/tuple.result b/test/engine/tuple.result
index 2d7367a..c4b361a 100644
--- a/test/engine/tuple.result
+++ b/test/engine/tuple.result
@@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format})
  pk = s:create_index('pk')
  ---
  ...
-field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
+field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", 
hello中国world = {中国 = 'test'}}}
  ---
  ...
  field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, 
d=4} }, [-1] = 200}
@@ -626,7 +626,7 @@ t[1]
  ...
  t[2]
  ---
-- [1, 2, 3, '4', [5, 6, 7], {'key': 'key1', 'value': 'value1'}]
+- [1, 2, 3, '4', [5, 6, 7], {'hello中国world': {'中国': 'test'}, 'key': 
'key1', 'value': 'value1'}]
  ...
  t[3]
  ---
@@ -673,6 +673,10 @@ t["[2][6].value"]
  ---
  - value1
  ...
+t["[2][6].hello中国world"]
+---
+- {'中国': 'test'}
+...
  t["[2][6]['key']"]
  ---
  - key1
@@ -681,10 +685,18 @@ t["[2][6]['value']"]
  ---
  - value1
  ...
+t["[2][6]['hello中国world']"]
+---
+- {'中国': 'test'}
+...
  t["[3].k3[2].c"]
  ---
  - 3
  ...
+t["[2][6]['hello中国world'].中国"]
+---
+- test
+...
  t["[4]"]
  ---
  - '123456'
diff --git a/test/engine/tuple.test.lua b/test/engine/tuple.test.lua
index ba3482d..476be90 100644
--- a/test/engine/tuple.test.lua
+++ b/test/engine/tuple.test.lua
@@ -207,7 +207,7 @@ format[3] = {name = 'field3', type = 'map'}
  format[4] = {name = 'field4', type = 'string'}
  s = box.schema.space.create('test', {format = format})
  pk = s:create_index('pk')
-field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
+field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", 
hello中国world = {中国 = 'test'}}}
  field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, 
d=4} }, [-1] = 200}
  t = s:replace{1, field2, field3, "123456"}
  t[1]
@@ -223,9 +223,12 @@ t["[2][5][2]"]
  t["[2][5][3]"]
  t["[2][6].key"]
  t["[2][6].value"]
+t["[2][6].hello中国world"]
  t["[2][6]['key']"]
  t["[2][6]['value']"]
+t["[2][6]['hello中国world']"]
  t["[3].k3[2].c"]
+t["[2][6]['hello中国world'].中国"]
  t["[4]"]
  t.field1
  t.field2[5]
-- 
2.7.4



On 30.03.2018 13:24, v.shpilevoy at tarantool.org wrote:
> See below 5 comments. And see at the travis: 
> https://travis-ci.org/tarantool/tarantool/builds/359964649?utm_source=github_status&utm_medium=notification - 
> seems like the patch does not work. box/alter.test.lua with unicode and 
> your own tests are failed.
> 
>> 29 марта 2018 г., в 21:04, Kirill Shcherbatov 
>> <kshcherbatov at tarantool.org <mailto:kshcherbatov at tarantool.org>> 
>> написал(а):
>>
>> From eb16ee6ea2d2ca5801b6b09c403c6e2f23984bd9 Mon Sep 17 00:00:00 2001
>> From: Kirill Shcherbatov <kshcherbatov at tarantool.org 
>> <mailto:kshcherbatov at tarantool.org>>
>> Date: Thu, 29 Mar 2018 21:03:21 +0300
>> Subject: [PATCH] Multibyte characters support
>>
>> ---
>> src/box/lua/tuple.c        |  1 -
>> src/lib/json/path.c        | 35 ++++++++++++++++++++++++++++++-----
>> src/lib/json/path.h        |  2 ++
>> test/engine/tuple.result   | 16 ++++++++++++++--
>> test/engine/tuple.test.lua |  5 ++++-
>> 5 files changed, 50 insertions(+), 9 deletions(-)
>>
>> diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c
>> index 99b9ff2..c3a435b 100644
>> --- a/src/box/lua/tuple.c
>> +++ b/src/box/lua/tuple.c
>> @@ -44,6 +46,26 @@ strntoull(const char *src, int len) {
>> return value;
>> }
>>
> 
> 1. Please, add comments what the function does, what arguments gets, 
> what returns.
> 
>> +static inline size_t
>> +mbsize(const char *str, size_t str_size)
>> +{
>> +mbstate_t ps;
>> +memset(&ps, 0, sizeof(ps));
>> +mbrlen(NULL,0,&ps);
> 
> 2. I do not like this always-zero ps. Why you can not use mblen instead 
> of mbrlen?
> 
>> +return mbrlen(str, str_size, &ps);
>> +}
>> +
>> +static inline int
>> +mb2wcisalpha(char *mb_char, size_t mb_char_size)
>> +{
>> +assert(mb_char_size < 5);
>> +wchar_t buff[2];
> 
> 3. Sorry, I still can not understand, why do you need 2 wchar, if here only
> one is checked? Why do you use mbsrtowcs instead of mbrtowc.
> 
>> +mbstate_t ps;
>> +memset(&ps, 0, sizeof(ps));
>> +mbsrtowcs(buff, (const char **)&mb_char, mb_char_size + 1, &ps);
>> +return iswalpha((wint_t)buff[0]);
>> +}
>> +
>> /**
>>  * Parse string identifier in quotes. Parser either stops right
>>  * after the closing quote, or returns an error position.
>> @@ -126,17 +148,20 @@ json_parse_identifier(struct json_path_parser 
>> *parser,
>> const char *str = pos;
>> char c = *pos;
>> /* First symbol can not be digit. */
>> -if (!isalpha(c) && c != '_')
>> -return pos - parser->src + 1;
>> +size_t mb_size = 0;
>> +if ((mb_size = mbsize(pos, end - pos)) && !mb2wcisalpha((char *)pos, 
>> mb_size) && c != '_')
>> +return pos - parser->src + mb_size;
> 
> 4. I propose to join mbsize and mb2wcisalpha under the single function 
> check_alpha()
> or something, that either accepts char ** and moves it forward, or 
> returns parsed size.
> 
> And why you do not check mbsize() for negative results? In the doc I 
> saw, that it can
> return values like (size_t)-1 or (size_t)-2. And you implementation 
> accepts it as ok.
> 
>> int len = 1;
>> -for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c));
>> -    c = *++pos)
>> +for (c = *(pos += mb_size);
>> +    pos < end && (((mb_size = mbsize(pos, end - pos))
>> +                  && mb2wcisalpha((char *)pos, mb_size)) || c == '_' 
>> || isdigit(c));
>> +    c = *(pos += mb_size))
>> ++len;
>> assert(len > 0);
>> parser->pos = pos;
>> node->type = JSON_PATH_STR;
>> node->str = str;
>> -node->len = len;
>> +node->len = (int)(pos - str);
>> return 0;
>> }
>>
>> diff --git a/src/lib/json/path.h b/src/lib/json/path.h
>> index 6e8db4c..b1028f6 100644
>> --- a/src/lib/json/path.h
>> +++ b/src/lib/json/path.h
>> @@ -33,6 +33,8 @@
>>
>> #include <stdbool.h>
>> #include <stdint.h>
>> +#include <string.h>
>> +#include <malloc.h>
>>
>> #ifdef __cplusplus
>> extern "C" {
>> diff --git a/test/engine/tuple.result b/test/engine/tuple.result
>> index 2d7367a..c4b361a 100644
>> --- a/test/engine/tuple.result
>> +++ b/test/engine/tuple.result
>> @@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format})
>> pk = s:create_index('pk')
>> ---
>> ...
>> -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
>> +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中 
>> 国world = {中国 = 'test'}}}
> 
> 5. Please, add unit tests on unicode. See commit
> 49df120aaa7592e6475bf6974fe6f225db9234de (Introduce 
> json_path_parser) for examples.
>>
> 




More information about the Tarantool-patches mailing list