[tarantool-patches] Re: [PATCH v2 3/3] Multibyte characters support
Kirill Shcherbatov
kshcherbatov at tarantool.org
Mon Apr 2 22:19:27 MSK 2018
diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c
index 99b9ff2..ec6f7cd 100644
--- a/src/box/lua/tuple.c
+++ b/src/box/lua/tuple.c
@@ -413,7 +413,6 @@ lbox_tuple_transform(struct lua_State *L)
static inline int
tuple_field_go_to_index(const char **field, uint64_t index)
{
- assert(index >= 0);
enum mp_type type = mp_typeof(**field);
if (type == MP_ARRAY) {
if (index == 0)
@@ -497,7 +496,9 @@ tuple_field_go_to_key(const char **field, const char
*key, int len)
static int
lbox_tuple_field_by_path(struct lua_State *L)
{
+ int mark = 0;
const char *field;
+ const char *path = NULL;
struct tuple *tuple = luaT_istuple(L, 1);
/* Is checked in Lua wrapper. */
assert(tuple != NULL);
@@ -506,6 +507,17 @@ lbox_tuple_field_by_path(struct lua_State *L)
index -= TUPLE_INDEX_BASE;
if (index < 0) {
not_found:
+ if (!path)
+ goto exit_not_found;
+ uint32_t path_len = strlen(path);
+ uint32_t path_hash = lua_hash(path, path_len);
+ field = tuple_field_by_name(tuple, path,
+ path_len, path_hash);
+ if (field)
+ goto push_value;
+ if (mark || path_len == 0)
+ luaL_error(L, "Error in path on position %d", mark);
+exit_not_found:
lua_pushinteger(L, -1);
lua_pushnil(L);
return 2;
@@ -520,13 +532,15 @@ push_value:
}
assert(lua_isstring(L, 2));
size_t path_len;
- const char *path = lua_tolstring(L, 2, &path_len);
+ path = lua_tolstring(L, 2, &path_len);
struct json_path_parser parser;
struct json_path_node node;
json_path_parser_create(&parser, path, path_len);
int rc = json_path_next(&parser, &node);
- if (rc != 0 || node.type == JSON_PATH_END)
- luaL_error(L, "Error in path on position %d", rc);
+ if (rc != 0 || node.type == JSON_PATH_END) {
+ mark = rc;
+ goto not_found;
+ }
if (node.type == JSON_PATH_NUM) {
int index = node.num;
if (index == 0)
@@ -553,8 +567,10 @@ push_value:
name_hash = lua_hash(name, name_len);
}
field = tuple_field_by_name(tuple, name, name_len, name_hash);
- if (field == NULL)
+ if (field == NULL) {
+ rc = 1;
goto not_found;
+ }
}
while ((rc = json_path_next(&parser, &node)) == 0 &&
node.type != JSON_PATH_END) {
diff --git a/src/lib/json/path.c b/src/lib/json/path.c
index 4a6174e..d9473be 100644
--- a/src/lib/json/path.c
+++ b/src/lib/json/path.c
@@ -31,6 +31,8 @@
#include "path.h"
#include <ctype.h>
+#include <wchar.h>
+#include <wctype.h>
#include "trivia/util.h"
/** Same as strtoull(), but with limited length. */
@@ -45,6 +47,44 @@ strntoull(const char *src, int len) {
}
/**
+ * Checks is multibyte character whose first byte
+ * is pointed to by mb_str is alphabetic.
+ * NOTE: You have to clean global context
+ * with mbtowc(NULL, 0, 0); before first call
+ * @param mb_str
+ * @param mb_str_size
+ * @return multebyte character bytes count Success
+ * @retval 0 on non-alphabetic character or parse failure
+ */
+static inline size_t
+mbtowc_check_alpha(const char *mb_str, size_t mb_str_size)
+{
+ wchar_t wc;
+ ssize_t ret = mbtowc(&wc, mb_str, mb_str_size);
+ if (ret <= 0 || !iswalpha((wint_t)wc))
+ return 0;
+ return (size_t)ret;
+}
+
+
+/**
+ * Counts user-string sign count in mb_str_size bytes
+ * @param mb_str
+ * @param mb_str_size
+ * @return sign count
+ */
+static inline int
+mbtowc_count(struct json_path_parser *parser, const char *mb_str,
size_t mb_str_len)
+{
+ char src[mb_str_len+1];
+ memcpy(src, mb_str, mb_str_len);
+ src[mb_str_len] = '\0';
+ mbtowc(NULL, 0, 0);
+ int rc = (int)mbstowcs(NULL, (const char *)src, 0);
+ return (rc >= 0) ? rc + (parser->src_len < (int)mb_str_len) : 1;
+}
+
+/**
* Parse string identifier in quotes. Parser either stops right
* after the closing quote, or returns an error position.
* @param parser JSON path parser.
@@ -58,22 +98,26 @@ json_parse_string(struct json_path_parser *parser,
struct json_path_node *node)
{
const char *end = parser->src + parser->src_len;
const char *pos = parser->pos;
+ const char *str = pos;
assert(pos < end);
char quote_type = *pos;
assert(quote_type == '\'' || quote_type == '"');
- /* Skip first quote. */
- int len = 0;
- ++pos;
- const char *str = pos;
- for (char c = *pos; pos < end && quote_type != c; c = *++pos)
- ++len;
+ /* Skip first quote. */++pos;
+ mbtowc(NULL, 0, 0);
/* A string must be terminated with quote. */
+ for (; *pos != quote_type && pos < end;) {
+ int wc_sz = mbtowc(NULL, pos, end - pos);
+ if (wc_sz <= 0)
+ return pos - parser->src + 1;
+ pos += wc_sz;
+ }
+ int len = (int)(pos - str - 1);
if (*pos != quote_type || len == 0)
return pos - parser->src + 1;
/* Skip the closing quote. */
parser->pos = pos + 1;
node->type = JSON_PATH_STR;
- node->str = str;
+ node->str = str + 1;
node->len = len;
return 0;
}
@@ -124,19 +168,21 @@ json_parse_identifier(struct json_path_parser *parser,
const char *pos = parser->pos;
assert(pos < end);
const char *str = pos;
- char c = *pos;
- /* First symbol can not be digit. */
- if (!isalpha(c) && c != '_')
+ size_t mb_size = 0;
+ /* clean global mb to wc decode context */
+ mbtowc(NULL, 0, 0);
+ /* First symbol also can not be digit. */
+ if (!(mb_size = mbtowc_check_alpha(pos, end - pos)) && (*pos != '_'))
return pos - parser->src + 1;
- int len = 1;
- for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c));
- c = *++pos)
- ++len;
- assert(len > 0);
+ while (mb_size) {
+ pos += mb_size;
+ mb_size = mbtowc_check_alpha(pos, end - pos);
+ mb_size += (!mb_size && (isdigit(*pos) || *pos == '_'));
+ }
parser->pos = pos;
node->type = JSON_PATH_STR;
node->str = str;
- node->len = len;
+ node->len = (int)(pos - str);
return 0;
}
@@ -155,20 +201,22 @@ json_path_next(struct json_path_parser *parser,
struct json_path_node *node)
++parser->pos;
/* Error for []. */
if (parser->pos == end)
- return parser->pos - parser->src + 1;
+ return mbtowc_count(parser, parser->src,
+ parser->pos - parser->src + 1);
c = *parser->pos;
if (c == '"' || c == '\'')
rc = json_parse_string(parser, node);
else
rc = json_parse_integer(parser, node);
if (rc != 0)
- return rc;
+ return mbtowc_count(parser, parser->src, rc);
/*
* Expression, started from [ must be finished
* with ] regardless of its type.
*/
if (parser->pos == end || *parser->pos != ']')
- return parser->pos - parser->src + 1;
+ return mbtowc_count(parser, parser->src,
+ parser->pos - parser->src + 1);
/* Skip ]. */
++parser->pos;
break;
@@ -176,12 +224,13 @@ json_path_next(struct json_path_parser *parser,
struct json_path_node *node)
/* Skip dot. */
++parser->pos;
if (parser->pos == end)
- return parser->pos - parser->src + 1;
+ return mbtowc_count(parser, parser->src,
+ parser->pos - parser->src + 1);
FALLTHROUGH
default:
rc = json_parse_identifier(parser, node);
if (rc != 0)
- return rc;
+ return mbtowc_count(parser, parser->src, rc);
break;
}
return 0;
diff --git a/src/lib/json/path.h b/src/lib/json/path.h
index 6e8db4c..b1028f6 100644
--- a/src/lib/json/path.h
+++ b/src/lib/json/path.h
@@ -33,6 +33,8 @@
#include <stdbool.h>
#include <stdint.h>
+#include <string.h>
+#include <malloc.h>
#ifdef __cplusplus
extern "C" {
diff --git a/test/engine/tuple.result b/test/engine/tuple.result
index 2d7367a..c4b361a 100644
--- a/test/engine/tuple.result
+++ b/test/engine/tuple.result
@@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format})
pk = s:create_index('pk')
---
...
-field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
+field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1",
hello中国world = {中国 = 'test'}}}
---
...
field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3,
d=4} }, [-1] = 200}
@@ -626,7 +626,7 @@ t[1]
...
t[2]
---
-- [1, 2, 3, '4', [5, 6, 7], {'key': 'key1', 'value': 'value1'}]
+- [1, 2, 3, '4', [5, 6, 7], {'hello中国world': {'中国': 'test'}, 'key':
'key1', 'value': 'value1'}]
...
t[3]
---
@@ -673,6 +673,10 @@ t["[2][6].value"]
---
- value1
...
+t["[2][6].hello中国world"]
+---
+- {'中国': 'test'}
+...
t["[2][6]['key']"]
---
- key1
@@ -681,10 +685,18 @@ t["[2][6]['value']"]
---
- value1
...
+t["[2][6]['hello中国world']"]
+---
+- {'中国': 'test'}
+...
t["[3].k3[2].c"]
---
- 3
...
+t["[2][6]['hello中国world'].中国"]
+---
+- test
+...
t["[4]"]
---
- '123456'
diff --git a/test/engine/tuple.test.lua b/test/engine/tuple.test.lua
index ba3482d..476be90 100644
--- a/test/engine/tuple.test.lua
+++ b/test/engine/tuple.test.lua
@@ -207,7 +207,7 @@ format[3] = {name = 'field3', type = 'map'}
format[4] = {name = 'field4', type = 'string'}
s = box.schema.space.create('test', {format = format})
pk = s:create_index('pk')
-field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
+field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1",
hello中国world = {中国 = 'test'}}}
field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3,
d=4} }, [-1] = 200}
t = s:replace{1, field2, field3, "123456"}
t[1]
@@ -223,9 +223,12 @@ t["[2][5][2]"]
t["[2][5][3]"]
t["[2][6].key"]
t["[2][6].value"]
+t["[2][6].hello中国world"]
t["[2][6]['key']"]
t["[2][6]['value']"]
+t["[2][6]['hello中国world']"]
t["[3].k3[2].c"]
+t["[2][6]['hello中国world'].中国"]
t["[4]"]
t.field1
t.field2[5]
--
2.7.4
On 30.03.2018 13:24, v.shpilevoy at tarantool.org wrote:
> See below 5 comments. And see at the travis:
> https://travis-ci.org/tarantool/tarantool/builds/359964649?utm_source=github_status&utm_medium=notification -
> seems like the patch does not work. box/alter.test.lua with unicode and
> your own tests are failed.
>
>> 29 марта 2018 г., в 21:04, Kirill Shcherbatov
>> <kshcherbatov at tarantool.org <mailto:kshcherbatov at tarantool.org>>
>> написал(а):
>>
>> From eb16ee6ea2d2ca5801b6b09c403c6e2f23984bd9 Mon Sep 17 00:00:00 2001
>> From: Kirill Shcherbatov <kshcherbatov at tarantool.org
>> <mailto:kshcherbatov at tarantool.org>>
>> Date: Thu, 29 Mar 2018 21:03:21 +0300
>> Subject: [PATCH] Multibyte characters support
>>
>> ---
>> src/box/lua/tuple.c | 1 -
>> src/lib/json/path.c | 35 ++++++++++++++++++++++++++++++-----
>> src/lib/json/path.h | 2 ++
>> test/engine/tuple.result | 16 ++++++++++++++--
>> test/engine/tuple.test.lua | 5 ++++-
>> 5 files changed, 50 insertions(+), 9 deletions(-)
>>
>> diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c
>> index 99b9ff2..c3a435b 100644
>> --- a/src/box/lua/tuple.c
>> +++ b/src/box/lua/tuple.c
>> @@ -44,6 +46,26 @@ strntoull(const char *src, int len) {
>> return value;
>> }
>>
>
> 1. Please, add comments what the function does, what arguments gets,
> what returns.
>
>> +static inline size_t
>> +mbsize(const char *str, size_t str_size)
>> +{
>> +mbstate_t ps;
>> +memset(&ps, 0, sizeof(ps));
>> +mbrlen(NULL,0,&ps);
>
> 2. I do not like this always-zero ps. Why you can not use mblen instead
> of mbrlen?
>
>> +return mbrlen(str, str_size, &ps);
>> +}
>> +
>> +static inline int
>> +mb2wcisalpha(char *mb_char, size_t mb_char_size)
>> +{
>> +assert(mb_char_size < 5);
>> +wchar_t buff[2];
>
> 3. Sorry, I still can not understand, why do you need 2 wchar, if here only
> one is checked? Why do you use mbsrtowcs instead of mbrtowc.
>
>> +mbstate_t ps;
>> +memset(&ps, 0, sizeof(ps));
>> +mbsrtowcs(buff, (const char **)&mb_char, mb_char_size + 1, &ps);
>> +return iswalpha((wint_t)buff[0]);
>> +}
>> +
>> /**
>> * Parse string identifier in quotes. Parser either stops right
>> * after the closing quote, or returns an error position.
>> @@ -126,17 +148,20 @@ json_parse_identifier(struct json_path_parser
>> *parser,
>> const char *str = pos;
>> char c = *pos;
>> /* First symbol can not be digit. */
>> -if (!isalpha(c) && c != '_')
>> -return pos - parser->src + 1;
>> +size_t mb_size = 0;
>> +if ((mb_size = mbsize(pos, end - pos)) && !mb2wcisalpha((char *)pos,
>> mb_size) && c != '_')
>> +return pos - parser->src + mb_size;
>
> 4. I propose to join mbsize and mb2wcisalpha under the single function
> check_alpha()
> or something, that either accepts char ** and moves it forward, or
> returns parsed size.
>
> And why you do not check mbsize() for negative results? In the doc I
> saw, that it can
> return values like (size_t)-1 or (size_t)-2. And you implementation
> accepts it as ok.
>
>> int len = 1;
>> -for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c));
>> - c = *++pos)
>> +for (c = *(pos += mb_size);
>> + pos < end && (((mb_size = mbsize(pos, end - pos))
>> + && mb2wcisalpha((char *)pos, mb_size)) || c == '_'
>> || isdigit(c));
>> + c = *(pos += mb_size))
>> ++len;
>> assert(len > 0);
>> parser->pos = pos;
>> node->type = JSON_PATH_STR;
>> node->str = str;
>> -node->len = len;
>> +node->len = (int)(pos - str);
>> return 0;
>> }
>>
>> diff --git a/src/lib/json/path.h b/src/lib/json/path.h
>> index 6e8db4c..b1028f6 100644
>> --- a/src/lib/json/path.h
>> +++ b/src/lib/json/path.h
>> @@ -33,6 +33,8 @@
>>
>> #include <stdbool.h>
>> #include <stdint.h>
>> +#include <string.h>
>> +#include <malloc.h>
>>
>> #ifdef __cplusplus
>> extern "C" {
>> diff --git a/test/engine/tuple.result b/test/engine/tuple.result
>> index 2d7367a..c4b361a 100644
>> --- a/test/engine/tuple.result
>> +++ b/test/engine/tuple.result
>> @@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format})
>> pk = s:create_index('pk')
>> ---
>> ...
>> -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
>> +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中
>> 国world = {中国 = 'test'}}}
>
> 5. Please, add unit tests on unicode. See commit
> 49df120aaa7592e6475bf6974fe6f225db9234de (Introduce
> json_path_parser) for examples.
>>
>
More information about the Tarantool-patches
mailing list