From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id EE0222A7E1 for ; Mon, 2 Apr 2018 15:19:31 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id Acm5IwfsxCIW for ; Mon, 2 Apr 2018 15:19:31 -0400 (EDT) Received: from smtpng2.m.smailru.net (smtpng2.m.smailru.net [94.100.179.3]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 11B202A7DD for ; Mon, 2 Apr 2018 15:19:30 -0400 (EDT) Subject: [tarantool-patches] Re: [PATCH v2 3/3] Multibyte characters support References: <55E839FD-8366-4BA0-BDAF-5C13661E40F7@tarantool.org> From: Kirill Shcherbatov Message-ID: Date: Mon, 2 Apr 2018 22:19:27 +0300 MIME-Version: 1.0 In-Reply-To: <55E839FD-8366-4BA0-BDAF-5C13661E40F7@tarantool.org> Content-Type: text/plain; charset="utf-8"; format="flowed" Content-Language: en-US Content-Transfer-Encoding: 8bit Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: "v.shpilevoy@tarantool.org" , tarantool-patches@freelists.org diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c index 99b9ff2..ec6f7cd 100644 --- a/src/box/lua/tuple.c +++ b/src/box/lua/tuple.c @@ -413,7 +413,6 @@ lbox_tuple_transform(struct lua_State *L) static inline int tuple_field_go_to_index(const char **field, uint64_t index) { - assert(index >= 0); enum mp_type type = mp_typeof(**field); if (type == MP_ARRAY) { if (index == 0) @@ -497,7 +496,9 @@ tuple_field_go_to_key(const char **field, const char *key, int len) static int lbox_tuple_field_by_path(struct lua_State *L) { + int mark = 0; const char *field; + const char *path = NULL; struct tuple *tuple = luaT_istuple(L, 1); /* Is checked in Lua wrapper. */ assert(tuple != NULL); @@ -506,6 +507,17 @@ lbox_tuple_field_by_path(struct lua_State *L) index -= TUPLE_INDEX_BASE; if (index < 0) { not_found: + if (!path) + goto exit_not_found; + uint32_t path_len = strlen(path); + uint32_t path_hash = lua_hash(path, path_len); + field = tuple_field_by_name(tuple, path, + path_len, path_hash); + if (field) + goto push_value; + if (mark || path_len == 0) + luaL_error(L, "Error in path on position %d", mark); +exit_not_found: lua_pushinteger(L, -1); lua_pushnil(L); return 2; @@ -520,13 +532,15 @@ push_value: } assert(lua_isstring(L, 2)); size_t path_len; - const char *path = lua_tolstring(L, 2, &path_len); + path = lua_tolstring(L, 2, &path_len); struct json_path_parser parser; struct json_path_node node; json_path_parser_create(&parser, path, path_len); int rc = json_path_next(&parser, &node); - if (rc != 0 || node.type == JSON_PATH_END) - luaL_error(L, "Error in path on position %d", rc); + if (rc != 0 || node.type == JSON_PATH_END) { + mark = rc; + goto not_found; + } if (node.type == JSON_PATH_NUM) { int index = node.num; if (index == 0) @@ -553,8 +567,10 @@ push_value: name_hash = lua_hash(name, name_len); } field = tuple_field_by_name(tuple, name, name_len, name_hash); - if (field == NULL) + if (field == NULL) { + rc = 1; goto not_found; + } } while ((rc = json_path_next(&parser, &node)) == 0 && node.type != JSON_PATH_END) { diff --git a/src/lib/json/path.c b/src/lib/json/path.c index 4a6174e..d9473be 100644 --- a/src/lib/json/path.c +++ b/src/lib/json/path.c @@ -31,6 +31,8 @@ #include "path.h" #include +#include +#include #include "trivia/util.h" /** Same as strtoull(), but with limited length. */ @@ -45,6 +47,44 @@ strntoull(const char *src, int len) { } /** + * Checks is multibyte character whose first byte + * is pointed to by mb_str is alphabetic. + * NOTE: You have to clean global context + * with mbtowc(NULL, 0, 0); before first call + * @param mb_str + * @param mb_str_size + * @return multebyte character bytes count Success + * @retval 0 on non-alphabetic character or parse failure + */ +static inline size_t +mbtowc_check_alpha(const char *mb_str, size_t mb_str_size) +{ + wchar_t wc; + ssize_t ret = mbtowc(&wc, mb_str, mb_str_size); + if (ret <= 0 || !iswalpha((wint_t)wc)) + return 0; + return (size_t)ret; +} + + +/** + * Counts user-string sign count in mb_str_size bytes + * @param mb_str + * @param mb_str_size + * @return sign count + */ +static inline int +mbtowc_count(struct json_path_parser *parser, const char *mb_str, size_t mb_str_len) +{ + char src[mb_str_len+1]; + memcpy(src, mb_str, mb_str_len); + src[mb_str_len] = '\0'; + mbtowc(NULL, 0, 0); + int rc = (int)mbstowcs(NULL, (const char *)src, 0); + return (rc >= 0) ? rc + (parser->src_len < (int)mb_str_len) : 1; +} + +/** * Parse string identifier in quotes. Parser either stops right * after the closing quote, or returns an error position. * @param parser JSON path parser. @@ -58,22 +98,26 @@ json_parse_string(struct json_path_parser *parser, struct json_path_node *node) { const char *end = parser->src + parser->src_len; const char *pos = parser->pos; + const char *str = pos; assert(pos < end); char quote_type = *pos; assert(quote_type == '\'' || quote_type == '"'); - /* Skip first quote. */ - int len = 0; - ++pos; - const char *str = pos; - for (char c = *pos; pos < end && quote_type != c; c = *++pos) - ++len; + /* Skip first quote. */++pos; + mbtowc(NULL, 0, 0); /* A string must be terminated with quote. */ + for (; *pos != quote_type && pos < end;) { + int wc_sz = mbtowc(NULL, pos, end - pos); + if (wc_sz <= 0) + return pos - parser->src + 1; + pos += wc_sz; + } + int len = (int)(pos - str - 1); if (*pos != quote_type || len == 0) return pos - parser->src + 1; /* Skip the closing quote. */ parser->pos = pos + 1; node->type = JSON_PATH_STR; - node->str = str; + node->str = str + 1; node->len = len; return 0; } @@ -124,19 +168,21 @@ json_parse_identifier(struct json_path_parser *parser, const char *pos = parser->pos; assert(pos < end); const char *str = pos; - char c = *pos; - /* First symbol can not be digit. */ - if (!isalpha(c) && c != '_') + size_t mb_size = 0; + /* clean global mb to wc decode context */ + mbtowc(NULL, 0, 0); + /* First symbol also can not be digit. */ + if (!(mb_size = mbtowc_check_alpha(pos, end - pos)) && (*pos != '_')) return pos - parser->src + 1; - int len = 1; - for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c)); - c = *++pos) - ++len; - assert(len > 0); + while (mb_size) { + pos += mb_size; + mb_size = mbtowc_check_alpha(pos, end - pos); + mb_size += (!mb_size && (isdigit(*pos) || *pos == '_')); + } parser->pos = pos; node->type = JSON_PATH_STR; node->str = str; - node->len = len; + node->len = (int)(pos - str); return 0; } @@ -155,20 +201,22 @@ json_path_next(struct json_path_parser *parser, struct json_path_node *node) ++parser->pos; /* Error for []. */ if (parser->pos == end) - return parser->pos - parser->src + 1; + return mbtowc_count(parser, parser->src, + parser->pos - parser->src + 1); c = *parser->pos; if (c == '"' || c == '\'') rc = json_parse_string(parser, node); else rc = json_parse_integer(parser, node); if (rc != 0) - return rc; + return mbtowc_count(parser, parser->src, rc); /* * Expression, started from [ must be finished * with ] regardless of its type. */ if (parser->pos == end || *parser->pos != ']') - return parser->pos - parser->src + 1; + return mbtowc_count(parser, parser->src, + parser->pos - parser->src + 1); /* Skip ]. */ ++parser->pos; break; @@ -176,12 +224,13 @@ json_path_next(struct json_path_parser *parser, struct json_path_node *node) /* Skip dot. */ ++parser->pos; if (parser->pos == end) - return parser->pos - parser->src + 1; + return mbtowc_count(parser, parser->src, + parser->pos - parser->src + 1); FALLTHROUGH default: rc = json_parse_identifier(parser, node); if (rc != 0) - return rc; + return mbtowc_count(parser, parser->src, rc); break; } return 0; diff --git a/src/lib/json/path.h b/src/lib/json/path.h index 6e8db4c..b1028f6 100644 --- a/src/lib/json/path.h +++ b/src/lib/json/path.h @@ -33,6 +33,8 @@ #include #include +#include +#include #ifdef __cplusplus extern "C" { diff --git a/test/engine/tuple.result b/test/engine/tuple.result index 2d7367a..c4b361a 100644 --- a/test/engine/tuple.result +++ b/test/engine/tuple.result @@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format}) pk = s:create_index('pk') --- ... -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}} +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}} --- ... field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200} @@ -626,7 +626,7 @@ t[1] ... t[2] --- -- [1, 2, 3, '4', [5, 6, 7], {'key': 'key1', 'value': 'value1'}] +- [1, 2, 3, '4', [5, 6, 7], {'hello中国world': {'中国': 'test'}, 'key': 'key1', 'value': 'value1'}] ... t[3] --- @@ -673,6 +673,10 @@ t["[2][6].value"] --- - value1 ... +t["[2][6].hello中国world"] +--- +- {'中国': 'test'} +... t["[2][6]['key']"] --- - key1 @@ -681,10 +685,18 @@ t["[2][6]['value']"] --- - value1 ... +t["[2][6]['hello中国world']"] +--- +- {'中国': 'test'} +... t["[3].k3[2].c"] --- - 3 ... +t["[2][6]['hello中国world'].中国"] +--- +- test +... t["[4]"] --- - '123456' diff --git a/test/engine/tuple.test.lua b/test/engine/tuple.test.lua index ba3482d..476be90 100644 --- a/test/engine/tuple.test.lua +++ b/test/engine/tuple.test.lua @@ -207,7 +207,7 @@ format[3] = {name = 'field3', type = 'map'} format[4] = {name = 'field4', type = 'string'} s = box.schema.space.create('test', {format = format}) pk = s:create_index('pk') -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}} +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}} field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200} t = s:replace{1, field2, field3, "123456"} t[1] @@ -223,9 +223,12 @@ t["[2][5][2]"] t["[2][5][3]"] t["[2][6].key"] t["[2][6].value"] +t["[2][6].hello中国world"] t["[2][6]['key']"] t["[2][6]['value']"] +t["[2][6]['hello中国world']"] t["[3].k3[2].c"] +t["[2][6]['hello中国world'].中国"] t["[4]"] t.field1 t.field2[5] -- 2.7.4 On 30.03.2018 13:24, v.shpilevoy@tarantool.org wrote: > See below 5 comments. And see at the travis: > https://travis-ci.org/tarantool/tarantool/builds/359964649?utm_source=github_status&utm_medium=notification - > seems like the patch does not work. box/alter.test.lua with unicode and > your own tests are failed. > >> 29 марта 2018 г., в 21:04, Kirill Shcherbatov >> > >> написал(а): >> >> From eb16ee6ea2d2ca5801b6b09c403c6e2f23984bd9 Mon Sep 17 00:00:00 2001 >> From: Kirill Shcherbatov > > >> Date: Thu, 29 Mar 2018 21:03:21 +0300 >> Subject: [PATCH] Multibyte characters support >> >> --- >> src/box/lua/tuple.c        |  1 - >> src/lib/json/path.c        | 35 ++++++++++++++++++++++++++++++----- >> src/lib/json/path.h        |  2 ++ >> test/engine/tuple.result   | 16 ++++++++++++++-- >> test/engine/tuple.test.lua |  5 ++++- >> 5 files changed, 50 insertions(+), 9 deletions(-) >> >> diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c >> index 99b9ff2..c3a435b 100644 >> --- a/src/box/lua/tuple.c >> +++ b/src/box/lua/tuple.c >> @@ -44,6 +46,26 @@ strntoull(const char *src, int len) { >> return value; >> } >> > > 1. Please, add comments what the function does, what arguments gets, > what returns. > >> +static inline size_t >> +mbsize(const char *str, size_t str_size) >> +{ >> +mbstate_t ps; >> +memset(&ps, 0, sizeof(ps)); >> +mbrlen(NULL,0,&ps); > > 2. I do not like this always-zero ps. Why you can not use mblen instead > of mbrlen? > >> +return mbrlen(str, str_size, &ps); >> +} >> + >> +static inline int >> +mb2wcisalpha(char *mb_char, size_t mb_char_size) >> +{ >> +assert(mb_char_size < 5); >> +wchar_t buff[2]; > > 3. Sorry, I still can not understand, why do you need 2 wchar, if here only > one is checked? Why do you use mbsrtowcs instead of mbrtowc. > >> +mbstate_t ps; >> +memset(&ps, 0, sizeof(ps)); >> +mbsrtowcs(buff, (const char **)&mb_char, mb_char_size + 1, &ps); >> +return iswalpha((wint_t)buff[0]); >> +} >> + >> /** >>  * Parse string identifier in quotes. Parser either stops right >>  * after the closing quote, or returns an error position. >> @@ -126,17 +148,20 @@ json_parse_identifier(struct json_path_parser >> *parser, >> const char *str = pos; >> char c = *pos; >> /* First symbol can not be digit. */ >> -if (!isalpha(c) && c != '_') >> -return pos - parser->src + 1; >> +size_t mb_size = 0; >> +if ((mb_size = mbsize(pos, end - pos)) && !mb2wcisalpha((char *)pos, >> mb_size) && c != '_') >> +return pos - parser->src + mb_size; > > 4. I propose to join mbsize and mb2wcisalpha under the single function > check_alpha() > or something, that either accepts char ** and moves it forward, or > returns parsed size. > > And why you do not check mbsize() for negative results? In the doc I > saw, that it can > return values like (size_t)-1 or (size_t)-2. And you implementation > accepts it as ok. > >> int len = 1; >> -for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c)); >> -    c = *++pos) >> +for (c = *(pos += mb_size); >> +    pos < end && (((mb_size = mbsize(pos, end - pos)) >> +                  && mb2wcisalpha((char *)pos, mb_size)) || c == '_' >> || isdigit(c)); >> +    c = *(pos += mb_size)) >> ++len; >> assert(len > 0); >> parser->pos = pos; >> node->type = JSON_PATH_STR; >> node->str = str; >> -node->len = len; >> +node->len = (int)(pos - str); >> return 0; >> } >> >> diff --git a/src/lib/json/path.h b/src/lib/json/path.h >> index 6e8db4c..b1028f6 100644 >> --- a/src/lib/json/path.h >> +++ b/src/lib/json/path.h >> @@ -33,6 +33,8 @@ >> >> #include >> #include >> +#include >> +#include >> >> #ifdef __cplusplus >> extern "C" { >> diff --git a/test/engine/tuple.result b/test/engine/tuple.result >> index 2d7367a..c4b361a 100644 >> --- a/test/engine/tuple.result >> +++ b/test/engine/tuple.result >> @@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format}) >> pk = s:create_index('pk') >> --- >> ... >> -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}} >> +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中 >> 国world = {中国 = 'test'}}} > > 5. Please, add unit tests on unicode. See commit > 49df120aaa7592e6475bf6974fe6f225db9234de (Introduce > json_path_parser) for examples. >> >