See below 5 comments. And see at the travis: https://travis-ci.org/tarantool/tarantool/builds/359964649?utm_source=github_status&utm_medium=notification - seems like the patch does not work. box/alter.test.lua with unicode and your own tests are failed. > 29 марта 2018 г., в 21:04, Kirill Shcherbatov написал(а): > > From eb16ee6ea2d2ca5801b6b09c403c6e2f23984bd9 Mon Sep 17 00:00:00 2001 > From: Kirill Shcherbatov > Date: Thu, 29 Mar 2018 21:03:21 +0300 > Subject: [PATCH] Multibyte characters support > > --- > src/box/lua/tuple.c | 1 - > src/lib/json/path.c | 35 ++++++++++++++++++++++++++++++----- > src/lib/json/path.h | 2 ++ > test/engine/tuple.result | 16 ++++++++++++++-- > test/engine/tuple.test.lua | 5 ++++- > 5 files changed, 50 insertions(+), 9 deletions(-) > > diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c > index 99b9ff2..c3a435b 100644 > --- a/src/box/lua/tuple.c > +++ b/src/box/lua/tuple.c > @@ -44,6 +46,26 @@ strntoull(const char *src, int len) { > return value; > } > 1. Please, add comments what the function does, what arguments gets, what returns. > +static inline size_t > +mbsize(const char *str, size_t str_size) > +{ > + mbstate_t ps; > + memset(&ps, 0, sizeof(ps)); > + mbrlen(NULL,0,&ps); 2. I do not like this always-zero ps. Why you can not use mblen instead of mbrlen? > + return mbrlen(str, str_size, &ps); > +} > + > +static inline int > +mb2wcisalpha(char *mb_char, size_t mb_char_size) > +{ > + assert(mb_char_size < 5); > + wchar_t buff[2]; 3. Sorry, I still can not understand, why do you need 2 wchar, if here only one is checked? Why do you use mbsrtowcs instead of mbrtowc. > + mbstate_t ps; > + memset(&ps, 0, sizeof(ps)); > + mbsrtowcs(buff, (const char **)&mb_char, mb_char_size + 1, &ps); > + return iswalpha((wint_t)buff[0]); > +} > + > /** > * Parse string identifier in quotes. Parser either stops right > * after the closing quote, or returns an error position. > @@ -126,17 +148,20 @@ json_parse_identifier(struct json_path_parser *parser, > const char *str = pos; > char c = *pos; > /* First symbol can not be digit. */ > - if (!isalpha(c) && c != '_') > - return pos - parser->src + 1; > + size_t mb_size = 0; > + if ((mb_size = mbsize(pos, end - pos)) && !mb2wcisalpha((char *)pos, mb_size) && c != '_') > + return pos - parser->src + mb_size; 4. I propose to join mbsize and mb2wcisalpha under the single function check_alpha() or something, that either accepts char ** and moves it forward, or returns parsed size. And why you do not check mbsize() for negative results? In the doc I saw, that it can return values like (size_t)-1 or (size_t)-2. And you implementation accepts it as ok. > int len = 1; > - for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c)); > - c = *++pos) > + for (c = *(pos += mb_size); > + pos < end && (((mb_size = mbsize(pos, end - pos)) > + && mb2wcisalpha((char *)pos, mb_size)) || c == '_' || isdigit(c)); > + c = *(pos += mb_size)) > ++len; > assert(len > 0); > parser->pos = pos; > node->type = JSON_PATH_STR; > node->str = str; > - node->len = len; > + node->len = (int)(pos - str); > return 0; > } > > diff --git a/src/lib/json/path.h b/src/lib/json/path.h > index 6e8db4c..b1028f6 100644 > --- a/src/lib/json/path.h > +++ b/src/lib/json/path.h > @@ -33,6 +33,8 @@ > > #include > #include > +#include > +#include > > #ifdef __cplusplus > extern "C" { > diff --git a/test/engine/tuple.result b/test/engine/tuple.result > index 2d7367a..c4b361a 100644 > --- a/test/engine/tuple.result > +++ b/test/engine/tuple.result > @@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format}) > pk = s:create_index('pk') > --- > ... > -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}} > +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}} 5. Please, add unit tests on unicode. See commit 49df120aaa7592e6475bf6974fe6f225db9234de (Introduce json_path_parser) for examples. >