From: Kirill Shcherbatov <kshcherbatov@tarantool.org> To: tarantool-patches@freelists.org Cc: "v.shpilevoy@tarantool.org" <v.shpilevoy@tarantool.org> Subject: [tarantool-patches] Re: [PATCH v2 3/3] Multibyte characters support Date: Thu, 29 Mar 2018 21:04:33 +0300 [thread overview] Message-ID: <db8586f2-a673-edb6-29fc-cc59eade2aac@tarantool.org> (raw) In-Reply-To: <c01884e081330285b8dddc47564eb698c43c352a.1522333265.git.kshcherbatov@tarantool.org> From eb16ee6ea2d2ca5801b6b09c403c6e2f23984bd9 Mon Sep 17 00:00:00 2001 From: Kirill Shcherbatov <kshcherbatov@tarantool.org> Date: Thu, 29 Mar 2018 21:03:21 +0300 Subject: [PATCH] Multibyte characters support --- src/box/lua/tuple.c | 1 - src/lib/json/path.c | 35 ++++++++++++++++++++++++++++++----- src/lib/json/path.h | 2 ++ test/engine/tuple.result | 16 ++++++++++++++-- test/engine/tuple.test.lua | 5 ++++- 5 files changed, 50 insertions(+), 9 deletions(-) diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c index 99b9ff2..c3a435b 100644 --- a/src/box/lua/tuple.c +++ b/src/box/lua/tuple.c @@ -413,7 +413,6 @@ lbox_tuple_transform(struct lua_State *L) static inline int tuple_field_go_to_index(const char **field, uint64_t index) { - assert(index >= 0); enum mp_type type = mp_typeof(**field); if (type == MP_ARRAY) { if (index == 0) diff --git a/src/lib/json/path.c b/src/lib/json/path.c index 4a6174e..1234f6b 100644 --- a/src/lib/json/path.c +++ b/src/lib/json/path.c @@ -31,6 +31,8 @@ #include "path.h" #include <ctype.h> +#include <wchar.h> +#include <wctype.h> #include "trivia/util.h" /** Same as strtoull(), but with limited length. 
*/ @@ -44,6 +46,26 @@ strntoull(const char *src, int len) { return value; } +static inline size_t +mbsize(const char *str, size_t str_size) +{ + mbstate_t ps; + memset(&ps, 0, sizeof(ps)); + mbrlen(NULL,0,&ps); + return mbrlen(str, str_size, &ps); +} + +static inline int +mb2wcisalpha(char *mb_char, size_t mb_char_size) +{ + assert(mb_char_size < 5); + wchar_t buff[2]; + mbstate_t ps; + memset(&ps, 0, sizeof(ps)); + mbsrtowcs(buff, (const char **)&mb_char, mb_char_size + 1, &ps); + return iswalpha((wint_t)buff[0]); +} + /** * Parse string identifier in quotes. Parser either stops right * after the closing quote, or returns an error position. @@ -126,17 +148,20 @@ json_parse_identifier(struct json_path_parser *parser, const char *str = pos; char c = *pos; /* First symbol can not be digit. */ - if (!isalpha(c) && c != '_') - return pos - parser->src + 1; + size_t mb_size = 0; + if ((mb_size = mbsize(pos, end - pos)) && !mb2wcisalpha((char *)pos, mb_size) && c != '_') + return pos - parser->src + mb_size; int len = 1; - for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c)); - c = *++pos) + for (c = *(pos += mb_size); + pos < end && (((mb_size = mbsize(pos, end - pos)) + && mb2wcisalpha((char *)pos, mb_size)) || c == '_' || isdigit(c)); + c = *(pos += mb_size)) ++len; assert(len > 0); parser->pos = pos; node->type = JSON_PATH_STR; node->str = str; - node->len = len; + node->len = (int)(pos - str); return 0; } diff --git a/src/lib/json/path.h b/src/lib/json/path.h index 6e8db4c..b1028f6 100644 --- a/src/lib/json/path.h +++ b/src/lib/json/path.h @@ -33,6 +33,8 @@ #include <stdbool.h> #include <stdint.h> +#include <string.h> +#include <malloc.h> #ifdef __cplusplus extern "C" { diff --git a/test/engine/tuple.result b/test/engine/tuple.result index 2d7367a..c4b361a 100644 --- a/test/engine/tuple.result +++ b/test/engine/tuple.result @@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format}) pk = s:create_index('pk') --- ... 
-field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}} +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}} --- ... field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200} @@ -626,7 +626,7 @@ t[1] ... t[2] --- -- [1, 2, 3, '4', [5, 6, 7], {'key': 'key1', 'value': 'value1'}] +- [1, 2, 3, '4', [5, 6, 7], {'hello中国world': {'中国': 'test'}, 'key': 'key1', 'value': 'value1'}] ... t[3] --- @@ -673,6 +673,10 @@ t["[2][6].value"] --- - value1 ... +t["[2][6].hello中国world"] +--- +- {'中国': 'test'} +... t["[2][6]['key']"] --- - key1 @@ -681,10 +685,18 @@ t["[2][6]['value']"] --- - value1 ... +t["[2][6]['hello中国world']"] +--- +- {'中国': 'test'} +... t["[3].k3[2].c"] --- - 3 ... +t["[2][6]['hello中国world'].中国"] +--- +- test +... t["[4]"] --- - '123456' diff --git a/test/engine/tuple.test.lua b/test/engine/tuple.test.lua index ba3482d..476be90 100644 --- a/test/engine/tuple.test.lua +++ b/test/engine/tuple.test.lua @@ -207,7 +207,7 @@ format[3] = {name = 'field3', type = 'map'} format[4] = {name = 'field4', type = 'string'} s = box.schema.space.create('test', {format = format}) pk = s:create_index('pk') -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}} +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}} field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200} t = s:replace{1, field2, field3, "123456"} t[1] @@ -223,9 +223,12 @@ t["[2][5][2]"] t["[2][5][3]"] t["[2][6].key"] t["[2][6].value"] +t["[2][6].hello中国world"] t["[2][6]['key']"] t["[2][6]['value']"] +t["[2][6]['hello中国world']"] t["[3].k3[2].c"] +t["[2][6]['hello中国world'].中国"] t["[4]"] t.field1 t.field2[5] -- 2.7.4 On 29.03.2018 17:22, Kirill Shcherbatov wrote: > --- > src/box/lua/tuple.c | 1 - > src/lib/json/path.c | 19 +++++++++++++++++-- > test/engine/tuple.result | 20 ++++++++++++++++++-- > test/engine/tuple.test.lua | 6 +++++- > 4 files changed, 40 
insertions(+), 6 deletions(-) > > diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c > index 99b9ff2..c3a435b 100644 > --- a/src/box/lua/tuple.c > +++ b/src/box/lua/tuple.c > @@ -413,7 +413,6 @@ lbox_tuple_transform(struct lua_State *L) > static inline int > tuple_field_go_to_index(const char **field, uint64_t index) > { > - assert(index >= 0); > enum mp_type type = mp_typeof(**field); > if (type == MP_ARRAY) { > if (index == 0) > diff --git a/src/lib/json/path.c b/src/lib/json/path.c > index 4a6174e..3e1bb80 100644 > --- a/src/lib/json/path.c > +++ b/src/lib/json/path.c > @@ -31,6 +31,8 @@ > > #include "path.h" > #include <ctype.h> > +#include <wchar.h> > +#include <wctype.h> > #include "trivia/util.h" > > /** Same as strtoull(), but with limited length. */ > @@ -44,6 +46,19 @@ strntoull(const char *src, int len) { > return value; > } > > +static inline int > +ismbaswcalpha(const char *str, size_t str_len_max) > +{ > + assert(str_len_max < 1024); > + wchar_t buff[1024]; > + mbstate_t ps; > + memset(&ps, 0, sizeof(ps)); > + str_len_max = mbrlen(str, str_len_max, &ps); > + memset(&ps, 0, sizeof(ps)); > + mbsrtowcs(buff, &str, str_len_max, &ps); > + return iswalpha((wint_t)buff[0]); > +} > + > /** > * Parse string identifier in quotes. Parser either stops right > * after the closing quote, or returns an error position. > @@ -126,10 +141,10 @@ json_parse_identifier(struct json_path_parser *parser, > const char *str = pos; > char c = *pos; > /* First symbol can not be digit. 
*/ > - if (!isalpha(c) && c != '_') > + if (!ismbaswcalpha(pos, end - pos) && c != '_') > return pos - parser->src + 1; > int len = 1; > - for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c)); > + for (c = *++pos; pos < end && (ismbaswcalpha(pos, end - pos) || c == '_' || isdigit(c)); > c = *++pos) > ++len; > assert(len > 0); > diff --git a/test/engine/tuple.result b/test/engine/tuple.result > index 2d7367a..d6eb4fa 100644 > --- a/test/engine/tuple.result > +++ b/test/engine/tuple.result > @@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format}) > pk = s:create_index('pk') > --- > ... > -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}} > +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}} > --- > ... > field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200} > @@ -626,7 +626,7 @@ t[1] > ... > t[2] > --- > -- [1, 2, 3, '4', [5, 6, 7], {'key': 'key1', 'value': 'value1'}] > +- [1, 2, 3, '4', [5, 6, 7], {'hello中国world': {'中国': 'test'}, 'key': 'key1', 'value': 'value1'}] > ... > t[3] > --- > @@ -665,6 +665,10 @@ t["[2][5][3]"] > --- > - 7 > ... > +t["[2][5][3]['hello中国world'].中"] > +--- > +- null > +... > t["[2][6].key"] > --- > - key1 > @@ -673,6 +677,10 @@ t["[2][6].value"] > --- > - value1 > ... > +t["[2][6].hello中国world"] > +--- > +- {'中国': 'test'} > +... > t["[2][6]['key']"] > --- > - key1 > @@ -681,10 +689,18 @@ t["[2][6]['value']"] > --- > - value1 > ... > +t["[2][6]['hello中国world']"] > +--- > +- {'中国': 'test'} > +... > t["[3].k3[2].c"] > --- > - 3 > ... > +t["[2][6]['hello中国world'].中国"] > +--- > +- test > +... 
> t["[4]"] > --- > - '123456' > diff --git a/test/engine/tuple.test.lua b/test/engine/tuple.test.lua > index ba3482d..5a3bcfa 100644 > --- a/test/engine/tuple.test.lua > +++ b/test/engine/tuple.test.lua > @@ -207,7 +207,7 @@ format[3] = {name = 'field3', type = 'map'} > format[4] = {name = 'field4', type = 'string'} > s = box.schema.space.create('test', {format = format}) > pk = s:create_index('pk') > -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}} > +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}} > field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200} > t = s:replace{1, field2, field3, "123456"} > t[1] > @@ -221,11 +221,15 @@ t["[2][5]"] > t["[2][5][1]"] > t["[2][5][2]"] > t["[2][5][3]"] > +t["[2][5][3]['hello中国world'].中"] > t["[2][6].key"] > t["[2][6].value"] > +t["[2][6].hello中国world"] > t["[2][6]['key']"] > t["[2][6]['value']"] > +t["[2][6]['hello中国world']"] > t["[3].k3[2].c"] > +t["[2][6]['hello中国world'].中国"] > t["[4]"] > t.field1 > t.field2[5] >
next prev parent reply other threads:[~2018-03-29 18:04 UTC|newest] Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top 2018-03-29 14:22 [tarantool-patches] [PATCH v2 0/3] tuple field access via a json path Kirill Shcherbatov 2018-03-29 14:22 ` [tarantool-patches] [PATCH v2 1/3] Introduce json_path_parser Kirill Shcherbatov 2018-03-29 14:22 ` [tarantool-patches] [PATCH v2 2/3] lua: implement json path access to tuple fields Kirill Shcherbatov 2018-03-29 14:22 ` [tarantool-patches] [PATCH v2 3/3] Multibyte characters support Kirill Shcherbatov 2018-03-29 18:04 ` Kirill Shcherbatov [this message] 2018-03-30 10:24 ` [tarantool-patches] " v.shpilevoy 2018-03-30 10:25 ` v.shpilevoy 2018-04-02 19:19 ` Kirill Shcherbatov 2018-04-03 10:20 ` Vladislav Shpilevoy 2018-04-05 14:09 ` [tarantool-patches] [PATCH v2 1/1] ICU Unicode support for JSON parser Kirill Shcherbatov 2018-04-05 18:00 ` [tarantool-patches] " Kirill Shcherbatov 2018-04-05 23:32 ` Vladislav Shpilevoy 2018-04-04 10:37 ` [tarantool-patches] [PATCH v2 3/3] Multibyte characters support ICU Kirill Shcherbatov 2018-04-04 11:30 ` [tarantool-patches] " Vladislav Shpilevoy
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=db8586f2-a673-edb6-29fc-cc59eade2aac@tarantool.org \ --to=kshcherbatov@tarantool.org \ --cc=tarantool-patches@freelists.org \ --cc=v.shpilevoy@tarantool.org \ --subject='[tarantool-patches] Re: [PATCH v2 3/3] Multibyte characters support' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox.