[tarantool-patches] Re: [PATCH v2 3/3] Multibyte characters support
Kirill Shcherbatov
kshcherbatov at tarantool.org
Thu Mar 29 21:04:33 MSK 2018
From eb16ee6ea2d2ca5801b6b09c403c6e2f23984bd9 Mon Sep 17 00:00:00 2001
From: Kirill Shcherbatov <kshcherbatov at tarantool.org>
Date: Thu, 29 Mar 2018 21:03:21 +0300
Subject: [PATCH] Multibyte characters support
---
src/box/lua/tuple.c | 1 -
src/lib/json/path.c | 35 ++++++++++++++++++++++++++++++-----
src/lib/json/path.h | 2 ++
test/engine/tuple.result | 16 ++++++++++++++--
test/engine/tuple.test.lua | 5 ++++-
5 files changed, 50 insertions(+), 9 deletions(-)
diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c
index 99b9ff2..c3a435b 100644
--- a/src/box/lua/tuple.c
+++ b/src/box/lua/tuple.c
@@ -413,7 +413,6 @@ lbox_tuple_transform(struct lua_State *L)
static inline int
tuple_field_go_to_index(const char **field, uint64_t index)
{
- assert(index >= 0);
enum mp_type type = mp_typeof(**field);
if (type == MP_ARRAY) {
if (index == 0)
diff --git a/src/lib/json/path.c b/src/lib/json/path.c
index 4a6174e..1234f6b 100644
--- a/src/lib/json/path.c
+++ b/src/lib/json/path.c
@@ -31,6 +31,8 @@
#include "path.h"
#include <ctype.h>
+#include <wchar.h>
+#include <wctype.h>
#include "trivia/util.h"
/** Same as strtoull(), but with limited length. */
@@ -44,6 +46,26 @@ strntoull(const char *src, int len) {
return value;
}
+/**
+ * Measure the multibyte character that starts at @a str.
+ *
+ * The measurement always begins from a fresh conversion state, so
+ * every call is independent of the previous ones.
+ *
+ * @param str      Pointer to the first byte of the character.
+ * @param str_size Maximal number of bytes to examine.
+ *
+ * @return The mbrlen() result, passed through unchanged: the byte
+ *         length of the character, 0 for the NUL character,
+ *         (size_t)-1 for an invalid sequence, (size_t)-2 when
+ *         @a str_size bytes do not hold a complete character.
+ */
+static inline size_t
+mbsize(const char *str, size_t str_size)
+{
+	mbstate_t state = {0};
+	/* Explicitly bring the zeroed state to the initial shift state. */
+	(void)mbrlen(NULL, 0, &state);
+	size_t char_len = mbrlen(str, str_size, &state);
+	return char_len;
+}
+
+/**
+ * Check whether the multibyte character at the start of @a mb_char
+ * is alphabetic according to iswalpha() in the current locale.
+ *
+ * @param mb_char      Pointer to the first byte of the character.
+ * @param mb_char_size Length of the character in bytes, as returned
+ *                     by mbsize(). The values 0, (size_t)-1 and
+ *                     (size_t)-2 mean "no decodable character" and
+ *                     are answered with 0.
+ *
+ * @retval 0  The character is not alphabetic or can not be decoded.
+ * @retval !0 The character is alphabetic (raw iswalpha() result).
+ */
+static inline int
+mb2wcisalpha(const char *mb_char, size_t mb_char_size)
+{
+	/*
+	 * mbsize() hands back mbrlen() results as is, so malformed
+	 * input arrives here as (size_t)-1 or (size_t)-2. The path is
+	 * user input: reject it instead of asserting on it.
+	 */
+	if (mb_char_size == 0 || mb_char_size >= (size_t)-2)
+		return 0;
+	wchar_t wc = L'\0';
+	mbstate_t ps;
+	memset(&ps, 0, sizeof(ps));
+	/*
+	 * Decode exactly one character with a byte limit. mbsrtowcs()
+	 * must not be used here: its length argument is the number of
+	 * wide characters to store, not a byte count, so it can
+	 * write past a small output buffer.
+	 */
+	size_t rc = mbrtowc(&wc, mb_char, mb_char_size, &ps);
+	if (rc == (size_t)-1 || rc == (size_t)-2)
+		return 0;
+	return iswalpha((wint_t)wc);
+}
+
/**
* Parse string identifier in quotes. Parser either stops right
* after the closing quote, or returns an error position.
@@ -126,17 +148,20 @@ json_parse_identifier(struct json_path_parser *parser,
const char *str = pos;
char c = *pos;
/* First symbol can not be digit. */
- if (!isalpha(c) && c != '_')
- return pos - parser->src + 1;
+ size_t mb_size = 0;
+ if ((mb_size = mbsize(pos, end - pos)) && !mb2wcisalpha((char *)pos, mb_size) && c != '_')
+ return pos - parser->src + mb_size;
int len = 1;
- for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c));
- c = *++pos)
+ for (c = *(pos += mb_size);
+ pos < end && (((mb_size = mbsize(pos, end - pos))
+ && mb2wcisalpha((char *)pos, mb_size)) || c == '_' || isdigit(c));
+ c = *(pos += mb_size))
++len;
assert(len > 0);
parser->pos = pos;
node->type = JSON_PATH_STR;
node->str = str;
- node->len = len;
+ node->len = (int)(pos - str);
return 0;
}
diff --git a/src/lib/json/path.h b/src/lib/json/path.h
index 6e8db4c..b1028f6 100644
--- a/src/lib/json/path.h
+++ b/src/lib/json/path.h
@@ -33,6 +33,8 @@
#include <stdbool.h>
#include <stdint.h>
+#include <string.h>
+#include <malloc.h>
#ifdef __cplusplus
extern "C" {
diff --git a/test/engine/tuple.result b/test/engine/tuple.result
index 2d7367a..c4b361a 100644
--- a/test/engine/tuple.result
+++ b/test/engine/tuple.result
@@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format})
pk = s:create_index('pk')
---
...
-field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
+field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}}
---
...
field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200}
@@ -626,7 +626,7 @@ t[1]
...
t[2]
---
-- [1, 2, 3, '4', [5, 6, 7], {'key': 'key1', 'value': 'value1'}]
+- [1, 2, 3, '4', [5, 6, 7], {'hello中国world': {'中国': 'test'}, 'key': 'key1', 'value': 'value1'}]
...
t[3]
---
@@ -673,6 +673,10 @@ t["[2][6].value"]
---
- value1
...
+t["[2][6].hello中国world"]
+---
+- {'中国': 'test'}
+...
t["[2][6]['key']"]
---
- key1
@@ -681,10 +685,18 @@ t["[2][6]['value']"]
---
- value1
...
+t["[2][6]['hello中国world']"]
+---
+- {'中国': 'test'}
+...
t["[3].k3[2].c"]
---
- 3
...
+t["[2][6]['hello中国world'].中国"]
+---
+- test
+...
t["[4]"]
---
- '123456'
diff --git a/test/engine/tuple.test.lua b/test/engine/tuple.test.lua
index ba3482d..476be90 100644
--- a/test/engine/tuple.test.lua
+++ b/test/engine/tuple.test.lua
@@ -207,7 +207,7 @@ format[3] = {name = 'field3', type = 'map'}
format[4] = {name = 'field4', type = 'string'}
s = box.schema.space.create('test', {format = format})
pk = s:create_index('pk')
-field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
+field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}}
field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200}
t = s:replace{1, field2, field3, "123456"}
t[1]
@@ -223,9 +223,12 @@ t["[2][5][2]"]
t["[2][5][3]"]
t["[2][6].key"]
t["[2][6].value"]
+t["[2][6].hello中国world"]
t["[2][6]['key']"]
t["[2][6]['value']"]
+t["[2][6]['hello中国world']"]
t["[3].k3[2].c"]
+t["[2][6]['hello中国world'].中国"]
t["[4]"]
t.field1
t.field2[5]
--
2.7.4
On 29.03.2018 17:22, Kirill Shcherbatov wrote:
> ---
> src/box/lua/tuple.c | 1 -
> src/lib/json/path.c | 19 +++++++++++++++++--
> test/engine/tuple.result | 20 ++++++++++++++++++--
> test/engine/tuple.test.lua | 6 +++++-
> 4 files changed, 40 insertions(+), 6 deletions(-)
>
> diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c
> index 99b9ff2..c3a435b 100644
> --- a/src/box/lua/tuple.c
> +++ b/src/box/lua/tuple.c
> @@ -413,7 +413,6 @@ lbox_tuple_transform(struct lua_State *L)
> static inline int
> tuple_field_go_to_index(const char **field, uint64_t index)
> {
> - assert(index >= 0);
> enum mp_type type = mp_typeof(**field);
> if (type == MP_ARRAY) {
> if (index == 0)
> diff --git a/src/lib/json/path.c b/src/lib/json/path.c
> index 4a6174e..3e1bb80 100644
> --- a/src/lib/json/path.c
> +++ b/src/lib/json/path.c
> @@ -31,6 +31,8 @@
>
> #include "path.h"
> #include <ctype.h>
> +#include <wchar.h>
> +#include <wctype.h>
> #include "trivia/util.h"
>
> /** Same as strtoull(), but with limited length. */
> @@ -44,6 +46,19 @@ strntoull(const char *src, int len) {
> return value;
> }
>
> +static inline int
> +ismbaswcalpha(const char *str, size_t str_len_max)
> +{
> + assert(str_len_max < 1024);
> + wchar_t buff[1024];
> + mbstate_t ps;
> + memset(&ps, 0, sizeof(ps));
> + str_len_max = mbrlen(str, str_len_max, &ps);
> + memset(&ps, 0, sizeof(ps));
> + mbsrtowcs(buff, &str, str_len_max, &ps);
> + return iswalpha((wint_t)buff[0]);
> +}
> +
> /**
> * Parse string identifier in quotes. Parser either stops right
> * after the closing quote, or returns an error position.
> @@ -126,10 +141,10 @@ json_parse_identifier(struct json_path_parser *parser,
> const char *str = pos;
> char c = *pos;
> /* First symbol can not be digit. */
> - if (!isalpha(c) && c != '_')
> + if (!ismbaswcalpha(pos, end - pos) && c != '_')
> return pos - parser->src + 1;
> int len = 1;
> - for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c));
> + for (c = *++pos; pos < end && (ismbaswcalpha(pos, end - pos) || c == '_' || isdigit(c));
> c = *++pos)
> ++len;
> assert(len > 0);
> diff --git a/test/engine/tuple.result b/test/engine/tuple.result
> index 2d7367a..d6eb4fa 100644
> --- a/test/engine/tuple.result
> +++ b/test/engine/tuple.result
> @@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format})
> pk = s:create_index('pk')
> ---
> ...
> -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
> +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}}
> ---
> ...
> field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200}
> @@ -626,7 +626,7 @@ t[1]
> ...
> t[2]
> ---
> -- [1, 2, 3, '4', [5, 6, 7], {'key': 'key1', 'value': 'value1'}]
> +- [1, 2, 3, '4', [5, 6, 7], {'hello中国world': {'中国': 'test'}, 'key': 'key1', 'value': 'value1'}]
> ...
> t[3]
> ---
> @@ -665,6 +665,10 @@ t["[2][5][3]"]
> ---
> - 7
> ...
> +t["[2][5][3]['hello中国world'].中"]
> +---
> +- null
> +...
> t["[2][6].key"]
> ---
> - key1
> @@ -673,6 +677,10 @@ t["[2][6].value"]
> ---
> - value1
> ...
> +t["[2][6].hello中国world"]
> +---
> +- {'中国': 'test'}
> +...
> t["[2][6]['key']"]
> ---
> - key1
> @@ -681,10 +689,18 @@ t["[2][6]['value']"]
> ---
> - value1
> ...
> +t["[2][6]['hello中国world']"]
> +---
> +- {'中国': 'test'}
> +...
> t["[3].k3[2].c"]
> ---
> - 3
> ...
> +t["[2][6]['hello中国world'].中国"]
> +---
> +- test
> +...
> t["[4]"]
> ---
> - '123456'
> diff --git a/test/engine/tuple.test.lua b/test/engine/tuple.test.lua
> index ba3482d..5a3bcfa 100644
> --- a/test/engine/tuple.test.lua
> +++ b/test/engine/tuple.test.lua
> @@ -207,7 +207,7 @@ format[3] = {name = 'field3', type = 'map'}
> format[4] = {name = 'field4', type = 'string'}
> s = box.schema.space.create('test', {format = format})
> pk = s:create_index('pk')
> -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
> +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}}
> field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200}
> t = s:replace{1, field2, field3, "123456"}
> t[1]
> @@ -221,11 +221,15 @@ t["[2][5]"]
> t["[2][5][1]"]
> t["[2][5][2]"]
> t["[2][5][3]"]
> +t["[2][5][3]['hello中国world'].中"]
> t["[2][6].key"]
> t["[2][6].value"]
> +t["[2][6].hello中国world"]
> t["[2][6]['key']"]
> t["[2][6]['value']"]
> +t["[2][6]['hello中国world']"]
> t["[3].k3[2].c"]
> +t["[2][6]['hello中国world'].中国"]
> t["[4]"]
> t.field1
> t.field2[5]
>
More information about the Tarantool-patches
mailing list