Tarantool development patches archive
 help / color / mirror / Atom feed
From: Kirill Shcherbatov <kshcherbatov@tarantool.org>
To: tarantool-patches@freelists.org
Cc: "v.shpilevoy@tarantool.org" <v.shpilevoy@tarantool.org>
Subject: [tarantool-patches] Re: [PATCH v2 3/3] Multibyte characters support
Date: Thu, 29 Mar 2018 21:04:33 +0300	[thread overview]
Message-ID: <db8586f2-a673-edb6-29fc-cc59eade2aac@tarantool.org> (raw)
In-Reply-To: <c01884e081330285b8dddc47564eb698c43c352a.1522333265.git.kshcherbatov@tarantool.org>

 From eb16ee6ea2d2ca5801b6b09c403c6e2f23984bd9 Mon Sep 17 00:00:00 2001
From: Kirill Shcherbatov <kshcherbatov@tarantool.org>
Date: Thu, 29 Mar 2018 21:03:21 +0300
Subject: [PATCH] Multibyte characters support

---
  src/box/lua/tuple.c        |  1 -
  src/lib/json/path.c        | 35 ++++++++++++++++++++++++++++++-----
  src/lib/json/path.h        |  2 ++
  test/engine/tuple.result   | 16 ++++++++++++++--
  test/engine/tuple.test.lua |  5 ++++-
  5 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c
index 99b9ff2..c3a435b 100644
--- a/src/box/lua/tuple.c
+++ b/src/box/lua/tuple.c
@@ -413,7 +413,6 @@ lbox_tuple_transform(struct lua_State *L)
  static inline int
  tuple_field_go_to_index(const char **field, uint64_t index)
  {
-	assert(index >= 0);
  	enum mp_type type = mp_typeof(**field);
  	if (type == MP_ARRAY) {
  		if (index == 0)
diff --git a/src/lib/json/path.c b/src/lib/json/path.c
index 4a6174e..1234f6b 100644
--- a/src/lib/json/path.c
+++ b/src/lib/json/path.c
@@ -31,6 +31,8 @@

  #include "path.h"
  #include <ctype.h>
+#include <wchar.h>
+#include <wctype.h>
  #include "trivia/util.h"

  /** Same as strtoull(), but with limited length. */
@@ -44,6 +46,26 @@ strntoull(const char *src, int len) {
  	return value;
  }

+static inline size_t
+mbsize(const char *str, size_t str_size)
+{
+	mbstate_t ps;
+	memset(&ps, 0, sizeof(ps));
+	mbrlen(NULL,0,&ps);
+	return mbrlen(str, str_size, &ps);
+}
+
+static inline int
+mb2wcisalpha(char *mb_char, size_t mb_char_size)
+{
+	assert(mb_char_size < 5);
+	wchar_t buff[2];
+	mbstate_t ps;
+	memset(&ps, 0, sizeof(ps));
+	mbsrtowcs(buff, (const char **)&mb_char, mb_char_size + 1, &ps);
+	return iswalpha((wint_t)buff[0]);
+}
+
  /**
   * Parse string identifier in quotes. Parser either stops right
   * after the closing quote, or returns an error position.
@@ -126,17 +148,20 @@ json_parse_identifier(struct json_path_parser *parser,
  	const char *str = pos;
  	char c = *pos;
  	/* First symbol can not be digit. */
-	if (!isalpha(c) && c != '_')
-		return pos - parser->src + 1;
+	size_t mb_size = 0;
+	if ((mb_size = mbsize(pos, end - pos)) && !mb2wcisalpha((char *)pos, mb_size) && c != '_')
+		return pos - parser->src + mb_size;
  	int len = 1;
-	for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c));
-	     c = *++pos)
+	for (c = *(pos += mb_size);
+	     pos < end && (((mb_size = mbsize(pos, end - pos))
+	                   && mb2wcisalpha((char *)pos, mb_size)) || c == '_' || isdigit(c));
+	     c = *(pos += mb_size))
  		++len;
  	assert(len > 0);
  	parser->pos = pos;
  	node->type = JSON_PATH_STR;
  	node->str = str;
-	node->len = len;
+	node->len = (int)(pos - str);
  	return 0;
  }

diff --git a/src/lib/json/path.h b/src/lib/json/path.h
index 6e8db4c..b1028f6 100644
--- a/src/lib/json/path.h
+++ b/src/lib/json/path.h
@@ -33,6 +33,8 @@

  #include <stdbool.h>
  #include <stdint.h>
+#include <string.h>
+#include <malloc.h>

  #ifdef __cplusplus
  extern "C" {
diff --git a/test/engine/tuple.result b/test/engine/tuple.result
index 2d7367a..c4b361a 100644
--- a/test/engine/tuple.result
+++ b/test/engine/tuple.result
@@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format})
  pk = s:create_index('pk')
  ---
  ...
-field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
+field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}}
  ---
  ...
  field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200}
@@ -626,7 +626,7 @@ t[1]
  ...
  t[2]
  ---
-- [1, 2, 3, '4', [5, 6, 7], {'key': 'key1', 'value': 'value1'}]
+- [1, 2, 3, '4', [5, 6, 7], {'hello中国world': {'中国': 'test'}, 'key': 'key1', 'value': 'value1'}]
  ...
  t[3]
  ---
@@ -673,6 +673,10 @@ t["[2][6].value"]
  ---
  - value1
  ...
+t["[2][6].hello中国world"]
+---
+- {'中国': 'test'}
+...
  t["[2][6]['key']"]
  ---
  - key1
@@ -681,10 +685,18 @@ t["[2][6]['value']"]
  ---
  - value1
  ...
+t["[2][6]['hello中国world']"]
+---
+- {'中国': 'test'}
+...
  t["[3].k3[2].c"]
  ---
  - 3
  ...
+t["[2][6]['hello中国world'].中国"]
+---
+- test
+...
  t["[4]"]
  ---
  - '123456'
diff --git a/test/engine/tuple.test.lua b/test/engine/tuple.test.lua
index ba3482d..476be90 100644
--- a/test/engine/tuple.test.lua
+++ b/test/engine/tuple.test.lua
@@ -207,7 +207,7 @@ format[3] = {name = 'field3', type = 'map'}
  format[4] = {name = 'field4', type = 'string'}
  s = box.schema.space.create('test', {format = format})
  pk = s:create_index('pk')
-field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
+field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}}
  field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200}
  t = s:replace{1, field2, field3, "123456"}
  t[1]
@@ -223,9 +223,12 @@ t["[2][5][2]"]
  t["[2][5][3]"]
  t["[2][6].key"]
  t["[2][6].value"]
+t["[2][6].hello中国world"]
  t["[2][6]['key']"]
  t["[2][6]['value']"]
+t["[2][6]['hello中国world']"]
  t["[3].k3[2].c"]
+t["[2][6]['hello中国world'].中国"]
  t["[4]"]
  t.field1
  t.field2[5]
-- 
2.7.4



On 29.03.2018 17:22, Kirill Shcherbatov wrote:
> ---
>   src/box/lua/tuple.c        |  1 -
>   src/lib/json/path.c        | 19 +++++++++++++++++--
>   test/engine/tuple.result   | 20 ++++++++++++++++++--
>   test/engine/tuple.test.lua |  6 +++++-
>   4 files changed, 40 insertions(+), 6 deletions(-)
> 
> diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c
> index 99b9ff2..c3a435b 100644
> --- a/src/box/lua/tuple.c
> +++ b/src/box/lua/tuple.c
> @@ -413,7 +413,6 @@ lbox_tuple_transform(struct lua_State *L)
>   static inline int
>   tuple_field_go_to_index(const char **field, uint64_t index)
>   {
> -	assert(index >= 0);
>   	enum mp_type type = mp_typeof(**field);
>   	if (type == MP_ARRAY) {
>   		if (index == 0)
> diff --git a/src/lib/json/path.c b/src/lib/json/path.c
> index 4a6174e..3e1bb80 100644
> --- a/src/lib/json/path.c
> +++ b/src/lib/json/path.c
> @@ -31,6 +31,8 @@
>   
>   #include "path.h"
>   #include <ctype.h>
> +#include <wchar.h>
> +#include <wctype.h>
>   #include "trivia/util.h"
>   
>   /** Same as strtoull(), but with limited length. */
> @@ -44,6 +46,19 @@ strntoull(const char *src, int len) {
>   	return value;
>   }
>   
> +static inline int
> +ismbaswcalpha(const char *str, size_t str_len_max)
> +{
> +	assert(str_len_max < 1024);
> +	wchar_t buff[1024];
> +	mbstate_t ps;
> +	memset(&ps, 0, sizeof(ps));
> +	str_len_max = mbrlen(str, str_len_max, &ps);
> +	memset(&ps, 0, sizeof(ps));
> +	mbsrtowcs(buff, &str, str_len_max, &ps);
> +	return iswalpha((wint_t)buff[0]);
> +}
> +
>   /**
>    * Parse string identifier in quotes. Parser either stops right
>    * after the closing quote, or returns an error position.
> @@ -126,10 +141,10 @@ json_parse_identifier(struct json_path_parser *parser,
>   	const char *str = pos;
>   	char c = *pos;
>   	/* First symbol can not be digit. */
> -	if (!isalpha(c) && c != '_')
> +	if (!ismbaswcalpha(pos, end - pos) && c != '_')
>   		return pos - parser->src + 1;
>   	int len = 1;
> -	for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c));
> +	for (c = *++pos; pos < end && (ismbaswcalpha(pos, end - pos) || c == '_' || isdigit(c));
>   	     c = *++pos)
>   		++len;
>   	assert(len > 0);
> diff --git a/test/engine/tuple.result b/test/engine/tuple.result
> index 2d7367a..d6eb4fa 100644
> --- a/test/engine/tuple.result
> +++ b/test/engine/tuple.result
> @@ -611,7 +611,7 @@ s = box.schema.space.create('test', {format = format})
>   pk = s:create_index('pk')
>   ---
>   ...
> -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
> +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}}
>   ---
>   ...
>   field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200}
> @@ -626,7 +626,7 @@ t[1]
>   ...
>   t[2]
>   ---
> -- [1, 2, 3, '4', [5, 6, 7], {'key': 'key1', 'value': 'value1'}]
> +- [1, 2, 3, '4', [5, 6, 7], {'hello中国world': {'中国': 'test'}, 'key': 'key1', 'value': 'value1'}]
>   ...
>   t[3]
>   ---
> @@ -665,6 +665,10 @@ t["[2][5][3]"]
>   ---
>   - 7
>   ...
> +t["[2][5][3]['hello中国world'].中"]
> +---
> +- null
> +...
>   t["[2][6].key"]
>   ---
>   - key1
> @@ -673,6 +677,10 @@ t["[2][6].value"]
>   ---
>   - value1
>   ...
> +t["[2][6].hello中国world"]
> +---
> +- {'中国': 'test'}
> +...
>   t["[2][6]['key']"]
>   ---
>   - key1
> @@ -681,10 +689,18 @@ t["[2][6]['value']"]
>   ---
>   - value1
>   ...
> +t["[2][6]['hello中国world']"]
> +---
> +- {'中国': 'test'}
> +...
>   t["[3].k3[2].c"]
>   ---
>   - 3
>   ...
> +t["[2][6]['hello中国world'].中国"]
> +---
> +- test
> +...
>   t["[4]"]
>   ---
>   - '123456'
> diff --git a/test/engine/tuple.test.lua b/test/engine/tuple.test.lua
> index ba3482d..5a3bcfa 100644
> --- a/test/engine/tuple.test.lua
> +++ b/test/engine/tuple.test.lua
> @@ -207,7 +207,7 @@ format[3] = {name = 'field3', type = 'map'}
>   format[4] = {name = 'field4', type = 'string'}
>   s = box.schema.space.create('test', {format = format})
>   pk = s:create_index('pk')
> -field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
> +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1", hello中国world = {中国 = 'test'}}}
>   field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, d=4} }, [-1] = 200}
>   t = s:replace{1, field2, field3, "123456"}
>   t[1]
> @@ -221,11 +221,15 @@ t["[2][5]"]
>   t["[2][5][1]"]
>   t["[2][5][2]"]
>   t["[2][5][3]"]
> +t["[2][5][3]['hello中国world'].中"]
>   t["[2][6].key"]
>   t["[2][6].value"]
> +t["[2][6].hello中国world"]
>   t["[2][6]['key']"]
>   t["[2][6]['value']"]
> +t["[2][6]['hello中国world']"]
>   t["[3].k3[2].c"]
> +t["[2][6]['hello中国world'].中国"]
>   t["[4]"]
>   t.field1
>   t.field2[5]
> 

  reply	other threads:[~2018-03-29 18:04 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-29 14:22 [tarantool-patches] [PATCH v2 0/3] tuple field access via a json path Kirill Shcherbatov
2018-03-29 14:22 ` [tarantool-patches] [PATCH v2 1/3] Introduce json_path_parser Kirill Shcherbatov
2018-03-29 14:22 ` [tarantool-patches] [PATCH v2 2/3] lua: implement json path access to tuple fields Kirill Shcherbatov
2018-03-29 14:22 ` [tarantool-patches] [PATCH v2 3/3] Multibyte characters support Kirill Shcherbatov
2018-03-29 18:04   ` Kirill Shcherbatov [this message]
2018-03-30 10:24     ` [tarantool-patches] " v.shpilevoy
2018-03-30 10:25       ` v.shpilevoy
2018-04-02 19:19       ` Kirill Shcherbatov
2018-04-03 10:20         ` Vladislav Shpilevoy
2018-04-05 14:09           ` [tarantool-patches] [PATCH v2 1/1] ICU Unicode support for JSON parser Kirill Shcherbatov
2018-04-05 18:00             ` [tarantool-patches] " Kirill Shcherbatov
2018-04-05 23:32               ` Vladislav Shpilevoy
2018-04-04 10:37 ` [tarantool-patches] [PATCH v2 3/3] Multibyte characters support ICU Kirill Shcherbatov
2018-04-04 11:30   ` [tarantool-patches] " Vladislav Shpilevoy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=db8586f2-a673-edb6-29fc-cc59eade2aac@tarantool.org \
    --to=kshcherbatov@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --cc=v.shpilevoy@tarantool.org \
    --subject='[tarantool-patches] Re: [PATCH v2 3/3] Multibyte characters support' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox