Tarantool development patches archive
 help / color / mirror / Atom feed
From: Kirill Shcherbatov <kshcherbatov@tarantool.org>
To: tarantool-patches@freelists.org
Cc: "v.shpilevoy@tarantool.org" <v.shpilevoy@tarantool.org>
Subject: [tarantool-patches] [PATCH v2 3/3] Multibyte characters support ICU
Date: Wed, 4 Apr 2018 13:37:24 +0300	[thread overview]
Message-ID: <01a0181a-19f9-c38c-94ea-65ab549a4517@tarantool.org> (raw)
In-Reply-To: <cover.1522333265.git.kshcherbatov@tarantool.org>

ICU Implementation

 From 8703e465382ba05817e7703550694c3790972e54 Mon Sep 17 00:00:00 2001
Message-Id: 
<8703e465382ba05817e7703550694c3790972e54.1522838002.git.kshcherbatov@tarantool.org>
In-Reply-To: <cover.1522838002.git.kshcherbatov@tarantool.org>
References: <cover.1522838002.git.kshcherbatov@tarantool.org>
From: Kirill Shcherbatov <kshcherbatov@tarantool.org>
Date: Wed, 4 Apr 2018 13:06:22 +0300
Subject: [PATCH v2 3/3] ICU Unicode parsing implementation

---
  src/box/lua/tuple.c        |  31 +++++--
  src/lib/json/path.c        | 211 +++++++++++++++++++++++++++++++++------------
  src/lib/json/path.h        |  30 ++++---
  test/engine/tuple.result   |  43 +++++++--
  test/engine/tuple.test.lua |  13 ++-
  test/unit/CMakeLists.txt   |   2 +-
  test/unit/json_path.c      |   4 +-
  7 files changed, 247 insertions(+), 87 deletions(-)

diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c
index 99b9ff2..b89e1f9 100644
--- a/src/box/lua/tuple.c
+++ b/src/box/lua/tuple.c
@@ -413,7 +413,6 @@ lbox_tuple_transform(struct lua_State *L)
  static inline int
  tuple_field_go_to_index(const char **field, uint64_t index)
  {
-	assert(index >= 0);
  	enum mp_type type = mp_typeof(**field);
  	if (type == MP_ARRAY) {
  		if (index == 0)
@@ -497,6 +496,12 @@ tuple_field_go_to_key(const char **field, const char *key, int len)
  static int
  lbox_tuple_field_by_path(struct lua_State *L)
  {
+	int err_pos = 0;
+	struct json_path_parser parser;
+	/* Zero the parser so json_path_parser_deinit() is a no-op
+	 * on the lua_isnumber path, where it is never initialized. */
+	memset(&parser, 0, sizeof(parser));
+	const char *path = NULL;
  	const char *field;
  	struct tuple *tuple = luaT_istuple(L, 1);
  	/* Is checked in Lua wrapper. */
@@ -506,6 +511,18 @@ lbox_tuple_field_by_path(struct lua_State *L)
  		index -= TUPLE_INDEX_BASE;
  		if (index < 0) {
  not_found:
+			if (!path)
+				goto exit_not_found;
+			uint32_t path_len = strlen(path);
+			uint32_t path_hash = lua_hash(path, path_len);
+			field = tuple_field_by_name(tuple, path,
+			                            path_len, path_hash);
+			if (field)
+				goto push_value;
+			if (err_pos || path_len == 0)
+				luaL_error(L, "Error in path on position %d", err_pos);
+exit_not_found:
+			json_path_parser_deinit(&parser);
  			lua_pushinteger(L, -1);
  			lua_pushnil(L);
  			return 2;
@@ -514,19 +531,21 @@ not_found:
  		if (field == NULL)
  			goto not_found;
  push_value:
+		json_path_parser_deinit(&parser);
  		lua_pushinteger(L, 0);
  		luamp_decode(L, luaL_msgpack_default, &field);
  		return 2;
  	}
  	assert(lua_isstring(L, 2));
  	size_t path_len;
-	const char *path = lua_tolstring(L, 2, &path_len);
-	struct json_path_parser parser;
+	path = lua_tolstring(L, 2, &path_len);
  	struct json_path_node node;
-	json_path_parser_create(&parser, path, path_len);
+	json_path_parser_init(&parser, path, path_len);
  	int rc = json_path_next(&parser, &node);
-	if (rc != 0 || node.type == JSON_PATH_END)
-		luaL_error(L, "Error in path on position %d", rc);
+	if (rc != 0 || node.type == JSON_PATH_END) {
+		err_pos = rc;
+		goto not_found;
+	}
  	if (node.type == JSON_PATH_NUM) {
  		int index = node.num;
  		if (index == 0)
diff --git a/src/lib/json/path.c b/src/lib/json/path.c
index 4a6174e..4aadb3a 100644
--- a/src/lib/json/path.c
+++ b/src/lib/json/path.c
@@ -31,8 +31,11 @@

  #include "path.h"
  #include <ctype.h>
+#include <unicode/uchar.h>
  #include "trivia/util.h"

+#define REPLACEMENT_CHARACTER (0xFFFD)
+
  /** Same as strtoull(), but with limited length. */
  static inline uint64_t
  strntoull(const char *src, int len) {
@@ -45,6 +48,51 @@ strntoull(const char *src, int len) {
  }

  /**
+ * Read the next code point and update the parser's state.
+ * @param[out] parser JSON path parser. Updates pos, invalid_sign_off.
+ * @param[out] out UChar32 to store the result.
+ *
+ * @retval 1 Success.
+ * @retval 0 End of string.
+ * @retval -1 Parse error.
+ */
+static inline int
+parser_read_sign(struct json_path_parser *parser, UChar32 *out)
+{
+	int rc;
+	UErrorCode status = U_ZERO_ERROR;
+	if (parser->pos == parser->end)
+		return 0;
+	*out = ucnv_getNextUChar(parser->utf8conv, &parser->pos, parser->end, 
&status);
+	parser->invalid_sign_off += (rc = U_SUCCESS(status));
+	return rc ? 1 : -1;
+}
+
+/**
+ * Rewind the parser to a previously saved position.
+ * @param[out] parser JSON path parser. Updates pos, invalid_sign_off.
+ * @param old_pos Read position to restore.
+ * @param signs Number of signs to subtract from invalid_sign_off.
+ */
+static inline void
+parser_reset_pos(struct json_path_parser *parser, const char *old_pos, 
int signs)
+{
+	parser->pos = old_pos;
+	parser->invalid_sign_off -= signs;
+}
+
+static inline bool
+string_valid_sign(UChar32 c)
+{
+	int8_t type = u_charType(c);
+	return !(c == REPLACEMENT_CHARACTER ||
+	         type == U_UNASSIGNED ||
+	         type == U_LINE_SEPARATOR ||
+	         type == U_CONTROL_CHAR ||
+	         type == U_PARAGRAPH_SEPARATOR);
+}
+
+/**
   * Parse string identifier in quotes. Parser either stops right
   * after the closing quote, or returns an error position.
   * @param parser JSON path parser.
@@ -56,22 +104,24 @@ strntoull(const char *src, int len) {
  static inline int
  json_parse_string(struct json_path_parser *parser, struct 
json_path_node *node)
  {
-	const char *end = parser->src + parser->src_len;
-	const char *pos = parser->pos;
-	assert(pos < end);
-	char quote_type = *pos;
-	assert(quote_type == '\'' || quote_type == '"');
-	/* Skip first quote. */
-	int len = 0;
-	++pos;
-	const char *str = pos;
-	for (char c = *pos; pos < end && quote_type != c; c = *++pos)
-		++len;
-	/* A string must be terminated with quote. */
-	if (*pos != quote_type || len == 0)
-		return pos - parser->src + 1;
-	/* Skip the closing quote. */
-	parser->pos = pos + 1;
+	assert(parser->pos < parser->end);
+	UChar32 quote_type;
+	(void)parser_read_sign(parser, &quote_type);
+	assert(quote_type == (UChar32)'\'' || quote_type == (UChar32)'"');
+	const char *str = parser->pos;
+	UChar32 c = 0;
+	int rc = 0;
+
+	while (((rc = parser_read_sign(parser, &c)) > 0)
+	       && string_valid_sign(c) && c != quote_type);
+	int len = (int)(parser->pos - str - 1);
+	if (rc < 0 || len == 0)
+		return -1;
+	if (c != (UChar32)quote_type) {
+		parser->invalid_sign_off++;
+		return -1;
+	}
+
  	node->type = JSON_PATH_STR;
  	node->str = str;
  	node->len = len;
@@ -81,7 +131,7 @@ json_parse_string(struct json_path_parser *parser, 
struct json_path_node *node)
  /**
   * Parse digit sequence into integer until non-digit is met.
   * Parser stops right after the last digit.
- * @param parser JSON parser.
+ * @param[out] parser JSON parser. Updates invalid_sign_off field.
   * @param[out] node JSON node to store result.
   *
   * @retval     0 Success.
@@ -90,27 +140,40 @@ json_parse_string(struct json_path_parser *parser, 
struct json_path_node *node)
  static inline int
  json_parse_integer(struct json_path_parser *parser, struct 
json_path_node *node)
  {
-	const char *end = parser->src + parser->src_len;
-	const char *pos = parser->pos;
-	assert(pos < end);
-	const char *str = pos;
-	int len = 0;
-	for (char c = *pos; pos < end && isdigit(c); c = *++pos)
-		++len;
-	if (len == 0)
-		return pos - parser->src + 1;
-	parser->pos = pos;
+	assert(parser->pos < parser->end);
+	const char *str = parser->pos;
+	const char *last_pos = parser->pos;
+	int len = 0, rc = 0;
+	UChar32 c = 0;
+
+	while (((rc = parser_read_sign(parser, &c)) > 0) && u_isdigit(c)) {
+		last_pos = parser->pos;
+		len++;
+	}
+	if (rc > 0 && len > 0 && !u_isdigit(c))
+		parser_reset_pos(parser, last_pos, 1);
+	if (rc < 0 || len == 0)
+		return -1;
+
  	node->type = JSON_PATH_NUM;
  	node->num = strntoull(str, len);
  	return 0;
  }

+static inline bool
+identifier_valid_sign(UChar32 c)
+{
+	return u_isUAlphabetic(c)
+	       || c == (UChar32)'_'
+	       || u_isdigit(c);
+}
+
  /**
   * Parse identifier out of quotes. It can contain only alphas,
   * digits and underscores. And can not contain digit at the first
   * position. Parser is stoped right after the last non-digit,
   * non-alpha and non-underscore symbol.
- * @param parser JSON parser.
+ * @param[out] parser JSON parser. Updates invalid_sign_off field.
   * @param[out] node JSON node to store result.
   *
   * @retval     0 Success.
@@ -120,68 +183,102 @@ static inline int
  json_parse_identifier(struct json_path_parser *parser,
  		      struct json_path_node *node)
  {
-	const char *end = parser->src + parser->src_len;
-	const char *pos = parser->pos;
-	assert(pos < end);
-	const char *str = pos;
-	char c = *pos;
+	assert(parser->pos < parser->end);
+	const char *str = parser->pos;
+	UChar32 c;
+	int rc = 0;
+	if (parser_read_sign(parser, &c) < 0)
+		return -1;
  	/* First symbol can not be digit. */
-	if (!isalpha(c) && c != '_')
-		return pos - parser->src + 1;
-	int len = 1;
-	for (c = *++pos; pos < end && (isalpha(c) || c == '_' || isdigit(c));
-	     c = *++pos)
-		++len;
-	assert(len > 0);
-	parser->pos = pos;
+	if (!u_isalpha(c) && c != (UChar32)'_')
+		return -1;
+
+	const char *last_pos = parser->pos;
+	while ((rc = parser_read_sign(parser, &c)) > 0 && 
identifier_valid_sign(c))
+		last_pos = parser->pos;
+	if (rc > 0 && !identifier_valid_sign(c))
+		parser_reset_pos(parser, last_pos, 1);
+	if (rc < 0)
+		return -1;
+
  	node->type = JSON_PATH_STR;
  	node->str = str;
-	node->len = len;
+	node->len = (int)(parser->pos - str);
  	return 0;
  }

  int
+json_path_parser_init(struct json_path_parser *parser, const char *src,
+                      int src_len)
+{
+	UErrorCode status = U_ZERO_ERROR ;
+	parser->utf8conv = ucnv_open("utf8", &status);
+	if (U_FAILURE(status))
+		return -1;
+	assert(parser->utf8conv);
+	parser->src = src;
+	parser->end = src + src_len;
+	parser->pos = src;
+	parser->invalid_sign_off = 0;
+	return 0;
+}
+
+void
+json_path_parser_deinit(struct json_path_parser *parser)
+{
+	if (parser->utf8conv)
+		ucnv_close(parser->utf8conv);
+}
+
+static inline int
+error_sign_offset(struct json_path_parser *parser)
+{
+	return parser->invalid_sign_off;
+}
+
+int
  json_path_next(struct json_path_parser *parser, struct json_path_node 
*node)
  {
-	const char *end = parser->src + parser->src_len;
+	assert(parser->utf8conv);
+	const char *end = parser->end;
  	if (end == parser->pos) {
  		node->type = JSON_PATH_END;
  		return 0;
  	}
-	char c = *parser->pos;
+	UChar32 c = 0;
+	const char *last_pos = parser->pos;
+	if (parser_read_sign(parser, &c) < 0)
+		return error_sign_offset(parser);
  	int rc;
  	switch(c) {
-	case '[':
-		++parser->pos;
+	case (UChar32)'[':
  		/* Error for []. */
  		if (parser->pos == end)
-			return parser->pos - parser->src + 1;
+			return parser->invalid_sign_off;
  		c = *parser->pos;
  		if (c == '"' || c == '\'')
  			rc = json_parse_string(parser, node);
  		else
  			rc = json_parse_integer(parser, node);
  		if (rc != 0)
-			return rc;
+			return parser->invalid_sign_off;
  		/*
  		 * Expression, started from [ must be finished
  		 * with ] regardless of its type.
  		 */
  		if (parser->pos == end || *parser->pos != ']')
-			return parser->pos - parser->src + 1;
+			return parser->invalid_sign_off + 1;
  		/* Skip ]. */
-		++parser->pos;
+		(void)parser_read_sign(parser, &c);
  		break;
-	case '.':
-		/* Skip dot. */
-		++parser->pos;
-		if (parser->pos == end)
-			return parser->pos - parser->src + 1;
-		FALLTHROUGH
  	default:
+		if (c != (UChar32)'.')
+			parser_reset_pos(parser, last_pos, 1);
+		else if (parser->pos == end)
+			return parser->invalid_sign_off + 1;
  		rc = json_parse_identifier(parser, node);
  		if (rc != 0)
-			return rc;
+			return parser->invalid_sign_off;
  		break;
  	}
  	return 0;
diff --git a/src/lib/json/path.h b/src/lib/json/path.h
index 6e8db4c..0ff68c4 100644
--- a/src/lib/json/path.h
+++ b/src/lib/json/path.h
@@ -33,6 +33,9 @@

  #include <stdbool.h>
  #include <stdint.h>
+#include <unicode/ucnv_err.h>
+#include <unicode/ucnv.h>
+#include <assert.h>

  #ifdef __cplusplus
  extern "C" {
@@ -45,10 +48,12 @@ extern "C" {
  struct json_path_parser {
  	/** Source string. */
  	const char *src;
-	/** Length of src. */
-	int src_len;
+	/** End of string. */
+	const char *end;
  	/** Current parser's position. */
  	const char *pos;
+	int invalid_sign_off;
+	UConverter* utf8conv;
  };

  enum json_path_type {
@@ -78,19 +83,22 @@ struct json_path_node {
  };

  /**
- * Create @a parser.
+ * Init @a parser.
   * @param[out] parser Parser to create.
   * @param src Source string.
   * @param src_len Length of @a src.
+ * @retval 0 Success.
+ * @retval -1 Init error.
   */
-static inline void
-json_path_parser_create(struct json_path_parser *parser, const char *src,
-			int src_len)
-{
-	parser->src = src;
-	parser->src_len = src_len;
-	parser->pos = src;
-}
+int
+json_path_parser_init(struct json_path_parser *parser, const char *src,
+                      int src_len);
+/**
+ * Deinit @a parser.
+ * @param[out] parser instance to deinit.
+ */
+void
+json_path_parser_deinit(struct json_path_parser *parser);

  /**
   * Get a next path node.
diff --git a/test/engine/tuple.result b/test/engine/tuple.result
index 2d7367a..6b597d6 100644
--- a/test/engine/tuple.result
+++ b/test/engine/tuple.result
@@ -602,7 +602,10 @@ format[2] = {name = 'field2', type = 'array'}
  format[3] = {name = 'field3', type = 'map'}
  ---
  ...
-format[4] = {name = 'field4', type = 'string'}
+format[4] = {name = 'field4', type = 'string' }
+---
+...
+format[5] = {name = "[2][6]['привет中国world']['中国a']", type = 'string'}
  ---
  ...
  s = box.schema.space.create('test', {format = format})
@@ -611,13 +614,13 @@ s = box.schema.space.create('test', {format = format})
  pk = s:create_index('pk')
  ---
  ...
-field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
+field2 = {1, 2, 3, "4", {5,6,7}, {привет中国world={中国="привет"}, 
key="value1", value="key1"}}
  ---
  ...
  field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, 
d=4} }, [-1] = 200}
  ---
  ...
-t = s:replace{1, field2, field3, "123456"}
+t = s:replace{1, field2, field3, "123456", "yes, this"}
  ---
  ...
  t[1]
@@ -626,7 +629,7 @@ t[1]
  ...
  t[2]
  ---
-- [1, 2, 3, '4', [5, 6, 7], {'key': 'key1', 'value': 'value1'}]
+- [1, 2, 3, '4', [5, 6, 7], {'привет中国world': {'中国': 'привет'}, 'key': 
'value1', 'value': 'key1'}]
  ...
  t[3]
  ---
@@ -667,19 +670,43 @@ t["[2][5][3]"]
  ...
  t["[2][6].key"]
  ---
-- key1
+- value1
  ...
  t["[2][6].value"]
  ---
-- value1
+- key1
  ...
  t["[2][6]['key']"]
  ---
-- key1
+- value1
  ...
  t["[2][6]['value']"]
  ---
-- value1
+- key1
+...
+t[2][6].привет中国world.中国
+---
+- привет
+...
+t["[2][6].привет中国world"].中国
+---
+- привет
+...
+t["[2][6].привет中国world.中国"]
+---
+- привет
+...
+t["[2][6]['привет中国world']"]["中国"]
+---
+- привет
+...
+t["[2][6]['привет中国world']['中国']"]
+---
+- привет
+...
+t["[2][6]['привет中国world']['中国a']"]
+---
+- yes, this
  ...
  t["[3].k3[2].c"]
  ---
diff --git a/test/engine/tuple.test.lua b/test/engine/tuple.test.lua
index ba3482d..90da8b2 100644
--- a/test/engine/tuple.test.lua
+++ b/test/engine/tuple.test.lua
@@ -204,12 +204,13 @@ format = {}
  format[1] = {name = 'field1', type = 'unsigned'}
  format[2] = {name = 'field2', type = 'array'}
  format[3] = {name = 'field3', type = 'map'}
-format[4] = {name = 'field4', type = 'string'}
+format[4] = {name = 'field4', type = 'string' }
+format[5] = {name = "[2][6]['привет中国world']['中国a']", type = 'string'}
  s = box.schema.space.create('test', {format = format})
  pk = s:create_index('pk')
-field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
+field2 = {1, 2, 3, "4", {5,6,7}, {привет中国world={中国="привет"}, 
key="value1", value="key1"}}
  field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, {c=3, 
d=4} }, [-1] = 200}
-t = s:replace{1, field2, field3, "123456"}
+t = s:replace{1, field2, field3, "123456", "yes, this"}
  t[1]
  t[2]
  t[3]
@@ -225,6 +226,12 @@ t["[2][6].key"]
  t["[2][6].value"]
  t["[2][6]['key']"]
  t["[2][6]['value']"]
+t[2][6].привет中国world.中国
+t["[2][6].привет中国world"].中国
+t["[2][6].привет中国world.中国"]
+t["[2][6]['привет中国world']"]["中国"]
+t["[2][6]['привет中国world']['中国']"]
+t["[2][6]['привет中国world']['中国a']"]
  t["[3].k3[2].c"]
  t["[4]"]
  t.field1
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index fe8b2d2..667194c 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -130,7 +130,7 @@ add_executable(csv.test csv.c)
  target_link_libraries(csv.test csv)

  add_executable(json_path.test json_path.c)
-target_link_libraries(json_path.test json_path unit)
+target_link_libraries(json_path.test json_path unit ${ICU_LIBRARIES})

  add_executable(rmean.test rmean.cc)
  target_link_libraries(rmean.test stat unit)
diff --git a/test/unit/json_path.c b/test/unit/json_path.c
index 599658b..b62afd2 100644
--- a/test/unit/json_path.c
+++ b/test/unit/json_path.c
@@ -6,7 +6,7 @@
  #define reset_to_new_path(value) \
  	path = value; \
  	len = strlen(value); \
-	json_path_parser_create(&parser, path, len);
+	(void)json_path_parser_init(&parser, path, len);

  #define is_next_index(value_len, value) \
  	path = parser.pos; \
@@ -30,6 +30,7 @@ test_basic()
  	const char *path;
  	int len;
  	struct json_path_parser parser;
+	memset(&parser, 0, sizeof(parser));
  	struct json_path_node node;

  	reset_to_new_path("[0].field1.field2['field3'][5]");
@@ -89,6 +90,7 @@ test_errors()
  	const char *path;
  	int len;
  	struct json_path_parser parser;
+	memset(&parser, 0, sizeof(parser));
  	const struct path_and_errpos errors[] = {
  		/* Double [[. */
  		{"[[", 2},
-- 
2.7.4



On 29.03.2018 17:22, Kirill Shcherbatov wrote:
 > From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
 >
 > In progress ...
 >
 > Closes #1285
 > ---
 >   src/box/CMakeLists.txt     |   2 +-
 >   src/box/lua/tuple.c        | 176 
+++++++++++++++++++++++++++++++++++-----
 >   src/box/lua/tuple.lua      |  45 +++--------
 >   test/engine/tuple.result   | 198 
+++++++++++++++++++++++++++++++++++++++++++++
 >   test/engine/tuple.test.lua |  59 ++++++++++++++
 >   5 files changed, 428 insertions(+), 52 deletions(-)
 >
 > diff --git a/src/box/CMakeLists.txt b/src/box/CMakeLists.txt
 > index e420fe3..add0ff9 100644
 > --- a/src/box/CMakeLists.txt
 > +++ b/src/box/CMakeLists.txt
 > @@ -130,5 +130,5 @@ add_library(box STATIC
 >       ${bin_sources})
 >
 >   target_link_libraries(box box_error tuple stat xrow xlog vclock 
crc32 scramble
 > -                      ${common_libraries})
 > +                      json_path ${common_libraries})
 >   add_dependencies(box build_bundled_libs)
 > diff --git a/src/box/lua/tuple.c b/src/box/lua/tuple.c
 > index 7ca4299..99b9ff2 100644
 > --- a/src/box/lua/tuple.c
 > +++ b/src/box/lua/tuple.c
 > @@ -41,6 +41,7 @@
 >   #include "box/tuple_convert.h"
 >   #include "box/errcode.h"
 >   #include "box/memtx_tuple.h"
 > +#include "json/path.h"
 >
 >   /** {{{ box.tuple Lua library
 >    *
 > @@ -402,36 +403,175 @@ lbox_tuple_transform(struct lua_State *L)
 >   }
 >
 >   /**
 > - * Find a tuple field using its name.
 > + * Propagate @a field to MessagePack(field)[index].
 > + * @param[in][out] field Field to propagate.
 > + * @param index 1-based index to propagate to.
 > + *
 > + * @retval  0 Success, the index was found.
 > + * @retval -1 Not found.
 > + */
 > +static inline int
 > +tuple_field_go_to_index(const char **field, uint64_t index)
 > +{
 > +	assert(index >= 0);
 > +	enum mp_type type = mp_typeof(**field);
 > +	if (type == MP_ARRAY) {
 > +		if (index == 0)
 > +			return -1;
 > +		/* Make index 0-based. */
 > +		index -= TUPLE_INDEX_BASE;
 > +		uint32_t count = mp_decode_array(field);
 > +		if (index >= count)
 > +			return -1;
 > +		for (; index > 0; --index)
 > +			mp_next(field);
 > +		return 0;
 > +	} else if (type == MP_MAP) {
 > +		uint64_t count = mp_decode_map(field);
 > +		for (; count > 0; --count) {
 > +			type = mp_typeof(**field);
 > +			if (type == MP_UINT) {
 > +				uint64_t value = mp_decode_uint(field);
 > +				if (value == index)
 > +					return 0;
 > +			} else if (type == MP_INT) {
 > +				int64_t value = mp_decode_int(field);
 > +				if (value >= 0 && (uint64_t)value == index)
 > +					return 0;
 > +			} else {
 > +				/* Skip key. */
 > +				mp_next(field);
 > +			}
 > +			/* Skip value. */
 > +			mp_next(field);
 > +		}
 > +	}
 > +	return -1;
 > +}
 > +
 > +/**
 > + * Propagate @a field to MessagePack(field)[key].
 > + * @param[in][out] field Field to propagate.
 > + * @param key Key to propagate to.
 > + * @param len Length of @a key.
 > + *
 > + * @retval  0 Success, the index was found.
 > + * @retval -1 Not found.
 > + */
 > +static inline int
 > +tuple_field_go_to_key(const char **field, const char *key, int len)
 > +{
 > +	enum mp_type type = mp_typeof(**field);
 > +	if (type != MP_MAP)
 > +		return -1;
 > +	uint64_t count = mp_decode_map(field);
 > +	for (; count > 0; --count) {
 > +		type = mp_typeof(**field);
 > +		if (type == MP_STR) {
 > +			uint32_t value_len;
 > +			const char *value = mp_decode_str(field, &value_len);
 > +			if (value_len == (uint)len &&
 > +			    memcmp(value, key, len) == 0)
 > +				return 0;
 > +		} else {
 > +			/* Skip key. */
 > +			mp_next(field);
 > +		}
 > +		/* Skip value. */
 > +		mp_next(field);
 > +	}
 > +	return -1;
 > +}
 > +
 > +/**
 > + * Find a tuple field by JSON path.
 >    * @param L Lua state.
 > - * @param tuple 1-th argument on lua stack, tuple to get field
 > + * @param tuple 1-th argument on a lua stack, tuple to get field
 >    *        from.
 > - * @param field_name 2-th argument on lua stack, field name to
 > - *        get.
 > + * @param path 2-th argument on lua stack. Can be field name,
 > + *        JSON path to a field or a field number.
 >    *
 >    * @retval If a field was not found, return -1 and nil to lua else
 >    *         return 0 and decoded field.
 >    */
 >   static int
 > -lbox_tuple_field_by_name(struct lua_State *L)
 > +lbox_tuple_field_by_path(struct lua_State *L)
 >   {
 > +	const char *field;
 >   	struct tuple *tuple = luaT_istuple(L, 1);
 >   	/* Is checked in Lua wrapper. */
 >   	assert(tuple != NULL);
 > -	assert(lua_isstring(L, 2));
 > -	size_t name_len;
 > -	const char *name = lua_tolstring(L, 2, &name_len);
 > -	uint32_t name_hash = lua_hashstring(L, 2);
 > -	const char *field =
 > -		tuple_field_by_name(tuple, name, name_len, name_hash);
 > -	if (field == NULL) {
 > -		lua_pushinteger(L, -1);
 > -		lua_pushnil(L);
 > +	if (lua_isnumber(L, 2)) {
 > +		int index = lua_tointeger(L, 2);
 > +		index -= TUPLE_INDEX_BASE;
 > +		if (index < 0) {
 > +not_found:
 > +			lua_pushinteger(L, -1);
 > +			lua_pushnil(L);
 > +			return 2;
 > +		}
 > +		field = tuple_field(tuple, index);
 > +		if (field == NULL)
 > +			goto not_found;
 > +push_value:
 > +		lua_pushinteger(L, 0);
 > +		luamp_decode(L, luaL_msgpack_default, &field);
 >   		return 2;
 >   	}
 > -	lua_pushinteger(L, 0);
 > -	luamp_decode(L, luaL_msgpack_default, &field);
 > -	return 2;
 > +	assert(lua_isstring(L, 2));
 > +	size_t path_len;
 > +	const char *path = lua_tolstring(L, 2, &path_len);
 > +	struct json_path_parser parser;
 > +	struct json_path_node node;
 > +	json_path_parser_create(&parser, path, path_len);
 > +	int rc = json_path_next(&parser, &node);
 > +	if (rc != 0 || node.type == JSON_PATH_END)
 > +		luaL_error(L, "Error in path on position %d", rc);
 > +	if (node.type == JSON_PATH_NUM) {
 > +		int index = node.num;
 > +		if (index == 0)
 > +			goto not_found;
 > +		index -= TUPLE_INDEX_BASE;
 > +		field = tuple_field(tuple, index);
 > +		if (field == NULL)
 > +			goto not_found;
 > +	} else {
 > +		assert(node.type == JSON_PATH_STR);
 > +		/* First part of a path is a field name. */
 > +		const char *name = node.str;
 > +		uint32_t name_len = node.len;
 > +		uint32_t name_hash;
 > +		if (path_len == name_len) {
 > +			name_hash = lua_hashstring(L, 2);
 > +		} else {
 > +			/*
 > +			 * If a string is "field....", then its
 > +			 * precalculated juajit hash can not be
 > +			 * used. A tuple dictionary hashes only
 > +			 * name, not path.
 > +			 */
 > +			name_hash = lua_hash(name, name_len);
 > +		}
 > +		field = tuple_field_by_name(tuple, name, name_len, name_hash);
 > +		if (field == NULL)
 > +			goto not_found;
 > +	}
 > +	while ((rc = json_path_next(&parser, &node)) == 0 &&
 > +	       node.type != JSON_PATH_END) {
 > +		if (node.type == JSON_PATH_NUM) {
 > +			rc = tuple_field_go_to_index(&field, node.num);
 > +		} else {
 > +			assert(node.type == JSON_PATH_STR);
 > +			rc = tuple_field_go_to_key(&field, node.str, node.len);
 > +		}
 > +		if (rc != 0)
 > +			goto not_found;
 > +	}
 > +	if (rc == 0)
 > +		goto push_value;
 > +	luaL_error(L, "Error in path on position %d", rc);
 > +	unreachable();
 > +	goto not_found;
 >   }
 >
 >   static int
 > @@ -470,8 +610,8 @@ static const struct luaL_Reg lbox_tuple_meta[] = {
 >   	{"tostring", lbox_tuple_to_string},
 >   	{"slice", lbox_tuple_slice},
 >   	{"transform", lbox_tuple_transform},
 > -	{"tuple_field_by_name", lbox_tuple_field_by_name},
 >   	{"tuple_to_map", lbox_tuple_to_map},
 > +	{"tuple_field_by_path", lbox_tuple_field_by_path},
 >   	{NULL, NULL}
 >   };
 >
 > diff --git a/src/box/lua/tuple.lua b/src/box/lua/tuple.lua
 > index 001971a..b51b4df 100644
 > --- a/src/box/lua/tuple.lua
 > +++ b/src/box/lua/tuple.lua
 > @@ -9,16 +9,9 @@ local internal = require('box.internal')
 >
 >   ffi.cdef[[
 >   /** \cond public */
 > -typedef struct tuple_format box_tuple_format_t;
 > -
 > -box_tuple_format_t *
 > -box_tuple_format_default(void);
 >
 >   typedef struct tuple box_tuple_t;
 >
 > -box_tuple_t *
 > -box_tuple_new(box_tuple_format_t *format, const char *data, const 
char *end);
 > -
 >   int
 >   box_tuple_ref(box_tuple_t *tuple);
 >
 > @@ -34,9 +27,6 @@ box_tuple_bsize(const box_tuple_t *tuple);
 >   ssize_t
 >   box_tuple_to_buf(const box_tuple_t *tuple, char *buf, size_t size);
 >
 > -box_tuple_format_t *
 > -box_tuple_format(const box_tuple_t *tuple);
 > -
 >   const char *
 >   box_tuple_field(const box_tuple_t *tuple, uint32_t i);
 >
 > @@ -278,9 +268,9 @@ end
 >
 >   msgpackffi.on_encode(const_tuple_ref_t, tuple_to_msgpack)
 >
 > -local function tuple_field_by_name(tuple, name)
 > +local function tuple_field_by_path(tuple, path)
 >       tuple_check(tuple, "tuple['field_name']");
 > -    return internal.tuple.tuple_field_by_name(tuple, name)
 > +    return internal.tuple.tuple_field_by_path(tuple, path)
 >   end
 >
 >   local methods = {
 > @@ -306,33 +296,22 @@ end
 >
 >   methods["__serialize"] = tuple_totable -- encode hook for 
msgpack/yaml/json
 >
 > -local tuple_field = function(tuple, field_n)
 > -    local field = builtin.box_tuple_field(tuple, field_n - 1)
 > -    if field == nil then
 > -        return nil
 > -    end
 > -    -- Use () to shrink stack to the first return value
 > -    return (msgpackffi.decode_unchecked(field))
 > -end
 > -
 > -
 >   ffi.metatype(tuple_t, {
 >       __len = function(tuple)
 >           return builtin.box_tuple_field_count(tuple)
 >       end;
 >       __tostring = internal.tuple.tostring;
 >       __index = function(tuple, key)
 > -        if type(key) == "number" then
 > -            return tuple_field(tuple, key)
 > -        elseif type(key) == "string" then
 > -            -- Try to get a field with a name = key. If it was not
 > -            -- found (rc ~= 0) then return a method from the
 > -            -- vtable. If a collision occurred, then fields have
 > -            -- higher priority. For example, if a tuple T has a
 > -            -- field with name 'bsize', then T.bsize returns field
 > -            -- value, not tuple_bsize function. To access hidden
 > -            -- methods use 'box.tuple.<method_name>(T, [args...])'.
 > -            local rc, field = tuple_field_by_name(tuple, key)
 > +        if type(key) == "string" or type(key) == "number" then
 > +            -- Try to get a field by json path or by [index]. If
 > +            -- it was not found (rc ~= 0) then return a method
 > +            -- from the vtable. If a collision occurred, then
 > +            -- fields have higher priority. For example, if a
 > +            -- tuple T has a field with name 'bsize', then T.bsize
 > +            -- returns field value, not tuple_bsize function. To
 > +            -- access hidden methods use
 > +            -- 'box.tuple.<method_name>(T, [args...])'.
 > +            local rc, field = tuple_field_by_path(tuple, key)
 >               if rc == 0 then
 >                   return field
 >               end
 > diff --git a/test/engine/tuple.result b/test/engine/tuple.result
 > index b3b23b2..2d7367a 100644
 > --- a/test/engine/tuple.result
 > +++ b/test/engine/tuple.result
 > @@ -590,6 +590,204 @@ maplen(t1map), t1map[1], t1map[2], t1map[3]
 >   s:drop()
 >   ---
 >   ...
 > +format = {}
 > +---
 > +...
 > +format[1] = {name = 'field1', type = 'unsigned'}
 > +---
 > +...
 > +format[2] = {name = 'field2', type = 'array'}
 > +---
 > +...
 > +format[3] = {name = 'field3', type = 'map'}
 > +---
 > +...
 > +format[4] = {name = 'field4', type = 'string'}
 > +---
 > +...
 > +s = box.schema.space.create('test', {format = format})
 > +---
 > +...
 > +pk = s:create_index('pk')
 > +---
 > +...
 > +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
 > +---
 > +...
 > +field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, 
{c=3, d=4} }, [-1] = 200}
 > +---
 > +...
 > +t = s:replace{1, field2, field3, "123456"}
 > +---
 > +...
 > +t[1]
 > +---
 > +- 1
 > +...
 > +t[2]
 > +---
 > +- [1, 2, 3, '4', [5, 6, 7], {'key': 'key1', 'value': 'value1'}]
 > +...
 > +t[3]
 > +---
 > +- {'k1': 100, 'k3': [{'a': 1, 'b': 2}, {'c': 3, 'd': 4}], -1: 200, 
10: 100, 'k2': [
 > +    1, 2, 3]}
 > +...
 > +t[4]
 > +---
 > +- '123456'
 > +...
 > +t[2][1]
 > +---
 > +- 1
 > +...
 > +t["[2][1]"]
 > +---
 > +- 1
 > +...
 > +t[2][5]
 > +---
 > +- [5, 6, 7]
 > +...
 > +t["[2][5]"]
 > +---
 > +- [5, 6, 7]
 > +...
 > +t["[2][5][1]"]
 > +---
 > +- 5
 > +...
 > +t["[2][5][2]"]
 > +---
 > +- 6
 > +...
 > +t["[2][5][3]"]
 > +---
 > +- 7
 > +...
 > +t["[2][6].key"]
 > +---
 > +- key1
 > +...
 > +t["[2][6].value"]
 > +---
 > +- value1
 > +...
 > +t["[2][6]['key']"]
 > +---
 > +- key1
 > +...
 > +t["[2][6]['value']"]
 > +---
 > +- value1
 > +...
 > +t["[3].k3[2].c"]
 > +---
 > +- 3
 > +...
 > +t["[4]"]
 > +---
 > +- '123456'
 > +...
 > +t.field1
 > +---
 > +- 1
 > +...
 > +t.field2[5]
 > +---
 > +- [5, 6, 7]
 > +...
 > +t[".field1"]
 > +---
 > +- 1
 > +...
 > +t["field1"]
 > +---
 > +- 1
 > +...
 > +t["[3][10]"]
 > +---
 > +- 100
 > +...
 > +-- Not found.
 > +t[0]
 > +---
 > +- null
 > +...
 > +t["[0]"]
 > +---
 > +- null
 > +...
 > +t["[1000]"]
 > +---
 > +- null
 > +...
 > +t.field1000
 > +---
 > +- null
 > +...
 > +t["not_found"]
 > +---
 > +- null
 > +...
 > +t["[2][5][10]"]
 > +---
 > +- null
 > +...
 > +t["[2][6].key100"]
 > +---
 > +- null
 > +...
 > +t["[2][0]"] -- 0-based index in array.
 > +---
 > +- null
 > +...
 > +t["[4][3]"] -- Can not index string.
 > +---
 > +- null
 > +...
 > +t["[4]['key']"]
 > +---
 > +- null
 > +...
 > +-- Not found 'a'. Return 'null' despite of syntax error on a
 > +-- next position.
 > +t["a.b.c d.e.f"]
 > +---
 > +- null
 > +...
 > +-- Sytax errors.
 > +t[""]
 > +---
 > +- error: 'builtin/box/tuple.lua:314: Error in path on position 0'
 > +...
 > +t["[2].[5]"]
 > +---
 > +- error: 'builtin/box/tuple.lua:314: Error in path on position 5'
 > +...
 > +t["[-1]"]
 > +---
 > +- error: 'builtin/box/tuple.lua:314: Error in path on position 2'
 > +...
 > +t[".."]
 > +---
 > +- error: 'builtin/box/tuple.lua:314: Error in path on position 2'
 > +...
 > +t["[["]
 > +---
 > +- error: 'builtin/box/tuple.lua:314: Error in path on position 2'
 > +...
 > +t["]]"]
 > +---
 > +- error: 'builtin/box/tuple.lua:314: Error in path on position 1'
 > +...
 > +t["{"]
 > +---
 > +- error: 'builtin/box/tuple.lua:314: Error in path on position 1'
 > +...
 > +s:drop()
 > +---
 > +...
 >   engine = nil
 >   ---
 >   ...
 > diff --git a/test/engine/tuple.test.lua b/test/engine/tuple.test.lua
 > index 6d7d254..ba3482d 100644
 > --- a/test/engine/tuple.test.lua
 > +++ b/test/engine/tuple.test.lua
 > @@ -200,5 +200,64 @@ t1map = t1:tomap()
 >   maplen(t1map), t1map[1], t1map[2], t1map[3]
 >   s:drop()
 >
 > +format = {}
 > +format[1] = {name = 'field1', type = 'unsigned'}
 > +format[2] = {name = 'field2', type = 'array'}
 > +format[3] = {name = 'field3', type = 'map'}
 > +format[4] = {name = 'field4', type = 'string'}
 > +s = box.schema.space.create('test', {format = format})
 > +pk = s:create_index('pk')
 > +field2 = {1, 2, 3, "4", {5,6,7}, {key="key1", value="value1"}}
 > +field3 = {[10] = 100, k1 = 100, k2 = {1,2,3}, k3 = { {a=1, b=2}, 
{c=3, d=4} }, [-1] = 200}
 > +t = s:replace{1, field2, field3, "123456"}
 > +t[1]
 > +t[2]
 > +t[3]
 > +t[4]
 > +t[2][1]
 > +t["[2][1]"]
 > +t[2][5]
 > +t["[2][5]"]
 > +t["[2][5][1]"]
 > +t["[2][5][2]"]
 > +t["[2][5][3]"]
 > +t["[2][6].key"]
 > +t["[2][6].value"]
 > +t["[2][6]['key']"]
 > +t["[2][6]['value']"]
 > +t["[3].k3[2].c"]
 > +t["[4]"]
 > +t.field1
 > +t.field2[5]
 > +t[".field1"]
 > +t["field1"]
 > +t["[3][10]"]
 > +
 > +-- Not found.
 > +t[0]
 > +t["[0]"]
 > +t["[1000]"]
 > +t.field1000
 > +t["not_found"]
 > +t["[2][5][10]"]
 > +t["[2][6].key100"]
 > +t["[2][0]"] -- 0-based index in array.
 > +t["[4][3]"] -- Can not index string.
 > +t["[4]['key']"]
 > +-- Not found 'a'. Return 'null' despite of syntax error on a
 > +-- next position.
 > +t["a.b.c d.e.f"]
 > +
 > +-- Sytax errors.
 > +t[""]
 > +t["[2].[5]"]
 > +t["[-1]"]
 > +t[".."]
 > +t["[["]
 > +t["]]"]
 > +t["{"]
 > +
 > +s:drop()
 > +
 >   engine = nil
 >   test_run = nil
 >

  parent reply	other threads:[~2018-04-04 10:37 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-03-29 14:22 [tarantool-patches] [PATCH v2 0/3] tuple field access via a json path Kirill Shcherbatov
2018-03-29 14:22 ` [tarantool-patches] [PATCH v2 1/3] Introduce json_path_parser Kirill Shcherbatov
2018-03-29 14:22 ` [tarantool-patches] [PATCH v2 2/3] lua: implement json path access to tuple fields Kirill Shcherbatov
2018-03-29 14:22 ` [tarantool-patches] [PATCH v2 3/3] Multibyte characters support Kirill Shcherbatov
2018-03-29 18:04   ` [tarantool-patches] " Kirill Shcherbatov
2018-03-30 10:24     ` v.shpilevoy
2018-03-30 10:25       ` v.shpilevoy
2018-04-02 19:19       ` Kirill Shcherbatov
2018-04-03 10:20         ` Vladislav Shpilevoy
2018-04-05 14:09           ` [tarantool-patches] [PATCH v2 1/1] ICU Unicode support for JSON parser Kirill Shcherbatov
2018-04-05 18:00             ` [tarantool-patches] " Kirill Shcherbatov
2018-04-05 23:32               ` Vladislav Shpilevoy
2018-04-04 10:37 ` Kirill Shcherbatov [this message]
2018-04-04 11:30   ` [tarantool-patches] Re: [PATCH v2 3/3] Multibyte characters support ICU Vladislav Shpilevoy

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=01a0181a-19f9-c38c-94ea-65ab549a4517@tarantool.org \
    --to=kshcherbatov@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --cc=v.shpilevoy@tarantool.org \
    --subject='Re: [tarantool-patches] [PATCH v2 3/3] Multibyte characters support ICU' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox