[tarantool-patches] [PATCH v3 4/4] lua: introduce utf8 built-in globaly visible module

Tarantool development patches archive
 help / color / mirror / Atom feed

From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
To: tarantool-patches@freelists.org
Cc: kostja@tarantool.org
Subject: [tarantool-patches] [PATCH v3 4/4] lua: introduce utf8 built-in globaly visible module
Date: Tue, 15 May 2018 22:54:08 +0300	[thread overview]
Message-ID: <b28ae2d61d92c8687ca525dcd96ff073f3e795fa.1526414017.git.v.shpilevoy@tarantool.org> (raw)
In-Reply-To: <cover.1526414017.git.v.shpilevoy@tarantool.org>
In-Reply-To: <cover.1526414017.git.v.shpilevoy@tarantool.org>

utf8 is a module partially compatible with the Lua 5.3 utf8 module
and with the third-party lua-utf8 module.
Partially means that not all functions are implemented.

The patch introduces these ones:
upper, lower, len, char, sub, next.

Len and char work exactly like in Lua 5.3. Other functions work
like in lua-utf8, because they are not present in Lua 5.3.

Tarantool utf8 has extensions:

* isupper/islower/isalpha/isdigit, which check a property of a
  symbol passed either as a string or as its code;

* cmp/casecmp, which compare two UTF8 strings.

Closes #3290
Closes #3385
Closes #3081
---
 src/CMakeLists.txt           |   3 +-
 src/lua/init.c               |   3 +
 src/lua/utf8.c               | 479 +++++++++++++++++++++++++++++++++++++++++++
 src/lua/utf8.h               |  42 ++++
 test/app-tap/string.test.lua | 163 ++++++++++++++-
 test/box/ddl.result          |  15 ++
 test/box/ddl.test.lua        |   8 +
 7 files changed, 711 insertions(+), 2 deletions(-)
 create mode 100644 src/lua/utf8.c
 create mode 100644 src/lua/utf8.h

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5bf17614b..2a952923e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -166,6 +166,7 @@ set (server_sources
      lua/fio.c
      lua/crypto.c
      lua/httpc.c
+     lua/utf8.c
      ${lua_sources}
      ${PROJECT_SOURCE_DIR}/third_party/lua-yaml/lyaml.cc
      ${PROJECT_SOURCE_DIR}/third_party/lua-yaml/b64.c
@@ -210,7 +211,7 @@ endif()
 
 set_source_files_compile_flags(${server_sources})
 add_library(server STATIC ${server_sources})
-target_link_libraries(server core bit uri uuid)
+target_link_libraries(server core bit uri uuid ${ICU_LIBRARIES})
 
 # Rule of thumb: if exporting a symbol from a static library, list the
 # library here.
diff --git a/src/lua/init.c b/src/lua/init.c
index a0a7f63f6..58af1d121 100644
--- a/src/lua/init.c
+++ b/src/lua/init.c
@@ -57,6 +57,7 @@
 #include "lua/pickle.h"
 #include "lua/fio.h"
 #include "lua/httpc.h"
+#include "lua/utf8.h"
 #include "digest.h"
 #include <small/ibuf.h>
 
@@ -399,6 +400,7 @@ tarantool_lua_init(const char *tarantool_bin, int argc, char **argv)
 	lua_call(L, 0, 0);
 	lua_register(L, "tonumber64", lbox_tonumber64);
 
+	tarantool_lua_utf8_init(L);
 	tarantool_lua_utils_init(L);
 	tarantool_lua_fiber_init(L);
 	tarantool_lua_fiber_cond_init(L);
@@ -629,6 +631,7 @@ tarantool_lua_run_script(char *path, bool interactive,
 void
 tarantool_lua_free()
 {
+	tarantool_lua_utf8_free();
 	/*
 	 * Some part of the start script panicked, and called
 	 * exit().  The call stack in this case leads us back to
diff --git a/src/lua/utf8.c b/src/lua/utf8.c
new file mode 100644
index 000000000..e3b2b0a7f
--- /dev/null
+++ b/src/lua/utf8.c
@@ -0,0 +1,479 @@
+/*
+ * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the
+ *    following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <unicode/ucasemap.h>
+#include <unicode/uchar.h>
+#include <coll.h>
+#include "lua/utils.h"
+#include "lua/utf8.h"
+#include "diag.h"
+#include "small/ibuf.h"
+
+extern struct ibuf *tarantool_lua_ibuf;
+
+/** Default universal casemap for case transformations. */
+static UCaseMap *root_map = NULL;
+
+/** Collations for cmp/casecmp functions. */
+static struct coll *unicode_coll = NULL;
+static struct coll *unicode_ci_coll = NULL;
+
+/**
+ * Convert a UTF8 string to upper or lower case using the default
+ * ICU casemap and push the result onto the Lua stack.
+ * @param L Lua state.
+ * @param src String to convert.
+ * @param src_bsize Byte size of @a src.
+ * @param is_to_upper True to convert to upper case, false to
+ *        convert to lower case.
+ * @retval 1 The converted string is pushed.
+ * @retval 2 Nil and an ICU error description are pushed.
+ *
+ * When the temporary buffer can not be allocated, an OutOfMemory
+ * diag is set and luaT_error() is called.
+ */
+static int
+utf8_str_to_case(struct lua_State *L, const char *src, int src_bsize,
+		 bool is_to_upper)
+{
+	int i = 0;
+	int dst_bsize = src_bsize;
+	(void) i;
+	do {
+		UErrorCode err = U_ZERO_ERROR;
+		ibuf_reset(tarantool_lua_ibuf);
+		char *dst = ibuf_alloc(tarantool_lua_ibuf, dst_bsize);
+		if (dst == NULL) {
+			diag_set(OutOfMemory, dst_bsize, "ibuf_alloc", "dst");
+			return luaT_error(L);
+		}
+		int real_bsize;
+		if (is_to_upper) {
+			real_bsize = ucasemap_utf8ToUpper(root_map, dst,
+							  dst_bsize, src,
+							  src_bsize, &err);
+		} else {
+			real_bsize = ucasemap_utf8ToLower(root_map, dst,
+							  dst_bsize, src,
+							  src_bsize, &err);
+		}
+		if (err == U_ZERO_ERROR ||
+		    err == U_STRING_NOT_TERMINATED_WARNING) {
+			lua_pushlstring(L, dst, real_bsize);
+			return 1;
+		} else if (err == U_BUFFER_OVERFLOW_ERROR) {
+			/* ICU reported the size it actually needs. */
+			assert(real_bsize > dst_bsize);
+			dst_bsize = real_bsize;
+		} else {
+			lua_pushnil(L);
+			lua_pushstring(L, tt_sprintf("error during ICU case "\
+						     "transform: %s",
+						     u_errorName(err)));
+			return 2;
+		}
+		/*
+		 * On the first run either everything is ok, or
+		 * toLower/Upper returned the needed bsize, which
+		 * is allocated on the second iteration. A third
+		 * iteration is not possible. The counter is
+		 * incremented outside of assert() so that the
+		 * asserted expression has no side effects.
+		 */
+		++i;
+		assert(i < 2);
+	} while (true);
+	unreachable();
+	return 0;
+}
+
+/**
+ * Lua binding of utf8.upper(): make an upper case copy of a UTF8
+ * string.
+ * @param String to convert.
+ * @retval not nil String consisting of upper letters.
+ * @retval nil, error Error.
+ */
+static int
+utf8_upper(struct lua_State *L)
+{
+	size_t bsize;
+	const char *src;
+	if (lua_gettop(L) == 1 && lua_isstring(L, 1)) {
+		src = lua_tolstring(L, 1, &bsize);
+		return utf8_str_to_case(L, src, bsize, true);
+	}
+	return luaL_error(L, "Usage: utf8.upper(<string>)");
+}
+
+/**
+ * Lua binding of utf8.lower(): make a lower case copy of a UTF8
+ * string.
+ * @param String to convert.
+ * @retval not nil String consisting of lower letters.
+ * @retval nil, error Error.
+ */
+static int
+utf8_lower(struct lua_State *L)
+{
+	size_t bsize;
+	const char *src;
+	if (lua_gettop(L) == 1 && lua_isstring(L, 1)) {
+		src = lua_tolstring(L, 1, &bsize);
+		return utf8_str_to_case(L, src, bsize, false);
+	}
+	return luaL_error(L, "Usage: utf8.lower(<string>)");
+}
+
+/**
+ * Calculate a 1-based positive byte offset in a string by any
+ * 1-based offset (possibly negative).
+ * @param offset Original 1-based offset with any sign.
+ * @param len A string byte length.
+ * @retval >0 1-based positive offset. A non-negative @a offset
+ *         is returned as is, without a check against @a len.
+ * @retval 0 @a offset is 0, or it is negative and points before
+ *         the beginning of the string.
+ */
+static inline int
+utf8_convert_offset(int offset, size_t len)
+{
+	if (offset >= 0)
+		return offset;
+	/*
+	 * Negate in unsigned arithmetic: -offset is undefined for
+	 * INT_MIN, while 0 - (size_t) offset is well defined and
+	 * gives the distance from the end of the string.
+	 */
+	size_t back = (size_t) 0 - (size_t) offset;
+	if (back > len)
+		return 0;
+	return (int) (len - back + 1);
+}
+
+/**
+ * Calculate length of a UTF8 string. Length here is symbol count.
+ * Works like utf8.len in Lua 5.3. Can take negative offsets. A
+ * negative offset is an offset from the end of string.
+ * The start position must be inside the string or right after
+ * its last byte, and the end position must not exceed the string
+ * byte length. Otherwise nil and an error string are returned.
+ * Symbols are decoded one by one from the start position while
+ * the current byte offset is less than the end position.
+ * @param String to get length.
+ * @param Start byte offset in [1, #str + 1]. Must point to the
+ *        start of symbol. On invalid symbol an error is returned.
+ *        Optional, 1 by default.
+ * @param End byte offset in [0, #str]. Can point to the middle of
+ *        symbol. Partial symbol is counted too. Optional, -1 by
+ *        default.
+ * @retval not nil Symbol count.
+ * @retval nil, number Error. Byte position of the error is
+ *         returned in the second value.
+ * @retval nil, string Error. Reason is returned in the second
+ *         value.
+ */
+static int
+utf8_len(struct lua_State *L)
+{
+	if (lua_gettop(L) > 3 || !lua_isstring(L, 1))
+		return luaL_error(L, "Usage: utf8.len(<string>, [i, [j]])");
+	size_t slen;
+	const char *str = lua_tolstring(L, 1, &slen);
+	int len = (int) slen;
+	int start_pos = utf8_convert_offset(luaL_optinteger(L, 2, 1), len);
+	int end_pos = utf8_convert_offset(luaL_optinteger(L, 3, -1), len);
+	if (start_pos < 1 || --start_pos > len || end_pos > len) {
+		lua_pushnil(L);
+		lua_pushstring(L, "position is out of string");
+		return 2;
+	}
+	int result = 0;
+	if (end_pos > start_pos) {
+		UChar32 c;
+		while (start_pos < end_pos) {
+			++result;
+			U8_NEXT(str, start_pos, len, c);
+			if (c == U_SENTINEL) {
+				lua_pushnil(L);
+				lua_pushinteger(L, start_pos);
+				return 2;
+			}
+		}
+	}
+	lua_pushinteger(L, result);
+	return 1;
+}
+
+/**
+ * Decode one symbol of a string starting from a byte offset.
+ * @param String to get symbol code.
+ * @param 1-based byte offset of the symbol to decode. Optional,
+ *        1 by default, can be negative.
+ *
+ * @retval - The offset is beyond the string end or the symbol is
+ *         invalid.
+ * @retval not nil, not nil 1-based byte offset right after the
+ *         decoded symbol, and the symbol code.
+ */
+static int
+utf8_next(struct lua_State *L)
+{
+	if (lua_gettop(L) > 2 || !lua_isstring(L, 1)) {
+		return luaL_error(L, "Usage: utf8.next(<string>, "
+				     "[<byte offset>])");
+	}
+	size_t bsize;
+	const char *s = lua_tolstring(L, 1, &bsize);
+	int end = (int) bsize;
+	int offset = utf8_convert_offset(luaL_optinteger(L, 2, 1), end);
+	/* Switch to a 0-based offset; 0 is left as is. */
+	if (offset > 0)
+		offset--;
+	if (offset < end) {
+		UChar32 code;
+		U8_NEXT(s, offset, end, code);
+		if (code != U_SENTINEL) {
+			lua_pushinteger(L, offset + 1);
+			lua_pushinteger(L, code);
+			return 2;
+		}
+	}
+	return 0;
+}
+
+/**
+ * Convert a UTF8 char code (or codes) into a Lua string. When
+ * multiple codes are provided, their symbols are concatenated
+ * into a single string. The codes are encoded with
+ * U8_APPEND_UNSAFE, that is without validation.
+ * @param Char codes, at least one.
+ * @retval Result UTF8 string.
+ */
+static int
+utf8_char(struct lua_State *L)
+{
+	int top = lua_gettop(L);
+	if (top < 1)
+		return luaL_error(L, "Usage: utf8.char(<char code>)");
+	int len = 0;
+	UChar32 c;
+	/* Fast way - convert one symbol. */
+	if (top == 1) {
+		char buf[U8_MAX_LENGTH];
+		c = luaL_checkinteger(L, 1);
+		U8_APPEND_UNSAFE(buf, len, c);
+		assert(len <= (int)sizeof(buf));
+		lua_pushlstring(L, buf, len);
+		return 1;
+	}
+	/*
+	 * Slow way - use dynamic buffer. It is sized for the worst
+	 * case: U8_MAX_LENGTH bytes per each symbol.
+	 */
+	ibuf_reset(tarantool_lua_ibuf);
+	char *str = ibuf_alloc(tarantool_lua_ibuf, top * U8_MAX_LENGTH);
+	if (str == NULL) {
+		diag_set(OutOfMemory, top * U8_MAX_LENGTH, "ibuf_alloc",
+			 "str");
+		return luaT_error(L);
+	}
+	for (int i = 1; i <= top; ++i) {
+		c = luaL_checkinteger(L, i);
+		U8_APPEND_UNSAFE(str, len, c);
+	}
+	lua_pushlstring(L, str, len);
+	return 1;
+}
+
+/**
+ * Get byte offsets by symbol positions in a string. Positions can
+ * be negative, then they are counted in symbols from the end of
+ * the string: -1 is the last symbol. The resulting byte range is
+ * [start_offset, end_offset). An empty or inverted range is
+ * returned as end_offset equal to start_offset.
+ * @param s Original string.
+ * @param len Length of @an s in bytes.
+ * @param start_pos Start position (symbol offset). When not
+ *        negative, it is a count of symbols to skip from the
+ *        beginning.
+ * @param end_pos End position (symbol offset). When not negative,
+ *        it is a count of symbols from the beginning up to the
+ *        range end. When negative, the symbol at this position is
+ *        included.
+ * @param[out] start_offset_ Start position (byte offset).
+ * @param[out] end_offset_ End position (byte offset). Never less
+ *             than the start one.
+ */
+static void
+utf8_sub(const uint8_t *s, int len, int start_pos, int end_pos,
+	 int *start_offset_, int *end_offset_)
+{
+	int start_offset = 0, end_offset = len;
+	if (start_pos >= 0) {
+		U8_FWD_N(s, start_offset, len, start_pos);
+		if (end_pos >= 0) {
+			/* --[-------]---- ...  */
+			int n = end_pos - start_pos;
+			end_offset = start_offset;
+			U8_FWD_N(s, end_offset, len, n);
+		} else {
+			/* --[---- ... ----]--- */
+			int n = -(end_pos + 1);
+			U8_BACK_N(s, 0, end_offset, n);
+		}
+	} else {
+		int n;
+		if (end_pos < 0) {
+			/* ... -----[-----]--- */
+			n = -(end_pos + 1);
+			U8_BACK_N(s, 0, end_offset, n);
+			start_offset = end_offset;
+			n = end_pos - start_pos + 1;
+		} else {
+			/* ---]-- ... --[---- */
+			end_offset = 0;
+			U8_FWD_N(s, end_offset, len, end_pos);
+			n = -start_pos;
+			start_offset = len;
+		}
+		U8_BACK_N(s, 0, start_offset, n);
+	}
+	*start_offset_ = start_offset;
+	if (start_offset <= end_offset)
+		*end_offset_ = end_offset;
+	else
+		*end_offset_ = start_offset;
+}
+
+/**
+ * Get a substring from a UTF8 string.
+ * @param String to get a substring.
+ * @param Start position in symbol count. Mandatory, can be
+ *        negative.
+ * @param End position in symbol count. Optional, -1 by default,
+ *        can be negative.
+ *
+ * @retval Substring.
+ */
+static int
+utf8_lua_sub(struct lua_State *L)
+{
+	/* The start position is required, only the end is optional. */
+	if (lua_gettop(L) < 2 || !lua_isstring(L, 1))
+		return luaL_error(L, "Usage: utf8.sub(<string>, i, [j])");
+	int start_pos = luaL_checkinteger(L, 2);
+	/* A positive start is 1-based in Lua, 0-based in utf8_sub(). */
+	if (start_pos > 0)
+		--start_pos;
+	int end_pos = luaL_optinteger(L, 3, -1);
+	size_t slen;
+	const char *str = lua_tolstring(L, 1, &slen);
+	int len = (int) slen;
+	int start_offset, end_offset;
+	utf8_sub((const uint8_t *) str, len, start_pos, end_pos, &start_offset,
+		 &end_offset);
+	assert(end_offset >= start_offset);
+	lua_pushlstring(L, str + start_offset, end_offset - start_offset);
+	return 1;
+}
+
+/**
+ * Macro to easily create Lua wrappers for ICU symbol checkers.
+ * UCHAR32_CHECKER(name) defines utf8_<name>(), which calls ICU
+ * u_<name>() on its single argument.
+ * @param One symbol code or a string. A string must consist of
+ *        exactly one valid symbol: for an empty string, an
+ *        invalid symbol or several symbols the result is false.
+ * @retval True, if the symbol has a requested property. Else
+ *         false.
+ */
+#define UCHAR32_CHECKER(name) \
+static int \
+utf8_##name(struct lua_State *L) \
+{ \
+	if (lua_gettop(L) != 1) \
+		return luaL_error(L, "Usage: utf8."#name"(<string> or "\
+				     "<one symbol code>)"); \
+	UChar32 c; \
+	bool result = false; \
+	if (lua_type(L, 1) == LUA_TSTRING) { \
+		size_t slen; \
+		const char *str = lua_tolstring(L, 1, &slen); \
+		int len = (int) slen; \
+		if (len > 0) { \
+			int offset = 0; \
+			U8_NEXT(str, offset, len, c); \
+			result = c != U_SENTINEL && offset == len && \
+				 u_##name(c); \
+		} \
+	} else { \
+		result = u_##name(luaL_checkinteger(L, 1)); \
+	} \
+	lua_pushboolean(L, result); \
+	return 1; \
+}\
+
+UCHAR32_CHECKER(islower)
+UCHAR32_CHECKER(isupper)
+UCHAR32_CHECKER(isdigit)
+UCHAR32_CHECKER(isalpha)
+
+/**
+ * Common implementation of utf8.cmp and utf8.casecmp: compare two
+ * Lua strings using a collation and push the result.
+ * @param L Lua state. Exactly two string arguments are expected,
+ *        otherwise a Lua error with @a usage is raised.
+ * @param usage Message of the error raised on invalid arguments.
+ * @param coll Collation to compare with. Must not be NULL.
+ * @retval 1 The comparison result is pushed as an integer.
+ */
+static inline int
+utf8_cmp_impl(struct lua_State *L, const char *usage, struct coll *coll)
+{
+	assert(coll != NULL);
+	if (lua_gettop(L) != 2 || !lua_isstring(L, 1) ||
+	    !lua_isstring(L, 2)) {
+		/*
+		 * The message is passed as an argument, not as a
+		 * format string, so '%' in it is never interpreted.
+		 */
+		return luaL_error(L, "%s", usage);
+	}
+	size_t l1, l2;
+	const char *s1 = lua_tolstring(L, 1, &l1);
+	const char *s2 = lua_tolstring(L, 2, &l2);
+	lua_pushinteger(L, coll->cmp(s1, l1, s2, l2, coll));
+	return 1;
+}
+
+/**
+ * Compare two UTF8 strings using the unicode_coll collation.
+ * @param s1 First string.
+ * @param s2 Second string.
+ *
+ * @retval <0 s1 < s2.
+ * @retval >0 s1 > s2.
+ * @retval =0 s1 = s2.
+ */
+static int
+utf8_cmp(struct lua_State *L)
+{
+	return utf8_cmp_impl(L, "Usage: utf8.cmp(<string1>, <string2>)",
+			     unicode_coll);
+}
+
+/**
+ * Compare two UTF8 strings ignoring case, using the
+ * unicode_ci_coll collation.
+ * @param s1 First string.
+ * @param s2 Second string.
+ *
+ * @retval <0 s1 < s2.
+ * @retval >0 s1 > s2.
+ * @retval =0 s1 = s2.
+ */
+static int
+utf8_casecmp(struct lua_State *L)
+{
+	return utf8_cmp_impl(L, "Usage: utf8.casecmp(<string1>, <string2>)",
+			     unicode_ci_coll);
+}
+
+static const struct luaL_Reg utf8_lib[] = { /* Lua name, C handler. */
+	{"upper", utf8_upper},
+	{"lower", utf8_lower},
+	{"len", utf8_len},
+	{"next", utf8_next},
+	{"char", utf8_char},
+	{"sub", utf8_lua_sub},
+	{"islower", utf8_islower}, /* Defined via UCHAR32_CHECKER. */
+	{"isupper", utf8_isupper},
+	{"isdigit", utf8_isdigit},
+	{"isalpha", utf8_isalpha},
+	{"cmp", utf8_cmp},
+	{"casecmp", utf8_casecmp},
+	{NULL, NULL} /* End of the list. */
+};
+
+/**
+ * Create the ICU casemap and the collations used by the module
+ * and register the functions from utf8_lib under the name 'utf8'.
+ * On a failure a Lua error is raised; collations created so far
+ * are released via tarantool_lua_utf8_free().
+ * @param L Lua state to register the module in.
+ */
+void
+tarantool_lua_utf8_init(struct lua_State *L)
+{
+	UErrorCode err = U_ZERO_ERROR;
+	root_map = ucasemap_open("", 0, &err);
+	if (root_map == NULL) {
+		/*
+		 * Let luaL_error() format the message itself
+		 * instead of using an already built string as a
+		 * format.
+		 */
+		luaL_error(L, "error in ICU ucasemap_open: %s",
+			   u_errorName(err));
+	}
+	struct coll_def def;
+	memset(&def, 0, sizeof(def));
+	/* An all-zero definition is used for the case-aware collation. */
+	unicode_coll = coll_new(&def);
+	if (unicode_coll == NULL)
+		goto error_coll;
+	/* Primary strength is used for the case-insensitive one. */
+	def.icu.strength = COLL_ICU_STRENGTH_PRIMARY;
+	unicode_ci_coll = coll_new(&def);
+	if (unicode_ci_coll == NULL)
+		goto error_coll;
+	luaL_register(L, "utf8", utf8_lib);
+	lua_pop(L, 1);
+	return;
+error_coll:
+	tarantool_lua_utf8_free();
+	luaT_error(L);
+}
+
+/**
+ * Release the casemap and the collations created by
+ * tarantool_lua_utf8_init(). Each released pointer is reset to
+ * NULL, so the function can be called repeatedly, including
+ * after the cleanup done on a failed init.
+ */
+void
+tarantool_lua_utf8_free(void)
+{
+	if (root_map != NULL) {
+		ucasemap_close(root_map);
+		root_map = NULL;
+	}
+	if (unicode_coll != NULL) {
+		coll_unref(unicode_coll);
+		unicode_coll = NULL;
+	}
+	if (unicode_ci_coll != NULL) {
+		coll_unref(unicode_ci_coll);
+		unicode_ci_coll = NULL;
+	}
+}
diff --git a/src/lua/utf8.h b/src/lua/utf8.h
new file mode 100644
index 000000000..567ad51f7
--- /dev/null
+++ b/src/lua/utf8.h
@@ -0,0 +1,42 @@
+#ifndef TARANTOOL_LUA_UTF8_H_INCLUDED
+#define TARANTOOL_LUA_UTF8_H_INCLUDED
+/*
+ * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ *    copyright notice, this list of conditions and the
+ *    following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ *    copyright notice, this list of conditions and the following
+ *    disclaimer in the documentation and/or other materials
+ *    provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+struct lua_State;
+
+/**
+ * Initialize the utf8 module and make it available in a Lua
+ * state.
+ * @param L Lua state.
+ */
+void
+tarantool_lua_utf8_init(struct lua_State *L);
+
+/**
+ * Free the utf8 module resources. Declared with (void) to be a
+ * real prototype: in C an empty parameter list does not check
+ * arguments.
+ */
+void
+tarantool_lua_utf8_free(void);
+
+#endif /* TARANTOOL_LUA_UTF8_H_INCLUDED */
diff --git a/test/app-tap/string.test.lua b/test/app-tap/string.test.lua
index 852a7923c..1d10dcfc9 100755
--- a/test/app-tap/string.test.lua
+++ b/test/app-tap/string.test.lua
@@ -3,7 +3,7 @@
 local tap = require('tap')
 local test = tap.test("string extensions")
 
-test:plan(5)
+test:plan(6)
 
 test:test("split", function(test)
     test:plan(10)
@@ -128,4 +128,165 @@ test:test("strip", function(test)
     test:ok(err and err:match("%(string expected, got number%)"))
 end )
 
+test:test("unicode", function(test)
+    test:plan(102)
+    local str = 'хеЛлоу вОрЛд ё Ё я Я э Э ъ Ъ hElLo WorLd 1234 i I İ 勺#☢༺'
+    local upper_res = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 I I İ 勺#☢༺'
+    local lower_res = 'хеллоу ворлд ё ё я я э э ъ ъ hello world 1234 i i i̇ 勺#☢༺'
+    local s = utf8.upper(str)
+    test:is(s, upper_res, 'default locale upper')
+    s = utf8.lower(str)
+    test:is(s, lower_res, 'default locale lower')
+    test:is(utf8.upper(''), '', 'empty string upper')
+    test:is(utf8.lower(''), '', 'empty string lower')
+    local err
+    s, err = pcall(utf8.upper, true)
+    test:isnt(err:find('Usage'), nil, 'upper usage is checked')
+    s, err = pcall(utf8.lower, true)
+    test:isnt(err:find('Usage'), nil, 'lower usage is checked')
+
+    test:is(utf8.isupper('a'), false, 'isupper("a")')
+    test:is(utf8.isupper('A'), true, 'isupper("A")')
+    test:is(utf8.islower('a'), true, 'islower("a")')
+    test:is(utf8.islower('A'), false, 'islower("A")')
+    test:is(utf8.isalpha('a'), true, 'isalpha("a")')
+    test:is(utf8.isalpha('A'), true, 'isalpha("A")')
+    test:is(utf8.isalpha('aa'), false, 'isalpha("aa")')
+    test:is(utf8.isalpha('勺'), true, 'isalpha("勺")')
+    test:is(utf8.isupper('Ё'), true, 'isupper("Ё")')
+    test:is(utf8.islower('ё'), true, 'islower("ё")')
+    test:is(utf8.isdigit('a'), false, 'isdigit("a")')
+    test:is(utf8.isdigit('1'), true, 'isdigit("1")')
+    test:is(utf8.isdigit('9'), true, 'isdigit("9")')
+
+    test:is(utf8.len(str), 56, 'len works on complex string')
+    s = '12İ☢勺34'
+    test:is(utf8.len(s), 7, 'len works no options')
+    test:is(utf8.len(s, 1), 7, 'default start is 1')
+    test:is(utf8.len(s, 2), 6, 'start 2')
+    test:is(utf8.len(s, 3), 5, 'start 3')
+    local c
+    c, err = utf8.len(s, 4)
+    test:isnil(c, 'middle of symbol offset is error')
+    test:is(err, 4, 'error on 4 byte')
+    test:is(utf8.len(s, 5), 4, 'start 5')
+    c, err = utf8.len(s, 6)
+    test:is(err, 6, 'error on 6 byte')
+    c, err = utf8.len(s, 0)
+    test:is(err, 'position is out of string', 'range is out of string')
+    test:is(utf8.len(s, #s), 1, 'start from the end')
+    test:is(utf8.len(s, #s + 1), 0, 'position is out of string')
+    test:is(utf8.len(s, 1, -1), 7, 'default end is -1')
+    test:is(utf8.len(s, 1, -2), 6, 'end -2')
+    test:is(utf8.len(s, 1, -3), 5, 'end -3')
+    test:is(utf8.len(s, 1, -4), 5, 'end in the middle of symbol')
+    test:is(utf8.len(s, 1, -5), 5, 'end in the middle of symbol')
+    test:is(utf8.len(s, 1, -6), 5, 'end in the middle of symbol')
+    test:is(utf8.len(s, 1, -7), 4, 'end -7')
+    test:is(utf8.len(s, 2, -7), 3, '[2, -7]')
+    test:is(utf8.len(s, 3, -7), 2, '[3, -7]')
+    c, err = utf8.len(s, 4, -7)
+    test:is(err, 4, '[4, -7] is error - start from the middle of symbol')
+    test:is(utf8.len(s, 10, -100), 0, 'it is ok to be out of str by end pos')
+    test:is(utf8.len(s, 10, -10), 0, 'it is ok to swap end and start pos')
+    test:is(utf8.len(''), 0, 'empty len')
+    test:is(utf8.len(s, -6, -1), 3, 'pass both negative offsets')
+    test:is(utf8.len(s, 3, 3), 1, "end in the middle on the same symbol as start")
+    c, err = utf8.len('a\xF4')
+    test:is(err, 2, "invalid unicode in the middle of the string")
+
+    local chars = {}
+    local codes = {}
+    for _, code in utf8.next, s do
+        table.insert(chars, utf8.char(code))
+        table.insert(codes, code)
+    end
+    test:is(table.concat(chars), s, "next and char works")
+    c, err = pcall(utf8.char, 'kek')
+    test:isnt(err:find('bad argument'), nil, 'char usage is checked')
+    c, err = pcall(utf8.next, true)
+    test:isnt(err:find('Usage'), nil, 'next usage is checked')
+    c, err = pcall(utf8.next, '1234', true)
+    test:isnt(err:find('bad argument'), nil, 'next usage is checked')
+    local offset
+    offset, c = utf8.next('')
+    test:isnil(offset, 'next on empty - nil offset')
+    test:isnil(c, 'next on empty - nil code')
+    offset, c = utf8.next('123', 100)
+    test:isnil(offset, 'out of string - nil offset')
+    test:isnil(c, 'out of string - nil code')
+    test:is(utf8.char(unpack(codes)), s, 'char with multiple values')
+
+    local uppers = 0
+    local lowers = 0
+    local digits = 0
+    local letters = 0
+    for _, code in utf8.next, str do
+        if utf8.isupper(code) then uppers = uppers + 1 end
+        if utf8.islower(code) then lowers = lowers + 1 end
+        if utf8.isalpha(code) then letters = letters + 1 end
+        if utf8.isdigit(code) then digits = digits + 1 end
+    end
+    test:is(uppers, 13, 'uppers by code')
+    test:is(lowers, 19, 'lowers by code')
+    test:is(letters, 33, 'letters by code')
+    test:is(digits, 4, 'digits by code')
+
+    s = '12345678'
+    test:is(utf8.sub(s, 1, 1), '1', 'sub [1]')
+    test:is(utf8.sub(s, 1, 2), '12', 'sub [1:2]')
+    test:is(utf8.sub(s, 2, 2), '2', 'sub [2:2]')
+    test:is(utf8.sub(s, 0, 2), '12', 'sub [0:2]')
+    test:is(utf8.sub(s, 3, 7), '34567', 'sub [3:7]')
+    test:is(utf8.sub(s, 7, 3), '', 'sub [7:3]')
+    test:is(utf8.sub(s, 3, 100), '345678', 'sub [3:100]')
+    test:is(utf8.sub(s, 100, 3), '', 'sub [100:3]')
+
+    test:is(utf8.sub(s, 5), '5678', 'sub [5:]')
+    test:is(utf8.sub(s, 1, -1), s, 'sub [1:-1]')
+    test:is(utf8.sub(s, 1, -2), '1234567', 'sub [1:-2]')
+    test:is(utf8.sub(s, 2, -2), '234567', 'sub [2:-2]')
+    test:is(utf8.sub(s, 3, -3), '3456', 'sub [3:-3]')
+    test:is(utf8.sub(s, 5, -4), '5', 'sub [5:-4]')
+    test:is(utf8.sub(s, 7, -7), '', 'sub[7:-7]')
+
+    test:is(utf8.sub(s, -2, -1), '78', 'sub [-2:-1]')
+    test:is(utf8.sub(s, -1, -1), '8', 'sub [-1:-1]')
+    test:is(utf8.sub(s, -4, -2), '567', 'sub [-4:-2]')
+    test:is(utf8.sub(s, -400, -2), '1234567', 'sub [-400:-2]')
+    test:is(utf8.sub(s, -3, -5), '', 'sub [-3:-5]')
+
+    test:is(utf8.sub(s, -6, 5), '345', 'sub [-6:5]')
+    test:is(utf8.sub(s, -5, 4), '4', 'sub [-5:4]')
+    test:is(utf8.sub(s, -2, 2), '', 'sub [-2:2]')
+    test:is(utf8.sub(s, -1, 8), '8', 'sub [-1:8]')
+
+    c, err = pcall(utf8.sub)
+    test:isnt(err:find('Usage'), nil, 'usage is checked')
+    c, err = pcall(utf8.sub, true)
+    test:isnt(err:find('Usage'), nil, 'usage is checked')
+    c, err = pcall(utf8.sub, '123')
+    test:isnt(err:find('Usage'), nil, 'usage is checked')
+    c, err = pcall(utf8.sub, '123', true)
+    test:isnt(err:find('bad argument'), nil, 'usage is checked')
+    c, err = pcall(utf8.sub, '123', 1, true)
+    test:isnt(err:find('bad argument'), nil, 'usage is checked')
+
+    local s1 = '☢'
+    local s2 = 'İ'
+    test:is(s1 < s2, false, 'test binary cmp')
+    test:is(utf8.cmp(s1, s2) < 0, true, 'test unicode <')
+    test:is(utf8.cmp(s1, s1) == 0, true, 'test unicode eq')
+    test:is(utf8.cmp(s2, s1) > 0, true, 'test unicode >')
+    test:is(utf8.casecmp('a', 'A') == 0, true, 'test icase ==')
+    test:is(utf8.casecmp('b', 'A') > 0, true, 'test icase >, first')
+    test:is(utf8.casecmp('B', 'a') > 0, true, 'test icase >, second >')
+    test:is(utf8.cmp('', '') == 0, true, 'test empty compare')
+    test:is(utf8.cmp('', 'a') < 0, true, 'test left empty compare')
+    test:is(utf8.cmp('a', '') > 0, true, 'test right empty compare')
+    test:is(utf8.casecmp('', '') == 0, true, 'test empty icompare')
+    test:is(utf8.casecmp('', 'a') < 0, true, 'test left empty icompare')
+    test:is(utf8.casecmp('a', '') > 0, true, 'test right empty icompare')
+end)
+
 os.exit(test:check() == true and 0 or -1)
diff --git a/test/box/ddl.result b/test/box/ddl.result
index f249f8fe3..30f0cf7ec 100644
--- a/test/box/ddl.result
+++ b/test/box/ddl.result
@@ -500,6 +500,21 @@ box.space._collation.index.name:delete{'test'}
 - [3, 'test', 0, 'ICU', 'ru_RU', {}]
 ...
 --
+-- gh-3290: expose ICU into Lua. It uses built-in collations, that
+-- must work even if a collation is deleted from _collation.
+--
+t = box.space._collation:delete{1}
+---
+...
+utf8.cmp('abc', 'def')
+---
+- -1
+...
+box.space._collation:replace(t)
+---
+- [1, 'unicode', 1, 'ICU', '', {}]
+...
+--
 -- gh-2839: allow to store custom fields in field definition.
 --
 format = {}
diff --git a/test/box/ddl.test.lua b/test/box/ddl.test.lua
index 6029c6eb6..ebbefe77b 100644
--- a/test/box/ddl.test.lua
+++ b/test/box/ddl.test.lua
@@ -191,6 +191,14 @@ test_run:cmd('restart server default')
 box.space._collation:select{}
 box.space._collation.index.name:delete{'test'}
 
+--
+-- gh-3290: expose ICU into Lua. It uses built-in collations, that
+-- must work even if a collation is deleted from _collation.
+--
+t = box.space._collation:delete{1}
+utf8.cmp('abc', 'def')
+box.space._collation:replace(t)
+
 --
 -- gh-2839: allow to store custom fields in field definition.
 --
-- 
2.15.1 (Apple Git-101)

     prev parent reply	other threads:[~2018-05-15 19:54 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-05-15 19:54 [tarantool-patches] [PATCH v3 0/4] Lua utf8 module Vladislav Shpilevoy
2018-05-15 19:54 ` [tarantool-patches] [PATCH v3 1/4] error: introduce error rebulding API Vladislav Shpilevoy
2018-05-16 17:06   ` [tarantool-patches] " Vladislav Shpilevoy
2018-05-15 19:54 ` [tarantool-patches] [PATCH v3 2/4] collation: split collation into core and box objects Vladislav Shpilevoy
2018-05-16 17:07   ` [tarantool-patches] " Vladislav Shpilevoy
2018-05-16 17:17     ` Konstantin Osipov
2018-05-16 17:19       ` Vladislav Shpilevoy
2018-05-17 19:23   ` Vladislav Shpilevoy
2018-05-15 19:54 ` [tarantool-patches] [PATCH v3 3/4] collation: introduce collation fingerprint Vladislav Shpilevoy
2018-05-17 19:24   ` [tarantool-patches] " Vladislav Shpilevoy
2018-05-15 19:54 ` Vladislav Shpilevoy [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=b28ae2d61d92c8687ca525dcd96ff073f3e795fa.1526414017.git.v.shpilevoy@tarantool.org \
    --to=v.shpilevoy@tarantool.org \
    --cc=kostja@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [tarantool-patches] [PATCH v3 4/4] lua: introduce utf8 built-in globaly visible module' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox