[tarantool-patches] [PATCH v3 4/4] lua: introduce utf8 built-in globally visible module
Vladislav Shpilevoy
v.shpilevoy at tarantool.org
Tue May 15 22:54:08 MSK 2018
utf8 is a module partially compatible with the Lua 5.3 utf8 module
and with the third-party lua-utf8 module.
"Partially" means that not all functions are implemented.
The patch introduces the following ones:
upper, lower, len, char, sub, next.
Len and char work exactly like in Lua 5.3. The other functions work
like in lua-utf8, because they are not present in Lua 5.3.
Tarantool utf8 has extensions:
* isupper/islower/isalpha/isdigit, which check a property of a
symbol passed either as a string or as its code;
* cmp/casecmp, that compare two UTF8 strings.
Closes #3290
Closes #3385
Closes #3081
---
src/CMakeLists.txt | 3 +-
src/lua/init.c | 3 +
src/lua/utf8.c | 479 +++++++++++++++++++++++++++++++++++++++++++
src/lua/utf8.h | 42 ++++
test/app-tap/string.test.lua | 163 ++++++++++++++-
test/box/ddl.result | 15 ++
test/box/ddl.test.lua | 8 +
7 files changed, 711 insertions(+), 2 deletions(-)
create mode 100644 src/lua/utf8.c
create mode 100644 src/lua/utf8.h
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5bf17614b..2a952923e 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -166,6 +166,7 @@ set (server_sources
lua/fio.c
lua/crypto.c
lua/httpc.c
+ lua/utf8.c
${lua_sources}
${PROJECT_SOURCE_DIR}/third_party/lua-yaml/lyaml.cc
${PROJECT_SOURCE_DIR}/third_party/lua-yaml/b64.c
@@ -210,7 +211,7 @@ endif()
set_source_files_compile_flags(${server_sources})
add_library(server STATIC ${server_sources})
-target_link_libraries(server core bit uri uuid)
+target_link_libraries(server core bit uri uuid ${ICU_LIBRARIES})
# Rule of thumb: if exporting a symbol from a static library, list the
# library here.
diff --git a/src/lua/init.c b/src/lua/init.c
index a0a7f63f6..58af1d121 100644
--- a/src/lua/init.c
+++ b/src/lua/init.c
@@ -57,6 +57,7 @@
#include "lua/pickle.h"
#include "lua/fio.h"
#include "lua/httpc.h"
+#include "lua/utf8.h"
#include "digest.h"
#include <small/ibuf.h>
@@ -399,6 +400,7 @@ tarantool_lua_init(const char *tarantool_bin, int argc, char **argv)
lua_call(L, 0, 0);
lua_register(L, "tonumber64", lbox_tonumber64);
+ tarantool_lua_utf8_init(L);
tarantool_lua_utils_init(L);
tarantool_lua_fiber_init(L);
tarantool_lua_fiber_cond_init(L);
@@ -629,6 +631,7 @@ tarantool_lua_run_script(char *path, bool interactive,
void
tarantool_lua_free()
{
+ tarantool_lua_utf8_free();
/*
* Some part of the start script panicked, and called
* exit(). The call stack in this case leads us back to
diff --git a/src/lua/utf8.c b/src/lua/utf8.c
new file mode 100644
index 000000000..e3b2b0a7f
--- /dev/null
+++ b/src/lua/utf8.c
@@ -0,0 +1,479 @@
+/*
+ * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the
+ * following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <unicode/ucasemap.h>
+#include <unicode/uchar.h>
+#include <coll.h>
+#include "lua/utils.h"
+#include "lua/utf8.h"
+#include "diag.h"
+#include "small/ibuf.h"
+
+extern struct ibuf *tarantool_lua_ibuf;
+
+/**
+ * Default universal casemap for case transformations. It is opened
+ * for the "" locale in tarantool_lua_utf8_init() and closed in
+ * tarantool_lua_utf8_free().
+ */
+static UCaseMap *root_map = NULL;
+
+/**
+ * Collations for cmp/casecmp functions. Both are created in
+ * tarantool_lua_utf8_init() from a zeroed collation definition;
+ * for unicode_ci_coll the ICU strength is additionally set to
+ * COLL_ICU_STRENGTH_PRIMARY.
+ */
+static struct coll *unicode_coll = NULL;
+static struct coll *unicode_ci_coll = NULL;
+
+/**
+ * Convert a UTF8 string to upper or lower case using root_map and
+ * push the result onto the Lua stack. The result is built in
+ * tarantool_lua_ibuf, which is reset on each attempt.
+ * @param L Lua state.
+ * @param src String to convert.
+ * @param src_bsize Byte size of @a src.
+ * @param is_to_upper True - convert to upper case, false - to
+ *        lower case.
+ * @retval 1 The converted string is pushed.
+ * @retval 2 Nil and an error message are pushed.
+ */
+static int
+utf8_str_to_case(struct lua_State *L, const char *src, int src_bsize,
+		 bool is_to_upper)
+{
+	/*
+	 * Nothing to transform. The early exit also means that a
+	 * zero-sized chunk is never requested from ibuf_alloc().
+	 */
+	if (src_bsize == 0) {
+		lua_pushlstring(L, "", 0);
+		return 1;
+	}
+	int attempt = 0;
+	int dst_bsize = src_bsize;
+	(void) attempt;
+	do {
+		UErrorCode err = U_ZERO_ERROR;
+		ibuf_reset(tarantool_lua_ibuf);
+		char *dst = ibuf_alloc(tarantool_lua_ibuf, dst_bsize);
+		if (dst == NULL) {
+			diag_set(OutOfMemory, dst_bsize, "ibuf_alloc", "dst");
+			return luaT_error(L);
+		}
+		int real_bsize;
+		if (is_to_upper) {
+			real_bsize = ucasemap_utf8ToUpper(root_map, dst,
+							  dst_bsize, src,
+							  src_bsize, &err);
+		} else {
+			real_bsize = ucasemap_utf8ToLower(root_map, dst,
+							  dst_bsize, src,
+							  src_bsize, &err);
+		}
+		if (err == U_ZERO_ERROR ||
+		    err == U_STRING_NOT_TERMINATED_WARNING) {
+			/*
+			 * A missing terminating zero is fine: the
+			 * string is pushed with an explicit length.
+			 */
+			lua_pushlstring(L, dst, real_bsize);
+			return 1;
+		}
+		if (err != U_BUFFER_OVERFLOW_ERROR) {
+			lua_pushnil(L);
+			lua_pushstring(L, tt_sprintf("error during ICU case "
+						     "transform: %s",
+						     u_errorName(err)));
+			return 2;
+		}
+		assert(real_bsize > dst_bsize);
+		dst_bsize = real_bsize;
+		/*
+		 * On a first run either all is ok, or
+		 * toLower/Upper returned needed bsize, that is
+		 * allocated on a second iteration. Third
+		 * iteration is not possible. The counter is
+		 * incremented outside of assert() so that the
+		 * assertion has no side effects.
+		 */
+		++attempt;
+		assert(attempt < 2);
+	} while (true);
+	unreachable();
+	return 0;
+}
+
+/**
+ * Lua utf8.upper(): make an upper case copy of a UTF8 string.
+ * @param String to convert. It must be the only argument.
+ * @retval not nil The string converted to upper case.
+ * @retval nil, error The case transformation failed.
+ */
+static int
+utf8_upper(struct lua_State *L)
+{
+	bool is_usage_ok = lua_gettop(L) == 1 && lua_isstring(L, 1);
+	if (!is_usage_ok)
+		return luaL_error(L, "Usage: utf8.upper(<string>)");
+	size_t src_bsize;
+	const char *src = lua_tolstring(L, 1, &src_bsize);
+	return utf8_str_to_case(L, src, src_bsize, true);
+}
+
+/**
+ * Lua utf8.lower(): make a lower case copy of a UTF8 string.
+ * @param String to convert. It must be the only argument.
+ * @retval not nil The string converted to lower case.
+ * @retval nil, error The case transformation failed.
+ */
+static int
+utf8_lower(struct lua_State *L)
+{
+	bool is_usage_ok = lua_gettop(L) == 1 && lua_isstring(L, 1);
+	if (!is_usage_ok)
+		return luaL_error(L, "Usage: utf8.lower(<string>)");
+	size_t src_bsize;
+	const char *src = lua_tolstring(L, 1, &src_bsize);
+	return utf8_str_to_case(L, src, src_bsize, false);
+}
+
+/**
+ * Calculate a 1-based positive byte offset in a string by any
+ * 1-based offset (possibly negative). A negative offset is counted
+ * from the end of the string: -1 is the last byte.
+ * @param offset Original 1-based offset with any sign.
+ * @param len A string byte length.
+ * @retval Non-negative @a offset is returned as is. A negative one
+ *         is converted into a positive 1-based offset, or into 0
+ *         when it points before the beginning of the string.
+ */
+static inline int
+utf8_convert_offset(int offset, size_t len)
+{
+	if (offset >= 0)
+		return offset;
+	/*
+	 * Negate in a wider type: -offset overflows int when the
+	 * offset is INT_MIN.
+	 */
+	size_t back = (size_t) -(long long) offset;
+	if (back > len)
+		return 0;
+	return (int) (len - back + 1);
+}
+
+/**
+ * Calculate length of a UTF8 string. Length here is symbol count.
+ * Works like utf8.len in Lua 5.3. Can take negative offsets. A
+ * negative offset is an offset from the end of string.
+ * A positive start position must be inside the string or right
+ * after its last byte; an end position must not exceed the string
+ * length.
+ * @param String to get length.
+ * @param Start byte offset in [1, #str + 1]. Must point to the
+ *        start of symbol. On invalid symbol an error is returned.
+ * @param End byte offset in [0, #str]. Can point to the middle of
+ *        symbol. Partial symbol is counted too.
+ * @retval not nil Symbol count.
+ * @retval nil, number Error. 1-based byte position where the
+ *         invalid symbol starts is returned in the second value.
+ * @retval nil, string Error. Reason is returned in the second
+ *         value.
+ */
+static int
+utf8_len(struct lua_State *L)
+{
+	if (lua_gettop(L) > 3 || !lua_isstring(L, 1))
+		return luaL_error(L, "Usage: utf8.len(<string>, [i, [j]])");
+	size_t slen;
+	const char *str = lua_tolstring(L, 1, &slen);
+	int len = (int) slen;
+	int start_pos = utf8_convert_offset(luaL_optinteger(L, 2, 1), len);
+	int end_pos = utf8_convert_offset(luaL_optinteger(L, 3, -1), len);
+	/* From here start_pos is 0-based, end_pos stays 1-based. */
+	if (start_pos < 1 || --start_pos > len || end_pos > len) {
+		lua_pushnil(L);
+		lua_pushstring(L, "position is out of string");
+		return 2;
+	}
+	int result = 0;
+	while (start_pos < end_pos) {
+		/*
+		 * Remember where the symbol starts: U8_NEXT moves
+		 * the offset forward even when decoding fails, and
+		 * may step over several bytes of a broken sequence.
+		 */
+		int symbol_pos = start_pos;
+		UChar32 c;
+		U8_NEXT(str, start_pos, len, c);
+		if (c == U_SENTINEL) {
+			lua_pushnil(L);
+			lua_pushinteger(L, symbol_pos + 1);
+			return 2;
+		}
+		++result;
+	}
+	lua_pushinteger(L, result);
+	return 1;
+}
+
+/**
+ * Decode one symbol of a UTF8 string starting from a byte offset.
+ * Suitable as a generic-for iterator: the first returned value is
+ * the offset to pass on the next call.
+ * @param String to decode.
+ * @param 1-based byte offset of the symbol. Optional, 1 by
+ *        default. Can be negative.
+ *
+ * @retval - Nothing is returned when the offset is at or after
+ *         the end of the string, or the symbol is invalid.
+ * @retval not nil, not nil 1-based byte offset following the
+ *         decoded symbol, and the symbol code.
+ */
+static int
+utf8_next(struct lua_State *L)
+{
+	if (lua_gettop(L) > 2 || !lua_isstring(L, 1))
+		return luaL_error(L, "Usage: utf8.next(<string>, "
+				  "[<byte offset>])");
+	size_t bsize;
+	const char *s = lua_tolstring(L, 1, &bsize);
+	int end = (int) bsize;
+	int offset = utf8_convert_offset(luaL_optinteger(L, 2, 1), end);
+	/* Make the offset 0-based; 0 already means the first byte. */
+	int i = offset > 0 ? offset - 1 : offset;
+	if (i < end) {
+		UChar32 code;
+		U8_NEXT(s, i, end, code);
+		if (code != U_SENTINEL) {
+			lua_pushinteger(L, i + 1);
+			lua_pushinteger(L, code);
+			return 2;
+		}
+	}
+	return 0;
+}
+
+/**
+ * Get a char code from a Lua stack slot and check that it belongs
+ * to [UCHAR_MIN_VALUE, UCHAR_MAX_VALUE]. The check is done before
+ * the value is narrowed to UChar32. Raises a Lua argument error if
+ * the value is not an integer or is out of range.
+ */
+static inline UChar32
+utf8_check_char_code(struct lua_State *L, int idx)
+{
+	lua_Integer code = luaL_checkinteger(L, idx);
+	luaL_argcheck(L, code >= UCHAR_MIN_VALUE && code <= UCHAR_MAX_VALUE,
+		      idx, "value out of range");
+	return (UChar32) code;
+}
+
+/**
+ * Convert a UTF8 char code (or codes) into Lua string. When
+ * multiple codes are provided, they are concatenated into a
+ * single string. Like in Lua 5.3, a code out of the Unicode range
+ * raises a 'value out of range' argument error.
+ * @param Char codes, at least one.
+ * @retval Result UTF8 string.
+ */
+static int
+utf8_char(struct lua_State *L)
+{
+	int top = lua_gettop(L);
+	if (top < 1)
+		return luaL_error(L, "Usage: utf8.char(<char code>)");
+	int len = 0;
+	UChar32 c;
+	/* Fast way - convert one symbol. */
+	if (top == 1) {
+		char buf[U8_MAX_LENGTH];
+		c = utf8_check_char_code(L, 1);
+		/* A validated code needs at most U8_MAX_LENGTH bytes. */
+		U8_APPEND_UNSAFE(buf, len, c);
+		assert(len <= (int)sizeof(buf));
+		lua_pushlstring(L, buf, len);
+		return 1;
+	}
+	/* Slow way - use dynamic buffer. */
+	ibuf_reset(tarantool_lua_ibuf);
+	char *str = ibuf_alloc(tarantool_lua_ibuf, top * U8_MAX_LENGTH);
+	if (str == NULL) {
+		diag_set(OutOfMemory, top * U8_MAX_LENGTH, "ibuf_alloc",
+			 "str");
+		return luaT_error(L);
+	}
+	for (int i = 1; i <= top; ++i) {
+		c = utf8_check_char_code(L, i);
+		U8_APPEND_UNSAFE(str, len, c);
+	}
+	lua_pushlstring(L, str, len);
+	return 1;
+}
+
+/**
+ * Get byte offsets by symbol positions in a string. Positions can
+ * be negative.
+ *
+ * A non-negative start_pos is the number of symbols to skip from
+ * the beginning of the string. A non-negative end_pos is the number
+ * of symbols between the beginning of the string and the end of the
+ * range. A negative position is counted in symbols from the end of
+ * the string, -1 is the last symbol. When the calculated start is
+ * after the calculated end, an empty range located at the start is
+ * returned.
+ * @param s Original string.
+ * @param len Length of @a s in bytes.
+ * @param start_pos Start position (symbol offset).
+ * @param end_pos End position (symbol offset).
+ * @param[out] start_offset_ Start position (byte offset).
+ * @param[out] end_offset_ End position (byte offset), never less
+ *        than @a start_offset_.
+ */
+static void
+utf8_sub(const uint8_t *s, int len, int start_pos, int end_pos,
+	 int *start_offset_, int *end_offset_)
+{
+	int start_offset = 0, end_offset = len;
+	if (start_pos >= 0) {
+		U8_FWD_N(s, start_offset, len, start_pos);
+		if (end_pos >= 0) {
+			/*
+			 * --[-------]---- ...
+			 * Both are counted from the beginning: the
+			 * end is found by moving forward from the
+			 * start.
+			 */
+			int n = end_pos - start_pos;
+			end_offset = start_offset;
+			U8_FWD_N(s, end_offset, len, n);
+		} else {
+			/*
+			 * --[---- ... ----]---
+			 * The end is found by moving back from the
+			 * end of the string.
+			 */
+			int n = -(end_pos + 1);
+			U8_BACK_N(s, 0, end_offset, n);
+		}
+	} else {
+		int n;
+		if (end_pos < 0) {
+			/*
+			 * ... -----[-----]---
+			 * Both are counted from the end: find the
+			 * end first, then move the start back from
+			 * it.
+			 */
+			n = -(end_pos + 1);
+			U8_BACK_N(s, 0, end_offset, n);
+			start_offset = end_offset;
+			n = end_pos - start_pos + 1;
+		} else {
+			/*
+			 * ---]-- ... --[----
+			 * The end is counted from the beginning,
+			 * the start - from the end of the string.
+			 */
+			end_offset = 0;
+			U8_FWD_N(s, end_offset, len, end_pos);
+			n = -start_pos;
+			start_offset = len;
+		}
+		U8_BACK_N(s, 0, start_offset, n);
+	}
+	*start_offset_ = start_offset;
+	if (start_offset <= end_offset)
+		*end_offset_ = end_offset;
+	else
+		*end_offset_ = start_offset;
+}
+
+/**
+ * Get a substring from a UTF8 string.
+ * @param String to get a substring.
+ * @param Start position in symbol count. Mandatory, can be
+ *        negative. 0 is treated like 1.
+ * @param End position in symbol count. Optional, -1 by default,
+ *        can be negative.
+ *
+ * @retval Substring. It is empty when the range is empty.
+ */
+static int
+utf8_lua_sub(struct lua_State *L)
+{
+	/* The start position is mandatory, so is 2 arguments. */
+	if (lua_gettop(L) < 2 || !lua_isstring(L, 1))
+		return luaL_error(L, "Usage: utf8.sub(<string>, i, [j])");
+	int start_pos = luaL_checkinteger(L, 2);
+	/* utf8_sub() takes a positive start as a 0-based one. */
+	if (start_pos > 0)
+		--start_pos;
+	int end_pos = luaL_optinteger(L, 3, -1);
+	size_t slen;
+	const char *str = lua_tolstring(L, 1, &slen);
+	int len = (int) slen;
+	int start_offset, end_offset;
+	utf8_sub((const uint8_t *) str, len, start_pos, end_pos, &start_offset,
+		 &end_offset);
+	assert(end_offset >= start_offset);
+	lua_pushlstring(L, str + start_offset, end_offset - start_offset);
+	return 1;
+}
+
+/**
+ * Common body of the Lua wrappers for ICU symbol checkers.
+ * @param L Lua state. The only argument on its stack is either a
+ *        string or a symbol code.
+ * @param usage Message of the error raised on a wrong argument
+ *        count.
+ * @param check ICU checker applied to the symbol code.
+ * @retval 1 A boolean is pushed. For a string it is true only if
+ *         the string consists of exactly one valid symbol, for
+ *         which @a check is true.
+ */
+static inline int
+utf8_check_symbol(struct lua_State *L, const char *usage,
+		  UBool (*check)(UChar32))
+{
+	if (lua_gettop(L) != 1)
+		return luaL_error(L, usage);
+	bool result;
+	if (lua_type(L, 1) != LUA_TSTRING) {
+		result = check(luaL_checkinteger(L, 1));
+	} else {
+		size_t slen;
+		const char *str = lua_tolstring(L, 1, &slen);
+		int len = (int) slen;
+		result = false;
+		if (len > 0) {
+			int offset = 0;
+			UChar32 c;
+			U8_NEXT(str, offset, len, c);
+			/* The symbol must occupy the whole string. */
+			result = c != U_SENTINEL && offset == len &&
+				 check(c);
+		}
+	}
+	lua_pushboolean(L, result);
+	return 1;
+}
+
+/**
+ * Macro to easily create Lua wrappers for ICU symbol checkers:
+ * utf8_<name>() is a wrapper for u_<name>().
+ * @param One symbol code or string.
+ * @retval True, if the symbol has a requested property. Else
+ *         false.
+ */
+#define UCHAR32_CHECKER(name) \
+static int \
+utf8_##name(struct lua_State *L) \
+{ \
+	return utf8_check_symbol(L, "Usage: utf8."#name"(<string> or " \
+				 "<one symbol code>)", u_##name); \
+}
+
+UCHAR32_CHECKER(islower)
+UCHAR32_CHECKER(isupper)
+UCHAR32_CHECKER(isdigit)
+UCHAR32_CHECKER(isalpha)
+
+/**
+ * Common implementation of utf8.cmp() and utf8.casecmp(): compare
+ * two strings from the Lua stack using a collation.
+ * @param L Lua state with exactly two strings on the stack.
+ * @param usage Message of the Lua error raised on wrong arguments.
+ * @param coll Collation to compare with. Must not be NULL.
+ * @retval 1 The integer returned by the collation comparator is
+ *         pushed.
+ */
+static inline int
+utf8_cmp_impl(struct lua_State *L, const char *usage, struct coll *coll)
+{
+	assert(coll != NULL);
+	/*
+	 * The usage text is passed as an argument of a literal
+	 * format, so it is never interpreted as a format itself.
+	 */
+	if (lua_gettop(L) != 2 || !lua_isstring(L, 1) || !lua_isstring(L, 2))
+		return luaL_error(L, "%s", usage);
+	size_t l1, l2;
+	const char *s1 = lua_tolstring(L, 1, &l1);
+	const char *s2 = lua_tolstring(L, 2, &l2);
+	lua_pushinteger(L, coll->cmp(s1, l1, s2, l2, coll));
+	return 1;
+}
+
+/**
+ * Lua utf8.cmp(): compare two UTF8 strings using unicode_coll.
+ * @param s1 First string.
+ * @param s2 Second string.
+ *
+ * @retval <0 s1 < s2.
+ * @retval >0 s1 > s2.
+ * @retval =0 s1 = s2.
+ */
+static int
+utf8_cmp(struct lua_State *L)
+{
+	const char *usage = "Usage: utf8.cmp(<string1>, <string2>)";
+	return utf8_cmp_impl(L, usage, unicode_coll);
+}
+
+/**
+ * Lua utf8.casecmp(): compare two UTF8 strings ignoring case,
+ * using unicode_ci_coll.
+ * @param s1 First string.
+ * @param s2 Second string.
+ *
+ * @retval <0 s1 < s2.
+ * @retval >0 s1 > s2.
+ * @retval =0 s1 = s2.
+ */
+static int
+utf8_casecmp(struct lua_State *L)
+{
+	const char *usage = "Usage: utf8.casecmp(<string1>, <string2>)";
+	return utf8_cmp_impl(L, usage, unicode_ci_coll);
+}
+
+static const struct luaL_Reg utf8_lib[] = {
+	{"upper", utf8_upper}, /* Case conversion. */
+	{"lower", utf8_lower},
+	{"len", utf8_len}, /* Symbol count in a byte range. */
+	{"next", utf8_next}, /* Decode one symbol, usable as iterator. */
+	{"char", utf8_char}, /* Symbol codes into a string. */
+	{"sub", utf8_lua_sub}, /* Substring by symbol positions. */
+	{"islower", utf8_islower}, /* Generated by UCHAR32_CHECKER. */
+	{"isupper", utf8_isupper},
+	{"isdigit", utf8_isdigit},
+	{"isalpha", utf8_isalpha},
+	{"cmp", utf8_cmp}, /* Comparison using collations. */
+	{"casecmp", utf8_casecmp},
+	{NULL, NULL} /* End of the list passed to luaL_register(). */
+};
+
+/**
+ * Create the ICU casemap and the collations used by the module and
+ * register the module functions in Lua under the name "utf8".
+ * Raises a Lua error if the casemap or a collation can not be
+ * created.
+ * @param L Lua state to register the module in.
+ */
+void
+tarantool_lua_utf8_init(struct lua_State *L)
+{
+	UErrorCode err = U_ZERO_ERROR;
+	root_map = ucasemap_open("", 0, &err);
+	if (root_map == NULL) {
+		/*
+		 * Format directly in luaL_error() instead of
+		 * passing an already formatted message as a format.
+		 */
+		luaL_error(L, "error in ICU ucasemap_open: %s",
+			   u_errorName(err));
+		return;
+	}
+	struct coll_def def;
+	memset(&def, 0, sizeof(def));
+	unicode_coll = coll_new(&def);
+	if (unicode_coll == NULL)
+		goto error_coll;
+	/* Same definition, but with the primary ICU strength. */
+	def.icu.strength = COLL_ICU_STRENGTH_PRIMARY;
+	unicode_ci_coll = coll_new(&def);
+	if (unicode_ci_coll == NULL)
+		goto error_coll;
+	luaL_register(L, "utf8", utf8_lib);
+	/* luaL_register() leaves the module table on the stack. */
+	lua_pop(L, 1);
+	return;
+error_coll:
+	tarantool_lua_utf8_free();
+	luaT_error(L);
+}
+
+/**
+ * Release the casemap and the collations created by
+ * tarantool_lua_utf8_init(). The pointers are reset after release,
+ * so the function is safe to call more than once, e.g. after
+ * tarantool_lua_utf8_init() has already called it on its error
+ * path.
+ */
+void
+tarantool_lua_utf8_free(void)
+{
+	if (root_map != NULL) {
+		ucasemap_close(root_map);
+		root_map = NULL;
+	}
+	if (unicode_coll != NULL) {
+		coll_unref(unicode_coll);
+		unicode_coll = NULL;
+	}
+	if (unicode_ci_coll != NULL) {
+		coll_unref(unicode_ci_coll);
+		unicode_ci_coll = NULL;
+	}
+}
diff --git a/src/lua/utf8.h b/src/lua/utf8.h
new file mode 100644
index 000000000..567ad51f7
--- /dev/null
+++ b/src/lua/utf8.h
@@ -0,0 +1,42 @@
+#ifndef TARANTOOL_LUA_UTF8_H_INCLUDED
+#define TARANTOOL_LUA_UTF8_H_INCLUDED
+/*
+ * Copyright 2010-2018, Tarantool AUTHORS, please see AUTHORS file.
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the
+ * following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY <COPYRIGHT HOLDER> ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
+ * <COPYRIGHT HOLDER> OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+struct lua_State;
+
+/**
+ * Initialize the utf8 module and register it in a Lua state under
+ * the name "utf8". Raises a Lua error on failure.
+ * @param L Lua state to register the module in.
+ */
+void
+tarantool_lua_utf8_init(struct lua_State *L);
+
+/**
+ * Release resources allocated by tarantool_lua_utf8_init().
+ */
+void
+tarantool_lua_utf8_free(void);
+
+#endif /* TARANTOOL_LUA_UTF8_H_INCLUDED */
diff --git a/test/app-tap/string.test.lua b/test/app-tap/string.test.lua
index 852a7923c..1d10dcfc9 100755
--- a/test/app-tap/string.test.lua
+++ b/test/app-tap/string.test.lua
@@ -3,7 +3,7 @@
local tap = require('tap')
local test = tap.test("string extensions")
-test:plan(5)
+test:plan(6)
test:test("split", function(test)
test:plan(10)
@@ -128,4 +128,165 @@ test:test("strip", function(test)
test:ok(err and err:match("%(string expected, got number%)"))
end )
+test:test("unicode", function(test)
+    test:plan(102) -- number of checks in this subtest
+    local str = 'хеЛлоу вОрЛд ё Ё я Я э Э ъ Ъ hElLo WorLd 1234 i I İ 勺#☢༺'
+    local upper_res = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 I I İ 勺#☢༺'
+    local lower_res = 'хеллоу ворлд ё ё я я э э ъ ъ hello world 1234 i i i̇ 勺#☢༺'
+    local s = utf8.upper(str)
+    test:is(s, upper_res, 'default locale upper')
+    s = utf8.lower(str)
+    test:is(s, lower_res, 'default locale lower')
+    test:is(utf8.upper(''), '', 'empty string upper')
+    test:is(utf8.lower(''), '', 'empty string lower')
+    local err
+    s, err = pcall(utf8.upper, true)
+    test:isnt(err:find('Usage'), nil, 'upper usage is checked')
+    s, err = pcall(utf8.lower, true)
+    test:isnt(err:find('Usage'), nil, 'lower usage is checked')
+    -- Symbol property checkers called with strings.
+    test:is(utf8.isupper('a'), false, 'isupper("a")')
+    test:is(utf8.isupper('A'), true, 'isupper("A")')
+    test:is(utf8.islower('a'), true, 'islower("a")')
+    test:is(utf8.islower('A'), false, 'islower("A")')
+    test:is(utf8.isalpha('a'), true, 'isalpha("a")')
+    test:is(utf8.isalpha('A'), true, 'isalpha("A")')
+    test:is(utf8.isalpha('aa'), false, 'isalpha("aa")')
+    test:is(utf8.isalpha('勺'), true, 'isalpha("勺")')
+    test:is(utf8.isupper('Ё'), true, 'isupper("Ё")')
+    test:is(utf8.islower('ё'), true, 'islower("ё")')
+    test:is(utf8.isdigit('a'), false, 'isdigit("a")')
+    test:is(utf8.isdigit('1'), true, 'isdigit("1")')
+    test:is(utf8.isdigit('9'), true, 'isdigit("9")')
+    -- utf8.len(): symbol count with optional start and end byte offsets.
+    test:is(utf8.len(str), 56, 'len works on complex string')
+    s = '12İ☢勺34'
+    test:is(utf8.len(s), 7, 'len works no options')
+    test:is(utf8.len(s, 1), 7, 'default start is 1')
+    test:is(utf8.len(s, 2), 6, 'start 2')
+    test:is(utf8.len(s, 3), 5, 'start 3')
+    local c
+    c, err = utf8.len(s, 4)
+    test:isnil(c, 'middle of symbol offset is error')
+    test:is(err, 4, 'error on 4 byte')
+    test:is(utf8.len(s, 5), 4, 'start 5')
+    c, err = utf8.len(s, 6)
+    test:is(err, 6, 'error on 6 byte')
+    c, err = utf8.len(s, 0)
+    test:is(err, 'position is out of string', 'range is out of string')
+    test:is(utf8.len(s, #s), 1, 'start from the end')
+    test:is(utf8.len(s, #s + 1), 0, 'position is out of string')
+    test:is(utf8.len(s, 1, -1), 7, 'default end is -1')
+    test:is(utf8.len(s, 1, -2), 6, 'end -2')
+    test:is(utf8.len(s, 1, -3), 5, 'end -3')
+    test:is(utf8.len(s, 1, -4), 5, 'end in the middle of symbol')
+    test:is(utf8.len(s, 1, -5), 5, 'end in the middle of symbol')
+    test:is(utf8.len(s, 1, -6), 5, 'end in the middle of symbol')
+    test:is(utf8.len(s, 1, -7), 4, 'end -7')
+    test:is(utf8.len(s, 2, -7), 3, '[2, -7]')
+    test:is(utf8.len(s, 3, -7), 2, '[3, -7]')
+    c, err = utf8.len(s, 4, -7)
+    test:is(err, 4, '[4, -7] is error - start from the middle of symbol')
+    test:is(utf8.len(s, 10, -100), 0, 'it is ok to be out of str by end pos')
+    test:is(utf8.len(s, 10, -10), 0, 'it is ok to swap end and start pos')
+    test:is(utf8.len(''), 0, 'empty len')
+    test:is(utf8.len(s, -6, -1), 3, 'pass both negative offsets')
+    test:is(utf8.len(s, 3, 3), 1, "end in the middle on the same symbol as start")
+    c, err = utf8.len('a\xF4')
+    test:is(err, 2, "invalid unicode in the middle of the string")
+    -- utf8.next() as a generic-for iterator and utf8.char() as its inverse.
+    local chars = {}
+    local codes = {}
+    for _, code in utf8.next, s do
+        table.insert(chars, utf8.char(code))
+        table.insert(codes, code)
+    end
+    test:is(table.concat(chars), s, "next and char works")
+    c, err = pcall(utf8.char, 'kek')
+    test:isnt(err:find('bad argument'), nil, 'char usage is checked')
+    c, err = pcall(utf8.next, true)
+    test:isnt(err:find('Usage'), nil, 'next usage is checked')
+    c, err = pcall(utf8.next, '1234', true)
+    test:isnt(err:find('bad argument'), nil, 'next usage is checked')
+    local offset
+    offset, c = utf8.next('')
+    test:isnil(offset, 'next on empty - nil offset')
+    test:isnil(c, 'next on empty - nil code')
+    offset, c = utf8.next('123', 100)
+    test:isnil(offset, 'out of string - nil offset')
+    test:isnil(c, 'out of string - nil code')
+    test:is(utf8.char(unpack(codes)), s, 'char with multiple values')
+    -- Symbol property checkers called with symbol codes.
+    local uppers = 0
+    local lowers = 0
+    local digits = 0
+    local letters = 0
+    for _, code in utf8.next, str do
+        if utf8.isupper(code) then uppers = uppers + 1 end
+        if utf8.islower(code) then lowers = lowers + 1 end
+        if utf8.isalpha(code) then letters = letters + 1 end
+        if utf8.isdigit(code) then digits = digits + 1 end
+    end
+    test:is(uppers, 13, 'uppers by code')
+    test:is(lowers, 19, 'lowers by code')
+    test:is(letters, 33, 'letters by code')
+    test:is(digits, 4, 'digits by code')
+    -- utf8.sub(): both positions are non-negative.
+    s = '12345678'
+    test:is(utf8.sub(s, 1, 1), '1', 'sub [1]')
+    test:is(utf8.sub(s, 1, 2), '12', 'sub [1:2]')
+    test:is(utf8.sub(s, 2, 2), '2', 'sub [2:2]')
+    test:is(utf8.sub(s, 0, 2), '12', 'sub [0:2]')
+    test:is(utf8.sub(s, 3, 7), '34567', 'sub [3:7]')
+    test:is(utf8.sub(s, 7, 3), '', 'sub [7:3]')
+    test:is(utf8.sub(s, 3, 100), '345678', 'sub [3:100]')
+    test:is(utf8.sub(s, 100, 3), '', 'sub [100:3]')
+    -- utf8.sub(): positive start, omitted or negative end.
+    test:is(utf8.sub(s, 5), '5678', 'sub [5:]')
+    test:is(utf8.sub(s, 1, -1), s, 'sub [1:-1]')
+    test:is(utf8.sub(s, 1, -2), '1234567', 'sub [1:-2]')
+    test:is(utf8.sub(s, 2, -2), '234567', 'sub [2:-2]')
+    test:is(utf8.sub(s, 3, -3), '3456', 'sub [3:-3]')
+    test:is(utf8.sub(s, 5, -4), '5', 'sub [5:-4]')
+    test:is(utf8.sub(s, 7, -7), '', 'sub[7:-7]')
+    -- utf8.sub(): both positions are negative.
+    test:is(utf8.sub(s, -2, -1), '78', 'sub [-2:-1]')
+    test:is(utf8.sub(s, -1, -1), '8', 'sub [-1:-1]')
+    test:is(utf8.sub(s, -4, -2), '567', 'sub [-4:-2]')
+    test:is(utf8.sub(s, -400, -2), '1234567', 'sub [-400:-2]')
+    test:is(utf8.sub(s, -3, -5), '', 'sub [-3:-5]')
+    -- utf8.sub(): negative start, positive end.
+    test:is(utf8.sub(s, -6, 5), '345', 'sub [-6:5]')
+    test:is(utf8.sub(s, -5, 4), '4', 'sub [-5:4]')
+    test:is(utf8.sub(s, -2, 2), '', 'sub [-2:2]')
+    test:is(utf8.sub(s, -1, 8), '8', 'sub [-1:8]')
+    -- utf8.sub(): argument validation.
+    c, err = pcall(utf8.sub)
+    test:isnt(err:find('Usage'), nil, 'usage is checked')
+    c, err = pcall(utf8.sub, true)
+    test:isnt(err:find('Usage'), nil, 'usage is checked')
+    c, err = pcall(utf8.sub, '123')
+    test:isnt(err:find('Usage'), nil, 'usage is checked')
+    c, err = pcall(utf8.sub, '123', true)
+    test:isnt(err:find('bad argument'), nil, 'usage is checked')
+    c, err = pcall(utf8.sub, '123', 1, true)
+    test:isnt(err:find('bad argument'), nil, 'usage is checked')
+    -- utf8.cmp()/utf8.casecmp(): the order differs from the binary one.
+    local s1 = '☢'
+    local s2 = 'İ'
+    test:is(s1 < s2, false, 'test binary cmp')
+    test:is(utf8.cmp(s1, s2) < 0, true, 'test unicode <')
+    test:is(utf8.cmp(s1, s1) == 0, true, 'test unicode eq')
+    test:is(utf8.cmp(s2, s1) > 0, true, 'test unicode >')
+    test:is(utf8.casecmp('a', 'A') == 0, true, 'test icase ==')
+    test:is(utf8.casecmp('b', 'A') > 0, true, 'test icase >, first')
+    test:is(utf8.casecmp('B', 'a') > 0, true, 'test icase >, second >')
+    test:is(utf8.cmp('', '') == 0, true, 'test empty compare')
+    test:is(utf8.cmp('', 'a') < 0, true, 'test left empty compare')
+    test:is(utf8.cmp('a', '') > 0, true, 'test right empty compare')
+    test:is(utf8.casecmp('', '') == 0, true, 'test empty icompare')
+    test:is(utf8.casecmp('', 'a') < 0, true, 'test left empty icompare')
+    test:is(utf8.casecmp('a', '') > 0, true, 'test right empty icompare')
+end)
+
os.exit(test:check() == true and 0 or -1)
diff --git a/test/box/ddl.result b/test/box/ddl.result
index f249f8fe3..30f0cf7ec 100644
--- a/test/box/ddl.result
+++ b/test/box/ddl.result
@@ -500,6 +500,21 @@ box.space._collation.index.name:delete{'test'}
- [3, 'test', 0, 'ICU', 'ru_RU', {}]
...
--
+-- gh-3290: expose ICU into Lua. It uses built-in collations, that
+-- must work even if a collation is deleted from _collation.
+--
+t = box.space._collation:delete{1}
+---
+...
+utf8.cmp('abc', 'def')
+---
+- -1
+...
+box.space._collation:replace(t)
+---
+- [1, 'unicode', 1, 'ICU', '', {}]
+...
+--
-- gh-2839: allow to store custom fields in field definition.
--
format = {}
diff --git a/test/box/ddl.test.lua b/test/box/ddl.test.lua
index 6029c6eb6..ebbefe77b 100644
--- a/test/box/ddl.test.lua
+++ b/test/box/ddl.test.lua
@@ -191,6 +191,14 @@ test_run:cmd('restart server default')
box.space._collation:select{}
box.space._collation.index.name:delete{'test'}
+--
+-- gh-3290: expose ICU into Lua. It uses built-in collations, that
+-- must work even if a collation is deleted from _collation.
+--
+t = box.space._collation:delete{1}
+utf8.cmp('abc', 'def')
+box.space._collation:replace(t)
+
--
-- gh-2839: allow to store custom fields in field definition.
--
--
2.15.1 (Apple Git-101)
More information about the Tarantool-patches
mailing list