[tarantool-patches] [PATCH 1/7] lua: expose ICU upper/lower functions to Lua
Vladislav Shpilevoy
v.shpilevoy at tarantool.org
Thu Apr 26 02:29:01 MSK 2018
Lua cannot work with Unicode - it interprets such strings as
binary data, so the built-in upper/lower functions do not work
on them. But Tarantool links with ICU, which can solve the problem.
Let's expose the ICU upper/lower functions to Lua to enable correct
case transformations.
Closes #3290
---
src/lua/init.c | 2 +-
src/lua/string.lua | 140 +++++++++++++++++++++++++++++++++++++++++++
test/app-tap/string.test.lua | 34 ++++++++++-
3 files changed, 174 insertions(+), 2 deletions(-)
diff --git a/src/lua/init.c b/src/lua/init.c
index a0a7f63f6..9149362a0 100644
--- a/src/lua/init.c
+++ b/src/lua/init.c
@@ -124,9 +124,9 @@ static const char *lua_modules[] = {
"errno", errno_lua,
"fiber", fiber_lua,
"env", env_lua,
- "string", string_lua,
"table", table_lua,
"buffer", buffer_lua,
+ "string", string_lua,
"msgpackffi", msgpackffi_lua,
"crypto", crypto_lua,
"digest", digest_lua,
diff --git a/src/lua/string.lua b/src/lua/string.lua
index 5ff64c9f6..1c7226143 100644
--- a/src/lua/string.lua
+++ b/src/lua/string.lua
@@ -1,4 +1,5 @@
local ffi = require('ffi')
+local buffer = require('buffer')
ffi.cdef[[
const char *
@@ -6,6 +7,28 @@ ffi.cdef[[
const char *needle, size_t needle_len);
int memcmp(const char *mem1, const char *mem2, size_t num);
int isspace(int c);
+
+ typedef struct UCaseMap UCaseMap;
+ typedef int UErrorCode;
+
+ int32_t
+ ucasemap_utf8ToLower(const UCaseMap *csm, char *dest, int32_t destCapacity,
+ const char *src, int32_t srcLength,
+ UErrorCode *pErrorCode);
+
+ int32_t
+ ucasemap_utf8ToUpper(const UCaseMap *csm, char *dest, int32_t destCapacity,
+ const char *src, int32_t srcLength,
+ UErrorCode *pErrorCode);
+
+ UCaseMap *
+ ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode);
+
+ void
+ ucasemap_close(UCaseMap *csm);
+
+ const char *
+ u_errorName(UErrorCode code);
]]
local c_char_ptr = ffi.typeof('const char *')
@@ -313,6 +336,121 @@ local function string_rstrip(inp)
return (string.gsub(inp, "(.-)%s*$", "%1"))
end
+--
+-- ICU bindings.
+--
+--
+-- Cache of UCaseMap objects keyed by locale, so that a new
+-- UCaseMap does not have to be opened on each u_upper/u_lower
+-- call. Values are referenced weakly: the cache alone does not
+-- keep a map alive, so the garbage collector is free to drop
+-- maps and the cache does not grow forever.
+--
+local weak_values_mt = {__mode = 'v'}
+local ucasemap_cache = setmetatable({}, weak_values_mt)
+--
+-- Shared one-element array used as the UErrorCode in/out
+-- argument of the ICU calls below. Starts as 0.
+--
+local errcode = ffi.new('int[1]', 0)
+--
+-- Reset the shared ICU error code to 0. The ICU UCaseMap
+-- functions require a zero error code on input, so after any
+-- error the errcode must be cleared before ICU is called again.
+--
+local function icu_clear_error()
+ errcode[0] = 0
+end
+--
+-- Name of the latest ICU error, as reported by u_errorName()
+-- for the current value of the shared errcode.
+-- @retval Lua string with the error name.
+--
+local function icu_error()
+    local name = ffi.C.u_errorName(errcode[0])
+    return ffi.string(name)
+end
+--
+-- Find the cached UCaseMap for @a locale, or create a new one
+-- and cache it. On failure the ICU error is left in errcode.
+-- @param locale String locale or box.NULL for default.
+-- @retval nil Can neither get nor create a UCaseMap.
+-- @retval not nil Needed UCaseMap.
+--
+local function ucasemap_retrieve(locale)
+    local map = ucasemap_cache[locale]
+    if map ~= nil then
+        return map
+    end
+    map = ffi.C.ucasemap_open(c_char_ptr(locale), 0, errcode)
+    if map == nil then
+        -- ucasemap_open() returned a NULL pointer. A NULL cdata
+        -- compares equal to nil, but as any cdata it is truthy,
+        -- so a caller's 'if not map' check would not catch it.
+        -- Return a real nil, as the contract above promises.
+        return nil
+    end
+    -- Close the map automatically when it is garbage collected.
+    ffi.gc(map, ffi.C.ucasemap_close)
+    ucasemap_cache[locale] = map
+    return map
+end
+--
+-- Validate the options of string.u_upper/u_lower and extract
+-- the locale from them.
+-- @param opts Options. Can contain only one option - locale.
+-- @param usage_err What to throw if opts types are violated.
+-- @retval String locale if found.
+-- @retval box.NULL if locale is not found.
+--
+local function icu_check_case_opts(opts, usage_err)
+    -- No options at all - use the default locale.
+    if not opts then
+        return box.NULL
+    end
+    if type(opts) ~= 'table' then
+        error(usage_err)
+    end
+    local locale = opts.locale
+    if not locale then
+        return box.NULL
+    end
+    if type(locale) ~= 'string' then
+        error(usage_err)
+    end
+    return locale
+end
+--
+-- Create upper/lower case version of @a inp string.
+-- @param inp Input string.
+-- @param opts Options. Can contain only one option - locale. In
+--        different locales different capital letters can exist
+--        for the same symbol. For example, in turkish locale
+--        upper('i') == 'İ', in english locale it is 'I'. See ICU
+--        documentation for locales.
+-- @param func Upper or lower FFI function.
+-- @param usage What to throw on usage error (wrong argument
+--        types).
+-- @retval nil, error Error. The second value is the ICU error
+--         name.
+-- @retval not nil Converted version of @a inp.
+--
+local function string_u_to_case_impl(inp, opts, func, usage)
+    if type(inp) ~= 'string' then
+        error(usage)
+    end
+    icu_clear_error()
+    local map = ucasemap_retrieve(icu_check_case_opts(opts, usage))
+    -- Compare with nil explicitly: a NULL cdata pointer is equal
+    -- to nil, but it is truthy, so 'not map' would miss it.
+    if map == nil then
+        return nil, icu_error()
+    end
+    local src_len = #inp
+    -- Keep the Lua string in 'inp' instead of overwriting it with
+    -- the raw pointer: the pointer does not anchor the string
+    -- against garbage collection.
+    local src = c_char_ptr(inp)
+    local buf = buffer.IBUF_SHARED
+    -- +1 for NULL termination. Else a warning appears in errcode.
+    local dst_len = src_len + 1
+    while true do
+        buf:reset()
+        local dst = buf:alloc(dst_len)
+        local ret = func(map, dst, dst_len, src, src_len, errcode)
+        if ret <= dst_len then
+            -- ICU failures are positive codes, warnings (such as
+            -- a not terminated string) are negative and are not
+            -- errors.
+            if errcode[0] > 0 then
+                return nil, icu_error()
+            end
+            return ffi.string(dst, ret)
+        end
+        -- The result did not fit. ICU returned the needed length
+        -- and left an overflow error in errcode. The error must
+        -- be cleared before the retry, else ICU returns at once
+        -- without doing the conversion.
+        dst_len = ret + 1
+        icu_clear_error()
+    end
+end
+
+--
+-- Unicode-aware upper case conversion based on ICU.
+-- @param inp Input string.
+-- @param opts Optional table with the only option - locale.
+-- @retval nil, error ICU error name.
+-- @retval not nil Uppercase version of @a inp.
+--
+local function string_u_upper(inp, opts)
+    -- The optional argument is the whole options table, so the
+    -- square brackets must enclose the curly ones.
+    local usage = 'Usage: string.u_upper(str, [{locale = <string>}])'
+    return string_u_to_case_impl(inp, opts, ffi.C.ucasemap_utf8ToUpper, usage)
+end
+
+--
+-- Unicode-aware lower case conversion based on ICU.
+-- @param inp Input string.
+-- @param opts Optional table with the only option - locale.
+-- @retval nil, error ICU error name.
+-- @retval not nil Lowercase version of @a inp.
+--
+local function string_u_lower(inp, opts)
+    -- The optional argument is the whole options table, so the
+    -- square brackets must enclose the curly ones.
+    local usage = 'Usage: string.u_lower(str, [{locale = <string>}])'
+    return string_u_to_case_impl(inp, opts, ffi.C.ucasemap_utf8ToLower, usage)
+end
-- It'll automatically set string methods, too.
local string = require('string')
@@ -326,3 +464,5 @@ string.hex = string_hex
string.strip = string_strip
string.lstrip = string_lstrip
string.rstrip = string_rstrip
+string.u_upper = string_u_upper
+string.u_lower = string_u_lower
diff --git a/test/app-tap/string.test.lua b/test/app-tap/string.test.lua
index 852a7923c..004e149e9 100755
--- a/test/app-tap/string.test.lua
+++ b/test/app-tap/string.test.lua
@@ -3,7 +3,7 @@
local tap = require('tap')
local test = tap.test("string extensions")
-test:plan(5)
+test:plan(6)
test:test("split", function(test)
test:plan(10)
@@ -114,6 +114,38 @@ test:test("hex", function(test)
test:is(string.hex(""), "", "hex empty string")
end)
+-- Case conversion via ICU: default and explicit locales, plus a
+-- locale name that does not exist.
+test:test("unicode", function(test)
+    test:plan(12)
+    local str = 'хеЛлоу вОрЛд ё Ё я Я э Э ъ Ъ hElLo WorLd 1234 i I İ 勺#☢༺'
+    local upper_res = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 I I İ 勺#☢༺'
+    local upper_turkish = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 İ I İ 勺#☢༺'
+    local lower_res = 'хеллоу ворлд ё ё я я э э ъ ъ hello world 1234 i i i̇ 勺#☢༺'
+    local lower_turkish = 'хеллоу ворлд ё ё я я э э ъ ъ hello world 1234 i ı i 勺#☢༺'
+    -- Each case is checked as upper first, then lower.
+    local cases = {
+        {name = 'default locale', opts = nil,
+         upper = upper_res, lower = lower_res},
+        {name = 'en_US locale', opts = {locale = 'en_US'},
+         upper = upper_res, lower = lower_res},
+        {name = 'ru_RU locale', opts = {locale = 'ru_RU'},
+         upper = upper_res, lower = lower_res},
+        {name = 'tr_TR locale', opts = {locale = 'tr_TR'},
+         upper = upper_turkish, lower = lower_turkish},
+    }
+    for i = 1, #cases do
+        local case = cases[i]
+        test:is(string.u_upper(str, case.opts), case.upper,
+                case.name .. ' upper')
+        test:is(string.u_lower(str, case.opts), case.lower,
+                case.name .. ' lower')
+    end
+    -- An unknown locale is expected to behave as the default one
+    -- without reporting an error.
+    local bad_opts = {locale = 'not_existing locale tratatatata'}
+    local res, err = string.u_upper(str, bad_opts)
+    test:is(res, upper_res, 'incorrect locale turns into default upper')
+    test:isnil(err, 'upper error is nil')
+    res, err = string.u_lower(str, bad_opts)
+    test:is(res, lower_res, 'incorrect locale turns into default lower')
+    test:isnil(err, 'lower error is nil')
+end)
+
test:test("strip", function(test)
test:plan(6)
local str = " hello hello "
--
2.15.1 (Apple Git-101)
More information about the Tarantool-patches
mailing list