From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id D795822FB6 for ; Wed, 25 Apr 2018 19:29:13 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id hSBvxOTvxhrp for ; Wed, 25 Apr 2018 19:29:13 -0400 (EDT) Received: from smtp49.i.mail.ru (smtp49.i.mail.ru [94.100.177.109]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 0673222B6E for ; Wed, 25 Apr 2018 19:29:12 -0400 (EDT) From: Vladislav Shpilevoy Subject: [tarantool-patches] [PATCH 1/7] lua: expose ICU upper/lower functions to Lua Date: Thu, 26 Apr 2018 02:29:01 +0300 Message-Id: <4964845f82fc37f46f28b1713adf4527c219cb0d.1524698920.git.v.shpilevoy@tarantool.org> In-Reply-To: References: MIME-Version: 1.0 In-Reply-To: References: Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 8bit Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: tarantool-patches@freelists.org Cc: kostja@tarantool.org Lua cannot work with Unicode - in Lua it is interpreted as binary data. On such strings the built-in upper/lower functions do not work. But Tarantool links with ICU, which can solve the problem. Let's expose the ICU upper/lower functions to Lua to enable correct case transformations. 
Closes #3290 --- src/lua/init.c | 2 +- src/lua/string.lua | 140 +++++++++++++++++++++++++++++++++++++++++++ test/app-tap/string.test.lua | 34 ++++++++++- 3 files changed, 174 insertions(+), 2 deletions(-) diff --git a/src/lua/init.c b/src/lua/init.c index a0a7f63f6..9149362a0 100644 --- a/src/lua/init.c +++ b/src/lua/init.c @@ -124,9 +124,9 @@ static const char *lua_modules[] = { "errno", errno_lua, "fiber", fiber_lua, "env", env_lua, - "string", string_lua, "table", table_lua, "buffer", buffer_lua, + "string", string_lua, "msgpackffi", msgpackffi_lua, "crypto", crypto_lua, "digest", digest_lua, diff --git a/src/lua/string.lua b/src/lua/string.lua index 5ff64c9f6..1c7226143 100644 --- a/src/lua/string.lua +++ b/src/lua/string.lua @@ -1,4 +1,5 @@ local ffi = require('ffi') +local buffer = require('buffer') ffi.cdef[[ const char * @@ -6,6 +7,28 @@ ffi.cdef[[ const char *needle, size_t needle_len); int memcmp(const char *mem1, const char *mem2, size_t num); int isspace(int c); + + typedef struct UCaseMap UCaseMap; + typedef int UErrorCode; + + int32_t + ucasemap_utf8ToLower(const UCaseMap *csm, char *dest, int32_t destCapacity, + const char *src, int32_t srcLength, + UErrorCode *pErrorCode); + + int32_t + ucasemap_utf8ToUpper(const UCaseMap *csm, char *dest, int32_t destCapacity, + const char *src, int32_t srcLength, + UErrorCode *pErrorCode); + + UCaseMap * + ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode); + + void + ucasemap_close(UCaseMap *csm); + + const char * + u_errorName(UErrorCode code); ]] local c_char_ptr = ffi.typeof('const char *') @@ -313,6 +336,121 @@ local function string_rstrip(inp) return (string.gsub(inp, "(.-)%s*$", "%1")) end +-- +-- ICU bindings. +-- +-- +-- Ucasemap cache allows to do not create a new UCaseMap on each +-- u_upper/u_lower call. References are weak to do not keep all +-- ever created maps, so the cache is cleared periodically. 
+-- +local ucasemap_cache = setmetatable({}, {__mode = 'v'}) +local errcode = ffi.new('int[1]') +errcode[0] = 0 +-- +-- ICU UCaseMethod requires 0 error code as input, so after any +-- error the errcode must be nullified. +-- +local function icu_clear_error() + errcode[0] = 0 +end +-- +-- String representation of the latest ICU error. +-- +local function icu_error() + return ffi.string(ffi.C.u_errorName(errcode[0])) +end +-- +-- Find cached UCaseMap for @a locale, or create a new one and +-- cache it. +-- @param locale String locale or box.NULL for default. +-- @retval nil Can neither get or create a UCaseMap. +-- @retval not nil Needed UCaseMap. +-- +local function ucasemap_retrieve(locale) + local ret = ucasemap_cache[locale] + if not ret then + ret = ffi.C.ucasemap_open(c_char_ptr(locale), 0, errcode) + if ret ~= nil then + ffi.gc(ret, ffi.C.ucasemap_close) + ucasemap_cache[locale] = ret + end + end + return ret +end +-- +-- Check ICU options for string.u_upper/u_lower. +-- @param opts Options. Can contain only one option - locale. +-- @param usage_err What to throw if opts types are violated. +-- @retval String locale if found. +-- @retval box.NULL if locale is not found. +-- +local function icu_check_case_opts(opts, usage_err) + if opts then + if type(opts) ~= 'table' then + error(usage_err) + end + if opts.locale then + if type(opts.locale) ~= 'string' then + error(usage_err) + end + return opts.locale + end + end + return box.NULL +end +-- +-- Create upper/lower case version of @an inp string. +-- @param inp Input string. +-- @param opts Options. Can contain only one option - locale. In +-- different locales different capital letters can exist +-- for the same symbol. For example, in turkish locale +-- upper('i') == 'İ', in english locale it is 'I'. See ICU +-- documentation for locales. +-- @param func Upper or lower FFI function. +-- @param usage What to print on usage error. +-- @retval nil, error Error. +-- @retval not nil Uppercase version of @an inp. 
+-- +local function string_u_to_case_impl(inp, opts, func, usage) + if type(inp) ~= 'string' then + error(usage) + end + icu_clear_error() + local map = ucasemap_retrieve(icu_check_case_opts(opts, usage)) + if not map then + return nil, icu_error() + end + local src_len = #inp + inp = c_char_ptr(inp) + local buf = buffer.IBUF_SHARED + local buf_raw, ret + -- +1 for NULL termination. Else error appears in errcode. + local dst_len = src_len + 1 +::do_convert:: + buf:reset() + buf_raw = buf:alloc(dst_len) + ret = func(map, buf_raw, dst_len, inp, src_len, errcode) + if ret <= dst_len then + if ret == 0 and errcode[0] ~= 0 then + return nil, icu_error() + end + return ffi.string(buf_raw, ret) + else + dst_len = ret + 1 + goto do_convert + end +end + +local function string_u_upper(inp, opts) + local usage = 'Usage: string.u_upper(str, {[locale = }])' + return string_u_to_case_impl(inp, opts, ffi.C.ucasemap_utf8ToUpper, usage) +end + +local function string_u_lower(inp, opts) + local usage = 'Usage: string.u_lower(str, {[locale = }])' + return string_u_to_case_impl(inp, opts, ffi.C.ucasemap_utf8ToLower, usage) +end -- It'll automatically set string methods, too. 
local string = require('string') @@ -326,3 +464,5 @@ string.hex = string_hex string.strip = string_strip string.lstrip = string_lstrip string.rstrip = string_rstrip +string.u_upper = string_u_upper +string.u_lower = string_u_lower diff --git a/test/app-tap/string.test.lua b/test/app-tap/string.test.lua index 852a7923c..004e149e9 100755 --- a/test/app-tap/string.test.lua +++ b/test/app-tap/string.test.lua @@ -3,7 +3,7 @@ local tap = require('tap') local test = tap.test("string extensions") -test:plan(5) +test:plan(6) test:test("split", function(test) test:plan(10) @@ -114,6 +114,38 @@ test:test("hex", function(test) test:is(string.hex(""), "", "hex empty string") end) +test:test("unicode", function(test) + test:plan(12) + local str = 'хеЛлоу вОрЛд ё Ё я Я э Э ъ Ъ hElLo WorLd 1234 i I İ 勺#☢༺' + local upper_res = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 I I İ 勺#☢༺' + local upper_turkish = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 İ I İ 勺#☢༺' + local lower_res = 'хеллоу ворлд ё ё я я э э ъ ъ hello world 1234 i i i̇ 勺#☢༺' + local lower_turkish = 'хеллоу ворлд ё ё я я э э ъ ъ hello world 1234 i ı i 勺#☢༺' + local s = string.u_upper(str) + test:is(s, upper_res, 'default locale upper') + s = string.u_lower(str) + test:is(s, lower_res, 'default locale lower') + s = string.u_upper(str, {locale = 'en_US'}) + test:is(s, upper_res, 'en_US locale upper') + s = string.u_lower(str, {locale = 'en_US'}) + test:is(s, lower_res, 'en_US locale lower') + s = string.u_upper(str, {locale = 'ru_RU'}) + test:is(s, upper_res, 'ru_RU locale upper') + s = string.u_lower(str, {locale = 'ru_RU'}) + test:is(s, lower_res, 'ru_RU locale lower') + s = string.u_upper(str, {locale = 'tr_TR'}) + test:is(s, upper_turkish, 'tr_TR locale upper') + s = string.u_lower(str, {locale = 'tr_TR'}) + test:is(s, lower_turkish, 'tr_TR locale lower') + local err + s, err = string.u_upper(str, {locale = 'not_existing locale tratatatata'}) + test:is(s, upper_res, 'incorrect locale turns into default upper') 
+ test:isnil(err, 'upper error is nil') + s, err = string.u_lower(str, {locale = 'not_existing locale tratatatata'}) + test:is(s, lower_res, 'incorrect locale turns into default lower') + test:isnil(err, 'lower error is nil') +end) + test:test("strip", function(test) test:plan(6) local str = " hello hello " -- 2.15.1 (Apple Git-101)