Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
To: tarantool-patches@freelists.org
Cc: kostja@tarantool.org
Subject: [tarantool-patches] [PATCH 1/7] lua: expose ICU upper/lower functions to Lua
Date: Thu, 26 Apr 2018 02:29:01 +0300	[thread overview]
Message-ID: <4964845f82fc37f46f28b1713adf4527c219cb0d.1524698920.git.v.shpilevoy@tarantool.org> (raw)
In-Reply-To: <cover.1524698920.git.v.shpilevoy@tarantool.org>
In-Reply-To: <cover.1524698920.git.v.shpilevoy@tarantool.org>

Lua cannot work with Unicode - in Lua a Unicode string is
interpreted as binary data. The built-in upper/lower functions do
not work on such strings. But Tarantool links with ICU, which can
solve the problem. Let's expose the ICU upper/lower functions to
Lua to enable correct case transformations.

Closes #3290
---
 src/lua/init.c               |   2 +-
 src/lua/string.lua           | 140 +++++++++++++++++++++++++++++++++++++++++++
 test/app-tap/string.test.lua |  34 ++++++++++-
 3 files changed, 174 insertions(+), 2 deletions(-)

diff --git a/src/lua/init.c b/src/lua/init.c
index a0a7f63f6..9149362a0 100644
--- a/src/lua/init.c
+++ b/src/lua/init.c
@@ -124,9 +124,9 @@ static const char *lua_modules[] = {
 	"errno", errno_lua,
 	"fiber", fiber_lua,
 	"env", env_lua,
-	"string", string_lua,
 	"table", table_lua,
 	"buffer", buffer_lua,
+	"string", string_lua,
 	"msgpackffi", msgpackffi_lua,
 	"crypto", crypto_lua,
 	"digest", digest_lua,
diff --git a/src/lua/string.lua b/src/lua/string.lua
index 5ff64c9f6..1c7226143 100644
--- a/src/lua/string.lua
+++ b/src/lua/string.lua
@@ -1,4 +1,5 @@
 local ffi = require('ffi')
+local buffer = require('buffer')
 
 ffi.cdef[[
     const char *
@@ -6,6 +7,28 @@ ffi.cdef[[
            const char *needle,   size_t needle_len);
     int memcmp(const char *mem1, const char *mem2, size_t num);
     int isspace(int c);
+
+    typedef struct UCaseMap UCaseMap;
+    typedef int UErrorCode;
+
+    int32_t
+    ucasemap_utf8ToLower(const UCaseMap *csm, char *dest, int32_t destCapacity,
+                         const char *src, int32_t srcLength,
+                         UErrorCode *pErrorCode);
+
+    int32_t
+    ucasemap_utf8ToUpper(const UCaseMap *csm, char *dest, int32_t destCapacity,
+                         const char *src, int32_t srcLength,
+                         UErrorCode *pErrorCode);
+
+    UCaseMap *
+    ucasemap_open(const char *locale, uint32_t options, UErrorCode *pErrorCode);
+
+    void
+    ucasemap_close(UCaseMap *csm);
+
+    const char *
+    u_errorName(UErrorCode code);
 ]]
 
 local c_char_ptr = ffi.typeof('const char *')
@@ -313,6 +336,121 @@ local function string_rstrip(inp)
     return (string.gsub(inp, "(.-)%s*$", "%1"))
 end
 
+--
+-- ICU bindings.
+--
+--
+-- The UCaseMap cache avoids creating a new UCaseMap on each
+-- u_upper/u_lower call. References are weak so as not to keep
+-- every map ever created, so the cache is cleared periodically.
+--
+local ucasemap_cache = setmetatable({}, {__mode = 'v'})
+local errcode = ffi.new('int[1]')
+errcode[0] = 0
+--
+-- ICU UCaseMap functions require a zero error code as input, so
+-- after any error the errcode must be reset to zero.
+--
+local function icu_clear_error()
+    errcode[0] = 0
+end
+--
+-- String representation of the latest ICU error.
+--
+local function icu_error()
+    return ffi.string(ffi.C.u_errorName(errcode[0]))
+end
+--
+-- Find cached UCaseMap for @a locale, or create a new one and
+-- cache it.
+-- @param locale String locale or box.NULL for default.
+-- @retval nil Can neither get nor create a UCaseMap.
+-- @retval not nil Needed UCaseMap.
+--
+local function ucasemap_retrieve(locale)
+    local ret = ucasemap_cache[locale]
+    if not ret then
+        ret = ffi.C.ucasemap_open(c_char_ptr(locale), 0, errcode)
+        if ret ~= nil then
+            ffi.gc(ret, ffi.C.ucasemap_close)
+            ucasemap_cache[locale] = ret
+        end
+    end
+    return ret
+end
+--
+-- Check ICU options for string.u_upper/u_lower.
+-- @param opts Options. Can contain only one option - locale.
+-- @param usage_err What to throw if opts types are violated.
+-- @retval String locale if found.
+-- @retval box.NULL if locale is not found.
+--
+local function icu_check_case_opts(opts, usage_err)
+    if opts then
+        if type(opts) ~= 'table' then
+            error(usage_err)
+        end
+        if opts.locale then
+            if type(opts.locale) ~= 'string' then
+                error(usage_err)
+            end
+            return opts.locale
+        end
+    end
+    return box.NULL
+end
+--
+-- Create upper/lower case version of the @a inp string.
+-- @param inp Input string.
+-- @param opts Options. Can contain only one option - locale. In
+--        different locales different capital letters can exist
+--        for the same symbol. For example, in turkish locale
+--        upper('i') == 'İ', in english locale it is 'I'. See ICU
+--        documentation for locales.
+-- @param func Upper or lower FFI function.
+-- @param usage What to print on usage error.
+-- @retval nil, error Error.
+-- @retval not nil Upper/lower case version of @a inp.
+--
+local function string_u_to_case_impl(inp, opts, func, usage)
+    if type(inp) ~= 'string' then
+        error(usage)
+    end
+    icu_clear_error()
+    local map = ucasemap_retrieve(icu_check_case_opts(opts, usage))
+    if not map then
+        return nil, icu_error()
+    end
+    local src_len = #inp
+    inp = c_char_ptr(inp)
+    local buf = buffer.IBUF_SHARED
+    local buf_raw, ret
+    -- +1 for NULL termination. Else error appears in errcode.
+    local dst_len = src_len + 1
+::do_convert::
+    buf:reset()
+    buf_raw = buf:alloc(dst_len)
+    ret = func(map, buf_raw, dst_len, inp, src_len, errcode)
+    if ret <= dst_len then
+        if ret == 0 and errcode[0] ~= 0 then
+            return nil, icu_error()
+        end
+        return ffi.string(buf_raw, ret)
+    else
+        dst_len = ret + 1
+        goto do_convert
+    end
+end
+
+local function string_u_upper(inp, opts)
+    local usage = 'Usage: string.u_upper(str, {[locale = <string>}])'
+    return string_u_to_case_impl(inp, opts, ffi.C.ucasemap_utf8ToUpper, usage)
+end
+
+local function string_u_lower(inp, opts)
+    local usage = 'Usage: string.u_lower(str, {[locale = <string>}])'
+    return string_u_to_case_impl(inp, opts, ffi.C.ucasemap_utf8ToLower, usage)
+end
 
 -- It'll automatically set string methods, too.
 local string = require('string')
@@ -326,3 +464,5 @@ string.hex        = string_hex
 string.strip      = string_strip
 string.lstrip      = string_lstrip
 string.rstrip      = string_rstrip
+string.u_upper    = string_u_upper
+string.u_lower    = string_u_lower
diff --git a/test/app-tap/string.test.lua b/test/app-tap/string.test.lua
index 852a7923c..004e149e9 100755
--- a/test/app-tap/string.test.lua
+++ b/test/app-tap/string.test.lua
@@ -3,7 +3,7 @@
 local tap = require('tap')
 local test = tap.test("string extensions")
 
-test:plan(5)
+test:plan(6)
 
 test:test("split", function(test)
     test:plan(10)
@@ -114,6 +114,38 @@ test:test("hex", function(test)
     test:is(string.hex(""), "", "hex empty string")
 end)
 
+test:test("unicode", function(test)
+    test:plan(12)
+    local str = 'хеЛлоу вОрЛд ё Ё я Я э Э ъ Ъ hElLo WorLd 1234 i I İ 勺#☢༺'
+    local upper_res = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 I I İ 勺#☢༺'
+    local upper_turkish = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 İ I İ 勺#☢༺'
+    local lower_res = 'хеллоу ворлд ё ё я я э э ъ ъ hello world 1234 i i i̇ 勺#☢༺'
+    local lower_turkish = 'хеллоу ворлд ё ё я я э э ъ ъ hello world 1234 i ı i 勺#☢༺'
+    local s = string.u_upper(str)
+    test:is(s, upper_res, 'default locale upper')
+    s = string.u_lower(str)
+    test:is(s, lower_res, 'default locale lower')
+    s = string.u_upper(str, {locale = 'en_US'})
+    test:is(s, upper_res, 'en_US locale upper')
+    s = string.u_lower(str, {locale = 'en_US'})
+    test:is(s, lower_res, 'en_US locale lower')
+    s = string.u_upper(str, {locale = 'ru_RU'})
+    test:is(s, upper_res, 'ru_RU locale upper')
+    s = string.u_lower(str, {locale = 'ru_RU'})
+    test:is(s, lower_res, 'ru_RU locale lower')
+    s = string.u_upper(str, {locale = 'tr_TR'})
+    test:is(s, upper_turkish, 'tr_TR locale upper')
+    s = string.u_lower(str, {locale = 'tr_TR'})
+    test:is(s, lower_turkish, 'tr_TR locale lower')
+    local err
+    s, err = string.u_upper(str, {locale = 'not_existing locale tratatatata'})
+    test:is(s, upper_res, 'incorrect locale turns into default upper')
+    test:isnil(err, 'upper error is nil')
+    s, err = string.u_lower(str, {locale = 'not_existing locale tratatatata'})
+    test:is(s, lower_res, 'incorrect locale turns into default lower')
+    test:isnil(err, 'lower error is nil')
+end)
+
 test:test("strip", function(test)
     test:plan(6)
     local str = "  hello hello "
-- 
2.15.1 (Apple Git-101)

  reply	other threads:[~2018-04-25 23:29 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-04-25 23:29 [tarantool-patches] [PATCH 0/7] Expose ICU into Lua Vladislav Shpilevoy
2018-04-25 23:29 ` Vladislav Shpilevoy [this message]
2018-04-28  0:56   ` [tarantool-patches] Re: [PATCH 1/7] lua: expose ICU upper/lower functions to Lua Alexander Turenko
2018-04-25 23:29 ` [tarantool-patches] [PATCH 2/7] lua: implement string.u_count Vladislav Shpilevoy
2018-04-26 10:36   ` [tarantool-patches] " Vladislav Shpilevoy
2018-04-26 16:07   ` Vladislav Shpilevoy
2018-04-26 23:57   ` Vladislav Shpilevoy
2018-04-28  1:10   ` Alexander Turenko
2018-04-25 23:29 ` [tarantool-patches] [PATCH 3/7] alter: fix assertion in collations alter Vladislav Shpilevoy
2018-04-25 23:29 ` [tarantool-patches] [PATCH 4/7] Move struct on_access_denied_ctx into error.h Vladislav Shpilevoy
2018-04-25 23:29 ` [tarantool-patches] [PATCH 5/7] Merge box_error, stat and collations into core library Vladislav Shpilevoy
2018-04-25 23:29 ` [tarantool-patches] [PATCH 6/7] Always store built-in collations in the cache Vladislav Shpilevoy
2018-04-25 23:29 ` [tarantool-patches] [PATCH 7/7] lua: expose u_compare/u_icompare into Lua Vladislav Shpilevoy
2018-04-28  1:55 ` [tarantool-patches] Re: [PATCH 0/7] Expose ICU " Alexander Turenko

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4964845f82fc37f46f28b1713adf4527c219cb0d.1524698920.git.v.shpilevoy@tarantool.org \
    --to=v.shpilevoy@tarantool.org \
    --cc=kostja@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [tarantool-patches] [PATCH 1/7] lua: expose ICU upper/lower functions to Lua' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox