From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id 1730222F96 for ; Wed, 25 Apr 2018 19:29:14 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id ZRQ6el_jRzZc for ; Wed, 25 Apr 2018 19:29:13 -0400 (EDT) Received: from smtp49.i.mail.ru (smtp49.i.mail.ru [94.100.177.109]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 19E7F22F7A for ; Wed, 25 Apr 2018 19:29:12 -0400 (EDT) From: Vladislav Shpilevoy Subject: [tarantool-patches] [PATCH 2/7] lua: implement string.u_count Date: Thu, 26 Apr 2018 02:29:02 +0300 Message-Id: In-Reply-To: References: MIME-Version: 1.0 In-Reply-To: References: Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 8bit Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: tarantool-patches@freelists.org Cc: kostja@tarantool.org Lua cannot calculate the length of a Unicode string correctly. But Tarantool has ICU on board - let's use it to calculate the length. u_count has options that allow counting only symbols of a specific class, for example, only capital letters or digits. Options can be combined. 
Closes #3081 --- extra/exports | 1 + src/CMakeLists.txt | 1 + src/lua/string.lua | 52 ++++++++++++++++++++++++++++++++++++++++++++ src/util.c | 48 +++++++++++++++++++++++++++++++++++++++- test/app-tap/string.test.lua | 22 ++++++++++++++++++- 5 files changed, 122 insertions(+), 2 deletions(-) diff --git a/extra/exports b/extra/exports index a274bb23b..b0480fe79 100644 --- a/extra/exports +++ b/extra/exports @@ -40,6 +40,7 @@ title_set_status title_get_status exception_get_string exception_get_int +u_count tarantool_lua_ibuf uuid_nil diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8ab09e968..f489c88cf 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -110,6 +110,7 @@ target_link_libraries(core ${LIBEIO_LIBRARIES} ${LIBCORO_LIBRARIES} ${MSGPUCK_LIBRARIES} + ${ICU_LIBRARIES} ) add_library(stat STATIC rmean.c latency.c histogram.c) diff --git a/src/lua/string.lua b/src/lua/string.lua index 1c7226143..6c566cb54 100644 --- a/src/lua/string.lua +++ b/src/lua/string.lua @@ -29,6 +29,9 @@ ffi.cdef[[ const char * u_errorName(UErrorCode code); + + int + u_count(const char *s, int bsize, uint8_t flags); ]] local c_char_ptr = ffi.typeof('const char *') @@ -452,6 +455,54 @@ local function string_u_lower(inp, opts) return string_u_to_case_impl(inp, opts, ffi.C.ucasemap_utf8ToLower, usage) end +local U_COUNT_CLASS_ALL = 0 +local U_COUNT_CLASS_UPPER_LETTER = 1 +local U_COUNT_CLASS_LOWER_LETTER = 2 +local U_COUNT_CLASS_DIGIT = 4 + +-- +-- Calculate count of symbols matching the needed classes. +-- @param inp Input UTF8 string. +-- @param opts Options with needed classes. It supports 'all', +-- 'upper', 'lower', 'digit'. Opts is a table, where needed +-- class key is set to true. By default all classes are +-- needed, and count works like strlen (not bsize, like Lua +-- operator '#'). +-- @retval not nil Summary count of needed symbols. +-- @retval nil, position Invalid UTF8 on returned position. 
+-- +local function string_u_count(inp, opts) + local usage = 'Usage: string.u_count(str)' + if type(inp) ~= 'string' then + error(usage) + end + local flags = 0 + if opts then + if type(opts) ~= 'table' then + error(usage) + end + if not opts.all then + if opts.upper then + flags = bit.bor(flags, U_COUNT_CLASS_UPPER_LETTER) + end + if opts.lower then + flags = bit.bor(flags, U_COUNT_CLASS_LOWER_LETTER) + end + if opts.digit then + flags = bit.bor(flags, U_COUNT_CLASS_DIGIT) + end + end + end + local len = #inp + inp = c_char_ptr(inp) + local ret = ffi.C.u_count(inp, len, flags) + if ret >= 0 then + return ret + else + return nil, -ret + end +end + -- It'll automatically set string methods, too. local string = require('string') string.split = string_split @@ -466,3 +517,4 @@ string.lstrip = string_lstrip string.rstrip = string_rstrip string.u_upper = string_u_upper string.u_lower = string_u_lower +string.u_count = string_u_count diff --git a/src/util.c b/src/util.c index 9458695b9..c117dee05 100644 --- a/src/util.c +++ b/src/util.c @@ -40,7 +40,8 @@ #include #include #include - +#include +#include #include /* mp_char2escape[] table */ #include "say.h" @@ -321,3 +322,48 @@ fpconv_check() */ assert(buf[1] == '.'); } + +enum u_count_class { + U_COUNT_CLASS_ALL = 0, + U_COUNT_CLASS_UPPER_LETTER = 1, + U_COUNT_CLASS_LOWER_LETTER = 2, + U_COUNT_CLASS_DIGIT = 4, +}; + +/** + * Get length of a UTF8 string. + * @param s UTF8 string. + * @param bsize Binary size of @an s. + * @param flags Binary OR of u_count_class flags. + * @retval >=0 Count of symbols matched one of @a flags. + * @retval <0 Invalid UTF8 on the position -1 * returned value. + */ +int +u_count(const char *s, int bsize, uint8_t flags) +{ + int offset = 0; + int len = 0; + UChar32 c; + if (flags == 0) { + /* Fast path - just calculate strlen. 
*/ + while (offset < bsize) { + U8_NEXT(s, offset, bsize, c); + if (c == U_SENTINEL) + return -(len + 1); + ++len; + } + return len; + } + /* Slow path - must check each symbol to match flags. */ + while (offset < bsize) { + U8_NEXT(s, offset, bsize, c); + if (c == U_SENTINEL) + return -(len + 1); + uint8_t f = 0; + f |= (flags & U_COUNT_CLASS_UPPER_LETTER) != 0 && u_isupper(c); + f |= (flags & U_COUNT_CLASS_LOWER_LETTER) != 0 && u_islower(c); + f |= (flags & U_COUNT_CLASS_DIGIT) != 0 && u_isdigit(c); + len += f != 0 ? 1 : 0; + } + return len; +} diff --git a/test/app-tap/string.test.lua b/test/app-tap/string.test.lua index 004e149e9..650a5982d 100755 --- a/test/app-tap/string.test.lua +++ b/test/app-tap/string.test.lua @@ -115,7 +115,7 @@ test:test("hex", function(test) end) test:test("unicode", function(test) - test:plan(12) + test:plan(24) local str = 'хеЛлоу вОрЛд ё Ё я Я э Э ъ Ъ hElLo WorLd 1234 i I İ 勺#☢༺' local upper_res = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 I I İ 勺#☢༺' local upper_turkish = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 İ I İ 勺#☢༺' @@ -144,6 +144,26 @@ test:test("unicode", function(test) s, err = string.u_lower(str, {locale = 'not_existing locale tratatatata'}) test:is(s, lower_res, 'incorrect locale turns into default lower') test:isnil(err, 'lower error is nil') + + -- Test u_count. + test:is(string.u_count(str), 56, 'u_count works') + s, err = string.u_count("\xE2\x80\xE2") + test:is(err, 1, 'u_count checks for errors') + test:isnil(s, 'retval is nil on error') + test:is(string.u_count(''), 0, 'u_count works on empty strings') + s, err = pcall(string.u_count, 100) + test:isnt(err:find('Usage'), nil, 'usage is checked') + -- Test different symbol classes. 
+ s, err = pcall(string.u_count, str, 1234) + test:isnt(err:find('Usage'), nil, 'usage checks options') + test:is(string.u_count(str, {all = true}), 56, 'option all') + test:is(string.u_count(str, {upper = true}), 13, 'option upper') + test:is(string.u_count(str, {lower = true}), 19, 'option lower') + test:is(string.u_count(str, {upper = true, lower = true}), 32, + 'options upper and lower') + test:is(string.u_count(str, {digit = true}), 4, 'option digit') + test:is(string.u_count(str, {digit = true, upper = true}), 17, + 'options digit and upper') end) test:test("strip", function(test) -- 2.15.1 (Apple Git-101)