From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id 40E3921929 for ; Thu, 26 Apr 2018 06:36:57 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id G5gD3HcHChfr for ; Thu, 26 Apr 2018 06:36:57 -0400 (EDT) Received: from smtp51.i.mail.ru (smtp51.i.mail.ru [94.100.177.111]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 46C8721912 for ; Thu, 26 Apr 2018 06:36:55 -0400 (EDT) Subject: [tarantool-patches] Re: [PATCH 2/7] lua: implement string.u_count From: Vladislav Shpilevoy References: Message-ID: <9325cedb-2343-a6a9-c553-92bbeea2d4cf@tarantool.org> Date: Thu, 26 Apr 2018 13:36:52 +0300 MIME-Version: 1.0 In-Reply-To: Content-Type: text/plain; charset="utf-8"; format="flowed" Content-Language: en-US Content-Transfer-Encoding: 8bit Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: tarantool-patches@freelists.org Cc: kostja@tarantool.org, Alexander Turenko On the branch I added two new options - 'title' and 'letter'. Unicode has symbols that are neither upper nor lower case - they are title case. These are two-character symbols like Dž. 'letter' is just the OR of 'upper', 'lower' and 'title'. On 26/04/2018 02:29, Vladislav Shpilevoy wrote: > Lua can not calculate length of a unicode string correctly. But > Tarantool has ICU on board - let's use it to calculate length. > > u_count has options that allow counting only symbols of a > specific class, for example, only capital letters, or digits. 
> Options can be combined. > > Closes #3081 > --- > extra/exports | 1 + > src/CMakeLists.txt | 1 + > src/lua/string.lua | 52 ++++++++++++++++++++++++++++++++++++++++++++ > src/util.c | 48 +++++++++++++++++++++++++++++++++++++++- > test/app-tap/string.test.lua | 22 ++++++++++++++++++- > 5 files changed, 122 insertions(+), 2 deletions(-) > > diff --git a/extra/exports b/extra/exports > index a274bb23b..b0480fe79 100644 > --- a/extra/exports > +++ b/extra/exports > @@ -40,6 +40,7 @@ title_set_status > title_get_status > exception_get_string > exception_get_int > +u_count > > tarantool_lua_ibuf > uuid_nil > diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt > index 8ab09e968..f489c88cf 100644 > --- a/src/CMakeLists.txt > +++ b/src/CMakeLists.txt > @@ -110,6 +110,7 @@ target_link_libraries(core > ${LIBEIO_LIBRARIES} > ${LIBCORO_LIBRARIES} > ${MSGPUCK_LIBRARIES} > + ${ICU_LIBRARIES} > ) > > add_library(stat STATIC rmean.c latency.c histogram.c) > diff --git a/src/lua/string.lua b/src/lua/string.lua > index 1c7226143..6c566cb54 100644 > --- a/src/lua/string.lua > +++ b/src/lua/string.lua > @@ -29,6 +29,9 @@ ffi.cdef[[ > > const char * > u_errorName(UErrorCode code); > + > + int > + u_count(const char *s, int bsize, uint8_t flags); > ]] > > local c_char_ptr = ffi.typeof('const char *') > @@ -452,6 +455,54 @@ local function string_u_lower(inp, opts) > return string_u_to_case_impl(inp, opts, ffi.C.ucasemap_utf8ToLower, usage) > end > > +local U_COUNT_CLASS_ALL = 0 > +local U_COUNT_CLASS_UPPER_LETTER = 1 > +local U_COUNT_CLASS_LOWER_LETTER = 2 > +local U_COUNT_CLASS_DIGIT = 4 > + > +-- > +-- Calculate count of symbols matching the needed classes. > +-- @param inp Input UTF8 string. > +-- @param opts Options with needed classes. It supports 'all', > +-- 'upper', 'lower', 'digit'. Opts is a table, where needed > +-- class key is set to true. By default all classes are > +-- needed, and count works like strlen (not bsize, like Lua > +-- operator '#'). 
> +-- @retval not nil Summary count of needed symbols. > +-- @retval nil, position Invalid UTF8 on returned position. > +-- > +local function string_u_count(inp, opts) > + local usage = 'Usage: string.u_count(str)' > + if type(inp) ~= 'string' then > + error(usage) > + end > + local flags = 0 > + if opts then > + if type(opts) ~= 'table' then > + error(usage) > + end > + if not opts.all then > + if opts.upper then > + flags = bit.bor(flags, U_COUNT_CLASS_UPPER_LETTER) > + end > + if opts.lower then > + flags = bit.bor(flags, U_COUNT_CLASS_LOWER_LETTER) > + end > + if opts.digit then > + flags = bit.bor(flags, U_COUNT_CLASS_DIGIT) > + end > + end > + end > + local len = #inp > + inp = c_char_ptr(inp) > + local ret = ffi.C.u_count(inp, len, flags) > + if ret >= 0 then > + return ret > + else > + return nil, -ret > + end > +end > + > -- It'll automatically set string methods, too. > local string = require('string') > string.split = string_split > @@ -466,3 +517,4 @@ string.lstrip = string_lstrip > string.rstrip = string_rstrip > string.u_upper = string_u_upper > string.u_lower = string_u_lower > +string.u_count = string_u_count > diff --git a/src/util.c b/src/util.c > index 9458695b9..c117dee05 100644 > --- a/src/util.c > +++ b/src/util.c > @@ -40,7 +40,8 @@ > #include > #include > #include > - > +#include > +#include > #include /* mp_char2escape[] table */ > > #include "say.h" > @@ -321,3 +322,48 @@ fpconv_check() > */ > assert(buf[1] == '.'); > } > + > +enum u_count_class { > + U_COUNT_CLASS_ALL = 0, > + U_COUNT_CLASS_UPPER_LETTER = 1, > + U_COUNT_CLASS_LOWER_LETTER = 2, > + U_COUNT_CLASS_DIGIT = 4, > +}; > + > +/** > + * Get length of a UTF8 string. > + * @param s UTF8 string. > + * @param bsize Binary size of @an s. > + * @param flags Binary OR of u_count_class flags. > + * @retval >=0 Count of symbols matched one of @a flags. > + * @retval <0 Invalid UTF8 on the position -1 * returned value. 
> + */ > +int > +u_count(const char *s, int bsize, uint8_t flags) > +{ > + int offset = 0; > + int len = 0; > + UChar32 c; > + if (flags == 0) { > + /* Fast path - just calculate strlen. */ > + while (offset < bsize) { > + U8_NEXT(s, offset, bsize, c); > + if (c == U_SENTINEL) > + return -(len + 1); > + ++len; > + } > + return len; > + } > + /* Slow path - must check each symbol to match flags. */ > + while (offset < bsize) { > + U8_NEXT(s, offset, bsize, c); > + if (c == U_SENTINEL) > + return -(len + 1); > + uint8_t f = 0; > + f |= (flags & U_COUNT_CLASS_UPPER_LETTER) != 0 && u_isupper(c); > + f |= (flags & U_COUNT_CLASS_LOWER_LETTER) != 0 && u_islower(c); > + f |= (flags & U_COUNT_CLASS_DIGIT) != 0 && u_isdigit(c); > + len += f != 0 ? 1 : 0; > + } > + return len; > +} > diff --git a/test/app-tap/string.test.lua b/test/app-tap/string.test.lua > index 004e149e9..650a5982d 100755 > --- a/test/app-tap/string.test.lua > +++ b/test/app-tap/string.test.lua > @@ -115,7 +115,7 @@ test:test("hex", function(test) > end) > > test:test("unicode", function(test) > - test:plan(12) > + test:plan(24) > local str = 'хеЛлоу вОрЛд ё Ё я Я э Э ъ Ъ hElLo WorLd 1234 i I İ 勺#☢༺' > local upper_res = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 I I İ 勺#☢༺' > local upper_turkish = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 İ I İ 勺#☢༺' > @@ -144,6 +144,26 @@ test:test("unicode", function(test) > s, err = string.u_lower(str, {locale = 'not_existing locale tratatatata'}) > test:is(s, lower_res, 'incorrect locale turns into default lower') > test:isnil(err, 'lower error is nil') > + > + -- Test u_count. 
> + test:is(string.u_count(str), 56, 'u_count works') > + s, err = string.u_count("\xE2\x80\xE2") > + test:is(err, 1, 'u_count checks for errors') > + test:isnil(s, 'retval is nil on error') > + test:is(string.u_count(''), 0, 'u_count works on empty strings') > + s, err = pcall(string.u_count, 100) > + test:isnt(err:find('Usage'), nil, 'usage is checked') > + -- Test different symbol classes. > + s, err = pcall(string.u_count, str, 1234) > + test:isnt(err:find('Usage'), nil, 'usage checks options') > + test:is(string.u_count(str, {all = true}), 56, 'option all') > + test:is(string.u_count(str, {upper = true}), 13, 'option upper') > + test:is(string.u_count(str, {lower = true}), 19, 'option lower') > + test:is(string.u_count(str, {upper = true, lower = true}), 32, > + 'options upper and lower') > + test:is(string.u_count(str, {digit = true}), 4, 'option digit') > + test:is(string.u_count(str, {digit = true, upper = true}), 17, > + 'options digit and upper') > end) > > test:test("strip", function(test) >