[tarantool-patches] Re: [PATCH 2/7] lua: implement string.u_count

Thu Apr 26 13:36:52 MSK 2018

On the branch I added two new options - 'title' and 'letter'. Unicode
has symbols that are neither upper nor lower case - they are title
case. These are two-character symbols like ǅ.

'letter' is just the OR of 'upper', 'lower' and 'title'.

On 26/04/2018 02:29, Vladislav Shpilevoy wrote:
> Lua cannot calculate the length of a Unicode string correctly. But
> Tarantool has ICU on board - let's use it to calculate the length.
> 
> u_count has options that allow counting only symbols of a
> specific class, for example, only capital letters or digits.
> Options can be combined.
> 
> Closes #3081
> ---
>   extra/exports                |  1 +
>   src/CMakeLists.txt           |  1 +
>   src/lua/string.lua           | 52 ++++++++++++++++++++++++++++++++++++++++++++
>   src/util.c                   | 48 +++++++++++++++++++++++++++++++++++++++-
>   test/app-tap/string.test.lua | 22 ++++++++++++++++++-
>   5 files changed, 122 insertions(+), 2 deletions(-)
> 
> diff --git a/extra/exports b/extra/exports
> index a274bb23b..b0480fe79 100644
> --- a/extra/exports
> +++ b/extra/exports
> @@ -40,6 +40,7 @@ title_set_status
>   title_get_status
>   exception_get_string
>   exception_get_int
> +u_count
>   
>   tarantool_lua_ibuf
>   uuid_nil
> diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
> index 8ab09e968..f489c88cf 100644
> --- a/src/CMakeLists.txt
> +++ b/src/CMakeLists.txt
> @@ -110,6 +110,7 @@ target_link_libraries(core
>       ${LIBEIO_LIBRARIES}
>       ${LIBCORO_LIBRARIES}
>       ${MSGPUCK_LIBRARIES}
> +    ${ICU_LIBRARIES}
>   )
>   
>   add_library(stat STATIC rmean.c latency.c histogram.c)
> diff --git a/src/lua/string.lua b/src/lua/string.lua
> index 1c7226143..6c566cb54 100644
> --- a/src/lua/string.lua
> +++ b/src/lua/string.lua
> @@ -29,6 +29,9 @@ ffi.cdef[[
>   
>       const char *
>       u_errorName(UErrorCode code);
> +
> +    int
> +    u_count(const char *s, int bsize, uint8_t flags);
>   ]]
>   
>   local c_char_ptr = ffi.typeof('const char *')
> @@ -452,6 +455,54 @@ local function string_u_lower(inp, opts)
>       return string_u_to_case_impl(inp, opts, ffi.C.ucasemap_utf8ToLower, usage)
>   end
>   
> +local U_COUNT_CLASS_ALL = 0 -- NOTE(review): never referenced below.
> +local U_COUNT_CLASS_UPPER_LETTER = 1 -- Same value as in src/util.c.
> +local U_COUNT_CLASS_LOWER_LETTER = 2 -- Same value as in src/util.c.
> +local U_COUNT_CLASS_DIGIT = 4 -- Same value as in src/util.c.
> +
> +--
> +-- Count the symbols of a UTF8 string matching the needed classes.
> +-- @param inp Input UTF8 string.
> +-- @param opts Optional table of classes, each enabled by a true
> +--        value: 'all', 'upper', 'lower', 'digit'. Set classes are
> +--        OR-ed together; 'all' overrides the others. Without
> +--        classes every symbol is counted, like strlen (and not
> +--        the byte size, which is what Lua operator '#' returns).
> +-- @retval not nil Total count of matching symbols.
> +-- @retval nil, position Invalid UTF8 at the returned position.
> +--
> +local function string_u_count(inp, opts)
> +    local usage = 'Usage: string.u_count(str)' -- NOTE(review): omits opts.
> +    if type(inp) ~= 'string' then
> +        error(usage)
> +    end
> +    local flags = 0 -- Stays 0 (count everything) unless a class is set.
> +    if opts then
> +        if type(opts) ~= 'table' then
> +            error(usage)
> +        end
> +        if not opts.all then -- 'all' wins: flags is left at 0.
> +            if opts.upper then
> +                flags = bit.bor(flags, U_COUNT_CLASS_UPPER_LETTER)
> +            end
> +            if opts.lower then
> +                flags = bit.bor(flags, U_COUNT_CLASS_LOWER_LETTER)
> +            end
> +            if opts.digit then
> +                flags = bit.bor(flags, U_COUNT_CLASS_DIGIT)
> +            end
> +        end
> +    end
> +    local len = #inp -- Byte size, taken while inp is still a Lua string.
> +    inp = c_char_ptr(inp)
> +    local ret = ffi.C.u_count(inp, len, flags)
> +    if ret >= 0 then
> +        return ret
> +    else
> +        return nil, -ret -- A negative result is negated into 'position'.
> +    end
> +end
> +
>   -- It'll automatically set string methods, too.
>   local string = require('string')
>   string.split      = string_split
> @@ -466,3 +517,4 @@ string.lstrip      = string_lstrip
>   string.rstrip      = string_rstrip
>   string.u_upper    = string_u_upper
>   string.u_lower    = string_u_lower
> +string.u_count    = string_u_count
> diff --git a/src/util.c b/src/util.c
> index 9458695b9..c117dee05 100644
> --- a/src/util.c
> +++ b/src/util.c
> @@ -40,7 +40,8 @@
>   #include <time.h>
>   #include <unistd.h>
>   #include <limits.h>
> -
> +#include <unicode/utf8.h>
> +#include <unicode/uchar.h>
>   #include <msgpuck/msgpuck.h> /* mp_char2escape[] table */
>   
>   #include "say.h"
> @@ -321,3 +322,48 @@ fpconv_check()
>   	 */
>   	assert(buf[1] == '.');
>   }
> +
> +enum u_count_class {
> +	U_COUNT_CLASS_ALL = 0, /* No filter, every symbol is counted. */
> +	U_COUNT_CLASS_UPPER_LETTER = 1, /* Symbols passing u_isupper(). */
> +	U_COUNT_CLASS_LOWER_LETTER = 2, /* Symbols passing u_islower(). */
> +	U_COUNT_CLASS_DIGIT = 4, /* Symbols passing u_isdigit(). */
> +};
> +
> +/**
> + * Count symbols of a UTF8 string: all of them or only given classes.
> + * @param s UTF8 string.
> + * @param bsize Binary size of @a s.
> + * @param flags Binary OR of u_count_class flags.
> + * @retval >=0 Count of symbols matched one of @a flags.
> + * @retval  <0 Invalid UTF8 on the position -1 * returned value.
> + */
> +int
> +u_count(const char *s, int bsize, uint8_t flags)
> +{
> +	int offset = 0; /* Byte offset into s, advanced by U8_NEXT. */
> +	int len = 0; /* Number of symbols counted so far. */
> +	UChar32 c;
> +	if (flags == 0) {
> +		/* Fast path - just calculate strlen. */
> +		while (offset < bsize) {
> +			U8_NEXT(s, offset, bsize, c);
> +			if (c == U_SENTINEL)
> +				return -(len + 1); /* 1-based symbol index. */
> +			++len;
> +		}
> +		return len;
> +	}
> +	/* Slow path - must check each symbol to match flags. */
> +	while (offset < bsize) {
> +		U8_NEXT(s, offset, bsize, c);
> +		if (c == U_SENTINEL)
> +			return -(len + 1); /* NOTE(review): see below. */
> +		uint8_t f = 0; /* Becomes 1 if c is in a requested class. */
> +		f |= (flags & U_COUNT_CLASS_UPPER_LETTER) != 0 && u_isupper(c);
> +		f |= (flags & U_COUNT_CLASS_LOWER_LETTER) != 0 && u_islower(c);
> +		f |= (flags & U_COUNT_CLASS_DIGIT) != 0 && u_isdigit(c);
> +		len += f != 0 ? 1 : 0; /* NOTE(review): len counts matches only, so the error return above is not a position. */
> +	}
> +	return len;
> +}
> diff --git a/test/app-tap/string.test.lua b/test/app-tap/string.test.lua
> index 004e149e9..650a5982d 100755
> --- a/test/app-tap/string.test.lua
> +++ b/test/app-tap/string.test.lua
> @@ -115,7 +115,7 @@ test:test("hex", function(test)
>   end)
>   
>   test:test("unicode", function(test)
> -    test:plan(12)
> +    test:plan(24)
>       local str = 'хеЛлоу вОрЛд ё Ё я Я э Э ъ Ъ hElLo WorLd 1234 i I İ 勺#☢༺'
>       local upper_res = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 I I İ 勺#☢༺'
>       local upper_turkish = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 İ I İ 勺#☢༺'
> @@ -144,6 +144,26 @@ test:test("unicode", function(test)
>       s, err = string.u_lower(str, {locale = 'not_existing locale tratatatata'})
>       test:is(s, lower_res, 'incorrect locale turns into default lower')
>       test:isnil(err, 'lower error is nil')
> +
> +    -- Test u_count.
> +    test:is(string.u_count(str), 56, 'u_count works')
> +    s, err = string.u_count("\xE2\x80\xE2")
> +    test:is(err, 1, 'u_count checks for errors')
> +    test:isnil(s, 'retval is nil on error')
> +    test:is(string.u_count(''), 0, 'u_count works on empty strings')
> +    s, err = pcall(string.u_count, 100)
> +    test:isnt(err:find('Usage'), nil, 'usage is checked')
> +    -- Test different symbol classes.
> +    s, err = pcall(string.u_count, str, 1234)
> +    test:isnt(err:find('Usage'), nil, 'usage checks options')
> +    test:is(string.u_count(str, {all = true}), 56, 'option all')
> +    test:is(string.u_count(str, {upper = true}), 13, 'option upper')
> +    test:is(string.u_count(str, {lower = true}), 19, 'option lower')
> +    test:is(string.u_count(str, {upper = true, lower = true}), 32,
> +            'options upper and lower')
> +    test:is(string.u_count(str, {digit = true}), 4, 'option digit')
> +    test:is(string.u_count(str, {digit = true, upper = true}), 17,
> +            'options digit and upper')
>   end)
>   
>   test:test("strip", function(test)
>