From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
To: tarantool-patches@freelists.org
Cc: kostja@tarantool.org
Subject: [tarantool-patches] [PATCH 2/7] lua: implement string.u_count
Date: Thu, 26 Apr 2018 02:29:02 +0300 [thread overview]
Message-ID: <a89c3b4d0946d32673fa3a6a4570a3002de4f81b.1524698920.git.v.shpilevoy@tarantool.org> (raw)
In-Reply-To: <cover.1524698920.git.v.shpilevoy@tarantool.org>
In-Reply-To: <cover.1524698920.git.v.shpilevoy@tarantool.org>
Lua cannot calculate the length of a Unicode string correctly. But
Tarantool has ICU on board - let's use it to calculate the length.
u_count has options that allow counting only symbols of a
specific class, for example, only capital letters or digits.
Options can be combined.
Closes #3081
---
extra/exports | 1 +
src/CMakeLists.txt | 1 +
src/lua/string.lua | 52 ++++++++++++++++++++++++++++++++++++++++++++
src/util.c | 48 +++++++++++++++++++++++++++++++++++++++-
test/app-tap/string.test.lua | 22 ++++++++++++++++++-
5 files changed, 122 insertions(+), 2 deletions(-)
diff --git a/extra/exports b/extra/exports
index a274bb23b..b0480fe79 100644
--- a/extra/exports
+++ b/extra/exports
@@ -40,6 +40,7 @@ title_set_status
title_get_status
exception_get_string
exception_get_int
+u_count
tarantool_lua_ibuf
uuid_nil
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8ab09e968..f489c88cf 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -110,6 +110,7 @@ target_link_libraries(core
${LIBEIO_LIBRARIES}
${LIBCORO_LIBRARIES}
${MSGPUCK_LIBRARIES}
+ ${ICU_LIBRARIES}
)
add_library(stat STATIC rmean.c latency.c histogram.c)
diff --git a/src/lua/string.lua b/src/lua/string.lua
index 1c7226143..6c566cb54 100644
--- a/src/lua/string.lua
+++ b/src/lua/string.lua
@@ -29,6 +29,9 @@ ffi.cdef[[
const char *
u_errorName(UErrorCode code);
+
+ int
+ u_count(const char *s, int bsize, uint8_t flags);
]]
local c_char_ptr = ffi.typeof('const char *')
@@ -452,6 +455,54 @@ local function string_u_lower(inp, opts)
return string_u_to_case_impl(inp, opts, ffi.C.ucasemap_utf8ToLower, usage)
end
+local U_COUNT_CLASS_ALL = 0 -- No class filter: count every symbol.
+local U_COUNT_CLASS_UPPER_LETTER = 1 -- Bit flag: set by opts.upper.
+local U_COUNT_CLASS_LOWER_LETTER = 2 -- Bit flag: set by opts.lower.
+local U_COUNT_CLASS_DIGIT = 4 -- Bit flag: set by opts.digit.
+
+--
+-- Calculate count of symbols matching the needed classes.
+-- @param inp Input UTF8 string.
+-- @param opts Optional table of needed classes: 'all', 'upper',
+--        'lower', 'digit'. A class is needed when its key is set
+--        to true. By default all classes are needed, and count
+--        works like strlen (not bsize, like Lua operator '#').
+-- @retval not nil Total count of needed symbols.
+-- @retval nil, position Invalid UTF8, position is the value
+--         reported by the C u_count() for the broken symbol.
+--
+local function string_u_count(inp, opts)
+    local usage = 'Usage: string.u_count(str, [opts])'
+    if type(inp) ~= 'string' then
+        error(usage)
+    end
+    local flags = U_COUNT_CLASS_ALL
+    if opts then
+        if type(opts) ~= 'table' then
+            error(usage)
+        end
+        if not opts.all then
+            if opts.upper then
+                flags = bit.bor(flags, U_COUNT_CLASS_UPPER_LETTER)
+            end
+            if opts.lower then
+                flags = bit.bor(flags, U_COUNT_CLASS_LOWER_LETTER)
+            end
+            if opts.digit then
+                flags = bit.bor(flags, U_COUNT_CLASS_DIGIT)
+            end
+        end
+    end
+    -- Pass the string itself: FFI converts it to const char * and
+    -- it stays referenced as a call argument during the call.
+    local ret = ffi.C.u_count(inp, #inp, flags)
+    -- Negative result is a negated position of a bad symbol.
+    if ret < 0 then
+        return nil, -ret
+    end
+    return ret
+end
+
-- It'll automatically set string methods, too.
local string = require('string')
string.split = string_split
@@ -466,3 +517,4 @@ string.lstrip = string_lstrip
string.rstrip = string_rstrip
string.u_upper = string_u_upper
string.u_lower = string_u_lower
+string.u_count = string_u_count
diff --git a/src/util.c b/src/util.c
index 9458695b9..c117dee05 100644
--- a/src/util.c
+++ b/src/util.c
@@ -40,7 +40,8 @@
#include <time.h>
#include <unistd.h>
#include <limits.h>
-
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
#include <msgpuck/msgpuck.h> /* mp_char2escape[] table */
#include "say.h"
@@ -321,3 +322,48 @@ fpconv_check()
*/
assert(buf[1] == '.');
}
+
+enum u_count_class {
+ U_COUNT_CLASS_ALL = 0, /* No flags: u_count() counts each symbol. */
+ U_COUNT_CLASS_UPPER_LETTER = 1, /* Symbols passing u_isupper(). */
+ U_COUNT_CLASS_LOWER_LETTER = 2, /* Symbols passing u_islower(). */
+ U_COUNT_CLASS_DIGIT = 4, /* Symbols passing u_isdigit(). */
+};
+
+/**
+ * Count symbols of a UTF8 string matching the given classes.
+ * @param s UTF8 string.
+ * @param bsize Binary size of @a s.
+ * @param flags Binary OR of u_count_class flags, 0 - count all.
+ * @retval >=0 Count of symbols matched one of @a flags.
+ * @retval <0 Invalid UTF8, -retval is 1-based symbol position.
+ */
+int
+u_count(const char *s, int bsize, uint8_t flags)
+{
+	int offset = 0;
+	int len = 0;
+	UChar32 c;
+	if (flags == U_COUNT_CLASS_ALL) {
+		/* Fast path - len is also a symbol position here. */
+		while (offset < bsize) {
+			U8_NEXT(s, offset, bsize, c);
+			if (c < 0)
+				return -(len + 1);
+			++len;
+		}
+		return len;
+	}
+	/* Slow path - len counts matches only, so track pos apart. */
+	for (int pos = 1; offset < bsize; ++pos) {
+		U8_NEXT(s, offset, bsize, c);
+		if (c < 0)
+			return -pos;
+		uint8_t f = 0;
+		f |= (flags & U_COUNT_CLASS_UPPER_LETTER) != 0 && u_isupper(c);
+		f |= (flags & U_COUNT_CLASS_LOWER_LETTER) != 0 && u_islower(c);
+		f |= (flags & U_COUNT_CLASS_DIGIT) != 0 && u_isdigit(c);
+		len += f != 0 ? 1 : 0;
+	}
+	return len;
+}
diff --git a/test/app-tap/string.test.lua b/test/app-tap/string.test.lua
index 004e149e9..650a5982d 100755
--- a/test/app-tap/string.test.lua
+++ b/test/app-tap/string.test.lua
@@ -115,7 +115,7 @@ test:test("hex", function(test)
end)
test:test("unicode", function(test)
- test:plan(12)
+ test:plan(24)
local str = 'хеЛлоу вОрЛд ё Ё я Я э Э ъ Ъ hElLo WorLd 1234 i I İ 勺#☢༺'
local upper_res = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 I I İ 勺#☢༺'
local upper_turkish = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 İ I İ 勺#☢༺'
@@ -144,6 +144,26 @@ test:test("unicode", function(test)
s, err = string.u_lower(str, {locale = 'not_existing locale tratatatata'})
test:is(s, lower_res, 'incorrect locale turns into default lower')
test:isnil(err, 'lower error is nil')
+
+ -- Test u_count.
+ test:is(string.u_count(str), 56, 'u_count works')
+ s, err = string.u_count("\xE2\x80\xE2")
+ test:is(err, 1, 'u_count checks for errors')
+ test:isnil(s, 'retval is nil on error')
+ test:is(string.u_count(''), 0, 'u_count works on empty strings')
+ s, err = pcall(string.u_count, 100)
+ test:isnt(err:find('Usage'), nil, 'usage is checked')
+ -- Test different symbol classes.
+ s, err = pcall(string.u_count, str, 1234)
+ test:isnt(err:find('Usage'), nil, 'usage checks options')
+ test:is(string.u_count(str, {all = true}), 56, 'option all')
+ test:is(string.u_count(str, {upper = true}), 13, 'option upper')
+ test:is(string.u_count(str, {lower = true}), 19, 'option lower')
+ test:is(string.u_count(str, {upper = true, lower = true}), 32,
+ 'options upper and lower')
+ test:is(string.u_count(str, {digit = true}), 4, 'option digit')
+ test:is(string.u_count(str, {digit = true, upper = true}), 17,
+ 'options digit and upper')
end)
test:test("strip", function(test)
--
2.15.1 (Apple Git-101)
next prev parent reply other threads:[~2018-04-25 23:29 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-04-25 23:29 [tarantool-patches] [PATCH 0/7] Expose ICU into Lua Vladislav Shpilevoy
2018-04-25 23:29 ` [tarantool-patches] [PATCH 1/7] lua: expose ICU upper/lower functions to Lua Vladislav Shpilevoy
2018-04-28 0:56 ` [tarantool-patches] " Alexander Turenko
2018-04-25 23:29 ` Vladislav Shpilevoy [this message]
2018-04-26 10:36 ` [tarantool-patches] Re: [PATCH 2/7] lua: implement string.u_count Vladislav Shpilevoy
2018-04-26 16:07 ` Vladislav Shpilevoy
2018-04-26 23:57 ` Vladislav Shpilevoy
2018-04-28 1:10 ` Alexander Turenko
2018-04-25 23:29 ` [tarantool-patches] [PATCH 3/7] alter: fix assertion in collations alter Vladislav Shpilevoy
2018-04-25 23:29 ` [tarantool-patches] [PATCH 4/7] Move struct on_access_denied_ctx into error.h Vladislav Shpilevoy
2018-04-25 23:29 ` [tarantool-patches] [PATCH 5/7] Merge box_error, stat and collations into core library Vladislav Shpilevoy
2018-04-25 23:29 ` [tarantool-patches] [PATCH 6/7] Always store built-in collations in the cache Vladislav Shpilevoy
2018-04-25 23:29 ` [tarantool-patches] [PATCH 7/7] lua: expose u_compare/u_icompare into Lua Vladislav Shpilevoy
2018-04-28 1:55 ` [tarantool-patches] Re: [PATCH 0/7] Expose ICU " Alexander Turenko
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=a89c3b4d0946d32673fa3a6a4570a3002de4f81b.1524698920.git.v.shpilevoy@tarantool.org \
--to=v.shpilevoy@tarantool.org \
--cc=kostja@tarantool.org \
--cc=tarantool-patches@freelists.org \
--subject='Re: [tarantool-patches] [PATCH 2/7] lua: implement string.u_count' \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox