[tarantool-patches] [PATCH 2/7] lua: implement string.u_count

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Thu Apr 26 02:29:02 MSK 2018


Lua cannot calculate the length of a Unicode string correctly. But
Tarantool has ICU on board - let's use it to calculate the length.

u_count has options that allow counting only symbols of a
specific class, for example, only capital letters or digits.
Options can be combined.

Closes #3081
---
 extra/exports                |  1 +
 src/CMakeLists.txt           |  1 +
 src/lua/string.lua           | 52 ++++++++++++++++++++++++++++++++++++++++++++
 src/util.c                   | 48 +++++++++++++++++++++++++++++++++++++++-
 test/app-tap/string.test.lua | 22 ++++++++++++++++++-
 5 files changed, 122 insertions(+), 2 deletions(-)

diff --git a/extra/exports b/extra/exports
index a274bb23b..b0480fe79 100644
--- a/extra/exports
+++ b/extra/exports
@@ -40,6 +40,7 @@ title_set_status
 title_get_status
 exception_get_string
 exception_get_int
+u_count
 
 tarantool_lua_ibuf
 uuid_nil
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8ab09e968..f489c88cf 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -110,6 +110,7 @@ target_link_libraries(core
     ${LIBEIO_LIBRARIES}
     ${LIBCORO_LIBRARIES}
     ${MSGPUCK_LIBRARIES}
+    ${ICU_LIBRARIES}
 )
 
 add_library(stat STATIC rmean.c latency.c histogram.c)
diff --git a/src/lua/string.lua b/src/lua/string.lua
index 1c7226143..6c566cb54 100644
--- a/src/lua/string.lua
+++ b/src/lua/string.lua
@@ -29,6 +29,9 @@ ffi.cdef[[
 
     const char *
     u_errorName(UErrorCode code);
+
+    int
+    u_count(const char *s, int bsize, uint8_t flags);
 ]]
 
 local c_char_ptr = ffi.typeof('const char *')
@@ -452,6 +455,54 @@ local function string_u_lower(inp, opts)
     return string_u_to_case_impl(inp, opts, ffi.C.ucasemap_utf8ToLower, usage)
 end
 
+local U_COUNT_CLASS_ALL = 0 -- No filter. Mirrors enum in util.c.
+local U_COUNT_CLASS_UPPER_LETTER = 1 -- Count upper case letters.
+local U_COUNT_CLASS_LOWER_LETTER = 2 -- Count lower case letters.
+local U_COUNT_CLASS_DIGIT = 4 -- Count digits.
+
+--
+-- Calculate count of symbols matching the needed classes.
+-- @param inp Input UTF8 string.
+-- @param opts Optional table with needed classes. It supports
+--        'all', 'upper', 'lower', 'digit'; a class is needed if
+--        its key is set. Classes are combined. By default, or
+--        with 'all', every symbol is counted, and count works
+--        like strlen (not bsize, like Lua operator '#').
+-- @retval not nil Summary count of needed symbols.
+-- @retval nil, position Invalid UTF8 on returned position.
+--
+local function string_u_count(inp, opts)
+    local usage = 'Usage: string.u_count(str, [opts])'
+    if type(inp) ~= 'string' then
+        error(usage)
+    end
+    if opts and type(opts) ~= 'table' then
+        error(usage)
+    end
+    -- Zero flags disable filtering: all symbols are counted.
+    local flags = 0
+    if opts and not opts.all then
+        if opts.upper then
+            flags = bit.bor(flags, U_COUNT_CLASS_UPPER_LETTER)
+        end
+        if opts.lower then
+            flags = bit.bor(flags, U_COUNT_CLASS_LOWER_LETTER)
+        end
+        if opts.digit then
+            flags = bit.bor(flags, U_COUNT_CLASS_DIGIT)
+        end
+    end
+    -- Pass the Lua string itself: FFI converts it to const char *
+    -- and it stays referenced, so GC can't free it during the call.
+    local ret = ffi.C.u_count(inp, #inp, flags)
+    if ret >= 0 then
+        return ret
+    else
+        -- Negative result encodes the invalid UTF8 position.
+        return nil, -ret
+    end
+end
+
 -- It'll automatically set string methods, too.
 local string = require('string')
 string.split      = string_split
@@ -466,3 +517,4 @@ string.lstrip      = string_lstrip
 string.rstrip      = string_rstrip
 string.u_upper    = string_u_upper
 string.u_lower    = string_u_lower
+string.u_count    = string_u_count
diff --git a/src/util.c b/src/util.c
index 9458695b9..c117dee05 100644
--- a/src/util.c
+++ b/src/util.c
@@ -40,7 +40,8 @@
 #include <time.h>
 #include <unistd.h>
 #include <limits.h>
-
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
 #include <msgpuck/msgpuck.h> /* mp_char2escape[] table */
 
 #include "say.h"
@@ -321,3 +322,48 @@ fpconv_check()
 	 */
 	assert(buf[1] == '.');
 }
+
+enum u_count_class {
+	U_COUNT_CLASS_ALL = 0, /* No filter, count every symbol. */
+	U_COUNT_CLASS_UPPER_LETTER = 1, /* Symbols with u_isupper(). */
+	U_COUNT_CLASS_LOWER_LETTER = 2, /* Symbols with u_islower(). */
+	U_COUNT_CLASS_DIGIT = 4, /* Symbols with u_isdigit(). */
+};
+
+/**
+ * Count symbols of a UTF8 string matching the given classes.
+ * @param s UTF8 string.
+ * @param bsize Binary size of @a s.
+ * @param flags Binary OR of u_count_class flags, 0 counts all.
+ * @retval >=0 Count of symbols matched one of @a flags.
+ * @retval  <0 Invalid UTF8, -retval is its 1-based symbol index.
+ */
+int
+u_count(const char *s, int bsize, uint8_t flags)
+{
+	int offset = 0;
+	int len = 0;
+	UChar32 c;
+	if (flags == 0) {
+		/* Fast path - just calculate strlen. */
+		while (offset < bsize) {
+			U8_NEXT(s, offset, bsize, c);
+			if (c == U_SENTINEL)
+				return -(len + 1);
+			++len;
+		}
+		return len;
+	}
+	/* Slow path - must check each symbol to match flags. */
+	for (int pos = 1; offset < bsize; ++pos) {
+		U8_NEXT(s, offset, bsize, c);
+		/* Error is a symbol position, not a matched count. */
+		if (c == U_SENTINEL)
+			return -pos;
+		if (((flags & U_COUNT_CLASS_UPPER_LETTER) && u_isupper(c)) ||
+		    ((flags & U_COUNT_CLASS_LOWER_LETTER) && u_islower(c)) ||
+		    ((flags & U_COUNT_CLASS_DIGIT) && u_isdigit(c)))
+			++len;
+	}
+	return len;
+}
diff --git a/test/app-tap/string.test.lua b/test/app-tap/string.test.lua
index 004e149e9..650a5982d 100755
--- a/test/app-tap/string.test.lua
+++ b/test/app-tap/string.test.lua
@@ -115,7 +115,7 @@ test:test("hex", function(test)
 end)
 
 test:test("unicode", function(test)
-    test:plan(12)
+    test:plan(24)
     local str = 'хеЛлоу вОрЛд ё Ё я Я э Э ъ Ъ hElLo WorLd 1234 i I İ 勺#☢༺'
     local upper_res = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 I I İ 勺#☢༺'
     local upper_turkish = 'ХЕЛЛОУ ВОРЛД Ё Ё Я Я Э Э Ъ Ъ HELLO WORLD 1234 İ I İ 勺#☢༺'
@@ -144,6 +144,26 @@ test:test("unicode", function(test)
     s, err = string.u_lower(str, {locale = 'not_existing locale tratatatata'})
     test:is(s, lower_res, 'incorrect locale turns into default lower')
     test:isnil(err, 'lower error is nil')
+
+    -- Test u_count.
+    test:is(string.u_count(str), 56, 'u_count works')
+    s, err = string.u_count("\xE2\x80\xE2")
+    test:is(err, 1, 'u_count checks for errors')
+    test:isnil(s, 'retval is nil on error')
+    test:is(string.u_count(''), 0, 'u_count works on empty strings')
+    s, err = pcall(string.u_count, 100)
+    test:isnt(err:find('Usage'), nil, 'usage is checked')
+    -- Test different symbol classes.
+    s, err = pcall(string.u_count, str, 1234)
+    test:isnt(err:find('Usage'), nil, 'usage checks options')
+    test:is(string.u_count(str, {all = true}), 56, 'option all')
+    test:is(string.u_count(str, {upper = true}), 13, 'option upper')
+    test:is(string.u_count(str, {lower = true}), 19, 'option lower')
+    test:is(string.u_count(str, {upper = true, lower = true}), 32,
+            'options upper and lower')
+    test:is(string.u_count(str, {digit = true}), 4, 'option digit')
+    test:is(string.u_count(str, {digit = true, upper = true}), 17,
+            'options digit and upper')
 end)
 
 test:test("strip", function(test)
-- 
2.15.1 (Apple Git-101)





More information about the Tarantool-patches mailing list