[Tarantool-patches] [PATCH v1 4/4] Extend range of printable unicode characters
Alexander V. Tikhonov
avtikhon at tarantool.org
Thu Jun 18 08:36:52 MSK 2020
From: Ivan Koptelov <ivan.koptelov at tarantool.org>
Before the patch, Unicode characters encoded with 4 bytes
were always treated as non-printable and displayed as byte
sequences (with the 'binary' tag).
With the patch, the range of printable characters is extended and
includes characters encoded with 4 bytes.
Currently it is the union of the old printable range and the ICU printable range.
Corresponding changes are also made in tarantool/libyaml.
Closes: #4090
(cherry picked from commit cdf37876189fa005a350d6b69397348354bb2073)
---
src/CMakeLists.txt | 1 +
src/util.c | 5 ++++-
test/app-tap/yaml.test.lua | 10 +++++++++-
3 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a8c515134..313d04e21 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -124,6 +124,7 @@ target_link_libraries(core
${LIBEIO_LIBRARIES}
${LIBCORO_LIBRARIES}
${MSGPUCK_LIBRARIES}
+ ${ICU_LIBRARIES}
${generic_libraries})
add_library(stat STATIC rmean.c latency.c histogram.c)
diff --git a/src/util.c b/src/util.c
index efc5a64e7..ed43bda05 100644
--- a/src/util.c
+++ b/src/util.c
@@ -41,6 +41,8 @@
#include <unistd.h>
#include <limits.h>
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
#include <msgpuck/msgpuck.h> /* mp_char2escape[] table */
#include "say.h"
@@ -274,7 +276,8 @@ utf8_check_printable(const char *start, size_t length)
(pointer[0] == 0xEF &&
!(pointer[1] == 0xBB && pointer[2] == 0xBF) &&
!(pointer[1] == 0xBF &&
- (pointer[2] == 0xBE || pointer[2] == 0xBF)))
+ (pointer[2] == 0xBE || pointer[2] == 0xBF))) ||
+ (u_isprint(value))
)
) {
return 0;
diff --git a/test/app-tap/yaml.test.lua b/test/app-tap/yaml.test.lua
index c0eecea03..4669b6102 100755
--- a/test/app-tap/yaml.test.lua
+++ b/test/app-tap/yaml.test.lua
@@ -42,7 +42,7 @@ local function test_compact(test, s)
end
local function test_output(test, s)
- test:plan(12)
+ test:plan(17)
test:is(s.encode({true}), '---\n- true\n...\n', "encode for true")
test:is(s.decode("---\nyes\n..."), true, "decode for 'yes'")
test:is(s.encode({false}), '---\n- false\n...\n', "encode for false")
@@ -55,6 +55,14 @@ local function test_output(test, s)
"encode for binary (2) - gh-354")
test:is(s.encode("\xe0\x82\x85\x00"), '--- !!binary 4IKFAA==\n...\n',
"encode for binary (3) - gh-1302")
+ -- gh-4090: some printable unicode characters displayed as byte sequences.
+ -- The following tests ensure that various 4-byte encoded unicode characters
+ -- are displayed as expected.
+ test:is(s.encode("\xF0\x9F\x86\x98"), '--- 🆘\n...\n', "encode - gh-4090 (1)")
+ test:is(s.encode("\xF0\x9F\x84\xBD"), '--- 🄽\n...\n', "encode - gh-4090 (2)")
+ test:is(s.encode("\xF0\x9F\x85\xA9"), '--- 🅩\n...\n', "encode - gh-4090 (3)")
+ test:is(s.encode("\xF0\x9F\x87\xA6"), '--- 🇦\n...\n', "encode - gh-4090 (4)")
+ test:is(s.encode("\xF0\x9F\x88\xB2"), '--- 🈲\n...\n', "encode - gh-4090 (5)")
-- gh-883: console can hang tarantool process
local t = {}
for i=0x8000,0xffff,1 do
--
2.17.1
More information about the Tarantool-patches
mailing list