From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from smtp51.i.mail.ru (smtp51.i.mail.ru [94.100.177.111]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by dev.tarantool.org (Postfix) with ESMTPS id C98E642EF5C for ; Thu, 18 Jun 2020 08:37:17 +0300 (MSK) From: "Alexander V. Tikhonov" Date: Thu, 18 Jun 2020 08:36:52 +0300 Message-Id: <186d849b71e0ef8eb0bd0c356d716fc2833eba8f.1592458150.git.avtikhon@tarantool.org> In-Reply-To: References: MIME-Version: 1.0 In-Reply-To: References: Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 8bit Subject: [Tarantool-patches] [PATCH v1 4/4] Extend range of printable unicode characters List-Id: Tarantool development patches List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Kirill Yukhin , Sergey Bronnikov , Alexander Turenko Cc: Ivan Koptelov , tarantool-patches@dev.tarantool.org From: Ivan Koptelov Before the patch, unicode characters encoded with 4 bytes were always treated as non-printable and displayed as byte sequences (with 'binary' tag). With the patch, the range of printable characters is extended and includes characters encoded with 4 bytes. Currently it is: (old printable range) U (icu printable range). Corresponding changes are also made in tarantool/libyaml. 
Closes: #4090 (cherry picked from commit cdf37876189fa005a350d6b69397348354bb2073) --- src/CMakeLists.txt | 1 + src/util.c | 5 ++++- test/app-tap/yaml.test.lua | 10 +++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a8c515134..313d04e21 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -124,6 +124,7 @@ target_link_libraries(core ${LIBEIO_LIBRARIES} ${LIBCORO_LIBRARIES} ${MSGPUCK_LIBRARIES} + ${ICU_LIBRARIES} ${generic_libraries}) add_library(stat STATIC rmean.c latency.c histogram.c) diff --git a/src/util.c b/src/util.c index efc5a64e7..ed43bda05 100644 --- a/src/util.c +++ b/src/util.c @@ -41,6 +41,8 @@ #include #include +#include +#include #include /* mp_char2escape[] table */ #include "say.h" @@ -274,7 +276,8 @@ utf8_check_printable(const char *start, size_t length) (pointer[0] == 0xEF && !(pointer[1] == 0xBB && pointer[2] == 0xBF) && !(pointer[1] == 0xBF && - (pointer[2] == 0xBE || pointer[2] == 0xBF))) + (pointer[2] == 0xBE || pointer[2] == 0xBF))) || + (u_isprint(value)) ) ) { return 0; diff --git a/test/app-tap/yaml.test.lua b/test/app-tap/yaml.test.lua index c0eecea03..4669b6102 100755 --- a/test/app-tap/yaml.test.lua +++ b/test/app-tap/yaml.test.lua @@ -42,7 +42,7 @@ local function test_compact(test, s) end local function test_output(test, s) - test:plan(12) + test:plan(17) test:is(s.encode({true}), '---\n- true\n...\n', "encode for true") test:is(s.decode("---\nyes\n..."), true, "decode for 'yes'") test:is(s.encode({false}), '---\n- false\n...\n', "encode for false") @@ -55,6 +55,14 @@ local function test_output(test, s) "encode for binary (2) - gh-354") test:is(s.encode("\xe0\x82\x85\x00"), '--- !!binary 4IKFAA==\n...\n', "encode for binary (3) - gh-1302") + -- gh-4090: some printable unicode characters were displayed as byte sequences. + -- The following tests ensure that various 4-byte encoded unicode characters + -- are displayed as expected. 
+ test:is(s.encode("\xF0\x9F\x86\x98"), '--- 🆘\n...\n', "encode - gh-4090 (1)") + test:is(s.encode("\xF0\x9F\x84\xBD"), '--- 🄽\n...\n', "encode - gh-4090 (2)") + test:is(s.encode("\xF0\x9F\x85\xA9"), '--- 🅩\n...\n', "encode - gh-4090 (3)") + test:is(s.encode("\xF0\x9F\x87\xA6"), '--- 🇦\n...\n', "encode - gh-4090 (4)") + test:is(s.encode("\xF0\x9F\x88\xB2"), '--- 🈲\n...\n', "encode - gh-4090 (5)") -- gh-883: console can hang tarantool process local t = {} for i=0x8000,0xffff,1 do -- 2.17.1