From: Ivan Koptelov <ivan.koptelov@tarantool.org> To: tarantool-patches@freelists.org Cc: alexander.turenko@tarantool.org, Ivan Koptelov <ivan.koptelov@tarantool.org> Subject: [tarantool-patches] [PATCH] Extend range of printable unicode characters Date: Tue, 2 Jul 2019 14:33:53 +0300 [thread overview] Message-ID: <20190702113353.97827-1-ivan.koptelov@tarantool.org> (raw) Before the patch, Unicode characters encoded with 4 bytes were always treated as non-printable and displayed as byte sequences (with the 'binary' tag). With the patch, the range of printable characters is extended and includes characters encoded with 4 bytes. Currently it is: (old printable range) U (ICU printable range). Corresponding changes are also made in tarantool/libyaml. Closes: #4090 --- Branch https://github.com/tarantool/tarantool/tree/sudobobo/gh-4090-valid-chars-displ-bin Issue https://github.com/tarantool/tarantool/issues/4090 src/lib/core/CMakeLists.txt | 2 +- src/lib/core/util.c | 5 ++++- test/app-tap/yaml.test.lua | 10 +++++++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/lib/core/CMakeLists.txt b/src/lib/core/CMakeLists.txt index 66e430a25..4b96d1105 100644 --- a/src/lib/core/CMakeLists.txt +++ b/src/lib/core/CMakeLists.txt @@ -40,7 +40,7 @@ add_library(core STATIC ${core_sources}) target_link_libraries(core salad small uri decNumber ${LIBEV_LIBRARIES} ${LIBEIO_LIBRARIES} ${LIBCORO_LIBRARIES} - ${MSGPUCK_LIBRARIES}) + ${MSGPUCK_LIBRARIES} ${ICU_LIBRARIES}) if (ENABLE_BACKTRACE AND NOT TARGET_OS_DARWIN) target_link_libraries(core gcc_s ${UNWIND_LIBRARIES}) diff --git a/src/lib/core/util.c b/src/lib/core/util.c index 9458695b9..4ca99e177 100644 --- a/src/lib/core/util.c +++ b/src/lib/core/util.c @@ -41,6 +41,8 @@ #include <unistd.h> #include <limits.h> +#include <unicode/utf8.h> +#include <unicode/uchar.h> #include <msgpuck/msgpuck.h> /* mp_char2escape[] table */ #include "say.h" @@ -271,7 +273,8 @@ utf8_check_printable(const char *start, size_t length) (pointer[0] ==
0xEF && !(pointer[1] == 0xBB && pointer[2] == 0xBF) && !(pointer[1] == 0xBF && - (pointer[2] == 0xBE || pointer[2] == 0xBF))) + (pointer[2] == 0xBE || pointer[2] == 0xBF))) || + (u_isprint(value)) ) ) { return 0; diff --git a/test/app-tap/yaml.test.lua b/test/app-tap/yaml.test.lua index c0eecea03..4669b6102 100755 --- a/test/app-tap/yaml.test.lua +++ b/test/app-tap/yaml.test.lua @@ -42,7 +42,7 @@ local function test_compact(test, s) end local function test_output(test, s) - test:plan(12) + test:plan(17) test:is(s.encode({true}), '---\n- true\n...\n', "encode for true") test:is(s.decode("---\nyes\n..."), true, "decode for 'yes'") test:is(s.encode({false}), '---\n- false\n...\n', "encode for false") @@ -55,6 +55,14 @@ local function test_output(test, s) "encode for binary (2) - gh-354") test:is(s.encode("\xe0\x82\x85\x00"), '--- !!binary 4IKFAA==\n...\n', "encode for binary (3) - gh-1302") + -- gh-4090: some printable unicode characters displayed as byte sequences. + -- The following tests ensures that various 4-byte encoded unicode characters + -- displayed as expected. + test:is(s.encode("\xF0\x9F\x86\x98"), '--- 🆘\n...\n', "encode - gh-4090 (1)") + test:is(s.encode("\xF0\x9F\x84\xBD"), '--- 🄽\n...\n', "encode - gh-4090 (2)") + test:is(s.encode("\xF0\x9F\x85\xA9"), '--- 🅩\n...\n', "encode - gh-4090 (3)") + test:is(s.encode("\xF0\x9F\x87\xA6"), '--- 🇦\n...\n', "encode - gh-4090 (4)") + test:is(s.encode("\xF0\x9F\x88\xB2"), '--- 🈲\n...\n', "encode - gh-4090 (5)") -- gh-883: console can hang tarantool process local t = {} for i=0x8000,0xffff,1 do -- 2.20.1
next reply other threads:[~2019-07-02 11:34 UTC|newest] Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top 2019-07-02 11:33 Ivan Koptelov [this message] -- strict thread matches above, loose matches on Subject: below -- 2019-07-18 8:39 Serge Petrenko 2019-07-19 12:04 ` [tarantool-patches] " Kirill Yukhin 2019-07-16 13:43 Serge Petrenko 2019-07-16 18:31 ` [tarantool-patches] " Konstantin Osipov 2019-07-17 9:00 ` Serge Petrenko 2019-07-18 4:50 ` Kirill Yukhin 2019-07-02 11:32 Ivan Koptelov
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20190702113353.97827-1-ivan.koptelov@tarantool.org \ --to=ivan.koptelov@tarantool.org \ --cc=alexander.turenko@tarantool.org \ --cc=tarantool-patches@freelists.org \ --subject='Re: [tarantool-patches] [PATCH] Extend range of printable unicode characters' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox; see the mirroring instructions for how to clone and mirror all data and code used for this inbox.