From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id 0F31C206E0 for ; Tue, 2 Jul 2019 07:32:49 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id JNEKQLoMRbdN for ; Tue, 2 Jul 2019 07:32:48 -0400 (EDT) Received: from smtp53.i.mail.ru (smtp53.i.mail.ru [94.100.177.113]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 6C58820452 for ; Tue, 2 Jul 2019 07:32:48 -0400 (EDT) From: Ivan Koptelov Subject: [tarantool-patches] [PATCH] Extend range of printable unicode characters Date: Tue, 2 Jul 2019 14:32:08 +0300 Message-Id: <20190702113208.97769-1-ivan.koptelov@tarantool.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-Help: List-Unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-Subscribe: List-Owner: List-post: List-Archive: To: tarantool-patches@freelists.org Cc: alexander.turenko@tarantool.org, Ivan Koptelov Before the patch, the IS_PRINTABLE macro was used to determine whether a given character is printable. This macro did not take into account characters encoded with 4 bytes. After the patch, IS_PRINTABLE is replaced with a new corresponding function. Now the range of printable characters is: (libyaml old range) U (icu range). This new range includes characters encoded with 4 bytes. 
Related to tarantool/tarantool #4090 --- Issue https://github.com/tarantool/tarantool/issues/4090 Branch https://github.com/tarantool/libyaml/tree/sudobob/tarantool-gh-4090-fix .gitignore | 1 - CMakeLists.txt | 6 ++++ cmake/FindICU.cmake | 66 ++++++++++++++++++++++++++++++++++++++++++ src/emitter.c | 47 ++++++++++++++++++++++++++++-- tests/run-all-tests.sh | 5 ++-- 5 files changed, 120 insertions(+), 5 deletions(-) create mode 100644 cmake/FindICU.cmake diff --git a/.gitignore b/.gitignore index ec3700d..d18fdfd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ *.BAK *.a -*.cmake *.dll *.exe *.la diff --git a/CMakeLists.txt b/CMakeLists.txt index e20a494..2cc8ccf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,6 +77,12 @@ target_include_directories(yaml PUBLIC $ ) + +include(cmake/FindICU.cmake) +find_package(ICU) +target_include_directories(yaml PRIVATE ${ICU_INCLUDE_DIRS}) +target_link_libraries(yaml ${ICU_LIBRARIES}) + # # Install rules # diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake new file mode 100644 index 0000000..3e36cf5 --- /dev/null +++ b/cmake/FindICU.cmake @@ -0,0 +1,66 @@ +# - Find ICU header and library +# The module defines the following variables: +# +# ICU_FOUND - true if ICU was found +# ICU_INCLUDE_DIRS - the directory of the ICU headers +# ICU_LIBRARIES - the ICU libraries needed for linking +# + +if(DEFINED ICU_ROOT) + set(ICU_FIND_OPTS NO_CMAKE NO_CMAKE_SYSTEM_PATH) + set(ICU_FIND_LIBRARY_HINTS "${ICU_ROOT}/lib") + set(ICU_FIND_PATH_HINTS "${ICU_ROOT}/include") +else() + set(ICU_FIND_OPTS) + set(ICU_FIND_LIBRARY_HINTS) + set(ICU_FIND_PATH_HINTS) +endif() + +find_path(ICU_INCLUDE_DIR + unicode/ucol.h + HINTS ${ICU_FIND_PATH_HINTS} + ${ICU_FIND_OPTS} +) + +if(BUILD_STATIC) + set(ICU_I18N_LIB_NAME libicui18n.a) + set(ICU_UC_LIB_NAME libicuuc.a) + set(ICU_DATA_LIB_NAME libicudata.a) +else() + set(ICU_I18N_LIB_NAME icui18n) + set(ICU_UC_LIB_NAME icuuc) + set(ICU_DATA_LIB_NAME icudata) +endif() + 
+find_library(ICU_LIBRARY_I18N NAMES ${ICU_I18N_LIB_NAME} + HINTS ${ICU_FIND_LIBRARY_HINTS} + ${ICU_FIND_OPTS} +) +find_library(ICU_LIBRARY_UC NAMES ${ICU_UC_LIB_NAME} + HINTS ${ICU_FIND_LIBRARY_HINTS} + ${ICU_FIND_OPTS} +) + +find_library(ICU_LIBRARY_DATA NAMES ${ICU_DATA_LIB_NAME} + HINTS ${ICU_FIND_LIBRARY_HINTS} + ${ICU_FIND_OPTS} +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(ICU + REQUIRED_VARS ICU_INCLUDE_DIR ICU_LIBRARY_I18N ICU_LIBRARY_UC) +set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR}) +set(ICU_LIBRARIES ${ICU_LIBRARY_I18N} ${ICU_LIBRARY_UC} ${ICU_LIBRARY_DATA}) +mark_as_advanced(ICU_INCLUDE_DIR ICU_INCLUDE_DIRS + ICU_LIBRARY_I18N ICU_LIBRARY_UC ICU_LIBRARIES) + +# +# Check presence of ucol_strcollUTF8 function from ICU +# +set(CMAKE_REQUIRED_LIBRARIES ${ICU_LIBRARIES}) +set(CMAKE_REQUIRED_INCLUDES ${ICU_INCLUDE_DIRS}) +set(CMAKE_REQUIRED_FLAGS "-std=c++11") +set(CMAKE_REQUIRED_DEFINITIONS "") +set(CMAKE_REQUIRED_LIBRARIES "") +set(CMAKE_REQUIRED_INCLUDES "") +set(CMAKE_REQUIRED_FLAGS "") diff --git a/src/emitter.c b/src/emitter.c index 1400df1..14e3551 100644 --- a/src/emitter.c +++ b/src/emitter.c @@ -1,6 +1,9 @@ #include "yaml_private.h" +#include +#include + /* * Flush the buffer if needed. */ @@ -86,6 +89,9 @@ static int yaml_emitter_increase_indent(yaml_emitter_t *emitter, int flow, int indentless); +static inline int +yaml_emitter_is_printable(yaml_string_t string); + /* * State functions. */ @@ -416,6 +422,43 @@ yaml_emitter_increase_indent(yaml_emitter_t *emitter, return 1; } +/* + * Checks if given utf-8 encoded code point represent printable character. + */ + +static inline int +yaml_emitter_is_printable(yaml_string_t string) +{ + unsigned char octet; + unsigned int width; + unsigned int value; + + octet = string.pointer[0]; + width = (octet & 0x80) == 0x00 ? 1 : + (octet & 0xE0) == 0xC0 ? 2 : + (octet & 0xF0) == 0xE0 ? 3 : + (octet & 0xF8) == 0xF0 ? 4 : 0; + value = (octet & 0x80) == 0x00 ? 
octet & 0x7F : + (octet & 0xE0) == 0xC0 ? octet & 0x1F : + (octet & 0xF0) == 0xE0 ? octet & 0x0F : + (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0; + for (int k = 1; k < (int)width; k ++) { + octet = string.pointer[k]; + value = (value << 6) + (octet & 0x3F); + } + return (((string).pointer[0] == 0x0A) + || ((string).pointer[0] >= 0x20 && (string).pointer[0] <= 0x7E) + || ((string).pointer[0] == 0xC2 && (string).pointer[1] >= 0xA0) + || ((string).pointer[0] > 0xC2 && (string).pointer[0] < 0xED) + || ((string).pointer[0] == 0xED && (string).pointer[1] < 0xA0) + || ((string).pointer[0] == 0xEE) + || ((string).pointer[0] == 0xEF + && !((string).pointer[1] == 0xBB && (string).pointer[2] == 0xBF) + && !((string).pointer[1] == 0xBF + && ((string).pointer[2] == 0xBE || (string).pointer[2] == 0xBF))) + || u_isprint(value)); +} + /* * State dispatcher. */ @@ -1569,7 +1612,7 @@ yaml_emitter_analyze_scalar(yaml_emitter_t *emitter, } } - if (!IS_PRINTABLE(string) + if (!yaml_emitter_is_printable(string) || (!IS_ASCII(string) && !emitter->unicode)) { special_characters = 1; } @@ -2027,7 +2070,7 @@ yaml_emitter_write_double_quoted_scalar(yaml_emitter_t *emitter, while (string.pointer != string.end) { - if (!IS_PRINTABLE(string) || (!emitter->unicode && !IS_ASCII(string)) + if (!yaml_emitter_is_printable(string) || (!emitter->unicode && !IS_ASCII(string)) || IS_BOM(string) || IS_BREAK(string) || CHECK(string, '"') || CHECK(string, '\\')) { diff --git a/tests/run-all-tests.sh b/tests/run-all-tests.sh index 9c92741..fee18d5 100755 --- a/tests/run-all-tests.sh +++ b/tests/run-all-tests.sh @@ -5,14 +5,15 @@ set -e main() { # Autoconf based in-source build and tests clean - + export LDFLAGS="-L/usr/local/opt/icu4c/lib -licuuc" + export CPPFLAGS="-I/usr/local/opt/icu4c/include" ./bootstrap ./configure make test-all # CMake based in-source build and tests clean - + export CMAKE_PREFIX_PATH=/usr/local/opt/icu4c cmake . make make test -- 2.20.1