From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: From: Serge Petrenko Subject: [PATCH] Extend range of printable unicode characters Date: Tue, 16 Jul 2019 16:43:31 +0300 Message-Id: <20190716134331.15327-1-sergepetrenko@tarantool.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit To: vdavydov.dev@gmail.com Cc: tarantool-patches@freelists.org, alexander.turenko@tarantool.org, Serge Petrenko List-ID: Before the patch, the IS_PRINTABLE macro was used to determine whether a given character is printable. This macro did not take into account characters encoded with 4 bytes. After the patch, IS_PRINTABLE is replaced with a new corresponding function, yaml_emitter_is_printable(). Now the range of printable characters is: (libyaml old range) U (icu range). This new range includes characters encoded with 4 bytes. Related to tarantool/tarantool#4090 --- https://github.com/tarantool/libyaml/tree/tarantool-gh-4090-fix https://github.com/tarantool/tarantool/issues/4090 The patch was initially submitted by SudoBobo (Ivan Koptelov). The only change I made was to remove the now unused IS_PRINTABLE macro. 
.gitignore | 1 - CMakeLists.txt | 6 ++++ cmake/FindICU.cmake | 66 ++++++++++++++++++++++++++++++++++++++++++ src/emitter.c | 47 ++++++++++++++++++++++++++++-- src/yaml_private.h | 20 ------------- tests/run-all-tests.sh | 5 ++-- 6 files changed, 120 insertions(+), 25 deletions(-) create mode 100644 cmake/FindICU.cmake diff --git a/.gitignore b/.gitignore index ec3700d..d18fdfd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ *.BAK *.a -*.cmake *.dll *.exe *.la diff --git a/CMakeLists.txt b/CMakeLists.txt index e20a494..2cc8ccf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -77,6 +77,12 @@ target_include_directories(yaml PUBLIC $ ) + +include(cmake/FindICU.cmake) +find_package(ICU) +target_include_directories(yaml PRIVATE ${ICU_INCLUDE_DIRS}) +target_link_libraries(yaml ${ICU_LIBRARIES}) + # # Install rules # diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake new file mode 100644 index 0000000..3e36cf5 --- /dev/null +++ b/cmake/FindICU.cmake @@ -0,0 +1,66 @@ +# - Find ICU header and library +# The module defines the following variables: +# +# ICU_FOUND - true if ICU was found +# ICU_INCLUDE_DIRS - the directory of the ICU headers +# ICU_LIBRARIES - the ICU libraries needed for linking +# + +if(DEFINED ICU_ROOT) + set(ICU_FIND_OPTS NO_CMAKE NO_CMAKE_SYSTEM_PATH) + set(ICU_FIND_LIBRARY_HINTS "${ICU_ROOT}/lib") + set(ICU_FIND_PATH_HINTS "${ICU_ROOT}/include") +else() + set(ICU_FIND_OPTS) + set(ICU_FIND_LIBRARY_HINTS) + set(ICU_FIND_PATH_HINTS) +endif() + +find_path(ICU_INCLUDE_DIR + unicode/ucol.h + HINTS ${ICU_FIND_PATH_HINTS} + ${ICU_FIND_OPTS} +) + +if(BUILD_STATIC) + set(ICU_I18N_LIB_NAME libicui18n.a) + set(ICU_UC_LIB_NAME libicuuc.a) + set(ICU_DATA_LIB_NAME libicudata.a) +else() + set(ICU_I18N_LIB_NAME icui18n) + set(ICU_UC_LIB_NAME icuuc) + set(ICU_DATA_LIB_NAME icudata) +endif() + +find_library(ICU_LIBRARY_I18N NAMES ${ICU_I18N_LIB_NAME} + HINTS ${ICU_FIND_LIBRARY_HINTS} + ${ICU_FIND_OPTS} +) +find_library(ICU_LIBRARY_UC NAMES 
${ICU_UC_LIB_NAME} + HINTS ${ICU_FIND_LIBRARY_HINTS} + ${ICU_FIND_OPTS} +) + +find_library(ICU_LIBRARY_DATA NAMES ${ICU_DATA_LIB_NAME} + HINTS ${ICU_FIND_LIBRARY_HINTS} + ${ICU_FIND_OPTS} +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(ICU + REQUIRED_VARS ICU_INCLUDE_DIR ICU_LIBRARY_I18N ICU_LIBRARY_UC) +set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR}) +set(ICU_LIBRARIES ${ICU_LIBRARY_I18N} ${ICU_LIBRARY_UC} ${ICU_LIBRARY_DATA}) +mark_as_advanced(ICU_INCLUDE_DIR ICU_INCLUDE_DIRS + ICU_LIBRARY_I18N ICU_LIBRARY_UC ICU_LIBRARIES) + +# +# Check presence of ucol_strcollUTF8 function from ICU +# +set(CMAKE_REQUIRED_LIBRARIES ${ICU_LIBRARIES}) +set(CMAKE_REQUIRED_INCLUDES ${ICU_INCLUDE_DIRS}) +set(CMAKE_REQUIRED_FLAGS "-std=c++11") +set(CMAKE_REQUIRED_DEFINITIONS "") +set(CMAKE_REQUIRED_LIBRARIES "") +set(CMAKE_REQUIRED_INCLUDES "") +set(CMAKE_REQUIRED_FLAGS "") diff --git a/src/emitter.c b/src/emitter.c index 1400df1..14e3551 100644 --- a/src/emitter.c +++ b/src/emitter.c @@ -1,6 +1,9 @@ #include "yaml_private.h" +#include +#include + /* * Flush the buffer if needed. */ @@ -86,6 +89,9 @@ static int yaml_emitter_increase_indent(yaml_emitter_t *emitter, int flow, int indentless); +static inline int +yaml_emitter_is_printable(yaml_string_t string); + /* * State functions. */ @@ -416,6 +422,43 @@ yaml_emitter_increase_indent(yaml_emitter_t *emitter, return 1; } +/* + * Checks if given utf-8 encoded code point represent printable character. + */ + +static inline int +yaml_emitter_is_printable(yaml_string_t string) +{ + unsigned char octet; + unsigned int width; + unsigned int value; + + octet = string.pointer[0]; + width = (octet & 0x80) == 0x00 ? 1 : + (octet & 0xE0) == 0xC0 ? 2 : + (octet & 0xF0) == 0xE0 ? 3 : + (octet & 0xF8) == 0xF0 ? 4 : 0; + value = (octet & 0x80) == 0x00 ? octet & 0x7F : + (octet & 0xE0) == 0xC0 ? octet & 0x1F : + (octet & 0xF0) == 0xE0 ? octet & 0x0F : + (octet & 0xF8) == 0xF0 ? 
octet & 0x07 : 0; + for (int k = 1; k < (int)width; k ++) { + octet = string.pointer[k]; + value = (value << 6) + (octet & 0x3F); + } + return (((string).pointer[0] == 0x0A) + || ((string).pointer[0] >= 0x20 && (string).pointer[0] <= 0x7E) + || ((string).pointer[0] == 0xC2 && (string).pointer[1] >= 0xA0) + || ((string).pointer[0] > 0xC2 && (string).pointer[0] < 0xED) + || ((string).pointer[0] == 0xED && (string).pointer[1] < 0xA0) + || ((string).pointer[0] == 0xEE) + || ((string).pointer[0] == 0xEF + && !((string).pointer[1] == 0xBB && (string).pointer[2] == 0xBF) + && !((string).pointer[1] == 0xBF + && ((string).pointer[2] == 0xBE || (string).pointer[2] == 0xBF))) + || u_isprint(value)); +} + /* * State dispatcher. */ @@ -1569,7 +1612,7 @@ yaml_emitter_analyze_scalar(yaml_emitter_t *emitter, } } - if (!IS_PRINTABLE(string) + if (!yaml_emitter_is_printable(string) || (!IS_ASCII(string) && !emitter->unicode)) { special_characters = 1; } @@ -2027,7 +2070,7 @@ yaml_emitter_write_double_quoted_scalar(yaml_emitter_t *emitter, while (string.pointer != string.end) { - if (!IS_PRINTABLE(string) || (!emitter->unicode && !IS_ASCII(string)) + if (!yaml_emitter_is_printable(string) || (!emitter->unicode && !IS_ASCII(string)) || IS_BOM(string) || IS_BREAK(string) || CHECK(string, '"') || CHECK(string, '\\')) { diff --git a/src/yaml_private.h b/src/yaml_private.h index eb72207..437ee36 100644 --- a/src/yaml_private.h +++ b/src/yaml_private.h @@ -258,26 +258,6 @@ yaml_string_join( * Check if the character can be printed unescaped. */ -#define IS_PRINTABLE_AT(string,offset) \ - (((string).pointer[offset] == 0x0A) /* . == #x0A */ \ - || ((string).pointer[offset] >= 0x20 /* #x20 <= . <= #x7E */ \ - && (string).pointer[offset] <= 0x7E) \ - || ((string).pointer[offset] == 0xC2 /* #0xA0 <= . 
<= #xD7FF */ \ - && (string).pointer[offset+1] >= 0xA0) \ - || ((string).pointer[offset] > 0xC2 \ - && (string).pointer[offset] < 0xED) \ - || ((string).pointer[offset] == 0xED \ - && (string).pointer[offset+1] < 0xA0) \ - || ((string).pointer[offset] == 0xEE) \ - || ((string).pointer[offset] == 0xEF /* #xE000 <= . <= #xFFFD */ \ - && !((string).pointer[offset+1] == 0xBB /* && . != #xFEFF */ \ - && (string).pointer[offset+2] == 0xBF) \ - && !((string).pointer[offset+1] == 0xBF \ - && ((string).pointer[offset+2] == 0xBE \ - || (string).pointer[offset+2] == 0xBF)))) - -#define IS_PRINTABLE(string) IS_PRINTABLE_AT((string),0) - /* * Check if the character at the specified position is NUL. */ diff --git a/tests/run-all-tests.sh b/tests/run-all-tests.sh index 9c92741..fee18d5 100755 --- a/tests/run-all-tests.sh +++ b/tests/run-all-tests.sh @@ -5,14 +5,15 @@ set -e main() { # Autoconf based in-source build and tests clean - + export LDFLAGS="-L/usr/local/opt/icu4c/lib -licuuc" + export CPPFLAGS="-I/usr/local/opt/icu4c/include" ./bootstrap ./configure make test-all # CMake based in-source build and tests clean - + export CMAKE_PREFIX_PATH=/usr/local/opt/icu4c cmake . make make test -- 2.20.1 (Apple Git-117)