[tarantool-patches] [PATCH] Extend range of printable unicode characters
Ivan Koptelov
ivan.koptelov at tarantool.org
Tue Jul 2 14:32:08 MSK 2019
Before the patch, the IS_PRINTABLE macro was used
to determine whether a given character is printable.
This macro did not take into account characters
encoded with 4 bytes.
After the patch, IS_PRINTABLE is replaced with a new
corresponding function. Now the range of printable
characters is: (libyaml old range) U (icu range). This
new range includes characters encoded with 4 bytes.
Related to tarantool/tarantool #4090
---
Issue https://github.com/tarantool/tarantool/issues/4090
Branch https://github.com/tarantool/libyaml/tree/sudobob/tarantool-gh-4090-fix
.gitignore | 1 -
CMakeLists.txt | 6 ++++
cmake/FindICU.cmake | 66 ++++++++++++++++++++++++++++++++++++++++++
src/emitter.c | 47 ++++++++++++++++++++++++++++--
tests/run-all-tests.sh | 5 ++--
5 files changed, 120 insertions(+), 5 deletions(-)
create mode 100644 cmake/FindICU.cmake
diff --git a/.gitignore b/.gitignore
index ec3700d..d18fdfd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
*.BAK
*.a
-*.cmake
*.dll
*.exe
*.la
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e20a494..2cc8ccf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,6 +77,12 @@ target_include_directories(yaml PUBLIC
$<INSTALL_INTERFACE:${INSTALL_INCLUDE_DIR}>
)
+# ICU is required by src/emitter.c; look it up via cmake/FindICU.cmake.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+find_package(ICU REQUIRED)
+target_include_directories(yaml PRIVATE ${ICU_INCLUDE_DIRS})
+target_link_libraries(yaml ${ICU_LIBRARIES})
+
#
# Install rules
#
diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake
new file mode 100644
index 0000000..3e36cf5
--- /dev/null
+++ b/cmake/FindICU.cmake
@@ -0,0 +1,66 @@
+# - Find ICU header and library
+# The module defines the following variables:
+#
+#  ICU_FOUND - true if ICU was found
+#  ICU_INCLUDE_DIRS - the directory of the ICU headers
+#  ICU_LIBRARIES - the ICU libraries needed for linking
+#
+# Inputs read by the module:
+#
+#  ICU_ROOT - optional prefix; its include/ and lib/ are used as hints
+#  BUILD_STATIC - when true, look for the static lib*.a archives
+#
+
+if(DEFINED ICU_ROOT)
+    # Skip the CMake cache/system search paths so the ICU_ROOT hints win.
+    set(ICU_FIND_OPTS NO_CMAKE_PATH NO_CMAKE_SYSTEM_PATH)
+    set(ICU_FIND_LIBRARY_HINTS "${ICU_ROOT}/lib")
+    set(ICU_FIND_PATH_HINTS "${ICU_ROOT}/include")
+else()
+    set(ICU_FIND_OPTS)
+    set(ICU_FIND_LIBRARY_HINTS)
+    set(ICU_FIND_PATH_HINTS)
+endif()
+
+find_path(ICU_INCLUDE_DIR
+    unicode/ucol.h
+    HINTS ${ICU_FIND_PATH_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+if(BUILD_STATIC)
+    set(ICU_I18N_LIB_NAME libicui18n.a)
+    set(ICU_UC_LIB_NAME libicuuc.a)
+    set(ICU_DATA_LIB_NAME libicudata.a)
+else()
+    set(ICU_I18N_LIB_NAME icui18n)
+    set(ICU_UC_LIB_NAME icuuc)
+    set(ICU_DATA_LIB_NAME icudata)
+endif()
+
+find_library(ICU_LIBRARY_I18N NAMES ${ICU_I18N_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+find_library(ICU_LIBRARY_UC NAMES ${ICU_UC_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+find_library(ICU_LIBRARY_DATA NAMES ${ICU_DATA_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ICU
+    REQUIRED_VARS ICU_INCLUDE_DIR ICU_LIBRARY_I18N ICU_LIBRARY_UC)
+set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
+set(ICU_LIBRARIES ${ICU_LIBRARY_I18N} ${ICU_LIBRARY_UC})
+# The data library is not a required variable: append it only when it was
+# located, so a NOTFOUND placeholder never ends up in ICU_LIBRARIES.
+if(ICU_LIBRARY_DATA)
+    list(APPEND ICU_LIBRARIES ${ICU_LIBRARY_DATA})
+endif()
+mark_as_advanced(ICU_INCLUDE_DIR ICU_INCLUDE_DIRS
+    ICU_LIBRARY_I18N ICU_LIBRARY_UC ICU_LIBRARY_DATA ICU_LIBRARIES)
diff --git a/src/emitter.c b/src/emitter.c
index 1400df1..14e3551 100644
--- a/src/emitter.c
+++ b/src/emitter.c
@@ -1,6 +1,9 @@
#include "yaml_private.h"
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+
/*
* Flush the buffer if needed.
*/
@@ -86,6 +89,9 @@ static int
yaml_emitter_increase_indent(yaml_emitter_t *emitter,
int flow, int indentless);
+static inline int
+yaml_emitter_is_printable(yaml_string_t string);
+
/*
* State functions.
*/
@@ -416,6 +422,43 @@ yaml_emitter_increase_indent(yaml_emitter_t *emitter,
return 1;
}
+/*
+ * Check whether the UTF-8 encoded character at string.pointer is printable.
+ * A character is printable if it lies in the range libyaml accepted before
+ * or if ICU's u_isprint() accepts its code point.  Returns 0 for an invalid
+ * leading octet or for a sequence that would run past string.end.
+ */
+
+static inline int
+yaml_emitter_is_printable(yaml_string_t string)
+{
+    /* Payload bits of the leading octet, indexed by the sequence width. */
+    static const unsigned char lead_mask[] = { 0x00, 0x7F, 0x1F, 0x0F, 0x07 };
+    const unsigned char *p = string.pointer;
+    unsigned int width, value, k;
+
+    width = (p[0] & 0x80) == 0x00 ? 1 :
+            (p[0] & 0xE0) == 0xC0 ? 2 :
+            (p[0] & 0xF0) == 0xE0 ? 3 :
+            (p[0] & 0xF8) == 0xF0 ? 4 : 0;
+    /* Assumes string.end is one past the last octet, as the callers' loops do. */
+    if (width == 0 || string.end - string.pointer < (int)width)
+        return 0;
+
+    value = p[0] & lead_mask[width];
+    for (k = 1; k < width; k ++)
+        value = (value << 6) + (p[k] & 0x3F);
+    return (p[0] == 0x0A
+            || (p[0] >= 0x20 && p[0] <= 0x7E)
+            || (p[0] == 0xC2 && p[1] >= 0xA0)
+            || (p[0] > 0xC2 && p[0] < 0xED)
+            || (p[0] == 0xED && p[1] < 0xA0)
+            || (p[0] == 0xEE)
+            || (p[0] == 0xEF
+                && !(p[1] == 0xBB && p[2] == 0xBF)
+                && !(p[1] == 0xBF && (p[2] == 0xBE || p[2] == 0xBF)))
+            || u_isprint(value));
+}
/*
* State dispatcher.
*/
@@ -1569,7 +1612,7 @@ yaml_emitter_analyze_scalar(yaml_emitter_t *emitter,
}
}
- if (!IS_PRINTABLE(string)
+ if (!yaml_emitter_is_printable(string)
|| (!IS_ASCII(string) && !emitter->unicode)) {
special_characters = 1;
}
@@ -2027,7 +2070,7 @@ yaml_emitter_write_double_quoted_scalar(yaml_emitter_t *emitter,
while (string.pointer != string.end)
{
- if (!IS_PRINTABLE(string) || (!emitter->unicode && !IS_ASCII(string))
+ if (!yaml_emitter_is_printable(string) || (!emitter->unicode && !IS_ASCII(string))
|| IS_BOM(string) || IS_BREAK(string)
|| CHECK(string, '"') || CHECK(string, '\\'))
{
diff --git a/tests/run-all-tests.sh b/tests/run-all-tests.sh
index 9c92741..fee18d5 100755
--- a/tests/run-all-tests.sh
+++ b/tests/run-all-tests.sh
@@ -5,14 +5,15 @@ set -e
main() {
# Autoconf based in-source build and tests
clean
-
+ export LDFLAGS="-L/usr/local/opt/icu4c/lib -licuuc"
+ export CPPFLAGS="-I/usr/local/opt/icu4c/include"
./bootstrap
./configure
make test-all
# CMake based in-source build and tests
clean
-
+ export CMAKE_PREFIX_PATH=/usr/local/opt/icu4c
cmake .
make
make test
--
2.20.1
More information about the Tarantool-patches
mailing list