Tarantool development patches archive
 help / color / mirror / Atom feed
* [PATCH] Extend range of printable unicode characters
@ 2019-07-16 13:43 Serge Petrenko
  2019-07-16 18:31 ` [tarantool-patches] " Konstantin Osipov
  2019-07-18  4:50 ` Kirill Yukhin
  0 siblings, 2 replies; 7+ messages in thread
From: Serge Petrenko @ 2019-07-16 13:43 UTC (permalink / raw)
  To: vdavydov.dev; +Cc: tarantool-patches, alexander.turenko, Serge Petrenko

Before the patch, the IS_PRINTABLE macro was used
to determine whether a given character is printable.
This macro did not take into account characters
encoded with 4 bytes.
After the patch, IS_PRINTABLE is replaced with a new
corresponding function. Now the range of printable
characters is: (libyaml old range) U (icu range). This
new range includes characters encoded with 4 bytes.

Related to tarantool/tarantool#4090
---
https://github.com/tarantool/libyaml/tree/tarantool-gh-4090-fix
https://github.com/tarantool/tarantool/issues/4090

The patch was initially submitted by SudoBobo (Ivan Koptelov).
The only change I made is removing the now-unused IS_PRINTABLE macro.

 .gitignore             |  1 -
 CMakeLists.txt         |  6 ++++
 cmake/FindICU.cmake    | 66 ++++++++++++++++++++++++++++++++++++++++++
 src/emitter.c          | 47 ++++++++++++++++++++++++++++--
 src/yaml_private.h     | 20 -------------
 tests/run-all-tests.sh |  5 ++--
 6 files changed, 120 insertions(+), 25 deletions(-)
 create mode 100644 cmake/FindICU.cmake

diff --git a/.gitignore b/.gitignore
index ec3700d..d18fdfd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
 *.BAK
 *.a
-*.cmake
 *.dll
 *.exe
 *.la
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e20a494..2cc8ccf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,6 +77,12 @@ target_include_directories(yaml PUBLIC
   $<INSTALL_INTERFACE:${INSTALL_INCLUDE_DIR}>
   )
 
+# emitter.c calls ICU's u_isprint(); locate ICU via the project's cmake/FindICU.cmake.
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+find_package(ICU REQUIRED)
+target_include_directories(yaml PRIVATE ${ICU_INCLUDE_DIRS})
+target_link_libraries(yaml  ${ICU_LIBRARIES})
+
 #
 # Install rules
 #
diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake
new file mode 100644
index 0000000..3e36cf5
--- /dev/null
+++ b/cmake/FindICU.cmake
@@ -0,0 +1,66 @@
+# - Find ICU header and library
+# The module defines the following variables:
+#
+#  ICU_FOUND - true if ICU was found
+#  ICU_INCLUDE_DIRS - the directory of the ICU headers
+#  ICU_LIBRARIES - the ICU libraries needed for linking
+#
+# Inputs: ICU_ROOT (optional installation prefix to search) and
+# BUILD_STATIC (look for the static .a archives instead).
+#
+
+if(DEFINED ICU_ROOT)
+    # NO_CMAKE_PATH: CMake cache paths such as CMAKE_PREFIX_PATH are
+    # otherwise searched before HINTS and could shadow ICU_ROOT.
+    set(ICU_FIND_OPTS NO_CMAKE_PATH NO_CMAKE_SYSTEM_PATH)
+    set(ICU_FIND_LIBRARY_HINTS "${ICU_ROOT}/lib")
+    set(ICU_FIND_PATH_HINTS "${ICU_ROOT}/include")
+else()
+    set(ICU_FIND_OPTS)
+    set(ICU_FIND_LIBRARY_HINTS)
+    set(ICU_FIND_PATH_HINTS)
+endif()
+
+find_path(ICU_INCLUDE_DIR
+    unicode/ucol.h
+    HINTS ${ICU_FIND_PATH_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+if(BUILD_STATIC)
+    set(ICU_I18N_LIB_NAME libicui18n.a)
+    set(ICU_UC_LIB_NAME libicuuc.a)
+    set(ICU_DATA_LIB_NAME libicudata.a)
+else()
+    set(ICU_I18N_LIB_NAME icui18n)
+    set(ICU_UC_LIB_NAME icuuc)
+    set(ICU_DATA_LIB_NAME icudata)
+endif()
+
+find_library(ICU_LIBRARY_I18N NAMES ${ICU_I18N_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+find_library(ICU_LIBRARY_UC NAMES ${ICU_UC_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+find_library(ICU_LIBRARY_DATA NAMES ${ICU_DATA_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ICU
+    REQUIRED_VARS ICU_INCLUDE_DIR ICU_LIBRARY_I18N ICU_LIBRARY_UC)
+set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
+set(ICU_LIBRARIES ${ICU_LIBRARY_I18N} ${ICU_LIBRARY_UC})
+
+# The data library is not a required variable, so add it only when it was
+# found; otherwise ICU_LIBRARIES would carry ICU_LIBRARY_DATA-NOTFOUND.
+if(ICU_LIBRARY_DATA)
+    list(APPEND ICU_LIBRARIES ${ICU_LIBRARY_DATA})
+endif()
+# Only cache entries can be marked as advanced.
+mark_as_advanced(ICU_INCLUDE_DIR
+        ICU_LIBRARY_I18N ICU_LIBRARY_UC ICU_LIBRARY_DATA)
diff --git a/src/emitter.c b/src/emitter.c
index 1400df1..14e3551 100644
--- a/src/emitter.c
+++ b/src/emitter.c
@@ -1,6 +1,9 @@
 
 #include "yaml_private.h"
 
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+
 /*
  * Flush the buffer if needed.
  */
@@ -86,6 +89,9 @@ static int
 yaml_emitter_increase_indent(yaml_emitter_t *emitter,
         int flow, int indentless);
 
+static inline int
+yaml_emitter_is_printable(yaml_string_t string);
+
 /*
  * State functions.
  */
@@ -416,6 +422,43 @@ yaml_emitter_increase_indent(yaml_emitter_t *emitter,
     return 1;
 }
 
+/*
+ * Check if the UTF-8 encoded character at string.pointer can be printed
+ * unescaped (former IS_PRINTABLE ranges, or accepted by ICU's u_isprint()).
+ */
+
+static inline int
+yaml_emitter_is_printable(yaml_string_t string)
+{
+    unsigned char octet = string.pointer[0];
+    unsigned int width;
+    unsigned int value;
+    unsigned int k;     /* C89: no declaration inside for(). */
+
+    width = (octet & 0x80) == 0x00 ? 1 :
+            (octet & 0xE0) == 0xC0 ? 2 :
+            (octet & 0xF0) == 0xE0 ? 3 :
+            (octet & 0xF8) == 0xF0 ? 4 : 0;
+    value = (octet & 0x80) == 0x00 ? octet & 0x7F :
+            (octet & 0xE0) == 0xC0 ? octet & 0x1F :
+            (octet & 0xF0) == 0xE0 ? octet & 0x0F :
+            (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
+    for (k = 1; k < width; k ++) {
+        value = (value << 6) + (string.pointer[k] & 0x3F);
+    }
+    return (((string).pointer[0] == 0x0A)
+            || ((string).pointer[0] >= 0x20 && (string).pointer[0] <= 0x7E)
+            || ((string).pointer[0] == 0xC2 && (string).pointer[1] >= 0xA0)
+            || ((string).pointer[0] > 0xC2 && (string).pointer[0] < 0xED)
+            || ((string).pointer[0] == 0xED && (string).pointer[1] < 0xA0)
+            || ((string).pointer[0] == 0xEE)
+            || ((string).pointer[0] == 0xEF
+                && !((string).pointer[1] == 0xBB && (string).pointer[2] == 0xBF)
+                && !((string).pointer[1] == 0xBF
+                     && ((string).pointer[2] == 0xBE || (string).pointer[2] == 0xBF)))
+            || u_isprint(value));
+}
+
 /*
  * State dispatcher.
  */
@@ -1569,7 +1612,7 @@ yaml_emitter_analyze_scalar(yaml_emitter_t *emitter,
             }
         }
 
-        if (!IS_PRINTABLE(string)
+        if (!yaml_emitter_is_printable(string)
                 || (!IS_ASCII(string) && !emitter->unicode)) {
             special_characters = 1;
         }
@@ -2027,7 +2070,7 @@ yaml_emitter_write_double_quoted_scalar(yaml_emitter_t *emitter,
 
     while (string.pointer != string.end)
     {
-        if (!IS_PRINTABLE(string) || (!emitter->unicode && !IS_ASCII(string))
+        if (!yaml_emitter_is_printable(string) || (!emitter->unicode && !IS_ASCII(string))
                 || IS_BOM(string) || IS_BREAK(string)
                 || CHECK(string, '"') || CHECK(string, '\\'))
         {
diff --git a/src/yaml_private.h b/src/yaml_private.h
index eb72207..437ee36 100644
--- a/src/yaml_private.h
+++ b/src/yaml_private.h
@@ -258,26 +258,6 @@ yaml_string_join(
  * Check if the character can be printed unescaped.
  */
 
-#define IS_PRINTABLE_AT(string,offset)                                          \
-    (((string).pointer[offset] == 0x0A)         /* . == #x0A */                 \
-     || ((string).pointer[offset] >= 0x20       /* #x20 <= . <= #x7E */         \
-         && (string).pointer[offset] <= 0x7E)                                   \
-     || ((string).pointer[offset] == 0xC2       /* #0xA0 <= . <= #xD7FF */      \
-         && (string).pointer[offset+1] >= 0xA0)                                 \
-     || ((string).pointer[offset] > 0xC2                                        \
-         && (string).pointer[offset] < 0xED)                                    \
-     || ((string).pointer[offset] == 0xED                                       \
-         && (string).pointer[offset+1] < 0xA0)                                  \
-     || ((string).pointer[offset] == 0xEE)                                      \
-     || ((string).pointer[offset] == 0xEF      /* #xE000 <= . <= #xFFFD */      \
-         && !((string).pointer[offset+1] == 0xBB        /* && . != #xFEFF */    \
-             && (string).pointer[offset+2] == 0xBF)                             \
-         && !((string).pointer[offset+1] == 0xBF                                \
-             && ((string).pointer[offset+2] == 0xBE                             \
-                 || (string).pointer[offset+2] == 0xBF))))
-
-#define IS_PRINTABLE(string)    IS_PRINTABLE_AT((string),0)
-
 /*
  * Check if the character at the specified position is NUL.
  */
diff --git a/tests/run-all-tests.sh b/tests/run-all-tests.sh
index 9c92741..fee18d5 100755
--- a/tests/run-all-tests.sh
+++ b/tests/run-all-tests.sh
@@ -5,14 +5,15 @@ set -e
 main() {
   # Autoconf based in-source build and tests
   clean
-
+  export LDFLAGS="-L/usr/local/opt/icu4c/lib -licuuc"
+  export CPPFLAGS="-I/usr/local/opt/icu4c/include"
   ./bootstrap
   ./configure
   make test-all
 
   # CMake based in-source build and tests
   clean
-
+  export CMAKE_PREFIX_PATH=/usr/local/opt/icu4c
   cmake .
   make
   make test
-- 
2.20.1 (Apple Git-117)

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2019-07-18 11:38 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-16 13:43 [PATCH] Extend range of printable unicode characters Serge Petrenko
2019-07-16 18:31 ` [tarantool-patches] " Konstantin Osipov
2019-07-17  9:00   ` Serge Petrenko
2019-07-18  4:50 ` Kirill Yukhin
2019-07-18  9:49   ` [tarantool-patches] " Konstantin Osipov
2019-07-18 11:16     ` Kirill Yukhin
2019-07-18 11:38       ` Konstantin Osipov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox