Tarantool development patches archive
 help / color / mirror / Atom feed
* [PATCH] Extend range of printable unicode characters
@ 2019-07-16 13:43 Serge Petrenko
  2019-07-16 18:31 ` [tarantool-patches] " Konstantin Osipov
  2019-07-18  4:50 ` Kirill Yukhin
  0 siblings, 2 replies; 10+ messages in thread
From: Serge Petrenko @ 2019-07-16 13:43 UTC (permalink / raw)
  To: vdavydov.dev; +Cc: tarantool-patches, alexander.turenko, Serge Petrenko

Before the patch IS_PRINTABLE macros was used
to determine if given character is printable or not.
This macros did not take into account characters
encoded with 4 bytes.
After the patch IS_PRINTABLE is replaced with new
corresponding function. Now the range of printable
characters is: (libyaml old range) U (icu range). This
new range include characters encoded with 4 bytes.

Related to tarantool/tarantool#4090
---
https://github.com/tarantool/libyaml/tree/tarantool-gh-4090-fix
https://github.com/tarantool/tarantool/issues/4090

The patch was initially submitted by SudoBobo (Ivan Koptelov)
The only change I made is remove the now unused IS_PRINTABLE macro.

 .gitignore             |  1 -
 CMakeLists.txt         |  6 ++++
 cmake/FindICU.cmake    | 66 ++++++++++++++++++++++++++++++++++++++++++
 src/emitter.c          | 47 ++++++++++++++++++++++++++++--
 src/yaml_private.h     | 20 -------------
 tests/run-all-tests.sh |  5 ++--
 6 files changed, 120 insertions(+), 25 deletions(-)
 create mode 100644 cmake/FindICU.cmake

diff --git a/.gitignore b/.gitignore
index ec3700d..d18fdfd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
 *.BAK
 *.a
-*.cmake
 *.dll
 *.exe
 *.la
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e20a494..2cc8ccf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,6 +77,12 @@ target_include_directories(yaml PUBLIC
   $<INSTALL_INTERFACE:${INSTALL_INCLUDE_DIR}>
   )
 
+
+include(cmake/FindICU.cmake)
+find_package(ICU)
+target_include_directories(yaml PRIVATE ${ICU_INCLUDE_DIRS})
+target_link_libraries(yaml  ${ICU_LIBRARIES})
+
 #
 # Install rules
 #
diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake
new file mode 100644
index 0000000..3e36cf5
--- /dev/null
+++ b/cmake/FindICU.cmake
@@ -0,0 +1,66 @@
+# - Find ICU header and library
+# The module defines the following variables:
+#
+#  ICU_FOUND - true if ICU was found
+#  ICU_INCLUDE_DIRS - the directory of the ICU headers
+#  ICU_LIBRARIES - the ICU libraries needed for linking
+#
+
+if(DEFINED ICU_ROOT)
+    set(ICU_FIND_OPTS NO_CMAKE NO_CMAKE_SYSTEM_PATH)
+    set(ICU_FIND_LIBRARY_HINTS "${ICU_ROOT}/lib")
+    set(ICU_FIND_PATH_HINTS "${ICU_ROOT}/include")
+else()
+    set(ICU_FIND_OPTS)
+    set(ICU_FIND_LIBRARY_HINTS)
+    set(ICU_FIND_PATH_HINTS)
+endif()
+
+find_path(ICU_INCLUDE_DIR
+    unicode/ucol.h
+    HINTS ${ICU_FIND_PATH_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+if(BUILD_STATIC)
+    set(ICU_I18N_LIB_NAME libicui18n.a)
+    set(ICU_UC_LIB_NAME libicuuc.a)
+    set(ICU_DATA_LIB_NAME libicudata.a)
+else()
+    set(ICU_I18N_LIB_NAME icui18n)
+    set(ICU_UC_LIB_NAME icuuc)
+    set(ICU_DATA_LIB_NAME icudata)
+endif()
+
+find_library(ICU_LIBRARY_I18N NAMES ${ICU_I18N_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+find_library(ICU_LIBRARY_UC NAMES ${ICU_UC_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+find_library(ICU_LIBRARY_DATA NAMES ${ICU_DATA_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ICU
+    REQUIRED_VARS ICU_INCLUDE_DIR ICU_LIBRARY_I18N ICU_LIBRARY_UC)
+set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
+set(ICU_LIBRARIES ${ICU_LIBRARY_I18N} ${ICU_LIBRARY_UC} ${ICU_LIBRARY_DATA})
+mark_as_advanced(ICU_INCLUDE_DIR ICU_INCLUDE_DIRS
+        ICU_LIBRARY_I18N ICU_LIBRARY_UC ICU_LIBRARIES)
+
+#
+# Check presence of ucol_strcollUTF8 function from ICU
+#
+set(CMAKE_REQUIRED_LIBRARIES ${ICU_LIBRARIES})
+set(CMAKE_REQUIRED_INCLUDES ${ICU_INCLUDE_DIRS})
+set(CMAKE_REQUIRED_FLAGS "-std=c++11")
+set(CMAKE_REQUIRED_DEFINITIONS "")
+set(CMAKE_REQUIRED_LIBRARIES "")
+set(CMAKE_REQUIRED_INCLUDES "")
+set(CMAKE_REQUIRED_FLAGS "")
diff --git a/src/emitter.c b/src/emitter.c
index 1400df1..14e3551 100644
--- a/src/emitter.c
+++ b/src/emitter.c
@@ -1,6 +1,9 @@
 
 #include "yaml_private.h"
 
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+
 /*
  * Flush the buffer if needed.
  */
@@ -86,6 +89,9 @@ static int
 yaml_emitter_increase_indent(yaml_emitter_t *emitter,
         int flow, int indentless);
 
+static inline int
+yaml_emitter_is_printable(yaml_string_t string);
+
 /*
  * State functions.
  */
@@ -416,6 +422,43 @@ yaml_emitter_increase_indent(yaml_emitter_t *emitter,
     return 1;
 }
 
+/*
+ * Checks if given utf-8 encoded code point represent printable character.
+ */
+
+static inline int
+yaml_emitter_is_printable(yaml_string_t string)
+{
+    unsigned char octet;
+    unsigned int width;
+    unsigned int value;
+
+    octet = string.pointer[0];
+    width = (octet & 0x80) == 0x00 ? 1 :
+            (octet & 0xE0) == 0xC0 ? 2 :
+            (octet & 0xF0) == 0xE0 ? 3 :
+            (octet & 0xF8) == 0xF0 ? 4 : 0;
+    value = (octet & 0x80) == 0x00 ? octet & 0x7F :
+            (octet & 0xE0) == 0xC0 ? octet & 0x1F :
+            (octet & 0xF0) == 0xE0 ? octet & 0x0F :
+            (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
+    for (int k = 1; k < (int)width; k ++) {
+        octet = string.pointer[k];
+        value = (value << 6) + (octet & 0x3F);
+    }
+    return (((string).pointer[0] == 0x0A)
+            || ((string).pointer[0] >= 0x20 && (string).pointer[0] <= 0x7E)
+            || ((string).pointer[0] == 0xC2 && (string).pointer[1] >= 0xA0)
+            || ((string).pointer[0] > 0xC2 && (string).pointer[0] < 0xED)
+            || ((string).pointer[0] == 0xED && (string).pointer[1] < 0xA0)
+            || ((string).pointer[0] == 0xEE)
+            || ((string).pointer[0] == 0xEF
+                && !((string).pointer[1] == 0xBB && (string).pointer[2] == 0xBF)
+                && !((string).pointer[1] == 0xBF
+                     && ((string).pointer[2] == 0xBE || (string).pointer[2] == 0xBF)))
+            || u_isprint(value));
+}
+
 /*
  * State dispatcher.
  */
@@ -1569,7 +1612,7 @@ yaml_emitter_analyze_scalar(yaml_emitter_t *emitter,
             }
         }
 
-        if (!IS_PRINTABLE(string)
+        if (!yaml_emitter_is_printable(string)
                 || (!IS_ASCII(string) && !emitter->unicode)) {
             special_characters = 1;
         }
@@ -2027,7 +2070,7 @@ yaml_emitter_write_double_quoted_scalar(yaml_emitter_t *emitter,
 
     while (string.pointer != string.end)
     {
-        if (!IS_PRINTABLE(string) || (!emitter->unicode && !IS_ASCII(string))
+        if (!yaml_emitter_is_printable(string) || (!emitter->unicode && !IS_ASCII(string))
                 || IS_BOM(string) || IS_BREAK(string)
                 || CHECK(string, '"') || CHECK(string, '\\'))
         {
diff --git a/src/yaml_private.h b/src/yaml_private.h
index eb72207..437ee36 100644
--- a/src/yaml_private.h
+++ b/src/yaml_private.h
@@ -258,26 +258,6 @@ yaml_string_join(
  * Check if the character can be printed unescaped.
  */
 
-#define IS_PRINTABLE_AT(string,offset)                                          \
-    (((string).pointer[offset] == 0x0A)         /* . == #x0A */                 \
-     || ((string).pointer[offset] >= 0x20       /* #x20 <= . <= #x7E */         \
-         && (string).pointer[offset] <= 0x7E)                                   \
-     || ((string).pointer[offset] == 0xC2       /* #0xA0 <= . <= #xD7FF */      \
-         && (string).pointer[offset+1] >= 0xA0)                                 \
-     || ((string).pointer[offset] > 0xC2                                        \
-         && (string).pointer[offset] < 0xED)                                    \
-     || ((string).pointer[offset] == 0xED                                       \
-         && (string).pointer[offset+1] < 0xA0)                                  \
-     || ((string).pointer[offset] == 0xEE)                                      \
-     || ((string).pointer[offset] == 0xEF      /* #xE000 <= . <= #xFFFD */      \
-         && !((string).pointer[offset+1] == 0xBB        /* && . != #xFEFF */    \
-             && (string).pointer[offset+2] == 0xBF)                             \
-         && !((string).pointer[offset+1] == 0xBF                                \
-             && ((string).pointer[offset+2] == 0xBE                             \
-                 || (string).pointer[offset+2] == 0xBF))))
-
-#define IS_PRINTABLE(string)    IS_PRINTABLE_AT((string),0)
-
 /*
  * Check if the character at the specified position is NUL.
  */
diff --git a/tests/run-all-tests.sh b/tests/run-all-tests.sh
index 9c92741..fee18d5 100755
--- a/tests/run-all-tests.sh
+++ b/tests/run-all-tests.sh
@@ -5,14 +5,15 @@ set -e
 main() {
   # Autoconf based in-source build and tests
   clean
-
+  export LDFLAGS="-L/usr/local/opt/icu4c/lib -licuuc"
+  export CPPFLAGS="-I/usr/local/opt/icu4c/include"
   ./bootstrap
   ./configure
   make test-all
 
   # CMake based in-source build and tests
   clean
-
+  export CMAKE_PREFIX_PATH=/usr/local/opt/icu4c
   cmake .
   make
   make test
-- 
2.20.1 (Apple Git-117)

^ permalink raw reply	[flat|nested] 10+ messages in thread
* [PATCH] Extend range of printable unicode characters
@ 2019-07-18  8:39 Serge Petrenko
  2019-07-19 12:04 ` [tarantool-patches] " Kirill Yukhin
  0 siblings, 1 reply; 10+ messages in thread
From: Serge Petrenko @ 2019-07-18  8:39 UTC (permalink / raw)
  To: vdavydov.dev; +Cc: tarantool-patches, alexander.turenko, Serge Petrenko

Before the patch unicode characters encoded with 4 bytes
were always treated as non-printable and displayed as byte
sequences (with 'binary' tag).
With the patch, range of printable characters is extended and
include characters encoded with 4 bytes.
Currently it is: (old printable range) U (icu printable range).
Corresponding changes are also made in tarantool/libyaml.

Closes: #4090
---
https://github.com/tarantool/tarantool/tree/gh-4090-unicode-displ-bin
https://github.com/tarantool/tarantool/issues/4090

The patch was prepared by @SudoBobo (Ivan Koptelov).
The only change I made is rebase it on top of current master,
which includes the corresponding libyaml update.

 src/lib/core/CMakeLists.txt |  2 +-
 src/lib/core/util.c         |  5 ++++-
 test/app-tap/yaml.test.lua  | 10 +++++++++-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/lib/core/CMakeLists.txt b/src/lib/core/CMakeLists.txt
index 66e430a25..4b96d1105 100644
--- a/src/lib/core/CMakeLists.txt
+++ b/src/lib/core/CMakeLists.txt
@@ -40,7 +40,7 @@ add_library(core STATIC ${core_sources})
 
 target_link_libraries(core salad small uri decNumber ${LIBEV_LIBRARIES}
                       ${LIBEIO_LIBRARIES} ${LIBCORO_LIBRARIES}
-                      ${MSGPUCK_LIBRARIES})
+                      ${MSGPUCK_LIBRARIES} ${ICU_LIBRARIES})
 
 if (ENABLE_BACKTRACE AND NOT TARGET_OS_DARWIN)
     target_link_libraries(core gcc_s ${UNWIND_LIBRARIES})
diff --git a/src/lib/core/util.c b/src/lib/core/util.c
index 9458695b9..4ca99e177 100644
--- a/src/lib/core/util.c
+++ b/src/lib/core/util.c
@@ -41,6 +41,8 @@
 #include <unistd.h>
 #include <limits.h>
 
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
 #include <msgpuck/msgpuck.h> /* mp_char2escape[] table */
 
 #include "say.h"
@@ -271,7 +273,8 @@ utf8_check_printable(const char *start, size_t length)
 		      (pointer[0] == 0xEF &&
 		       !(pointer[1] == 0xBB && pointer[2] == 0xBF) &&
 		       !(pointer[1] == 0xBF &&
-			 (pointer[2] == 0xBE || pointer[2] == 0xBF)))
+			 (pointer[2] == 0xBE || pointer[2] == 0xBF))) ||
+		      (u_isprint(value))
 		      )
 		    ) {
 			return 0;
diff --git a/test/app-tap/yaml.test.lua b/test/app-tap/yaml.test.lua
index c0eecea03..4669b6102 100755
--- a/test/app-tap/yaml.test.lua
+++ b/test/app-tap/yaml.test.lua
@@ -42,7 +42,7 @@ local function test_compact(test, s)
 end
 
 local function test_output(test, s)
-    test:plan(12)
+    test:plan(17)
     test:is(s.encode({true}), '---\n- true\n...\n', "encode for true")
     test:is(s.decode("---\nyes\n..."), true, "decode for 'yes'")
     test:is(s.encode({false}), '---\n- false\n...\n', "encode for false")
@@ -55,6 +55,14 @@ local function test_output(test, s)
         "encode for binary (2) - gh-354")
     test:is(s.encode("\xe0\x82\x85\x00"), '--- !!binary 4IKFAA==\n...\n',
         "encode for binary (3) - gh-1302")
+    -- gh-4090: some printable unicode characters displayed as byte sequences.
+    -- The following tests ensures that various 4-byte encoded unicode characters
+    -- displayed as expected.
+    test:is(s.encode("\xF0\x9F\x86\x98"), '--- 🆘\n...\n', "encode - gh-4090 (1)")
+    test:is(s.encode("\xF0\x9F\x84\xBD"), '--- 🄽\n...\n', "encode - gh-4090 (2)")
+    test:is(s.encode("\xF0\x9F\x85\xA9"), '--- 🅩\n...\n', "encode - gh-4090 (3)")
+    test:is(s.encode("\xF0\x9F\x87\xA6"), '--- 🇦\n...\n', "encode - gh-4090 (4)")
+    test:is(s.encode("\xF0\x9F\x88\xB2"), '--- 🈲\n...\n', "encode - gh-4090 (5)")
     -- gh-883: console can hang tarantool process
     local t = {}
     for i=0x8000,0xffff,1 do
-- 
2.20.1 (Apple Git-117)

^ permalink raw reply	[flat|nested] 10+ messages in thread
* [tarantool-patches] [PATCH] Extend range of printable unicode characters
@ 2019-07-02 11:33 Ivan Koptelov
  0 siblings, 0 replies; 10+ messages in thread
From: Ivan Koptelov @ 2019-07-02 11:33 UTC (permalink / raw)
  To: tarantool-patches; +Cc: alexander.turenko, Ivan Koptelov

Before the patch unicode characters encoded with 4 bytes
were always treated as non-printable and displayed as byte
sequences (with 'binary' tag).
With the patch, range of printable characters is extended and
include characters encoded with 4 bytes.
Currently it is: (old printable range) U (icu printable range).
Corresponding changes are also made in tarantool/libyaml.

Closes: #4090
---
Branch https://github.com/tarantool/tarantool/tree/sudobobo/gh-4090-valid-chars-displ-bin
Issue https://github.com/tarantool/tarantool/issues/4090

 src/lib/core/CMakeLists.txt |  2 +-
 src/lib/core/util.c         |  5 ++++-
 test/app-tap/yaml.test.lua  | 10 +++++++++-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/lib/core/CMakeLists.txt b/src/lib/core/CMakeLists.txt
index 66e430a25..4b96d1105 100644
--- a/src/lib/core/CMakeLists.txt
+++ b/src/lib/core/CMakeLists.txt
@@ -40,7 +40,7 @@ add_library(core STATIC ${core_sources})
 
 target_link_libraries(core salad small uri decNumber ${LIBEV_LIBRARIES}
                       ${LIBEIO_LIBRARIES} ${LIBCORO_LIBRARIES}
-                      ${MSGPUCK_LIBRARIES})
+                      ${MSGPUCK_LIBRARIES} ${ICU_LIBRARIES})
 
 if (ENABLE_BACKTRACE AND NOT TARGET_OS_DARWIN)
     target_link_libraries(core gcc_s ${UNWIND_LIBRARIES})
diff --git a/src/lib/core/util.c b/src/lib/core/util.c
index 9458695b9..4ca99e177 100644
--- a/src/lib/core/util.c
+++ b/src/lib/core/util.c
@@ -41,6 +41,8 @@
 #include <unistd.h>
 #include <limits.h>
 
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
 #include <msgpuck/msgpuck.h> /* mp_char2escape[] table */
 
 #include "say.h"
@@ -271,7 +273,8 @@ utf8_check_printable(const char *start, size_t length)
 		      (pointer[0] == 0xEF &&
 		       !(pointer[1] == 0xBB && pointer[2] == 0xBF) &&
 		       !(pointer[1] == 0xBF &&
-			 (pointer[2] == 0xBE || pointer[2] == 0xBF)))
+			 (pointer[2] == 0xBE || pointer[2] == 0xBF))) ||
+		      (u_isprint(value))
 		      )
 		    ) {
 			return 0;
diff --git a/test/app-tap/yaml.test.lua b/test/app-tap/yaml.test.lua
index c0eecea03..4669b6102 100755
--- a/test/app-tap/yaml.test.lua
+++ b/test/app-tap/yaml.test.lua
@@ -42,7 +42,7 @@ local function test_compact(test, s)
 end
 
 local function test_output(test, s)
-    test:plan(12)
+    test:plan(17)
     test:is(s.encode({true}), '---\n- true\n...\n', "encode for true")
     test:is(s.decode("---\nyes\n..."), true, "decode for 'yes'")
     test:is(s.encode({false}), '---\n- false\n...\n', "encode for false")
@@ -55,6 +55,14 @@ local function test_output(test, s)
         "encode for binary (2) - gh-354")
     test:is(s.encode("\xe0\x82\x85\x00"), '--- !!binary 4IKFAA==\n...\n',
         "encode for binary (3) - gh-1302")
+    -- gh-4090: some printable unicode characters displayed as byte sequences.
+    -- The following tests ensures that various 4-byte encoded unicode characters
+    -- displayed as expected.
+    test:is(s.encode("\xF0\x9F\x86\x98"), '--- 🆘\n...\n', "encode - gh-4090 (1)")
+    test:is(s.encode("\xF0\x9F\x84\xBD"), '--- 🄽\n...\n', "encode - gh-4090 (2)")
+    test:is(s.encode("\xF0\x9F\x85\xA9"), '--- 🅩\n...\n', "encode - gh-4090 (3)")
+    test:is(s.encode("\xF0\x9F\x87\xA6"), '--- 🇦\n...\n', "encode - gh-4090 (4)")
+    test:is(s.encode("\xF0\x9F\x88\xB2"), '--- 🈲\n...\n', "encode - gh-4090 (5)")
     -- gh-883: console can hang tarantool process
     local t = {}
     for i=0x8000,0xffff,1 do
-- 
2.20.1

^ permalink raw reply	[flat|nested] 10+ messages in thread
* [tarantool-patches] [PATCH] Extend range of printable unicode characters
@ 2019-07-02 11:32 Ivan Koptelov
  0 siblings, 0 replies; 10+ messages in thread
From: Ivan Koptelov @ 2019-07-02 11:32 UTC (permalink / raw)
  To: tarantool-patches; +Cc: alexander.turenko, Ivan Koptelov

Before the patch IS_PRINTABLE macros was used
to determine if given character is printable or not.
This macros did not take into account characters
encoded with 4 bytes.
After the patch IS_PRINTABLE is removed with new
corresponding function. Now the range of printable
characters is: (libyaml old range) U (icu range). This
new range include characters encoded with 4 bytes.

Related to tarantool/tarantool #4090
---
Issue https://github.com/tarantool/tarantool/issues/4090
Branch https://github.com/tarantool/libyaml/tree/sudobob/tarantool-gh-4090-fix

 .gitignore             |  1 -
 CMakeLists.txt         |  6 ++++
 cmake/FindICU.cmake    | 66 ++++++++++++++++++++++++++++++++++++++++++
 src/emitter.c          | 47 ++++++++++++++++++++++++++++--
 tests/run-all-tests.sh |  5 ++--
 5 files changed, 120 insertions(+), 5 deletions(-)
 create mode 100644 cmake/FindICU.cmake

diff --git a/.gitignore b/.gitignore
index ec3700d..d18fdfd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
 *.BAK
 *.a
-*.cmake
 *.dll
 *.exe
 *.la
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e20a494..2cc8ccf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,6 +77,12 @@ target_include_directories(yaml PUBLIC
   $<INSTALL_INTERFACE:${INSTALL_INCLUDE_DIR}>
   )
 
+
+include(cmake/FindICU.cmake)
+find_package(ICU)
+target_include_directories(yaml PRIVATE ${ICU_INCLUDE_DIRS})
+target_link_libraries(yaml  ${ICU_LIBRARIES})
+
 #
 # Install rules
 #
diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake
new file mode 100644
index 0000000..3e36cf5
--- /dev/null
+++ b/cmake/FindICU.cmake
@@ -0,0 +1,66 @@
+# - Find ICU header and library
+# The module defines the following variables:
+#
+#  ICU_FOUND - true if ICU was found
+#  ICU_INCLUDE_DIRS - the directory of the ICU headers
+#  ICU_LIBRARIES - the ICU libraries needed for linking
+#
+
+if(DEFINED ICU_ROOT)
+    set(ICU_FIND_OPTS NO_CMAKE NO_CMAKE_SYSTEM_PATH)
+    set(ICU_FIND_LIBRARY_HINTS "${ICU_ROOT}/lib")
+    set(ICU_FIND_PATH_HINTS "${ICU_ROOT}/include")
+else()
+    set(ICU_FIND_OPTS)
+    set(ICU_FIND_LIBRARY_HINTS)
+    set(ICU_FIND_PATH_HINTS)
+endif()
+
+find_path(ICU_INCLUDE_DIR
+    unicode/ucol.h
+    HINTS ${ICU_FIND_PATH_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+if(BUILD_STATIC)
+    set(ICU_I18N_LIB_NAME libicui18n.a)
+    set(ICU_UC_LIB_NAME libicuuc.a)
+    set(ICU_DATA_LIB_NAME libicudata.a)
+else()
+    set(ICU_I18N_LIB_NAME icui18n)
+    set(ICU_UC_LIB_NAME icuuc)
+    set(ICU_DATA_LIB_NAME icudata)
+endif()
+
+find_library(ICU_LIBRARY_I18N NAMES ${ICU_I18N_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+find_library(ICU_LIBRARY_UC NAMES ${ICU_UC_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+find_library(ICU_LIBRARY_DATA NAMES ${ICU_DATA_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ICU
+    REQUIRED_VARS ICU_INCLUDE_DIR ICU_LIBRARY_I18N ICU_LIBRARY_UC)
+set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
+set(ICU_LIBRARIES ${ICU_LIBRARY_I18N} ${ICU_LIBRARY_UC} ${ICU_LIBRARY_DATA})
+mark_as_advanced(ICU_INCLUDE_DIR ICU_INCLUDE_DIRS
+        ICU_LIBRARY_I18N ICU_LIBRARY_UC ICU_LIBRARIES)
+
+#
+# Check presence of ucol_strcollUTF8 function from ICU
+#
+set(CMAKE_REQUIRED_LIBRARIES ${ICU_LIBRARIES})
+set(CMAKE_REQUIRED_INCLUDES ${ICU_INCLUDE_DIRS})
+set(CMAKE_REQUIRED_FLAGS "-std=c++11")
+set(CMAKE_REQUIRED_DEFINITIONS "")
+set(CMAKE_REQUIRED_LIBRARIES "")
+set(CMAKE_REQUIRED_INCLUDES "")
+set(CMAKE_REQUIRED_FLAGS "")
diff --git a/src/emitter.c b/src/emitter.c
index 1400df1..14e3551 100644
--- a/src/emitter.c
+++ b/src/emitter.c
@@ -1,6 +1,9 @@
 
 #include "yaml_private.h"
 
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+
 /*
  * Flush the buffer if needed.
  */
@@ -86,6 +89,9 @@ static int
 yaml_emitter_increase_indent(yaml_emitter_t *emitter,
         int flow, int indentless);
 
+static inline int
+yaml_emitter_is_printable(yaml_string_t string);
+
 /*
  * State functions.
  */
@@ -416,6 +422,43 @@ yaml_emitter_increase_indent(yaml_emitter_t *emitter,
     return 1;
 }
 
+/*
+ * Checks if given utf-8 encoded code point represent printable character.
+ */
+
+static inline int
+yaml_emitter_is_printable(yaml_string_t string)
+{
+    unsigned char octet;
+    unsigned int width;
+    unsigned int value;
+
+    octet = string.pointer[0];
+    width = (octet & 0x80) == 0x00 ? 1 :
+            (octet & 0xE0) == 0xC0 ? 2 :
+            (octet & 0xF0) == 0xE0 ? 3 :
+            (octet & 0xF8) == 0xF0 ? 4 : 0;
+    value = (octet & 0x80) == 0x00 ? octet & 0x7F :
+            (octet & 0xE0) == 0xC0 ? octet & 0x1F :
+            (octet & 0xF0) == 0xE0 ? octet & 0x0F :
+            (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
+    for (int k = 1; k < (int)width; k ++) {
+        octet = string.pointer[k];
+        value = (value << 6) + (octet & 0x3F);
+    }
+    return (((string).pointer[0] == 0x0A)
+            || ((string).pointer[0] >= 0x20 && (string).pointer[0] <= 0x7E)
+            || ((string).pointer[0] == 0xC2 && (string).pointer[1] >= 0xA0)
+            || ((string).pointer[0] > 0xC2 && (string).pointer[0] < 0xED)
+            || ((string).pointer[0] == 0xED && (string).pointer[1] < 0xA0)
+            || ((string).pointer[0] == 0xEE)
+            || ((string).pointer[0] == 0xEF
+                && !((string).pointer[1] == 0xBB && (string).pointer[2] == 0xBF)
+                && !((string).pointer[1] == 0xBF
+                     && ((string).pointer[2] == 0xBE || (string).pointer[2] == 0xBF)))
+            || u_isprint(value));
+}
+
 /*
  * State dispatcher.
  */
@@ -1569,7 +1612,7 @@ yaml_emitter_analyze_scalar(yaml_emitter_t *emitter,
             }
         }
 
-        if (!IS_PRINTABLE(string)
+        if (!yaml_emitter_is_printable(string)
                 || (!IS_ASCII(string) && !emitter->unicode)) {
             special_characters = 1;
         }
@@ -2027,7 +2070,7 @@ yaml_emitter_write_double_quoted_scalar(yaml_emitter_t *emitter,
 
     while (string.pointer != string.end)
     {
-        if (!IS_PRINTABLE(string) || (!emitter->unicode && !IS_ASCII(string))
+        if (!yaml_emitter_is_printable(string) || (!emitter->unicode && !IS_ASCII(string))
                 || IS_BOM(string) || IS_BREAK(string)
                 || CHECK(string, '"') || CHECK(string, '\\'))
         {
diff --git a/tests/run-all-tests.sh b/tests/run-all-tests.sh
index 9c92741..fee18d5 100755
--- a/tests/run-all-tests.sh
+++ b/tests/run-all-tests.sh
@@ -5,14 +5,15 @@ set -e
 main() {
   # Autoconf based in-source build and tests
   clean
-
+  export LDFLAGS="-L/usr/local/opt/icu4c/lib -licuuc"
+  export CPPFLAGS="-I/usr/local/opt/icu4c/include"
   ./bootstrap
   ./configure
   make test-all
 
   # CMake based in-source build and tests
   clean
-
+  export CMAKE_PREFIX_PATH=/usr/local/opt/icu4c
   cmake .
   make
   make test
-- 
2.20.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2019-07-19 12:04 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-16 13:43 [PATCH] Extend range of printable unicode characters Serge Petrenko
2019-07-16 18:31 ` [tarantool-patches] " Konstantin Osipov
2019-07-17  9:00   ` Serge Petrenko
2019-07-18  4:50 ` Kirill Yukhin
2019-07-18  9:49   ` [tarantool-patches] " Konstantin Osipov
2019-07-18 11:16     ` Kirill Yukhin
2019-07-18 11:38       ` Konstantin Osipov
  -- strict thread matches above, loose matches on Subject: below --
2019-07-18  8:39 Serge Petrenko
2019-07-19 12:04 ` [tarantool-patches] " Kirill Yukhin
2019-07-02 11:33 Ivan Koptelov
2019-07-02 11:32 Ivan Koptelov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox