Tarantool development patches archive
 help / color / mirror / Atom feed
* [PATCH] Extend range of printable unicode characters
@ 2019-07-16 13:43 Serge Petrenko
  2019-07-16 18:31 ` [tarantool-patches] " Konstantin Osipov
  2019-07-18  4:50 ` Kirill Yukhin
  0 siblings, 2 replies; 10+ messages in thread
From: Serge Petrenko @ 2019-07-16 13:43 UTC (permalink / raw)
  To: vdavydov.dev; +Cc: tarantool-patches, alexander.turenko, Serge Petrenko

Before the patch IS_PRINTABLE macros was used
to determine if given character is printable or not.
This macros did not take into account characters
encoded with 4 bytes.
After the patch IS_PRINTABLE is replaced with new
corresponding function. Now the range of printable
characters is: (libyaml old range) U (icu range). This
new range include characters encoded with 4 bytes.

Related to tarantool/tarantool#4090
---
https://github.com/tarantool/libyaml/tree/tarantool-gh-4090-fix
https://github.com/tarantool/tarantool/issues/4090

The patch was initially submitted by SudoBobo (Ivan Koptelov)
The only change I made is remove the now unused IS_PRINTABLE macro.

 .gitignore             |  1 -
 CMakeLists.txt         |  6 ++++
 cmake/FindICU.cmake    | 66 ++++++++++++++++++++++++++++++++++++++++++
 src/emitter.c          | 47 ++++++++++++++++++++++++++++--
 src/yaml_private.h     | 20 -------------
 tests/run-all-tests.sh |  5 ++--
 6 files changed, 120 insertions(+), 25 deletions(-)
 create mode 100644 cmake/FindICU.cmake

diff --git a/.gitignore b/.gitignore
index ec3700d..d18fdfd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
 *.BAK
 *.a
-*.cmake
 *.dll
 *.exe
 *.la
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e20a494..2cc8ccf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,6 +77,12 @@ target_include_directories(yaml PUBLIC
   $<INSTALL_INTERFACE:${INSTALL_INCLUDE_DIR}>
   )
 
+
+include(cmake/FindICU.cmake)
+find_package(ICU)
+target_include_directories(yaml PRIVATE ${ICU_INCLUDE_DIRS})
+target_link_libraries(yaml  ${ICU_LIBRARIES})
+
 #
 # Install rules
 #
diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake
new file mode 100644
index 0000000..3e36cf5
--- /dev/null
+++ b/cmake/FindICU.cmake
@@ -0,0 +1,66 @@
+# - Find ICU header and library
+# The module defines the following variables:
+#
+#  ICU_FOUND - true if ICU was found
+#  ICU_INCLUDE_DIRS - the directory of the ICU headers
+#  ICU_LIBRARIES - the ICU libraries needed for linking
+#
+
+if(DEFINED ICU_ROOT)
+    set(ICU_FIND_OPTS NO_CMAKE NO_CMAKE_SYSTEM_PATH)
+    set(ICU_FIND_LIBRARY_HINTS "${ICU_ROOT}/lib")
+    set(ICU_FIND_PATH_HINTS "${ICU_ROOT}/include")
+else()
+    set(ICU_FIND_OPTS)
+    set(ICU_FIND_LIBRARY_HINTS)
+    set(ICU_FIND_PATH_HINTS)
+endif()
+
+find_path(ICU_INCLUDE_DIR
+    unicode/ucol.h
+    HINTS ${ICU_FIND_PATH_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+if(BUILD_STATIC)
+    set(ICU_I18N_LIB_NAME libicui18n.a)
+    set(ICU_UC_LIB_NAME libicuuc.a)
+    set(ICU_DATA_LIB_NAME libicudata.a)
+else()
+    set(ICU_I18N_LIB_NAME icui18n)
+    set(ICU_UC_LIB_NAME icuuc)
+    set(ICU_DATA_LIB_NAME icudata)
+endif()
+
+find_library(ICU_LIBRARY_I18N NAMES ${ICU_I18N_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+find_library(ICU_LIBRARY_UC NAMES ${ICU_UC_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+find_library(ICU_LIBRARY_DATA NAMES ${ICU_DATA_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ICU
+    REQUIRED_VARS ICU_INCLUDE_DIR ICU_LIBRARY_I18N ICU_LIBRARY_UC)
+set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
+set(ICU_LIBRARIES ${ICU_LIBRARY_I18N} ${ICU_LIBRARY_UC} ${ICU_LIBRARY_DATA})
+mark_as_advanced(ICU_INCLUDE_DIR ICU_INCLUDE_DIRS
+        ICU_LIBRARY_I18N ICU_LIBRARY_UC ICU_LIBRARIES)
+
+#
+# Check presence of ucol_strcollUTF8 function from ICU
+#
+set(CMAKE_REQUIRED_LIBRARIES ${ICU_LIBRARIES})
+set(CMAKE_REQUIRED_INCLUDES ${ICU_INCLUDE_DIRS})
+set(CMAKE_REQUIRED_FLAGS "-std=c++11")
+set(CMAKE_REQUIRED_DEFINITIONS "")
+set(CMAKE_REQUIRED_LIBRARIES "")
+set(CMAKE_REQUIRED_INCLUDES "")
+set(CMAKE_REQUIRED_FLAGS "")
diff --git a/src/emitter.c b/src/emitter.c
index 1400df1..14e3551 100644
--- a/src/emitter.c
+++ b/src/emitter.c
@@ -1,6 +1,9 @@
 
 #include "yaml_private.h"
 
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+
 /*
  * Flush the buffer if needed.
  */
@@ -86,6 +89,9 @@ static int
 yaml_emitter_increase_indent(yaml_emitter_t *emitter,
         int flow, int indentless);
 
+static inline int
+yaml_emitter_is_printable(yaml_string_t string);
+
 /*
  * State functions.
  */
@@ -416,6 +422,43 @@ yaml_emitter_increase_indent(yaml_emitter_t *emitter,
     return 1;
 }
 
+/*
+ * Checks if given utf-8 encoded code point represent printable character.
+ */
+
+static inline int
+yaml_emitter_is_printable(yaml_string_t string)
+{
+    unsigned char octet;
+    unsigned int width;
+    unsigned int value;
+
+    octet = string.pointer[0];
+    width = (octet & 0x80) == 0x00 ? 1 :
+            (octet & 0xE0) == 0xC0 ? 2 :
+            (octet & 0xF0) == 0xE0 ? 3 :
+            (octet & 0xF8) == 0xF0 ? 4 : 0;
+    value = (octet & 0x80) == 0x00 ? octet & 0x7F :
+            (octet & 0xE0) == 0xC0 ? octet & 0x1F :
+            (octet & 0xF0) == 0xE0 ? octet & 0x0F :
+            (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
+    for (int k = 1; k < (int)width; k ++) {
+        octet = string.pointer[k];
+        value = (value << 6) + (octet & 0x3F);
+    }
+    return (((string).pointer[0] == 0x0A)
+            || ((string).pointer[0] >= 0x20 && (string).pointer[0] <= 0x7E)
+            || ((string).pointer[0] == 0xC2 && (string).pointer[1] >= 0xA0)
+            || ((string).pointer[0] > 0xC2 && (string).pointer[0] < 0xED)
+            || ((string).pointer[0] == 0xED && (string).pointer[1] < 0xA0)
+            || ((string).pointer[0] == 0xEE)
+            || ((string).pointer[0] == 0xEF
+                && !((string).pointer[1] == 0xBB && (string).pointer[2] == 0xBF)
+                && !((string).pointer[1] == 0xBF
+                     && ((string).pointer[2] == 0xBE || (string).pointer[2] == 0xBF)))
+            || u_isprint(value));
+}
+
 /*
  * State dispatcher.
  */
@@ -1569,7 +1612,7 @@ yaml_emitter_analyze_scalar(yaml_emitter_t *emitter,
             }
         }
 
-        if (!IS_PRINTABLE(string)
+        if (!yaml_emitter_is_printable(string)
                 || (!IS_ASCII(string) && !emitter->unicode)) {
             special_characters = 1;
         }
@@ -2027,7 +2070,7 @@ yaml_emitter_write_double_quoted_scalar(yaml_emitter_t *emitter,
 
     while (string.pointer != string.end)
     {
-        if (!IS_PRINTABLE(string) || (!emitter->unicode && !IS_ASCII(string))
+        if (!yaml_emitter_is_printable(string) || (!emitter->unicode && !IS_ASCII(string))
                 || IS_BOM(string) || IS_BREAK(string)
                 || CHECK(string, '"') || CHECK(string, '\\'))
         {
diff --git a/src/yaml_private.h b/src/yaml_private.h
index eb72207..437ee36 100644
--- a/src/yaml_private.h
+++ b/src/yaml_private.h
@@ -258,26 +258,6 @@ yaml_string_join(
  * Check if the character can be printed unescaped.
  */
 
-#define IS_PRINTABLE_AT(string,offset)                                          \
-    (((string).pointer[offset] == 0x0A)         /* . == #x0A */                 \
-     || ((string).pointer[offset] >= 0x20       /* #x20 <= . <= #x7E */         \
-         && (string).pointer[offset] <= 0x7E)                                   \
-     || ((string).pointer[offset] == 0xC2       /* #0xA0 <= . <= #xD7FF */      \
-         && (string).pointer[offset+1] >= 0xA0)                                 \
-     || ((string).pointer[offset] > 0xC2                                        \
-         && (string).pointer[offset] < 0xED)                                    \
-     || ((string).pointer[offset] == 0xED                                       \
-         && (string).pointer[offset+1] < 0xA0)                                  \
-     || ((string).pointer[offset] == 0xEE)                                      \
-     || ((string).pointer[offset] == 0xEF      /* #xE000 <= . <= #xFFFD */      \
-         && !((string).pointer[offset+1] == 0xBB        /* && . != #xFEFF */    \
-             && (string).pointer[offset+2] == 0xBF)                             \
-         && !((string).pointer[offset+1] == 0xBF                                \
-             && ((string).pointer[offset+2] == 0xBE                             \
-                 || (string).pointer[offset+2] == 0xBF))))
-
-#define IS_PRINTABLE(string)    IS_PRINTABLE_AT((string),0)
-
 /*
  * Check if the character at the specified position is NUL.
  */
diff --git a/tests/run-all-tests.sh b/tests/run-all-tests.sh
index 9c92741..fee18d5 100755
--- a/tests/run-all-tests.sh
+++ b/tests/run-all-tests.sh
@@ -5,14 +5,15 @@ set -e
 main() {
   # Autoconf based in-source build and tests
   clean
-
+  export LDFLAGS="-L/usr/local/opt/icu4c/lib -licuuc"
+  export CPPFLAGS="-I/usr/local/opt/icu4c/include"
   ./bootstrap
   ./configure
   make test-all
 
   # CMake based in-source build and tests
   clean
-
+  export CMAKE_PREFIX_PATH=/usr/local/opt/icu4c
   cmake .
   make
   make test
-- 
2.20.1 (Apple Git-117)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [tarantool-patches] [PATCH] Extend range of printable unicode characters
  2019-07-16 13:43 [PATCH] Extend range of printable unicode characters Serge Petrenko
@ 2019-07-16 18:31 ` Konstantin Osipov
  2019-07-17  9:00   ` Serge Petrenko
  2019-07-18  4:50 ` Kirill Yukhin
  1 sibling, 1 reply; 10+ messages in thread
From: Konstantin Osipov @ 2019-07-16 18:31 UTC (permalink / raw)
  To: tarantool-patches; +Cc: vdavydov.dev, alexander.turenko, Serge Petrenko

* Serge Petrenko <sergepetrenko@tarantool.org> [19/07/16 16:44]:
> Before the patch IS_PRINTABLE macros was used
> to determine if given character is printable or not.
> This macros did not take into account characters
> encoded with 4 bytes.
> After the patch IS_PRINTABLE is replaced with new
> corresponding function. Now the range of printable
> characters is: (libyaml old range) U (icu range). This
> new range include characters encoded with 4 bytes.
> 

Please don't forget to upstream the patch to the maintainer once
it's reviewed.

Has there been any changes in the upstream which we should be
aware of?

On another note, since we're switching to Lua emitter as the
defualt one quite soon, we can perhaps close #4090 as won't fix
rather than bother improving libyaml.

> Related to tarantool/tarantool#4090

-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [tarantool-patches] [PATCH] Extend range of printable unicode characters
  2019-07-16 18:31 ` [tarantool-patches] " Konstantin Osipov
@ 2019-07-17  9:00   ` Serge Petrenko
  0 siblings, 0 replies; 10+ messages in thread
From: Serge Petrenko @ 2019-07-17  9:00 UTC (permalink / raw)
  To: Konstantin Osipov; +Cc: tarantool-patches, Vladimir Davydov, alexander.turenko

[-- Attachment #1: Type: text/plain, Size: 1523 bytes --]





> 16 июля 2019 г., в 21:31, Konstantin Osipov <kostja@tarantool.org> написал(а):
> 
> * Serge Petrenko <sergepetrenko@tarantool.org> [19/07/16 16:44]:
>> Before the patch IS_PRINTABLE macros was used
>> to determine if given character is printable or not.
>> This macros did not take into account characters
>> encoded with 4 bytes.
>> After the patch IS_PRINTABLE is replaced with new
>> corresponding function. Now the range of printable
>> characters is: (libyaml old range) U (icu range). This
>> new range include characters encoded with 4 bytes.
>> 
> 
> Please don't forget to upstream the patch to the maintainer once
> it's reviewed.

Ok

> 
> Has there been any changes in the upstream which we should be
> aware of?

None that I’m aware of.
Similar issues are still open in the libyaml repo.
https://github.com/yaml/libyaml/issues/110 <https://github.com/yaml/libyaml/issues/110>
The code dealing with printable unicode characters in libyaml
remains the same, it doesn’t deal with codepoints outside of
the Basic Multilingual Plane (every code point > 0xFFFF)
https://github.com/yaml/libyaml/blob/master/src/yaml_private.h#L261



> 
> On another note, since we're switching to Lua emitter as the
> defualt one quite soon, we can perhaps close #4090 as won't fix
> rather than bother improving libyaml.
> 
>> Related to tarantool/tarantool#4090
> 
> -- 
> Konstantin Osipov, Moscow, Russia

--
Serge Petrenko
sergepetrenko@tarantool.org



[-- Attachment #2: Type: text/html, Size: 3112 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [tarantool-patches] [PATCH] Extend range of printable unicode characters
  2019-07-16 13:43 [PATCH] Extend range of printable unicode characters Serge Petrenko
  2019-07-16 18:31 ` [tarantool-patches] " Konstantin Osipov
@ 2019-07-18  4:50 ` Kirill Yukhin
  2019-07-18  9:49   ` [tarantool-patches] " Konstantin Osipov
  1 sibling, 1 reply; 10+ messages in thread
From: Kirill Yukhin @ 2019-07-18  4:50 UTC (permalink / raw)
  To: tarantool-patches; +Cc: vdavydov.dev, alexander.turenko, Serge Petrenko

Hello,

On 16 Jul 16:43, Serge Petrenko wrote:
> Before the patch IS_PRINTABLE macros was used
> to determine if given character is printable or not.
> This macros did not take into account characters
> encoded with 4 bytes.
> After the patch IS_PRINTABLE is replaced with new
> corresponding function. Now the range of printable
> characters is: (libyaml old range) U (icu range). This
> new range include characters encoded with 4 bytes.
> 
> Related to tarantool/tarantool#4090
> ---
> https://github.com/tarantool/libyaml/tree/tarantool-gh-4090-fix
> https://github.com/tarantool/tarantool/issues/4090

New parsers are out of scope 2.2 release series, so I've checked
your patch into master.

Serge, please don't forget to upstream the patch.

--
Regards, Kirill Yukhin

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [tarantool-patches] Re: [PATCH] Extend range of printable unicode characters
  2019-07-18  4:50 ` Kirill Yukhin
@ 2019-07-18  9:49   ` Konstantin Osipov
  2019-07-18 11:16     ` Kirill Yukhin
  0 siblings, 1 reply; 10+ messages in thread
From: Konstantin Osipov @ 2019-07-18  9:49 UTC (permalink / raw)
  To: tarantool-patches; +Cc: vdavydov.dev, alexander.turenko, Serge Petrenko

* Kirill Yukhin <kyukhin@tarantool.org> [19/07/18 09:16]:
> Hello,
> 
> On 16 Jul 16:43, Serge Petrenko wrote:
> > Before the patch IS_PRINTABLE macros was used
> > to determine if given character is printable or not.
> > This macros did not take into account characters
> > encoded with 4 bytes.
> > After the patch IS_PRINTABLE is replaced with new
> > corresponding function. Now the range of printable
> > characters is: (libyaml old range) U (icu range). This
> > new range include characters encoded with 4 bytes.
> > 
> > Related to tarantool/tarantool#4090
> > ---
> > https://github.com/tarantool/libyaml/tree/tarantool-gh-4090-fix
> > https://github.com/tarantool/tarantool/issues/4090
> 
> New parsers are out of scope 2.2 release series, so I've checked
> your patch into master.
> 
> Serge, please don't forget to upstream the patch.

Well, how does this ticket meet 2.2 release criteria? It's not a
big deal, but why did you choose to not push the bug out of the
release instead?

Working on wrong things boils down to not working on the right things.

-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [tarantool-patches] Re: [PATCH] Extend range of printable unicode characters
  2019-07-18  9:49   ` [tarantool-patches] " Konstantin Osipov
@ 2019-07-18 11:16     ` Kirill Yukhin
  2019-07-18 11:38       ` Konstantin Osipov
  0 siblings, 1 reply; 10+ messages in thread
From: Kirill Yukhin @ 2019-07-18 11:16 UTC (permalink / raw)
  To: tarantool-patches; +Cc: vdavydov.dev, alexander.turenko, Serge Petrenko

On 18 Jul 12:49, Konstantin Osipov wrote:
> * Kirill Yukhin <kyukhin@tarantool.org> [19/07/18 09:16]:
> > Hello,
> > 
> > On 16 Jul 16:43, Serge Petrenko wrote:
> > > Before the patch IS_PRINTABLE macros was used
> > > to determine if given character is printable or not.
> > > This macros did not take into account characters
> > > encoded with 4 bytes.
> > > After the patch IS_PRINTABLE is replaced with new
> > > corresponding function. Now the range of printable
> > > characters is: (libyaml old range) U (icu range). This
> > > new range include characters encoded with 4 bytes.
> > > 
> > > Related to tarantool/tarantool#4090
> > > ---
> > > https://github.com/tarantool/libyaml/tree/tarantool-gh-4090-fix
> > > https://github.com/tarantool/tarantool/issues/4090
> > 
> > New parsers are out of scope 2.2 release series, so I've checked
> > your patch into master.
> > 
> > Serge, please don't forget to upstream the patch.
> 
> Well, how does this ticket meet 2.2 release criteria? It's not a
> big deal, but why did you choose to not push the bug out of the
> release instead?

Because printing binary garbage instead of string value looks like
a bug to me.

> -- 
> Konstantin Osipov, Moscow, Russia
> 
--
Regards, Kirill Yukhin

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [tarantool-patches] Re: [PATCH] Extend range of printable unicode characters
  2019-07-18 11:16     ` Kirill Yukhin
@ 2019-07-18 11:38       ` Konstantin Osipov
  0 siblings, 0 replies; 10+ messages in thread
From: Konstantin Osipov @ 2019-07-18 11:38 UTC (permalink / raw)
  To: tarantool-patches; +Cc: vdavydov.dev, alexander.turenko, Serge Petrenko

* Kirill Yukhin <kyukhin@tarantool.org> [19/07/18 14:17]:
> > > Serge, please don't forget to upstream the patch.
> > 
> > Well, how does this ticket meet 2.2 release criteria? It's not a
> > big deal, but why did you choose to not push the bug out of the
> > release instead?
> 
> Because printing binary garbage instead of string value looks like
> a bug to me.

It's impact 4, severity 3, so automatically priority 3.

The lowest priority possible according to SOP.

Please read the SOP again and use it its triage guidelines, or change the SOP.

-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [tarantool-patches] [PATCH] Extend range of printable unicode characters
  2019-07-18  8:39 Serge Petrenko
@ 2019-07-19 12:04 ` Kirill Yukhin
  0 siblings, 0 replies; 10+ messages in thread
From: Kirill Yukhin @ 2019-07-19 12:04 UTC (permalink / raw)
  To: tarantool-patches; +Cc: vdavydov.dev, alexander.turenko, Serge Petrenko

Hello,

On 18 Jul 11:39, Serge Petrenko wrote:
> Before the patch unicode characters encoded with 4 bytes
> were always treated as non-printable and displayed as byte
> sequences (with 'binary' tag).
> With the patch, range of printable characters is extended and
> include characters encoded with 4 bytes.
> Currently it is: (old printable range) U (icu printable range).
> Corresponding changes are also made in tarantool/libyaml.
> 
> Closes: #4090
> ---
> https://github.com/tarantool/tarantool/tree/gh-4090-unicode-displ-bin
> https://github.com/tarantool/tarantool/issues/4090

I've checked your patch into master.

--
Regards, Kirill Yukhin

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [tarantool-patches] [PATCH] Extend range of printable unicode characters
@ 2019-07-02 11:33 Ivan Koptelov
  0 siblings, 0 replies; 10+ messages in thread
From: Ivan Koptelov @ 2019-07-02 11:33 UTC (permalink / raw)
  To: tarantool-patches; +Cc: alexander.turenko, Ivan Koptelov

Before the patch unicode characters encoded with 4 bytes
were always treated as non-printable and displayed as byte
sequences (with 'binary' tag).
With the patch, range of printable characters is extended and
include characters encoded with 4 bytes.
Currently it is: (old printable range) U (icu printable range).
Corresponding changes are also made in tarantool/libyaml.

Closes: #4090
---
Branch https://github.com/tarantool/tarantool/tree/sudobobo/gh-4090-valid-chars-displ-bin
Issue https://github.com/tarantool/tarantool/issues/4090

 src/lib/core/CMakeLists.txt |  2 +-
 src/lib/core/util.c         |  5 ++++-
 test/app-tap/yaml.test.lua  | 10 +++++++++-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/lib/core/CMakeLists.txt b/src/lib/core/CMakeLists.txt
index 66e430a25..4b96d1105 100644
--- a/src/lib/core/CMakeLists.txt
+++ b/src/lib/core/CMakeLists.txt
@@ -40,7 +40,7 @@ add_library(core STATIC ${core_sources})
 
 target_link_libraries(core salad small uri decNumber ${LIBEV_LIBRARIES}
                       ${LIBEIO_LIBRARIES} ${LIBCORO_LIBRARIES}
-                      ${MSGPUCK_LIBRARIES})
+                      ${MSGPUCK_LIBRARIES} ${ICU_LIBRARIES})
 
 if (ENABLE_BACKTRACE AND NOT TARGET_OS_DARWIN)
     target_link_libraries(core gcc_s ${UNWIND_LIBRARIES})
diff --git a/src/lib/core/util.c b/src/lib/core/util.c
index 9458695b9..4ca99e177 100644
--- a/src/lib/core/util.c
+++ b/src/lib/core/util.c
@@ -41,6 +41,8 @@
 #include <unistd.h>
 #include <limits.h>
 
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
 #include <msgpuck/msgpuck.h> /* mp_char2escape[] table */
 
 #include "say.h"
@@ -271,7 +273,8 @@ utf8_check_printable(const char *start, size_t length)
 		      (pointer[0] == 0xEF &&
 		       !(pointer[1] == 0xBB && pointer[2] == 0xBF) &&
 		       !(pointer[1] == 0xBF &&
-			 (pointer[2] == 0xBE || pointer[2] == 0xBF)))
+			 (pointer[2] == 0xBE || pointer[2] == 0xBF))) ||
+		      (u_isprint(value))
 		      )
 		    ) {
 			return 0;
diff --git a/test/app-tap/yaml.test.lua b/test/app-tap/yaml.test.lua
index c0eecea03..4669b6102 100755
--- a/test/app-tap/yaml.test.lua
+++ b/test/app-tap/yaml.test.lua
@@ -42,7 +42,7 @@ local function test_compact(test, s)
 end
 
 local function test_output(test, s)
-    test:plan(12)
+    test:plan(17)
     test:is(s.encode({true}), '---\n- true\n...\n', "encode for true")
     test:is(s.decode("---\nyes\n..."), true, "decode for 'yes'")
     test:is(s.encode({false}), '---\n- false\n...\n', "encode for false")
@@ -55,6 +55,14 @@ local function test_output(test, s)
         "encode for binary (2) - gh-354")
     test:is(s.encode("\xe0\x82\x85\x00"), '--- !!binary 4IKFAA==\n...\n',
         "encode for binary (3) - gh-1302")
+    -- gh-4090: some printable unicode characters displayed as byte sequences.
+    -- The following tests ensures that various 4-byte encoded unicode characters
+    -- displayed as expected.
+    test:is(s.encode("\xF0\x9F\x86\x98"), '--- 🆘\n...\n', "encode - gh-4090 (1)")
+    test:is(s.encode("\xF0\x9F\x84\xBD"), '--- 🄽\n...\n', "encode - gh-4090 (2)")
+    test:is(s.encode("\xF0\x9F\x85\xA9"), '--- 🅩\n...\n', "encode - gh-4090 (3)")
+    test:is(s.encode("\xF0\x9F\x87\xA6"), '--- 🇦\n...\n', "encode - gh-4090 (4)")
+    test:is(s.encode("\xF0\x9F\x88\xB2"), '--- 🈲\n...\n', "encode - gh-4090 (5)")
     -- gh-883: console can hang tarantool process
     local t = {}
     for i=0x8000,0xffff,1 do
-- 
2.20.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [tarantool-patches] [PATCH] Extend range of printable unicode characters
@ 2019-07-02 11:32 Ivan Koptelov
  0 siblings, 0 replies; 10+ messages in thread
From: Ivan Koptelov @ 2019-07-02 11:32 UTC (permalink / raw)
  To: tarantool-patches; +Cc: alexander.turenko, Ivan Koptelov

Before the patch IS_PRINTABLE macros was used
to determine if given character is printable or not.
This macros did not take into account characters
encoded with 4 bytes.
After the patch IS_PRINTABLE is removed with new
corresponding function. Now the range of printable
characters is: (libyaml old range) U (icu range). This
new range include characters encoded with 4 bytes.

Related to tarantool/tarantool #4090
---
Issue https://github.com/tarantool/tarantool/issues/4090
Branch https://github.com/tarantool/libyaml/tree/sudobob/tarantool-gh-4090-fix

 .gitignore             |  1 -
 CMakeLists.txt         |  6 ++++
 cmake/FindICU.cmake    | 66 ++++++++++++++++++++++++++++++++++++++++++
 src/emitter.c          | 47 ++++++++++++++++++++++++++++--
 tests/run-all-tests.sh |  5 ++--
 5 files changed, 120 insertions(+), 5 deletions(-)
 create mode 100644 cmake/FindICU.cmake

diff --git a/.gitignore b/.gitignore
index ec3700d..d18fdfd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,5 @@
 *.BAK
 *.a
-*.cmake
 *.dll
 *.exe
 *.la
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e20a494..2cc8ccf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -77,6 +77,12 @@ target_include_directories(yaml PUBLIC
   $<INSTALL_INTERFACE:${INSTALL_INCLUDE_DIR}>
   )
 
+
+include(cmake/FindICU.cmake)
+find_package(ICU)
+target_include_directories(yaml PRIVATE ${ICU_INCLUDE_DIRS})
+target_link_libraries(yaml  ${ICU_LIBRARIES})
+
 #
 # Install rules
 #
diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake
new file mode 100644
index 0000000..3e36cf5
--- /dev/null
+++ b/cmake/FindICU.cmake
@@ -0,0 +1,66 @@
+# - Find ICU header and library
+# The module defines the following variables:
+#
+#  ICU_FOUND - true if ICU was found
+#  ICU_INCLUDE_DIRS - the directory of the ICU headers
+#  ICU_LIBRARIES - the ICU libraries needed for linking
+#
+
+if(DEFINED ICU_ROOT)
+    set(ICU_FIND_OPTS NO_CMAKE NO_CMAKE_SYSTEM_PATH)
+    set(ICU_FIND_LIBRARY_HINTS "${ICU_ROOT}/lib")
+    set(ICU_FIND_PATH_HINTS "${ICU_ROOT}/include")
+else()
+    set(ICU_FIND_OPTS)
+    set(ICU_FIND_LIBRARY_HINTS)
+    set(ICU_FIND_PATH_HINTS)
+endif()
+
+find_path(ICU_INCLUDE_DIR
+    unicode/ucol.h
+    HINTS ${ICU_FIND_PATH_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+if(BUILD_STATIC)
+    set(ICU_I18N_LIB_NAME libicui18n.a)
+    set(ICU_UC_LIB_NAME libicuuc.a)
+    set(ICU_DATA_LIB_NAME libicudata.a)
+else()
+    set(ICU_I18N_LIB_NAME icui18n)
+    set(ICU_UC_LIB_NAME icuuc)
+    set(ICU_DATA_LIB_NAME icudata)
+endif()
+
+find_library(ICU_LIBRARY_I18N NAMES ${ICU_I18N_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+find_library(ICU_LIBRARY_UC NAMES ${ICU_UC_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+find_library(ICU_LIBRARY_DATA NAMES ${ICU_DATA_LIB_NAME}
+    HINTS ${ICU_FIND_LIBRARY_HINTS}
+    ${ICU_FIND_OPTS}
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(ICU
+    REQUIRED_VARS ICU_INCLUDE_DIR ICU_LIBRARY_I18N ICU_LIBRARY_UC)
+set(ICU_INCLUDE_DIRS ${ICU_INCLUDE_DIR})
+set(ICU_LIBRARIES ${ICU_LIBRARY_I18N} ${ICU_LIBRARY_UC} ${ICU_LIBRARY_DATA})
+mark_as_advanced(ICU_INCLUDE_DIR ICU_INCLUDE_DIRS
+        ICU_LIBRARY_I18N ICU_LIBRARY_UC ICU_LIBRARIES)
+
+#
+# Check presence of ucol_strcollUTF8 function from ICU
+#
+set(CMAKE_REQUIRED_LIBRARIES ${ICU_LIBRARIES})
+set(CMAKE_REQUIRED_INCLUDES ${ICU_INCLUDE_DIRS})
+set(CMAKE_REQUIRED_FLAGS "-std=c++11")
+set(CMAKE_REQUIRED_DEFINITIONS "")
+set(CMAKE_REQUIRED_LIBRARIES "")
+set(CMAKE_REQUIRED_INCLUDES "")
+set(CMAKE_REQUIRED_FLAGS "")
diff --git a/src/emitter.c b/src/emitter.c
index 1400df1..14e3551 100644
--- a/src/emitter.c
+++ b/src/emitter.c
@@ -1,6 +1,9 @@
 
 #include "yaml_private.h"
 
+#include <unicode/utf8.h>
+#include <unicode/uchar.h>
+
 /*
  * Flush the buffer if needed.
  */
@@ -86,6 +89,9 @@ static int
 yaml_emitter_increase_indent(yaml_emitter_t *emitter,
         int flow, int indentless);
 
+static inline int
+yaml_emitter_is_printable(yaml_string_t string);
+
 /*
  * State functions.
  */
@@ -416,6 +422,43 @@ yaml_emitter_increase_indent(yaml_emitter_t *emitter,
     return 1;
 }
 
+/*
+ * Checks if given utf-8 encoded code point represent printable character.
+ */
+
+static inline int
+yaml_emitter_is_printable(yaml_string_t string)
+{
+    unsigned char octet;
+    unsigned int width;
+    unsigned int value;
+
+    octet = string.pointer[0];
+    width = (octet & 0x80) == 0x00 ? 1 :
+            (octet & 0xE0) == 0xC0 ? 2 :
+            (octet & 0xF0) == 0xE0 ? 3 :
+            (octet & 0xF8) == 0xF0 ? 4 : 0;
+    value = (octet & 0x80) == 0x00 ? octet & 0x7F :
+            (octet & 0xE0) == 0xC0 ? octet & 0x1F :
+            (octet & 0xF0) == 0xE0 ? octet & 0x0F :
+            (octet & 0xF8) == 0xF0 ? octet & 0x07 : 0;
+    for (int k = 1; k < (int)width; k ++) {
+        octet = string.pointer[k];
+        value = (value << 6) + (octet & 0x3F);
+    }
+    return (((string).pointer[0] == 0x0A)
+            || ((string).pointer[0] >= 0x20 && (string).pointer[0] <= 0x7E)
+            || ((string).pointer[0] == 0xC2 && (string).pointer[1] >= 0xA0)
+            || ((string).pointer[0] > 0xC2 && (string).pointer[0] < 0xED)
+            || ((string).pointer[0] == 0xED && (string).pointer[1] < 0xA0)
+            || ((string).pointer[0] == 0xEE)
+            || ((string).pointer[0] == 0xEF
+                && !((string).pointer[1] == 0xBB && (string).pointer[2] == 0xBF)
+                && !((string).pointer[1] == 0xBF
+                     && ((string).pointer[2] == 0xBE || (string).pointer[2] == 0xBF)))
+            || u_isprint(value));
+}
+
 /*
  * State dispatcher.
  */
@@ -1569,7 +1612,7 @@ yaml_emitter_analyze_scalar(yaml_emitter_t *emitter,
             }
         }
 
-        if (!IS_PRINTABLE(string)
+        if (!yaml_emitter_is_printable(string)
                 || (!IS_ASCII(string) && !emitter->unicode)) {
             special_characters = 1;
         }
@@ -2027,7 +2070,7 @@ yaml_emitter_write_double_quoted_scalar(yaml_emitter_t *emitter,
 
     while (string.pointer != string.end)
     {
-        if (!IS_PRINTABLE(string) || (!emitter->unicode && !IS_ASCII(string))
+        if (!yaml_emitter_is_printable(string) || (!emitter->unicode && !IS_ASCII(string))
                 || IS_BOM(string) || IS_BREAK(string)
                 || CHECK(string, '"') || CHECK(string, '\\'))
         {
diff --git a/tests/run-all-tests.sh b/tests/run-all-tests.sh
index 9c92741..fee18d5 100755
--- a/tests/run-all-tests.sh
+++ b/tests/run-all-tests.sh
@@ -5,14 +5,15 @@ set -e
 main() {
   # Autoconf based in-source build and tests
   clean
-
+  export LDFLAGS="-L/usr/local/opt/icu4c/lib -licuuc"
+  export CPPFLAGS="-I/usr/local/opt/icu4c/include"
   ./bootstrap
   ./configure
   make test-all
 
   # CMake based in-source build and tests
   clean
-
+  export CMAKE_PREFIX_PATH=/usr/local/opt/icu4c
   cmake .
   make
   make test
-- 
2.20.1

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2019-07-19 12:04 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-16 13:43 [PATCH] Extend range of printable unicode characters Serge Petrenko
2019-07-16 18:31 ` [tarantool-patches] " Konstantin Osipov
2019-07-17  9:00   ` Serge Petrenko
2019-07-18  4:50 ` Kirill Yukhin
2019-07-18  9:49   ` [tarantool-patches] " Konstantin Osipov
2019-07-18 11:16     ` Kirill Yukhin
2019-07-18 11:38       ` Konstantin Osipov
  -- strict thread matches above, loose matches on Subject: below --
2019-07-18  8:39 Serge Petrenko
2019-07-19 12:04 ` [tarantool-patches] " Kirill Yukhin
2019-07-02 11:33 Ivan Koptelov
2019-07-02 11:32 Ivan Koptelov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox