Tarantool development patches archive
 help / color / mirror / Atom feed
* [PATCH 1/1] identifier: do not use ICU UConverter for checks
@ 2018-04-04 12:35 Vladislav Shpilevoy
  2018-04-04 12:38 ` [tarantool-patches] " Vladislav Shpilevoy
  2018-04-06 13:53 ` Vladimir Davydov
  0 siblings, 2 replies; 5+ messages in thread
From: Vladislav Shpilevoy @ 2018-04-04 12:35 UTC (permalink / raw)
  To: tarantool-patches; +Cc: vdavydov.dev, Vladislav Shpilevoy

I makes no sense to create a converter, when there is
nothing to convert. To check an identifier it is
enough to use stateless ICU macros: U8_NEXT_OR_FFFD,
that also allows to eliminate 0xFFFD symbol as a
special one - this macros returns this code on any
error, or when it is actually this symbol.
---
Branch: https://github.com/tarantool/tarantool/tree/identifier-do-not-use-uconverter

 src/box/box.cc              |  2 --
 src/box/identifier.c        | 51 ++++++++++-----------------------------------
 src/box/identifier.h        | 13 ------------
 test/unit/vy_point_lookup.c |  2 --
 4 files changed, 11 insertions(+), 57 deletions(-)

diff --git a/src/box/box.cc b/src/box/box.cc
index cb3199624..d2dfc5b5f 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -1518,7 +1518,6 @@ box_free(void)
 		gc_free();
 		engine_shutdown();
 		wal_thread_stop();
-		identifier_destroy();
 	}
 
 	fiber_cond_destroy(&ro_cond);
@@ -1724,7 +1723,6 @@ box_cfg_xc(void)
 	engine_init();
 	if (module_init() != 0)
 		diag_raise();
-	identifier_init();
 	schema_init();
 	replication_init();
 	port_init();
diff --git a/src/box/identifier.c b/src/box/identifier.c
index e73e666b7..318f914f6 100644
--- a/src/box/identifier.c
+++ b/src/box/identifier.c
@@ -33,44 +33,34 @@
 #include "say.h"
 #include "diag.h"
 
-#include <unicode/ucnv.h>
+#include <unicode/utf8.h>
 #include <unicode/uchar.h>
-/* ICU returns this character in case of unknown symbol */
-#define REPLACEMENT_CHARACTER (0xFFFD)
-
-static UConverter* utf8conv = NULL;
 
 int
 identifier_check(const char *str, size_t str_len)
 {
-	assert(utf8conv);
 	const char *end = str + str_len;
 	if (str == end)
 		goto error;
 
-	ucnv_reset(utf8conv);
-
-	while (str < end) {
-		int8_t type;
-		UErrorCode status = U_ZERO_ERROR;
-		UChar32 c = ucnv_getNextUChar(utf8conv, &str, end, &status);
-
-		if (U_FAILURE(status))
+	UChar32 c;
+	uint32_t offset = 0;
+	while (offset < str_len) {
+		U8_NEXT_OR_FFFD(str, offset, str_len, c)
+		if (c == 0xFFFD)
 			goto error;
 
-		type = u_charType(c);
+		int8_t type = u_charType(c);
 		/**
 		 * The icu library has a function named u_isprint, however,
 		 * this function does not return any errors.
 		 * Here the `c` symbol printability is determined by comparison
 		 * with unicode category types explicitly.
 		 */
-		if (c == REPLACEMENT_CHARACTER ||
-			type == U_UNASSIGNED ||
-			type == U_LINE_SEPARATOR ||
-			type == U_CONTROL_CHAR ||
-			type == U_PARAGRAPH_SEPARATOR)
-
+		if (type == U_UNASSIGNED ||
+		    type == U_LINE_SEPARATOR ||
+		    type == U_CONTROL_CHAR ||
+		    type == U_PARAGRAPH_SEPARATOR)
 			goto error;
 	}
 	return 0;
@@ -78,22 +68,3 @@ error:
 	diag_set(ClientError, ER_IDENTIFIER, tt_cstr(str, str_len));
 	return -1;
 }
-
-void
-identifier_init()
-{
-	assert(utf8conv == NULL);
-	UErrorCode status = U_ZERO_ERROR ;
-	utf8conv = ucnv_open("utf8", &status);
-	if (U_FAILURE(status))
-		panic("ICU ucnv_open(\"utf8\") failed");
-}
-
-void
-identifier_destroy()
-{
-	assert(utf8conv);
-	ucnv_close(utf8conv);
-	utf8conv = NULL;
-}
-
diff --git a/src/box/identifier.h b/src/box/identifier.h
index f1e36fe2a..30e6fdb69 100644
--- a/src/box/identifier.h
+++ b/src/box/identifier.h
@@ -49,19 +49,6 @@ extern "C" {
 int
 identifier_check(const char *str, size_t str_len);
 
-/**
- * Init identifier check mechanism.
- * This function allocates necessary for icu structures.
- */
-void
-identifier_init();
-
-/**
- * Clean icu structures.
- */
-void
-identifier_destroy();
-
 #if defined(__cplusplus)
 } /* extern "C" */
 
diff --git a/test/unit/vy_point_lookup.c b/test/unit/vy_point_lookup.c
index 6a3802def..629cd3efa 100644
--- a/test/unit/vy_point_lookup.c
+++ b/test/unit/vy_point_lookup.c
@@ -327,7 +327,6 @@ test_basic()
 int
 main()
 {
-	identifier_init();
 	plan(1);
 
 	vy_iterator_C_test_init(128 * 1024);
@@ -337,6 +336,5 @@ main()
 
 	vy_iterator_C_test_finish();
 
-	identifier_destroy();
 	return check_plan();
 }
-- 
2.14.3 (Apple Git-98)

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [tarantool-patches] [PATCH 1/1] identifier: do not use ICU UConverter for checks
  2018-04-04 12:35 [PATCH 1/1] identifier: do not use ICU UConverter for checks Vladislav Shpilevoy
@ 2018-04-04 12:38 ` Vladislav Shpilevoy
  2018-04-06 13:53 ` Vladimir Davydov
  1 sibling, 0 replies; 5+ messages in thread
From: Vladislav Shpilevoy @ 2018-04-04 12:38 UTC (permalink / raw)
  To: tarantool-patches; +Cc: vdavydov.dev



04.04.2018 15:35, Vladislav Shpilevoy пишет:
> I makes no sense to create a converter, when there is
Fixed a typo: 'I' -> 'It'.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH 1/1] identifier: do not use ICU UConverter for checks
  2018-04-04 12:35 [PATCH 1/1] identifier: do not use ICU UConverter for checks Vladislav Shpilevoy
  2018-04-04 12:38 ` [tarantool-patches] " Vladislav Shpilevoy
@ 2018-04-06 13:53 ` Vladimir Davydov
  2018-04-07 20:34   ` [tarantool-patches] " Vladislav Shpilevoy
  1 sibling, 1 reply; 5+ messages in thread
From: Vladimir Davydov @ 2018-04-06 13:53 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches

This doesn't work on EL6/7 and Debian Wheezy - check out travis.

  https://travis-ci.org/tarantool/tarantool/builds/362116299?utm_source=github_status&utm_medium=notification

On Wed, Apr 04, 2018 at 03:35:41PM +0300, Vladislav Shpilevoy wrote:
> I makes no sense to create a converter, when there is
> nothing to convert. To check an identifier it is
> enough to use stateless ICU macros: U8_NEXT_OR_FFFD,
> that also allows to eliminate 0xFFFD symbol as a
> special one - this macros returns this code on any
> error, or when it is actually this symbol.
> ---
> Branch: https://github.com/tarantool/tarantool/tree/identifier-do-not-use-uconverter
> 
>  src/box/box.cc              |  2 --
>  src/box/identifier.c        | 51 ++++++++++-----------------------------------
>  src/box/identifier.h        | 13 ------------
>  test/unit/vy_point_lookup.c |  2 --
>  4 files changed, 11 insertions(+), 57 deletions(-)

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [tarantool-patches] Re: [PATCH 1/1] identifier: do not use ICU UConverter for checks
  2018-04-06 13:53 ` Vladimir Davydov
@ 2018-04-07 20:34   ` Vladislav Shpilevoy
  2018-04-10 10:37     ` Vladimir Davydov
  0 siblings, 1 reply; 5+ messages in thread
From: Vladislav Shpilevoy @ 2018-04-07 20:34 UTC (permalink / raw)
  To: tarantool-patches, Vladimir Davydov

Now it works. I changed U8_NEXT_OR_FFFD to U8_NEXT + explicit FFFD check.

On 06/04/2018 16:53, Vladimir Davydov wrote:
> This doesn't work on EL6/7 and Debian Wheezy - check out travis.
> 
>    https://travis-ci.org/tarantool/tarantool/builds/362116299?utm_source=github_status&utm_medium=notification
> 
> On Wed, Apr 04, 2018 at 03:35:41PM +0300, Vladislav Shpilevoy wrote:
>> I makes no sense to create a converter, when there is
>> nothing to convert. To check an identifier it is
>> enough to use stateless ICU macros: U8_NEXT_OR_FFFD,
>> that also allows to eliminate 0xFFFD symbol as a
>> special one - this macros returns this code on any
>> error, or when it is actually this symbol.
>> ---
>> Branch: https://github.com/tarantool/tarantool/tree/identifier-do-not-use-uconverter
>>
>>   src/box/box.cc              |  2 --
>>   src/box/identifier.c        | 51 ++++++++++-----------------------------------
>>   src/box/identifier.h        | 13 ------------
>>   test/unit/vy_point_lookup.c |  2 --
>>   4 files changed, 11 insertions(+), 57 deletions(-)
> 

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [tarantool-patches] Re: [PATCH 1/1] identifier: do not use ICU UConverter for checks
  2018-04-07 20:34   ` [tarantool-patches] " Vladislav Shpilevoy
@ 2018-04-10 10:37     ` Vladimir Davydov
  0 siblings, 0 replies; 5+ messages in thread
From: Vladimir Davydov @ 2018-04-10 10:37 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches

ACK

> From 26f02c3be72636c68d951dc4bc0e9c94ea1e278b Mon Sep 17 00:00:00 2001
> From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
> Date: Wed, 4 Apr 2018 15:30:32 +0300
> Subject: [PATCH] identifier: do not use ICU UConverter for checks
> 
> It makes no sense to create a converter, when there is
> nothing to convert. To check an identifier it is
> enough to use stateless ICU macros: U8_NEXT.
> 
> diff --git a/src/box/box.cc b/src/box/box.cc
> index cb319962..d2dfc5b5 100644
> --- a/src/box/box.cc
> +++ b/src/box/box.cc
> @@ -1518,7 +1518,6 @@ box_free(void)
>  		gc_free();
>  		engine_shutdown();
>  		wal_thread_stop();
> -		identifier_destroy();
>  	}
>  
>  	fiber_cond_destroy(&ro_cond);
> @@ -1724,7 +1723,6 @@ box_cfg_xc(void)
>  	engine_init();
>  	if (module_init() != 0)
>  		diag_raise();
> -	identifier_init();
>  	schema_init();
>  	replication_init();
>  	port_init();
> diff --git a/src/box/identifier.c b/src/box/identifier.c
> index e73e666b..252d5361 100644
> --- a/src/box/identifier.c
> +++ b/src/box/identifier.c
> @@ -33,44 +33,34 @@
>  #include "say.h"
>  #include "diag.h"
>  
> -#include <unicode/ucnv.h>
> +#include <unicode/utf8.h>
>  #include <unicode/uchar.h>
> -/* ICU returns this character in case of unknown symbol */
> -#define REPLACEMENT_CHARACTER (0xFFFD)
> -
> -static UConverter* utf8conv = NULL;
>  
>  int
> -identifier_check(const char *str, size_t str_len)
> +identifier_check(const char *str, int str_len)
>  {
> -	assert(utf8conv);
>  	const char *end = str + str_len;
>  	if (str == end)
>  		goto error;
>  
> -	ucnv_reset(utf8conv);
> -
> -	while (str < end) {
> -		int8_t type;
> -		UErrorCode status = U_ZERO_ERROR;
> -		UChar32 c = ucnv_getNextUChar(utf8conv, &str, end, &status);
> -
> -		if (U_FAILURE(status))
> +	UChar32 c;
> +	int offset = 0;
> +	while (offset < str_len) {
> +		U8_NEXT(str, offset, str_len, c);
> +		if (c == U_SENTINEL || c == 0xFFFD)
>  			goto error;
>  
> -		type = u_charType(c);
> +		int8_t type = u_charType(c);
>  		/**
>  		 * The icu library has a function named u_isprint, however,
>  		 * this function does not return any errors.
>  		 * Here the `c` symbol printability is determined by comparison
>  		 * with unicode category types explicitly.
>  		 */
> -		if (c == REPLACEMENT_CHARACTER ||
> -			type == U_UNASSIGNED ||
> -			type == U_LINE_SEPARATOR ||
> -			type == U_CONTROL_CHAR ||
> -			type == U_PARAGRAPH_SEPARATOR)
> -
> +		if (type == U_UNASSIGNED ||
> +		    type == U_LINE_SEPARATOR ||
> +		    type == U_CONTROL_CHAR ||
> +		    type == U_PARAGRAPH_SEPARATOR)
>  			goto error;
>  	}
>  	return 0;
> @@ -78,22 +68,3 @@ error:
>  	diag_set(ClientError, ER_IDENTIFIER, tt_cstr(str, str_len));
>  	return -1;
>  }
> -
> -void
> -identifier_init()
> -{
> -	assert(utf8conv == NULL);
> -	UErrorCode status = U_ZERO_ERROR ;
> -	utf8conv = ucnv_open("utf8", &status);
> -	if (U_FAILURE(status))
> -		panic("ICU ucnv_open(\"utf8\") failed");
> -}
> -
> -void
> -identifier_destroy()
> -{
> -	assert(utf8conv);
> -	ucnv_close(utf8conv);
> -	utf8conv = NULL;
> -}
> -
> diff --git a/src/box/identifier.h b/src/box/identifier.h
> index f1e36fe2..f34380d6 100644
> --- a/src/box/identifier.h
> +++ b/src/box/identifier.h
> @@ -47,20 +47,7 @@ extern "C" {
>   * @retval -1 error, diagnostics area is set
>   */
>  int
> -identifier_check(const char *str, size_t str_len);
> -
> -/**
> - * Init identifier check mechanism.
> - * This function allocates necessary for icu structures.
> - */
> -void
> -identifier_init();
> -
> -/**
> - * Clean icu structures.
> - */
> -void
> -identifier_destroy();
> +identifier_check(const char *str, int str_len);
>  
>  #if defined(__cplusplus)
>  } /* extern "C" */
> @@ -69,7 +56,7 @@ identifier_destroy();
>   * Throw an error if identifier is not valid.
>   */
>  static inline void
> -identifier_check_xc(const char *str, size_t str_len)
> +identifier_check_xc(const char *str, int str_len)
>  {
>  	if (identifier_check(str, str_len))
>  		diag_raise();
> diff --git a/test/unit/vy_point_lookup.c b/test/unit/vy_point_lookup.c
> index 6a3802de..629cd3ef 100644
> --- a/test/unit/vy_point_lookup.c
> +++ b/test/unit/vy_point_lookup.c
> @@ -327,7 +327,6 @@ test_basic()
>  int
>  main()
>  {
> -	identifier_init();
>  	plan(1);
>  
>  	vy_iterator_C_test_init(128 * 1024);
> @@ -337,6 +336,5 @@ main()
>  
>  	vy_iterator_C_test_finish();
>  
> -	identifier_destroy();
>  	return check_plan();
>  }

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-04-10 10:37 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-04-04 12:35 [PATCH 1/1] identifier: do not use ICU UConverter for checks Vladislav Shpilevoy
2018-04-04 12:38 ` [tarantool-patches] " Vladislav Shpilevoy
2018-04-06 13:53 ` Vladimir Davydov
2018-04-07 20:34   ` [tarantool-patches] " Vladislav Shpilevoy
2018-04-10 10:37     ` Vladimir Davydov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox