From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org>
To: tarantool-patches@freelists.org
Cc: kostja@tarantool.org
Subject: [tarantool-patches] Re: [PATCH v3 3/4] collation: introduce collation fingerprint
Date: Thu, 17 May 2018 22:24:05 +0300	[thread overview]
Message-ID: <7b70d6b6-2fa0-7722-9e52-417508a1e329@tarantool.org> (raw)
In-Reply-To: <0377ce9f0096789625370e2d68ea6f87bbdc4de0.1526414017.git.v.shpilevoy@tarantool.org>
New patch version. It is changed after the previous patch was
reworked.
---
collation: introduce collation fingerprint
Collation fingerprint is a formatted string unique for a set
of collation properties. Equal collations with different names
have the same fingerprint.
This new property is used to build collation fingerprint cache
to use in Tarantool internals, where collation name does not
matter.
Fingerprint cache can never conflict or replace on insertion into
it. It means, that, for example, utf8 module being created in
this patchset, can fill collation cache with its own collations
and it will affect neither users or other modules.
---
  src/coll.c               | 121 ++++++++++++++++++++++++++++++++++++++++++++++-
  src/coll.h               |  17 ++++++-
  src/main.cc              |   3 ++
  test/unit/CMakeLists.txt |   2 +-
  test/unit/coll.cpp       |  39 +++++++++++++--
  test/unit/coll.result    |   5 ++
  6 files changed, 179 insertions(+), 8 deletions(-)
diff --git a/src/coll.c b/src/coll.c
index 66afa6c4f..d0e36827d 100644
--- a/src/coll.c
+++ b/src/coll.c
@@ -32,9 +32,40 @@
  #include "coll.h"
  #include "third_party/PMurHash.h"
  #include "diag.h"
+#include "assoc.h"
  #include <unicode/ucol.h>
  #include <trivia/config.h>
  
+#define mh_name _coll
+struct mh_coll_key_t {
+	const char *str;
+	size_t len;
+	uint32_t hash;
+};
+#define mh_key_t struct mh_coll_key_t *
+
+struct mh_coll_node_t {
+	size_t len;
+	uint32_t hash;
+	struct coll *coll;
+};
+#define mh_node_t struct mh_coll_node_t
+
+#define mh_arg_t void *
+#define mh_hash(a, arg) ((a)->hash)
+#define mh_hash_key(a, arg) ((a)->hash)
+#define mh_cmp(a, b, arg) ((a)->len != (b)->len || \
+			    strncmp((a)->coll->fingerprint, \
+				    (b)->coll->fingerprint, (a)->len))
+#define mh_cmp_key(a, b, arg) ((a)->len != (b)->len || \
+			       strncmp((a)->str, (b)->coll->fingerprint, \
+				       (a)->len))
+#define MH_SOURCE
+#include "salad/mhash.h"
+
+/** Table fingerprint -> collation. */
+static struct mh_coll_t *coll_cache = NULL;
+
  enum {
  	MAX_LOCALE = 1024,
  };
@@ -205,21 +236,88 @@ coll_icu_init_cmp(struct coll *coll, const struct coll_def *def)
  	return 0;
  }
  
+/**
+ * Print ICU definition into @a buffer limited with @a size bytes.
+ * If @a size bytes is not enough, then total needed byte count is
+ * returned.
+ * @param buffer Buffer to write to.
+ * @param size Size of @a buffer.
+ * @param def ICU definition.
+ *
+ * @retval Written or needed byte count.
+ */
+static int
+coll_icu_def_snfingerprint(char *buffer, int size,
+			   const struct coll_icu_def *def)
+{
+	return snprintf(buffer, size, "{french_coll: %d, alt_handling: %d, "\
+			"case_first: %d, case_level: %d, norm_mode: %d, "\
+			"strength: %d, numeric_coll: %d}",
+			(int) def->french_collation,
+			(int) def->alternate_handling, (int) def->case_first,
+			(int) def->case_level, (int) def->normalization_mode,
+			(int) def->strength, (int) def->numeric_collation);
+}
+
+/**
+ * Print collation definition into @a buffer limited with @a size
+ * bytes. If @a size bytes is not enough, then total needed byte
+ * count is returned.
+ * @param buffer Buffer to write to.
+ * @param size Size of @a buffer.
+ * @param def Collation definition.
+ *
+ * @retval Written or needed byte count.
+ */
+static int
+coll_def_snfingerprint(char *buffer, int size, const struct coll_def *def)
+{
+	int total = 0;
+	SNPRINT(total, snprintf, buffer, size, "{locale: %.*s, type = %d, "\
+	        "icu: ", (int) def->locale_len, def->locale, (int) def->type);
+	SNPRINT(total, coll_icu_def_snfingerprint, buffer, size, &def->icu);
+	SNPRINT(total, snprintf, buffer, size, "}");
+	return total;
+}
+
  struct coll *
  coll_new(const struct coll_def *def)
  {
  	assert(def->type == COLL_TYPE_ICU);
-	struct coll *coll = (struct coll *) malloc(sizeof(*coll));
+	int fingerprint_len = coll_def_snfingerprint(NULL, 0, def);
+	assert(fingerprint_len <= TT_STATIC_BUF_LEN);
+	char *fingerprint = tt_static_buf();
+	coll_def_snfingerprint(fingerprint, TT_STATIC_BUF_LEN, def);
+
+	uint32_t hash = mh_strn_hash(fingerprint, fingerprint_len);
+	struct mh_coll_key_t key = { fingerprint, fingerprint_len, hash };
+	mh_int_t i = mh_coll_find(coll_cache, &key, NULL);
+	if (i != mh_end(coll_cache)) {
+		struct coll *coll = mh_coll_node(coll_cache, i)->coll;
+		coll_ref(coll);
+		return coll;
+	}
+
+	int total_size = sizeof(struct coll) + fingerprint_len + 1;
+	struct coll *coll = (struct coll *) malloc(total_size);
  	if (coll == NULL) {
-		diag_set(OutOfMemory, sizeof(*coll), "malloc", "coll");
+		diag_set(OutOfMemory, total_size, "malloc", "coll");
  		return NULL;
  	}
+	memcpy((char *) coll->fingerprint, fingerprint, fingerprint_len + 1);
  	coll->refs = 1;
  	coll->type = def->type;
  	if (coll_icu_init_cmp(coll, def) != 0) {
  		free(coll);
  		return NULL;
  	}
+
+	struct mh_coll_node_t node = { fingerprint_len, hash, coll };
+	if (mh_coll_put(coll_cache, &node, NULL, NULL) == mh_end(coll_cache)) {
+		diag_set(OutOfMemory, sizeof(node), "malloc", "coll_cache");
+		coll_unref(coll);
+		return NULL;
+	}
  	return coll;
  }
  
@@ -228,7 +326,26 @@ coll_unref(struct coll *coll)
  {
  	assert(coll->refs > 0);
  	if (--coll->refs == 0) {
+		int len = strlen(coll->fingerprint);
+		struct mh_coll_node_t node = {
+			len, mh_strn_hash(coll->fingerprint, len), coll
+		};
+		mh_coll_remove(coll_cache, &node, NULL);
  		ucol_close(coll->icu.collator);
  		free(coll);
  	}
  }
+
+void
+coll_init()
+{
+	coll_cache = mh_coll_new();
+	if (coll_cache == NULL)
+		panic("Can not create system collations cache");
+}
+
+void
+coll_free()
+{
+	mh_coll_delete(coll_cache);
+}
diff --git a/src/coll.h b/src/coll.h
index cc834f446..7e950d164 100644
--- a/src/coll.h
+++ b/src/coll.h
@@ -68,10 +68,17 @@ struct coll {
  	coll_hash_f hash;
  	/** Reference counter. */
  	int refs;
+	/**
+	 * Formatted string with collation properties, that
+	 * completely describes how the collation works.
+	 */
+	const char fingerprint[0];
  };
  
  /**
- * Create a collation by definition.
+ * Create a collation by definition. Can return an existing
+ * collation object, if a one with the same fingerprint was
+ * created before.
   * @param def Collation definition.
   * @retval NULL Collation or memory error.
   * @retval not NULL Collation.
@@ -90,6 +97,14 @@ coll_ref(struct coll *coll)
  void
  coll_unref(struct coll *coll);
  
+/** Initialize collations subsystem. */
+void
+coll_init();
+
+/** Destroy collations subsystem. */
+void
+coll_free();
+
  #if defined(__cplusplus)
  } /* extern "C" */
  #endif /* defined(__cplusplus) */
diff --git a/src/main.cc b/src/main.cc
index 1682baea0..a36a2b0d0 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -58,6 +58,7 @@
  #include <say.h>
  #include <rmean.h>
  #include <limits.h>
+#include "coll.h"
  #include "trivia/util.h"
  #include "backtrace.h"
  #include "tt_pthread.h"
@@ -581,6 +582,7 @@ tarantool_free(void)
  	memory_free();
  	random_free();
  #endif
+	coll_free();
  	systemd_free();
  	say_logger_free();
  }
@@ -732,6 +734,7 @@ main(int argc, char **argv)
  	coio_enable();
  	signal_init();
  	cbus_init();
+	coll_init();
  	tarantool_lua_init(tarantool_bin, main_argc, main_argv);
  
  	start_time = ev_monotonic_time();
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 5d83f53b0..dbc02cdf0 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -191,4 +191,4 @@ add_executable(vy_cache.test vy_cache.c ${ITERATOR_TEST_SOURCES})
  target_link_libraries(vy_cache.test ${ITERATOR_TEST_LIBS})
  
  add_executable(coll.test coll.cpp)
-target_link_libraries(coll.test box)
+target_link_libraries(coll.test core unit ${ICU_LIBRARIES} misc)
diff --git a/test/unit/coll.cpp b/test/unit/coll.cpp
index 53e06f2ce..eeee739b7 100644
--- a/test/unit/coll.cpp
+++ b/test/unit/coll.cpp
@@ -9,6 +9,7 @@
  #include <memory.h>
  #include "coll_def.h"
  #include "coll.h"
+#include "unit.h"
  #include "third_party/PMurHash.h"
  
  using namespace std;
@@ -43,7 +44,7 @@ test_sort_strings(vector<const char *> &strings, struct coll *coll)
  void
  manual_test()
  {
-	cout << "\t*** " << __func__ << " ***" << endl;
+	header();
  
  	vector<const char *> strings;
  	struct coll_def def;
@@ -111,7 +112,7 @@ manual_test()
  	test_sort_strings(strings, coll);
  	coll_unref(coll);
  
-	cout << "\t*** " << __func__ << ": done ***" << endl;
+	footer();
  }
  
  unsigned calc_hash(const char *str, struct coll *coll)
@@ -127,7 +128,7 @@ unsigned calc_hash(const char *str, struct coll *coll)
  void
  hash_test()
  {
-	cout << "\t*** " << __func__ << " ***" << endl;
+	header();
  
  	struct coll_def def;
  	memset(&def, 0, sizeof(def));
@@ -155,17 +156,47 @@ hash_test()
  	cout << (calc_hash("аЕ", coll) != calc_hash("аё", coll) ? "OK" : "Fail") << endl;
  	coll_unref(coll);
  
-	cout << "\t*** " << __func__ << ": done ***" << endl;
+	footer();
  }
  
+void
+cache_test()
+{
+	header();
+	plan(2);
+
+	struct coll_def def;
+	memset(&def, 0, sizeof(def));
+	def.locale = "ru_RU";
+	def.locale_len = strlen(def.locale);
+	def.type = COLL_TYPE_ICU;
+
+	struct coll *coll1 = coll_new(&def);
+	struct coll *coll2 = coll_new(&def);
+	is(coll1, coll2,
+	   "collations with the same definition are not duplicated");
+	coll_unref(coll2);
+	def.locale = "en_EN";
+	coll2 = coll_new(&def);
+	isnt(coll1, coll2,
+	     "collations with different definitions are different objects");
+	coll_unref(coll2);
+	coll_unref(coll1);
+
+	check_plan();
+	footer();
+}
  
  int
  main(int, const char**)
  {
+	coll_init();
  	memory_init();
  	fiber_init(fiber_c_invoke);
  	manual_test();
  	hash_test();
+	cache_test();
  	fiber_free();
  	memory_free();
+	coll_free();
  }
\ No newline at end of file
diff --git a/test/unit/coll.result b/test/unit/coll.result
index 218dca8f4..269764246 100644
--- a/test/unit/coll.result
+++ b/test/unit/coll.result
@@ -83,3 +83,8 @@ OK
  OK
  OK
  	*** hash_test: done ***
+	*** cache_test ***
+1..2
+ok 1 - collations with the same definition are not duplicated
+ok 2 - collations with different definitions are different objects
+	*** cache_test: done ***
-- 
2.15.1 (Apple Git-101)
next prev parent reply	other threads:[~2018-05-17 19:24 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-05-15 19:54 [tarantool-patches] [PATCH v3 0/4] Lua utf8 module Vladislav Shpilevoy
2018-05-15 19:54 ` [tarantool-patches] [PATCH v3 1/4] error: introduce error rebulding API Vladislav Shpilevoy
2018-05-16 17:06   ` [tarantool-patches] " Vladislav Shpilevoy
2018-05-15 19:54 ` [tarantool-patches] [PATCH v3 2/4] collation: split collation into core and box objects Vladislav Shpilevoy
2018-05-16 17:07   ` [tarantool-patches] " Vladislav Shpilevoy
2018-05-16 17:17     ` Konstantin Osipov
2018-05-16 17:19       ` Vladislav Shpilevoy
2018-05-17 19:23   ` Vladislav Shpilevoy
2018-05-15 19:54 ` [tarantool-patches] [PATCH v3 3/4] collation: introduce collation fingerprint Vladislav Shpilevoy
2018-05-17 19:24   ` Vladislav Shpilevoy [this message]
2018-05-15 19:54 ` [tarantool-patches] [PATCH v3 4/4] lua: introduce utf8 built-in globaly visible module Vladislav Shpilevoy
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox
  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):
  git send-email \
    --in-reply-to=7b70d6b6-2fa0-7722-9e52-417508a1e329@tarantool.org \
    --to=v.shpilevoy@tarantool.org \
    --cc=kostja@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='[tarantool-patches] Re: [PATCH v3 3/4] collation: introduce collation fingerprint' \
    /path/to/YOUR_REPLY
  https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox