[tarantool-patches] [PATCH v3 3/4] collation: introduce collation fingerprint

Vladislav Shpilevoy v.shpilevoy at tarantool.org
Tue May 15 22:54:07 MSK 2018


Collation fingerprint is a formatted string unique for a set
of collation properties. Equal collations with different names
have the same fingerprint.

This new property is used to build collation fingerprint cache
to use in Tarantool internals, where collation name does not
matter.

Fingerprint cache can never conflict or replace on insertion into
it. It means, that, for example, utf8 module being created in
this patchset, can fill collation cache with its own collations
and it will affect neither users or other modules.
---
 src/coll.c               | 122 ++++++++++++++++++++++++++++++++++++++++++++++-
 src/coll.h               |  17 ++++++-
 src/main.cc              |   3 ++
 test/unit/CMakeLists.txt |   2 +-
 test/unit/coll.cpp       |  39 +++++++++++++--
 test/unit/coll.result    |   5 ++
 6 files changed, 180 insertions(+), 8 deletions(-)

diff --git a/src/coll.c b/src/coll.c
index eacb643f2..398bff49e 100644
--- a/src/coll.c
+++ b/src/coll.c
@@ -32,12 +32,44 @@
 #include "coll.h"
 #include "third_party/PMurHash.h"
 #include "diag.h"
+#include "assoc.h"
 #include <unicode/ucol.h>
 #include <trivia/config.h>
 
+#define mh_name _coll
+struct mh_coll_key_t {
+	const char *str;
+	size_t len;
+	uint32_t hash;
+};
+#define mh_key_t struct mh_coll_key_t *
+
+struct mh_coll_node_t {
+	size_t len;
+	uint32_t hash;
+	struct coll *coll;
+};
+#define mh_node_t struct mh_coll_node_t
+
+#define mh_arg_t void *
+#define mh_hash(a, arg) ((a)->hash)
+#define mh_hash_key(a, arg) ((a)->hash)
+#define mh_cmp(a, b, arg) ((a)->len != (b)->len || \
+			    strncmp((a)->coll->fingerprint, \
+				    (b)->coll->fingerprint, (a)->len))
+#define mh_cmp_key(a, b, arg) ((a)->len != (b)->len || \
+			       strncmp((a)->str, (b)->coll->fingerprint, \
+				       (a)->len))
+#define MH_SOURCE
+#include "salad/mhash.h"
+
+/** Table fingerprint -> collation. */
+static struct mh_coll_t *coll_cache = NULL;
+
 enum {
 	MAX_HASH_BUFFER = 1024,
 	MAX_LOCALE = 1024,
+	MAX_FINGERPRINT_SIZE = 1024,
 };
 
 /** Compare two string using ICU collation. */
@@ -205,21 +237,88 @@ coll_icu_init_cmp(struct coll *coll, const struct coll_def *def)
 	return 0;
 }
 
+/**
+ * Print ICU definition into @a buffer limited with @a size bytes.
+ * If @a size bytes is not enough, then total needed byte count is
+ * returned.
+ * @param buffer Buffer to write to.
+ * @param size Size of @a buffer.
+ * @param def ICU definition.
+ *
+ * @retval Written or needed byte count.
+ */
+static int
+coll_icu_def_snfingerprint(char *buffer, int size,
+			   const struct coll_icu_def *def)
+{
+	return snprintf(buffer, size, "{french_coll: %d, alt_handling: %d, "\
+			"case_first: %d, case_level: %d, norm_mode: %d, "\
+			"strength: %d, numeric_coll: %d}",
+			(int) def->french_collation,
+			(int) def->alternate_handling, (int) def->case_first,
+			(int) def->case_level, (int) def->normalization_mode,
+			(int) def->strength, (int) def->numeric_collation);
+}
+
+/**
+ * Print collation definition into @a buffer limited with @a size
+ * bytes. If @a size bytes is not enough, then total needed byte
+ * count is returned.
+ * @param buffer Buffer to write to.
+ * @param size Size of @a buffer.
+ * @param def Collation definition.
+ *
+ * @retval Written or needed byte count.
+ */
+static int
+coll_def_snfingerprint(char *buffer, int size, const struct coll_def *def)
+{
+	int total = 0;
+	SNPRINT(total, snprintf, buffer, size, "{locale: %.*s, type = %d, "\
+	        "icu: ", (int) def->locale_len, def->locale, (int) def->type);
+	SNPRINT(total, coll_icu_def_snfingerprint, buffer, size, &def->icu);
+	SNPRINT(total, snprintf, buffer, size, "}");
+	return total;
+}
+
 struct coll *
 coll_new(const struct coll_def *def)
 {
 	assert(def->type == COLL_TYPE_ICU);
-	struct coll *coll = (struct coll *) malloc(sizeof(*coll));
+	int fingerprint_len = coll_def_snfingerprint(NULL, 0, def);
+	assert(fingerprint_len <= MAX_FINGERPRINT_SIZE);
+	char fingerprint[MAX_FINGERPRINT_SIZE];
+	coll_def_snfingerprint(fingerprint, MAX_FINGERPRINT_SIZE, def);
+
+	uint32_t hash = mh_strn_hash(fingerprint, fingerprint_len);
+	struct mh_coll_key_t key = { fingerprint, fingerprint_len, hash };
+	mh_int_t i = mh_coll_find(coll_cache, &key, NULL);
+	if (i != mh_end(coll_cache)) {
+		struct coll *coll = mh_coll_node(coll_cache, i)->coll;
+		coll_ref(coll);
+		return coll;
+	}
+
+	int total_size = sizeof(struct coll) + fingerprint_len + 1;
+	struct coll *coll = (struct coll *) malloc(total_size);
 	if (coll == NULL) {
-		diag_set(OutOfMemory, sizeof(*coll), "malloc", "coll");
+		diag_set(OutOfMemory, total_size, "malloc", "coll");
 		return NULL;
 	}
+	memcpy((char *) coll->fingerprint, fingerprint, fingerprint_len + 1);
 	coll->refs = 1;
 	coll->type = def->type;
 	if (coll_icu_init_cmp(coll, def) != 0) {
 		free(coll);
 		return NULL;
 	}
+
+	struct mh_coll_node_t node = { fingerprint_len, hash, coll };
+	if (mh_coll_put(coll_cache, &node, NULL, NULL) == mh_end(coll_cache)) {
+		diag_set(OutOfMemory, sizeof(node), "malloc", "coll_cache");
+		coll_unref(coll);
+		return NULL;
+	}
 	return coll;
 }
 
@@ -228,7 +327,26 @@ coll_unref(struct coll *coll)
 {
 	assert(coll->refs > 0);
 	if (--coll->refs == 0) {
+		int len = strlen(coll->fingerprint);
+		struct mh_coll_node_t node = {
+			len, mh_strn_hash(coll->fingerprint, len), coll
+		};
+		mh_coll_remove(coll_cache, &node, NULL);
 		ucol_close(coll->icu.collator);
 		free(coll);
 	}
 }
+
+void
+coll_init()
+{
+	coll_cache = mh_coll_new();
+	if (coll_cache == NULL)
+		panic("Can not create system collations cache");
+}
+
+void
+coll_free()
+{
+	mh_coll_delete(coll_cache);
+}
diff --git a/src/coll.h b/src/coll.h
index 8798d9491..348f073eb 100644
--- a/src/coll.h
+++ b/src/coll.h
@@ -69,10 +69,17 @@ struct coll {
 	coll_hash_f hash;
 	/** Reference counter. */
 	int refs;
+	/**
+	 * Formatted string with collation properties, that
+	 * completely describes how the collation works.
+	 */
+	const char fingerprint[0];
 };
 
 /**
- * Create a core collation by definition.
+ * Create a core collation by definition. Can return an existing
+ * collation object, if a one with the same fingerprint was
+ * created before.
  * @param def Core collation definition.
  * @retval NULL Illegal parameters or memory error.
  * @retval not NULL Collation.
@@ -91,6 +98,14 @@ coll_ref(struct coll *coll)
 void
 coll_unref(struct coll *coll);
 
+/** Initialize collations subsystem. */
+void
+coll_init();
+
+/** Destroy collations subsystem. */
+void
+coll_free();
+
 #if defined(__cplusplus)
 } /* extern "C" */
 #endif /* defined(__cplusplus) */
diff --git a/src/main.cc b/src/main.cc
index 1682baea0..3caa59ffa 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -58,6 +58,7 @@
 #include <say.h>
 #include <rmean.h>
 #include <limits.h>
+#include <coll.h>
 #include "trivia/util.h"
 #include "backtrace.h"
 #include "tt_pthread.h"
@@ -581,6 +582,7 @@ tarantool_free(void)
 	memory_free();
 	random_free();
 #endif
+	coll_free();
 	systemd_free();
 	say_logger_free();
 }
@@ -732,6 +734,7 @@ main(int argc, char **argv)
 	coio_enable();
 	signal_init();
 	cbus_init();
+	coll_init();
 	tarantool_lua_init(tarantool_bin, main_argc, main_argv);
 
 	start_time = ev_monotonic_time();
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 5d83f53b0..dbc02cdf0 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -191,4 +191,4 @@ add_executable(vy_cache.test vy_cache.c ${ITERATOR_TEST_SOURCES})
 target_link_libraries(vy_cache.test ${ITERATOR_TEST_LIBS})
 
 add_executable(coll.test coll.cpp)
-target_link_libraries(coll.test box)
+target_link_libraries(coll.test core unit ${ICU_LIBRARIES} misc)
diff --git a/test/unit/coll.cpp b/test/unit/coll.cpp
index 17f26ea07..2e2d4a54a 100644
--- a/test/unit/coll.cpp
+++ b/test/unit/coll.cpp
@@ -9,6 +9,7 @@
 #include <diag.h>
 #include <fiber.h>
 #include <memory.h>
+#include "unit.h"
 #include "third_party/PMurHash.h"
 
 using namespace std;
@@ -43,7 +44,7 @@ test_sort_strings(vector<const char *> &strings, struct coll *coll)
 void
 manual_test()
 {
-	cout << "\t*** " << __func__ << " ***" << endl;
+	header();
 
 	vector<const char *> strings;
 	struct coll_def def;
@@ -111,7 +112,7 @@ manual_test()
 	test_sort_strings(strings, coll);
 	coll_unref(coll);
 
-	cout << "\t*** " << __func__ << ": done ***" << endl;
+	footer();
 }
 
 unsigned calc_hash(const char *str, struct coll *coll)
@@ -127,7 +128,7 @@ unsigned calc_hash(const char *str, struct coll *coll)
 void
 hash_test()
 {
-	cout << "\t*** " << __func__ << " ***" << endl;
+	header();
 
 	struct coll_def def;
 	memset(&def, 0, sizeof(def));
@@ -155,17 +156,47 @@ hash_test()
 	cout << (calc_hash("аЕ", coll) != calc_hash("аё", coll) ? "OK" : "Fail") << endl;
 	coll_unref(coll);
 
-	cout << "\t*** " << __func__ << ": done ***" << endl;
+	footer();
 }
 
+void
+cache_test()
+{
+	header();
+	plan(2);
+
+	struct coll_def def;
+	memset(&def, 0, sizeof(def));
+	def.locale = "ru_RU";
+	def.locale_len = strlen(def.locale);
+	def.type = COLL_TYPE_ICU;
+
+	struct coll *coll1 = coll_new(&def);
+	struct coll *coll2 = coll_new(&def);
+	is(coll1, coll2,
+	   "collations with the same definition are not duplicated");
+	coll_unref(coll2);
+	def.locale = "en_EN";
+	coll2 = coll_new(&def);
+	isnt(coll1, coll2,
+	     "collations with different definitions are different objects");
+	coll_unref(coll2);
+	coll_unref(coll1);
+
+	check_plan();
+	footer();
+}
 
 int
 main(int, const char**)
 {
+	coll_init();
 	memory_init();
 	fiber_init(fiber_c_invoke);
 	manual_test();
 	hash_test();
+	cache_test();
 	fiber_free();
 	memory_free();
+	coll_free();
 }
\ No newline at end of file
diff --git a/test/unit/coll.result b/test/unit/coll.result
index 218dca8f4..269764246 100644
--- a/test/unit/coll.result
+++ b/test/unit/coll.result
@@ -83,3 +83,8 @@ OK
 OK
 OK
 	*** hash_test: done ***
+	*** cache_test ***
+1..2
+ok 1 - collations with the same definition are not duplicated
+ok 2 - collations with different definitions are different objects
+	*** cache_test: done ***
-- 
2.15.1 (Apple Git-101)





More information about the Tarantool-patches mailing list