From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id ED21923C1F for ; Tue, 15 May 2018 15:54:10 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id Yqrm3q-198kh for ; Tue, 15 May 2018 15:54:10 -0400 (EDT) Received: from smtp54.i.mail.ru (smtp54.i.mail.ru [217.69.128.34]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 56A9623B5F for ; Tue, 15 May 2018 15:54:10 -0400 (EDT) From: Vladislav Shpilevoy Subject: [tarantool-patches] [PATCH v3 3/4] collation: introduce collation fingerprint Date: Tue, 15 May 2018 22:54:07 +0300 Message-Id: <0377ce9f0096789625370e2d68ea6f87bbdc4de0.1526414017.git.v.shpilevoy@tarantool.org> In-Reply-To: References: MIME-Version: 1.0 In-Reply-To: References: Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: 8bit Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: tarantool-patches@freelists.org Cc: kostja@tarantool.org Collation fingerprint is a formatted string unique for a set of collation properties. Equal collations with different names have the same fingerprint. This new property is used to build collation fingerprint cache to use in Tarantool internals, where collation name does not matter. Fingerprint cache can never conflict or replace on insertion into it. It means, that, for example, utf8 module being created in this patchset, can fill collation cache with its own collations and it will affect neither users or other modules. --- src/coll.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++- src/coll.h | 17 ++++++- src/main.cc | 3 ++ test/unit/CMakeLists.txt | 2 +- test/unit/coll.cpp | 39 +++++++++++++-- test/unit/coll.result | 5 ++ 6 files changed, 180 insertions(+), 8 deletions(-) diff --git a/src/coll.c b/src/coll.c index eacb643f2..398bff49e 100644 --- a/src/coll.c +++ b/src/coll.c @@ -32,12 +32,44 @@ #include "coll.h" #include "third_party/PMurHash.h" #include "diag.h" +#include "assoc.h" #include #include +#define mh_name _coll +struct mh_coll_key_t { + const char *str; + size_t len; + uint32_t hash; +}; +#define mh_key_t struct mh_coll_key_t * + +struct mh_coll_node_t { + size_t len; + uint32_t hash; + struct coll *coll; +}; +#define mh_node_t struct mh_coll_node_t + +#define mh_arg_t void * +#define mh_hash(a, arg) ((a)->hash) +#define mh_hash_key(a, arg) ((a)->hash) +#define mh_cmp(a, b, arg) ((a)->len != (b)->len || \ + strncmp((a)->coll->fingerprint, \ + (b)->coll->fingerprint, (a)->len)) +#define mh_cmp_key(a, b, arg) ((a)->len != (b)->len || \ + strncmp((a)->str, (b)->coll->fingerprint, \ + (a)->len)) +#define MH_SOURCE +#include "salad/mhash.h" + +/** Table fingerprint -> collation. */ +static struct mh_coll_t *coll_cache = NULL; + enum { MAX_HASH_BUFFER = 1024, MAX_LOCALE = 1024, + MAX_FINGERPRINT_SIZE = 1024, }; /** Compare two string using ICU collation. */ @@ -205,21 +237,88 @@ coll_icu_init_cmp(struct coll *coll, const struct coll_def *def) return 0; } +/** + * Print ICU definition into @a buffer limited with @a size bytes. + * If @a size bytes is not enough, then total needed byte count is + * returned. + * @param buffer Buffer to write to. + * @param size Size of @a buffer. + * @param def ICU definition. + * + * @retval Written or needed byte count. + */ +static int +coll_icu_def_snfingerprint(char *buffer, int size, + const struct coll_icu_def *def) +{ + return snprintf(buffer, size, "{french_coll: %d, alt_handling: %d, "\ + "case_first: %d, case_level: %d, norm_mode: %d, "\ + "strength: %d, numeric_coll: %d}", + (int) def->french_collation, + (int) def->alternate_handling, (int) def->case_first, + (int) def->case_level, (int) def->normalization_mode, + (int) def->strength, (int) def->numeric_collation); +} + +/** + * Print collation definition into @a buffer limited with @a size + * bytes. If @a size bytes is not enough, then total needed byte + * count is returned. + * @param buffer Buffer to write to. + * @param size Size of @a buffer. + * @param def Collation definition. + * + * @retval Written or needed byte count. + */ +static int +coll_def_snfingerprint(char *buffer, int size, const struct coll_def *def) +{ + int total = 0; + SNPRINT(total, snprintf, buffer, size, "{locale: %.*s, type = %d, "\ + "icu: ", (int) def->locale_len, def->locale, (int) def->type); + SNPRINT(total, coll_icu_def_snfingerprint, buffer, size, &def->icu); + SNPRINT(total, snprintf, buffer, size, "}"); + return total; +} + struct coll * coll_new(const struct coll_def *def) { assert(def->type == COLL_TYPE_ICU); - struct coll *coll = (struct coll *) malloc(sizeof(*coll)); + int fingerprint_len = coll_def_snfingerprint(NULL, 0, def); + assert(fingerprint_len <= MAX_FINGERPRINT_SIZE); + char fingerprint[MAX_FINGERPRINT_SIZE]; + coll_def_snfingerprint(fingerprint, MAX_FINGERPRINT_SIZE, def); + + uint32_t hash = mh_strn_hash(fingerprint, fingerprint_len); + struct mh_coll_key_t key = { fingerprint, fingerprint_len, hash }; + mh_int_t i = mh_coll_find(coll_cache, &key, NULL); + if (i != mh_end(coll_cache)) { + struct coll *coll = mh_coll_node(coll_cache, i)->coll; + coll_ref(coll); + return coll; + } + + int total_size = sizeof(struct coll) + fingerprint_len + 1; + struct coll *coll = (struct coll *) malloc(total_size); if (coll == NULL) { - diag_set(OutOfMemory, sizeof(*coll), "malloc", "coll"); + diag_set(OutOfMemory, total_size, "malloc", "coll"); return NULL; } + memcpy((char *) coll->fingerprint, fingerprint, fingerprint_len + 1); coll->refs = 1; coll->type = def->type; if (coll_icu_init_cmp(coll, def) != 0) { free(coll); return NULL; } + + struct mh_coll_node_t node = { fingerprint_len, hash, coll }; + if (mh_coll_put(coll_cache, &node, NULL, NULL) == mh_end(coll_cache)) { + diag_set(OutOfMemory, sizeof(node), "malloc", "coll_cache"); + coll_unref(coll); + return NULL; + } return coll; } @@ -228,7 +327,26 @@ coll_unref(struct coll *coll) { assert(coll->refs > 0); if (--coll->refs == 0) { + int len = strlen(coll->fingerprint); + struct mh_coll_node_t node = { + len, mh_strn_hash(coll->fingerprint, len), coll + }; + mh_coll_remove(coll_cache, &node, NULL); ucol_close(coll->icu.collator); free(coll); } } + +void +coll_init() +{ + coll_cache = mh_coll_new(); + if (coll_cache == NULL) + panic("Can not create system collations cache"); +} + +void +coll_free() +{ + mh_coll_delete(coll_cache); +} diff --git a/src/coll.h b/src/coll.h index 8798d9491..348f073eb 100644 --- a/src/coll.h +++ b/src/coll.h @@ -69,10 +69,17 @@ struct coll { coll_hash_f hash; /** Reference counter. */ int refs; + /** + * Formatted string with collation properties, that + * completely describes how the collation works. + */ + const char fingerprint[0]; }; /** - * Create a core collation by definition. + * Create a core collation by definition. Can return an existing + * collation object, if a one with the same fingerprint was + * created before. * @param def Core collation definition. * @retval NULL Illegal parameters or memory error. * @retval not NULL Collation. @@ -91,6 +98,14 @@ coll_ref(struct coll *coll) void coll_unref(struct coll *coll); +/** Initialize collations subsystem. */ +void +coll_init(); + +/** Destroy collations subsystem. */ +void +coll_free(); + #if defined(__cplusplus) } /* extern "C" */ #endif /* defined(__cplusplus) */ diff --git a/src/main.cc b/src/main.cc index 1682baea0..3caa59ffa 100644 --- a/src/main.cc +++ b/src/main.cc @@ -58,6 +58,7 @@ #include #include #include +#include #include "trivia/util.h" #include "backtrace.h" #include "tt_pthread.h" @@ -581,6 +582,7 @@ tarantool_free(void) memory_free(); random_free(); #endif + coll_free(); systemd_free(); say_logger_free(); } @@ -732,6 +734,7 @@ main(int argc, char **argv) coio_enable(); signal_init(); cbus_init(); + coll_init(); tarantool_lua_init(tarantool_bin, main_argc, main_argv); start_time = ev_monotonic_time(); diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 5d83f53b0..dbc02cdf0 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -191,4 +191,4 @@ add_executable(vy_cache.test vy_cache.c ${ITERATOR_TEST_SOURCES}) target_link_libraries(vy_cache.test ${ITERATOR_TEST_LIBS}) add_executable(coll.test coll.cpp) -target_link_libraries(coll.test box) +target_link_libraries(coll.test core unit ${ICU_LIBRARIES} misc) diff --git a/test/unit/coll.cpp b/test/unit/coll.cpp index 17f26ea07..2e2d4a54a 100644 --- a/test/unit/coll.cpp +++ b/test/unit/coll.cpp @@ -9,6 +9,7 @@ #include #include #include +#include "unit.h" #include "third_party/PMurHash.h" using namespace std; @@ -43,7 +44,7 @@ test_sort_strings(vector &strings, struct coll *coll) void manual_test() { - cout << "\t*** " << __func__ << " ***" << endl; + header(); vector strings; struct coll_def def; @@ -111,7 +112,7 @@ manual_test() test_sort_strings(strings, coll); coll_unref(coll); - cout << "\t*** " << __func__ << ": done ***" << endl; + footer(); } unsigned calc_hash(const char *str, struct coll *coll) @@ -127,7 +128,7 @@ unsigned calc_hash(const char *str, struct coll *coll) void hash_test() { - cout << "\t*** " << __func__ << " ***" << endl; + header(); struct coll_def def; memset(&def, 0, sizeof(def)); @@ -155,17 +156,47 @@ hash_test() cout << (calc_hash("аЕ", coll) != calc_hash("аё", coll) ? "OK" : "Fail") << endl; coll_unref(coll); - cout << "\t*** " << __func__ << ": done ***" << endl; + footer(); } +void +cache_test() +{ + header(); + plan(2); + + struct coll_def def; + memset(&def, 0, sizeof(def)); + def.locale = "ru_RU"; + def.locale_len = strlen(def.locale); + def.type = COLL_TYPE_ICU; + + struct coll *coll1 = coll_new(&def); + struct coll *coll2 = coll_new(&def); + is(coll1, coll2, + "collations with the same definition are not duplicated"); + coll_unref(coll2); + def.locale = "en_EN"; + coll2 = coll_new(&def); + isnt(coll1, coll2, + "collations with different definitions are different objects"); + coll_unref(coll2); + coll_unref(coll1); + + check_plan(); + footer(); +} int main(int, const char**) { + coll_init(); memory_init(); fiber_init(fiber_c_invoke); manual_test(); hash_test(); + cache_test(); fiber_free(); memory_free(); + coll_free(); } \ No newline at end of file diff --git a/test/unit/coll.result b/test/unit/coll.result index 218dca8f4..269764246 100644 --- a/test/unit/coll.result +++ b/test/unit/coll.result @@ -83,3 +83,8 @@ OK OK OK *** hash_test: done *** + *** cache_test *** +1..2 +ok 1 - collations with the same definition are not duplicated +ok 2 - collations with different definitions are different objects + *** cache_test: done *** -- 2.15.1 (Apple Git-101)