From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from localhost (localhost [127.0.0.1]) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTP id E50FD251C3 for ; Thu, 17 May 2018 15:24:08 -0400 (EDT) Received: from turing.freelists.org ([127.0.0.1]) by localhost (turing.freelists.org [127.0.0.1]) (amavisd-new, port 10024) with ESMTP id K1BporMr7LfB for ; Thu, 17 May 2018 15:24:08 -0400 (EDT) Received: from smtp29.i.mail.ru (smtp29.i.mail.ru [94.100.177.89]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by turing.freelists.org (Avenir Technologies Mail Multiplex) with ESMTPS id 7D650250E7 for ; Thu, 17 May 2018 15:24:08 -0400 (EDT) Subject: [tarantool-patches] Re: [PATCH v3 3/4] collation: introduce collation fingerprint From: Vladislav Shpilevoy References: <0377ce9f0096789625370e2d68ea6f87bbdc4de0.1526414017.git.v.shpilevoy@tarantool.org> Message-ID: <7b70d6b6-2fa0-7722-9e52-417508a1e329@tarantool.org> Date: Thu, 17 May 2018 22:24:05 +0300 MIME-Version: 1.0 In-Reply-To: <0377ce9f0096789625370e2d68ea6f87bbdc4de0.1526414017.git.v.shpilevoy@tarantool.org> Content-Type: text/plain; charset="utf-8"; format="flowed" Content-Language: en-US Content-Transfer-Encoding: 8bit Sender: tarantool-patches-bounce@freelists.org Errors-to: tarantool-patches-bounce@freelists.org Reply-To: tarantool-patches@freelists.org List-help: List-unsubscribe: List-software: Ecartis version 1.0.0 List-Id: tarantool-patches List-subscribe: List-owner: List-post: List-archive: To: tarantool-patches@freelists.org Cc: kostja@tarantool.org New patch version. It is changed after the previous patch was reworked. --- collation: introduce collation fingerprint Collation fingerprint is a formatted string unique for a set of collation properties. Equal collations with different names have the same fingerprint. 
This new property is used to build a collation fingerprint cache for use in Tarantool internals, where the collation name does not matter. An insertion into the fingerprint cache can never conflict with or replace an existing entry. It means that, for example, the utf8 module being created in this patchset can fill the collation cache with its own collations, and it will affect neither users nor other modules. --- src/coll.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++- src/coll.h | 17 ++++++- src/main.cc | 3 ++ test/unit/CMakeLists.txt | 2 +- test/unit/coll.cpp | 39 +++++++++++++-- test/unit/coll.result | 5 ++ 6 files changed, 179 insertions(+), 8 deletions(-) diff --git a/src/coll.c b/src/coll.c index 66afa6c4f..d0e36827d 100644 --- a/src/coll.c +++ b/src/coll.c @@ -32,9 +32,40 @@ #include "coll.h" #include "third_party/PMurHash.h" #include "diag.h" +#include "assoc.h" #include #include +#define mh_name _coll +struct mh_coll_key_t { + const char *str; + size_t len; + uint32_t hash; +}; +#define mh_key_t struct mh_coll_key_t * + +struct mh_coll_node_t { + size_t len; + uint32_t hash; + struct coll *coll; +}; +#define mh_node_t struct mh_coll_node_t + +#define mh_arg_t void * +#define mh_hash(a, arg) ((a)->hash) +#define mh_hash_key(a, arg) ((a)->hash) +#define mh_cmp(a, b, arg) ((a)->len != (b)->len || \ + strncmp((a)->coll->fingerprint, \ + (b)->coll->fingerprint, (a)->len)) +#define mh_cmp_key(a, b, arg) ((a)->len != (b)->len || \ + strncmp((a)->str, (b)->coll->fingerprint, \ + (a)->len)) +#define MH_SOURCE +#include "salad/mhash.h" + +/** Table fingerprint -> collation. */ +static struct mh_coll_t *coll_cache = NULL; + enum { MAX_LOCALE = 1024, }; @@ -205,21 +236,88 @@ coll_icu_init_cmp(struct coll *coll, const struct coll_def *def) return 0; } +/** + * Print ICU definition into @a buffer limited with @a size bytes. + * If @a size bytes is not enough, then total needed byte count is + * returned. + * @param buffer Buffer to write to. + * @param size Size of @a buffer. 
+ * @param def ICU definition. + * + * @retval Written or needed byte count. + */ +static int +coll_icu_def_snfingerprint(char *buffer, int size, + const struct coll_icu_def *def) +{ + return snprintf(buffer, size, "{french_coll: %d, alt_handling: %d, "\ + "case_first: %d, case_level: %d, norm_mode: %d, "\ + "strength: %d, numeric_coll: %d}", + (int) def->french_collation, + (int) def->alternate_handling, (int) def->case_first, + (int) def->case_level, (int) def->normalization_mode, + (int) def->strength, (int) def->numeric_collation); +} + +/** + * Print collation definition into @a buffer limited with @a size + * bytes. If @a size bytes is not enough, then total needed byte + * count is returned. + * @param buffer Buffer to write to. + * @param size Size of @a buffer. + * @param def Collation definition. + * + * @retval Written or needed byte count. + */ +static int +coll_def_snfingerprint(char *buffer, int size, const struct coll_def *def) +{ + int total = 0; + SNPRINT(total, snprintf, buffer, size, "{locale: %.*s, type = %d, "\ + "icu: ", (int) def->locale_len, def->locale, (int) def->type); + SNPRINT(total, coll_icu_def_snfingerprint, buffer, size, &def->icu); + SNPRINT(total, snprintf, buffer, size, "}"); + return total; +} + struct coll * coll_new(const struct coll_def *def) { assert(def->type == COLL_TYPE_ICU); - struct coll *coll = (struct coll *) malloc(sizeof(*coll)); + int fingerprint_len = coll_def_snfingerprint(NULL, 0, def); + assert(fingerprint_len <= TT_STATIC_BUF_LEN); + char *fingerprint = tt_static_buf(); + coll_def_snfingerprint(fingerprint, TT_STATIC_BUF_LEN, def); + + uint32_t hash = mh_strn_hash(fingerprint, fingerprint_len); + struct mh_coll_key_t key = { fingerprint, fingerprint_len, hash }; + mh_int_t i = mh_coll_find(coll_cache, &key, NULL); + if (i != mh_end(coll_cache)) { + struct coll *coll = mh_coll_node(coll_cache, i)->coll; + coll_ref(coll); + return coll; + } + + int total_size = sizeof(struct coll) + fingerprint_len + 1; + struct 
coll *coll = (struct coll *) malloc(total_size); if (coll == NULL) { - diag_set(OutOfMemory, sizeof(*coll), "malloc", "coll"); + diag_set(OutOfMemory, total_size, "malloc", "coll"); return NULL; } + memcpy((char *) coll->fingerprint, fingerprint, fingerprint_len + 1); coll->refs = 1; coll->type = def->type; if (coll_icu_init_cmp(coll, def) != 0) { free(coll); return NULL; } + + struct mh_coll_node_t node = { fingerprint_len, hash, coll }; + if (mh_coll_put(coll_cache, &node, NULL, NULL) == mh_end(coll_cache)) { + diag_set(OutOfMemory, sizeof(node), "malloc", "coll_cache"); + coll_unref(coll); + return NULL; + } return coll; } @@ -228,7 +326,26 @@ coll_unref(struct coll *coll) { assert(coll->refs > 0); if (--coll->refs == 0) { + int len = strlen(coll->fingerprint); + struct mh_coll_node_t node = { + len, mh_strn_hash(coll->fingerprint, len), coll + }; + mh_coll_remove(coll_cache, &node, NULL); ucol_close(coll->icu.collator); free(coll); } } + +void +coll_init() +{ + coll_cache = mh_coll_new(); + if (coll_cache == NULL) + panic("Can not create system collations cache"); +} + +void +coll_free() +{ + mh_coll_delete(coll_cache); +} diff --git a/src/coll.h b/src/coll.h index cc834f446..7e950d164 100644 --- a/src/coll.h +++ b/src/coll.h @@ -68,10 +68,17 @@ struct coll { coll_hash_f hash; /** Reference counter. */ int refs; + /** + * Formatted string with collation properties, that + * completely describes how the collation works. + */ + const char fingerprint[0]; }; /** - * Create a collation by definition. + * Create a collation by definition. Can return an existing + * collation object, if a one with the same fingerprint was + * created before. * @param def Collation definition. * @retval NULL Collation or memory error. * @retval not NULL Collation. @@ -90,6 +97,14 @@ coll_ref(struct coll *coll) void coll_unref(struct coll *coll); +/** Initialize collations subsystem. */ +void +coll_init(); + +/** Destroy collations subsystem. 
*/ +void +coll_free(); + #if defined(__cplusplus) } /* extern "C" */ #endif /* defined(__cplusplus) */ diff --git a/src/main.cc b/src/main.cc index 1682baea0..a36a2b0d0 100644 --- a/src/main.cc +++ b/src/main.cc @@ -58,6 +58,7 @@ #include #include #include +#include "coll.h" #include "trivia/util.h" #include "backtrace.h" #include "tt_pthread.h" @@ -581,6 +582,7 @@ tarantool_free(void) memory_free(); random_free(); #endif + coll_free(); systemd_free(); say_logger_free(); } @@ -732,6 +734,7 @@ main(int argc, char **argv) coio_enable(); signal_init(); cbus_init(); + coll_init(); tarantool_lua_init(tarantool_bin, main_argc, main_argv); start_time = ev_monotonic_time(); diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 5d83f53b0..dbc02cdf0 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -191,4 +191,4 @@ add_executable(vy_cache.test vy_cache.c ${ITERATOR_TEST_SOURCES}) target_link_libraries(vy_cache.test ${ITERATOR_TEST_LIBS}) add_executable(coll.test coll.cpp) -target_link_libraries(coll.test box) +target_link_libraries(coll.test core unit ${ICU_LIBRARIES} misc) diff --git a/test/unit/coll.cpp b/test/unit/coll.cpp index 53e06f2ce..eeee739b7 100644 --- a/test/unit/coll.cpp +++ b/test/unit/coll.cpp @@ -9,6 +9,7 @@ #include #include "coll_def.h" #include "coll.h" +#include "unit.h" #include "third_party/PMurHash.h" using namespace std; @@ -43,7 +44,7 @@ test_sort_strings(vector &strings, struct coll *coll) void manual_test() { - cout << "\t*** " << __func__ << " ***" << endl; + header(); vector strings; struct coll_def def; @@ -111,7 +112,7 @@ manual_test() test_sort_strings(strings, coll); coll_unref(coll); - cout << "\t*** " << __func__ << ": done ***" << endl; + footer(); } unsigned calc_hash(const char *str, struct coll *coll) @@ -127,7 +128,7 @@ unsigned calc_hash(const char *str, struct coll *coll) void hash_test() { - cout << "\t*** " << __func__ << " ***" << endl; + header(); struct coll_def def; memset(&def, 0, 
sizeof(def)); @@ -155,17 +156,47 @@ hash_test() cout << (calc_hash("аЕ", coll) != calc_hash("аё", coll) ? "OK" : "Fail") << endl; coll_unref(coll); - cout << "\t*** " << __func__ << ": done ***" << endl; + footer(); } +void +cache_test() +{ + header(); + plan(2); + + struct coll_def def; + memset(&def, 0, sizeof(def)); + def.locale = "ru_RU"; + def.locale_len = strlen(def.locale); + def.type = COLL_TYPE_ICU; + + struct coll *coll1 = coll_new(&def); + struct coll *coll2 = coll_new(&def); + is(coll1, coll2, + "collations with the same definition are not duplicated"); + coll_unref(coll2); + def.locale = "en_EN"; + coll2 = coll_new(&def); + isnt(coll1, coll2, + "collations with different definitions are different objects"); + coll_unref(coll2); + coll_unref(coll1); + + check_plan(); + footer(); +} int main(int, const char**) { + coll_init(); memory_init(); fiber_init(fiber_c_invoke); manual_test(); hash_test(); + cache_test(); fiber_free(); memory_free(); + coll_free(); } \ No newline at end of file diff --git a/test/unit/coll.result b/test/unit/coll.result index 218dca8f4..269764246 100644 --- a/test/unit/coll.result +++ b/test/unit/coll.result @@ -83,3 +83,8 @@ OK OK OK *** hash_test: done *** + *** cache_test *** +1..2 +ok 1 - collations with the same definition are not duplicated +ok 2 - collations with different definitions are different objects + *** cache_test: done *** -- 2.15.1 (Apple Git-101)