From: Vladislav Shpilevoy <v.shpilevoy@tarantool.org> To: tarantool-patches@freelists.org Cc: kostja@tarantool.org Subject: [tarantool-patches] Re: [PATCH v3 3/4] collation: introduce collation fingerprint Date: Thu, 17 May 2018 22:24:05 +0300 [thread overview] Message-ID: <7b70d6b6-2fa0-7722-9e52-417508a1e329@tarantool.org> (raw) In-Reply-To: <0377ce9f0096789625370e2d68ea6f87bbdc4de0.1526414017.git.v.shpilevoy@tarantool.org> New patch version. It has changed because the previous patch was reworked. --- collation: introduce collation fingerprint A collation fingerprint is a formatted string unique for a set of collation properties. Equal collations with different names have the same fingerprint. This new property is used to build a collation fingerprint cache for use in Tarantool internals, where the collation name does not matter. Insertion into the fingerprint cache can never conflict with or replace an existing entry. This means that, for example, the utf8 module being created in this patchset can fill the collation cache with its own collations, and it will affect neither users nor other modules. 
--- src/coll.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++- src/coll.h | 17 ++++++- src/main.cc | 3 ++ test/unit/CMakeLists.txt | 2 +- test/unit/coll.cpp | 39 +++++++++++++-- test/unit/coll.result | 5 ++ 6 files changed, 179 insertions(+), 8 deletions(-) diff --git a/src/coll.c b/src/coll.c index 66afa6c4f..d0e36827d 100644 --- a/src/coll.c +++ b/src/coll.c @@ -32,9 +32,40 @@ #include "coll.h" #include "third_party/PMurHash.h" #include "diag.h" +#include "assoc.h" #include <unicode/ucol.h> #include <trivia/config.h> +#define mh_name _coll +struct mh_coll_key_t { + const char *str; + size_t len; + uint32_t hash; +}; +#define mh_key_t struct mh_coll_key_t * + +struct mh_coll_node_t { + size_t len; + uint32_t hash; + struct coll *coll; +}; +#define mh_node_t struct mh_coll_node_t + +#define mh_arg_t void * +#define mh_hash(a, arg) ((a)->hash) +#define mh_hash_key(a, arg) ((a)->hash) +#define mh_cmp(a, b, arg) ((a)->len != (b)->len || \ + strncmp((a)->coll->fingerprint, \ + (b)->coll->fingerprint, (a)->len)) +#define mh_cmp_key(a, b, arg) ((a)->len != (b)->len || \ + strncmp((a)->str, (b)->coll->fingerprint, \ + (a)->len)) +#define MH_SOURCE +#include "salad/mhash.h" + +/** Table fingerprint -> collation. */ +static struct mh_coll_t *coll_cache = NULL; + enum { MAX_LOCALE = 1024, }; @@ -205,21 +236,88 @@ coll_icu_init_cmp(struct coll *coll, const struct coll_def *def) return 0; } +/** + * Print ICU definition into @a buffer limited with @a size bytes. + * If @a size bytes is not enough, then total needed byte count is + * returned. + * @param buffer Buffer to write to. + * @param size Size of @a buffer. + * @param def ICU definition. + * + * @retval Written or needed byte count. 
+ */ +static int +coll_icu_def_snfingerprint(char *buffer, int size, + const struct coll_icu_def *def) +{ + return snprintf(buffer, size, "{french_coll: %d, alt_handling: %d, "\ + "case_first: %d, case_level: %d, norm_mode: %d, "\ + "strength: %d, numeric_coll: %d}", + (int) def->french_collation, + (int) def->alternate_handling, (int) def->case_first, + (int) def->case_level, (int) def->normalization_mode, + (int) def->strength, (int) def->numeric_collation); +} + +/** + * Print collation definition into @a buffer limited with @a size + * bytes. If @a size bytes is not enough, then total needed byte + * count is returned. + * @param buffer Buffer to write to. + * @param size Size of @a buffer. + * @param def Collation definition. + * + * @retval Written or needed byte count. + */ +static int +coll_def_snfingerprint(char *buffer, int size, const struct coll_def *def) +{ + int total = 0; + SNPRINT(total, snprintf, buffer, size, "{locale: %.*s, type = %d, "\ + "icu: ", (int) def->locale_len, def->locale, (int) def->type); + SNPRINT(total, coll_icu_def_snfingerprint, buffer, size, &def->icu); + SNPRINT(total, snprintf, buffer, size, "}"); + return total; +} + struct coll * coll_new(const struct coll_def *def) { assert(def->type == COLL_TYPE_ICU); - struct coll *coll = (struct coll *) malloc(sizeof(*coll)); + int fingerprint_len = coll_def_snfingerprint(NULL, 0, def); + assert(fingerprint_len <= TT_STATIC_BUF_LEN); + char *fingerprint = tt_static_buf(); + coll_def_snfingerprint(fingerprint, TT_STATIC_BUF_LEN, def); + + uint32_t hash = mh_strn_hash(fingerprint, fingerprint_len); + struct mh_coll_key_t key = { fingerprint, fingerprint_len, hash }; + mh_int_t i = mh_coll_find(coll_cache, &key, NULL); + if (i != mh_end(coll_cache)) { + struct coll *coll = mh_coll_node(coll_cache, i)->coll; + coll_ref(coll); + return coll; + } + + int total_size = sizeof(struct coll) + fingerprint_len + 1; + struct coll *coll = (struct coll *) malloc(total_size); if (coll == NULL) { - 
diag_set(OutOfMemory, sizeof(*coll), "malloc", "coll"); + diag_set(OutOfMemory, total_size, "malloc", "coll"); return NULL; } + memcpy((char *) coll->fingerprint, fingerprint, fingerprint_len + 1); coll->refs = 1; coll->type = def->type; if (coll_icu_init_cmp(coll, def) != 0) { free(coll); return NULL; } + + struct mh_coll_node_t node = { fingerprint_len, hash, coll }; + if (mh_coll_put(coll_cache, &node, NULL, NULL) == mh_end(coll_cache)) { + diag_set(OutOfMemory, sizeof(node), "malloc", "coll_cache"); + coll_unref(coll); + return NULL; + } return coll; } @@ -228,7 +326,26 @@ coll_unref(struct coll *coll) { assert(coll->refs > 0); if (--coll->refs == 0) { + int len = strlen(coll->fingerprint); + struct mh_coll_node_t node = { + len, mh_strn_hash(coll->fingerprint, len), coll + }; + mh_coll_remove(coll_cache, &node, NULL); ucol_close(coll->icu.collator); free(coll); } } + +void +coll_init() +{ + coll_cache = mh_coll_new(); + if (coll_cache == NULL) + panic("Can not create system collations cache"); +} + +void +coll_free() +{ + mh_coll_delete(coll_cache); +} diff --git a/src/coll.h b/src/coll.h index cc834f446..7e950d164 100644 --- a/src/coll.h +++ b/src/coll.h @@ -68,10 +68,17 @@ struct coll { coll_hash_f hash; /** Reference counter. */ int refs; + /** + * Formatted string with collation properties, that + * completely describes how the collation works. + */ + const char fingerprint[0]; }; /** - * Create a collation by definition. + * Create a collation by definition. Can return an existing + * collation object, if a one with the same fingerprint was + * created before. * @param def Collation definition. * @retval NULL Collation or memory error. * @retval not NULL Collation. @@ -90,6 +97,14 @@ coll_ref(struct coll *coll) void coll_unref(struct coll *coll); +/** Initialize collations subsystem. */ +void +coll_init(); + +/** Destroy collations subsystem. 
*/ +void +coll_free(); + #if defined(__cplusplus) } /* extern "C" */ #endif /* defined(__cplusplus) */ diff --git a/src/main.cc b/src/main.cc index 1682baea0..a36a2b0d0 100644 --- a/src/main.cc +++ b/src/main.cc @@ -58,6 +58,7 @@ #include <say.h> #include <rmean.h> #include <limits.h> +#include "coll.h" #include "trivia/util.h" #include "backtrace.h" #include "tt_pthread.h" @@ -581,6 +582,7 @@ tarantool_free(void) memory_free(); random_free(); #endif + coll_free(); systemd_free(); say_logger_free(); } @@ -732,6 +734,7 @@ main(int argc, char **argv) coio_enable(); signal_init(); cbus_init(); + coll_init(); tarantool_lua_init(tarantool_bin, main_argc, main_argv); start_time = ev_monotonic_time(); diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 5d83f53b0..dbc02cdf0 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -191,4 +191,4 @@ add_executable(vy_cache.test vy_cache.c ${ITERATOR_TEST_SOURCES}) target_link_libraries(vy_cache.test ${ITERATOR_TEST_LIBS}) add_executable(coll.test coll.cpp) -target_link_libraries(coll.test box) +target_link_libraries(coll.test core unit ${ICU_LIBRARIES} misc) diff --git a/test/unit/coll.cpp b/test/unit/coll.cpp index 53e06f2ce..eeee739b7 100644 --- a/test/unit/coll.cpp +++ b/test/unit/coll.cpp @@ -9,6 +9,7 @@ #include <memory.h> #include "coll_def.h" #include "coll.h" +#include "unit.h" #include "third_party/PMurHash.h" using namespace std; @@ -43,7 +44,7 @@ test_sort_strings(vector<const char *> &strings, struct coll *coll) void manual_test() { - cout << "\t*** " << __func__ << " ***" << endl; + header(); vector<const char *> strings; struct coll_def def; @@ -111,7 +112,7 @@ manual_test() test_sort_strings(strings, coll); coll_unref(coll); - cout << "\t*** " << __func__ << ": done ***" << endl; + footer(); } unsigned calc_hash(const char *str, struct coll *coll) @@ -127,7 +128,7 @@ unsigned calc_hash(const char *str, struct coll *coll) void hash_test() { - cout << "\t*** " << __func__ << " 
***" << endl; + header(); struct coll_def def; memset(&def, 0, sizeof(def)); @@ -155,17 +156,47 @@ hash_test() cout << (calc_hash("аЕ", coll) != calc_hash("аё", coll) ? "OK" : "Fail") << endl; coll_unref(coll); - cout << "\t*** " << __func__ << ": done ***" << endl; + footer(); } +void +cache_test() +{ + header(); + plan(2); + + struct coll_def def; + memset(&def, 0, sizeof(def)); + def.locale = "ru_RU"; + def.locale_len = strlen(def.locale); + def.type = COLL_TYPE_ICU; + + struct coll *coll1 = coll_new(&def); + struct coll *coll2 = coll_new(&def); + is(coll1, coll2, + "collations with the same definition are not duplicated"); + coll_unref(coll2); + def.locale = "en_EN"; + coll2 = coll_new(&def); + isnt(coll1, coll2, + "collations with different definitions are different objects"); + coll_unref(coll2); + coll_unref(coll1); + + check_plan(); + footer(); +} int main(int, const char**) { + coll_init(); memory_init(); fiber_init(fiber_c_invoke); manual_test(); hash_test(); + cache_test(); fiber_free(); memory_free(); + coll_free(); } \ No newline at end of file diff --git a/test/unit/coll.result b/test/unit/coll.result index 218dca8f4..269764246 100644 --- a/test/unit/coll.result +++ b/test/unit/coll.result @@ -83,3 +83,8 @@ OK OK OK *** hash_test: done *** + *** cache_test *** +1..2 +ok 1 - collations with the same definition are not duplicated +ok 2 - collations with different definitions are different objects + *** cache_test: done *** -- 2.15.1 (Apple Git-101)
next prev parent reply other threads:[~2018-05-17 19:24 UTC|newest] Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top 2018-05-15 19:54 [tarantool-patches] [PATCH v3 0/4] Lua utf8 module Vladislav Shpilevoy 2018-05-15 19:54 ` [tarantool-patches] [PATCH v3 1/4] error: introduce error rebulding API Vladislav Shpilevoy 2018-05-16 17:06 ` [tarantool-patches] " Vladislav Shpilevoy 2018-05-15 19:54 ` [tarantool-patches] [PATCH v3 2/4] collation: split collation into core and box objects Vladislav Shpilevoy 2018-05-16 17:07 ` [tarantool-patches] " Vladislav Shpilevoy 2018-05-16 17:17 ` Konstantin Osipov 2018-05-16 17:19 ` Vladislav Shpilevoy 2018-05-17 19:23 ` Vladislav Shpilevoy 2018-05-15 19:54 ` [tarantool-patches] [PATCH v3 3/4] collation: introduce collation fingerprint Vladislav Shpilevoy 2018-05-17 19:24 ` Vladislav Shpilevoy [this message] 2018-05-15 19:54 ` [tarantool-patches] [PATCH v3 4/4] lua: introduce utf8 built-in globaly visible module Vladislav Shpilevoy
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=7b70d6b6-2fa0-7722-9e52-417508a1e329@tarantool.org \ --to=v.shpilevoy@tarantool.org \ --cc=kostja@tarantool.org \ --cc=tarantool-patches@freelists.org \ --subject='[tarantool-patches] Re: [PATCH v3 3/4] collation: introduce collation fingerprint' \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox