[PATCH v2 4/4] box: introduce tuple compare hint for string index

Kirill Shcherbatov kshcherbatov at tarantool.org
Wed Feb 13 12:32:28 MSK 2019


From: Alexandr Lyapunov <aleks at tarantool.org>

Implement functions for retrieving tuple hints for a particular
key_def. Hint is an integer that can be used for tuple comparison
optimization: if a hint of one tuple is less than a hint of another
then the first tuple is definitely less than the second; only if
hints are equal tuple_compare must be called for getting comparison
result.

Hints are calculated using only the first part of key_def.

Hints are only useful when:
 * they are precalculated and stored along with the tuple;
calculation is not cheap (cheaper than tuple_compare btw) but
precalculated hints allow to compare tuples without even fetching
tuple data.
 * first part of key_def is 'string'(for now)
 * since hint is calculated using only the first part of key_def
(and only first several characters if it is a string) this part
must be different with high probability for every tuple pair.

Part of #3961
---
 src/box/key_def.c         |   1 +
 src/box/key_def.h         |  44 ++++++++++++++
 src/box/memtx_tree_decl.c | 123 ++++++++++++++++++++++++++++++++++++++
 src/box/tuple_compare.cc  | 101 +++++++++++++++++++++++++++++++
 src/box/tuple_compare.h   |   7 +++
 src/coll.c                |  45 ++++++++++++++
 src/coll.h                |   4 ++
 7 files changed, 325 insertions(+)

diff --git a/src/box/key_def.c b/src/box/key_def.c
index 9411ade39..ceca1ebee 100644
--- a/src/box/key_def.c
+++ b/src/box/key_def.c
@@ -135,6 +135,7 @@ key_def_set_cmp(struct key_def *def)
 	def->tuple_compare_with_key = tuple_compare_with_key_create(def);
 	tuple_hash_func_set(def);
 	tuple_extract_key_set(def);
+	tuple_hint_set(def);
 }
 
 static void
diff --git a/src/box/key_def.h b/src/box/key_def.h
index bf3e63b6d..a2a0bbecb 100644
--- a/src/box/key_def.h
+++ b/src/box/key_def.h
@@ -153,6 +153,13 @@ typedef uint32_t (*tuple_hash_t)(const struct tuple *tuple,
 typedef uint32_t (*key_hash_t)(const char *key,
 				struct key_def *key_def);
 
+/** @copydoc tuple_hint() */
+typedef uint64_t (*tuple_hint_t)(const struct tuple *tuple,
+				 struct key_def *key_def);
+
+/** @copydoc key_hint() */
+typedef uint64_t (*key_hint_t)(const char *key, struct key_def *key_def);
+
 /* Definition of a multipart key. */
 struct key_def {
 	/** @see tuple_compare() */
@@ -167,6 +174,10 @@ struct key_def {
 	tuple_hash_t tuple_hash;
 	/** @see key_hash() */
 	key_hash_t key_hash;
+	/** @see tuple_hint() */
+	tuple_hint_t tuple_hint;
+	/** @see key_hint() */
+	key_hint_t key_hint;
 	/**
 	 * Minimal part count which always is unique. For example,
 	 * if a secondary index is unique, then
@@ -551,6 +562,39 @@ tuple_compare_with_key(const struct tuple *tuple, const char *key,
 	return key_def->tuple_compare_with_key(tuple, key, part_count, key_def);
 }
 
+/**
+ * Get a comparison hint of a tuple.
+ * Hint is such a function h(tuple) in terms of particular key_def that
+ * has follows rules:
+ * if h(t1) < h(t2) then t1 < t2;
+ * if h(t1) > h(t2) then t1 > t2;
+ * if t1 == t2 then h(t1) == h(t2);
+ * These rules means that instead of direct tuple vs tuple (or tuple vs key)
+ * comparison one may compare theirs hints first; and only if theirs hints
+ * are equal compare the tuples themselves.
+ * @param tuple - tuple to get hint of.
+ * @param key_def - key_def that defines which comparison is used.
+ * @return the hint.
+ */
+static inline uint64_t
+tuple_hint(const struct tuple *tuple, struct key_def *key_def)
+{
+	return key_def->tuple_hint(tuple, key_def);
+}
+
+/**
+ * Get a comparison hint of a key.
+ * @See tuple_hint for hint term definition.
+ * @param key - key to get hint of.
+ * @param key_def - key_def that defines which comparison is used.
+ * @return the hint.
+ */
+static inline uint64_t
+key_hint(const char *key, struct key_def *key_def)
+{
+	return key_def->key_hint(key, key_def);
+}
+
 #if defined(__cplusplus)
 } /* extern "C" */
 #endif /* defined(__cplusplus) */
diff --git a/src/box/memtx_tree_decl.c b/src/box/memtx_tree_decl.c
index 084a0594f..264241832 100644
--- a/src/box/memtx_tree_decl.c
+++ b/src/box/memtx_tree_decl.c
@@ -77,8 +77,131 @@ struct memtx_tuple_tree_key_data {
 
 /* }}} */
 
+/* {{{ Memtx hinted and hint_only tree class. *******************/
+
+/**
+ * Struct that is used as a key in BPS tree definition in
+ * memtx_hint_only_tree and memtx_hinted_tree.
+*/
+struct memtx_hinted_tree_key_data {
+	/** Sequence of msgpacked search fields. */
+	const char *key;
+	/** Number of msgpacked search fields. */
+	uint32_t part_count;
+	/**
+	 * Compare hint. Is calculated automatically on 'set'
+	 * operation with memtx_hinted_tree_key_data_set().
+	 */
+	uint64_t hint;
+};
+
+/**
+ * Struct that is used as a key in BPS tree definition in
+ * memtx_hint_only_tree and memtx_hinted_tree.
+ */
+struct memtx_hinted_tree_data {
+	/** Tuple this node is representing. */
+	struct tuple *tuple;
+	/**
+	 * Compare hint. Is calculated automatically on 'set'
+	 * operation with memtx_hinted_tree_data_set().
+	 */
+	uint64_t hint;
+};
+
+/**
+ * Compare memtx_hinted_tree records.
+ */
+static int
+memtx_hinted_tree_data_cmp(struct memtx_hinted_tree_data *a,
+			   struct memtx_hinted_tree_data *b,
+			   struct key_def *key_def)
+{
+	if (a->hint != b->hint)
+		return a->hint < b->hint ? -1 : 1;
+	return tuple_compare(a->tuple, b->tuple, key_def);
+}
+
+/**
+ * Compare memtx_hinted_tree record with key.
+ */
+static int
+memtx_hinted_tree_data_cmp_with_key(struct memtx_hinted_tree_data *a,
+				    struct memtx_hinted_tree_key_data *key,
+				    struct key_def *key_def)
+{
+	if (a->hint != key->hint)
+		return a->hint < key->hint ? -1 : 1;
+	return tuple_compare_with_key(a->tuple, key->key, key->part_count,
+				      key_def);
+}
+
+/**
+ * Initialize memtx_hinted_tree or memtx_hint_only_tree record
+ * with tuple and recalculate internal hint field.
+ */
+static void
+memtx_hinted_tree_data_set(struct memtx_hinted_tree_data *data,
+			   struct tuple *tuple, struct key_def *key_def)
+{
+	data->tuple = tuple;
+	data->hint = tuple != NULL ? tuple_hint(tuple, key_def) : 0;
+}
+
+/**
+ * Initialize memtx_hinted_tree or memtx_hint_only_tree key with
+ * key raw and part count and recalculate internal hint field.
+ */
+static void
+memtx_hinted_tree_key_data_set(struct memtx_hinted_tree_key_data *key_data,
+			       const char *key, uint32_t part_count,
+			       struct key_def *key_def)
+{
+	key_data->key = key;
+	key_data->part_count = part_count;
+	key_data->hint = key != NULL && part_count > 0 ?
+			 key_hint(key, key_def) : 0;
+}
+
+#define MEMTX_TREE_NAME memtx_hinted_tree
+#define memtx_tree_elem struct memtx_hinted_tree_data
+#define memtx_tree_key struct memtx_hinted_tree_key_data
+#define MEMTX_TREE_ELEM_EQUAL(elem_a_ptr, elem_b_ptr)				\
+	((elem_a_ptr)->tuple == (elem_b_ptr)->tuple)
+#define MEMTX_TREE_ELEM_CMP(elem_a_ptr, elem_b_ptr, key_def)			\
+	memtx_hinted_tree_data_cmp(elem_a_ptr, elem_b_ptr, key_def)
+#define MEMTX_TREE_ELEM_WITH_KEY_CMP(elem_ptr, key_ptr, key_def)		\
+	memtx_hinted_tree_data_cmp_with_key(elem_ptr, key_ptr, key_def)
+#define MEMTX_TREE_ELEM_SET(elem_ptr, tuple, key_def)				\
+	memtx_hinted_tree_data_set(elem_ptr, tuple, key_def)
+#define MEMTX_TREE_KEY_SET(key_ptr, key_val, part_count_val, key_def)		\
+	memtx_hinted_tree_key_data_set(key_ptr, key_val, part_count_val,	\
+				       key_def)
+#define MEMTX_TREE_ELEM_GET(elem_ptr) ((elem_ptr)->tuple)
+#define MEMTX_TREE_KEY_GET(key_ptr, part_count_ptr)				\
+	({*part_count_ptr = (key_ptr)->part_count; (key_ptr)->key;})
+
+#include "memtx_tree_impl.h"
+
+#undef memtx_tree_key
+#undef memtx_tree_elem
+#undef MEMTX_TREE_KEY_GET
+#undef MEMTX_TREE_ELEM_GET
+#undef MEMTX_TREE_KEY_SET
+#undef MEMTX_TREE_ELEM_SET
+#undef MEMTX_TREE_ELEM_WITH_KEY_CMP
+#undef MEMTX_TREE_ELEM_CMP
+#undef MEMTX_TREE_ELEM_EQUAL
+#undef memtx_tree_key
+#undef memtx_tree_elem
+#undef MEMTX_TREE_NAME
+
+/* }}} */
+
 struct index *
 memtx_tree_index_new(struct memtx_engine *memtx, struct index_def *def)
 {
+	if (def->cmp_def->parts->type == FIELD_TYPE_STRING)
+		return memtx_hinted_tree_index_new(memtx, def);
 	return memtx_tuple_tree_index_new(memtx, def);
 }
diff --git a/src/box/tuple_compare.cc b/src/box/tuple_compare.cc
index ded802c7d..744901d70 100644
--- a/src/box/tuple_compare.cc
+++ b/src/box/tuple_compare.cc
@@ -1321,4 +1321,105 @@ tuple_compare_with_key_create(const struct key_def *def)
 	       compare_with_key_slowpath_funcs[cmp_func_idx];
 }
 
+static uint64_t
+key_hint_default(const char *key, struct key_def *key_def)
+{
+	(void)key;
+	(void)key_def;
+	return 0;
+}
+
+static uint64_t
+tuple_hint_default(const struct tuple *tuple, struct key_def *key_def)
+{
+	(void)tuple;
+	(void)key_def;
+	return 0;
+}
+
+static uint64_t
+key_hint_string(const char *key, struct key_def *key_def)
+{
+	assert(key != NULL);
+	(void)key_def;
+	assert(key_def->parts->type == FIELD_TYPE_STRING);
+	assert(mp_typeof(*key) == MP_STR);
+	uint32_t len;
+	const unsigned char *str =
+		(const unsigned char *)mp_decode_str(&key, &len);
+	uint64_t result = 0;
+	/*
+	 * Map the sequence of characters in the hint's bytes in
+	 * the reverse order: the first character will be mapped
+	 * in the highest byte of the number, etc.
+	 * One additional bit encodes the presence of the next
+	 * sign.
+	 * 0bCHARBYTEU CHARBYTEU CHARBYTEU ... CHARBYTE0
+	 * The sequence constructed in this way provides a
+	 * comparison transitivity of strings no longer than 7
+	 * bytes.
+	 */
+	uint32_t process_len = MIN(len, sizeof(char) - 1);
+	for (uint32_t i = 0; i < process_len; i++) {
+		result <<= CHAR_BIT + 1;
+		result |= UCHAR_MAX + 1;
+		result |= str[i];
+	}
+	/*
+	 * If the length of the string is less than the 64-bit
+	 * hint can accommodate, the insignificant positions are
+	 * filled with 0.
+	 */
+	result <<= (CHAR_BIT + 1) * (sizeof(char) - 1 - process_len);
+	return result;
+}
+
+static uint64_t
+tuple_hint_string(const struct tuple *tuple, struct key_def *key_def)
+{
+	const char *field = tuple_field_by_part(tuple, key_def->parts);
+	return field != NULL ? key_hint_string(field, key_def) : 0;
+}
+
+static uint64_t
+key_hint_string_coll(const char *key, struct key_def *key_def)
+{
+	assert(key != NULL);
+	assert(key_def->parts->type == FIELD_TYPE_STRING);
+	assert(mp_typeof(*key) == MP_STR);
+	assert(key_def->parts->coll != NULL);
+	uint32_t len;
+	const char *str = mp_decode_str(&key, &len);
+	return key_def->parts->coll->hint(str, len, key_def->parts->coll);
+}
+
+static uint64_t
+tuple_hint_string_coll(const struct tuple *tuple, struct key_def *key_def)
+{
+	const char *field = tuple_field_by_part(tuple, key_def->parts);
+	return field != NULL ? key_hint_string_coll(field, key_def) : 0;
+}
+
+void
+tuple_hint_set(struct key_def *def)
+{
+	def->key_hint = key_hint_default;
+	def->tuple_hint = tuple_hint_default;
+	if (key_part_is_nullable(def->parts))
+		return;
+	if (def->parts->type == FIELD_TYPE_STRING && def->parts->coll != NULL) {
+		def->key_hint = key_hint_string_coll;
+		def->tuple_hint = tuple_hint_string_coll;
+		return;
+	}
+	switch (def->parts->type) {
+	case FIELD_TYPE_STRING:
+		def->key_hint = key_hint_string;
+		def->tuple_hint = tuple_hint_string;
+		break;
+	default:
+		break;
+	};
+}
+
 /* }}} tuple_compare_with_key */
diff --git a/src/box/tuple_compare.h b/src/box/tuple_compare.h
index e3a63204f..405460086 100644
--- a/src/box/tuple_compare.h
+++ b/src/box/tuple_compare.h
@@ -67,6 +67,13 @@ tuple_compare_create(const struct key_def *key_def);
 tuple_compare_with_key_t
 tuple_compare_with_key_create(const struct key_def *key_def);
 
+/**
+ * Initialize tuple_hint() and key_hint() functions for the key_def.
+ * @param key_def key definition to set up.
+ */
+void
+tuple_hint_set(struct key_def *key_def);
+
 #if defined(__cplusplus)
 } /* extern "C" */
 #endif /* defined(__cplusplus) */
diff --git a/src/coll.c b/src/coll.c
index 6d9c44dbf..5bb46a94f 100644
--- a/src/coll.c
+++ b/src/coll.c
@@ -33,6 +33,7 @@
 #include "third_party/PMurHash.h"
 #include "diag.h"
 #include "assoc.h"
+#include "limits.h"
 #include <unicode/ucol.h>
 #include <trivia/config.h>
 
@@ -133,6 +134,48 @@ coll_bin_hash(const char *s, size_t s_len, uint32_t *ph, uint32_t *pcarry,
 	return s_len;
 }
 
+/** Get a compare hint of a string using ICU collation. */
+static uint64_t
+coll_icu_hint(const char *s, size_t s_len, struct coll *coll)
+{
+	UCharIterator itr;
+	uiter_setUTF8(&itr, s, s_len);
+	uint8_t buf[7];
+	uint32_t state[2] = {0, 0};
+	UErrorCode status = U_ZERO_ERROR;
+	int32_t got = ucol_nextSortKeyPart(coll->collator, &itr, state, buf,
+					   sizeof(buf), &status);
+	assert(got >= 0 && got <= 7);
+	uint64_t result = 0;
+	for (int32_t i = 0; i < got; i++) {
+		result <<= 9;
+		result |= 0x100;
+		result |= buf[i];
+	}
+	result <<= 9 * (7 - got);
+	return result;
+}
+
+/**
+ * Get a compare hint of a string using BINARY collation.
+ * Works similar to key_hint_string, read it's comment for more
+ * details.
+ */
+static uint64_t
+coll_bin_hint(const char *s, size_t s_len, struct coll *coll)
+{
+	(void)coll;
+	uint64_t result = 0;
+	uint32_t process_len = MIN(s_len, sizeof(char) - 1);
+	for (uint32_t i = 0; i < process_len; i++) {
+		result <<= CHAR_BIT;
+		result |= UCHAR_MAX + 1;
+		result |= s[i];
+	}
+	result <<= (CHAR_BIT + 1) * (sizeof(char) - 1 - process_len);
+	return result;
+}
+
 /**
  * Set up ICU collator and init cmp and hash members of collation.
  * @param coll Collation to set up.
@@ -242,6 +285,7 @@ coll_icu_init_cmp(struct coll *coll, const struct coll_def *def)
 	}
 	coll->cmp = coll_icu_cmp;
 	coll->hash = coll_icu_hash;
+	coll->hint = coll_icu_hint;
 	return 0;
 }
 
@@ -332,6 +376,7 @@ coll_new(const struct coll_def *def)
 		coll->collator = NULL;
 		coll->cmp = coll_bin_cmp;
 		coll->hash = coll_bin_hash;
+		coll->hint = coll_bin_hint;
 		break;
 	default:
 		unreachable();
diff --git a/src/coll.h b/src/coll.h
index 9c725712a..53133dae3 100644
--- a/src/coll.h
+++ b/src/coll.h
@@ -47,6 +47,8 @@ typedef int (*coll_cmp_f)(const char *s, size_t s_len, const char *t,
 typedef uint32_t (*coll_hash_f)(const char *s, size_t s_len, uint32_t *ph,
 				uint32_t *pcarry, struct coll *coll);
 
+typedef uint64_t (*coll_hint_f)(const char *s, size_t s_len, struct coll *coll);
+
 struct UCollator;
 
 /**
@@ -61,6 +63,8 @@ struct coll {
 	/** String comparator. */
 	coll_cmp_f cmp;
 	coll_hash_f hash;
+	/** The pointer to routine calculating tuple hint. */
+	coll_hint_f hint;
 	/** Reference counter. */
 	int refs;
 	/**
-- 
2.20.1




More information about the Tarantool-patches mailing list