[tarantool-patches] Re: [PATCH v1 1/1] sql: improve vdbe_field_ref fetcher

Kirill Shcherbatov kshcherbatov at tarantool.org
Fri Aug 16 10:49:00 MSK 2019


> 1. 'scope'.
> 2. This is not an explanation, but rather a bit of water. With
> the same explanation you could test fields 1 and 2. Please, explain
> why exactly these numbers.
Got it.
================================================

Vdbe field ref is a dynamic index over tuple fields storing
offsets to each field and filling the offset array on demand.
It is highly used in SQL, because it strongly relies on fast and
repetitive access to any field, not only indexed.

There is an optimisation for the case when a requested field
fieldno is indexed, and the tuple itself stores offset to the
field in its own small field map, used by indexes. vdbe_field_ref
then uses that map to retrieve offset value without decoding
anything. But when SQL requests any field > fieldno, then
vdbe_field_ref decodes the tuple from the beginning in the worst
case, even if some smaller fieldno was accessed previously — it
could instead start decoding from that field's known offset.

An updated vdbe_field_ref fetcher class uses a bitmask of
initialized slots to use pre-calculated offsets when possible.
This speeds up SQL in some corner cases and doesn't damage
performance in general scenarios.

Closes #4267
---
 src/box/column_mask.h  |  30 +++++++++++
 src/box/sql.h          |   9 ++--
 src/box/sql.c          |   5 +-
 src/box/sql/vdbe.c     | 116 +++++++++++++++++++++++------------------
 test/sql/misc.result   |  92 ++++++++++++++++++++++++++++++++
 test/sql/misc.test.lua |  38 ++++++++++++++
 6 files changed, 236 insertions(+), 54 deletions(-)

diff --git a/src/box/column_mask.h b/src/box/column_mask.h
index 14da841f1..5fb0e1fe0 100644
--- a/src/box/column_mask.h
+++ b/src/box/column_mask.h
@@ -123,4 +123,34 @@ column_mask_fieldno_is_set(uint64_t column_mask, uint32_t fieldno)
 	return (column_mask & mask) != 0;
 }
 
+/**
+ * Set a bit with given index @a bitno in a given @a bitmask.
+ * Do nothing when @a bitno is greater than or equal to 64.
+ * @param bitmask Bitmask to update.
+ * @param bitno Bit number to set (must be 0-based).
+ */
+static inline void
+bitmask64_set_bit(uint64_t *bitmask, uint32_t bitno)
+{
+	if (bitno >= 64)
+		return;
+	*bitmask |= ((uint64_t)1 << bitno);
+}
+
+/**
+ * Test a bit in the bitmask corresponding to @a bitno.
+ * @param bitmask Bitmask to test.
+ * @param bitno Bit number to test (must be 0-based).
+ * @return Whether given bit is set in a given bitmask.
+ *         When bitno is greater than or equal to the bitmask
+ *         size (64) returns false.
+ */
+static inline bool
+bitmask64_is_bit_set(uint64_t bitmask, uint32_t bitno)
+{
+	if (bitno >= 64)
+		return false;
+	return (bitmask & ((uint64_t)1 << bitno)) != 0;
+}
+
 #endif
diff --git a/src/box/sql.h b/src/box/sql.h
index 9ccecf28c..7644051b4 100644
--- a/src/box/sql.h
+++ b/src/box/sql.h
@@ -372,10 +372,13 @@ struct vdbe_field_ref {
 	/** Count of fields in tuple. */
 	uint32_t field_count;
 	/**
-	 * Index of the rightmost initialized slot in slots
-	 * array.
+	 * Bitmask of initialized slots. The fieldno == 0 slot
+	 * must be initialized in the vdbe_field_ref constructor.
+	 * This bitmask allows looking up the nearest
+	 * initialized slot for a given fieldno to perform as
+	 * little extra tuple decoding as possible.
 	 */
-	uint32_t rightmost_slot;
+	uint64_t slot_bitmask;
 	/**
 	 * Array of offsets of tuple fields.
 	 * Only values <= rightmost_slot are valid.
diff --git a/src/box/sql.c b/src/box/sql.c
index 0ab3a506f..f1df55571 100644
--- a/src/box/sql.c
+++ b/src/box/sql.c
@@ -1245,7 +1245,10 @@ vdbe_field_ref_create(struct vdbe_field_ref *field_ref, struct tuple *tuple,
 	const char *field0 = data;
 	field_ref->field_count = mp_decode_array((const char **) &field0);
 	field_ref->slots[0] = (uint32_t)(field0 - data);
-	field_ref->rightmost_slot = 0;
+	memset(&field_ref->slots[1], 0,
+	       field_ref->field_count * sizeof(field_ref->slots[0]));
+	field_ref->slot_bitmask = 0;
+	bitmask64_set_bit(&field_ref->slot_bitmask, 0);
 }
 
 void
diff --git a/src/box/sql/vdbe.c b/src/box/sql/vdbe.c
index e096e1f65..21b585364 100644
--- a/src/box/sql/vdbe.c
+++ b/src/box/sql/vdbe.c
@@ -606,27 +606,74 @@ vdbe_field_ref_fetch_field(struct vdbe_field_ref *field_ref, uint32_t fieldno)
 }
 
 /**
- * Try to get a current tuple field using its field map.
+ * Find the left closest field for a given fieldno in field_ref's
+ * slot_bitmask. The fieldno is expected to be greater than 0.
+ * @param field_ref The vdbe_field_ref instance to use.
+ * @param fieldno Number of a field to find the nearest left
+ *        neighbor of.
+ * @retval The index of the closest smaller fieldno that is
+ *         initialized in slot_bitmask (may be 0, since the
+ *         fieldno == 0 slot is always initialized).
+ */
+static inline uint32_t
+vdbe_field_ref_closest_slotno(struct vdbe_field_ref *field_ref,
+			      uint32_t fieldno)
+{
+	uint64_t slot_bitmask = field_ref->slot_bitmask;
+	assert(slot_bitmask != 0 && fieldno > 0);
+	uint64_t le_mask = fieldno < 64 ? slot_bitmask & ((1LLU << fieldno) - 1)
+			   : slot_bitmask;
+	assert(bit_clz_u64(le_mask) < 64);
+	return 64 - bit_clz_u64(le_mask) - 1;
+}
+
+/**
+ * Get a tuple's field using field_ref's slot_bitmask, and tuple's
+ * field_map when possible. Required field must be present in
+ * tuple.
  * @param field_ref The vdbe_field_ref instance to use.
  * @param fieldno Number of a field to get.
- * @param[out] field_size Result field size.
  * @retval not NULL MessagePack field.
- * @retval NULL Can not use field_map - it does not contain
- *              offset to @a fieldno.
  */
-static const void *
-vdbe_field_ref_fast_fetch(struct vdbe_field_ref *field_ref, uint32_t fieldno,
-			  uint32_t *field_size)
+static const char *
+vdbe_field_ref_fetch_data(struct vdbe_field_ref *field_ref, uint32_t fieldno)
 {
-	const struct tuple_field *tf =
-		vdbe_field_ref_fetch_field(field_ref, fieldno);
-	if (tf == NULL || tf->offset_slot == TUPLE_OFFSET_SLOT_NIL)
-		return NULL;
-	const char *field = tuple_field(field_ref->tuple, fieldno);
-	const char *end = field;
-	mp_next(&end);
-	*field_size = end - field;
-	return field;
+	if (field_ref->slots[fieldno] != 0)
+		return field_ref->data + field_ref->slots[fieldno];
+
+	const char *field_begin;
+	const struct tuple_field *field = vdbe_field_ref_fetch_field(field_ref,
+								     fieldno);
+	if (field != NULL && field->offset_slot != TUPLE_OFFSET_SLOT_NIL) {
+		field_begin = tuple_field(field_ref->tuple, fieldno);
+	} else {
+		uint32_t prev = vdbe_field_ref_closest_slotno(field_ref,
+							      fieldno);
+		if (fieldno >= 64) {
+			/*
+			 * There could be initialized slots
+			 * that didn't fit in the bitmask.
+			 * Try to find the biggest initialized
+			 * slot.
+			 */
+			for (uint32_t it = fieldno - 1; it > prev; it--) {
+				if (field_ref->slots[it] == 0)
+					continue;
+				prev = it;
+				break;
+			}
+		}
+		field_begin = field_ref->data + field_ref->slots[prev];
+		for (prev++; prev < fieldno; prev++) {
+			mp_next(&field_begin);
+			field_ref->slots[prev] =
+				(uint32_t)(field_begin - field_ref->data);
+			bitmask64_set_bit(&field_ref->slot_bitmask, prev);
+		}
+		mp_next(&field_begin);
+	}
+	field_ref->slots[fieldno] = (uint32_t)(field_begin - field_ref->data);
+	bitmask64_set_bit(&field_ref->slot_bitmask, fieldno);
+	return field_begin;
 }
 
 static inline enum field_type
@@ -652,45 +699,13 @@ static int
 vdbe_field_ref_fetch(struct vdbe_field_ref *field_ref, uint32_t fieldno,
 		     struct Mem *dest_mem)
 {
-	uint32_t *slots = field_ref->slots;
 	if (fieldno >= field_ref->field_count) {
 		UPDATE_MAX_BLOBSIZE(dest_mem);
 		return 0;
 	}
-
-	const char *data;
-	if (field_ref->rightmost_slot <= fieldno) {
-		uint32_t field_sz;
-		data = vdbe_field_ref_fast_fetch(field_ref, fieldno,
-						 &field_sz);
-		if (data != NULL) {
-			/*
-			 * Special case for tarantool spaces: for
-			 * indexed fields a tuple field map can be
-			 * used. Else there is no sense in
-			 * tuple_field usage, because it makes
-			 * foreach field { mp_next(); } to find
-			 * a field. In such a case sql is
-			 * better - it saves offsets to all fields
-			 * visited in mp_next() cycle.
-			 */
-			uint32_t offset = (uint32_t)(data - field_ref->data);
-			slots[fieldno] = offset;
-			slots[fieldno + 1] = offset + field_sz;
-		} else {
-			uint32_t i = field_ref->rightmost_slot;
-			data = field_ref->data + slots[i];
-			do {
-				mp_next(&data);
-				slots[++i] = (uint32_t)(data - field_ref->data);
-			} while (i <= fieldno);
-			field_ref->rightmost_slot = i;
-		}
-	}
-
 	assert(sqlVdbeCheckMemInvariants(dest_mem) != 0);
+	const char *data = vdbe_field_ref_fetch_data(field_ref, fieldno);
 	uint32_t dummy;
-	data = field_ref->data + slots[fieldno];
 	if (vdbe_decode_msgpack_into_mem(data, dest_mem, &dummy) != 0)
 		return -1;
 
@@ -699,8 +714,9 @@ vdbe_field_ref_fetch(struct vdbe_field_ref *field_ref, uint32_t fieldno,
 	 * Wrap it in a blob verbatim.
 	 */
 	if (dest_mem->flags == 0) {
-		dest_mem->n = slots[fieldno + 1] - slots[fieldno];
 		dest_mem->z = (char *) data;
+		dest_mem->n = vdbe_field_ref_fetch_data(field_ref,
+							fieldno + 1) - data;
 		dest_mem->flags = MEM_Blob | MEM_Ephem | MEM_Subtype;
 		dest_mem->subtype = SQL_SUBTYPE_MSGPACK;
 	}
diff --git a/test/sql/misc.result b/test/sql/misc.result
index 2591d9d75..545be15f6 100644
--- a/test/sql/misc.result
+++ b/test/sql/misc.result
@@ -133,3 +133,95 @@ box.execute('select * from "s"')
 s:drop()
 ---
 ...
+--
+-- gh-4267: Full power of vdbe_field_ref
+-- Tarantool's SQL internally stores data offset for all acceded
+-- fields. It also keeps a bitmask of size 64 with all initialized
+-- slots in actual state to find the nearest left field really
+-- fast and parse tuple from that position. For fieldno >= 64
+-- bitmask is not applicable, so it scans data offsets area in
+-- a cycle.
+--
+-- The test below covers a case when this optimisation doesn't
+-- work and the second lookup require parsing tuple from
+-- beginning.
+---
+format = {}
+---
+...
+t = {}
+---
+...
+for i = 1, 70 do                                                \
+        format[i] = {name = 'FIELD'..i, type = 'unsigned'}      \
+        t[i] = i                                                \
+end
+---
+...
+s = box.schema.create_space('TEST', {format = format})
+---
+...
+pk = s:create_index('pk', {parts = {70}})
+---
+...
+s:insert(t)
+---
+- [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+  23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+  43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+  63, 64, 65, 66, 67, 68, 69, 70]
+...
+box.execute('SELECT field70, field64 FROM test')
+---
+- metadata:
+  - name: FIELD70
+    type: unsigned
+  - name: FIELD64
+    type: unsigned
+  rows:
+  - [70, 64]
+...
+box.space.TEST:drop()
+---
+...
+-- In the case below described optimization works fine.
+format = {}
+---
+...
+t = {}
+---
+...
+for i = 1, 70 do                                                \
+        format[i] = {name = 'FIELD'..i, type = 'unsigned'}      \
+        t[i] = i                                                \
+end
+---
+...
+s = box.schema.create_space('TEST', {format = format})
+---
+...
+pk = s:create_index('pk', {parts = {66}})
+---
+...
+s:insert(t)
+---
+- [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+  23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+  43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+  63, 64, 65, 66, 67, 68, 69, 70]
+...
+box.execute('SELECT field66, field68, field70 FROM test')
+---
+- metadata:
+  - name: FIELD66
+    type: unsigned
+  - name: FIELD68
+    type: unsigned
+  - name: FIELD70
+    type: unsigned
+  rows:
+  - [66, 68, 70]
+...
+box.space.TEST:drop()
+---
+...
diff --git a/test/sql/misc.test.lua b/test/sql/misc.test.lua
index fdc19f3ac..8067be213 100644
--- a/test/sql/misc.test.lua
+++ b/test/sql/misc.test.lua
@@ -35,3 +35,41 @@ s = box.schema.space.create('s',{format=format, temporary=true})
 i = s:create_index('i')
 box.execute('select * from "s"')
 s:drop()
+
+--
+-- gh-4267: Full power of vdbe_field_ref
+-- Tarantool's SQL internally stores data offset for all acceded
+-- fields. It also keeps a bitmask of size 64 with all initialized
+-- slots in actual state to find the nearest left field really
+-- fast and parse tuple from that position. For fieldno >= 64
+-- bitmask is not applicable, so it scans data offsets area in
+-- a cycle.
+--
+-- The test below covers a case when this optimisation doesn't
+-- work and the second lookup require parsing tuple from
+-- beginning.
+---
+format = {}
+t = {}
+for i = 1, 70 do                                                \
+        format[i] = {name = 'FIELD'..i, type = 'unsigned'}      \
+        t[i] = i                                                \
+end
+s = box.schema.create_space('TEST', {format = format})
+pk = s:create_index('pk', {parts = {70}})
+s:insert(t)
+box.execute('SELECT field70, field64 FROM test')
+box.space.TEST:drop()
+
+-- In the case below described optimization works fine.
+format = {}
+t = {}
+for i = 1, 70 do                                                \
+        format[i] = {name = 'FIELD'..i, type = 'unsigned'}      \
+        t[i] = i                                                \
+end
+s = box.schema.create_space('TEST', {format = format})
+pk = s:create_index('pk', {parts = {66}})
+s:insert(t)
+box.execute('SELECT field66, field68, field70 FROM test')
+box.space.TEST:drop()
-- 
2.22.1






More information about the Tarantool-patches mailing list