[tarantool-patches] Re: [PATCH v1 1/1] sql: improve vdbe_field_ref fetcher

Kirill Shcherbatov kshcherbatov at tarantool.org
Mon Aug 12 18:14:59 MSK 2019


> Hi! Thanks for the patch!
Hello, Vlad! Thank you for your review.

> 1. Please, describe here the problem in more details. Otherwise
> it is far from obvious what is wrong with the current version.
Done =)

>> +uint32_t
>> +vbde_field_ref_extra_sizeof(uint32_t max_field_count)
>> +{
>> +	assert(max_field_count > 0);
>> +	return (max_field_count) * sizeof(uint32_t) +
> 
> 2. Why do you need parentheses for max_field_count?

>> +	memset(field_ref->slot_bitmap, 0,
>> +	       bitmap_size(field_ref->field_count + 1));
> 
> 3. Why +1?
Outdated.

>> +/**
>> + * Calculate the size of vdbe_field_ref structure
>> + * that manages up to max_field_count fields.
> 
> 4. But it is not size of the whole structure. Only size
> of a tail memory block.
Outdated. I don't introduce this function now. (redundant diff)

>> +		curr = bit_iterator_next(&it)) {
> 
> 5. The alignment is broken.
Outdated

> 
> In addition, have you seen bit_iterator_init and bit_iterator_next?
> They are huge. They do so many actions, that perhaps it would be
> faster to decode MessagePack. Could you bench if it works really faster
> now? The most notable improvement should be for the case, when
> tuple fields are big complex objects (array, maps), and mp_next(&field)
> becomes a really expensive operation.
> 
> I understand, that it is hard to microbench this in Lua or C, so I
> propose to write a Lua bench, and to add time measurements right into
> the C code
> - before and after sql_prepare_and_execute;
> - before and after vdbe_field_ref_fetch.
> 
> In Lua we will see ~ overall improvement (I hope),
> around sql_prepare_and_execute we will see results not affected by
> Lua, and around vdbe_field_ref_fetch we will see the true impact. 

I've carried three interesting experiments:
1) a very special corner case:
s = box.schema.create_space('test')
s:format({{'id', 'integer'}, {'crap1', 'array'}, {'crap2', 'array'}, {'crap3', 'array'},
          {'crap4', 'array'}, {'crap5', 'array'}, {'crap6', 'array'}, {'crap7', 'array'},
          {'crap8', 'array'}, {'crap9', 'array'}, {'crap10', 'array'}, {'crap11', 'array'},
          {'crap12', 'array'}, {'crap13', 'array'}, {'crap14', 'array'}, {'crap15', 'array'},
          {'crap16', 'array'}, {'crap17', 'array'}, {'crap18', 'array'}, {'crap19', 'array'},
          {'crap20', 'array'}, {'crap21', 'array'}, {'crap22', 'array'}, {'crap23', 'array'},
          {'crap24', 'array'}, {'crap25', 'array'}, {'crap26', 'array'}, {'crap27', 'array'},
          {'crap28', 'array'}, {'crap29', 'array'}, {'crap30', 'array'},
          {'num', 'integer'}, {'crap31', 'array'},  {'word', 'string'}})
_ = s:create_index('pk')
_ = s:create_index('sk', {parts = {32, 'integer'}})
t = {1, {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        666,
        {{1, 2, 3, 4, 5}, {hello={1, 2, 3}, world={2, 3, 4}, yet={3, 4, 5}, another={4, 5, 6}, word={5, 6, 7}}},
        "hey"}
s:insert(t)
box.execute("SELECT \"num\", \"word\" from \"test\";")

I've added 
    clock_t start, end;
    start = clock();
    /*  ...retrieve field code */
    end = clock();
    printf(">> %ld\n", (end - start));

code to measure performance of two sequential  calls: for 666 field and for "hey" field

Original: field 666 - 3 clock_t_diff; "hey" - 6 clock_t_diff
Patched: field 666 - 3 clock_t_diff; "hey" - 2 clock_t_diff

So as you see, this approach works.

2) I've instrumented the same peace of code, but I've also defined a few
static global variables, to aggregate statistics and portionally flush  it in
a file (you need to run a single test-runner thread):
    clock_t start, end;
    double cpu_time_used;
    start = clock();
    /* ...retrieve field code */
    end = clock();
    sum += (end - start);
    count++;
    if (count == 100000) {
	FILE *f = fopen("results.txt", "a+");
	assert(f != NULL);
	fprintf(f, "%Lf\n", ((long double)sum)/count);
	fflush(f);
	sum = 0;
	count = 0;
	fclose(f);
    }

I then averaged the results from this file.

I should say that modified code a bit faster, about 1.25%

> 6. I guess it is not a fast path anymore, because you moved all the paths
> here, including the case when fieldno is not in the format. The function
> should be renamed to something appropriate, please.
Shure.

> 7. The blocks above and below are basically the same. Please, move one of them to
> a function, and use it here.
Outdated.

===================================================

Vdbe field ref is a dynamic index over tuple fields storing
offsets to each field and filling the offset array on demand.
It is highly used in SQL, because it strongly relies on fast and
repetitive access to any field, not only indexed.

There is an optimisation for the case when a requested field
fieldno is indexed, and the tuple itself stores offset to the
field in its own small field map, used by indexes. vdbe_field_ref
then uses that map to retrieve offset value without decoding
anything. But when SQL requests any field > fieldno, then
vdbe_field_ref decodes the tuple from the beginning in the worst
case. Even having previously accessed fieldno. But it could
start decoding from the latter.

An updated vdbe_field_ref fetcher class use a bitmask of an
initialized slots to use pre-calculated offsets when possible.
This speed-ups SQL in some corner case and doesn't damage
performance in general scenarios.

Closes #4267
---
Branch: http://github.com/tarantool/tarantool/tree/kshch/gh-4267-full-power-of-vdbe-field-ref
Issue: https://github.com/tarantool/tarantool/issues/4267

 src/box/sql.h      |  10 ++--
 src/box/sql.c      |   3 +-
 src/box/sql/vdbe.c | 111 +++++++++++++++++++++++++--------------------
 3 files changed, 70 insertions(+), 54 deletions(-)

diff --git a/src/box/sql.h b/src/box/sql.h
index 9ccecf28c..ec90d9099 100644
--- a/src/box/sql.h
+++ b/src/box/sql.h
@@ -372,10 +372,14 @@ struct vdbe_field_ref {
 	/** Count of fields in tuple. */
 	uint32_t field_count;
 	/**
-	 * Index of the rightmost initialized slot in slots
-	 * array.
+	 * Bitmask of initialized slots. The fieldno == 0 slot
+	 * must be initially initialized in vdbe_field_ref
+	 * constructor.
+	 * This bitmap allows to lookup for a nearest initialized
+	 * slot for a given fieldno to perform as less extra
+	 * tuple decoding, as possible.
 	 */
-	uint32_t rightmost_slot;
+	uint64_t slot_bitmask;
 	/**
 	 * Array of offsets of tuple fields.
 	 * Only values <= rightmost_slot are valid.
diff --git a/src/box/sql.c b/src/box/sql.c
index 0ab3a506f..94f468c28 100644
--- a/src/box/sql.c
+++ b/src/box/sql.c
@@ -1245,7 +1245,8 @@ vdbe_field_ref_create(struct vdbe_field_ref *field_ref, struct tuple *tuple,
 	const char *field0 = data;
 	field_ref->field_count = mp_decode_array((const char **) &field0);
 	field_ref->slots[0] = (uint32_t)(field0 - data);
-	field_ref->rightmost_slot = 0;
+	field_ref->slot_bitmask = 0;
+	column_mask_set_fieldno(&field_ref->slot_bitmask, 0);
 }
 
 void
diff --git a/src/box/sql/vdbe.c b/src/box/sql/vdbe.c
index e096e1f65..d0ff420c1 100644
--- a/src/box/sql/vdbe.c
+++ b/src/box/sql/vdbe.c
@@ -606,27 +606,69 @@ vdbe_field_ref_fetch_field(struct vdbe_field_ref *field_ref, uint32_t fieldno)
 }
 
 /**
- * Try to get a current tuple field using its field map.
+ * Find the closest field for given fieldno in field_ref's
+ * slot_bitmask. The fieldno is expected to be greater than 1.
+ * @param field_ref The vdbe_field_ref instance to use.
+ * @pram fieldno Number of a field to find nearest neighbor.
+ * @retval >0 an index of the closest lesser fieldno
+ *            initialized in slot_bitmask.
+ */
+static uint32_t
+vdbe_field_ref_closest_slotno(struct vdbe_field_ref *field_ref,
+			      uint32_t fieldno)
+{
+	uint64_t slot_bitmask = field_ref->slot_bitmask;
+	assert(slot_bitmask != 0 && fieldno > 0);
+	uint64_t le_mask = fieldno < 64 ?
+			   slot_bitmask & ((1LLU << fieldno) - 1) : slot_bitmask;
+	assert(bit_clz_u64(le_mask) < 64);
+	return 64 - bit_clz_u64(le_mask) - 1;
+}
+
+/**
+ * Get a tuple's field using field_ref's slot_bitmask, and tuple's
+ * field_map when possible. Required field must present in tuple.
  * @param field_ref The vdbe_field_ref instance to use.
  * @param fieldno Number of a field to get.
- * @param[out] field_size Result field size.
  * @retval not NULL MessagePack field.
- * @retval NULL Can not use field_map - it does not contain
- *              offset to @a fieldno.
  */
-static const void *
-vdbe_field_ref_fast_fetch(struct vdbe_field_ref *field_ref, uint32_t fieldno,
-			  uint32_t *field_size)
+static const char *
+vdbe_field_ref_fetch_data(struct vdbe_field_ref *field_ref, uint32_t fieldno)
 {
-	const struct tuple_field *tf =
-		vdbe_field_ref_fetch_field(field_ref, fieldno);
-	if (tf == NULL || tf->offset_slot == TUPLE_OFFSET_SLOT_NIL)
-		return NULL;
-	const char *field = tuple_field(field_ref->tuple, fieldno);
-	const char *end = field;
-	mp_next(&end);
-	*field_size = end - field;
-	return field;
+	/*
+	 * The column_mask_fieldno API works with a "smart" masks,
+	 * but actually we don't need smart masks here so we
+	 * should manually test that fieldno is lesser than
+	 * bitmask's size.
+	 */
+	if (fieldno < 64 &&
+	    column_mask_fieldno_is_set(field_ref->slot_bitmask, fieldno))
+		return field_ref->data + field_ref->slots[fieldno];
+
+	const char *field_begin;
+	const struct tuple_field *field = vdbe_field_ref_fetch_field(field_ref,
+								     fieldno);
+	if (field != NULL &&
+		field->offset_slot != TUPLE_OFFSET_SLOT_NIL) {
+		field_begin = tuple_field(field_ref->tuple, fieldno);
+	} else {
+		uint32_t prev =
+			vdbe_field_ref_closest_slotno(field_ref,
+							fieldno);
+		field_begin = field_ref->data + field_ref->slots[prev];
+		for (prev++; prev < fieldno; prev++) {
+			mp_next(&field_begin);
+			field_ref->slots[prev] =
+				(uint32_t)(field_begin - field_ref->data);
+			column_mask_set_fieldno(&field_ref->slot_bitmask,
+						prev);
+		}
+		mp_next(&field_begin);
+	}
+	field_ref->slots[fieldno] =
+		(uint32_t)(field_begin - field_ref->data);
+	column_mask_set_fieldno(&field_ref->slot_bitmask, fieldno);
+	return field_begin;
 }
 
 static inline enum field_type
@@ -652,45 +694,13 @@ static int
 vdbe_field_ref_fetch(struct vdbe_field_ref *field_ref, uint32_t fieldno,
 		     struct Mem *dest_mem)
 {
-	uint32_t *slots = field_ref->slots;
 	if (fieldno >= field_ref->field_count) {
 		UPDATE_MAX_BLOBSIZE(dest_mem);
 		return 0;
 	}
-
-	const char *data;
-	if (field_ref->rightmost_slot <= fieldno) {
-		uint32_t field_sz;
-		data = vdbe_field_ref_fast_fetch(field_ref, fieldno,
-						 &field_sz);
-		if (data != NULL) {
-			/*
-			 * Special case for tarantool spaces: for
-			 * indexed fields a tuple field map can be
-			 * used. Else there is no sense in
-			 * tuple_field usage, because it makes
-			 * foreach field { mp_next(); } to find
-			 * a field. In such a case sql is
-			 * better - it saves offsets to all fields
-			 * visited in mp_next() cycle.
-			 */
-			uint32_t offset = (uint32_t)(data - field_ref->data);
-			slots[fieldno] = offset;
-			slots[fieldno + 1] = offset + field_sz;
-		} else {
-			uint32_t i = field_ref->rightmost_slot;
-			data = field_ref->data + slots[i];
-			do {
-				mp_next(&data);
-				slots[++i] = (uint32_t)(data - field_ref->data);
-			} while (i <= fieldno);
-			field_ref->rightmost_slot = i;
-		}
-	}
-
 	assert(sqlVdbeCheckMemInvariants(dest_mem) != 0);
+	const char *data = vdbe_field_ref_fetch_data(field_ref, fieldno);
 	uint32_t dummy;
-	data = field_ref->data + slots[fieldno];
 	if (vdbe_decode_msgpack_into_mem(data, dest_mem, &dummy) != 0)
 		return -1;
 
@@ -699,8 +709,9 @@ vdbe_field_ref_fetch(struct vdbe_field_ref *field_ref, uint32_t fieldno,
 	 * Wrap it in a blob verbatim.
 	 */
 	if (dest_mem->flags == 0) {
-		dest_mem->n = slots[fieldno + 1] - slots[fieldno];
 		dest_mem->z = (char *) data;
+		dest_mem->n = vdbe_field_ref_fetch_data(field_ref,
+							fieldno + 1) - data;
 		dest_mem->flags = MEM_Blob | MEM_Ephem | MEM_Subtype;
 		dest_mem->subtype = SQL_SUBTYPE_MSGPACK;
 	}
-- 
2.22.0






More information about the Tarantool-patches mailing list