Tarantool development patches archive
 help / color / mirror / Atom feed
* [PATCH 0/5] Hand over key lookup in a page to vinyl reader thread
@ 2019-05-29 15:12 Vladimir Davydov
  2019-05-29 15:12 ` [PATCH 1/5] vinyl: factor out function to lookup key in page Vladimir Davydov
                   ` (5 more replies)
  0 siblings, 6 replies; 13+ messages in thread
From: Vladimir Davydov @ 2019-05-29 15:12 UTC (permalink / raw)
  To: tarantool-patches

If a key isn't found in the tuple cache, we fetch it from a run file. In
this case disk read and page decompression is done by a reader thread,
however key lookup in the fetched page is still performed by the tx
thread. Since pages are immutable, this could as well be done by the
reader thread, which would allow us to save some precious CPU cycles for
tx.

https://github.com/tarantool/tarantool/issues/4257
https://github.com/tarantool/tarantool/commits/dv/gh-4257-vy-perform-key-lookup-in-reader-thread

Vladimir Davydov (5):
  vinyl: factor out function to lookup key in page
  vinyl: pass page info by reference to reader thread
  vinyl: encapsulate reader thread selection logic in a helper function
  vinyl: do not allow to cancel a fiber reading a page
  vinyl: lookup key in reader thread

 src/box/vy_run.c | 281 ++++++++++++++++++++++++++++---------------------------
 1 file changed, 141 insertions(+), 140 deletions(-)

-- 
2.11.0

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 1/5] vinyl: factor out function to lookup key in page
  2019-05-29 15:12 [PATCH 0/5] Hand over key lookup in a page to vinyl reader thread Vladimir Davydov
@ 2019-05-29 15:12 ` Vladimir Davydov
  2019-05-29 18:16   ` [tarantool-patches] " Konstantin Osipov
  2019-05-29 15:12 ` [PATCH 2/5] vinyl: pass page info by reference to reader thread Vladimir Davydov
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 13+ messages in thread
From: Vladimir Davydov @ 2019-05-29 15:12 UTC (permalink / raw)
  To: tarantool-patches

This function is a part of the run iterator API so we can't use it in
a reader thread. Let's make it an independent helper. As a good side
effect, we can now reuse it in the slice stream implementation.
---
 src/box/vy_run.c | 105 +++++++++++++++++++++++--------------------------------
 1 file changed, 44 insertions(+), 61 deletions(-)

diff --git a/src/box/vy_run.c b/src/box/vy_run.c
index 3b38aac6..84e0f50b 100644
--- a/src/box/vy_run.c
+++ b/src/box/vy_run.c
@@ -744,6 +744,41 @@ vy_page_stmt(struct vy_page *page, uint32_t stmt_no,
 }
 
 /**
+ * Binary search in page
+ * In terms of STL, makes lower_bound for EQ,GE,LT and upper_bound for GT,LE
+ * Additionally *equal_key argument is set to true if the found value is
+ * equal to given key (untouched otherwise)
+ * @retval position in the page
+ */
+static uint32_t
+vy_page_find_key(struct vy_page *page, struct vy_entry key,
+		 struct key_def *cmp_def, struct tuple_format *format,
+		 enum iterator_type iterator_type, bool *equal_key)
+{
+	uint32_t beg = 0;
+	uint32_t end = page->row_count;
+	/* for upper bound we change zero comparison result to -1 */
+	int zero_cmp = (iterator_type == ITER_GT ||
+			iterator_type == ITER_LE ? -1 : 0);
+	while (beg != end) {
+		uint32_t mid = beg + (end - beg) / 2;
+		struct vy_entry fnd_key = vy_page_stmt(page, mid, cmp_def,
+						       format);
+		if (fnd_key.stmt == NULL)
+			return end;
+		int cmp = vy_entry_compare(fnd_key, key, cmp_def);
+		cmp = cmp ? cmp : zero_cmp;
+		*equal_key = *equal_key || cmp == 0;
+		if (cmp < 0)
+			beg = mid + 1;
+		else
+			end = mid;
+		tuple_unref(fnd_key.stmt);
+	}
+	return end;
+}
+
+/**
  * End iteration and free cached data.
  */
 static void
@@ -1055,42 +1090,6 @@ vy_run_iterator_read(struct vy_run_iterator *itr,
 }
 
 /**
- * Binary search in page
- * In terms of STL, makes lower_bound for EQ,GE,LT and upper_bound for GT,LE
- * Additionally *equal_key argument is set to true if the found value is
- * equal to given key (untouched otherwise)
- * @retval position in the page
- */
-static uint32_t
-vy_run_iterator_search_in_page(struct vy_run_iterator *itr,
-			       enum iterator_type iterator_type,
-			       struct vy_entry key,
-			       struct vy_page *page, bool *equal_key)
-{
-	uint32_t beg = 0;
-	uint32_t end = page->row_count;
-	/* for upper bound we change zero comparison result to -1 */
-	int zero_cmp = (iterator_type == ITER_GT ||
-			iterator_type == ITER_LE ? -1 : 0);
-	while (beg != end) {
-		uint32_t mid = beg + (end - beg) / 2;
-		struct vy_entry fnd_key = vy_page_stmt(page, mid, itr->cmp_def,
-						       itr->format);
-		if (fnd_key.stmt == NULL)
-			return end;
-		int cmp = vy_entry_compare(fnd_key, key, itr->cmp_def);
-		cmp = cmp ? cmp : zero_cmp;
-		*equal_key = *equal_key || cmp == 0;
-		if (cmp < 0)
-			beg = mid + 1;
-		else
-			end = mid;
-		tuple_unref(fnd_key.stmt);
-	}
-	return end;
-}
-
-/**
  * Binary search in a run for the given key.
  * In terms of STL, makes lower_bound for EQ,GE,LT and upper_bound for GT,LE
  * Resulting wide position is stored it *pos argument
@@ -1116,9 +1115,9 @@ vy_run_iterator_search(struct vy_run_iterator *itr,
 	if (rc != 0)
 		return rc;
 	bool equal_in_page = false;
-	pos->pos_in_page = vy_run_iterator_search_in_page(itr, iterator_type,
-							  key, page,
-							  &equal_in_page);
+	pos->pos_in_page = vy_page_find_key(page, key, itr->cmp_def,
+					    itr->format, iterator_type,
+					    &equal_in_page);
 	if (pos->pos_in_page == page->row_count) {
 		pos->page_no++;
 		pos->pos_in_page = 0;
@@ -2600,28 +2599,12 @@ vy_slice_stream_search(struct vy_stmt_stream *virt_stream)
 	if (vy_slice_stream_read_page(stream) != 0)
 		return -1;
 
-	/**
-	 * Binary search in page. Find the first position in page with
-	 * tuple >= stream->slice->begin.
-	 */
-	uint32_t beg = 0;
-	uint32_t end = stream->page->row_count;
-	while (beg != end) {
-		uint32_t mid = beg + (end - beg) / 2;
-		struct vy_entry fnd_key = vy_page_stmt(stream->page, mid,
-						       stream->cmp_def,
-						       stream->format);
-		if (fnd_key.stmt == NULL)
-			return -1;
-		int cmp = vy_entry_compare(fnd_key, stream->slice->begin,
-					   stream->cmp_def);
-		if (cmp < 0)
-			beg = mid + 1;
-		else
-			end = mid;
-		tuple_unref(fnd_key.stmt);
-	}
-	stream->pos_in_page = end;
+	bool unused;
+	stream->pos_in_page = vy_page_find_key(stream->page,
+					       stream->slice->begin,
+					       stream->cmp_def,
+					       stream->format,
+					       ITER_GE, &unused);
 
 	if (stream->pos_in_page == stream->page->row_count) {
 		/* The first tuple is in the beginning of the next page */
-- 
2.11.0

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 2/5] vinyl: pass page info by reference to reader thread
  2019-05-29 15:12 [PATCH 0/5] Hand over key lookup in a page to vinyl reader thread Vladimir Davydov
  2019-05-29 15:12 ` [PATCH 1/5] vinyl: factor out function to lookup key in page Vladimir Davydov
@ 2019-05-29 15:12 ` Vladimir Davydov
  2019-05-29 18:16   ` [tarantool-patches] " Konstantin Osipov
  2019-05-29 15:12 ` [PATCH 3/5] vinyl: encapsulate reader thread selection logic in a helper function Vladimir Davydov
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 13+ messages in thread
From: Vladimir Davydov @ 2019-05-29 15:12 UTC (permalink / raw)
  To: tarantool-patches

Since a page read task references the source run file, we don't need to
pass page info by value.
---
 src/box/vy_run.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/box/vy_run.c b/src/box/vy_run.c
index 84e0f50b..5b990992 100644
--- a/src/box/vy_run.c
+++ b/src/box/vy_run.c
@@ -92,7 +92,7 @@ struct vy_page_read_task {
 	/** parent */
 	struct cbus_call_msg base;
 	/** vinyl page metadata */
-	struct vy_page_info page_info;
+	struct vy_page_info *page_info;
 	/** vy_run with fd - ref. counted */
 	struct vy_run *run;
 	/** [out] resulting vinyl page */
@@ -944,7 +944,7 @@ vy_page_read_cb(struct cbus_call_msg *base)
 	ZSTD_DStream *zdctx = vy_env_get_zdctx(task->run->env);
 	if (zdctx == NULL)
 		return -1;
-	return vy_page_read(task->page, &task->page_info, task->run, zdctx);
+	return vy_page_read(task->page, task->page_info, task->run, zdctx);
 }
 
 /**
@@ -1014,7 +1014,7 @@ vy_run_iterator_load_page(struct vy_run_iterator *itr, uint32_t page_no,
 		env->next_reader %= env->reader_pool_size;
 
 		task->run = slice->run;
-		task->page_info = *page_info;
+		task->page_info = page_info;
 		task->page = page;
 		vy_run_ref(task->run);
 
-- 
2.11.0

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 3/5] vinyl: encapsulate reader thread selection logic in a helper function
  2019-05-29 15:12 [PATCH 0/5] Hand over key lookup in a page to vinyl reader thread Vladimir Davydov
  2019-05-29 15:12 ` [PATCH 1/5] vinyl: factor out function to lookup key in page Vladimir Davydov
  2019-05-29 15:12 ` [PATCH 2/5] vinyl: pass page info by reference to reader thread Vladimir Davydov
@ 2019-05-29 15:12 ` Vladimir Davydov
  2019-05-29 18:24   ` [tarantool-patches] " Konstantin Osipov
  2019-05-29 15:12 ` [PATCH 4/5] vinyl: do not allow to cancel a fiber reading a page Vladimir Davydov
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 13+ messages in thread
From: Vladimir Davydov @ 2019-05-29 15:12 UTC (permalink / raw)
  To: tarantool-patches

Page reading code is intermixed with the reader thread selection in the
same function, which makes it difficult to extend the former. So let's
introduce a helper function encapsulating a call on behalf of a reader
thread.
---
 src/box/vy_run.c | 106 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 55 insertions(+), 51 deletions(-)

diff --git a/src/box/vy_run.c b/src/box/vy_run.c
index 5b990992..a8ca3afe 100644
--- a/src/box/vy_run.c
+++ b/src/box/vy_run.c
@@ -197,6 +197,44 @@ vy_run_env_enable_coio(struct vy_run_env *env)
 }
 
 /**
+ * Execute a task on behalf of a reader thread.
+ * Calls free_cb on failure.
+ */
+static int
+vy_run_env_coio_call(struct vy_run_env *env, struct cbus_call_msg *msg,
+		     cbus_call_f func, cbus_call_f free_cb, double timeout)
+{
+	/* Optimization: use blocking I/O during WAL recovery. */
+	if (env->reader_pool == NULL) {
+		if (func(msg) != 0)
+			goto fail;
+		return 0;
+	}
+
+	/* Pick a reader thread. */
+	struct vy_run_reader *reader;
+	reader = &env->reader_pool[env->next_reader++];
+	env->next_reader %= env->reader_pool_size;
+
+	/* Post the task to the reader thread. */
+	int rc = cbus_call(&reader->reader_pipe, &reader->tx_pipe,
+			   msg, func, free_cb, timeout);
+	if (!msg->complete) {
+		/*
+		 * Operation timed out or the fiber was cancelled.
+		 * free_cb will be called on task completion.
+		 */
+		return -1;
+	}
+	if (rc != 0)
+		goto fail;
+	return 0;
+fail:
+	free_cb(msg);
+	return -1;
+}
+
+/**
  * Initialize page info struct
  *
  * @retval 0 for Success
@@ -996,58 +1034,24 @@ vy_run_iterator_load_page(struct vy_run_iterator *itr, uint32_t page_no,
 		return -1;
 
 	/* Read page data from the disk */
-	int rc;
-	if (env->reader_pool != NULL) {
-		/* Allocate a cbus task. */
-		struct vy_page_read_task *task;
-		task = mempool_alloc(&env->read_task_pool);
-		if (task == NULL) {
-			diag_set(OutOfMemory, sizeof(*task), "mempool",
-				 "vy_page_read_task");
-			vy_page_delete(page);
-			return -1;
-		}
-
-		/* Pick a reader thread. */
-		struct vy_run_reader *reader;
-		reader = &env->reader_pool[env->next_reader++];
-		env->next_reader %= env->reader_pool_size;
-
-		task->run = slice->run;
-		task->page_info = page_info;
-		task->page = page;
-		vy_run_ref(task->run);
-
-		/* Post task to the reader thread. */
-		rc = cbus_call(&reader->reader_pipe, &reader->tx_pipe,
-			       &task->base, vy_page_read_cb,
-			       vy_page_read_cb_free, TIMEOUT_INFINITY);
-		if (!task->base.complete)
-			return -1; /* timed out or cancelled */
-
-		vy_run_unref(task->run);
-		mempool_free(&env->read_task_pool, task);
-
-		if (rc != 0) {
-			/* posted, but failed */
-			vy_page_delete(page);
-			return -1;
-		}
-	} else {
-		/*
-		 * Optimization: use blocked I/O for non-TX threads or
-		 * during WAL recovery (env->status != VINYL_ONLINE).
-		 */
-		ZSTD_DStream *zdctx = vy_env_get_zdctx(env);
-		if (zdctx == NULL) {
-			vy_page_delete(page);
-			return -1;
-		}
-		if (vy_page_read(page, page_info, slice->run, zdctx) != 0) {
-			vy_page_delete(page);
-			return -1;
-		}
+	struct vy_page_read_task *task = mempool_alloc(&env->read_task_pool);
+	if (task == NULL) {
+		diag_set(OutOfMemory, sizeof(*task),
+			 "mempool", "vy_page_read_task");
+		vy_page_delete(page);
+		return -1;
 	}
+	task->run = slice->run;
+	task->page_info = page_info;
+	task->page = page;
+	vy_run_ref(task->run);
+
+	if (vy_run_env_coio_call(env, &task->base, vy_page_read_cb,
+				 vy_page_read_cb_free, TIMEOUT_INFINITY) != 0)
+		return -1;
+
+	vy_run_unref(task->run);
+	mempool_free(&env->read_task_pool, task);
 
 	/* Update cache */
 	if (itr->prev_page != NULL)
-- 
2.11.0

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 4/5] vinyl: do not allow to cancel a fiber reading a page
  2019-05-29 15:12 [PATCH 0/5] Hand over key lookup in a page to vinyl reader thread Vladimir Davydov
                   ` (2 preceding siblings ...)
  2019-05-29 15:12 ` [PATCH 3/5] vinyl: encapsulate reader thread selection logic in a helper function Vladimir Davydov
@ 2019-05-29 15:12 ` Vladimir Davydov
  2019-05-29 18:35   ` [tarantool-patches] " Konstantin Osipov
  2019-05-29 15:12 ` [PATCH 5/5] vinyl: lookup key in reader thread Vladimir Davydov
  2019-05-30  8:42 ` [PATCH 0/5] Hand over key lookup in a page to vinyl " Vladimir Davydov
  5 siblings, 1 reply; 13+ messages in thread
From: Vladimir Davydov @ 2019-05-29 15:12 UTC (permalink / raw)
  To: tarantool-patches

To handle fiber cancellation during page read we need to pin all objects
referenced by vy_page_read_task. Currently, there's the only such
object, vy_run. It has reference counting so pinning it is trivial.
However, to move page lookup to a reader thread, we need to also
reference key def, tuple format, and key. Format and key have reference
counting, but key def doesn't - we typically copy it. Copying it in this
case is too heavy. Let's simply drop this functionality - nothing bad
happens if a cancelled fiber won't exit until disk read is complete.
---
 src/box/vy_run.c | 51 +++++++++++++--------------------------------------
 1 file changed, 13 insertions(+), 38 deletions(-)

diff --git a/src/box/vy_run.c b/src/box/vy_run.c
index a8ca3afe..1dc7271f 100644
--- a/src/box/vy_run.c
+++ b/src/box/vy_run.c
@@ -198,18 +198,14 @@ vy_run_env_enable_coio(struct vy_run_env *env)
 
 /**
  * Execute a task on behalf of a reader thread.
- * Calls free_cb on failure.
  */
 static int
 vy_run_env_coio_call(struct vy_run_env *env, struct cbus_call_msg *msg,
-		     cbus_call_f func, cbus_call_f free_cb, double timeout)
+		     cbus_call_f func)
 {
 	/* Optimization: use blocking I/O during WAL recovery. */
-	if (env->reader_pool == NULL) {
-		if (func(msg) != 0)
-			goto fail;
-		return 0;
-	}
+	if (env->reader_pool == NULL)
+		return func(msg);
 
 	/* Pick a reader thread. */
 	struct vy_run_reader *reader;
@@ -217,21 +213,14 @@ vy_run_env_coio_call(struct vy_run_env *env, struct cbus_call_msg *msg,
 	env->next_reader %= env->reader_pool_size;
 
 	/* Post the task to the reader thread. */
+	bool cancellable = fiber_set_cancellable(false);
 	int rc = cbus_call(&reader->reader_pipe, &reader->tx_pipe,
-			   msg, func, free_cb, timeout);
-	if (!msg->complete) {
-		/*
-		 * Operation timed out or the fiber was cancelled.
-		 * free_cb will be called on task completion.
-		 */
-		return -1;
-	}
+			   msg, func, NULL, TIMEOUT_INFINITY);
+	fiber_set_cancellable(cancellable);
 	if (rc != 0)
-		goto fail;
+		return -1;
+
 	return 0;
-fail:
-	free_cb(msg);
-	return -1;
 }
 
 /**
@@ -986,20 +975,6 @@ vy_page_read_cb(struct cbus_call_msg *base)
 }
 
 /**
- * vinyl read task cleanup callback
- */
-static int
-vy_page_read_cb_free(struct cbus_call_msg *base)
-{
-	struct vy_page_read_task *task = (struct vy_page_read_task *)base;
-	struct vy_run_env *env = task->run->env;
-	vy_page_delete(task->page);
-	vy_run_unref(task->run);
-	mempool_free(&env->read_task_pool, task);
-	return 0;
-}
-
-/**
  * Read a page from disk given its number.
  * The function caches two most recently read pages.
  *
@@ -1044,14 +1019,14 @@ vy_run_iterator_load_page(struct vy_run_iterator *itr, uint32_t page_no,
 	task->run = slice->run;
 	task->page_info = page_info;
 	task->page = page;
-	vy_run_ref(task->run);
 
-	if (vy_run_env_coio_call(env, &task->base, vy_page_read_cb,
-				 vy_page_read_cb_free, TIMEOUT_INFINITY) != 0)
-		return -1;
+	int rc = vy_run_env_coio_call(env, &task->base, vy_page_read_cb);
 
-	vy_run_unref(task->run);
 	mempool_free(&env->read_task_pool, task);
+	if (rc != 0) {
+		vy_page_delete(page);
+		return -1;
+	}
 
 	/* Update cache */
 	if (itr->prev_page != NULL)
-- 
2.11.0

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 5/5] vinyl: lookup key in reader thread
  2019-05-29 15:12 [PATCH 0/5] Hand over key lookup in a page to vinyl reader thread Vladimir Davydov
                   ` (3 preceding siblings ...)
  2019-05-29 15:12 ` [PATCH 4/5] vinyl: do not allow to cancel a fiber reading a page Vladimir Davydov
@ 2019-05-29 15:12 ` Vladimir Davydov
  2019-05-29 18:41   ` [tarantool-patches] " Konstantin Osipov
  2019-05-30  8:42 ` [PATCH 0/5] Hand over key lookup in a page to vinyl " Vladimir Davydov
  5 siblings, 1 reply; 13+ messages in thread
From: Vladimir Davydov @ 2019-05-29 15:12 UTC (permalink / raw)
  To: tarantool-patches

If a key isn't found in the tuple cache, we fetch it from a run file. In
this case disk read and page decompression is done by a reader thread,
however key lookup in the fetched page is still performed by the tx
thread. Since pages are immutable, this could as well be done by the
reader thread, which would allow us to save some precious CPU cycles for
tx.

Close #4257
---
 src/box/vy_run.c | 79 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 59 insertions(+), 20 deletions(-)

diff --git a/src/box/vy_run.c b/src/box/vy_run.c
index 1dc7271f..3485c500 100644
--- a/src/box/vy_run.c
+++ b/src/box/vy_run.c
@@ -95,6 +95,18 @@ struct vy_page_read_task {
 	struct vy_page_info *page_info;
 	/** vy_run with fd - ref. counted */
 	struct vy_run *run;
+	/** key to lookup within the page */
+	struct vy_entry key;
+	/** iterator type (needed for for key lookup) */
+	enum iterator_type iterator_type;
+	/** key definition (needed for key lookup) */
+	struct key_def *cmp_def;
+	/** disk format (needed for key lookup) */
+	struct tuple_format *format;
+	/** [out] position of the key in the page */
+	uint32_t pos_in_page;
+	/** [out] true if key was found in the page */
+	bool equal_found;
 	/** [out] resulting vinyl page */
 	struct vy_page *page;
 };
@@ -971,7 +983,15 @@ vy_page_read_cb(struct cbus_call_msg *base)
 	ZSTD_DStream *zdctx = vy_env_get_zdctx(task->run->env);
 	if (zdctx == NULL)
 		return -1;
-	return vy_page_read(task->page, task->page_info, task->run, zdctx);
+	if (vy_page_read(task->page, task->page_info, task->run, zdctx) != 0)
+		return -1;
+	if (task->key.stmt != NULL) {
+		task->pos_in_page = vy_page_find_key(task->page, task->key,
+						     task->cmp_def, task->format,
+						     task->iterator_type,
+						     &task->equal_found);
+	}
+	return 0;
 }
 
 /**
@@ -983,28 +1003,35 @@ vy_page_read_cb(struct cbus_call_msg *base)
  */
 static NODISCARD int
 vy_run_iterator_load_page(struct vy_run_iterator *itr, uint32_t page_no,
-			  struct vy_page **result)
+			  struct vy_entry key, enum iterator_type iterator_type,
+			  struct vy_page **result, uint32_t *pos_in_page,
+			  bool *equal_found)
 {
 	struct vy_slice *slice = itr->slice;
 	struct vy_run_env *env = slice->run->env;
 
 	/* Check cache */
-	if (itr->curr_page != NULL) {
-		if (itr->curr_page->page_no == page_no) {
-			*result = itr->curr_page;
-			return 0;
-		}
-		if (itr->prev_page != NULL &&
-		    itr->prev_page->page_no == page_no) {
-			SWAP(itr->prev_page, itr->curr_page);
-			*result = itr->curr_page;
-			return 0;
-		}
+	struct vy_page *page = NULL;
+	if (itr->curr_page != NULL &&
+	    itr->curr_page->page_no == page_no) {
+		page = itr->curr_page;
+	} else if (itr->prev_page != NULL &&
+		   itr->prev_page->page_no == page_no) {
+		SWAP(itr->prev_page, itr->curr_page);
+		page = itr->curr_page;
+	}
+	if (page != NULL) {
+		if (key.stmt != NULL)
+			*pos_in_page = vy_page_find_key(page, key, itr->cmp_def,
+							itr->format, iterator_type,
+							equal_found);
+		*result = page;
+		return 0;
 	}
 
 	/* Allocate buffers */
 	struct vy_page_info *page_info = vy_run_page_info(slice->run, page_no);
-	struct vy_page *page = vy_page_new(page_info);
+	page = vy_page_new(page_info);
 	if (page == NULL)
 		return -1;
 
@@ -1019,9 +1046,18 @@ vy_run_iterator_load_page(struct vy_run_iterator *itr, uint32_t page_no,
 	task->run = slice->run;
 	task->page_info = page_info;
 	task->page = page;
+	task->key = key;
+	task->iterator_type = iterator_type;
+	task->cmp_def = itr->cmp_def;
+	task->format = itr->format;
+	task->pos_in_page = 0;
+	task->equal_found = false;
 
 	int rc = vy_run_env_coio_call(env, &task->base, vy_page_read_cb);
 
+	*pos_in_page = task->pos_in_page;
+	*equal_found = task->equal_found;
+
 	mempool_free(&env->read_task_pool, task);
 	if (rc != 0) {
 		vy_page_delete(page);
@@ -1059,7 +1095,11 @@ vy_run_iterator_read(struct vy_run_iterator *itr,
 		     struct vy_entry *ret)
 {
 	struct vy_page *page;
-	int rc = vy_run_iterator_load_page(itr, pos.page_no, &page);
+	bool equal_found;
+	uint32_t pos_in_page;
+	int rc = vy_run_iterator_load_page(itr, pos.page_no, vy_entry_none(),
+					   ITER_GE, &page, &pos_in_page,
+					   &equal_found);
 	if (rc != 0)
 		return rc;
 	*ret = vy_page_stmt(page, pos.pos_in_page, itr->cmp_def, itr->format);
@@ -1089,14 +1129,13 @@ vy_run_iterator_search(struct vy_run_iterator *itr,
 					       equal_key);
 	if (pos->page_no == itr->slice->run->info.page_count)
 		return 1;
+	bool equal_in_page;
 	struct vy_page *page;
-	int rc = vy_run_iterator_load_page(itr, pos->page_no, &page);
+	int rc = vy_run_iterator_load_page(itr, pos->page_no, key,
+					   iterator_type, &page,
+					   &pos->pos_in_page, &equal_in_page);
 	if (rc != 0)
 		return rc;
-	bool equal_in_page = false;
-	pos->pos_in_page = vy_page_find_key(page, key, itr->cmp_def,
-					    itr->format, iterator_type,
-					    &equal_in_page);
 	if (pos->pos_in_page == page->row_count) {
 		pos->page_no++;
 		pos->pos_in_page = 0;
-- 
2.11.0

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [tarantool-patches] Re: [PATCH 1/5] vinyl: factor out function to lookup key in page
  2019-05-29 15:12 ` [PATCH 1/5] vinyl: factor out function to lookup key in page Vladimir Davydov
@ 2019-05-29 18:16   ` Konstantin Osipov
  0 siblings, 0 replies; 13+ messages in thread
From: Konstantin Osipov @ 2019-05-29 18:16 UTC (permalink / raw)
  To: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [19/05/29 21:03]:
> This function is a part of the run iterator API so we can't use it in
> a reader thread. Let's make it an independent helper. As a good side
> effect, we can now reuse it in the slice stream implementation.

OK to push.


-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [tarantool-patches] Re: [PATCH 2/5] vinyl: pass page info by reference to reader thread
  2019-05-29 15:12 ` [PATCH 2/5] vinyl: pass page info by reference to reader thread Vladimir Davydov
@ 2019-05-29 18:16   ` Konstantin Osipov
  0 siblings, 0 replies; 13+ messages in thread
From: Konstantin Osipov @ 2019-05-29 18:16 UTC (permalink / raw)
  To: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [19/05/29 21:03]:
> Since a page read task references the source run file, we don't need to
> pass page info by value.

OK to push.


-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [tarantool-patches] Re: [PATCH 3/5] vinyl: encapsulate reader thread selection logic in a helper function
  2019-05-29 15:12 ` [PATCH 3/5] vinyl: encapsulate reader thread selection logic in a helper function Vladimir Davydov
@ 2019-05-29 18:24   ` Konstantin Osipov
  0 siblings, 0 replies; 13+ messages in thread
From: Konstantin Osipov @ 2019-05-29 18:24 UTC (permalink / raw)
  To: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [19/05/29 21:03]:
> Page reading code is intermixed with the reader thread selection in the
> same function, which makes it difficult to extend the former. So let's
> introduce a helper function encapsulating a call on behalf of a reader
> thread.
>  /**
> + * Execute a task on behalf of a reader thread.
> + * Calls free_cb on failure.
> + */
> +static int
> +vy_run_env_coio_call(struct vy_run_env *env, struct cbus_call_msg *msg,
> +		     cbus_call_f func, cbus_call_f free_cb, double timeout)
> +{
> +	/* Optimization: use blocking I/O during WAL recovery. */
> +	if (env->reader_pool == NULL) {
> +		if (func(msg) != 0)
> +			goto fail;
> +		return 0;
> +	}

this would be a great api if not for this optimization. Today not
involving free_cb on cbus_msg is harmless, but tomorrow may break
some logic. 

Please consider designing an API in a way that would work with
resources in the same way whether or not it is boot time or a coio
call.

The comment could also be extended, not reduced - e.g. explain why
we consider this an optimization, when exactly we perform reads
during recovery (for what statements, for example).


-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [tarantool-patches] Re: [PATCH 4/5] vinyl: do not allow to cancel a fiber reading a page
  2019-05-29 15:12 ` [PATCH 4/5] vinyl: do not allow to cancel a fiber reading a page Vladimir Davydov
@ 2019-05-29 18:35   ` Konstantin Osipov
  0 siblings, 0 replies; 13+ messages in thread
From: Konstantin Osipov @ 2019-05-29 18:35 UTC (permalink / raw)
  To: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [19/05/29 21:03]:
> To handle fiber cancellation during page read we need to pin all objects
> referenced by vy_page_read_task. Currently, there's the only such
> object, vy_run. It has reference counting so pinning it is trivial.
> However, to move page lookup to a reader thread, we need to also
> reference key def, tuple format, and key. Format and key have reference
> counting, but key def doesn't - we typically copy it. Copying it in this
> case is too heavy. Let's simply drop this functionality - nothing bad
> happens if a cancelled fiber won't exit until disk read is complete.

This is perhaps a relevant statement with PCIE attached flash
drives. It used to be not true with rotating disks, since a
rotating disk controller could retry reading a block indefinitely
on read failure. 
It is still dangerously false with Network Attached Storage. 

On the other hand this has never been tested. What is not tested,
can and should be removed. 

So I agree let's kill this branch for now, we will need to
re-think query timeout handling for entire call chain as
soon as Tarantool is more often used for complex SQL queries,
so we'll be forced to rethink this.

Please at least call testcancel() after the message has
returned.

This agreement also withdraws my comment about the previous patch.

Please also document carefully the reasoning why we decided to
remove timeout handling in the code not just in the changeset comment.

Otherwise LGTM.

> ---
>  src/box/vy_run.c | 51 +++++++++++++--------------------------------------
>  1 file changed, 13 insertions(+), 38 deletions(-)
> 
> diff --git a/src/box/vy_run.c b/src/box/vy_run.c
> index a8ca3afe..1dc7271f 100644
> --- a/src/box/vy_run.c
> +++ b/src/box/vy_run.c
> @@ -198,18 +198,14 @@ vy_run_env_enable_coio(struct vy_run_env *env)
>  
>  /**
>   * Execute a task on behalf of a reader thread.
> - * Calls free_cb on failure.
>   */
>  static int
>  vy_run_env_coio_call(struct vy_run_env *env, struct cbus_call_msg *msg,
> -		     cbus_call_f func, cbus_call_f free_cb, double timeout)
> +		     cbus_call_f func)
>  {
>  	/* Optimization: use blocking I/O during WAL recovery. */
> -	if (env->reader_pool == NULL) {
> -		if (func(msg) != 0)
> -			goto fail;
> -		return 0;
> -	}
> +	if (env->reader_pool == NULL)
> +		return func(msg);
>  
>  	/* Pick a reader thread. */
>  	struct vy_run_reader *reader;
> @@ -217,21 +213,14 @@ vy_run_env_coio_call(struct vy_run_env *env, struct cbus_call_msg *msg,
>  	env->next_reader %= env->reader_pool_size;
>  
>  	/* Post the task to the reader thread. */
> +	bool cancellable = fiber_set_cancellable(false);
>  	int rc = cbus_call(&reader->reader_pipe, &reader->tx_pipe,
> -			   msg, func, free_cb, timeout);
> -	if (!msg->complete) {
> -		/*
> -		 * Operation timed out or the fiber was cancelled.
> -		 * free_cb will be called on task completion.
> -		 */
> -		return -1;
> -	}
> +			   msg, func, NULL, TIMEOUT_INFINITY);
> +	fiber_set_cancellable(cancellable);
>  	if (rc != 0)
> -		goto fail;
> +		return -1;
> +
>  	return 0;
> -fail:
> -	free_cb(msg);
> -	return -1;
>  }
>  
>  /**
> @@ -986,20 +975,6 @@ vy_page_read_cb(struct cbus_call_msg *base)
>  }
>  
>  /**
> - * vinyl read task cleanup callback
> - */
> -static int
> -vy_page_read_cb_free(struct cbus_call_msg *base)
> -{
> -	struct vy_page_read_task *task = (struct vy_page_read_task *)base;
> -	struct vy_run_env *env = task->run->env;
> -	vy_page_delete(task->page);
> -	vy_run_unref(task->run);
> -	mempool_free(&env->read_task_pool, task);
> -	return 0;
> -}
> -
> -/**
>   * Read a page from disk given its number.
>   * The function caches two most recently read pages.
>   *
> @@ -1044,14 +1019,14 @@ vy_run_iterator_load_page(struct vy_run_iterator *itr, uint32_t page_no,
>  	task->run = slice->run;
>  	task->page_info = page_info;
>  	task->page = page;
> -	vy_run_ref(task->run);
>  
> -	if (vy_run_env_coio_call(env, &task->base, vy_page_read_cb,
> -				 vy_page_read_cb_free, TIMEOUT_INFINITY) != 0)
> -		return -1;
> +	int rc = vy_run_env_coio_call(env, &task->base, vy_page_read_cb);
>  
> -	vy_run_unref(task->run);
>  	mempool_free(&env->read_task_pool, task);
> +	if (rc != 0) {
> +		vy_page_delete(page);
> +		return -1;
> +	}
>  
>  	/* Update cache */
>  	if (itr->prev_page != NULL)
> -- 
> 2.11.0
> 

-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [tarantool-patches] Re: [PATCH 5/5] vinyl: lookup key in reader thread
  2019-05-29 15:12 ` [PATCH 5/5] vinyl: lookup key in reader thread Vladimir Davydov
@ 2019-05-29 18:41   ` Konstantin Osipov
  0 siblings, 0 replies; 13+ messages in thread
From: Konstantin Osipov @ 2019-05-29 18:41 UTC (permalink / raw)
  To: tarantool-patches

* Vladimir Davydov <vdavydov.dev@gmail.com> [19/05/29 21:03]:
> If a key isn't found in the tuple cache, we fetch it from a run file. In
> this case disk read and page decompression is done by a reader thread,
> however key lookup in the fetched page is still performed by the tx
> thread. Since pages are immutable, this could as well be done by the
> reader thread, which would allow us to save some precious CPU cycles for
> tx.
> 
> Close #4257
> ---

This patch is LGTM. The next logical step is moving vy_page_xrow
and vy_page_stmt calls to the reader thread, but this would be
more messy I guess. I'd guess they also consume quite a bit of CPU,
however.


-- 
Konstantin Osipov, Moscow, Russia

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/5] Hand over key lookup in a page to vinyl reader thread
  2019-05-29 15:12 [PATCH 0/5] Hand over key lookup in a page to vinyl reader thread Vladimir Davydov
                   ` (4 preceding siblings ...)
  2019-05-29 15:12 ` [PATCH 5/5] vinyl: lookup key in reader thread Vladimir Davydov
@ 2019-05-30  8:42 ` Vladimir Davydov
  2019-05-30 14:20   ` Vladimir Davydov
  5 siblings, 1 reply; 13+ messages in thread
From: Vladimir Davydov @ 2019-05-30  8:42 UTC (permalink / raw)
  To: tarantool-patches

On Wed, May 29, 2019 at 06:12:46PM +0300, Vladimir Davydov wrote:
> Vladimir Davydov (5):
>   vinyl: factor out function to lookup key in page
>   vinyl: pass page info by reference to reader thread
>   vinyl: encapsulate reader thread selection logic in a helper function
>   vinyl: do not allow to cancel a fiber reading a page
>   vinyl: lookup key in reader thread

Pushed to master after adding fiber_testcancel and rewriting the comment
to patch 4 as follows:

    vinyl: do not allow to cancel a fiber reading a page
    
    To handle fiber cancellation during page read we need to pin all objects
    referenced by vy_page_read_task. Currently, there's the only such
    object, vy_run. It has reference counting so pinning it is trivial.
    However, to move page lookup to a reader thread, we need to also
    reference key def, tuple format, and key. Format and key have reference
    counting, but key def doesn't - we typically copy it. Copying it in this
    case is too heavy.
    
    Actually, cancelling a fiber manually or on timeout while it's reading
    disk doesn't make much sense with PCIE attached flash drives. It used to
    be reasonable with rotating disks, since a rotating disk controller
    could retry reading a block indefinitely on read failure. It is still
    relevant to Network Attached Storage. On the other hand, NAS has never
    been tested, and what isn't tested, can and should be removed. For
    complex SQL queries we'll be forced to rethink timeout handling anyway.
    
    That being said, let's simply drop this functionality.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/5] Hand over key lookup in a page to vinyl reader thread
  2019-05-30  8:42 ` [PATCH 0/5] Hand over key lookup in a page to vinyl " Vladimir Davydov
@ 2019-05-30 14:20   ` Vladimir Davydov
  0 siblings, 0 replies; 13+ messages in thread
From: Vladimir Davydov @ 2019-05-30 14:20 UTC (permalink / raw)
  To: tarantool-patches

On Thu, May 30, 2019 at 11:42:36AM +0300, Vladimir Davydov wrote:
> On Wed, May 29, 2019 at 06:12:46PM +0300, Vladimir Davydov wrote:
> > Vladimir Davydov (5):
> >   vinyl: factor out function to lookup key in page
> >   vinyl: pass page info by reference to reader thread
> >   vinyl: encapsulate reader thread selection logic in a helper function
> >   vinyl: do not allow to cancel a fiber reading a page
> >   vinyl: lookup key in reader thread
> 
> Pushed to master

Also backported to 2.1 and 1.10 as it may improve performance
of Mail.Ru FRS.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2019-05-30 14:20 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-05-29 15:12 [PATCH 0/5] Hand over key lookup in a page to vinyl reader thread Vladimir Davydov
2019-05-29 15:12 ` [PATCH 1/5] vinyl: factor out function to lookup key in page Vladimir Davydov
2019-05-29 18:16   ` [tarantool-patches] " Konstantin Osipov
2019-05-29 15:12 ` [PATCH 2/5] vinyl: pass page info by reference to reader thread Vladimir Davydov
2019-05-29 18:16   ` [tarantool-patches] " Konstantin Osipov
2019-05-29 15:12 ` [PATCH 3/5] vinyl: encapsulate reader thread selection logic in a helper function Vladimir Davydov
2019-05-29 18:24   ` [tarantool-patches] " Konstantin Osipov
2019-05-29 15:12 ` [PATCH 4/5] vinyl: do not allow to cancel a fiber reading a page Vladimir Davydov
2019-05-29 18:35   ` [tarantool-patches] " Konstantin Osipov
2019-05-29 15:12 ` [PATCH 5/5] vinyl: lookup key in reader thread Vladimir Davydov
2019-05-29 18:41   ` [tarantool-patches] " Konstantin Osipov
2019-05-30  8:42 ` [PATCH 0/5] Hand over key lookup in a page to vinyl " Vladimir Davydov
2019-05-30 14:20   ` Vladimir Davydov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox