Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: kostja@tarantool.org
Cc: tarantool-patches@freelists.org
Subject: [RFC PATCH 23/23] vinyl: eliminate read on REPLACE/DELETE
Date: Sun,  8 Jul 2018 19:48:54 +0300	[thread overview]
Message-ID: <22418dc475632b06aee6e8db562dc4467d6a0a31.1531065648.git.vdavydov.dev@gmail.com> (raw)
In-Reply-To: <cover.1531065648.git.vdavydov.dev@gmail.com>
In-Reply-To: <cover.1531065648.git.vdavydov.dev@gmail.com>

Currently, in presence of secondary indexes we have to read the primary
key on REPLACE and DELETE in order to delete the overwritten tuple from
secondary indexes. This patch eliminates the reads by handing this job
over to the primary index dump/compaction task.

In progress...

Closes #2129
---
 src/box/vinyl.c                    | 133 ++++++++++++++++++------
 src/box/vy_scheduler.c             | 200 ++++++++++++++++++++++++++++++++++++-
 src/box/vy_scheduler.h             |   8 ++
 src/box/vy_tx.c                    |  26 +++++
 test/vinyl/info.result             |   5 +
 test/vinyl/info.test.lua           |   3 +
 test/vinyl/layout.result           | 146 +++++++++++++++++----------
 test/vinyl/tx_gap_lock.result      |  16 +--
 test/vinyl/tx_gap_lock.test.lua    |  10 +-
 test/vinyl/write_iterator.result   |   5 +
 test/vinyl/write_iterator.test.lua |   3 +
 11 files changed, 461 insertions(+), 94 deletions(-)

diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 7e23dd93..5acba436 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -65,6 +65,7 @@
 #include "engine.h"
 #include "space.h"
 #include "index.h"
+#include "schema.h"
 #include "xstream.h"
 #include "info.h"
 #include "column_mask.h"
@@ -1282,25 +1283,39 @@ vy_get_by_secondary_tuple(struct vy_lsm *lsm, struct vy_tx *tx,
 			  struct tuple *tuple, struct tuple **result)
 {
 	assert(lsm->index_id > 0);
-	/*
-	 * No need in vy_tx_track() as the tuple must already be
-	 * tracked in the secondary index LSM tree.
-	 */
+
 	if (vy_point_lookup(lsm->pk, tx, rv, tuple, result) != 0)
 		return -1;
 
-	if (*result == NULL) {
+	if (*result == NULL ||
+	    vy_tuple_compare(*result, tuple, lsm->key_def) != 0) {
 		/*
-		 * All indexes of a space must be consistent, i.e.
-		 * if a tuple is present in one index, it must be
-		 * present in all other indexes as well, so we can
-		 * get here only if there's a bug somewhere in vinyl.
-		 * Don't abort as core dump won't really help us in
-		 * this case. Just warn the user and proceed to the
-		 * next tuple.
+		 * If a tuple read from a secondary index doesn't
+		 * match the tuple corresponding to it in the
+		 * primary index, it must have been overwritten or
+		 * deleted, but the DELETE statement hasn't been
+		 * propagated to the secondary index yet. In this
+		 * case silently skip this tuple.
 		 */
-		say_warn("%s: key %s missing in primary index",
-			 vy_lsm_name(lsm), vy_stmt_str(tuple));
+		if (*result != NULL) {
+			tuple_unref(*result);
+			*result = NULL;
+		}
+		vy_cache_on_write(&lsm->cache, tuple, NULL);
+		return 0;
+	}
+
+	/*
+	 * Even though the tuple is tracked in the secondary index
+	 * read set, we still must track the full tuple read from
+	 * the primary index, otherwise the transaction won't be
+	 * aborted if this tuple is overwritten or deleted, because
+	 * the DELETE statement is not written to secondary indexes
+	 * immediately.
+	 */
+	if (tx != NULL && vy_tx_track_point(tx, lsm->pk, *result) != 0) {
+		tuple_unref(*result);
+		return -1;
 	}
 
 	if ((*rv)->vlsn == INT64_MAX)
@@ -1613,7 +1628,6 @@ vy_delete(struct vy_env *env, struct vy_tx *tx, struct txn_stmt *stmt,
 	struct vy_lsm *lsm = vy_lsm_find_unique(space, request->index_id);
 	if (lsm == NULL)
 		return -1;
-	bool has_secondary = space->index_count > 1;
 	const char *key = request->key;
 	uint32_t part_count = mp_decode_array(&key);
 	if (vy_unique_key_validate(lsm, key, part_count))
@@ -1623,12 +1637,9 @@ vy_delete(struct vy_env *env, struct vy_tx *tx, struct txn_stmt *stmt,
 	 * before deletion.
 	 * - if the space has on_replace triggers and need to pass
 	 *   to them the old tuple.
-	 *
-	 * - if the space has one or more secondary indexes, then
-	 *   we need to extract secondary keys from the old tuple
-	 *   and pass them to indexes for deletion.
+	 * - if deletion is done by a secondary index.
 	 */
-	if (has_secondary || !rlist_empty(&space->on_replace)) {
+	if (lsm->index_id > 0 || !rlist_empty(&space->on_replace)) {
 		if (vy_get_by_raw_key(lsm, tx, vy_tx_read_view(tx),
 				      key, part_count, &stmt->old_tuple) != 0)
 			return -1;
@@ -1637,8 +1648,7 @@ vy_delete(struct vy_env *env, struct vy_tx *tx, struct txn_stmt *stmt,
 	}
 	int rc = 0;
 	struct tuple *delete;
-	if (has_secondary) {
-		assert(stmt->old_tuple != NULL);
+	if (stmt->old_tuple != NULL) {
 		delete = vy_stmt_new_surrogate_delete(pk->mem_format,
 						      stmt->old_tuple);
 		if (delete == NULL)
@@ -1651,12 +1661,14 @@ vy_delete(struct vy_env *env, struct vy_tx *tx, struct txn_stmt *stmt,
 			if (rc != 0)
 				break;
 		}
-	} else { /* Primary is the single index in the space. */
+	} else {
 		assert(lsm->index_id == 0);
 		delete = vy_stmt_new_surrogate_delete_from_key(request->key,
 						pk->key_def, pk->mem_format);
 		if (delete == NULL)
 			return -1;
+		if (space->index_count > 1)
+			vy_stmt_set_flags(delete, VY_STMT_DEFERRED_DELETE);
 		rc = vy_tx_set(tx, pk, delete);
 	}
 	tuple_unref(delete);
@@ -2175,14 +2187,14 @@ vy_replace(struct vy_env *env, struct vy_tx *tx, struct txn_stmt *stmt,
 	/*
 	 * Get the overwritten tuple from the primary index if
 	 * the space has on_replace triggers, in which case we
-	 * need to pass the old tuple to trigger callbacks, or
-	 * if the space has secondary indexes and so we need
-	 * the old tuple to delete it from them.
+	 * need to pass the old tuple to trigger callbacks.
 	 */
-	if (space->index_count > 1 || !rlist_empty(&space->on_replace)) {
+	if (!rlist_empty(&space->on_replace)) {
 		if (vy_get(pk, tx, vy_tx_read_view(tx),
 			   stmt->new_tuple, &stmt->old_tuple) != 0)
 			return -1;
+	} else if (space->index_count > 1) {
+		vy_stmt_set_flags(stmt->new_tuple, VY_STMT_DEFERRED_DELETE);
 	}
 	/*
 	 * Replace in the primary index without explicit deletion
@@ -2454,6 +2466,71 @@ vinyl_engine_rollback_statement(struct engine *engine, struct txn *txn,
 
 /* }}} Public API of transaction control */
 
+/* {{{ Deferred DELETE handling */
+
+static int
+vy_deferred_delete_one(struct vy_lsm *lsm, struct tuple *delete,
+		       struct tuple *old_stmt, struct tuple *new_stmt,
+		       const struct tuple **region_stmt)
+{
+	if (vy_stmt_type(new_stmt) == IPROTO_REPLACE &&
+	    vy_tuple_compare(old_stmt, new_stmt, lsm->key_def) == 0)
+		return 0;
+	if (unlikely(lsm->mem->schema_version != schema_version ||
+		     lsm->mem->generation != *lsm->env->p_generation)) {
+		if (vy_lsm_rotate_mem(lsm) != 0)
+			return -1;
+	}
+	if (vy_lsm_set(lsm, lsm->mem, delete, region_stmt) != 0)
+		return -1;
+	vy_lsm_commit_stmt(lsm, lsm->mem, *region_stmt);
+	return 0;
+}
+
+static int
+vy_deferred_delete(struct tuple *old_stmt, struct tuple *new_stmt, void *arg)
+{
+	struct vy_lsm *pk = arg;
+	assert(pk->index_id == 0);
+	if (pk->is_dropped)
+		return 0;
+	struct space *space = space_by_id(pk->space_id);
+	if (space == NULL)
+		return 0;
+	if (space->index_count <= 1)
+		return 0;
+
+	struct tuple *delete;
+	delete = vy_stmt_new_surrogate_delete(pk->mem_format, old_stmt);
+	if (delete == NULL)
+		return -1;
+
+	vy_stmt_set_lsn(delete, vy_stmt_lsn(new_stmt));
+	vy_stmt_set_flags(delete, VY_STMT_SKIP_READ);
+
+	int rc = 0;
+	struct vy_env *env = container_of(pk->env, struct vy_env, lsm_env);
+	size_t mem_used_before = lsregion_used(&env->mem_env.allocator);
+
+	const struct tuple *region_stmt = NULL;
+	for (uint32_t i = 1; i < space->index_count; i++) {
+		struct vy_lsm *lsm = vy_lsm(space_index(space, i));
+		rc = vy_deferred_delete_one(lsm, delete, old_stmt, new_stmt,
+					    &region_stmt);
+		if (rc != 0)
+			break;
+	}
+
+	size_t mem_used_after = lsregion_used(&env->mem_env.allocator);
+	assert(mem_used_after >= mem_used_before);
+	vy_quota_force_use(&env->quota, mem_used_after - mem_used_before);
+
+	tuple_unref(delete);
+	return rc;
+}
+
+/* }}} Deferred DELETE handling */
+
 /** {{{ Environment */
 
 static void
@@ -2615,7 +2692,7 @@ vy_env_new(const char *path, size_t memory,
 
 	vy_mem_env_create(&e->mem_env, e->memory);
 	vy_scheduler_create(&e->scheduler, e->write_threads,
-			    vy_env_dump_complete_cb,
+			    vy_env_dump_complete_cb, vy_deferred_delete,
 			    &e->run_env, &e->xm->read_views);
 
 	if (vy_lsm_env_create(&e->lsm_env, e->path,
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 06dbb1f8..a070b46f 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -65,6 +65,8 @@ static int vy_worker_f(va_list);
 static int vy_scheduler_f(va_list);
 static void vy_task_execute_f(struct cmsg *);
 static void vy_task_complete_f(struct cmsg *);
+static void vy_deferred_delete_batch_process_f(struct cmsg *);
+static void vy_deferred_delete_batch_free_f(struct cmsg *);
 
 static const struct cmsg_hop vy_task_execute_route[] = {
 	{ vy_task_execute_f, NULL },
@@ -83,10 +85,42 @@ struct vy_worker {
 	struct cpipe tx_pipe;
 	/** Link in vy_scheduler::idle_workers. */
 	struct stailq_entry in_idle;
+	/** Route for sending deferred DELETEs back to tx. */
+	struct cmsg_hop deferred_delete_route[2];
 };
 
 struct vy_task;
 
+/** Max number of statements in a batch of deferred DELETEs. */
+enum { VY_DEFERRED_DELETE_BATCH_MAX = 100 };
+
+/** Deferred DELETE statement. */
+struct vy_deferred_delete_stmt {
+	/** Overwritten tuple. */
+	struct tuple *old_stmt;
+	/** Statement that overwrote @old_stmt. */
+	struct tuple *new_stmt;
+};
+
+/**
+ * Batch of deferred DELETE statements generated during
+ * a primary index compaction.
+ */
+struct vy_deferred_delete_batch {
+	/** CBus messages for sending the batch to tx. */
+	struct cmsg cmsg;
+	/** Task that generated this batch. */
+	struct vy_task *task;
+	/** Set if the tx thread failed to process the batch. */
+	bool is_failed;
+	/** In case of failure the error is stored here. */
+	struct diag diag;
+	/** Number of elements actually stored in @stmt array. */
+	int count;
+	/** Array of deferred DELETE statements. */
+	struct vy_deferred_delete_stmt stmt[VY_DEFERRED_DELETE_BATCH_MAX];
+};
+
 struct vy_task_ops {
 	/**
 	 * This function is called from a worker. It is supposed to do work
@@ -159,6 +193,13 @@ struct vy_task {
 	 */
 	double bloom_fpr;
 	int64_t page_size;
+	/** Batch of deferred deletes generated by this task. */
+	struct vy_deferred_delete_batch *deferred_delete_batch;
+	/**
+	 * Number of batches of deferred DELETEs sent to tx
+	 * and not yet processed.
+	 */
+	int deferred_delete_in_progress;
 	/** Link in vy_scheduler::processed_tasks. */
 	struct stailq_entry in_processed;
 };
@@ -204,6 +245,8 @@ vy_task_new(struct vy_scheduler *scheduler, struct vy_lsm *lsm,
 static void
 vy_task_delete(struct vy_task *task)
 {
+	assert(task->deferred_delete_batch == NULL);
+	assert(task->deferred_delete_in_progress == 0);
 	key_def_delete(task->cmp_def);
 	key_def_delete(task->key_def);
 	vy_lsm_unref(task->lsm);
@@ -293,6 +336,12 @@ vy_scheduler_start_workers(struct vy_scheduler *scheduler)
 		cpipe_create(&worker->worker_pipe, name);
 		stailq_add_tail_entry(&scheduler->idle_workers,
 				      worker, in_idle);
+
+		struct cmsg_hop *route = worker->deferred_delete_route;
+		route[0].f = vy_deferred_delete_batch_process_f;
+		route[0].pipe = &worker->worker_pipe;
+		route[1].f = vy_deferred_delete_batch_free_f;
+		route[1].pipe = NULL;
 	}
 }
 
@@ -313,11 +362,13 @@ vy_scheduler_stop_workers(struct vy_scheduler *scheduler)
 void
 vy_scheduler_create(struct vy_scheduler *scheduler, int write_threads,
 		    vy_scheduler_dump_complete_f dump_complete_cb,
+		    vy_deferred_delete_f deferred_delete_cb,
 		    struct vy_run_env *run_env, struct rlist *read_views)
 {
 	memset(scheduler, 0, sizeof(*scheduler));
 
 	scheduler->dump_complete_cb = dump_complete_cb;
+	scheduler->deferred_delete_cb = deferred_delete_cb;
 	scheduler->read_views = read_views;
 	scheduler->run_env = run_env;
 
@@ -652,6 +703,136 @@ vy_run_discard(struct vy_run *run)
 	vy_log_tx_try_commit();
 }
 
+/**
+ * Callback invoked by the tx thread to process deferred DELETEs
+ * generated during compaction.
+ */
+static void
+vy_deferred_delete_batch_process_f(struct cmsg *cmsg)
+{
+	struct vy_deferred_delete_batch *batch = container_of(cmsg,
+				struct vy_deferred_delete_batch, cmsg);
+	struct vy_task *task = batch->task;
+	struct vy_scheduler *scheduler = task->scheduler;
+
+	for (int i = 0; i < batch->count; i++) {
+		struct vy_deferred_delete_stmt *stmt = &batch->stmt[i];
+		if (scheduler->deferred_delete_cb(stmt->old_stmt,
+						  stmt->new_stmt,
+						  task->lsm) != 0) {
+			struct diag *diag = diag_get();
+			assert(!diag_is_empty(diag));
+			batch->is_failed = true;
+			diag_move(diag, &batch->diag);
+			return;
+		}
+	}
+}
+
+/**
+ * Callback invoked by a worker thread to free processed deferred
+ * DELETE statements. It must be done on behalf of the worker thread
+ * that generated those DELETEs, because a vinyl statement cannot
+ * be allocated and freed in different threads.
+ */
+static void
+vy_deferred_delete_batch_free_f(struct cmsg *cmsg)
+{
+	struct vy_deferred_delete_batch *batch = container_of(cmsg,
+				struct vy_deferred_delete_batch, cmsg);
+	struct vy_task *task = batch->task;
+	for (int i = 0; i < batch->count; i++) {
+		struct vy_deferred_delete_stmt *stmt = &batch->stmt[i];
+		vy_stmt_unref_if_possible(stmt->old_stmt);
+		vy_stmt_unref_if_possible(stmt->new_stmt);
+	}
+	/*
+	 * Abort the task if the tx thread failed to process
+	 * the batch unless it has already been aborted.
+	 */
+	if (batch->is_failed && !task->is_failed) {
+		assert(!diag_is_empty(&batch->diag));
+		diag_move(&batch->diag, &task->diag);
+		task->is_failed = true;
+		fiber_cancel(task->fiber);
+	}
+	diag_destroy(&batch->diag);
+	free(batch);
+	/* Notify the caller if this is the last batch. */
+	assert(task->deferred_delete_in_progress > 0);
+	if (--task->deferred_delete_in_progress == 0)
+		fiber_wakeup(task->fiber);
+}
+
+/**
+ * Send all deferred DELETEs accumulated by a vinyl task to
+ * the tx thread where they will be processed.
+ */
+static void
+vy_task_deferred_delete_flush(struct vy_task *task)
+{
+	struct vy_worker *worker = task->worker;
+	struct vy_deferred_delete_batch *batch = task->deferred_delete_batch;
+
+	if (batch == NULL)
+		return;
+
+	task->deferred_delete_batch = NULL;
+	task->deferred_delete_in_progress++;
+
+	cmsg_init(&batch->cmsg, worker->deferred_delete_route);
+	cpipe_push(&worker->tx_pipe, &batch->cmsg);
+}
+
+/**
+ * Wait for all deferred DELETE statements sent to tx to
+ * be processed and returned back to the worker.
+ */
+static void
+vy_task_deferred_delete_wait(struct vy_task *task)
+{
+	while (task->deferred_delete_in_progress > 0)
+		fiber_sleep(TIMEOUT_INFINITY);
+}
+
+/**
+ * Callback invoked by the write iterator during compaction to
+ * generate deferred DELETE statements. It adds a deferred DELETE
+ * to a batch. Once the batch gets full, it submits it to tx.
+ */
+static int
+vy_task_deferred_delete(struct tuple *old_stmt,
+			struct tuple *new_stmt, void *arg)
+{
+	struct vy_task *task = arg;
+	struct vy_deferred_delete_batch *batch = task->deferred_delete_batch;
+
+	/* Allocate a new batch on demand. */
+	if (batch == NULL) {
+		batch = malloc(sizeof(*batch));
+		if (batch == NULL) {
+			diag_set(OutOfMemory, sizeof(*batch), "malloc",
+				 "struct vy_deferred_delete_batch");
+			return -1;
+		}
+		memset(batch, 0, sizeof(*batch));
+		batch->task = task;
+		diag_create(&batch->diag);
+		task->deferred_delete_batch = batch;
+	}
+
+	assert(batch->count < VY_DEFERRED_DELETE_BATCH_MAX);
+	struct vy_deferred_delete_stmt *stmt = &batch->stmt[batch->count++];
+	stmt->old_stmt = old_stmt;
+	vy_stmt_ref_if_possible(old_stmt);
+	stmt->new_stmt = new_stmt;
+	vy_stmt_ref_if_possible(new_stmt);
+
+	if (batch->count == VY_DEFERRED_DELETE_BATCH_MAX)
+		vy_task_deferred_delete_flush(task);
+	return 0;
+}
+
 static int
 vy_task_write_run(struct vy_task *task)
 {
@@ -1006,7 +1187,9 @@ vy_task_dump_new(struct vy_scheduler *scheduler, struct vy_lsm *lsm,
 	bool is_last_level = (lsm->run_count == 0);
 	wi = vy_write_iterator_new(task->cmp_def, lsm->disk_format,
 				   lsm->index_id == 0, is_last_level,
-				   scheduler->read_views, NULL, NULL);
+				   scheduler->read_views,
+				   lsm->index_id > 0 ? NULL :
+				   vy_task_deferred_delete, task);
 	if (wi == NULL)
 		goto err_wi;
 	rlist_foreach_entry(mem, &lsm->sealed, in_sealed) {
@@ -1273,7 +1456,9 @@ vy_task_compact_new(struct vy_scheduler *scheduler, struct vy_lsm *lsm,
 	bool is_last_level = (range->compact_priority == range->slice_count);
 	wi = vy_write_iterator_new(task->cmp_def, lsm->disk_format,
 				   lsm->index_id == 0, is_last_level,
-				   scheduler->read_views, NULL, NULL);
+				   scheduler->read_views,
+				   lsm->index_id > 0 ? NULL :
+				   vy_task_deferred_delete, task);
 	if (wi == NULL)
 		goto err_wi;
 
@@ -1336,12 +1521,21 @@ static int
 vy_task_f(va_list va)
 {
 	struct vy_task *task = va_arg(va, struct vy_task *);
-	if (task->ops->execute(task) != 0) {
+	if (task->ops->execute(task) != 0 && !task->is_failed) {
 		struct diag *diag = diag_get();
 		assert(!diag_is_empty(diag));
 		task->is_failed = true;
 		diag_move(diag, &task->diag);
 	}
+
+	/*
+	 * We must not complete the task until we make sure that
+	 * all deferred DELETEs generated during task execution
+	 * have been successfully processed.
+	 */
+	vy_task_deferred_delete_flush(task);
+	vy_task_deferred_delete_wait(task);
+
 	cmsg_init(&task->cmsg, vy_task_complete_route);
 	cpipe_push(&task->worker->tx_pipe, &task->cmsg);
 	task->fiber = NULL;
diff --git a/src/box/vy_scheduler.h b/src/box/vy_scheduler.h
index deefacd7..f056eeff 100644
--- a/src/box/vy_scheduler.h
+++ b/src/box/vy_scheduler.h
@@ -43,6 +43,8 @@
 #include "salad/heap.h"
 #include "salad/stailq.h"
 
+#include "vy_write_iterator.h" /* vy_deferred_delete_f */
+
 #if defined(__cplusplus)
 extern "C" {
 #endif /* defined(__cplusplus) */
@@ -144,6 +146,11 @@ struct vy_scheduler {
 	 * by the dump.
 	 */
 	vy_scheduler_dump_complete_f dump_complete_cb;
+	/**
+	 * Callback invoked in the tx thread for each deferred DELETE
+	 * statement generated during compaction.
+	 */
+	vy_deferred_delete_f deferred_delete_cb;
 	/** List of read views, see tx_manager::read_views. */
 	struct rlist *read_views;
 	/** Context needed for writing runs. */
@@ -156,6 +163,7 @@ struct vy_scheduler {
 void
 vy_scheduler_create(struct vy_scheduler *scheduler, int write_threads,
 		    vy_scheduler_dump_complete_f dump_complete_cb,
+		    vy_deferred_delete_f deferred_delete_cb,
 		    struct vy_run_env *run_env, struct rlist *read_views);
 
 /**
diff --git a/src/box/vy_tx.c b/src/box/vy_tx.c
index f5bb624f..8100deef 100644
--- a/src/box/vy_tx.c
+++ b/src/box/vy_tx.c
@@ -536,6 +536,22 @@ vy_tx_prepare(struct vy_tx *tx)
 		if (v->is_overwritten)
 			continue;
 
+		if (lsm->index_id > 0 && repsert == NULL && delete == NULL) {
+			/*
+			 * This statement is for a secondary index,
+			 * and the statement corresponding to it in
+			 * the primary index was overwritten. This
+			 * can only happen if insertion of DELETE
+			 * into secondary indexes was postponed until
+			 * primary index compaction. In this case
+			 * the DELETE will not be propagated, because
+			 * the corresponding statement never made it
+			 * to the primary index LSM tree. So we must
+			 * skip it for secondary indexes as well.
+			 */
+			continue;
+		}
+
 		enum iproto_type type = vy_stmt_type(v->stmt);
 
 		/* Optimize out INSERT + DELETE for the same key. */
@@ -550,6 +566,16 @@ vy_tx_prepare(struct vy_tx *tx)
 			 */
 			type = IPROTO_INSERT;
 			vy_stmt_set_type(v->stmt, type);
+			/*
+			 * In case of INSERT, no statement was actually
+			 * overwritten so no need to generate a deferred
+			 * DELETE for secondary indexes.
+			 */
+			uint8_t flags = vy_stmt_flags(v->stmt);
+			if (flags & VY_STMT_DEFERRED_DELETE) {
+				vy_stmt_set_flags(v->stmt, flags &
+						  ~VY_STMT_DEFERRED_DELETE);
+			}
 		}
 
 		if (!v->is_first_insert && type == IPROTO_INSERT) {
diff --git a/test/vinyl/info.result b/test/vinyl/info.result
index 112ba85e..950a56cf 100644
--- a/test/vinyl/info.result
+++ b/test/vinyl/info.result
@@ -1032,6 +1032,11 @@ s:drop()
 s = box.schema.space.create('test', {engine = 'vinyl'})
 ---
 ...
+-- Install on_replace trigger to disable REPLACE/DELETE
+-- optimization in the secondary index (gh-2129).
+_ = s:on_replace(function() end)
+---
+...
 s:bsize()
 ---
 - 0
diff --git a/test/vinyl/info.test.lua b/test/vinyl/info.test.lua
index 863a8793..867415c9 100644
--- a/test/vinyl/info.test.lua
+++ b/test/vinyl/info.test.lua
@@ -321,6 +321,9 @@ s:drop()
 --
 
 s = box.schema.space.create('test', {engine = 'vinyl'})
+-- Install on_replace trigger to disable REPLACE/DELETE
+-- optimization in the secondary index (gh-2129).
+_ = s:on_replace(function() end)
 s:bsize()
 i1 = s:create_index('i1', {parts = {1, 'unsigned'}, run_count_per_level = 1})
 i2 = s:create_index('i2', {parts = {2, 'unsigned'}, run_count_per_level = 1})
diff --git a/test/vinyl/layout.result b/test/vinyl/layout.result
index 1f928a8f..33f7e4b9 100644
--- a/test/vinyl/layout.result
+++ b/test/vinyl/layout.result
@@ -135,15 +135,15 @@ result
       - HEADER:
           type: INSERT
         BODY:
-          tuple: [5, {2: 8, 9: 10}]
+          tuple: [5, {2: 9, 9: 10}]
       - HEADER:
           type: INSERT
         BODY:
-          tuple: [4, {2: 5}]
+          tuple: [4, {2: 6}]
       - HEADER:
           type: INSERT
         BODY:
-          tuple: [6, {2: 5}]
+          tuple: [6, {2: 6}]
       - HEADER:
           type: INSERT
         BODY:
@@ -151,7 +151,7 @@ result
       - HEADER:
           type: INSERT
         BODY:
-          tuple: [8, {1: 1, 2: 8, 8: 9}]
+          tuple: [8, {1: 1, 2: 9, 8: 10}]
       - HEADER:
           type: INSERT
         BODY:
@@ -160,15 +160,11 @@ result
       - HEADER:
           type: INSERT
         BODY:
-          tuple: [5, {0: 2, 2: 6, 9: 10}]
+          tuple: [5, {0: 2, 2: 7, 9: 10}]
       - HEADER:
           type: INSERT
         BODY:
-          tuple: [4, {0: 2, 2: 4}]
-      - HEADER:
-          type: INSERT
-        BODY:
-          tuple: [6, {2: 4}]
+          tuple: [5, {0: 2, 2: 4, 9: 5}]
       - HEADER:
           type: INSERT
         BODY:
@@ -176,36 +172,35 @@ result
       - HEADER:
           type: INSERT
         BODY:
-          tuple: [8, {1: 3, 2: 6, 8: 7}]
+          tuple: [8, {1: 3, 2: 4, 8: 5}]
       - HEADER:
           type: INSERT
         BODY:
-          tuple: [11, {}]
+          tuple: [8, {1: 3, 2: 7, 8: 8}]
       - HEADER:
-          timestamp: <timestamp>
           type: INSERT
         BODY:
-          tuple: [7, {2: 5}]
+          tuple: [11, {}]
       - HEADER:
           timestamp: <timestamp>
           type: INSERT
         BODY:
-          tuple: [7, {2: 4}]
+          tuple: [7, {2: 6}]
       - HEADER:
           timestamp: <timestamp>
           type: INSERT
         BODY:
-          tuple: [4, {0: 2, 2: 10}]
+          tuple: [4, {0: 2, 2: 11}]
       - HEADER:
           timestamp: <timestamp>
           type: INSERT
         BODY:
-          tuple: [5, {0: 2, 2: 10, 9: 13}]
+          tuple: [5, {0: 2, 2: 11, 9: 13}]
       - HEADER:
           timestamp: <timestamp>
           type: INSERT
         BODY:
-          tuple: [8, {1: 3, 2: 10, 8: 11}]
+          tuple: [8, {1: 3, 2: 11, 8: 12}]
       - HEADER:
           timestamp: <timestamp>
           type: INSERT
@@ -215,23 +210,23 @@ result
           timestamp: <timestamp>
           type: INSERT
         BODY:
-          tuple: [4, {2: 12}]
+          tuple: [4, {2: 13}]
       - HEADER:
           timestamp: <timestamp>
           type: INSERT
         BODY:
-          tuple: [5, {2: 12, 9: 13}]
+          tuple: [5, {2: 13, 9: 13}]
       - HEADER:
           timestamp: <timestamp>
           type: INSERT
         BODY:
-          tuple: [8, {1: 1, 2: 12, 8: 13}]
+          tuple: [8, {1: 1, 2: 13, 8: 14}]
       - HEADER:
           timestamp: <timestamp>
           type: INSERT
         BODY:
           tuple: [10, {9: 13}]
-  - - 00000000000000000008.index
+  - - 00000000000000000009.index
     - - HEADER:
           type: RUNINFO
         BODY:
@@ -250,7 +245,7 @@ result
           unpacked_size: 67
           row_count: 3
           min_key: ['ёёё']
-  - - 00000000000000000008.run
+  - - 00000000000000000009.run
     - - HEADER:
           lsn: 10
           type: REPLACE
@@ -270,7 +265,7 @@ result
           type: ROWINDEX
         BODY:
           row_index: "\0\0\0\0\0\0\0\x10\0\0\0 "
-  - - 00000000000000000012.index
+  - - 00000000000000000013.index
     - - HEADER:
           type: RUNINFO
         BODY:
@@ -285,36 +280,86 @@ result
         BODY:
           row_index_offset: <offset>
           offset: <offset>
-          size: 90
-          unpacked_size: 71
-          row_count: 3
+          size: 166
+          unpacked_size: 147
+          row_count: 6
           min_key: ['ёёё']
-  - - 00000000000000000012.run
+  - - 00000000000000000013.run
     - - HEADER:
           lsn: 11
           type: REPLACE
         BODY:
           tuple: ['ёёё', 123]
+          flags: 1
+      - HEADER:
+          lsn: 11
+          type: REPLACE
+        BODY:
+          tuple: ['ёёё', 123]
+          flags: 1
       - HEADER:
           lsn: 13
           type: REPLACE
         BODY:
           tuple: ['ююю', 789]
+          flags: 1
+      - HEADER:
+          lsn: 13
+          type: REPLACE
+        BODY:
+          tuple: ['ююю', 789]
+          flags: 1
+      - HEADER:
+          lsn: 12
+          type: REPLACE
+        BODY:
+          tuple: ['ЮЮЮ', 456]
+          flags: 1
       - HEADER:
           lsn: 12
           type: REPLACE
         BODY:
           tuple: ['ЮЮЮ', 456]
+          flags: 1
       - HEADER:
           type: ROWINDEX
         BODY:
-          row_index: "\0\0\0\0\0\0\0\x10\0\0\0\""
-  - - 00000000000000000006.index
+          row_index: "\0\0\0\0\0\0\0\x12\0\0\0$\0\0\08\0\0\0L\0\0\0`"
+  - - 00000000000000000004.index
     - - HEADER:
           type: RUNINFO
         BODY:
-          min_lsn: 8
-          max_key: [null, 'ЭЭЭ']
+          min_lsn: 5
+          max_key: [777, 'ЁЁЁ']
+          page_count: 1
+          bloom_filter: <bloom_filter>
+          max_lsn: 5
+          min_key: [777, 'ЁЁЁ']
+      - HEADER:
+          type: PAGEINFO
+        BODY:
+          row_index_offset: <offset>
+          offset: <offset>
+          size: 48
+          unpacked_size: 29
+          row_count: 1
+          min_key: [777, 'ЁЁЁ']
+  - - 00000000000000000004.run
+    - - HEADER:
+          lsn: 5
+          type: INSERT
+        BODY:
+          tuple: [777, 'ЁЁЁ']
+      - HEADER:
+          type: ROWINDEX
+        BODY:
+          row_index: "\0\0\0\0"
+  - - 00000000000000000007.index
+    - - HEADER:
+          type: RUNINFO
+        BODY:
+          min_lsn: 6
+          max_key: [777, 'ЁЁЁ']
           page_count: 1
           bloom_filter: <bloom_filter>
           max_lsn: 10
@@ -324,11 +369,11 @@ result
         BODY:
           row_index_offset: <offset>
           offset: <offset>
-          size: 86
-          unpacked_size: 67
-          row_count: 3
+          size: 110
+          unpacked_size: 91
+          row_count: 4
           min_key: [null, 'ёёё']
-  - - 00000000000000000006.run
+  - - 00000000000000000007.run
     - - HEADER:
           lsn: 10
           type: REPLACE
@@ -345,10 +390,16 @@ result
         BODY:
           tuple: [null, 'ЭЭЭ']
       - HEADER:
+          lsn: 6
+          type: DELETE
+        BODY:
+          key: [777, 'ЁЁЁ']
+          flags: 2
+      - HEADER:
           type: ROWINDEX
         BODY:
-          row_index: "\0\0\0\0\0\0\0\x10\0\0\0 "
-  - - 00000000000000000010.index
+          row_index: "\0\0\0\0\0\0\0\x10\0\0\0 \0\0\00"
+  - - 00000000000000000011.index
     - - HEADER:
           type: RUNINFO
         BODY:
@@ -357,24 +408,19 @@ result
           page_count: 1
           bloom_filter: <bloom_filter>
           max_lsn: 13
-          min_key: [null, 'ёёё']
+          min_key: [123, 'ёёё']
       - HEADER:
           type: PAGEINFO
         BODY:
           row_index_offset: <offset>
           offset: <offset>
-          size: 110
-          unpacked_size: 91
-          row_count: 4
-          min_key: [null, 'ёёё']
-  - - 00000000000000000010.run
+          size: 90
+          unpacked_size: 71
+          row_count: 3
+          min_key: [123, 'ёёё']
+  - - 00000000000000000011.run
     - - HEADER:
           lsn: 11
-          type: DELETE
-        BODY:
-          key: [null, 'ёёё']
-      - HEADER:
-          lsn: 11
           type: REPLACE
         BODY:
           tuple: [123, 'ёёё']
@@ -391,7 +437,7 @@ result
       - HEADER:
           type: ROWINDEX
         BODY:
-          row_index: "\0\0\0\0\0\0\0\x10\0\0\0 \0\0\02"
+          row_index: "\0\0\0\0\0\0\0\x10\0\0\0\""
 ...
 test_run:cmd("clear filter")
 ---
diff --git a/test/vinyl/tx_gap_lock.result b/test/vinyl/tx_gap_lock.result
index 150826cb..a456c017 100644
--- a/test/vinyl/tx_gap_lock.result
+++ b/test/vinyl/tx_gap_lock.result
@@ -1194,8 +1194,8 @@ s:drop()
 ---
 ...
 ----------------------------------------------------------------
--- gh-2534: Iterator over a secondary index doesn't double track
--- results in the primary index.
+-- Iterator over a secondary index tracks all results in the
+-- primary index. Needed for gh-2129.
 ----------------------------------------------------------------
 s = box.schema.space.create('test', {engine = 'vinyl'})
 ---
@@ -1219,23 +1219,23 @@ gap_lock_count() -- 0
 _ = s.index.sk:select({}, {limit = 50})
 ---
 ...
-gap_lock_count() -- 1
+gap_lock_count() -- 51
 ---
-- 1
+- 51
 ...
 for i = 1, 100 do s.index.sk:get(i) end
 ---
 ...
-gap_lock_count() -- 51
+gap_lock_count() -- 151
 ---
-- 51
+- 151
 ...
 _ = s.index.sk:select()
 ---
 ...
-gap_lock_count() -- 1
+gap_lock_count() -- 101
 ---
-- 1
+- 101
 ...
 box.commit()
 ---
diff --git a/test/vinyl/tx_gap_lock.test.lua b/test/vinyl/tx_gap_lock.test.lua
index 4d8d21d8..4ad55860 100644
--- a/test/vinyl/tx_gap_lock.test.lua
+++ b/test/vinyl/tx_gap_lock.test.lua
@@ -380,8 +380,8 @@ c4:commit()
 
 s:drop()
 ----------------------------------------------------------------
--- gh-2534: Iterator over a secondary index doesn't double track
--- results in the primary index.
+-- Iterator over a secondary index tracks all results in the
+-- primary index. Needed for gh-2129.
 ----------------------------------------------------------------
 s = box.schema.space.create('test', {engine = 'vinyl'})
 _ = s:create_index('pk', {parts = {1, 'unsigned'}})
@@ -390,11 +390,11 @@ for i = 1, 100 do s:insert{i, i} end
 box.begin()
 gap_lock_count() -- 0
 _ = s.index.sk:select({}, {limit = 50})
-gap_lock_count() -- 1
-for i = 1, 100 do s.index.sk:get(i) end
 gap_lock_count() -- 51
+for i = 1, 100 do s.index.sk:get(i) end
+gap_lock_count() -- 151
 _ = s.index.sk:select()
-gap_lock_count() -- 1
+gap_lock_count() -- 101
 box.commit()
 gap_lock_count() -- 0
 s:drop()
diff --git a/test/vinyl/write_iterator.result b/test/vinyl/write_iterator.result
index 162d8463..8ccd125a 100644
--- a/test/vinyl/write_iterator.result
+++ b/test/vinyl/write_iterator.result
@@ -741,6 +741,11 @@ space:drop()
 s = box.schema.space.create('test', {engine = 'vinyl'})
 ---
 ...
+-- Install on_replace trigger to disable REPLACE/DELETE
+-- optimization in the secondary index (gh-2129).
+_ = s:on_replace(function() end)
+---
+...
 pk = s:create_index('primary', {run_count_per_level = 1})
 ---
 ...
diff --git a/test/vinyl/write_iterator.test.lua b/test/vinyl/write_iterator.test.lua
index 9a6cc480..82d92649 100644
--- a/test/vinyl/write_iterator.test.lua
+++ b/test/vinyl/write_iterator.test.lua
@@ -317,6 +317,9 @@ space:drop()
 -- gh-2875 INSERT+DELETE pairs are annihilated on compaction
 
 s = box.schema.space.create('test', {engine = 'vinyl'})
+-- Install on_replace trigger to disable REPLACE/DELETE
+-- optimization in the secondary index (gh-2129).
+_ = s:on_replace(function() end)
 pk = s:create_index('primary', {run_count_per_level = 1})
 sk = s:create_index('secondary', {run_count_per_level = 1, parts = {2, 'unsigned'}})
 PAD1 = 100
-- 
2.11.0

  parent reply	other threads:[~2018-07-08 16:48 UTC|newest]

Thread overview: 65+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-07-08 16:48 [RFC PATCH 02/23] vinyl: always get full tuple from pk after reading from secondary index Vladimir Davydov
2018-07-08 16:48 ` [RFC PATCH 00/23] vinyl: eliminate read on REPLACE/DELETE Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 01/23] vinyl: do not turn REPLACE into INSERT when processing DML request Vladimir Davydov
2018-07-10 12:15     ` Konstantin Osipov
2018-07-10 12:19       ` Vladimir Davydov
2018-07-10 18:39         ` Konstantin Osipov
2018-07-11  7:57           ` Vladimir Davydov
2018-07-11 10:25             ` Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 03/23] vinyl: use vy_mem_iterator for point lookup Vladimir Davydov
2018-07-17 10:14     ` Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 04/23] vinyl: make point lookup always return the latest tuple version Vladimir Davydov
2018-07-10 16:19     ` Konstantin Osipov
2018-07-10 16:43       ` Vladimir Davydov
2018-07-11 16:33         ` Vladimir Davydov
2018-07-31 19:17           ` Konstantin Osipov
2018-07-08 16:48   ` [RFC PATCH 05/23] vinyl: fold vy_replace_one and vy_replace_impl Vladimir Davydov
2018-07-31 20:28     ` Konstantin Osipov
2018-07-08 16:48   ` [RFC PATCH 06/23] vinyl: fold vy_delete_impl Vladimir Davydov
2018-07-31 20:28     ` Konstantin Osipov
2018-07-08 16:48   ` [RFC PATCH 07/23] vinyl: refactor unique check Vladimir Davydov
2018-07-31 20:28     ` Konstantin Osipov
2018-07-08 16:48   ` [RFC PATCH 08/23] vinyl: check key uniqueness before modifying tx write set Vladimir Davydov
2018-07-31 20:34     ` Konstantin Osipov
2018-08-01 10:42       ` Vladimir Davydov
2018-08-09 20:26     ` Konstantin Osipov
2018-08-10  8:26       ` Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 09/23] vinyl: remove env argument of vy_check_is_unique_{primary,secondary} Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 10/23] vinyl: store full tuples in secondary index cache Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 11/23] xrow: allow to store flags in DML requests Vladimir Davydov
2018-07-31 20:36     ` Konstantin Osipov
2018-08-01 14:10       ` Vladimir Davydov
2018-08-17 13:34         ` Vladimir Davydov
2018-08-17 13:34           ` [PATCH 1/2] xrow: allow to store tuple metadata in request Vladimir Davydov
2018-08-17 13:34           ` [PATCH 2/2] vinyl: introduce statement flags Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 12/23] vinyl: do not pass region explicitly to write iterator functions Vladimir Davydov
2018-07-17 10:16     ` Vladimir Davydov
2018-07-31 20:38     ` Konstantin Osipov
2018-08-01 14:14       ` Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 13/23] vinyl: fix potential use-after-free in vy_read_view_merge Vladimir Davydov
2018-07-17 10:16     ` Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 14/23] test: unit/vy_write_iterator: minor refactoring Vladimir Davydov
2018-07-17 10:17     ` Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 15/23] vinyl: teach write iterator to return overwritten tuples Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 16/23] vinyl: allow to skip certain statements on read Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 17/23] vinyl: do not free pending tasks on shutdown Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 18/23] vinyl: store pointer to scheduler in struct vy_task Vladimir Davydov
2018-07-31 20:39     ` Konstantin Osipov
2018-07-08 16:48   ` [RFC PATCH 19/23] vinyl: rename some members of vy_scheduler and vy_task struct Vladimir Davydov
2018-07-31 20:40     ` Konstantin Osipov
2018-07-08 16:48   ` [RFC PATCH 20/23] vinyl: use cbus for communication between scheduler and worker threads Vladimir Davydov
2018-07-31 20:43     ` Konstantin Osipov
2018-08-01 14:26       ` Vladimir Davydov
2018-07-08 16:48   ` [RFC PATCH 21/23] vinyl: zap vy_scheduler::is_worker_pool_running Vladimir Davydov
2018-07-31 20:43     ` Konstantin Osipov
2018-07-08 16:48   ` [RFC PATCH 22/23] vinyl: rename vy_task::status to is_failed Vladimir Davydov
2018-07-31 20:44     ` Konstantin Osipov
2018-07-08 16:48   ` Vladimir Davydov [this message]
2018-07-13 10:53     ` [RFC PATCH 23/23] vinyl: eliminate read on REPLACE/DELETE Vladimir Davydov
2018-07-13 10:53       ` [PATCH 1/3] stailq: add stailq_insert function Vladimir Davydov
2018-07-15  7:02         ` Konstantin Osipov
2018-07-15 13:17           ` Vladimir Davydov
2018-07-15 18:40             ` Konstantin Osipov
2018-07-17 10:18         ` Vladimir Davydov
2018-07-13 10:53       ` [PATCH 2/3] vinyl: link all indexes of the same space Vladimir Davydov
2018-07-13 10:53       ` [PATCH 3/3] vinyl: generate deferred DELETEs on tx commit Vladimir Davydov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=22418dc475632b06aee6e8db562dc4467d6a0a31.1531065648.git.vdavydov.dev@gmail.com \
    --to=vdavydov.dev@gmail.com \
    --cc=kostja@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [RFC PATCH 23/23] vinyl: eliminate read on REPLACE/DELETE' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.