Tarantool development patches archive
 help / color / mirror / Atom feed
From: Vladimir Davydov <vdavydov.dev@gmail.com>
To: kostja@tarantool.org
Cc: tarantool-patches@freelists.org
Subject: [PATCH v2 1/2] memtx: do not use space_vtab::commit_alter for freeing tuples
Date: Fri,  6 Apr 2018 16:15:31 +0300	[thread overview]
Message-ID: <92f5581b8d5b981abb14d29e411b03fad02e2607.1523019950.git.vdavydov.dev@gmail.com> (raw)
In-Reply-To: <cover.1523019950.git.vdavydov.dev@gmail.com>
In-Reply-To: <cover.1523019950.git.vdavydov.dev@gmail.com>

When the last index of a memtx space is dropped, we need to delete all
tuples stored in the space. We do it in space_vtab::commit_alter, but
this is wrong, because this function is not rolled back and so we may
get use-after-free error if we fail to write a DDL operation to WAL.
To avoid that, let's delete tuples in index_vtab::commit_drop, which is
called after WAL write.

There's a nuance here: index_vtab::commit_drop is called if an index is
rebuilt (because essentially it is drop + create) so we must elevate the
reference counter of every tuple added to the new index during rebuild
and, respectively, drop all the references in index_vtab::abort_create,
which is called if index creation is aborted for some reason. This also
means that now we iterate over all tuples twice when a primary key is
rebuilt - first to build the new index, then to unreference all tuples
stored in the old index. This is OK as we can make the last step
asynchronous, which will also speed up the more common case of space
drop.

Closes #3289
---
 src/box/memtx_bitset.c   |  4 +--
 src/box/memtx_engine.c   | 47 ++++++++++++++++++++++++++++++++++
 src/box/memtx_engine.h   | 10 ++++++++
 src/box/memtx_hash.c     |  4 +--
 src/box/memtx_rtree.c    |  4 +--
 src/box/memtx_space.c    | 45 +++++----------------------------
 src/box/memtx_tree.c     |  4 +--
 test/box/errinj.result   | 66 ++++++++++++++++++++++++++++++++++++++++++++++++
 test/box/errinj.test.lua | 17 +++++++++++++
 9 files changed, 155 insertions(+), 46 deletions(-)

diff --git a/src/box/memtx_bitset.c b/src/box/memtx_bitset.c
index e66247ee..f67405ec 100644
--- a/src/box/memtx_bitset.c
+++ b/src/box/memtx_bitset.c
@@ -457,9 +457,9 @@ memtx_bitset_index_count(struct index *base, enum iterator_type type,
 static const struct index_vtab memtx_bitset_index_vtab = {
 	/* .destroy = */ memtx_bitset_index_destroy,
 	/* .commit_create = */ generic_index_commit_create,
-	/* .abort_create = */ generic_index_abort_create,
+	/* .abort_create = */ memtx_index_abort_create,
 	/* .commit_modify = */ generic_index_commit_modify,
-	/* .commit_drop = */ generic_index_commit_drop,
+	/* .commit_drop = */ memtx_index_commit_drop,
 	/* .update_def = */ generic_index_update_def,
 	/* .depends_on_pk = */ generic_index_depends_on_pk,
 	/* .size = */ memtx_bitset_index_size,
diff --git a/src/box/memtx_engine.c b/src/box/memtx_engine.c
index e60e34d0..f7e21a69 100644
--- a/src/box/memtx_engine.c
+++ b/src/box/memtx_engine.c
@@ -1062,3 +1062,50 @@ memtx_index_extent_reserve(int num)
 	}
 	return 0;
 }
+
+static void
+memtx_index_free_tuples(struct index *index)
+{
+	if (index->def->iid > 0)
+		return;
+
+	/*
+	 * Tuples stored in a memtx space are referenced by the
+	 * primary index so when the primary index is dropped we
+	 * should delete them.
+	 */
+	struct iterator *it = index_create_iterator(index, ITER_ALL, NULL, 0);
+	if (it == NULL)
+		goto fail;
+	int rc;
+	struct tuple *tuple;
+	while ((rc = iterator_next(it, &tuple)) == 0 && tuple != NULL)
+		tuple_unref(tuple);
+	iterator_delete(it);
+	if (rc != 0)
+		goto fail;
+
+	return;
+fail:
+	/*
+	 * This function is called after WAL write so we have no
+	 * other choice but panic in case of any error. The good
+	 * news is memtx iterators do not fail so we should not
+	 * normally get here.
+	 */
+	diag_log();
+	unreachable();
+	panic("failed to drop index");
+}
+
+void
+memtx_index_abort_create(struct index *index)
+{
+	memtx_index_free_tuples(index);
+}
+
+void
+memtx_index_commit_drop(struct index *index)
+{
+	memtx_index_free_tuples(index);
+}
diff --git a/src/box/memtx_engine.h b/src/box/memtx_engine.h
index b64c0ca3..e71045f0 100644
--- a/src/box/memtx_engine.h
+++ b/src/box/memtx_engine.h
@@ -42,6 +42,8 @@
 extern "C" {
 #endif /* defined(__cplusplus) */
 
+struct index;
+
 /**
  * The state of memtx recovery process.
  * There is a global state of the entire engine state of each
@@ -149,6 +151,14 @@ memtx_index_extent_free(void *ctx, void *extent);
 int
 memtx_index_extent_reserve(int num);
 
+/*
+ * The following two methods are used by all kinds of memtx indexes
+ * to delete tuples stored in the space when the primary index is
+ * destroyed.
+ */
+void memtx_index_abort_create(struct index *index);
+void memtx_index_commit_drop(struct index *index);
+
 #if defined(__cplusplus)
 } /* extern "C" */
 
diff --git a/src/box/memtx_hash.c b/src/box/memtx_hash.c
index c4081edc..38027a3b 100644
--- a/src/box/memtx_hash.c
+++ b/src/box/memtx_hash.c
@@ -381,9 +381,9 @@ memtx_hash_index_create_snapshot_iterator(struct index *base)
 static const struct index_vtab memtx_hash_index_vtab = {
 	/* .destroy = */ memtx_hash_index_destroy,
 	/* .commit_create = */ generic_index_commit_create,
-	/* .abort_create = */ generic_index_abort_create,
+	/* .abort_create = */ memtx_index_abort_create,
 	/* .commit_modify = */ generic_index_commit_modify,
-	/* .commit_drop = */ generic_index_commit_drop,
+	/* .commit_drop = */ memtx_index_commit_drop,
 	/* .update_def = */ memtx_hash_index_update_def,
 	/* .depends_on_pk = */ generic_index_depends_on_pk,
 	/* .size = */ memtx_hash_index_size,
diff --git a/src/box/memtx_rtree.c b/src/box/memtx_rtree.c
index ff213922..d0dceaea 100644
--- a/src/box/memtx_rtree.c
+++ b/src/box/memtx_rtree.c
@@ -288,9 +288,9 @@ memtx_rtree_index_create_iterator(struct index *base,  enum iterator_type type,
 static const struct index_vtab memtx_rtree_index_vtab = {
 	/* .destroy = */ memtx_rtree_index_destroy,
 	/* .commit_create = */ generic_index_commit_create,
-	/* .abort_create = */ generic_index_abort_create,
+	/* .abort_create = */ memtx_index_abort_create,
 	/* .commit_modify = */ generic_index_commit_modify,
-	/* .commit_drop = */ generic_index_commit_drop,
+	/* .commit_drop = */ memtx_index_commit_drop,
 	/* .update_def = */ generic_index_update_def,
 	/* .depends_on_pk = */ generic_index_depends_on_pk,
 	/* .size = */ memtx_rtree_index_size,
diff --git a/src/box/memtx_space.c b/src/box/memtx_space.c
index ec6f6db6..ebb54f05 100644
--- a/src/box/memtx_space.c
+++ b/src/box/memtx_space.c
@@ -803,41 +803,17 @@ memtx_space_build_secondary_key(struct space *old_space,
 			break;
 		assert(old_tuple == NULL); /* Guaranteed by DUP_INSERT. */
 		(void) old_tuple;
+		/*
+		 * All tuples stored in a memtx space must be
+		 * referenced by the primary index.
+		 */
+		if (new_index->def->iid == 0)
+			tuple_ref(tuple);
 	}
 	iterator_delete(it);
 	return rc;
 }
 
-static void
-memtx_space_prune(struct space *space)
-{
-	struct index *index = space_index(space, 0);
-	if (index == NULL)
-		return;
-
-	struct iterator *it = index_create_iterator(index, ITER_ALL, NULL, 0);
-	if (it == NULL)
-		goto fail;
-	int rc;
-	struct tuple *tuple;
-	while ((rc = iterator_next(it, &tuple)) == 0 && tuple != NULL)
-		tuple_unref(tuple);
-	iterator_delete(it);
-	if (rc == 0)
-		return;
-fail:
-	/*
-	 * This function is called from space_vtab::commit_alter()
-	 * or commit_truncate(), which do not tolerate failures,
-	 * so we have no other choice but panic here. Good news is
-	 * memtx iterators do not fail so we should not normally
-	 * get here.
-	 */
-	diag_log();
-	unreachable();
-	panic("failed to prune space");
-}
-
 static int
 memtx_space_prepare_alter(struct space *old_space, struct space *new_space)
 {
@@ -857,14 +833,7 @@ memtx_space_commit_alter(struct space *old_space, struct space *new_space)
 	struct memtx_space *new_memtx_space = (struct memtx_space *)new_space;
 	bool is_empty = new_space->index_count == 0 ||
 			index_size(new_space->index[0]) == 0;
-
-	/*
-	 * Delete all tuples when the last index is dropped
-	 * or the space is truncated.
-	 */
-	if (is_empty)
-		memtx_space_prune(old_space);
-	else
+	if (!is_empty)
 		new_memtx_space->bsize = old_memtx_space->bsize;
 }
 
diff --git a/src/box/memtx_tree.c b/src/box/memtx_tree.c
index a06b590d..22177f57 100644
--- a/src/box/memtx_tree.c
+++ b/src/box/memtx_tree.c
@@ -590,9 +590,9 @@ memtx_tree_index_create_snapshot_iterator(struct index *base)
 static const struct index_vtab memtx_tree_index_vtab = {
 	/* .destroy = */ memtx_tree_index_destroy,
 	/* .commit_create = */ generic_index_commit_create,
-	/* .abort_create = */ generic_index_abort_create,
+	/* .abort_create = */ memtx_index_abort_create,
 	/* .commit_modify = */ generic_index_commit_modify,
-	/* .commit_drop = */ generic_index_commit_drop,
+	/* .commit_drop = */ memtx_index_commit_drop,
 	/* .update_def = */ memtx_tree_index_update_def,
 	/* .depends_on_pk = */ memtx_tree_index_depends_on_pk,
 	/* .size = */ memtx_tree_index_size,
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 8e4d5742..269de663 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -1093,3 +1093,69 @@ s:drop()
 box.schema.user.revoke('guest', 'read,write,execute','universe')
 ---
 ...
+--
+-- gh-3289: drop/truncate leaves the space in inconsistent
+-- state if WAL write fails.
+--
+s = box.schema.space.create('test')
+---
+...
+_ = s:create_index('pk')
+---
+...
+for i = 1, 10 do s:replace{i} end
+---
+...
+errinj.set('ERRINJ_WAL_IO', true)
+---
+- ok
+...
+s:drop()
+---
+- error: Failed to write to disk
+...
+s:truncate()
+---
+- error: Failed to write to disk
+...
+s:drop()
+---
+- error: Failed to write to disk
+...
+s:truncate()
+---
+- error: Failed to write to disk
+...
+errinj.set('ERRINJ_WAL_IO', false)
+---
+- ok
+...
+for i = 1, 10 do s:replace{i + 10} end
+---
+...
+s:select()
+---
+- - [1]
+  - [2]
+  - [3]
+  - [4]
+  - [5]
+  - [6]
+  - [7]
+  - [8]
+  - [9]
+  - [10]
+  - [11]
+  - [12]
+  - [13]
+  - [14]
+  - [15]
+  - [16]
+  - [17]
+  - [18]
+  - [19]
+  - [20]
+...
+s:drop()
+---
+...
diff --git a/test/box/errinj.test.lua b/test/box/errinj.test.lua
index d97cd81f..2d4b210d 100644
--- a/test/box/errinj.test.lua
+++ b/test/box/errinj.test.lua
@@ -366,3 +366,20 @@ errinj.set("ERRINJ_IPROTO_TX_DELAY", false)
 
 s:drop()
 box.schema.user.revoke('guest', 'read,write,execute','universe')
+
+--
+-- gh-3289: drop/truncate leaves the space in inconsistent
+-- state if WAL write fails.
+--
+s = box.schema.space.create('test')
+_ = s:create_index('pk')
+for i = 1, 10 do s:replace{i} end
+errinj.set('ERRINJ_WAL_IO', true)
+s:drop()
+s:truncate()
+s:drop()
+s:truncate()
+errinj.set('ERRINJ_WAL_IO', false)
+for i = 1, 10 do s:replace{i + 10} end
+s:select()
+s:drop()
-- 
2.11.0

  reply	other threads:[~2018-04-06 13:15 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-04-06 13:15 [PATCH v2 0/2] alter: fix WAL error handling Vladimir Davydov
2018-04-06 13:15 ` Vladimir Davydov [this message]
2018-04-06 14:08   ` [PATCH v2 1/2] memtx: do not use space_vtab::commit_alter for freeing tuples Konstantin Osipov
2018-04-06 14:19     ` Vladimir Davydov
2018-04-06 15:29       ` Vladimir Davydov
2018-04-06 13:15 ` [PATCH v2 2/2] alter: zap space_vtab::commit_alter Vladimir Davydov
2018-04-06 14:10   ` Konstantin Osipov
2018-04-06 14:23     ` Vladimir Davydov
2018-04-06 14:28       ` Konstantin Osipov
2018-04-06 15:25         ` Vladimir Davydov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=92f5581b8d5b981abb14d29e411b03fad02e2607.1523019950.git.vdavydov.dev@gmail.com \
    --to=vdavydov.dev@gmail.com \
    --cc=kostja@tarantool.org \
    --cc=tarantool-patches@freelists.org \
    --subject='Re: [PATCH v2 1/2] memtx: do not use space_vtab::commit_alter for freeing tuples' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox