* [PATCH v2 1/2] memtx: do not use space_vtab::commit_alter for freeing tuples
2018-04-06 13:15 [PATCH v2 0/2] alter: fix WAL error handling Vladimir Davydov
@ 2018-04-06 13:15 ` Vladimir Davydov
2018-04-06 14:08 ` Konstantin Osipov
2018-04-06 13:15 ` [PATCH v2 2/2] alter: zap space_vtab::commit_alter Vladimir Davydov
1 sibling, 1 reply; 10+ messages in thread
From: Vladimir Davydov @ 2018-04-06 13:15 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
When the last index of a memtx space is dropped, we need to delete all
tuples stored in the space. We do it in space_vtab::commit_alter, but
this is wrong, because this function is not rolled back and so we may
get use-after-free error if we fail to write a DDL operation to WAL.
To avoid that, let's delete tuples in index_vtab::commit_drop, which is
called after WAL write.
There's a nuance here: index_vtab::commit_drop is called if an index is
rebuilt (because essentially it is drop + create) so we must elevate the
reference counter of every tuple added to the new index during rebuild
and, respectively, drop all the references in index_vtab::abort_create,
which is called if index creation is aborted for some reason. This also
means that now we iterate over all tuples twice when a primary key is
rebuilt - first to build the new index, then to unreference all tuples
stored in the old index. This is OK as we can make the last step
asynchronous, which will also speed up the more common case of space
drop.
Closes #3289
---
src/box/memtx_bitset.c | 4 +--
src/box/memtx_engine.c | 47 ++++++++++++++++++++++++++++++++++
src/box/memtx_engine.h | 10 ++++++++
src/box/memtx_hash.c | 4 +--
src/box/memtx_rtree.c | 4 +--
src/box/memtx_space.c | 45 +++++----------------------------
src/box/memtx_tree.c | 4 +--
test/box/errinj.result | 66 ++++++++++++++++++++++++++++++++++++++++++++++++
test/box/errinj.test.lua | 17 +++++++++++++
9 files changed, 155 insertions(+), 46 deletions(-)
diff --git a/src/box/memtx_bitset.c b/src/box/memtx_bitset.c
index e66247ee..f67405ec 100644
--- a/src/box/memtx_bitset.c
+++ b/src/box/memtx_bitset.c
@@ -457,9 +457,9 @@ memtx_bitset_index_count(struct index *base, enum iterator_type type,
static const struct index_vtab memtx_bitset_index_vtab = {
/* .destroy = */ memtx_bitset_index_destroy,
/* .commit_create = */ generic_index_commit_create,
- /* .abort_create = */ generic_index_abort_create,
+ /* .abort_create = */ memtx_index_abort_create,
/* .commit_modify = */ generic_index_commit_modify,
- /* .commit_drop = */ generic_index_commit_drop,
+ /* .commit_drop = */ memtx_index_commit_drop,
/* .update_def = */ generic_index_update_def,
/* .depends_on_pk = */ generic_index_depends_on_pk,
/* .size = */ memtx_bitset_index_size,
diff --git a/src/box/memtx_engine.c b/src/box/memtx_engine.c
index e60e34d0..f7e21a69 100644
--- a/src/box/memtx_engine.c
+++ b/src/box/memtx_engine.c
@@ -1062,3 +1062,50 @@ memtx_index_extent_reserve(int num)
}
return 0;
}
+
+static void
+memtx_index_free_tuples(struct index *index)
+{
+ if (index->def->iid > 0)
+ return;
+
+ /*
+ * Tuples stored in a memtx space are referenced by the
+ * primary index so when the primary index is dropped we
+ * should delete them.
+ */
+ struct iterator *it = index_create_iterator(index, ITER_ALL, NULL, 0);
+ if (it == NULL)
+ goto fail;
+ int rc;
+ struct tuple *tuple;
+ while ((rc = iterator_next(it, &tuple)) == 0 && tuple != NULL)
+ tuple_unref(tuple);
+ iterator_delete(it);
+ if (rc != 0)
+ goto fail;
+
+ return;
+fail:
+ /*
+ * This function is called after WAL write so we have no
+ * other choice but panic in case of any error. The good
+ * news is memtx iterators do not fail so we should not
+ * normally get here.
+ */
+ diag_log();
+ unreachable();
+ panic("failed to drop index");
+}
+
+void
+memtx_index_abort_create(struct index *index)
+{
+ memtx_index_free_tuples(index);
+}
+
+void
+memtx_index_commit_drop(struct index *index)
+{
+ memtx_index_free_tuples(index);
+}
diff --git a/src/box/memtx_engine.h b/src/box/memtx_engine.h
index b64c0ca3..e71045f0 100644
--- a/src/box/memtx_engine.h
+++ b/src/box/memtx_engine.h
@@ -42,6 +42,8 @@
extern "C" {
#endif /* defined(__cplusplus) */
+struct index;
+
/**
* The state of memtx recovery process.
* There is a global state of the entire engine state of each
@@ -149,6 +151,14 @@ memtx_index_extent_free(void *ctx, void *extent);
int
memtx_index_extent_reserve(int num);
+/*
+ * The following two methods are used by all kinds of memtx indexes
+ * to delete tuples stored in the space when the primary index is
+ * destroyed.
+ */
+void memtx_index_abort_create(struct index *index);
+void memtx_index_commit_drop(struct index *index);
+
#if defined(__cplusplus)
} /* extern "C" */
diff --git a/src/box/memtx_hash.c b/src/box/memtx_hash.c
index c4081edc..38027a3b 100644
--- a/src/box/memtx_hash.c
+++ b/src/box/memtx_hash.c
@@ -381,9 +381,9 @@ memtx_hash_index_create_snapshot_iterator(struct index *base)
static const struct index_vtab memtx_hash_index_vtab = {
/* .destroy = */ memtx_hash_index_destroy,
/* .commit_create = */ generic_index_commit_create,
- /* .abort_create = */ generic_index_abort_create,
+ /* .abort_create = */ memtx_index_abort_create,
/* .commit_modify = */ generic_index_commit_modify,
- /* .commit_drop = */ generic_index_commit_drop,
+ /* .commit_drop = */ memtx_index_commit_drop,
/* .update_def = */ memtx_hash_index_update_def,
/* .depends_on_pk = */ generic_index_depends_on_pk,
/* .size = */ memtx_hash_index_size,
diff --git a/src/box/memtx_rtree.c b/src/box/memtx_rtree.c
index ff213922..d0dceaea 100644
--- a/src/box/memtx_rtree.c
+++ b/src/box/memtx_rtree.c
@@ -288,9 +288,9 @@ memtx_rtree_index_create_iterator(struct index *base, enum iterator_type type,
static const struct index_vtab memtx_rtree_index_vtab = {
/* .destroy = */ memtx_rtree_index_destroy,
/* .commit_create = */ generic_index_commit_create,
- /* .abort_create = */ generic_index_abort_create,
+ /* .abort_create = */ memtx_index_abort_create,
/* .commit_modify = */ generic_index_commit_modify,
- /* .commit_drop = */ generic_index_commit_drop,
+ /* .commit_drop = */ memtx_index_commit_drop,
/* .update_def = */ generic_index_update_def,
/* .depends_on_pk = */ generic_index_depends_on_pk,
/* .size = */ memtx_rtree_index_size,
diff --git a/src/box/memtx_space.c b/src/box/memtx_space.c
index ec6f6db6..ebb54f05 100644
--- a/src/box/memtx_space.c
+++ b/src/box/memtx_space.c
@@ -803,41 +803,17 @@ memtx_space_build_secondary_key(struct space *old_space,
break;
assert(old_tuple == NULL); /* Guaranteed by DUP_INSERT. */
(void) old_tuple;
+ /*
+ * All tuples stored in a memtx space must be
+ * referenced by the primary index.
+ */
+ if (new_index->def->iid == 0)
+ tuple_ref(tuple);
}
iterator_delete(it);
return rc;
}
-static void
-memtx_space_prune(struct space *space)
-{
- struct index *index = space_index(space, 0);
- if (index == NULL)
- return;
-
- struct iterator *it = index_create_iterator(index, ITER_ALL, NULL, 0);
- if (it == NULL)
- goto fail;
- int rc;
- struct tuple *tuple;
- while ((rc = iterator_next(it, &tuple)) == 0 && tuple != NULL)
- tuple_unref(tuple);
- iterator_delete(it);
- if (rc == 0)
- return;
-fail:
- /*
- * This function is called from space_vtab::commit_alter()
- * or commit_truncate(), which do not tolerate failures,
- * so we have no other choice but panic here. Good news is
- * memtx iterators do not fail so we should not normally
- * get here.
- */
- diag_log();
- unreachable();
- panic("failed to prune space");
-}
-
static int
memtx_space_prepare_alter(struct space *old_space, struct space *new_space)
{
@@ -857,14 +833,7 @@ memtx_space_commit_alter(struct space *old_space, struct space *new_space)
struct memtx_space *new_memtx_space = (struct memtx_space *)new_space;
bool is_empty = new_space->index_count == 0 ||
index_size(new_space->index[0]) == 0;
-
- /*
- * Delete all tuples when the last index is dropped
- * or the space is truncated.
- */
- if (is_empty)
- memtx_space_prune(old_space);
- else
+ if (!is_empty)
new_memtx_space->bsize = old_memtx_space->bsize;
}
diff --git a/src/box/memtx_tree.c b/src/box/memtx_tree.c
index a06b590d..22177f57 100644
--- a/src/box/memtx_tree.c
+++ b/src/box/memtx_tree.c
@@ -590,9 +590,9 @@ memtx_tree_index_create_snapshot_iterator(struct index *base)
static const struct index_vtab memtx_tree_index_vtab = {
/* .destroy = */ memtx_tree_index_destroy,
/* .commit_create = */ generic_index_commit_create,
- /* .abort_create = */ generic_index_abort_create,
+ /* .abort_create = */ memtx_index_abort_create,
/* .commit_modify = */ generic_index_commit_modify,
- /* .commit_drop = */ generic_index_commit_drop,
+ /* .commit_drop = */ memtx_index_commit_drop,
/* .update_def = */ memtx_tree_index_update_def,
/* .depends_on_pk = */ memtx_tree_index_depends_on_pk,
/* .size = */ memtx_tree_index_size,
diff --git a/test/box/errinj.result b/test/box/errinj.result
index 8e4d5742..269de663 100644
--- a/test/box/errinj.result
+++ b/test/box/errinj.result
@@ -1093,3 +1093,69 @@ s:drop()
box.schema.user.revoke('guest', 'read,write,execute','universe')
---
...
+--
+-- gh-3289: drop/truncate leaves the space in inconsistent
+-- state if WAL write fails.
+--
+s = box.schema.space.create('test')
+---
+...
+_ = s:create_index('pk')
+---
+...
+for i = 1, 10 do s:replace{i} end
+---
+...
+errinj.set('ERRINJ_WAL_IO', true)
+---
+- ok
+...
+s:drop()
+---
+- error: Failed to write to disk
+...
+s:truncate()
+---
+- error: Failed to write to disk
+...
+s:drop()
+---
+- error: Failed to write to disk
+...
+s:truncate()
+---
+- error: Failed to write to disk
+...
+errinj.set('ERRINJ_WAL_IO', false)
+---
+- ok
+...
+for i = 1, 10 do s:replace{i + 10} end
+---
+...
+s:select()
+---
+- - [1]
+ - [2]
+ - [3]
+ - [4]
+ - [5]
+ - [6]
+ - [7]
+ - [8]
+ - [9]
+ - [10]
+ - [11]
+ - [12]
+ - [13]
+ - [14]
+ - [15]
+ - [16]
+ - [17]
+ - [18]
+ - [19]
+ - [20]
+...
+s:drop()
+---
+...
diff --git a/test/box/errinj.test.lua b/test/box/errinj.test.lua
index d97cd81f..2d4b210d 100644
--- a/test/box/errinj.test.lua
+++ b/test/box/errinj.test.lua
@@ -366,3 +366,20 @@ errinj.set("ERRINJ_IPROTO_TX_DELAY", false)
s:drop()
box.schema.user.revoke('guest', 'read,write,execute','universe')
+
+--
+-- gh-3289: drop/truncate leaves the space in inconsistent
+-- state if WAL write fails.
+--
+s = box.schema.space.create('test')
+_ = s:create_index('pk')
+for i = 1, 10 do s:replace{i} end
+errinj.set('ERRINJ_WAL_IO', true)
+s:drop()
+s:truncate()
+s:drop()
+s:truncate()
+errinj.set('ERRINJ_WAL_IO', false)
+for i = 1, 10 do s:replace{i + 10} end
+s:select()
+s:drop()
--
2.11.0
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH v2 2/2] alter: zap space_vtab::commit_alter
2018-04-06 13:15 [PATCH v2 0/2] alter: fix WAL error handling Vladimir Davydov
2018-04-06 13:15 ` [PATCH v2 1/2] memtx: do not use space_vtab::commit_alter for freeing tuples Vladimir Davydov
@ 2018-04-06 13:15 ` Vladimir Davydov
2018-04-06 14:10 ` Konstantin Osipov
1 sibling, 1 reply; 10+ messages in thread
From: Vladimir Davydov @ 2018-04-06 13:15 UTC (permalink / raw)
To: kostja; +Cc: tarantool-patches
space_vtab::commit_alter is implemented only by memtx, which uses it
to set bsize for a new space. However, it isn't necessary to use
commit_alter for this - instead we can set bsize in prepare_alter and
reset it in drop_primary_key, along with memtx_space::replace. Let's
do it and zap space_vtab::commit_alter altogether, because the callback
is confusing: judging by its name it should be called after WAL write,
but it is called before.
---
src/box/alter.cc | 21 ++-------------------
src/box/memtx_space.c | 21 +++++++++------------
src/box/space.h | 15 ---------------
src/box/sysview_engine.c | 8 --------
src/box/vinyl.c | 8 --------
5 files changed, 11 insertions(+), 62 deletions(-)
diff --git a/src/box/alter.cc b/src/box/alter.cc
index 174d53fa..15805b9b 100644
--- a/src/box/alter.cc
+++ b/src/box/alter.cc
@@ -864,8 +864,6 @@ alter_space_do(struct txn *txn, struct alter_space *alter)
* The new space is ready. Time to update the space
* cache with it.
*/
- space_commit_alter(alter->old_space, alter->new_space);
-
struct space *old_space = space_cache_replace(alter->new_space);
(void) old_space;
assert(old_space == alter->old_space);
@@ -1026,23 +1024,8 @@ DropIndex::alter_def(struct alter_space * /* alter */)
void
DropIndex::alter(struct alter_space *alter)
{
- /*
- * If it's not the primary key, nothing to do --
- * the dropped index didn't exist in the new space
- * definition, so does not exist in the created space.
- */
- if (space_index(alter->new_space, 0) != NULL)
- return;
- /*
- * OK to drop the primary key. Inform the engine about it,
- * since it may have to reset handler->replace function,
- * so that:
- * - DML returns proper errors rather than crashes the
- * program
- * - when a new primary key is finally added, the space
- * can be put back online properly.
- */
- space_drop_primary_key(alter->new_space);
+ if (old_index_def->iid == 0)
+ space_drop_primary_key(alter->new_space);
}
void
diff --git a/src/box/memtx_space.c b/src/box/memtx_space.c
index ebb54f05..b3e08e49 100644
--- a/src/box/memtx_space.c
+++ b/src/box/memtx_space.c
@@ -735,7 +735,15 @@ static void
memtx_space_drop_primary_key(struct space *space)
{
struct memtx_space *memtx_space = (struct memtx_space *)space;
+ /*
+ * Reset 'replace' callback so that:
+ * - DML returns proper errors rather than crashes the
+ * program.
+ * - When a new primary key is finally added, the space
+ * can be put back online properly.
+ */
memtx_space->replace = memtx_space_replace_no_keys;
+ memtx_space->bsize = 0;
}
static void
@@ -820,23 +828,13 @@ memtx_space_prepare_alter(struct space *old_space, struct space *new_space)
struct memtx_space *old_memtx_space = (struct memtx_space *)old_space;
struct memtx_space *new_memtx_space = (struct memtx_space *)new_space;
new_memtx_space->replace = old_memtx_space->replace;
+ new_memtx_space->bsize = old_memtx_space->bsize;
bool is_empty = old_space->index_count == 0 ||
index_size(old_space->index[0]) == 0;
return space_def_check_compatibility(old_space->def,
new_space->def, is_empty);
}
-static void
-memtx_space_commit_alter(struct space *old_space, struct space *new_space)
-{
- struct memtx_space *old_memtx_space = (struct memtx_space *)old_space;
- struct memtx_space *new_memtx_space = (struct memtx_space *)new_space;
- bool is_empty = new_space->index_count == 0 ||
- index_size(new_space->index[0]) == 0;
- if (!is_empty)
- new_memtx_space->bsize = old_memtx_space->bsize;
-}
-
/* }}} DDL */
static const struct space_vtab memtx_space_vtab = {
@@ -856,7 +854,6 @@ static const struct space_vtab memtx_space_vtab = {
/* .build_secondary_key = */ memtx_space_build_secondary_key,
/* .swap_index = */ generic_space_swap_index,
/* .prepare_alter = */ memtx_space_prepare_alter,
- /* .commit_alter = */ memtx_space_commit_alter,
};
struct space *
diff --git a/src/box/space.h b/src/box/space.h
index cf1be1e9..758d575e 100644
--- a/src/box/space.h
+++ b/src/box/space.h
@@ -116,14 +116,6 @@ struct space_vtab {
*/
int (*prepare_alter)(struct space *old_space,
struct space *new_space);
- /**
- * Notify the engine engine after altering a space and
- * replacing old_space with new_space in the space cache,
- * to, e.g., update all references to struct space
- * and replace old_space with new_space.
- */
- void (*commit_alter)(struct space *old_space,
- struct space *new_space);
};
struct space {
@@ -361,13 +353,6 @@ space_prepare_alter(struct space *old_space, struct space *new_space)
return new_space->vtab->prepare_alter(old_space, new_space);
}
-static inline void
-space_commit_alter(struct space *old_space, struct space *new_space)
-{
- assert(old_space->vtab == new_space->vtab);
- new_space->vtab->commit_alter(old_space, new_space);
-}
-
static inline bool
space_is_memtx(struct space *space) { return space->engine->id == 0; }
diff --git a/src/box/sysview_engine.c b/src/box/sysview_engine.c
index fd4ebe5b..d28430aa 100644
--- a/src/box/sysview_engine.c
+++ b/src/box/sysview_engine.c
@@ -154,13 +154,6 @@ sysview_space_prepare_alter(struct space *old_space, struct space *new_space)
return 0;
}
-static void
-sysview_space_commit_alter(struct space *old_space, struct space *new_space)
-{
- (void)old_space;
- (void)new_space;
-}
-
static int
sysview_space_check_format(struct space *new_space, struct space *old_space)
{
@@ -187,7 +180,6 @@ static const struct space_vtab sysview_space_vtab = {
/* .build_secondary_key = */ sysview_space_build_secondary_key,
/* .swap_index = */ generic_space_swap_index,
/* .prepare_alter = */ sysview_space_prepare_alter,
- /* .commit_alter = */ sysview_space_commit_alter,
};
static void
diff --git a/src/box/vinyl.c b/src/box/vinyl.c
index 83df2b81..e809f6a8 100644
--- a/src/box/vinyl.c
+++ b/src/box/vinyl.c
@@ -1053,13 +1053,6 @@ vinyl_space_check_format(struct space *new_space, struct space *old_space)
}
static void
-vinyl_space_commit_alter(struct space *old_space, struct space *new_space)
-{
- (void)old_space;
- (void)new_space;
-}
-
-static void
vinyl_space_swap_index(struct space *old_space, struct space *new_space,
uint32_t old_index_id, uint32_t new_index_id)
{
@@ -3941,7 +3934,6 @@ static const struct space_vtab vinyl_space_vtab = {
/* .build_secondary_key = */ vinyl_space_build_secondary_key,
/* .swap_index = */ vinyl_space_swap_index,
/* .prepare_alter = */ vinyl_space_prepare_alter,
- /* .commit_alter = */ vinyl_space_commit_alter,
};
static const struct index_vtab vinyl_index_vtab = {
--
2.11.0
^ permalink raw reply [flat|nested] 10+ messages in thread