* [Tarantool-patches] [PATCH 1/4] box/txn: fix void args mess
2020-02-17 15:59 [Tarantool-patches] [PATCH 0/4] box/txn: fix nil dereference on txn error path Cyrill Gorcunov
@ 2020-02-17 15:59 ` Cyrill Gorcunov
2020-02-17 17:13 ` Konstantin Osipov
2020-02-17 15:59 ` [Tarantool-patches] [PATCH 2/4] box/journal: sanitize completion naming Cyrill Gorcunov
` (2 subsequent siblings)
3 siblings, 1 reply; 15+ messages in thread
From: Cyrill Gorcunov @ 2020-02-17 15:59 UTC (permalink / raw)
To: tml
Using void explicitly in functions which take
no arguments allows to optimize code a bit and
don't assume if there might be variable args.
Moreover in commit e070cc4d9 we dropped arguments
from txn_begin but didn't update vy_scheduler.c.
The compiler didn't complain because it assumed
there are vargs.
Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
---
src/box/txn.c | 14 +++++++-------
src/box/txn.h | 4 ++--
src/box/vy_scheduler.c | 2 +-
3 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/src/box/txn.c b/src/box/txn.c
index 30ec586ba..5dadf4985 100644
--- a/src/box/txn.c
+++ b/src/box/txn.c
@@ -173,7 +173,7 @@ txn_rollback_to_svp(struct txn *txn, struct stailq_entry *svp)
* Return a txn from cache or create a new one if cache is empty.
*/
inline static struct txn *
-txn_new()
+txn_new(void)
{
if (!stailq_empty(&txn_cache))
return stailq_shift_entry(&txn_cache, struct txn, in_txn_cache);
@@ -209,7 +209,7 @@ txn_free(struct txn *txn)
}
struct txn *
-txn_begin()
+txn_begin(void)
{
static int64_t tsn = 0;
assert(! in_txn());
@@ -670,13 +670,13 @@ box_txn_id(void)
}
bool
-box_txn()
+box_txn(void)
{
return in_txn() != NULL;
}
int
-box_txn_begin()
+box_txn_begin(void)
{
if (in_txn()) {
diag_set(ClientError, ER_ACTIVE_TRANSACTION);
@@ -688,7 +688,7 @@ box_txn_begin()
}
int
-box_txn_commit()
+box_txn_commit(void)
{
struct txn *txn = in_txn();
/**
@@ -709,7 +709,7 @@ box_txn_commit()
}
int
-box_txn_rollback()
+box_txn_rollback(void)
{
struct txn *txn = in_txn();
if (txn == NULL)
@@ -788,7 +788,7 @@ txn_savepoint_by_name(struct txn *txn, const char *name)
}
box_txn_savepoint_t *
-box_txn_savepoint()
+box_txn_savepoint(void)
{
struct txn *txn = in_txn();
if (txn == NULL) {
diff --git a/src/box/txn.h b/src/box/txn.h
index 97d7b5de5..ae2c3a58f 100644
--- a/src/box/txn.h
+++ b/src/box/txn.h
@@ -251,7 +251,7 @@ txn_clear_flag(struct txn *txn, enum txn_flag flag)
/* Pointer to the current transaction (if any) */
static inline struct txn *
-in_txn()
+in_txn(void)
{
return fiber()->storage.txn;
}
@@ -261,7 +261,7 @@ in_txn()
* @pre no transaction is active
*/
struct txn *
-txn_begin();
+txn_begin(void);
/**
* Commit a transaction.
diff --git a/src/box/vy_scheduler.c b/src/box/vy_scheduler.c
index 86bed8013..acc909d09 100644
--- a/src/box/vy_scheduler.c
+++ b/src/box/vy_scheduler.c
@@ -895,7 +895,7 @@ vy_deferred_delete_batch_process_f(struct cmsg *cmsg)
deferred_delete_space = space_by_id(BOX_VINYL_DEFERRED_DELETE_ID);
assert(deferred_delete_space != NULL);
- struct txn *txn = txn_begin(false);
+ struct txn *txn = txn_begin();
if (txn == NULL)
goto fail;
--
2.20.1
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Tarantool-patches] [PATCH 2/4] box/journal: sanitize completion naming
2020-02-17 15:59 [Tarantool-patches] [PATCH 0/4] box/txn: fix nil dereference on txn error path Cyrill Gorcunov
2020-02-17 15:59 ` [Tarantool-patches] [PATCH 1/4] box/txn: fix void args mess Cyrill Gorcunov
@ 2020-02-17 15:59 ` Cyrill Gorcunov
2020-02-17 17:14 ` Konstantin Osipov
2020-02-17 15:59 ` [Tarantool-patches] [PATCH 3/4] box/txn: rename txn_entry_done_cb to txn_entry_complete_cb Cyrill Gorcunov
2020-02-17 15:59 ` [Tarantool-patches] [PATCH 4/4] box/txn: fix nil dereference in txn_rollback Cyrill Gorcunov
3 siblings, 1 reply; 15+ messages in thread
From: Cyrill Gorcunov @ 2020-02-17 15:59 UTC (permalink / raw)
To: tml
Instead of on_done use on_complete prefix
since done it too general while we're trying
to complete write procedue. Also it is more
consistent with txn_complete name.
Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
---
src/box/journal.c | 8 ++++----
src/box/journal.h | 12 ++++++------
2 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/src/box/journal.c b/src/box/journal.c
index d762613dd..c9c0ac475 100644
--- a/src/box/journal.c
+++ b/src/box/journal.c
@@ -54,8 +54,8 @@ struct journal *current_journal = &dummy_journal;
struct journal_entry *
journal_entry_new(size_t n_rows, struct region *region,
- journal_entry_done_cb on_done_cb,
- void *on_done_cb_data)
+ journal_entry_complete_cb on_complete_cb,
+ void *on_complete_cb_data)
{
struct journal_entry *entry;
@@ -71,8 +71,8 @@ journal_entry_new(size_t n_rows, struct region *region,
entry->approx_len = 0;
entry->n_rows = n_rows;
entry->res = -1;
- entry->on_done_cb = on_done_cb;
- entry->on_done_cb_data = on_done_cb_data;
+ entry->on_complete_cb = on_complete_cb;
+ entry->on_complete_cb_data = on_complete_cb_data;
return entry;
}
diff --git a/src/box/journal.h b/src/box/journal.h
index 236058bb1..071618496 100644
--- a/src/box/journal.h
+++ b/src/box/journal.h
@@ -43,7 +43,7 @@ struct xrow_header;
struct journal_entry;
/** Journal entry finalization callback typedef. */
-typedef void (*journal_entry_done_cb)(struct journal_entry *entry, void *data);
+typedef void (*journal_entry_complete_cb)(struct journal_entry *entry, void *data);
/**
* An entry for an abstract journal.
@@ -66,11 +66,11 @@ struct journal_entry {
* or fail. Entry->res is set to a result value before the callback
* is fired.
*/
- journal_entry_done_cb on_done_cb;
+ journal_entry_complete_cb on_complete_cb;
/**
* A journal entry completion callback argument.
*/
- void *on_done_cb_data;
+ void *on_complete_cb_data;
/**
* Approximate size of this request when encoded.
*/
@@ -94,8 +94,8 @@ struct region;
*/
struct journal_entry *
journal_entry_new(size_t n_rows, struct region *region,
- journal_entry_done_cb on_done_cb,
- void *on_done_cb_data);
+ journal_entry_complete_cb on_complete_cb,
+ void *on_complete_cb_data);
/**
* Finalize a single entry.
@@ -103,7 +103,7 @@ journal_entry_new(size_t n_rows, struct region *region,
static inline void
journal_entry_complete(struct journal_entry *entry)
{
- entry->on_done_cb(entry, entry->on_done_cb_data);
+ entry->on_complete_cb(entry, entry->on_complete_cb_data);
}
/**
--
2.20.1
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Tarantool-patches] [PATCH 3/4] box/txn: rename txn_entry_done_cb to txn_entry_complete_cb
2020-02-17 15:59 [Tarantool-patches] [PATCH 0/4] box/txn: fix nil dereference on txn error path Cyrill Gorcunov
2020-02-17 15:59 ` [Tarantool-patches] [PATCH 1/4] box/txn: fix void args mess Cyrill Gorcunov
2020-02-17 15:59 ` [Tarantool-patches] [PATCH 2/4] box/journal: sanitize completion naming Cyrill Gorcunov
@ 2020-02-17 15:59 ` Cyrill Gorcunov
2020-02-17 17:14 ` Konstantin Osipov
2020-02-17 15:59 ` [Tarantool-patches] [PATCH 4/4] box/txn: fix nil dereference in txn_rollback Cyrill Gorcunov
3 siblings, 1 reply; 15+ messages in thread
From: Cyrill Gorcunov @ 2020-02-17 15:59 UTC (permalink / raw)
To: tml
To look similar to txn_complete.
Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
---
src/box/txn.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/box/txn.c b/src/box/txn.c
index 5dadf4985..a4ca48224 100644
--- a/src/box/txn.c
+++ b/src/box/txn.c
@@ -469,7 +469,7 @@ txn_complete(struct txn *txn)
}
static void
-txn_entry_done_cb(struct journal_entry *entry, void *data)
+txn_entry_complete_cb(struct journal_entry *entry, void *data)
{
struct txn *txn = data;
txn->signature = entry->res;
@@ -493,7 +493,7 @@ txn_write_to_wal(struct txn *txn)
struct journal_entry *req = journal_entry_new(txn->n_new_rows +
txn->n_applier_rows,
&txn->region,
- txn_entry_done_cb,
+ txn_entry_complete_cb,
txn);
if (req == NULL) {
txn_rollback(txn);
--
2.20.1
^ permalink raw reply [flat|nested] 15+ messages in thread
* [Tarantool-patches] [PATCH 4/4] box/txn: fix nil dereference in txn_rollback
2020-02-17 15:59 [Tarantool-patches] [PATCH 0/4] box/txn: fix nil dereference on txn error path Cyrill Gorcunov
` (2 preceding siblings ...)
2020-02-17 15:59 ` [Tarantool-patches] [PATCH 3/4] box/txn: rename txn_entry_done_cb to txn_entry_complete_cb Cyrill Gorcunov
@ 2020-02-17 15:59 ` Cyrill Gorcunov
2020-02-17 17:19 ` Konstantin Osipov
2020-02-17 17:25 ` Georgy Kirichenko
3 siblings, 2 replies; 15+ messages in thread
From: Cyrill Gorcunov @ 2020-02-17 15:59 UTC (permalink / raw)
To: tml
In case if we get failed in allocation of new
journal entry the txn_rollback will try to
derefernce nil pointer
| txn_write
| fiber_set_txn(fiber(), NULL); // zap fiber's storage.txn
| txn_write_to_wal(txn);
| journal_entry_new(..., txn_entry_done_cb, ...)
| if (req == NULL)
| txn_rollback(txn);
| assert(txn == in_txn()); // in_txn()=nil, triggers
This is because there are two call site:
- when transaction is complete the wal engine will
call txn_entry_complete_cb completion handler and
since it is async in terms of threads (wal is a
separate thread) it setup the txn it processes
into a fiber's storage thus it expects the current
storage is nil
- in turn error may happen and we need to run a rollback
procedure, there fiber's storage keeps current txn;
Taking this into account we clean txn inside txn_write_to_wal
right before journal_write is called and restore it back
on error. This allows the caller code to run txn_rollback
in a more consistent way.
https://github.com/tarantool/tarantool/issues/4776
Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
---
src/box/txn.c | 27 +++++++++++++++++++++------
1 file changed, 21 insertions(+), 6 deletions(-)
diff --git a/src/box/txn.c b/src/box/txn.c
index a4ca48224..68365ea0b 100644
--- a/src/box/txn.c
+++ b/src/box/txn.c
@@ -495,10 +495,8 @@ txn_write_to_wal(struct txn *txn)
&txn->region,
txn_entry_complete_cb,
txn);
- if (req == NULL) {
- txn_rollback(txn);
+ if (req == NULL)
return -1;
- }
struct txn_stmt *stmt;
struct xrow_header **remote_row = req->rows;
@@ -519,8 +517,20 @@ txn_write_to_wal(struct txn *txn)
assert(remote_row == req->rows + txn->n_applier_rows);
assert(local_row == remote_row + txn->n_new_rows);
- /* Send the entry to the journal. */
+ /*
+ * Queue the entry for processing in journal
+ * engine. The semantics of complete_cb implies
+ * that fiber's txn (kept in storage) is nil
+ * becase WAL is a separet thread, for this
+ * sake we zap it here.
+ *
+ * Still this is messy since the caller runs
+ * txn_rollback if something bad happened. Thus
+ * restore the former txn on error path.
+ */
+ fiber_set_txn(fiber(), NULL);
if (journal_write(req) < 0) {
+ fiber_set_txn(fiber(), txn);
diag_set(ClientError, ER_WAL_IO);
diag_log();
return -1;
@@ -583,8 +593,13 @@ txn_write(struct txn *txn)
fiber_set_txn(fiber(), NULL);
return 0;
}
- fiber_set_txn(fiber(), NULL);
- return txn_write_to_wal(txn);
+
+ if (txn_write_to_wal(txn) != 0) {
+ txn_rollback(txn);
+ return -1;
+ }
+
+ return 0;
}
int
--
2.20.1
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Tarantool-patches] [PATCH 4/4] box/txn: fix nil dereference in txn_rollback
2020-02-17 15:59 ` [Tarantool-patches] [PATCH 4/4] box/txn: fix nil dereference in txn_rollback Cyrill Gorcunov
@ 2020-02-17 17:19 ` Konstantin Osipov
2020-02-17 19:05 ` Cyrill Gorcunov
2020-02-17 17:25 ` Georgy Kirichenko
1 sibling, 1 reply; 15+ messages in thread
From: Konstantin Osipov @ 2020-02-17 17:19 UTC (permalink / raw)
To: Cyrill Gorcunov; +Cc: tml
* Cyrill Gorcunov <gorcunov@gmail.com> [20/02/17 20:08]:
> In case if we get failed in allocation of new
> journal entry the txn_rollback will try to
> derefernce nil pointer
This is lgtm (apart from typos in the comment), however,
I think this patch is incomplete, since wal_write() still calls
rollback on failure to queue the task to cbus.
Could you please remove txn_rollback() from wal_write in scope of
this series?
I mean this part:
fail:
entry->res = -1;
journal_entry_complete(entry);
return -1;
}
(remove jounral_entry_complete() from here).
I think once you remove journal_entry_complete (which basically
calls rollback), you will be able to remove this ugly
"cleanr-then-restore" of txn as well:
> + /*
> + * Queue the entry for processing in journal
> + * engine. The semantics of complete_cb implies
> + * that fiber's txn (kept in storage) is nil
> + * becase WAL is a separet thread, for this
> + * sake we zap it here.
> + *
> + * Still this is messy since the caller runs
> + * txn_rollback if something bad happened. Thus
> + * restore the former txn on error path.
> + */
> + fiber_set_txn(fiber(), NULL);
> if (journal_write(req) < 0) {
> + fiber_set_txn(fiber(), txn);
> diag_set(ClientError, ER_WAL_IO);
> diag_log();
> return -1;
--
Konstantin Osipov, Moscow, Russia
https://scylladb.com
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Tarantool-patches] [PATCH 4/4] box/txn: fix nil dereference in txn_rollback
2020-02-17 17:19 ` Konstantin Osipov
@ 2020-02-17 19:05 ` Cyrill Gorcunov
2020-02-17 20:01 ` Konstantin Osipov
2020-02-17 20:01 ` Konstantin Osipov
0 siblings, 2 replies; 15+ messages in thread
From: Cyrill Gorcunov @ 2020-02-17 19:05 UTC (permalink / raw)
To: Konstantin Osipov; +Cc: tml
On Mon, Feb 17, 2020 at 08:19:40PM +0300, Konstantin Osipov wrote:
>
> (remove jounral_entry_complete() from here).
>
> I think once you remove journal_entry_complete (which basically
> calls rollback), you will be able to remove this ugly
> "cleanr-then-restore" of txn as well:
IOW, like below? Grigory pointed about my previous attempt. Here is
my call-gparh view (please be ready that i'm wrong)
1) Good path without error (with the patch below applied)
txn_write
txn_write_to_wal
journal_write
fiber_set_txn(fiber(), NULL);
now it is nill and txn_entry_complete_cb process just fine
txn_entry_complete_cb
assert(in_txn() == NULL);
fiber_set_txn(fiber(), txn);
txn_complete(txn);
fiber_set_txn(fiber(), NULL);
2) Error paths
a) error during journal_entry_new allocation
txn_write
txn_write_to_wal
req = journal_entry_new = NULL;
return -1;
txn_rollback (with fiber.txn != NULL)
return -1
fiber.txn is not null, so txn_rollback reverts
and set it to null, looks fine
b) error during journal_write
txn_write
txn_write_to_wal
journal_write
wal_write
...
entry->res = -1;
return -1;
txn_rollback(with fiber.txn != NULL), which
is fine.
Guys, do I miss something obvious? The key moment
is dropping journal_entry_complete call from wal.c
on error path and defer setting fiber.txn = NULL
until journal_write passed without errors.
---
Subject: [PATCH] prelim
Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
---
src/box/txn.c | 15 ++++++++++-----
src/box/wal.c | 1 -
2 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/src/box/txn.c b/src/box/txn.c
index a4ca48224..a75f8dc0b 100644
--- a/src/box/txn.c
+++ b/src/box/txn.c
@@ -495,10 +495,8 @@ txn_write_to_wal(struct txn *txn)
&txn->region,
txn_entry_complete_cb,
txn);
- if (req == NULL) {
- txn_rollback(txn);
+ if (req == NULL)
return -1;
- }
struct txn_stmt *stmt;
struct xrow_header **remote_row = req->rows;
@@ -525,6 +523,8 @@ txn_write_to_wal(struct txn *txn)
diag_log();
return -1;
}
+
+ fiber_set_txn(fiber(), NULL);
return 0;
}
@@ -583,8 +583,13 @@ txn_write(struct txn *txn)
fiber_set_txn(fiber(), NULL);
return 0;
}
- fiber_set_txn(fiber(), NULL);
- return txn_write_to_wal(txn);
+
+ if (txn_write_to_wal(txn) < 0) {
+ txn_rollback(txn);
+ return -1;
+ }
+
+ return 0;
}
int
diff --git a/src/box/wal.c b/src/box/wal.c
index 0ae66ff32..a8bab4f34 100644
--- a/src/box/wal.c
+++ b/src/box/wal.c
@@ -1209,7 +1209,6 @@ wal_write(struct journal *journal, struct journal_entry *entry)
fail:
entry->res = -1;
- journal_entry_complete(entry);
return -1;
}
--
2.20.1
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Tarantool-patches] [PATCH 4/4] box/txn: fix nil dereference in txn_rollback
2020-02-17 19:05 ` Cyrill Gorcunov
@ 2020-02-17 20:01 ` Konstantin Osipov
2020-02-17 20:01 ` Konstantin Osipov
1 sibling, 0 replies; 15+ messages in thread
From: Konstantin Osipov @ 2020-02-17 20:01 UTC (permalink / raw)
To: Cyrill Gorcunov; +Cc: tml
* Cyrill Gorcunov <gorcunov@gmail.com> [20/02/17 22:07]:
> b) error during journal_write
>
> txn_write
> txn_write_to_wal
> journal_write
> wal_write
> ...
> entry->res = -1;
> return -1;
> txn_rollback(with fiber.txn != NULL), which
> is fine.
After you remove journal_entry_complete_cb from fail: branch of
wal_write(), if you set txn to null only after successful journal_write, you'll
be fine. On failure, the rollback will be done by txn_write().
> Guys, do I miss something obvious? The key moment
> is dropping journal_entry_complete call from wal.c
> on error path and defer setting fiber.txn = NULL
> until journal_write passed without errors.
Don't know how obvious this is, but it is something both me and
Georgy are suggesting it seems.
--
Konstantin Osipov, Moscow, Russia
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Tarantool-patches] [PATCH 4/4] box/txn: fix nil dereference in txn_rollback
2020-02-17 19:05 ` Cyrill Gorcunov
2020-02-17 20:01 ` Konstantin Osipov
@ 2020-02-17 20:01 ` Konstantin Osipov
2020-02-17 20:05 ` Cyrill Gorcunov
2020-02-17 20:34 ` Cyrill Gorcunov
1 sibling, 2 replies; 15+ messages in thread
From: Konstantin Osipov @ 2020-02-17 20:01 UTC (permalink / raw)
To: Cyrill Gorcunov; +Cc: tml
* Cyrill Gorcunov <gorcunov@gmail.com> [20/02/17 22:07]:
> Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
> ---
> src/box/txn.c | 15 ++++++++++-----
> src/box/wal.c | 1 -
> 2 files changed, 10 insertions(+), 6 deletions(-)
>
> diff --git a/src/box/txn.c b/src/box/txn.c
> index a4ca48224..a75f8dc0b 100644
> --- a/src/box/txn.c
> +++ b/src/box/txn.c
> @@ -495,10 +495,8 @@ txn_write_to_wal(struct txn *txn)
> &txn->region,
> txn_entry_complete_cb,
> txn);
> - if (req == NULL) {
> - txn_rollback(txn);
> + if (req == NULL)
> return -1;
> - }
>
> struct txn_stmt *stmt;
> struct xrow_header **remote_row = req->rows;
> @@ -525,6 +523,8 @@ txn_write_to_wal(struct txn *txn)
> diag_log();
> return -1;
> }
> +
> + fiber_set_txn(fiber(), NULL);
> return 0;
> }
>
> @@ -583,8 +583,13 @@ txn_write(struct txn *txn)
> fiber_set_txn(fiber(), NULL);
> return 0;
> }
> - fiber_set_txn(fiber(), NULL);
> - return txn_write_to_wal(txn);
> +
> + if (txn_write_to_wal(txn) < 0) {
> + txn_rollback(txn);
> + return -1;
> + }
> +
> + return 0;
> }
>
> int
> diff --git a/src/box/wal.c b/src/box/wal.c
> index 0ae66ff32..a8bab4f34 100644
> --- a/src/box/wal.c
> +++ b/src/box/wal.c
> @@ -1209,7 +1209,6 @@ wal_write(struct journal *journal, struct journal_entry *entry)
>
> fail:
> entry->res = -1;
> - journal_entry_complete(entry);
> return -1;
> }
This patch is indeed what I was trying to suggest.
--
Konstantin Osipov, Moscow, Russia
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Tarantool-patches] [PATCH 4/4] box/txn: fix nil dereference in txn_rollback
2020-02-17 20:01 ` Konstantin Osipov
@ 2020-02-17 20:05 ` Cyrill Gorcunov
2020-02-17 20:34 ` Cyrill Gorcunov
1 sibling, 0 replies; 15+ messages in thread
From: Cyrill Gorcunov @ 2020-02-17 20:05 UTC (permalink / raw)
To: Konstantin Osipov; +Cc: tml
On Mon, Feb 17, 2020 at 11:01:56PM +0300, Konstantin Osipov wrote:
> >
> > int
> > diff --git a/src/box/wal.c b/src/box/wal.c
> > index 0ae66ff32..a8bab4f34 100644
> > --- a/src/box/wal.c
> > +++ b/src/box/wal.c
> > @@ -1209,7 +1209,6 @@ wal_write(struct journal *journal, struct journal_entry *entry)
> >
> > fail:
> > entry->res = -1;
> > - journal_entry_complete(entry);
> > return -1;
> > }
>
> This patch is indeed what I was trying to suggest.
Thanks Kostya! I'll prepare a patch with normal subject and
need a test for it as well (hmm, actually the test will
fail in diag_raise nil dereference I suspect but I'll
prepare an internal version to be sure the rollback
is passing). IOW, once I test it I'll resend.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Tarantool-patches] [PATCH 4/4] box/txn: fix nil dereference in txn_rollback
2020-02-17 20:01 ` Konstantin Osipov
2020-02-17 20:05 ` Cyrill Gorcunov
@ 2020-02-17 20:34 ` Cyrill Gorcunov
1 sibling, 0 replies; 15+ messages in thread
From: Cyrill Gorcunov @ 2020-02-17 20:34 UTC (permalink / raw)
To: Konstantin Osipov; +Cc: tml
On Mon, Feb 17, 2020 at 11:01:56PM +0300, Konstantin Osipov wrote:
...
> This patch is indeed what I was trying to suggest.
Now we're failing at
[001] tarantool: /home/cyrill/sda1/tarantool/tarantool.git/src/box/txn.c:481: txn_entry_complete_cb: Assertion `in_txn() == NULL' failed.
I think that's what Georgy meant when been pointing on
a need of modifying journal code. I'll continue investigating
tomorrow.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [Tarantool-patches] [PATCH 4/4] box/txn: fix nil dereference in txn_rollback
2020-02-17 15:59 ` [Tarantool-patches] [PATCH 4/4] box/txn: fix nil dereference in txn_rollback Cyrill Gorcunov
2020-02-17 17:19 ` Konstantin Osipov
@ 2020-02-17 17:25 ` Georgy Kirichenko
1 sibling, 0 replies; 15+ messages in thread
From: Georgy Kirichenko @ 2020-02-17 17:25 UTC (permalink / raw)
To: tml
[-- Attachment #1: Type: text/plain, Size: 3640 bytes --]
Hi!
I afraid the approach is not working. Please take a look into wal_write
function inside `wal.c' file. There is a journal_entry_complete invocation
which will call a txn_rollback function as an effect. So before your patch
there was an invariant: if txn engine failed to send a transaction to a
journal using txn_write_to_wal then this transaction should be rolled backed
immediately (despite the fact was there an allocation error, cascading
rollback or something else).
Basing on the foregoing I would suggest you to switch to another invariant:
let transaction to be existent if there was a writing error and to be rolled
it back explicitly, but you should do corresponding journal changes then.
WBR
On Monday, February 17, 2020 6:59:53 PM MSK Cyrill Gorcunov wrote:
> In case if we get failed in allocation of new
> journal entry the txn_rollback will try to
> derefernce nil pointer
>
> | txn_write
> |
> | fiber_set_txn(fiber(), NULL); // zap fiber's storage.txn
> | txn_write_to_wal(txn);
> |
> | journal_entry_new(..., txn_entry_done_cb, ...)
> | if (req == NULL)
> |
> | txn_rollback(txn);
> |
> | assert(txn == in_txn()); // in_txn()=nil, triggers
>
> This is because there are two call site:
>
> - when transaction is complete the wal engine will
> call txn_entry_complete_cb completion handler and
> since it is async in terms of threads (wal is a
> separate thread) it setup the txn it processes
> into a fiber's storage thus it expects the current
> storage is nil
>
> - in turn error may happen and we need to run a rollback
> procedure, there fiber's storage keeps current txn;
>
> Taking this into account we clean txn inside txn_write_to_wal
> right before journal_write is called and restore it back
> on error. This allows the caller code to run txn_rollback
> in a more consistent way.
>
> https://github.com/tarantool/tarantool/issues/4776
>
> Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
> ---
> src/box/txn.c | 27 +++++++++++++++++++++------
> 1 file changed, 21 insertions(+), 6 deletions(-)
>
> diff --git a/src/box/txn.c b/src/box/txn.c
> index a4ca48224..68365ea0b 100644
> --- a/src/box/txn.c
> +++ b/src/box/txn.c
> @@ -495,10 +495,8 @@ txn_write_to_wal(struct txn *txn)
> &txn->region,
> txn_entry_complete_cb,
> txn);
> - if (req == NULL) {
> - txn_rollback(txn);
> + if (req == NULL)
> return -1;
> - }
>
> struct txn_stmt *stmt;
> struct xrow_header **remote_row = req->rows;
> @@ -519,8 +517,20 @@ txn_write_to_wal(struct txn *txn)
> assert(remote_row == req->rows + txn->n_applier_rows);
> assert(local_row == remote_row + txn->n_new_rows);
>
> - /* Send the entry to the journal. */
> + /*
> + * Queue the entry for processing in journal
> + * engine. The semantics of complete_cb implies
> + * that fiber's txn (kept in storage) is nil
> + * becase WAL is a separet thread, for this
> + * sake we zap it here.
> + *
> + * Still this is messy since the caller runs
> + * txn_rollback if something bad happened. Thus
> + * restore the former txn on error path.
> + */
> + fiber_set_txn(fiber(), NULL);
> if (journal_write(req) < 0) {
> + fiber_set_txn(fiber(), txn);
> diag_set(ClientError, ER_WAL_IO);
> diag_log();
> return -1;
> @@ -583,8 +593,13 @@ txn_write(struct txn *txn)
> fiber_set_txn(fiber(), NULL);
> return 0;
> }
> - fiber_set_txn(fiber(), NULL);
> - return txn_write_to_wal(txn);
> +
> + if (txn_write_to_wal(txn) != 0) {
> + txn_rollback(txn);
> + return -1;
> + }
> +
> + return 0;
> }
>
> int
[-- Attachment #2: This is a digitally signed message part. --]
[-- Type: application/pgp-signature, Size: 488 bytes --]
^ permalink raw reply [flat|nested] 15+ messages in thread