[tarantool-patches] [PATCH v4 8/9] applier: apply transaction in parallel
Georgy Kirichenko
georgy at tarantool.org
Thu Jun 20 00:23:15 MSK 2019
Appliers use asynchronous transactions to batch journal writes. All
appliers share replicaset.applier.net_vclock, which tracks the vclock
that has been applied but not necessarily written to a journal. Appliers
use a trigger to coordinate in case of failure, i.e. when a transaction
is going to be rolled back. An applier writer condition is also shared
across all appliers and is signaled on commit or on a heartbeat message.
Closes: #1254
---
src/box/applier.cc | 123 +++++++++++++++++++++++++++++++----------
src/box/applier.h | 6 +-
src/box/replication.cc | 7 +++
src/box/replication.h | 14 +++++
4 files changed, 119 insertions(+), 31 deletions(-)
diff --git a/src/box/applier.cc b/src/box/applier.cc
index 5a92f6109..252dd58ea 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -50,6 +50,7 @@
#include "schema.h"
#include "txn.h"
#include "box.h"
+#include "scoped_guard.h"
STRS(applier_state, applier_STATE);
@@ -130,10 +131,10 @@ applier_writer_f(va_list ap)
* replication_timeout seconds any more.
*/
if (applier->version_id >= version_id(1, 7, 7))
- fiber_cond_wait_timeout(&applier->writer_cond,
+ fiber_cond_wait_timeout(&replicaset.applier.commit_cond,
TIMEOUT_INFINITY);
else
- fiber_cond_wait_timeout(&applier->writer_cond,
+ fiber_cond_wait_timeout(&replicaset.applier.commit_cond,
replication_timeout);
/* Send ACKs only when in FOLLOW mode ,*/
if (applier->state != APPLIER_SYNC &&
@@ -565,6 +566,36 @@ applier_read_tx(struct applier *applier, struct stailq *rows)
next)->row.is_commit);
}
+static void
+sequencer_rollback_cb(struct trigger *trigger, void *event)
+{
+ (void) trigger;
+ (void) event;
+ diag_set(ClientError, ER_WAL_IO);
+ diag_move(&fiber()->diag, &replicaset.applier.diag);
+ trigger_run(&replicaset.applier.on_replication_fail, NULL);
+ vclock_copy(&replicaset.applier.net_vclock, &replicaset.vclock);
+}
+
+static void
+sequencer_commit_cb(struct trigger *trigger, void *event)
+{
+ (void) trigger;
+ (void) event;
+ fiber_cond_broadcast(&replicaset.applier.commit_cond);
+}
+
+static void
+applier_on_fail(struct trigger *trigger, void *event)
+{
+ (void) event;
+ struct applier *applier = (struct applier *)trigger->data;
+ if (!diag_is_empty(&replicaset.applier.diag))
+ diag_add_error(&applier->diag, diag_last_error(&replicaset.applier.diag));
+ fiber_cancel(applier->reader);
+
+}
+
/**
* Apply all rows in the rows queue as a single transaction.
*
@@ -573,6 +604,22 @@ applier_read_tx(struct applier *applier, struct stailq *rows)
static int
applier_apply_tx(struct stailq *rows)
{
+ struct xrow_header *first_row =
+ &stailq_first_entry(rows, struct applier_tx_row,
+ next)->row;
+ struct replica *replica = replica_by_id(first_row->replica_id);
+ struct latch *latch = (replica ? &replica->order_latch :
+ &replicaset.applier.order_latch);
+ latch_lock(latch);
+ if (vclock_get(&replicaset.applier.net_vclock, first_row->replica_id) >=
+ first_row->lsn) {
+ /* If this is a heartbeat message, wake the writers up. */
+ if (first_row->lsn == 0)
+ fiber_cond_broadcast(&replicaset.applier.commit_cond);
+ latch_unlock(latch);
+ return 0;
+ }
+
/**
* Explicitly begin the transaction so that we can
* control fiber->gc life cycle and, in case of apply
@@ -581,8 +628,10 @@ applier_apply_tx(struct stailq *rows)
*/
struct txn *txn = txn_begin();
struct applier_tx_row *item;
- if (txn == NULL)
- diag_raise();
+ if (txn == NULL) {
+ latch_unlock(latch);
+ return -1;
+ }
stailq_foreach_entry(item, rows, next) {
struct xrow_header *row = &item->row;
int res = apply_row(row);
@@ -623,10 +672,34 @@ applier_apply_tx(struct stailq *rows)
"Replication", "distributed transactions");
goto rollback;
}
- return txn_commit(txn);
+ /* We are ready to submit txn to wal. */
+ struct trigger *on_rollback, *on_commit;
+ on_rollback = (struct trigger *)region_alloc(&txn->region,
+ sizeof(struct trigger));
+ on_commit = (struct trigger *)region_alloc(&txn->region,
+ sizeof(struct trigger));
+ if (on_rollback == NULL || on_commit == NULL)
+ goto rollback;
+
+ trigger_create(on_rollback, sequencer_rollback_cb, NULL, NULL);
+ txn_on_rollback(txn, on_rollback);
+
+ trigger_create(on_commit, sequencer_commit_cb, NULL, NULL);
+ txn_on_commit(txn, on_commit);
+
+ if (txn_write(txn) < 0)
+ goto fail;
+ /* Transaction was sent to journal so promote vclock. */
+ vclock_follow(&replicaset.applier.net_vclock, first_row->replica_id,
+ first_row->lsn);
+ latch_unlock(latch);
+
+ return 0;
rollback:
txn_rollback(txn);
+fail:
+ latch_unlock(latch);
fiber_gc();
return -1;
}
@@ -735,6 +808,15 @@ applier_subscribe(struct applier *applier)
applier->lag = TIMEOUT_INFINITY;
+ /* Register a trigger to handle replication failures. */
+ struct trigger on_fail;
+ trigger_create(&on_fail, applier_on_fail, applier, NULL);
+ trigger_add(&replicaset.applier.on_replication_fail, &on_fail);
+ auto trigger_guard = make_scoped_guard([&] {
+ trigger_clear(&on_fail);
+ });
+
+
/*
* Process a stream of rows from the binary log.
*/
@@ -763,31 +845,10 @@ applier_subscribe(struct applier *applier)
struct stailq rows;
applier_read_tx(applier, &rows);
- struct xrow_header *first_row =
- &stailq_first_entry(&rows, struct applier_tx_row,
- next)->row;
applier->last_row_time = ev_monotonic_now(loop());
- struct replica *replica = replica_by_id(first_row->replica_id);
- struct latch *latch = (replica ? &replica->order_latch :
- &replicaset.applier.order_latch);
- /*
- * In a full mesh topology, the same set of changes
- * may arrive via two concurrently running appliers.
- * Hence we need a latch to strictly order all changes
- * that belong to the same server id.
- */
- latch_lock(latch);
- if (vclock_get(&replicaset.vclock, first_row->replica_id) <
- first_row->lsn &&
- applier_apply_tx(&rows) != 0) {
- latch_unlock(latch);
+ if (applier_apply_tx(&rows) != 0)
diag_raise();
- }
- latch_unlock(latch);
- if (applier->state == APPLIER_SYNC ||
- applier->state == APPLIER_FOLLOW)
- fiber_cond_signal(&applier->writer_cond);
if (ibuf_used(ibuf) == 0)
ibuf_reset(ibuf);
fiber_gc();
@@ -872,6 +933,11 @@ applier_f(va_list ap)
return -1;
}
} catch (FiberIsCancelled *e) {
+ if (!diag_is_empty(&applier->diag)) {
+ diag_move(&applier->diag, &fiber()->diag);
+ applier_disconnect(applier, APPLIER_STOPPED);
+ break;
+ }
applier_disconnect(applier, APPLIER_OFF);
break;
} catch (SocketError *e) {
@@ -959,7 +1025,7 @@ applier_new(const char *uri)
applier->last_row_time = ev_monotonic_now(loop());
rlist_create(&applier->on_state);
fiber_cond_create(&applier->resume_cond);
- fiber_cond_create(&applier->writer_cond);
+ diag_create(&applier->diag);
return applier;
}
@@ -972,7 +1038,6 @@ applier_delete(struct applier *applier)
assert(applier->io.fd == -1);
trigger_destroy(&applier->on_state);
fiber_cond_destroy(&applier->resume_cond);
- fiber_cond_destroy(&applier->writer_cond);
free(applier);
}
diff --git a/src/box/applier.h b/src/box/applier.h
index 5bff90031..348fdacf2 100644
--- a/src/box/applier.h
+++ b/src/box/applier.h
@@ -74,8 +74,6 @@ struct applier {
struct fiber *reader;
/** Background fiber to reply with vclock */
struct fiber *writer;
- /** Writer cond. */
- struct fiber_cond writer_cond;
/** Finite-state machine */
enum applier_state state;
/** Local time of this replica when the last row has been received */
@@ -114,8 +112,12 @@ struct applier {
bool is_paused;
/** Condition variable signaled to resume the applier. */
struct fiber_cond resume_cond;
+ struct diag diag;
};
+void
+applier_init();
+
/**
* Start a client to a remote master using a background fiber.
*
diff --git a/src/box/replication.cc b/src/box/replication.cc
index a1a2a9eb3..fd4d4e387 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -90,6 +90,13 @@ replication_init(void)
fiber_cond_create(&replicaset.applier.cond);
replicaset.replica_by_id = (struct replica **)calloc(VCLOCK_MAX, sizeof(struct replica *));
latch_create(&replicaset.applier.order_latch);
+
+ vclock_create(&replicaset.applier.net_vclock);
+ vclock_copy(&replicaset.applier.net_vclock, &replicaset.vclock);
+ rlist_create(&replicaset.applier.on_replication_fail);
+
+ fiber_cond_create(&replicaset.applier.commit_cond);
+ diag_create(&replicaset.applier.diag);
}
void
diff --git a/src/box/replication.h b/src/box/replication.h
index 8c8a9927e..a4830f5b5 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -232,6 +232,20 @@ struct replicaset {
* struct replica object).
*/
struct latch order_latch;
+ /*
+ * A vclock of the last transaction which was read
+ * from an applier connection.
+ */
+ struct vclock net_vclock;
+ /* Signaled on replicated transaction commit. */
+ struct fiber_cond commit_cond;
+ /*
+ * Trigger to fire when replication stops in case
+ * of an error.
+ */
+ struct rlist on_replication_fail;
+ /* Diag to propagate an error across all appliers. */
+ struct diag diag;
} applier;
/** Map of all known replica_id's to correspponding replica's. */
struct replica **replica_by_id;
--
2.22.0
More information about the Tarantool-patches
mailing list