[Tarantool-patches] [RFC v7 1/2] applier: send transaction's first row WAL time in the applier_writer_f
Cyrill Gorcunov
gorcunov at gmail.com
Fri Jun 4 20:06:06 MSK 2021
The applier fiber sends the current vclock of the node to the remote relay
reader, indicating the current state of the fetched WAL data so that the relay
knows which new data should be sent. The packet the applier sends carries the
xrow_header::tm field as zero, but we can reuse it to provide information about
the first timestamp in a transaction we wrote to our WAL. Since old instances
of Tarantool simply ignore this field, such an extension won't cause any
problems.
The timestamp will be needed to account for the lag of downstream replicas,
which is useful for informational purposes and cluster health monitoring.
We update the applier statistics in WAL callbacks, but since both
apply_synchro_row and apply_plain_tx are used not only for real data
application but also during the final join stage (where we are not
writing the data yet), apply_synchro_row is extended with a replica_id
argument which is non-zero when the applier is subscribed.
The calculation of the downstream lag itself will be addressed
in the next patch, because sending the timestamp and observing it
are independent actions.
Part-of #5447
Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
---
src/box/applier.cc | 74 +++++++++++++++++++++++++++++++++---------
src/box/applier.h | 14 ++++++++
src/box/replication.cc | 1 +
src/box/replication.h | 5 +++
4 files changed, 79 insertions(+), 15 deletions(-)
diff --git a/src/box/applier.cc b/src/box/applier.cc
index 33181fdbf..94f247298 100644
--- a/src/box/applier.cc
+++ b/src/box/applier.cc
@@ -148,6 +148,22 @@ applier_check_sync(struct applier *applier)
}
}
+static void
+wal_stat_update(struct applier_wal_stat *wal_st)
+{
+ struct replica *r = replica_by_id(wal_st->replica_id);
+ if (likely(r != NULL))
+ r->applier_txn_start_tm = wal_st->txn_start_tm;
+}
+
+static void
+wal_stat_encode(uint32_t replica_id, struct xrow_header *xrow)
+{
+ struct replica *r = replica_by_id(replica_id);
+ if (likely(r != NULL))
+ xrow->tm = r->applier_txn_start_tm;
+}
+
/*
* Fiber function to write vclock to replication master.
* To track connection status, replica answers master
@@ -193,6 +209,7 @@ applier_writer_f(va_list ap)
applier->has_acks_to_send = false;
struct xrow_header xrow;
xrow_encode_vclock(&xrow, &replicaset.vclock);
+ wal_stat_encode(applier->instance_id, &xrow);
coio_write_xrow(&io, &xrow);
ERROR_INJECT(ERRINJ_APPLIER_SLOW_ACK, {
fiber_sleep(0.01);
@@ -490,7 +507,7 @@ static uint64_t
applier_read_tx(struct applier *applier, struct stailq *rows, double timeout);
static int
-apply_final_join_tx(struct stailq *rows);
+apply_final_join_tx(uint32_t replica_id, struct stailq *rows);
/**
* A helper struct to link xrow objects in a list.
@@ -535,7 +552,7 @@ applier_wait_register(struct applier *applier, uint64_t row_count)
next)->row);
break;
}
- if (apply_final_join_tx(&rows) != 0)
+ if (apply_final_join_tx(applier->instance_id, &rows) != 0)
diag_raise();
}
@@ -754,8 +771,12 @@ applier_txn_rollback_cb(struct trigger *trigger, void *event)
static int
applier_txn_wal_write_cb(struct trigger *trigger, void *event)
{
- (void) trigger;
(void) event;
+
+ struct applier_wal_stat *wal_st =
+ (struct applier_wal_stat *)trigger->data;
+ wal_stat_update(wal_st);
+
/* Broadcast the WAL write across all appliers. */
trigger_run(&replicaset.applier.on_wal_write, NULL);
return 0;
@@ -766,6 +787,8 @@ struct synchro_entry {
struct synchro_request *req;
/** Fiber created the entry. To wakeup when WAL write is done. */
struct fiber *owner;
+ /** WAL bound statistics. */
+ struct applier_wal_stat *wal_st;
/**
* The base journal entry. It has unsized array and then must be the
* last entry in the structure. But can workaround it via a union
@@ -789,6 +812,7 @@ apply_synchro_row_cb(struct journal_entry *entry)
if (entry->res < 0) {
applier_rollback_by_wal_io();
} else {
+ wal_stat_update(synchro_entry->wal_st);
txn_limbo_process(&txn_limbo, synchro_entry->req);
trigger_run(&replicaset.applier.on_wal_write, NULL);
}
@@ -797,7 +821,7 @@ apply_synchro_row_cb(struct journal_entry *entry)
/** Process a synchro request. */
static int
-apply_synchro_row(struct xrow_header *row)
+apply_synchro_row(uint32_t replica_id, struct xrow_header *row)
{
assert(iproto_type_is_synchro_request(row->type));
@@ -805,6 +829,7 @@ apply_synchro_row(struct xrow_header *row)
if (xrow_decode_synchro(row, &req) != 0)
goto err;
+ struct applier_wal_stat wal_st;
struct synchro_entry entry;
/*
* Rows array is cast from *[] to **, because otherwise g++ complains
@@ -817,6 +842,11 @@ apply_synchro_row(struct xrow_header *row)
apply_synchro_row_cb, &entry);
entry.req = &req;
entry.owner = fiber();
+
+ wal_st.replica_id = replica_id;
+ wal_st.txn_start_tm = row->tm;
+ entry.wal_st = &wal_st;
+
/*
* The WAL write is blocking. Otherwise it might happen that a CONFIRM
* or ROLLBACK is sent to WAL, and it would empty the limbo, but before
@@ -862,8 +892,9 @@ applier_handle_raft(struct applier *applier, struct xrow_header *row)
return box_raft_process(&req, applier->instance_id);
}
-static inline int
-apply_plain_tx(struct stailq *rows, bool skip_conflict, bool use_triggers)
+static int
+apply_plain_tx(uint32_t replica_id, struct stailq *rows,
+ bool skip_conflict, bool use_triggers)
{
/*
* Explicitly begin the transaction so that we can
@@ -931,10 +962,21 @@ apply_plain_tx(struct stailq *rows, bool skip_conflict, bool use_triggers)
goto fail;
}
+ struct applier_wal_stat *wal_st;
+ wal_st = region_alloc_object(&txn->region, typeof(*wal_st), &size);
+ if (wal_st == NULL) {
+ diag_set(OutOfMemory, size, "region_alloc_object", "wal_st");
+ goto fail;
+ }
+
trigger_create(on_rollback, applier_txn_rollback_cb, NULL, NULL);
txn_on_rollback(txn, on_rollback);
- trigger_create(on_wal_write, applier_txn_wal_write_cb, NULL, NULL);
+ item = stailq_first_entry(rows, struct applier_tx_row, next);
+ wal_st->replica_id = replica_id;
+ wal_st->txn_start_tm = item->row.tm;
+
+ trigger_create(on_wal_write, applier_txn_wal_write_cb, wal_st, NULL);
txn_on_wal_write(txn, on_wal_write);
}
@@ -946,7 +988,7 @@ apply_plain_tx(struct stailq *rows, bool skip_conflict, bool use_triggers)
/** A simpler version of applier_apply_tx() for final join stage. */
static int
-apply_final_join_tx(struct stailq *rows)
+apply_final_join_tx(uint32_t replica_id, struct stailq *rows)
{
struct xrow_header *first_row =
&stailq_first_entry(rows, struct applier_tx_row, next)->row;
@@ -957,9 +999,9 @@ apply_final_join_tx(struct stailq *rows)
vclock_follow_xrow(&replicaset.vclock, last_row);
if (unlikely(iproto_type_is_synchro_request(first_row->type))) {
assert(first_row == last_row);
- rc = apply_synchro_row(first_row);
+ rc = apply_synchro_row(replica_id, first_row);
} else {
- rc = apply_plain_tx(rows, false, false);
+ rc = apply_plain_tx(replica_id, rows, false, false);
}
fiber_gc();
return rc;
@@ -1088,12 +1130,14 @@ applier_apply_tx(struct applier *applier, struct stailq *rows)
* each other.
*/
assert(first_row == last_row);
- if ((rc = apply_synchro_row(first_row)) != 0)
- goto finish;
- } else if ((rc = apply_plain_tx(rows, replication_skip_conflict,
- true)) != 0) {
- goto finish;
+ rc = apply_synchro_row(applier->instance_id, first_row);
+ } else {
+ rc = apply_plain_tx(applier->instance_id, rows,
+ replication_skip_conflict, true);
}
+ if (rc != 0)
+ goto finish;
+
vclock_follow(&replicaset.applier.vclock, last_row->replica_id,
last_row->lsn);
finish:
diff --git a/src/box/applier.h b/src/box/applier.h
index 15ca1fcfd..00afa7247 100644
--- a/src/box/applier.h
+++ b/src/box/applier.h
@@ -133,6 +133,20 @@ struct applier {
struct vclock remote_vclock_at_subscribe;
};
+/**
+ * WAL related statistics.
+ */
+struct applier_wal_stat {
+ /** Replica ID initiated a transaction. */
+ uint32_t replica_id;
+ /**
+ * Timestamp of a transaction to be accounted
+ * for relay lag. Usually it is a first row in
+ * a transaction.
+ */
+ double txn_start_tm;
+};
+
/**
* Start a client to a remote master using a background fiber.
*
diff --git a/src/box/replication.cc b/src/box/replication.cc
index aefb812b3..c97c1fc04 100644
--- a/src/box/replication.cc
+++ b/src/box/replication.cc
@@ -184,6 +184,7 @@ replica_new(void)
trigger_create(&replica->on_applier_state,
replica_on_applier_state_f, NULL, NULL);
replica->applier_sync_state = APPLIER_DISCONNECTED;
+ replica->applier_txn_start_tm = 0;
latch_create(&replica->order_latch);
return replica;
}
diff --git a/src/box/replication.h b/src/box/replication.h
index 2ad1cbf66..d9817d4ff 100644
--- a/src/box/replication.h
+++ b/src/box/replication.h
@@ -331,6 +331,11 @@ struct replica {
* separate from applier.
*/
enum applier_state applier_sync_state;
+ /**
+ * Applier's last written to WAL transaction timestamp.
+ * Needed for relay lagging statistics.
+ */
+ double applier_txn_start_tm;
/* The latch is used to order replication requests. */
struct latch order_latch;
};
--
2.31.1
More information about the Tarantool-patches
mailing list