Tarantool development patches archive
 help / color / mirror / Atom feed
* [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup
@ 2019-11-18 16:05 Serge Petrenko
  2019-11-18 16:05 ` [Tarantool-patches] [PATCH v3 1/3] fiber.top() refactor clock and cpu time calculation Serge Petrenko
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Serge Petrenko @ 2019-11-18 16:05 UTC (permalink / raw)
  To: v.shpilevoy; +Cc: tarantool-patches

The first patch factors all the clock-related members of struct fiber and struct
cord into appropriate structs and adds methods for their updates and resets.

The second patch fixes exponential moving average calculation so that we do not
experience huge numbers in average load percentage calculations.

The third patch alters fiber.top() test to wait for correct output before
testing it. 

Follow-up https://github.com/tarantool/tarantool/issues/2694
Branch https://github.com/tarantool/tarantool/tree/sp/gh-2694-test-fixup

Changes in v3:
 - introduce a new patch which refactors all the clock stat
   handling.
 - introduce a patch fixing EMA calculation
 - review fixes as per review from Vladislav

Changes in v2:
 - clean up all fibers clock stats on fiber.top_enable()
 - push 0 instead of NaN when cord clock_delta_last is 0
 - review fixes as per review from Vladislav

Serge Petrenko (3):
  fiber.top() refactor clock and cpu time calculation
  fiber.top(): alter exponential moving average calculation
  app/fiber: wait till a full event loop iteration ends

 src/lib/core/fiber.c    | 175 ++++++++++++++++++++++++----------------
 src/lib/core/fiber.h    | 109 +++++++++++++++++--------
 src/lua/fiber.c         |  20 +++--
 test/app/fiber.result   |  38 ++++++---
 test/app/fiber.test.lua |  36 +++++++--
 5 files changed, 252 insertions(+), 126 deletions(-)

-- 
2.21.0 (Apple Git-122)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Tarantool-patches] [PATCH v3 1/3] fiber.top() refactor clock and cpu time calculation
  2019-11-18 16:05 [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup Serge Petrenko
@ 2019-11-18 16:05 ` Serge Petrenko
  2019-11-18 22:25   ` Vladislav Shpilevoy
  2019-11-18 16:05 ` [Tarantool-patches] [PATCH v3 2/3] fiber.top(): alter exponential moving average calculation Serge Petrenko
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Serge Petrenko @ 2019-11-18 16:05 UTC (permalink / raw)
  To: v.shpilevoy; +Cc: tarantool-patches

Unify all the members related to fiber's clock statistics into struct
clock_stat and all the members related to cord's knowledge of cpu state
and clocks to struct cpu_stat.
Reset stats of all alive fibers on fiber.top_enable().

Follow-up #2694
---
 src/lib/core/fiber.c | 179 ++++++++++++++++++++++++++-----------------
 src/lib/core/fiber.h | 109 ++++++++++++++++++--------
 src/lua/fiber.c      |  20 +++--
 3 files changed, 199 insertions(+), 109 deletions(-)

diff --git a/src/lib/core/fiber.c b/src/lib/core/fiber.c
index aebaba7f0..1e08d0ec9 100644
--- a/src/lib/core/fiber.c
+++ b/src/lib/core/fiber.c
@@ -88,6 +88,9 @@ static int (*fiber_invoke)(fiber_func f, va_list ap);
 
 #if ENABLE_FIBER_TOP
 static __thread bool fiber_top_enabled = false;
+
+uint64_t
+cpu_stat_on_csw(struct cpu_stat *stat);
 #endif /* ENABLE_FIBER_TOP */
 
 /**
@@ -103,18 +106,10 @@ clock_set_on_csw(struct fiber *caller)
 	if (!fiber_top_enabled)
 		return;
 
-	uint64_t clock;
-	uint32_t cpu_id;
-	clock = __rdtscp(&cpu_id);
+	uint64_t delta = cpu_stat_on_csw(&cord()->cpu_stat);
 
-	if (cpu_id == cord()->cpu_id_last) {
-		caller->clock_delta += clock - cord()->clock_last;
-		cord()->clock_delta += clock - cord()->clock_last;
-	} else {
-		cord()->cpu_id_last = cpu_id;
-		cord()->cpu_miss_count++;
-	}
-	cord()->clock_last = clock;
+	clock_stat_add_delta(&cord()->clock_stat, delta);
+	clock_stat_add_delta(&caller->clock_stat, delta);
 #endif /* ENABLE_FIBER_TOP */
 
 }
@@ -695,9 +690,7 @@ fiber_reset(struct fiber *fiber)
 	rlist_create(&fiber->on_stop);
 	fiber->flags = FIBER_DEFAULT_FLAGS;
 #if ENABLE_FIBER_TOP
-	fiber->cputime = 0;
-	fiber->clock_acc = 0;
-	fiber->clock_delta = 0;
+	clock_stat_reset(&fiber->clock_stat);
 #endif /* ENABLE_FIBER_TOP */
 }
 
@@ -1095,9 +1088,52 @@ loop_on_iteration_start(ev_loop *loop, ev_check *watcher, int revents)
 	(void) watcher;
 	(void) revents;
 
-	cord()->clock_last = __rdtscp(&cord()->cpu_id_last);
-	cord()->cpu_miss_count = 0;
+	cpu_stat_start(&cord()->cpu_stat);
+}
+
+/**
+ * Calculate the exponential moving average for the clock deltas
+ * per loop iteration. The coeffitient is 1/16.
+ */
+static inline uint64_t
+clock_diff_accumulate(uint64_t acc, uint64_t delta)
+{
+	if (acc > 0) {
+		return delta / 16 + 15 * acc / 16;
+	} else {
+		return delta;
+	}
+}
 
+inline void
+clock_stat_add_delta(struct clock_stat *stat, uint64_t clock_delta)
+{
+	stat->delta += clock_delta;
+}
+
+void
+clock_stat_update(struct clock_stat *stat, double nsec_per_clock)
+{
+	stat->acc = clock_diff_accumulate(stat->acc, stat->delta);
+	stat->prev_delta = stat->delta;
+	stat->cputime += stat->delta * nsec_per_clock;
+	stat->delta = 0;
+}
+
+void
+clock_stat_reset(struct clock_stat *stat)
+{
+	stat->acc = 0;
+	stat->delta = 0;
+	stat->prev_delta = 0;
+	stat->cputime = 0;
+}
+
+void
+cpu_stat_start(struct cpu_stat *stat)
+{
+	stat->prev_clock = __rdtscp(&stat->prev_cpu_id);
+	stat->cpu_miss_count = 0;
 	/*
 	 * We want to measure thread cpu time here to calculate
 	 * each fiber's cpu time, so don't use libev's ev_now() or
@@ -1106,27 +1142,59 @@ loop_on_iteration_start(ev_loop *loop, ev_check *watcher, int revents)
 	 */
 	struct timespec ts;
 	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
-		say_debug("clock_gettime(): failed to get this"
-			  "thread's cpu time.");
+		say_debug("clock_gettime(): failed to get this thread's"
+			  " cpu time.");
 		return;
 	}
-	cord()->cputime_last = (uint64_t) ts.tv_sec * FIBER_TIME_RES +
-					  ts.tv_nsec;
+	stat->prev_cputime = (uint64_t) ts.tv_sec * FIBER_TIME_RES + ts.tv_nsec;
 }
 
+void
+cpu_stat_reset(struct cpu_stat *stat)
+{
+	stat->prev_cpu_miss_count = 0;
+	cpu_stat_start(stat);
+}
 
-/**
- * Calculate the exponential moving average for the clock deltas
- * per loop iteration. The coeffitient is 1/16.
- */
-static inline uint64_t
-clock_diff_accumulate(uint64_t acc, uint64_t delta)
+uint64_t
+cpu_stat_on_csw(struct cpu_stat *stat)
 {
-	if (acc > 0) {
-		return delta / 16 + 15 * acc / 16;
+	uint32_t cpu_id;
+	uint64_t delta = 0;
+	uint64_t clock = __rdtscp(&cpu_id);
+
+	if (cpu_id == stat->prev_cpu_id) {
+		delta = clock - stat->prev_clock;
 	} else {
-		return delta;
+		stat->prev_cpu_id = cpu_id;
+		stat->cpu_miss_count++;
+	}
+	stat->prev_clock = clock;
+
+	return delta;
+}
+
+double
+cpu_stat_end(struct cpu_stat *stat, struct clock_stat *cord_clock_stat)
+{
+	stat->prev_cpu_miss_count = stat->cpu_miss_count;
+	stat->cpu_miss_count = 0;
+
+	struct timespec ts;
+	uint64_t delta_time;
+	double nsec_per_clock = 0;
+	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
+		say_debug("clock_gettime(): failed to get this thread's"
+			  " cpu time.");
+	} else {
+		delta_time = (uint64_t) ts.tv_sec * FIBER_TIME_RES +
+			     ts.tv_nsec;
+		if (delta_time > stat->prev_cputime && cord_clock_stat->delta > 0) {
+			delta_time -= stat->prev_cputime;
+			nsec_per_clock = (double) delta_time / cord()->clock_stat.delta;
+		}
 	}
+	return nsec_per_clock;
 }
 
 static void
@@ -1145,40 +1213,14 @@ loop_on_iteration_end(ev_loop *loop, ev_prepare *watcher, int revents)
 	 */
 	clock_set_on_csw(&cord()->sched);
 
-	cord()->cpu_miss_count_last = cord()->cpu_miss_count;
-	cord()->cpu_miss_count = 0;
+	double nsec_per_clock = cpu_stat_end(&cord()->cpu_stat,
+					     &cord()->clock_stat);
 
-	struct timespec ts;
-	uint64_t delta_time;
-	double nsec_per_clock = 0;
-
-	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
-		say_debug("clock_gettime(): failed to get this"
-			  "thread's cpu time.");
-	} else {
-		delta_time = (uint64_t) ts.tv_sec * FIBER_TIME_RES +
-			     ts.tv_nsec;
-		assert(delta_time > cord()->cputime_last);
-		delta_time -= cord()->cputime_last;
-
-		if (cord()->clock_delta > 0)
-			nsec_per_clock = (double) delta_time / cord()->clock_delta;
-	}
-
-	cord()->clock_acc = clock_diff_accumulate(cord()->clock_acc, cord()->clock_delta);
-	cord()->clock_delta_last = cord()->clock_delta;
-	cord()->clock_delta = 0;
-
-	cord()->sched.clock_acc = clock_diff_accumulate(cord()->sched.clock_acc, cord()->sched.clock_delta);
-	cord()->sched.clock_delta_last = cord()->sched.clock_delta;
-	cord()->sched.cputime += cord()->sched.clock_delta * nsec_per_clock;
-	cord()->sched.clock_delta = 0;
+	clock_stat_update(&cord()->clock_stat, nsec_per_clock);
+	clock_stat_update(&cord()->sched.clock_stat, nsec_per_clock);
 
 	rlist_foreach_entry(fiber, &cord()->alive, link) {
-		fiber->clock_acc = clock_diff_accumulate(fiber->clock_acc, fiber->clock_delta);
-		fiber->clock_delta_last = fiber->clock_delta;
-		fiber->cputime += fiber->clock_delta * nsec_per_clock;
-		fiber->clock_delta = 0;
+		clock_stat_update(&fiber->clock_stat, nsec_per_clock);
 	}
 }
 
@@ -1203,17 +1245,14 @@ fiber_top_enable()
 		ev_check_start(cord()->loop, &cord()->check_event);
 		fiber_top_enabled = true;
 
-		cord()->clock_acc = 0;
-		cord()->cpu_miss_count_last = 0;
-		cord()->clock_delta_last = 0;
-		struct timespec ts;
-		if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
-			say_debug("clock_gettime(): failed to get this"
-				  "thread's cpu time.");
-			return;
+		cpu_stat_reset(&cord()->cpu_stat);
+		clock_stat_reset(&cord()->clock_stat);
+		clock_stat_reset(&cord()->sched.clock_stat);
+
+		struct fiber *fiber;
+		rlist_foreach_entry(fiber, &cord()->alive, link) {
+			clock_stat_reset(&fiber->clock_stat);
 		}
-		cord()->cputime_last = (uint64_t) ts.tv_sec * FIBER_TIME_RES +
-						  ts.tv_nsec;
 	}
 }
 
diff --git a/src/lib/core/fiber.h b/src/lib/core/fiber.h
index ab313a050..06ce28bb1 100644
--- a/src/lib/core/fiber.h
+++ b/src/lib/core/fiber.h
@@ -56,14 +56,82 @@
 #define ENABLE_FIBER_TOP 1
 #endif
 
+#if defined(__cplusplus)
+extern "C" {
+#endif /* defined(__cplusplus) */
+
 #if ENABLE_FIBER_TOP
 /* A fiber reports used up CPU time with nanosecond resolution. */
 #define FIBER_TIME_RES 1000000000
-#endif /* ENABLE_FIBER_TOP */
 
-#if defined(__cplusplus)
-extern "C" {
-#endif /* defined(__cplusplus) */
+/**
+ * A struct containing all the info gathered for current fiber or
+ * thread as a whole when fiber.top() is enabled.
+ */
+struct clock_stat {
+	/**
+	 * Accumulated clock value calculated using exponential
+	 * moving average.
+	 */
+	uint64_t acc;
+	/**
+	 * Clock delta counter used on current event loop
+	 * iteration.
+	 */
+	uint64_t delta;
+	/**
+	 * Clock delta calculated on previous event loop
+	 * iteration.
+	 */
+	uint64_t prev_delta;
+	/**
+	 * Total processor time this fiber (or cord as a whole)
+	 * has spent with 1 / FIBER_TIME_RES second precision.
+	 */
+	uint64_t cputime;
+};
+
+void
+clock_stat_add_delta(struct clock_stat *stat, uint64_t clock_delta);
+
+void
+clock_stat_update(struct clock_stat *stat, double nsec_per_clock);
+
+void
+clock_stat_reset(struct clock_stat *stat);
+
+/**
+ * A struct encapsulating all knowledge this cord has about cpu
+ * clocks and their state.
+ */
+struct cpu_stat {
+	uint64_t prev_clock;
+	/**
+	 * This thread's CPU time at the beginning of event loop
+	 * iteration. Used to calculate how much cpu time has
+	 * each loop iteration consumed and update fiber cpu
+	 * times propotionally. The resolution is
+	 * 1 / FIBER_TIME_RES seconds.
+	 */
+	uint64_t prev_cputime;
+	uint32_t prev_cpu_id;
+	uint32_t cpu_miss_count;
+	uint32_t prev_cpu_miss_count;
+};
+
+void
+cpu_stat_start(struct cpu_stat *stat);
+
+void
+cpu_stat_reset(struct cpu_stat *stat);
+
+uint64_t
+cpu_stat_on_csw(struct cpu_stat *stat);
+
+double
+cpu_stat_end(struct cpu_stat *stat, struct clock_stat *cord_clock_stat);
+
+#endif /* ENABLE_FIBER_TOP */
 
 enum { FIBER_NAME_MAX = 32 };
 
@@ -402,21 +470,7 @@ struct fiber {
 	/** Fiber flags */
 	uint32_t flags;
 #if ENABLE_FIBER_TOP
-	/**
-	 * Accumulated clock value calculated using exponential
-	 * moving average.
-	 */
-	uint64_t clock_acc;
-	/**
-	 * Total processor time this fiber has spent with
-	 * 1 / FIBER_TIME_RES second precision.
-	 */
-	uint64_t cputime;
-	/**
-	 * Clock delta calculated on previous event loop iteration.
-	 */
-	uint64_t clock_delta_last;
-	uint64_t clock_delta;
+	struct clock_stat clock_stat;
 #endif /* ENABLE_FIBER_TOP */
 	/** Link in cord->alive or cord->dead list. */
 	struct rlist link;
@@ -490,21 +544,8 @@ struct cord {
 	 */
 	uint32_t max_fid;
 #if ENABLE_FIBER_TOP
-	uint64_t clock_acc;
-	uint64_t clock_delta;
-	uint64_t clock_delta_last;
-	uint64_t clock_last;
-	uint32_t cpu_id_last;
-	uint32_t cpu_miss_count;
-	uint32_t cpu_miss_count_last;
-	/**
-	 * This thread's CPU time at the beginning of event loop
-	 * iteration. Used to calculate how much cpu time has
-	 * each loop iteration consumed and update fiber cpu
-	 * times propotionally. The resolution is
-	 * 1 / FIBER_TIME_RES seconds.
-	 */
-	uint64_t cputime_last;
+	struct clock_stat clock_stat;
+	struct cpu_stat cpu_stat;
 #endif /* ENABLE_FIBER_TOP */
 	pthread_t id;
 	const struct cord_on_exit *on_exit;
diff --git a/src/lua/fiber.c b/src/lua/fiber.c
index 8b3b22e55..53ebec9aa 100644
--- a/src/lua/fiber.c
+++ b/src/lua/fiber.c
@@ -278,7 +278,7 @@ lbox_fiber_statof(struct fiber *f, void *cb_ctx, bool backtrace)
 	lua_settable(L, -3);
 
 	lua_pushliteral(L, "time");
-	lua_pushnumber(L, f->cputime / (double) FIBER_TIME_RES);
+	lua_pushnumber(L, f->clock_stat.cputime / (double) FIBER_TIME_RES);
 	lua_settable(L, -3);
 
 	lua_pushliteral(L, "memory");
@@ -335,13 +335,23 @@ lbox_fiber_top_entry(struct fiber *f, void *cb_ctx)
 	lua_newtable(L);
 
 	lua_pushliteral(L, "average");
-	lua_pushnumber(L, f->clock_acc / (double)cord()->clock_acc * 100);
+	if (cord()->clock_stat.acc != 0) {
+		lua_pushnumber(L, f->clock_stat.acc /
+				  (double)cord()->clock_stat.acc * 100);
+	} else {
+		lua_pushnumber(L, 0);
+	}
 	lua_settable(L, -3);
 	lua_pushliteral(L, "instant");
-	lua_pushnumber(L, f->clock_delta_last / (double)cord()->clock_delta_last * 100);
+	if (cord()->clock_stat.prev_delta != 0) {
+		lua_pushnumber(L, f->clock_stat.prev_delta /
+				  (double)cord()->clock_stat.prev_delta * 100);
+	} else {
+		lua_pushnumber(L, 0);
+	}
 	lua_settable(L, -3);
 	lua_pushliteral(L, "time");
-	lua_pushnumber(L, f->cputime / (double) FIBER_TIME_RES);
+	lua_pushnumber(L, f->clock_stat.cputime / (double) FIBER_TIME_RES);
 	lua_settable(L, -3);
 	lua_settable(L, -3);
 
@@ -357,7 +367,7 @@ lbox_fiber_top(struct lua_State *L)
 	}
 	lua_newtable(L);
 	lua_pushliteral(L, "cpu_misses");
-	lua_pushnumber(L, cord()->cpu_miss_count_last);
+	lua_pushnumber(L, cord()->cpu_stat.prev_cpu_miss_count);
 	lua_settable(L, -3);
 
 	lua_pushliteral(L, "cpu");
-- 
2.21.0 (Apple Git-122)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Tarantool-patches] [PATCH v3 2/3] fiber.top(): alter exponential moving average calculation
  2019-11-18 16:05 [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup Serge Petrenko
  2019-11-18 16:05 ` [Tarantool-patches] [PATCH v3 1/3] fiber.top() refactor clock and cpu time calculation Serge Petrenko
@ 2019-11-18 16:05 ` Serge Petrenko
  2019-11-18 16:05 ` [Tarantool-patches] [PATCH v3 3/3] app/fiber: wait till a full event loop iteration ends Serge Petrenko
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Serge Petrenko @ 2019-11-18 16:05 UTC (permalink / raw)
  To: v.shpilevoy; +Cc: tarantool-patches

When fiber EMA is 0 and first non-zero observation is added to it, we assumed
that EMA should be equal to this observation (i.e. average value should
be the same as the observed one). This breaks the following invariant:
sum of clock EMAs of all fibers equals clock EMA of the thread.
If one of the fibers is just spawned and has a big clock delta, it
will assign this delta to its EMA, while the thread will calculate the
new EMA as 15 * EMA / 16 + delta / 16, which may lead to a situation
when fiber EMA is greater than cord EMA.

This caused occasional test failures:
```
[001] Test failed! Result content mismatch:
[001] --- app/fiber.result	Mon Nov 18 17:00:48 2019
[001] +++ app/fiber.reject	Mon Nov 18 17:33:10 2019
[001] @@ -1511,7 +1511,7 @@
[001]  -- not exact due to accumulated integer division errors
[001]  sum_avg > 99 and sum_avg < 101 or sum_avg
[001]  ---
[001] -- true
[001] +- 187.59585601717
[001]  ...
[001]  tbl = nil
[001]  ---

```

Follow-up #2694
---
 src/lib/core/fiber.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/lib/core/fiber.c b/src/lib/core/fiber.c
index 1e08d0ec9..232c29911 100644
--- a/src/lib/core/fiber.c
+++ b/src/lib/core/fiber.c
@@ -1098,11 +1098,7 @@ loop_on_iteration_start(ev_loop *loop, ev_check *watcher, int revents)
 static inline uint64_t
 clock_diff_accumulate(uint64_t acc, uint64_t delta)
 {
-	if (acc > 0) {
-		return delta / 16 + 15 * acc / 16;
-	} else {
-		return delta;
-	}
+	return delta / 16 + 15 * acc / 16;
 }
 
 inline void
-- 
2.21.0 (Apple Git-122)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [Tarantool-patches] [PATCH v3 3/3] app/fiber: wait till a full event loop iteration ends
  2019-11-18 16:05 [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup Serge Petrenko
  2019-11-18 16:05 ` [Tarantool-patches] [PATCH v3 1/3] fiber.top() refactor clock and cpu time calculation Serge Petrenko
  2019-11-18 16:05 ` [Tarantool-patches] [PATCH v3 2/3] fiber.top(): alter exponential moving average calculation Serge Petrenko
@ 2019-11-18 16:05 ` Serge Petrenko
  2019-11-19 22:57 ` [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup Vladislav Shpilevoy
  2019-11-21 17:07 ` Kirill Yukhin
  4 siblings, 0 replies; 11+ messages in thread
From: Serge Petrenko @ 2019-11-18 16:05 UTC (permalink / raw)
  To: v.shpilevoy; +Cc: tarantool-patches

fiber.top() fills in statistics every event loop iteration,
so if it was just enabled, fiber.top() returns zero in fiber cpu
usage statistics because total time consumed by the main thread was
not yet accounted for.
Same stands for viewing top() results for a freshly created fiber:
its metrics will be zero since it hasn't lived a full ev loop iteration
yet.
Fix this by delaying the test till top() results are meaningful and add
minor refactoring.

Follow-up #2694
---
 test/app/fiber.result   | 38 +++++++++++++++++++++++++++++---------
 test/app/fiber.test.lua | 36 ++++++++++++++++++++++++++++--------
 2 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/test/app/fiber.result b/test/app/fiber.result
index 4a094939f..b767bbb59 100644
--- a/test/app/fiber.result
+++ b/test/app/fiber.result
@@ -1469,6 +1469,12 @@ sum = 0
 fiber.top_enable()
 ---
 ...
+-- Wait till a full event loop iteration passes, so that
+-- top() contains meaningful results. On the ev loop iteration
+-- following fiber.top_enable() results will be zero.
+while fiber.top().cpu["1/sched"].instant == 0 do fiber.yield() end
+---
+...
 a = fiber.top()
 ---
 ...
@@ -1508,33 +1514,47 @@ sum_inst
 ---
 - 100
 ...
--- not exact due to accumulated integer division errors
-sum_avg > 99 and sum_avg < 101 or sum_avg
+-- when a fiber dies, its impact on the thread moving average
+-- persists for a couple of ev loop iterations, but it is no
+-- longer listed in fiber.top(). So sum_avg may way smaller than
+-- 100%. See gh-4625 for details and reenable the test below as
+-- soon as it is implemented.
+-- In rare cases when a fiber dies on the same event loop
+-- iteration as you issue fiber.top(), sum_inst will also be
+-- smaller than 100%, but it is so rare I don't even want to
+-- disable the test above.
+sum_avg <= 100 or sum_avg
 ---
 - true
 ...
+-- not exact due to accumulated integer division errors
+--sum_avg > 99 and sum_avg <= 100 or sum_avg
 tbl = nil
 ---
 ...
 f = fiber.new(function()\
-    for i = 1,1000 do end\
-    fiber.yield()\
-    tbl = fiber.top().cpu[fiber.self().id()..'/'..fiber.self().name()]\
+    local fiber_key = fiber.self().id()..'/'..fiber.self().name()\
+    tbl = fiber.top().cpu[fiber_key]\
+    while tbl.time == 0 do\
+        for i = 1,1000 do end\
+        fiber.yield()\
+        tbl = fiber.top().cpu[fiber_key]\
+    end\
 end)
 ---
 ...
-while f:status() ~= 'dead' do fiber.sleep(0.01) end
+while f:status() ~= 'dead' do fiber.yield() end
 ---
 ...
-tbl["average"] > 0
+tbl.average > 0
 ---
 - true
 ...
-tbl["instant"] > 0
+tbl.instant > 0
 ---
 - true
 ...
-tbl["time"] > 0
+tbl.time > 0
 ---
 - true
 ...
diff --git a/test/app/fiber.test.lua b/test/app/fiber.test.lua
index 38b85d554..7f7350acc 100644
--- a/test/app/fiber.test.lua
+++ b/test/app/fiber.test.lua
@@ -634,6 +634,12 @@ sum = 0
 -- gh-2694 fiber.top()
 fiber.top_enable()
 
+
+-- Wait till a full event loop iteration passes, so that
+-- top() contains meaningful results. On the ev loop iteration
+-- following fiber.top_enable() results will be zero.
+while fiber.top().cpu["1/sched"].instant == 0 do fiber.yield() end
+
 a = fiber.top()
 type(a)
 -- scheduler is present in fiber.top()
@@ -653,18 +659,32 @@ for k, v in pairs(a) do\
 end
 
 sum_inst
+-- when a fiber dies, its impact on the thread moving average
+-- persists for a couple of ev loop iterations, but it is no
+-- longer listed in fiber.top(). So sum_avg may way smaller than
+-- 100%. See gh-4625 for details and reenable the test below as
+-- soon as it is implemented.
+-- In rare cases when a fiber dies on the same event loop
+-- iteration as you issue fiber.top(), sum_inst will also be
+-- smaller than 100%, but it is so rare I don't even want to
+-- disable the test above.
+sum_avg <= 100 or sum_avg
 -- not exact due to accumulated integer division errors
-sum_avg > 99 and sum_avg < 101 or sum_avg
+--sum_avg > 99 and sum_avg <= 100 or sum_avg
 tbl = nil
 f = fiber.new(function()\
-    for i = 1,1000 do end\
-    fiber.yield()\
-    tbl = fiber.top().cpu[fiber.self().id()..'/'..fiber.self().name()]\
+    local fiber_key = fiber.self().id()..'/'..fiber.self().name()\
+    tbl = fiber.top().cpu[fiber_key]\
+    while tbl.time == 0 do\
+        for i = 1,1000 do end\
+        fiber.yield()\
+        tbl = fiber.top().cpu[fiber_key]\
+    end\
 end)
-while f:status() ~= 'dead' do fiber.sleep(0.01) end
-tbl["average"] > 0
-tbl["instant"] > 0
-tbl["time"] > 0
+while f:status() ~= 'dead' do fiber.yield() end
+tbl.average > 0
+tbl.instant > 0
+tbl.time > 0
 
 fiber.top_disable()
 fiber.top()
-- 
2.21.0 (Apple Git-122)

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Tarantool-patches] [PATCH v3 1/3] fiber.top() refactor clock and cpu time calculation
  2019-11-18 16:05 ` [Tarantool-patches] [PATCH v3 1/3] fiber.top() refactor clock and cpu time calculation Serge Petrenko
@ 2019-11-18 22:25   ` Vladislav Shpilevoy
  2019-11-19  7:14     ` Sergey Petrenko
  0 siblings, 1 reply; 11+ messages in thread
From: Vladislav Shpilevoy @ 2019-11-18 22:25 UTC (permalink / raw)
  To: Serge Petrenko; +Cc: tarantool-patches

Hi! Thanks for the fixes!

I suggest you to move all functions related to cpu and clock
state to fiber.c and make them static there. In order not to
pollute the global namespace with fiber's private functions.

I did that in a commit above this one. You may squash, or redo,
whatever:

===================================================================

diff --git a/src/lib/core/fiber.c b/src/lib/core/fiber.c
index 1e08d0ec9..3faa198d3 100644
--- a/src/lib/core/fiber.c
+++ b/src/lib/core/fiber.c
@@ -44,6 +44,112 @@
 
 #if ENABLE_FIBER_TOP
 #include <x86intrin.h> /* __rdtscp() */
+
+static inline void
+clock_stat_add_delta(struct clock_stat *stat, uint64_t clock_delta)
+{
+	stat->delta += clock_delta;
+}
+
+/**
+ * Calculate the exponential moving average for the clock deltas
+ * per loop iteration. The coeffitient is 1/16.
+ */
+static inline uint64_t
+clock_diff_accumulate(uint64_t acc, uint64_t delta)
+{
+	if (acc > 0)
+		return delta / 16 + 15 * acc / 16;
+	else
+		return delta;
+}
+
+static inline void
+clock_stat_update(struct clock_stat *stat, double nsec_per_clock)
+{
+	stat->acc = clock_diff_accumulate(stat->acc, stat->delta);
+	stat->prev_delta = stat->delta;
+	stat->cputime += stat->delta * nsec_per_clock;
+	stat->delta = 0;
+}
+
+static inline void
+clock_stat_reset(struct clock_stat *stat)
+{
+	stat->acc = 0;
+	stat->delta = 0;
+	stat->prev_delta = 0;
+	stat->cputime = 0;
+}
+
+static void
+cpu_stat_start(struct cpu_stat *stat)
+{
+	stat->prev_clock = __rdtscp(&stat->prev_cpu_id);
+	stat->cpu_miss_count = 0;
+	/*
+	 * We want to measure thread cpu time here to calculate
+	 * each fiber's cpu time, so don't use libev's ev_now() or
+	 * ev_time() since they use either monotonic or realtime
+	 * system clocks.
+	 */
+	struct timespec ts;
+	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
+		say_debug("clock_gettime(): failed to get this thread's"
+			  " cpu time.");
+		return;
+	}
+	stat->prev_cputime = (uint64_t) ts.tv_sec * FIBER_TIME_RES + ts.tv_nsec;
+}
+
+static inline void
+cpu_stat_reset(struct cpu_stat *stat)
+{
+	stat->prev_cpu_miss_count = 0;
+	cpu_stat_start(stat);
+}
+
+static uint64_t
+cpu_stat_on_csw(struct cpu_stat *stat)
+{
+	uint32_t cpu_id;
+	uint64_t delta, clock = __rdtscp(&cpu_id);
+
+	if (cpu_id == stat->prev_cpu_id) {
+		delta = clock - stat->prev_clock;
+	} else {
+		delta = 0;
+		stat->prev_cpu_id = cpu_id;
+		stat->cpu_miss_count++;
+	}
+	stat->prev_clock = clock;
+
+	return delta;
+}
+
+static double
+cpu_stat_end(struct cpu_stat *stat, struct clock_stat *cord_clock_stat)
+{
+	stat->prev_cpu_miss_count = stat->cpu_miss_count;
+	stat->cpu_miss_count = 0;
+
+	struct timespec ts;
+	uint64_t delta_time;
+	double nsec_per_clock = 0;
+	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
+		say_debug("clock_gettime(): failed to get this thread's"
+			  " cpu time.");
+	} else {
+		delta_time = (uint64_t) ts.tv_sec * FIBER_TIME_RES +
+			     ts.tv_nsec;
+		if (delta_time > stat->prev_cputime && cord_clock_stat->delta > 0) {
+			delta_time -= stat->prev_cputime;
+			nsec_per_clock = (double) delta_time / cord()->clock_stat.delta;
+		}
+	}
+	return nsec_per_clock;
+}
+
 #endif /* ENABLE_FIBER_TOP */
 
 #include "third_party/valgrind/memcheck.h"
@@ -88,9 +194,6 @@ static int (*fiber_invoke)(fiber_func f, va_list ap);
 
 #if ENABLE_FIBER_TOP
 static __thread bool fiber_top_enabled = false;
-
-uint64_t
-cpu_stat_on_csw(struct cpu_stat *stat);
 #endif /* ENABLE_FIBER_TOP */
 
 /**
@@ -1091,112 +1194,6 @@ loop_on_iteration_start(ev_loop *loop, ev_check *watcher, int revents)
 	cpu_stat_start(&cord()->cpu_stat);
 }
 
-/**
- * Calculate the exponential moving average for the clock deltas
- * per loop iteration. The coeffitient is 1/16.
- */
-static inline uint64_t
-clock_diff_accumulate(uint64_t acc, uint64_t delta)
-{
-	if (acc > 0) {
-		return delta / 16 + 15 * acc / 16;
-	} else {
-		return delta;
-	}
-}
-
-inline void
-clock_stat_add_delta(struct clock_stat *stat, uint64_t clock_delta)
-{
-	stat->delta += clock_delta;
-}
-
-void
-clock_stat_update(struct clock_stat *stat, double nsec_per_clock)
-{
-	stat->acc = clock_diff_accumulate(stat->acc, stat->delta);
-	stat->prev_delta = stat->delta;
-	stat->cputime += stat->delta * nsec_per_clock;
-	stat->delta = 0;
-}
-
-void
-clock_stat_reset(struct clock_stat *stat)
-{
-	stat->acc = 0;
-	stat->delta = 0;
-	stat->prev_delta = 0;
-	stat->cputime = 0;
-}
-
-void
-cpu_stat_start(struct cpu_stat *stat)
-{
-	stat->prev_clock = __rdtscp(&stat->prev_cpu_id);
-	stat->cpu_miss_count = 0;
-	/*
-	 * We want to measure thread cpu time here to calculate
-	 * each fiber's cpu time, so don't use libev's ev_now() or
-	 * ev_time() since they use either monotonic or realtime
-	 * system clocks.
-	 */
-	struct timespec ts;
-	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
-		say_debug("clock_gettime(): failed to get this thread's"
-			  " cpu time.");
-		return;
-	}
-	stat->prev_cputime = (uint64_t) ts.tv_sec * FIBER_TIME_RES + ts.tv_nsec;
-}
-
-void
-cpu_stat_reset(struct cpu_stat *stat)
-{
-	stat->prev_cpu_miss_count = 0;
-	cpu_stat_start(stat);
-}
-
-uint64_t
-cpu_stat_on_csw(struct cpu_stat *stat)
-{
-	uint32_t cpu_id;
-	uint64_t delta = 0;
-	uint64_t clock = __rdtscp(&cpu_id);
-
-	if (cpu_id == stat->prev_cpu_id) {
-		delta = clock - stat->prev_clock;
-	} else {
-		stat->prev_cpu_id = cpu_id;
-		stat->cpu_miss_count++;
-	}
-	stat->prev_clock = clock;
-
-	return delta;
-}
-
-double
-cpu_stat_end(struct cpu_stat *stat, struct clock_stat *cord_clock_stat)
-{
-	stat->prev_cpu_miss_count = stat->cpu_miss_count;
-	stat->cpu_miss_count = 0;
-
-	struct timespec ts;
-	uint64_t delta_time;
-	double nsec_per_clock = 0;
-	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
-		say_debug("clock_gettime(): failed to get this thread's"
-			  " cpu time.");
-	} else {
-		delta_time = (uint64_t) ts.tv_sec * FIBER_TIME_RES +
-			     ts.tv_nsec;
-		if (delta_time > stat->prev_cputime && cord_clock_stat->delta > 0) {
-			delta_time -= stat->prev_cputime;
-			nsec_per_clock = (double) delta_time / cord()->clock_stat.delta;
-		}
-	}
-	return nsec_per_clock;
-}
-
 static void
 loop_on_iteration_end(ev_loop *loop, ev_prepare *watcher, int revents)
 {
diff --git a/src/lib/core/fiber.h b/src/lib/core/fiber.h
index 06ce28bb1..c5b975513 100644
--- a/src/lib/core/fiber.h
+++ b/src/lib/core/fiber.h
@@ -91,15 +91,6 @@ struct clock_stat {
 	uint64_t cputime;
 };
 
-void
-clock_stat_add_delta(struct clock_stat *stat, uint64_t clock_delta);
-
-void
-clock_stat_update(struct clock_stat *stat, double nsec_per_clock);
-
-void
-clock_stat_reset(struct clock_stat *stat);
-
 /**
  * A struct encapsulating all knowledge this cord has about cpu
  * clocks and their state.
@@ -119,18 +110,6 @@ struct cpu_stat {
 	uint32_t prev_cpu_miss_count;
 };
 
-void
-cpu_stat_start(struct cpu_stat *stat);
-
-void
-cpu_stat_reset(struct cpu_stat *stat);
-
-uint64_t
-cpu_stat_on_csw(struct cpu_stat *stat);
-
-double
-cpu_stat_end(struct cpu_stat *stat, struct clock_stat *cord_clock_stat);
-
 #endif /* ENABLE_FIBER_TOP */
 
 enum { FIBER_NAME_MAX = 32 };

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Tarantool-patches] [PATCH v3 1/3] fiber.top() refactor clock and cpu time calculation
  2019-11-18 22:25   ` Vladislav Shpilevoy
@ 2019-11-19  7:14     ` Sergey Petrenko
  0 siblings, 0 replies; 11+ messages in thread
From: Sergey Petrenko @ 2019-11-19  7:14 UTC (permalink / raw)
  To: Vladislav Shpilevoy; +Cc: tarantool-patches

[-- Attachment #1: Type: text/plain, Size: 7571 bytes --]


>Вторник, 19 ноября 2019, 1:19 +03:00 от Vladislav Shpilevoy <v.shpilevoy@tarantool.org>:
>
>Hi! Thanks for the fixes!
>
>I suggest you to move all functions related to cpu and clock
>state to fiber.c and make them static there. In order not to
>pollute the global namespace with fiber's private functions.
>
>I did that in a commit above this one. You may squash, or redo,
>whatever:
Hi! Thanks, I squashed.
>
>
>===================================================================
>
>diff --git a/src/lib/core/fiber.c b/src/lib/core/fiber.c
>index 1e08d0ec9..3faa198d3 100644
>--- a/src/lib/core/fiber.c
>+++ b/src/lib/core/fiber.c
>@@ -44,6 +44,112 @@
>
> #if ENABLE_FIBER_TOP
> #include <x86intrin.h> /* __rdtscp() */
>+
>+static inline void
>+clock_stat_add_delta(struct clock_stat *stat, uint64_t clock_delta)
>+{
>+	stat->delta += clock_delta;
>+}
>+
>+/**
>+ * Calculate the exponential moving average for the clock deltas
>+ * per loop iteration. The coeffitient is 1/16.
>+ */
>+static inline uint64_t
>+clock_diff_accumulate(uint64_t acc, uint64_t delta)
>+{
>+	if (acc > 0)
>+		return delta / 16 + 15 * acc / 16;
>+	else
>+		return delta;
>+}
>+
>+static inline void
>+clock_stat_update(struct clock_stat *stat, double nsec_per_clock)
>+{
>+	stat->acc = clock_diff_accumulate(stat->acc, stat->delta);
>+	stat->prev_delta = stat->delta;
>+	stat->cputime += stat->delta * nsec_per_clock;
>+	stat->delta = 0;
>+}
>+
>+static inline void
>+clock_stat_reset(struct clock_stat *stat)
>+{
>+	stat->acc = 0;
>+	stat->delta = 0;
>+	stat->prev_delta = 0;
>+	stat->cputime = 0;
>+}
>+
>+static void
>+cpu_stat_start(struct cpu_stat *stat)
>+{
>+	stat->prev_clock = __rdtscp(&stat->prev_cpu_id);
>+	stat->cpu_miss_count = 0;
>+	/*
>+	 * We want to measure thread cpu time here to calculate
>+	 * each fiber's cpu time, so don't use libev's ev_now() or
>+	 * ev_time() since they use either monotonic or realtime
>+	 * system clocks.
>+	 */
>+	struct timespec ts;
>+	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
>+		say_debug("clock_gettime(): failed to get this thread's"
>+			  " cpu time.");
>+		return;
>+	}
>+	stat->prev_cputime = (uint64_t) ts.tv_sec * FIBER_TIME_RES + ts.tv_nsec;
>+}
>+
>+static inline void
>+cpu_stat_reset(struct cpu_stat *stat)
>+{
>+	stat->prev_cpu_miss_count = 0;
>+	cpu_stat_start(stat);
>+}
>+
>+static uint64_t
>+cpu_stat_on_csw(struct cpu_stat *stat)
>+{
>+	uint32_t cpu_id;
>+	uint64_t delta, clock = __rdtscp(&cpu_id);
>+
>+	if (cpu_id == stat->prev_cpu_id) {
>+		delta = clock - stat->prev_clock;
>+	} else {
>+		delta = 0;
>+		stat->prev_cpu_id = cpu_id;
>+		stat->cpu_miss_count++;
>+	}
>+	stat->prev_clock = clock;
>+
>+	return delta;
>+}
>+
>+static double
>+cpu_stat_end(struct cpu_stat *stat, struct clock_stat *cord_clock_stat)
>+{
>+	stat->prev_cpu_miss_count = stat->cpu_miss_count;
>+	stat->cpu_miss_count = 0;
>+
>+	struct timespec ts;
>+	uint64_t delta_time;
>+	double nsec_per_clock = 0;
>+	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
>+		say_debug("clock_gettime(): failed to get this thread's"
>+			  " cpu time.");
>+	} else {
>+		delta_time = (uint64_t) ts.tv_sec * FIBER_TIME_RES +
>+			     ts.tv_nsec;
>+		if (delta_time > stat->prev_cputime && cord_clock_stat->delta > 0) {
>+			delta_time -= stat->prev_cputime;
>+			nsec_per_clock = (double) delta_time / cord()->clock_stat.delta;
>+		}
>+	}
>+	return nsec_per_clock;
>+}
>+
> #endif /* ENABLE_FIBER_TOP */
>
> #include "third_party/valgrind/memcheck.h"
>@@ -88,9 +194,6 @@ static int (*fiber_invoke)(fiber_func f, va_list ap);
>
> #if ENABLE_FIBER_TOP
> static __thread bool fiber_top_enabled = false;
>-
>-uint64_t
>-cpu_stat_on_csw(struct cpu_stat *stat);
> #endif /* ENABLE_FIBER_TOP */
>
> /**
>@@ -1091,112 +1194,6 @@ loop_on_iteration_start(ev_loop *loop, ev_check *watcher, int revents)
> 	cpu_stat_start(&cord()->cpu_stat);
> }
>
>-/**
>- * Calculate the exponential moving average for the clock deltas
>- * per loop iteration. The coeffitient is 1/16.
>- */
>-static inline uint64_t
>-clock_diff_accumulate(uint64_t acc, uint64_t delta)
>-{
>-	if (acc > 0) {
>-		return delta / 16 + 15 * acc / 16;
>-	} else {
>-		return delta;
>-	}
>-}
>-
>-inline void
>-clock_stat_add_delta(struct clock_stat *stat, uint64_t clock_delta)
>-{
>-	stat->delta += clock_delta;
>-}
>-
>-void
>-clock_stat_update(struct clock_stat *stat, double nsec_per_clock)
>-{
>-	stat->acc = clock_diff_accumulate(stat->acc, stat->delta);
>-	stat->prev_delta = stat->delta;
>-	stat->cputime += stat->delta * nsec_per_clock;
>-	stat->delta = 0;
>-}
>-
>-void
>-clock_stat_reset(struct clock_stat *stat)
>-{
>-	stat->acc = 0;
>-	stat->delta = 0;
>-	stat->prev_delta = 0;
>-	stat->cputime = 0;
>-}
>-
>-void
>-cpu_stat_start(struct cpu_stat *stat)
>-{
>-	stat->prev_clock = __rdtscp(&stat->prev_cpu_id);
>-	stat->cpu_miss_count = 0;
>-	/*
>-	 * We want to measure thread cpu time here to calculate
>-	 * each fiber's cpu time, so don't use libev's ev_now() or
>-	 * ev_time() since they use either monotonic or realtime
>-	 * system clocks.
>-	 */
>-	struct timespec ts;
>-	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
>-		say_debug("clock_gettime(): failed to get this thread's"
>-			  " cpu time.");
>-		return;
>-	}
>-	stat->prev_cputime = (uint64_t) ts.tv_sec * FIBER_TIME_RES + ts.tv_nsec;
>-}
>-
>-void
>-cpu_stat_reset(struct cpu_stat *stat)
>-{
>-	stat->prev_cpu_miss_count = 0;
>-	cpu_stat_start(stat);
>-}
>-
>-uint64_t
>-cpu_stat_on_csw(struct cpu_stat *stat)
>-{
>-	uint32_t cpu_id;
>-	uint64_t delta = 0;
>-	uint64_t clock = __rdtscp(&cpu_id);
>-
>-	if (cpu_id == stat->prev_cpu_id) {
>-		delta = clock - stat->prev_clock;
>-	} else {
>-		stat->prev_cpu_id = cpu_id;
>-		stat->cpu_miss_count++;
>-	}
>-	stat->prev_clock = clock;
>-
>-	return delta;
>-}
>-
>-double
>-cpu_stat_end(struct cpu_stat *stat, struct clock_stat *cord_clock_stat)
>-{
>-	stat->prev_cpu_miss_count = stat->cpu_miss_count;
>-	stat->cpu_miss_count = 0;
>-
>-	struct timespec ts;
>-	uint64_t delta_time;
>-	double nsec_per_clock = 0;
>-	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) != 0) {
>-		say_debug("clock_gettime(): failed to get this thread's"
>-			  " cpu time.");
>-	} else {
>-		delta_time = (uint64_t) ts.tv_sec * FIBER_TIME_RES +
>-			     ts.tv_nsec;
>-		if (delta_time > stat->prev_cputime && cord_clock_stat->delta > 0) {
>-			delta_time -= stat->prev_cputime;
>-			nsec_per_clock = (double) delta_time / cord()->clock_stat.delta;
>-		}
>-	}
>-	return nsec_per_clock;
>-}
>-
> static void
> loop_on_iteration_end(ev_loop *loop, ev_prepare *watcher, int revents)
> {
>diff --git a/src/lib/core/fiber.h b/src/lib/core/fiber.h
>index 06ce28bb1..c5b975513 100644
>--- a/src/lib/core/fiber.h
>+++ b/src/lib/core/fiber.h
>@@ -91,15 +91,6 @@ struct clock_stat {
> 	uint64_t cputime;
> };
>
>-void
>-clock_stat_add_delta(struct clock_stat *stat, uint64_t clock_delta);
>-
>-void
>-clock_stat_update(struct clock_stat *stat, double nsec_per_clock);
>-
>-void
>-clock_stat_reset(struct clock_stat *stat);
>-
> /**
>  * A struct encapsulating all knowledge this cord has about cpu
>  * clocks and their state.
>@@ -119,18 +110,6 @@ struct cpu_stat {
> 	uint32_t prev_cpu_miss_count;
> };
>
>-void
>-cpu_stat_start(struct cpu_stat *stat);
>-
>-void
>-cpu_stat_reset(struct cpu_stat *stat);
>-
>-uint64_t
>-cpu_stat_on_csw(struct cpu_stat *stat);
>-
>-double
>-cpu_stat_end(struct cpu_stat *stat, struct clock_stat *cord_clock_stat);
>-
> #endif /* ENABLE_FIBER_TOP */
>
> enum { FIBER_NAME_MAX = 32 };


-- 
Sergey Petrenko

[-- Attachment #2: Type: text/html, Size: 9279 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup
  2019-11-18 16:05 [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup Serge Petrenko
                   ` (2 preceding siblings ...)
  2019-11-18 16:05 ` [Tarantool-patches] [PATCH v3 3/3] app/fiber: wait till a full event loop iteration ends Serge Petrenko
@ 2019-11-19 22:57 ` Vladislav Shpilevoy
  2019-11-20  1:44   ` Alexander Turenko
  2019-11-21 17:07 ` Kirill Yukhin
  4 siblings, 1 reply; 11+ messages in thread
From: Vladislav Shpilevoy @ 2019-11-19 22:57 UTC (permalink / raw)
  To: Serge Petrenko; +Cc: tarantool-patches

Hi! Thanks for the fixes!

LGTM.

On 18/11/2019 17:05, Serge Petrenko wrote:
> The first patch factors all the clock-related members of struct fiber and struct
> cord into appropriate structs and adds methods for their updates and resets.
> 
> The second patch fixes exponential moving average calculation so that we do not
> experience huge numbers in average load percentage calculations.
> 
> The third patch alters fiber.top() test to wait for correct output before
> testing it. 
> 
> Follow-up https://github.com/tarantool/tarantool/issues/2694
> Branch https://github.com/tarantool/tarantool/tree/sp/gh-2694-test-fixup
> 
> Changes in v3:
>  - introduce a new patch which refactors all the clock stat
>    handling.
>  - introduce a patch fixing EMA calculation
>  - review fixes as per review from Vladislav
> 
> Changes in v2:
>  - clean up all fibers clock stats on fiber.top_enable()
>  - push 0 instead of NaN when cord clock_delta_last is 0
>  - review fixes as per review from Vladislav
> 
> Serge Petrenko (3):
>   fiber.top() refactor clock and cpu time calculation
>   fiber.top(): alter exponential moving average calculation
>   app/fiber: wait till a full event loop iteration ends
> 
>  src/lib/core/fiber.c    | 175 ++++++++++++++++++++++++----------------
>  src/lib/core/fiber.h    | 109 +++++++++++++++++--------
>  src/lua/fiber.c         |  20 +++--
>  test/app/fiber.result   |  38 ++++++---
>  test/app/fiber.test.lua |  36 +++++++--
>  5 files changed, 252 insertions(+), 126 deletions(-)
> 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup
  2019-11-19 22:57 ` [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup Vladislav Shpilevoy
@ 2019-11-20  1:44   ` Alexander Turenko
  2019-11-20  8:29     ` Serge Petrenko
  0 siblings, 1 reply; 11+ messages in thread
From: Alexander Turenko @ 2019-11-20  1:44 UTC (permalink / raw)
  To: Serge Petrenko; +Cc: tarantool-patches, Vladislav Shpilevoy

It still shows miscompares for me:

$ cat /proc/cpuinfo | grep processor | wc -l
8
$ (cd test && ./test-run.py $(yes fiber.test.lua | head -n 1000))

commit aefc64faaf7bfbb58ac53bf5abea55d1faabcbfe

The miscompare looks so:

[006] Test failed! Result content mismatch:
[006] --- app/fiber.result	Tue Nov 19 20:39:21 2019
[006] +++ app/fiber.reject	Wed Nov 20 04:33:39 2019
[006] @@ -1525,7 +1525,7 @@
[006]  -- disable the test above.
[006]  sum_avg <= 100 or sum_avg
[006]  ---
[006] -- true
[006] +- 100
[006]  ...
[006]  -- not exact due to accumulated integer division errors
[006]  --sum_avg > 99 and sum_avg <= 100 or sum_avg

Maybe it is some rounding issue.

When I changed the condition to sum_avg <= 101, I got:

[015] Test failed! Result content mismatch:
[015] --- app/fiber.result	Wed Nov 20 04:35:14 2019
[015] +++ app/fiber.reject	Wed Nov 20 04:35:36 2019
[015] @@ -1512,7 +1512,7 @@
[015]  ...
[015]  sum_inst
[015]  ---
[015] -- 100
[015] +- 45.802853351342
[015]  ...
[015]  -- when a fiber dies, its impact on the thread moving average
[015]  -- persists for a couple of ev loop iterations, but it is no

Sorry, but it does not look okay for me in the sense that it may lead to
problems in testing: we already have enough ones.

Let's provide a test that will work stable or describe why it is not
possible. In the latter case we can extract an unstable test case and
mark it as fragile using suite.ini test-run's option. It will not
strictly close the problem, but will lower its probability.

Maybe we can even retry such tests or allows them to fail:
https://github.com/tarantool/test-run/issues/189

Anyway, if those fixes descrease probability of fails, I don't mind.
Just noted that a further work may be needed.

WBR, Alexander Turenko.

On Tue, Nov 19, 2019 at 11:57:11PM +0100, Vladislav Shpilevoy wrote:
> Hi! Thanks for the fixes!
> 
> LGTM.
> 
> On 18/11/2019 17:05, Serge Petrenko wrote:
> > The first patch factors all the clock-related members of struct fiber and struct
> > cord into appropriate structs and adds methods for their updates and resets.
> > 
> > The second patch fixes exponential moving average calculation so that we do not
> > experience huge numbers in average load percentage calculations.
> > 
> > The third patch alters fiber.top() test to wait for correct output before
> > testing it. 
> > 
> > Follow-up https://github.com/tarantool/tarantool/issues/2694
> > Branch https://github.com/tarantool/tarantool/tree/sp/gh-2694-test-fixup
> > 
> > Changes in v3:
> >  - introduce a new patch which refactors all the clock stat
> >    handling.
> >  - introduce a patch fixing EMA calculation
> >  - review fixes as per review from Vladislav
> > 
> > Changes in v2:
> >  - clean up all fibers clock stats on fiber.top_enable()
> >  - push 0 instead of NaN when cord clock_delta_last is 0
> >  - review fixes as per review from Vladislav
> > 
> > Serge Petrenko (3):
> >   fiber.top() refactor clock and cpu time calculation
> >   fiber.top(): alter exponential moving average calculation
> >   app/fiber: wait till a full event loop iteration ends
> > 
> >  src/lib/core/fiber.c    | 175 ++++++++++++++++++++++++----------------
> >  src/lib/core/fiber.h    | 109 +++++++++++++++++--------
> >  src/lua/fiber.c         |  20 +++--
> >  test/app/fiber.result   |  38 ++++++---
> >  test/app/fiber.test.lua |  36 +++++++--
> >  5 files changed, 252 insertions(+), 126 deletions(-)
> > 

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup
  2019-11-20  1:44   ` Alexander Turenko
@ 2019-11-20  8:29     ` Serge Petrenko
  2019-11-20 12:19       ` Alexander Turenko
  0 siblings, 1 reply; 11+ messages in thread
From: Serge Petrenko @ 2019-11-20  8:29 UTC (permalink / raw)
  To: Alexander Turenko; +Cc: tarantool-patches, Vladislav Shpilevoy

[-- Attachment #1: Type: text/plain, Size: 6581 bytes --]

Hi! Thanks for your reply!
--
Serge Petrenko
sergepetrenko@tarantool.org




> 20 нояб. 2019 г., в 4:44, Alexander Turenko <alexander.turenko@tarantool.org> написал(а):
> 
> It still shows miscompares for me:
> 
> $ cat /proc/cpuinfo | grep processor | wc -l
> 8
> $ (cd test && ./test-run.py $(yes fiber.test.lua | head -n 1000))
> 
> commit aefc64faaf7bfbb58ac53bf5abea55d1faabcbfe
> 
> The miscompare looks so:
> 
> [006] Test failed! Result content mismatch:
> [006] --- app/fiber.result	Tue Nov 19 20:39:21 2019
> [006] +++ app/fiber.reject	Wed Nov 20 04:33:39 2019
> [006] @@ -1525,7 +1525,7 @@
> [006]  -- disable the test above.
> [006]  sum_avg <= 100 or sum_avg
> [006]  ---
> [006] -- true
> [006] +- 100
> [006]  ...
> [006]  -- not exact due to accumulated integer division errors
> [006]  --sum_avg > 99 and sum_avg <= 100 or sum_avg
> 
> Maybe it is some rounding issue.
> 
> When I changed the condition to sum_avg <= 101, I got:
> 
> [015] Test failed! Result content mismatch:
> [015] --- app/fiber.result	Wed Nov 20 04:35:14 2019
> [015] +++ app/fiber.reject	Wed Nov 20 04:35:36 2019
> [015] @@ -1512,7 +1512,7 @@
> [015]  ...
> [015]  sum_inst
> [015]  ---
> [015] -- 100
> [015] +- 45.802853351342
> [015]  ...
> [015]  -- when a fiber dies, its impact on the thread moving average
> [015]  -- persists for a couple of ev loop iterations, but it is no
> 
> Sorry, but it does not look okay for me in the sense that it may lead to
> problems in testing: we already have enough ones.
> 
> Let's provide a test that will work stable or describe why it is not
> possible. In the latter case we can extract an unstable test case and
> mark it as fragile using suite.ini test-run's option. It will not
> strictly close the problem, but will lower its probability.
> 
> Maybe we can even retry such tests or allows them to fail:
> https://github.com/tarantool/test-run/issues/189
> 
> Anyway, if those fixes descrease probability of fails, I don't mind.
> Just noted that a further work may be needed.

Well, yes, they do. Just try to run this test before the patch.
Anyway, I guess you’re right that we shouldn’t leave flaky tests if
possible, so I commented the flaky pieces out and added a request to
reenable them once #4625 (List dead fibers in fiber.top <http://fiber.top/>() output) is
implemented.

The diff’s below. Now 1000 out of 1000 tests pass with 16 concurrent
jobs, at least on my machine.


> 
> WBR, Alexander Turenko.
> 
> On Tue, Nov 19, 2019 at 11:57:11PM +0100, Vladislav Shpilevoy wrote:
>> Hi! Thanks for the fixes!
>> 
>> LGTM.
>> 
>> On 18/11/2019 17:05, Serge Petrenko wrote:
>>> The first patch factors all the clock-related members of struct fiber and struct
>>> cord into appropriate structs and adds methods for their updates and resets.
>>> 
>>> The second patch fixes exponential moving average calculation so that we do not
>>> experience huge numbers in average load percentage calculations.
>>> 
>>> The third patch alters fiber.top() test to wait for correct output before
>>> testing it. 
>>> 
>>> Follow-up https://github.com/tarantool/tarantool/issues/2694
>>> Branch https://github.com/tarantool/tarantool/tree/sp/gh-2694-test-fixup
>>> 
>>> Changes in v3:
>>> - introduce a new patch which refactors all the clock stat
>>>   handling.
>>> - introduce a patch fixing EMA calculation
>>> - review fixes as per review from Vladislav
>>> 
>>> Changes in v2:
>>> - clean up all fibers clock stats on fiber.top_enable()
>>> - push 0 instead of NaN when cord clock_delta_last is 0
>>> - review fixes as per review from Vladislav
>>> 
>>> Serge Petrenko (3):
>>>  fiber.top() refactor clock and cpu time calculation
>>>  fiber.top(): alter exponential moving average calculation
>>>  app/fiber: wait till a full event loop iteration ends
>>> 
>>> src/lib/core/fiber.c    | 175 ++++++++++++++++++++++++----------------
>>> src/lib/core/fiber.h    | 109 +++++++++++++++++--------
>>> src/lua/fiber.c         |  20 +++--
>>> test/app/fiber.result   |  38 ++++++---
>>> test/app/fiber.test.lua |  36 +++++++--
>>> 5 files changed, 252 insertions(+), 126 deletions(-)
>>> 


diff --git a/test/app/fiber.result b/test/app/fiber.result
index b767bbb59..6d9604ad8 100644
--- a/test/app/fiber.result
+++ b/test/app/fiber.result
@@ -1510,25 +1510,21 @@ for k, v in pairs(a) do\
 end
 ---
 ...
-sum_inst
----
-- 100
-...
 -- when a fiber dies, its impact on the thread moving average
 -- persists for a couple of ev loop iterations, but it is no
 -- longer listed in fiber.top(). So sum_avg may way smaller than
--- 100%. See gh-4625 for details and reenable the test below as
+-- 100%. See gh-4625 for details and reenable both tests below as
 -- soon as it is implemented.
 -- In rare cases when a fiber dies on the same event loop
 -- iteration as you issue fiber.top(), sum_inst will also be
--- smaller than 100%, but it is so rare I don't even want to
--- disable the test above.
-sum_avg <= 100 or sum_avg
+-- smaller than 100%.
+-- sum_inst
+sum_avg <= 100.1 or sum_avg
 ---
 - true
 ...
 -- not exact due to accumulated integer division errors
---sum_avg > 99 and sum_avg <= 100 or sum_avg
+--sum_avg > 99 and sum_avg <= 100.1 or sum_avg
 tbl = nil
 ---
 ...
diff --git a/test/app/fiber.test.lua b/test/app/fiber.test.lua
index 7f7350acc..6df210d9c 100644
--- a/test/app/fiber.test.lua
+++ b/test/app/fiber.test.lua
@@ -658,19 +658,18 @@ for k, v in pairs(a) do\
     sum_avg = sum_avg + v["average"]\
 end
 
-sum_inst
 -- when a fiber dies, its impact on the thread moving average
 -- persists for a couple of ev loop iterations, but it is no
 -- longer listed in fiber.top(). So sum_avg may way smaller than
--- 100%. See gh-4625 for details and reenable the test below as
+-- 100%. See gh-4625 for details and reenable both tests below as
 -- soon as it is implemented.
 -- In rare cases when a fiber dies on the same event loop
 -- iteration as you issue fiber.top(), sum_inst will also be
--- smaller than 100%, but it is so rare I don't even want to
--- disable the test above.
-sum_avg <= 100 or sum_avg
+-- smaller than 100%.
+-- sum_inst
+sum_avg <= 100.1 or sum_avg
 -- not exact due to accumulated integer division errors
---sum_avg > 99 and sum_avg <= 100 or sum_avg
+--sum_avg > 99 and sum_avg <= 100.1 or sum_avg
 tbl = nil
 f = fiber.new(function()\
     local fiber_key = fiber.self().id()..'/'..fiber.self().name()\

[-- Attachment #2: Type: text/html, Size: 22357 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup
  2019-11-20  8:29     ` Serge Petrenko
@ 2019-11-20 12:19       ` Alexander Turenko
  0 siblings, 0 replies; 11+ messages in thread
From: Alexander Turenko @ 2019-11-20 12:19 UTC (permalink / raw)
  To: Serge Petrenko; +Cc: tarantool-patches, Vladislav Shpilevoy

On Wed, Nov 20, 2019 at 11:29:26AM +0300, Serge Petrenko wrote:
> Hi! Thanks for your reply!
> --
> Serge Petrenko
> sergepetrenko@tarantool.org
> 
> 
> 
> 
> > 20 нояб. 2019 г., в 4:44, Alexander Turenko <alexander.turenko@tarantool.org> написал(а):
> > 
> > It still shows miscompares for me:
> > 
> > $ cat /proc/cpuinfo | grep processor | wc -l
> > 8
> > $ (cd test && ./test-run.py $(yes fiber.test.lua | head -n 1000))
> > 
> > commit aefc64faaf7bfbb58ac53bf5abea55d1faabcbfe
> > 
> > The miscompare looks so:
> > 
> > [006] Test failed! Result content mismatch:
> > [006] --- app/fiber.result	Tue Nov 19 20:39:21 2019
> > [006] +++ app/fiber.reject	Wed Nov 20 04:33:39 2019
> > [006] @@ -1525,7 +1525,7 @@
> > [006]  -- disable the test above.
> > [006]  sum_avg <= 100 or sum_avg
> > [006]  ---
> > [006] -- true
> > [006] +- 100
> > [006]  ...
> > [006]  -- not exact due to accumulated integer division errors
> > [006]  --sum_avg > 99 and sum_avg <= 100 or sum_avg
> > 
> > Maybe it is some rounding issue.
> > 
> > When I changed the condition to sum_avg <= 101, I got:
> > 
> > [015] Test failed! Result content mismatch:
> > [015] --- app/fiber.result	Wed Nov 20 04:35:14 2019
> > [015] +++ app/fiber.reject	Wed Nov 20 04:35:36 2019
> > [015] @@ -1512,7 +1512,7 @@
> > [015]  ...
> > [015]  sum_inst
> > [015]  ---
> > [015] -- 100
> > [015] +- 45.802853351342
> > [015]  ...
> > [015]  -- when a fiber dies, its impact on the thread moving average
> > [015]  -- persists for a couple of ev loop iterations, but it is no
> > 
> > Sorry, but it does not look okay for me in the sense that it may lead to
> > problems in testing: we already have enough ones.
> > 
> > Let's provide a test that will work stable or describe why it is not
> > possible. In the latter case we can extract an unstable test case and
> > mark it as fragile using suite.ini test-run's option. It will not
> > strictly close the problem, but will lower its probability.
> > 
> > Maybe we can even retry such tests or allows them to fail:
> > https://github.com/tarantool/test-run/issues/189
> > 
> > Anyway, if those fixes descrease probability of fails, I don't mind.
> > Just noted that a further work may be needed.
> 
> Well, yes, they do. Just try to run this test before the patch.
> Anyway, I guess you’re right that we shouldn’t leave flaky tests if
> possible, so I commented the flaky pieces out and added a request to
> reenable them once #4625 (List dead fibers in fiber.top <http://fiber.top/>() output) is
> implemented.
> 
> The diff’s below. Now 1000 out of 1000 tests pass with 16 concurrent
> jobs, at least on my machine.

Verified 79579e399ff08017581ba58895d06bd7da54f67d. There are no more
fails on fiber top related cases. So, I'm okay now (it is not formal
LGTM, because I did not review for the whole patchset, just my personal
feeling).

NB: There are fails that are unrelated to fiber top:

[012] --- app/fiber.result	Wed Nov 20 14:39:10 2019
[012] +++ app/fiber.reject	Wed Nov 20 14:40:57 2019
[012] @@ -675,8 +675,8 @@
[012]  -- attempt to access local storage of dead fiber raises error
[012]  pcall(function(f) return f.storage end, f)
[012]  ---
[012] -- false
[012] -- '[string "return pcall(function(f) return f.storage end..."]:1: the fiber is dead'
[012] +- true
[012] +- key: some value
[012]  ...
[012]  --
[012]  -- Test that local storage is garbage collected when fiber is died

I run 1000 jobs in 16 threads three times and got 5, 8 and 6 fails. It
is not about fiber top, just noted.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup
  2019-11-18 16:05 [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup Serge Petrenko
                   ` (3 preceding siblings ...)
  2019-11-19 22:57 ` [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup Vladislav Shpilevoy
@ 2019-11-21 17:07 ` Kirill Yukhin
  4 siblings, 0 replies; 11+ messages in thread
From: Kirill Yukhin @ 2019-11-21 17:07 UTC (permalink / raw)
  To: Serge Petrenko; +Cc: tarantool-patches, v.shpilevoy

Hello,

On 18 ноя 19:05, Serge Petrenko wrote:
> The first patch factors all the clock-related members of struct fiber and struct
> cord into appropriate structs and adds methods for their updates and resets.
> 
> The second patch fixes exponential moving average calculation so that we do not
> experience huge numbers in average load percentage calculations.
> 
> The third patch alters fiber.top() test to wait for correct output before
> testing it. 
> 
> Follow-up https://github.com/tarantool/tarantool/issues/2694
> Branch https://github.com/tarantool/tarantool/tree/sp/gh-2694-test-fixup

I've checked the patch set into master.

--
Regards, Kirill Yukhin

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2019-11-21 17:07 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-11-18 16:05 [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup Serge Petrenko
2019-11-18 16:05 ` [Tarantool-patches] [PATCH v3 1/3] fiber.top() refactor clock and cpu time calculation Serge Petrenko
2019-11-18 22:25   ` Vladislav Shpilevoy
2019-11-19  7:14     ` Sergey Petrenko
2019-11-18 16:05 ` [Tarantool-patches] [PATCH v3 2/3] fiber.top(): alter exponential moving average calculation Serge Petrenko
2019-11-18 16:05 ` [Tarantool-patches] [PATCH v3 3/3] app/fiber: wait till a full event loop iteration ends Serge Petrenko
2019-11-19 22:57 ` [Tarantool-patches] [PATCH v3 0/3] fiber.top(): minor fixup Vladislav Shpilevoy
2019-11-20  1:44   ` Alexander Turenko
2019-11-20  8:29     ` Serge Petrenko
2019-11-20 12:19       ` Alexander Turenko
2019-11-21 17:07 ` Kirill Yukhin

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox