dovecot-2.2: lib: timings - added quantiles

dovecot at dovecot.org dovecot at dovecot.org
Mon Sep 21 16:54:54 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/2f492fac75b7
changeset: 19173:2f492fac75b7
user:      Phil Carmody <phil at dovecot.fi>
date:      Mon Sep 21 19:52:32 2015 +0300
description:
lib: timings - added quantiles
Just sub-sample the stream. On the assumption that the samples come from
one distribution, then any randomly selected subset will share the same
distribution. Therefore the quantiles should be at approximately the same
value.

However, that's a big assumption, as there will almost certainly be time
dependency, and periodicity (24hrs, 7 days).

Signed-off-by: Phil Carmody <phil at dovecot.fi>

diffstat:

 src/lib/test-timing.c |  12 ++++++-
 src/lib/timing.c      |  78 +++++++++++++++++++++++++++++++++++++++++++-------
 src/lib/timing.h      |   4 ++
 3 files changed, 80 insertions(+), 14 deletions(-)

diffs (151 lines):

diff -r 1a5a45c80687 -r 2f492fac75b7 src/lib/test-timing.c
--- a/src/lib/test-timing.c	Mon Sep 21 19:51:05 2015 +0300
+++ b/src/lib/test-timing.c	Mon Sep 21 19:52:32 2015 +0300
@@ -2,7 +2,7 @@
 
 #include "test-lib.h"
 #include "timing.h"
-
+#include "sort.h"
 #include <stdlib.h>
 
 static void
@@ -29,7 +29,15 @@
 	test_assert_idx(timing_get_count(t) == input_size, input_size);
 	test_assert_idx(timing_get_min(t)  == min, input_size);
 	test_assert_idx(timing_get_max(t) == max, input_size);
-	test_assert_idx(timing_get_avg(t) == sum/input_size, input_size);
+	test_assert_idx(timing_get_avg(t) == (sum + input_size/2)/input_size, input_size);
+
+	/* these aren't always fully accurate: */
+	test_assert_idx(timing_get_median(t) >= copy[(input_size-1)/2] &&
+			timing_get_median(t) <= copy[input_size/2],
+			input_size);
+	/* when we have 20 elements, [19] is the max, not the 95th %ile, so subtract 1 */
+	test_assert_idx(timing_get_95th(t) == copy[input_size*95/100 - !(input_size%20)],
+			input_size);
 
 	i_free(copy);
 }
diff -r 1a5a45c80687 -r 2f492fac75b7 src/lib/timing.c
--- a/src/lib/timing.c	Mon Sep 21 19:51:05 2015 +0300
+++ b/src/lib/timing.c	Mon Sep 21 19:52:32 2015 +0300
@@ -1,14 +1,21 @@
 /* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
 
 #include "lib.h"
-#include "bsearch-insert-pos.h"
 #include "timing.h"
+#include "sort.h"
+#include <stdlib.h>
 
-#define TIMING_MAX_BUCKET_COUNT 20
+/* In order to have a vaguely accurate 95th percentile, you need way
+   more than 20 in your subsample. */
+#define TIMING_SUBSAMPLING_BUFFER (20*24) /* 20*24 fits in a page */
 
 struct timing {
 	unsigned int count;
-	uint64_t min, max, sum;
+	bool     sorted;
+	uint64_t min;
+	uint64_t samples[TIMING_SUBSAMPLING_BUFFER];
+	uint64_t max;
+	uint64_t sum;
 };
 
 struct timing *timing_init(void)
@@ -21,18 +28,30 @@
 	i_free_and_null(*_timing);
 }
 
-
 void timing_add_usecs(struct timing *timing, uint64_t usecs)
 {
-	if (timing->count++ == 0) {
-		timing->min = timing->max = timing->sum = usecs;
+	if (timing->count < TIMING_SUBSAMPLING_BUFFER) {
+		timing->samples[timing->count] = usecs;
+		if (timing->count == 0)
+			timing->min = timing->max = usecs;
 	} else {
-		if (timing->min > usecs)
-			timing->min = usecs;
-		if (timing->max < usecs)
-			timing->max = usecs;
-		timing->sum += usecs;
+		unsigned int count = timing->count;
+		unsigned int idx;
+		if (count > RAND_MAX >> 6)
+			idx = (rand()*((uint64_t)RAND_MAX+1) + rand()) % count;
+		else
+			idx = rand() % count;
+		if (idx < TIMING_SUBSAMPLING_BUFFER)
+			timing->samples[idx] = usecs;
 	}
+
+	timing->count++;
+	timing->sum += usecs;
+	if (timing->max < usecs)
+		timing->max = usecs;
+	if (timing->min > usecs)
+		timing->min = usecs;
+	timing->sorted = FALSE;
 }
 
 unsigned int timing_get_count(const struct timing *timing)
@@ -55,5 +74,40 @@
 	if (timing->count == 0)
 		return 0;
 
-	return timing->sum / timing->count;
+	return (timing->sum + timing->count/2) / timing->count;
 }
+
+static void timing_ensure_sorted(struct timing *timing)
+{
+	if (timing->sorted)
+		return;
+	i_qsort(timing->samples, timing->count, sizeof(*timing->samples),
+		uint64_cmp);
+	timing->sorted = TRUE;
+}
+
+uint64_t timing_get_median(const struct timing *timing)
+{
+	if (timing->count == 0)
+		return 0;
+	/* cast-away const - reading requires sorting */
+	timing_ensure_sorted((struct timing *)timing);
+	unsigned int count = (timing->count < TIMING_SUBSAMPLING_BUFFER)
+		? timing->count
+		: TIMING_SUBSAMPLING_BUFFER;
+	unsigned int idx1 = (count-1)/2, idx2 = count/2;
+	return (timing->samples[idx1] + timing->samples[idx2]) / 2;
+}
+
+uint64_t timing_get_95th(const struct timing *timing)
+{
+	if (timing->count == 0)
+		return 0;
+	/* cast-away const - reading requires sorting */
+	timing_ensure_sorted((struct timing *)timing);
+	unsigned int count = (timing->count < TIMING_SUBSAMPLING_BUFFER)
+		? timing->count
+		: TIMING_SUBSAMPLING_BUFFER;
+	unsigned int idx = count - count/20 - 1;
+	return timing->samples[idx];
+}
diff -r 1a5a45c80687 -r 2f492fac75b7 src/lib/timing.h
--- a/src/lib/timing.h	Mon Sep 21 19:51:05 2015 +0300
+++ b/src/lib/timing.h	Mon Sep 21 19:52:32 2015 +0300
@@ -16,5 +16,9 @@
 uint64_t timing_get_max(const struct timing *timing);
 /* Returns events' average. */
 uint64_t timing_get_avg(const struct timing *timing);
+/* Returns events' approximate (through random subsampling) median. */
+uint64_t timing_get_median(const struct timing *timing);
+/* Returns events' approximate (through random subsampling) 95th percentile. */
+uint64_t timing_get_95th(const struct timing *timing);
 
 #endif


More information about the dovecot-cvs mailing list