dovecot-2.2: lib-fts: Various improvements to test-fts-tokenizer
dovecot at dovecot.org
dovecot at dovecot.org
Sat May 9 15:30:04 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/e4b62ba0fb5a
changeset: 18603:e4b62ba0fb5a
user: Timo Sirainen <tss at iki.fi>
date: Sat May 09 18:28:04 2015 +0300
description:
lib-fts: Various improvements to test-fts-tokenizer
diffstat:
src/lib-fts/test-fts-tokenizer.c | 565 +++++++++-----------------------------
1 files changed, 134 insertions(+), 431 deletions(-)
diffs (truncated from 725 to 300 lines):
diff -r 7542e3be6721 -r e4b62ba0fb5a src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c Sat May 09 18:00:58 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c Sat May 09 18:28:04 2015 +0300
@@ -1,16 +1,30 @@
/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
#include "lib.h"
-#include "sha2.h"
-#include "hex-binary.h"
+#include "unichar.h"
#include "test-common.h"
#include "fts-tokenizer.h"
#include "fts-tokenizer-private.h"
-/* TODO: fix including and linking of this. */
-/* #include "fts-tokenizer-generic-private.h" */
+#include "fts-tokenizer-generic-private.h"
#include <stdlib.h>
+#define TEST_INPUT_TEXT \
+ "hello world\r\n\nAnd there\twas: text galore, " \
+ "abc@example.com, " \
+ "Bar Baz <bar@example.org>, " \
+ "foo@domain " \
+ "1234567890123456789012345678ä," \
+ "12345678901234567890123456789ä," \
+ "123456789012345678901234567890ä," \
+ "and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n " \
+ "(\"Hello world\")3.14 3,14 last"
+#define TEST_INPUT_ADDRESS \
+ "@invalid invalid@ Abc Dfg <abc.dfg@example.com>, " \
+ "Bar Baz <bar@example.org>" \
+ "Foo Bar (comment)foo.bar@host.example.org " \
+ "foo, foo@domain"
+
static void test_fts_tokenizer_find(void)
{
test_begin("fts tokenizer find");
@@ -19,34 +33,79 @@
test_end();
}
+static void
+test_tokenizer_inputoutput(struct fts_tokenizer *tok, const char *_input,
+ const char *const *expected_output)
+{
+ const unsigned char *input = (const unsigned char *)_input;
+ const char *token;
+ unsigned int i, max, outi, char_len, input_len = strlen(_input);
+
+ /* test all input at once */
+ outi = 0;
+ while (fts_tokenizer_next(tok, input, input_len, &token) > 0) {
+ test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+ outi++;
+ }
+ while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
+ test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+ outi++;
+ }
+ test_assert(expected_output[outi] == NULL);
+
+ /* test input one byte at a time */
+ for (i = outi = 0; i < input_len; i += char_len) {
+ char_len = uni_utf8_char_bytes(input[i]);
+ while (fts_tokenizer_next(tok, input+i, char_len, &token) > 0) {
+ test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+ outi++;
+ }
+ }
+ while (fts_tokenizer_final(tok, &token) > 0) {
+ test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+ outi++;
+ }
+ test_assert(expected_output[outi] == NULL);
+
+ /* test input in random chunks */
+ for (i = outi = 0; i < input_len; i += char_len) {
+ max = rand() % (input_len - i) + 1;
+ for (char_len = 0; char_len < max; )
+ char_len += uni_utf8_char_bytes(input[i+char_len]);
+ while (fts_tokenizer_next(tok, input+i, char_len, &token) > 0) {
+ test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+ outi++;
+ }
+ }
+ while (fts_tokenizer_final(tok, &token) > 0) {
+ test_assert_idx(strcmp(token, expected_output[outi]) == 0, outi);
+ outi++;
+ }
+ test_assert(expected_output[outi] == NULL);
+}
+
static void test_fts_tokenizer_generic_only(void)
{
- static const unsigned char input[] =
- "hello world\r\nAnd there\twas: text "
- "galore, and longlonglongabcdefghijklmnopqrstuvwxyz more.\n\n (\"Hello world\")last ";
+ static const char input[] = TEST_INPUT_TEXT;
static const char *const expected_output[] = {
"hello", "world", "And",
"there", "was", "text", "galore",
+ "abc", "example", "com", "Bar", "Baz",
+ "bar", "example", "org", "foo", "domain",
+ "1234567890123456789012345678ä",
+ "12345678901234567890123456789",
+ "123456789012345678901234567890",
"and", "longlonglongabcdefghijklmnopqr",
- "more", "Hello", "world", "last", NULL
+ "more", "Hello", "world", "3", "14", "3", "14", "last", NULL
};
struct fts_tokenizer *tok;
- const char * const *eopp = expected_output;
- const char *token, *error;
+ const char *error;
test_begin("fts tokenizer generic simple");
test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
-/*TODO: Uncomment when fts-tokenizer-generic-private.h inclusion is fixed */
-/*test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);*/
- while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
- test_assert(strcmp(token, *eopp) == 0);
- eopp++;
- }
- while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
- test_assert(strcmp(token, *eopp) == 0);
- eopp++;
- }
- test_assert(*eopp == NULL);
+ test_assert(((struct generic_fts_tokenizer *) tok)->algorithm == BOUNDARY_ALGORITHM_SIMPLE);
+
+ test_tokenizer_inputoutput(tok, input, expected_output);
fts_tokenizer_unref(&tok);
test_end();
}
@@ -55,7 +114,7 @@
{
/* with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
U+205A(e2 81 9a) and U+205F(e2 81 9f )*/
- static const unsigned char input[] =
+ static const char input[] =
"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
static const char *const expected_output[] = {
@@ -64,61 +123,12 @@
"and", "more", NULL
};
struct fts_tokenizer *tok;
- const char * const *eopp = expected_output;
- const char *token, *error;
+ const char *error;
test_begin("fts tokenizer generic simple with Unicode whitespace");
- fts_tokenizer_register(fts_tokenizer_generic);
test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
- while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
- test_assert(strcmp(token, *eopp) == 0);
- eopp++;
- }
- while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
- test_assert(strcmp(token, *eopp) == 0);
- eopp++;
- }
- test_assert(*eopp == NULL);
+ test_tokenizer_inputoutput(tok, input, expected_output);
fts_tokenizer_unref(&tok);
- fts_tokenizer_unregister(fts_tokenizer_generic);
- test_end();
-}
-
-static void test_fts_tokenizer_char_generic_only(void)
-{
- static const unsigned char input[] =
- "abc@example.com, "
- "Bar Baz <bar@example.org>, "
- "foo@domain";
- static const char *const expected_output[] = {
- "abc", "example", "com", "Bar", "Baz",
- "bar", "example", "org", "foo", "domain", NULL
- };
- struct fts_tokenizer *tok;
- const char * const *eopp = expected_output;
- const char *token, *error;
- unsigned int i;
- int ret;
-
- test_begin("fts tokenizer generic simple input one character at a time");
- fts_tokenizer_register(fts_tokenizer_generic);
-
- test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &tok, &error) == 0);
-
- for (i = 0; i <= sizeof(input)-1; ) {
- ret = i < sizeof(input)-1 ?
- fts_tokenizer_next(tok, &input[i], 1, &token) :
- fts_tokenizer_next(tok, NULL, 0, &token);
- if (ret == 0) {
- i++;
- continue;
- }
- test_assert(null_strcmp(token, *eopp) == 0);
- eopp++;
- }
- test_assert(*eopp == NULL);
- fts_tokenizer_unref(&tok);
- fts_tokenizer_unregister(fts_tokenizer_generic);
test_end();
}
@@ -126,34 +136,25 @@
static void test_fts_tokenizer_generic_tr29_only(void)
{
- static const unsigned char input[] =
- "hello world\r\n\nAnd there\twas: text "
- "galore, and more.\n\n (\"Hello world\")3.14 3,14 last"
- " longlonglongabcdefghijklmnopqrstuvwxyz 1.";
+ static const char input[] = TEST_INPUT_TEXT;
static const char *const expected_output[] = {
"hello", "world", "And",
"there", "was", "text", "galore",
- "and", "more", "Hello", "world", "3.14",
- "3,14", "last", "longlonglongabcdefghijklmnopqr", "1", NULL
+ "abc", "example.com", "Bar", "Baz",
+ "bar", "example.org", "foo", "domain",
+ "1234567890123456789012345678ä",
+ "12345678901234567890123456789",
+ "123456789012345678901234567890",
+ "and", "longlonglongabcdefghijklmnopqr",
+ "more", "Hello", "world", "3.14", "3,14", "last", NULL
};
struct fts_tokenizer *tok;
- const char * const *eopp = expected_output;
- const char *token, *error;
+ const char *error;
test_begin("fts tokenizer generic TR29");
- fts_tokenizer_register(fts_tokenizer_generic);
test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
- while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
- test_assert(strcmp(token, *eopp) == 0);
- eopp++;
- }
- while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
- test_assert(strcmp(token, *eopp) == 0);
- eopp++;
- }
- test_assert(*eopp == NULL);
+ test_tokenizer_inputoutput(tok, input, expected_output);
fts_tokenizer_unref(&tok);
- fts_tokenizer_unregister(fts_tokenizer_generic);
test_end();
}
@@ -163,7 +164,7 @@
{
/* with Unicode(utf8) U+2000(e2 80 80) and U+205A(e2 81 9a) and U+205F(e2
81 9f)*/
- static const unsigned char input[] =
+ static const char input[] =
"hello world\r\nAnd\xE2\x80\x80there\twas: text "
"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n";
static const char *const expected_output[] = {
@@ -172,404 +173,112 @@
"and", "more", NULL
};
struct fts_tokenizer *tok;
- const char * const *eopp = expected_output;
- const char *token, *error;
+ const char *error;
test_begin("fts tokenizer generic TR29 with Unicode whitespace");
- fts_tokenizer_register(fts_tokenizer_generic);
test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
- while (fts_tokenizer_next(tok, input, sizeof(input)-1, &token) > 0) {
- test_assert(strcmp(token, *eopp) == 0);
- eopp++;
- }
- while (fts_tokenizer_next(tok, NULL, 0, &token) > 0) {
- test_assert(strcmp(token, *eopp) == 0);
- eopp++;
- }
- test_assert(*eopp == NULL);
+ test_tokenizer_inputoutput(tok, input, expected_output);
fts_tokenizer_unref(&tok);
- fts_tokenizer_unregister(fts_tokenizer_generic);
test_end();
}
static void test_fts_tokenizer_generic_tr29_midnumlet_end(void)
{
/* u+FF0E is EF BC 8E */
- static const unsigned char input[] =
+ static const char input[] =
"hello world\xEF\xBC\x8E";
static const char *const expected_output[] = {
"hello", "world", NULL
};
struct fts_tokenizer *tok;
- const char * const *eopp = expected_output;
- const char *token, *error;
More information about the dovecot-cvs
mailing list