dovecot-2.2: lib-fts: Implemented "search" parameter to fts-tokenizer-address.

dovecot at dovecot.org dovecot at dovecot.org
Sat May 9 10:17:37 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/60f07e741c57
changeset: 18573:60f07e741c57
user:      Timo Sirainen <tss at iki.fi>
date:      Sat May 09 13:15:09 2015 +0300
description:
lib-fts: Implemented "search" parameter to fts-tokenizer-address.

diffstat:

 src/lib-fts/fts-tokenizer-address.c |  41 ++++++++++++++++++++--------------
 src/lib-fts/test-fts-tokenizer.c    |  43 +++++++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 17 deletions(-)

diffs (131 lines):

diff -r 07597666aa29 -r 60f07e741c57 src/lib-fts/fts-tokenizer-address.c
--- a/src/lib-fts/fts-tokenizer-address.c	Sat May 09 13:01:45 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-address.c	Sat May 09 13:15:09 2015 +0300
@@ -76,17 +76,30 @@
 	return 1;
 }
 
-static int
+static bool
 fts_tokenizer_address_parent_data(struct email_address_fts_tokenizer *tok,
                                   const char **token_r)
 {
-	/* TODO: search option removes address from data here. */
-	if (tok->search && tok->state >= EMAIL_ADDRESS_PARSER_STATE_DOMAIN)
-		i_debug("Would remove current token");
+	if (tok->tokenizer.parent == NULL || str_len(tok->parent_data) == 0)
+		return FALSE;
+
+	if (tok->search && tok->state >= EMAIL_ADDRESS_PARSER_STATE_DOMAIN) {
+		/* we're searching and we want to find only the full
+		   user@domain (not "user" and "domain"). we'll do this by
+		   not feeding the last user@domain to parent tokenizer. */
+		unsigned int parent_prefix_len =
+			str_len(tok->parent_data) - str_len(tok->last_word);
+		i_assert(str_len(tok->parent_data) >= str_len(tok->last_word) &&
+			 strcmp(str_c(tok->parent_data) + parent_prefix_len,
+				str_c(tok->last_word)) == 0);
+		str_truncate(tok->parent_data, parent_prefix_len);
+		if (str_len(tok->parent_data) == 0)
+			return FALSE;
+	}
 
 	*token_r = t_strdup(str_c(tok->parent_data));
 	str_truncate(tok->parent_data, 0);
-	return 1;
+	return TRUE;
 }
 
 /* Used to rewind past characters that can not be the start of a new localpart.
@@ -204,8 +217,8 @@
 	/* end of data, output lingering tokens. first the parents data, then
 	   possibly our token, if complete enough */
 	if (size == 0) {
-		if (tok->tokenizer.parent != NULL && str_len(tok->parent_data) > 0)
-			return fts_tokenizer_address_parent_data(tok, token_r);
+		if (fts_tokenizer_address_parent_data(tok, token_r))
+			return 1;
 
 		if (tok->state == EMAIL_ADDRESS_PARSER_STATE_DOMAIN &&
 		    !domain_is_empty(tok))
@@ -254,16 +267,10 @@
 
 			break;
 		case EMAIL_ADDRESS_PARSER_STATE_COMPLETE:
-			/* skip tailing non-atext */
-			local_skip = skip_nonlocal_part(data+pos, size - pos);
-			*skip_r = pos + local_skip;
-			fts_tokenizer_address_update_parent(tok, data+pos,
-			                                    local_skip);
-			if (tok->tokenizer.parent != NULL)
-				return fts_tokenizer_address_parent_data(tok, token_r);
-			else {
-				return fts_tokenizer_address_current_token(tok, token_r);
-			}
+			*skip_r = pos;
+			if (fts_tokenizer_address_parent_data(tok, token_r))
+				return 1;
+			return fts_tokenizer_address_current_token(tok, token_r);
 		default:
 			i_unreached();
 		}
diff -r 07597666aa29 -r 60f07e741c57 src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c	Sat May 09 13:01:45 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c	Sat May 09 13:15:09 2015 +0300
@@ -518,6 +518,48 @@
 	test_end();
 }
 
+static void test_fts_tokenizer_address_search(void)
+{
+	static const unsigned char input[] =
+		"@invalid invalid@ abc@example.com, "
+		"Bar Baz <bar@example.org>, "
+		"foo@domain";
+	static const char *const expected_output[] = {
+		"invalid", "invalid", "abc@example.com", "Bar", "Baz",
+		"bar@example.org", "foo@domain", NULL
+	};
+	static const char *const settings[] = { "search", "" };
+	struct fts_tokenizer *tok, *gen_tok;
+	const char * const *eopp = expected_output;
+	const char *token, *error;
+	unsigned int i;
+	int ret;
+
+	test_begin("fts tokenizer search email address + parent, input one character at a time");
+	fts_tokenizers_init();
+
+	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
+	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, settings, &tok, &error) == 0);
+
+	for (i = 0; i <= sizeof(input)-1; ) {
+		ret = i < sizeof(input)-1 ?
+			fts_tokenizer_next(tok, &input[i], 1, &token) :
+			fts_tokenizer_next(tok, NULL, 0, &token);
+		if (ret == 0) {
+			i++;
+			continue;
+		}
+		test_assert(*eopp != NULL);
+		test_assert(null_strcmp(token, *eopp) == 0);
+		eopp++;
+	}
+	test_assert(*eopp == NULL);
+	fts_tokenizer_unref(&tok);
+	fts_tokenizer_unref(&gen_tok);
+	fts_tokenizers_deinit();
+	test_end();
+}
+
 int main(void)
 {
 	static void (*test_functions[])(void) = {
@@ -534,6 +576,7 @@
 		test_fts_tokenizer_address_char,
 		test_fts_tokenizer_address_line,
 		test_fts_tokenizer_address_rand,
+		test_fts_tokenizer_address_search,
 		NULL
 	};
 


More information about the dovecot-cvs mailing list