dovecot-2.2: fts: Create tokenizers differently

dovecot at dovecot.org dovecot at dovecot.org
Sat May 9 08:32:07 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/ae0458c63761
changeset: 18549:ae0458c63761
user:      Teemu Huovila <teemu.huovila at dovecot.fi>
date:      Sat May 09 11:02:22 2015 +0300
description:
fts: Create tokenizers differently

Create tokenizers earlier. Create separate tokenizers for search
and indexing. Enable configuration of tokenizers. Add some helpers
to the fts-tokenizer.h API. Update the tokenizer unit tests to match
these changes.

lib-fts: Refactor lib-fts settings a bit

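Turn the address tokenizer settings into "boolean" keys: mentioning
the key enables the option and its value is ignored. Rename
"have_parent" to "no_parent" (the meaning is inverted; parent data is
still returned by default) and add a "search" setting. Document the
settings in fts-tokenizer.h. Change unit tests accordingly.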

diffstat:

 src/lib-fts/fts-tokenizer-address.c |   35 ++++++-----
 src/lib-fts/fts-tokenizer.c         |   19 ++++++
 src/lib-fts/fts-tokenizer.h         |   32 ++++++++--
 src/lib-fts/test-fts-tokenizer.c    |   22 +++----
 src/plugins/fts/fts-api-private.h   |    2 -
 src/plugins/fts/fts-build-mail.c    |    6 +-
 src/plugins/fts/fts-plugin.c        |    3 +
 src/plugins/fts/fts-search-args.c   |    7 +-
 src/plugins/fts/fts-storage.c       |   14 ----
 src/plugins/fts/fts-user.c          |  106 ++++++++++++++++++++++++++++++++++-
 src/plugins/fts/fts-user.h          |    3 +-
 11 files changed, 188 insertions(+), 61 deletions(-)

diffs (truncated from 599 to 300 lines):

diff -r abbd71252175 -r ae0458c63761 src/lib-fts/fts-tokenizer-address.c
--- a/src/lib-fts/fts-tokenizer-address.c	Sat May 09 10:53:25 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-address.c	Sat May 09 11:02:22 2015 +0300
@@ -5,8 +5,8 @@
 #include "buffer.h"
 #include "fts-tokenizer-private.h"
 
-/* Return not only our tokens, but also data for parent to process.*/
-#define FTS_DEFAULT_HAVE_PARENT 1
+#define FTS_DEFAULT_NO_PARENT FALSE
+#define FTS_DEFAULT_SEARCH FALSE
 
 enum email_address_parser_state {
 	EMAIL_ADDRESS_PARSER_STATE_NONE = 0,
@@ -21,8 +21,8 @@
 	string_t *last_word;
 	string_t *parent_data; /* Copy of input data between tokens.
 	                          TODO: could be buffer_t maybe */
-	unsigned int have_parent; /* Setting for stand-alone usage.
-	                             Might be superfluous. */
+	bool no_parent;
+	bool search;
 };
 
 /*
@@ -85,18 +85,17 @@
 				   const char **error_r)
 {
 	struct email_address_fts_tokenizer *tok;
-	unsigned int have_parent = FTS_DEFAULT_HAVE_PARENT;
+	bool no_parent = FTS_DEFAULT_NO_PARENT;
+	bool search = FTS_DEFAULT_SEARCH;
 	unsigned int i;
 
 	for (i = 0; settings[i] != NULL; i += 2) {
-		const char *key = settings[i], *value = settings[i+1];
+		const char *key = settings[i];
 
-		if (strcmp(key, "have_parent") == 0) {
-			if (str_to_uint(value, &have_parent) < 0 ) {
-				*error_r = t_strdup_printf(
-					"Invalid parent setting: %s", value);
-				return -1;
-			}
+		if (strcmp(key, "no_parent") == 0) {
+			no_parent = TRUE;
+		}else if (strcmp(key, "search") == 0) {
+			search = TRUE;
 		} else {
 			*error_r = t_strdup_printf("Unknown setting: %s", key);
 			return -1;
@@ -107,7 +106,8 @@
 	tok->tokenizer = *fts_tokenizer_email_address;
 	tok->last_word = str_new(default_pool, 128);
 	tok->parent_data = str_new(default_pool, 128);
-	tok->have_parent = have_parent;
+	tok->no_parent = no_parent;
+	tok->search = search;
 	*tokenizer_r = &tok->tokenizer;
 	return 0;
 }
@@ -134,6 +134,9 @@
 fts_tokenizer_address_parent_data(struct email_address_fts_tokenizer *tok)
 {
 	const char *ret;
+	/* TODO: search option removes address from data here. */
+	if (tok->search && tok->state >= EMAIL_ADDRESS_PARSER_STATE_DOMAIN)
+		i_debug("Would remove current token");
 
 	ret = t_strdup(str_c(tok->parent_data));
 	str_truncate(tok->parent_data, 0);
@@ -250,7 +253,7 @@
 fts_tokenizer_address_update_parent(struct email_address_fts_tokenizer *tok,
                                     const unsigned char *data, size_t size)
 {
-	if (tok->have_parent > 0)
+	if (!tok->no_parent)
 		str_append_n(tok->parent_data, data, size);
 }
 static const char *
@@ -273,7 +276,7 @@
 	/* end of data, output lingering tokens. first the parents data, then
 	   possibly our token, if complete enough */
 	if (size == 0) {
-		if (tok->have_parent > 0 && str_len(tok->parent_data) > 0)
+		if (!tok->no_parent && str_len(tok->parent_data) > 0)
 		    return fts_tokenizer_address_parent_data(tok);
 
 		if (tok->state == EMAIL_ADDRESS_PARSER_STATE_DOMAIN
@@ -328,7 +331,7 @@
 			*skip_r = pos + local_skip;
 			fts_tokenizer_address_update_parent(tok, data+pos,
 			                                    local_skip);
-			if (tok->have_parent > 0)
+			if (!tok->no_parent)
 				return fts_tokenizer_address_parent_data(tok);
 			else {
 				return fts_tokenizer_address_current_token(tok);
diff -r abbd71252175 -r ae0458c63761 src/lib-fts/fts-tokenizer.c
--- a/src/lib-fts/fts-tokenizer.c	Sat May 09 10:53:25 2015 +0300
+++ b/src/lib-fts/fts-tokenizer.c	Sat May 09 11:02:22 2015 +0300
@@ -10,6 +10,20 @@
 
 ARRAY(struct fts_tokenizer) fts_tokenizer_classes;
 
+void fts_tokenizers_init(void)
+{
+	if (!array_is_created(&fts_tokenizer_classes)) {
+		fts_tokenizer_register(fts_tokenizer_generic);
+		fts_tokenizer_register(fts_tokenizer_email_address);
+	}
+}
+
+void fts_tokenizers_deinit(void)
+{
+	if (array_is_created(&fts_tokenizer_classes))
+		array_free(&fts_tokenizer_classes);
+}
+
 /* private */
 void fts_tokenizer_register(const struct fts_tokenizer *tok_class)
 {
@@ -47,6 +61,11 @@
 	return NULL;
 }
 
+const char *fts_tokenizer_name(const struct fts_tokenizer *tok)
+{
+	return tok->name;
+}
+
 int fts_tokenizer_create(const struct fts_tokenizer *tok_class,
 			 struct fts_tokenizer *parent,
 			 const char *const *settings,
diff -r abbd71252175 -r ae0458c63761 src/lib-fts/fts-tokenizer.h
--- a/src/lib-fts/fts-tokenizer.h	Sat May 09 10:53:25 2015 +0300
+++ b/src/lib-fts/fts-tokenizer.h	Sat May 09 11:02:22 2015 +0300
@@ -3,7 +3,9 @@
 
 /*
  Settings are given in the form of a const char * const *settings =
- {"key, "value", "key2", "value2", NULL} array of string pairs.
+ {"key, "value", "key2", "value2", NULL} array of string pairs. Some
+ keys, like "no_parent" and "search" are a sort of boolean and the
+ value does not matter, just mentioning the key enables the functionality.
  The array has to be NULL terminated.
 */
 /* Email address header tokenizer that returns "user at domain.org" input as
@@ -13,15 +15,21 @@
    allows doing an explicit "user at domain" search, which returns only mails
    matching that exact address (instead of e.g. a mail with both user at domain2
    and user2 at domain words). */
-/* Settings: "have_parent", Return not only our tokens, but also data
-   for parent to process. Defaults to 1. Should normally not need to
-   be changed. */
+/* Settings:
+   "no_parent", Return only our tokens, no data for parent to process.
+   Defaults to disabled. Should normally not be needed.
+
+   "search" Remove addresses from parent data stream, so they are not processed
+   further. Defaults to disabled. Enable by defining the keyword (and any
+   value). */
 extern const struct fts_tokenizer *fts_tokenizer_email_address;
 #define FTS_TOKENIZER_EMAIL_ADDRESS_NAME "email-address"
 
 /* Generic email content tokenizer. Cuts text into tokens. */
-/* Settings: "maxlen" Maximum length of token, before an arbitary cut
-   off is made. Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH.
+/* Settings: 
+   "maxlen" Maximum length of token, before an arbitary cut off is made.
+   Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH.
+
    "algorithm", accepted values are "simple" or "tr29". Defines the
    method for looking for word boundaries. Simple is faster and will
    work for many texts, especially those using latin alphabets, but
@@ -35,9 +43,18 @@
 extern const struct fts_tokenizer *fts_tokenizer_generic;
 #define FTS_TOKENIZER_GENERIC_NAME "generic"
 
+/*
+ Tokenizing workflow, find --> create --> filter --> destroy.
+ Do init before first use and deinit after all done.
+ */
+
+/* Register all built-in tokenizers. */
+void fts_tokenizers_init(void);
+void fts_tokenizers_deinit(void);
+
 const struct fts_tokenizer *fts_tokenizer_find(const char *name);
 
-/* Create a new tokenizer. The settings is an array of key,value pairs. */
+/* Create a new tokenizer. The settings are described above. */
 int fts_tokenizer_create(const struct fts_tokenizer *tok_class,
 			 struct fts_tokenizer *parent,
 			 const char *const *settings,
@@ -57,4 +74,5 @@
 fts_tokenizer_next(struct fts_tokenizer *tok,
 		   const unsigned char *data, size_t size);
 
+const char *fts_tokenizer_name(const struct fts_tokenizer *tok);
 #endif
diff -r abbd71252175 -r ae0458c63761 src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c	Sat May 09 10:53:25 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c	Sat May 09 11:02:22 2015 +0300
@@ -25,7 +25,7 @@
 	const char *token, *error;
 
 	test_begin("fts tokenizer generic simple");
-	fts_tokenizer_register(fts_tokenizer_generic);
+	fts_tokenizers_init();
 	tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
 	test_assert(fts_tokenizer_create(tok_class, NULL, NULL, &tok, &error) == 0);
 	while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) {
@@ -38,7 +38,7 @@
 	}
 	test_assert(*eopp == NULL);
 	fts_tokenizer_unref(&tok);
-	fts_tokenizer_unregister(fts_tokenizer_generic);
+	fts_tokenizers_deinit();
 	test_end();
 }
 
@@ -267,7 +267,7 @@
 		"abc at example.com", "bar at example.org",
 		"foo at domain", "foo at domain", "bar at example.org", NULL
 	};
-	const char *const settings[] = {"have_parent", "0", NULL};
+	const char *const settings[] = {"no_parent", "foo", NULL};
 	struct fts_tokenizer *tok;
 	const char * const *eopp = expected_output;
 	const char *token, *error;
@@ -305,7 +305,7 @@
 		"abc at example.com", "bar at example.org",
 		"foo at domain", NULL
 	};
-	const char *const settings[] = {"have_parent", "0", NULL};
+	const char *const settings[] = {"no_parent", "0", NULL};
 	struct fts_tokenizer *tok;
 	const char * const *eopp = expected_output;
 	const char *token, *error;
@@ -346,7 +346,7 @@
 	struct fts_tokenizer *tok;
 	const char * const *eopp = expected_output;
 	const char *token, *error;
-	const char *const settings[] = {"have_parent", "0", NULL};
+	const char *const settings[] = {"no_parent", "abc", NULL};
 	unsigned int i, step, step_max = 10;
 
 	test_begin("fts tokenizer email address, input random length");
@@ -390,8 +390,7 @@
 	unsigned int i;
 
 	test_begin("fts tokenizer email address + parent, input one character at a time");
-	fts_tokenizer_register(fts_tokenizer_generic);
-	fts_tokenizer_register(fts_tokenizer_email_address);
+	fts_tokenizers_init();
 
 	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
 	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
@@ -411,8 +410,7 @@
 	test_assert(*eopp == NULL);
 	fts_tokenizer_unref(&tok);
 	fts_tokenizer_unref(&gen_tok);
-	fts_tokenizer_unregister(fts_tokenizer_generic);
-	fts_tokenizer_unregister(fts_tokenizer_email_address);
+	fts_tokenizers_deinit();
 	test_end();
 }
 
@@ -437,8 +435,7 @@
 	unsigned int i;
 
 	test_begin("fts tokenizer email address + parent, input one line at a time");
-	fts_tokenizer_register(fts_tokenizer_generic);
-	fts_tokenizer_register(fts_tokenizer_email_address);
+	fts_tokenizers_init();
 
 	test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
 	test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
@@ -457,8 +454,7 @@
 	test_assert(*eopp == NULL);
 	fts_tokenizer_unref(&tok);
 	fts_tokenizer_unref(&gen_tok);
-	fts_tokenizer_unregister(fts_tokenizer_generic);
-	fts_tokenizer_unregister(fts_tokenizer_email_address);
+	fts_tokenizers_deinit();
 	test_end();
 
 }
diff -r abbd71252175 -r ae0458c63761 src/plugins/fts/fts-api-private.h
--- a/src/plugins/fts/fts-api-private.h	Sat May 09 10:53:25 2015 +0300
+++ b/src/plugins/fts/fts-api-private.h	Sat May 09 11:02:22 2015 +0300
@@ -76,8 +76,6 @@
 	struct fts_backend_vfuncs v;
 	struct mail_namespace *ns;
 
-	struct fts_tokenizer *tokenizer;
-
 	unsigned int updating:1;
 };


More information about the dovecot-cvs mailing list