dovecot-2.2: fts: Added FTS_BACKEND_FLAG_TOKENIZED_INPUT, which is implemented via lib-fts.

dovecot at dovecot.org dovecot at dovecot.org
Mon Apr 20 13:24:08 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/914bdca67d1f
changeset: 18415:914bdca67d1f
user:      Timo Sirainen <tss at iki.fi>
date:      Mon Apr 20 16:22:36 2015 +0300
description:
fts: Added FTS_BACKEND_FLAG_TOKENIZED_INPUT, which is implemented via lib-fts.

diffstat:

 src/plugins/fts/Makefile.am       |   11 +-
 src/plugins/fts/fts-api-private.h |    9 +-
 src/plugins/fts/fts-build-mail.c  |  148 ++++++++++++++++++++++++-
 src/plugins/fts/fts-plugin.c      |    5 +
 src/plugins/fts/fts-search-args.c |  168 +++++++++++++++++++++++++++++
 src/plugins/fts/fts-search-args.h |    7 +
 src/plugins/fts/fts-storage.c     |   40 +++++-
 src/plugins/fts/fts-user.c        |  218 ++++++++++++++++++++++++++++++++++++++
 src/plugins/fts/fts-user.h        |   22 +++
 9 files changed, 612 insertions(+), 16 deletions(-)

diffs (truncated from 834 to 300 lines):

diff -r 81e5b977e5c5 -r 914bdca67d1f src/plugins/fts/Makefile.am
--- a/src/plugins/fts/Makefile.am	Mon Apr 20 16:19:07 2015 +0300
+++ b/src/plugins/fts/Makefile.am	Mon Apr 20 16:22:36 2015 +0300
@@ -4,6 +4,7 @@
 AM_CPPFLAGS = \
 	-I$(top_srcdir)/src/lib \
 	-I$(top_srcdir)/src/lib-settings \
+	-I$(top_srcdir)/src/lib-fts \
 	-I$(top_srcdir)/src/lib-http \
 	-I$(top_srcdir)/src/lib-mail \
 	-I$(top_srcdir)/src/lib-index \
@@ -18,6 +19,8 @@
 module_LTLIBRARIES = \
 	lib20_fts_plugin.la
 
+lib20_fts_plugin_la_LIBADD = ../../lib-fts/libfts.la
+
 lib20_fts_plugin_la_SOURCES = \
 	fts-api.c \
 	fts-build-mail.c \
@@ -29,8 +32,10 @@
 	fts-parser-tika.c \
 	fts-plugin.c \
 	fts-search.c \
+	fts-search-args.c \
 	fts-search-serialize.c \
-	fts-storage.c
+	fts-storage.c \
+	fts-user.c
 
 pkginc_libdir=$(pkgincludedir)
 pkginc_lib_HEADERS = \
@@ -44,8 +49,10 @@
 	doveadm-fts.h \
 	fts-build-mail.h \
 	fts-plugin.h \
+	fts-search-args.h \
 	fts-search-serialize.h \
-	fts-storage.h
+	fts-storage.h \
+	fts-user.h
 
 pkglibexec_PROGRAMS = xml2text
 
diff -r 81e5b977e5c5 -r 914bdca67d1f src/plugins/fts/fts-api-private.h
--- a/src/plugins/fts/fts-api-private.h	Mon Apr 20 16:19:07 2015 +0300
+++ b/src/plugins/fts/fts-api-private.h	Mon Apr 20 16:22:36 2015 +0300
@@ -61,7 +61,12 @@
 	/* Send only fully indexable words rather than randomly sized blocks */
 	FTS_BACKEND_FLAG_BUILD_FULL_WORDS	= 0x04,
 	/* Fuzzy search works */
-	FTS_BACKEND_FLAG_FUZZY_SEARCH		= 0x08
+	FTS_BACKEND_FLAG_FUZZY_SEARCH		= 0x08,
+	/* Tokenize all the input. update_build_more() will be called with a
+	   single directly indexable token at a time. Searching will modify the
+	   search args so that lookup() sees only tokens that can be directly
+	   searched. */
+	FTS_BACKEND_FLAG_TOKENIZED_INPUT	= 0x10
 };
 
 struct fts_backend {
@@ -71,6 +76,8 @@
 	struct fts_backend_vfuncs v;
 	struct mail_namespace *ns;
 
+	struct fts_tokenizer *tokenizer;
+
 	unsigned int updating:1;
 };
 
diff -r 81e5b977e5c5 -r 914bdca67d1f src/plugins/fts/fts-build-mail.c
--- a/src/plugins/fts/fts-build-mail.c	Mon Apr 20 16:19:07 2015 +0300
+++ b/src/plugins/fts/fts-build-mail.c	Mon Apr 20 16:22:36 2015 +0300
@@ -10,6 +10,10 @@
 #include "message-decoder.h"
 #include "mail-storage.h"
 #include "fts-parser.h"
+#include "fts-user.h"
+#include "fts-language.h"
+#include "fts-tokenizer.h"
+#include "fts-filter.h"
 #include "fts-api-private.h"
 #include "fts-build-mail.h"
 
@@ -27,7 +31,13 @@
 	char *content_type, *content_disposition;
 	struct fts_parser *body_parser;
 
-	buffer_t *word_buf;
+	buffer_t *word_buf, *pending_input;
+	struct fts_user_language *cur_user_lang;
+};
+
+static struct fts_user_language fts_user_language_data = {
+	.lang = &fts_language_data,
+	.filter = NULL
 };
 
 static int fts_build_data(struct fts_mail_build_context *ctx,
@@ -62,6 +72,25 @@
 		i_strndup(hdr->full_value, hdr->full_value_len);
 }
 
+static bool header_has_language(const char *name)
+{
+	/* FIXME: should email address headers be detected as different
+	   languages? That mainly contains people's names.. */
+	/*if (message_header_is_address(name))
+		return TRUE;*/
+
+	/* Subject definitely contains language-specific data that can be
+	   detected. Comments and Keywords headers could also contain it,
+	   although just about nobody uses those headers.
+
+	   For now we assume that other headers contain non-language specific
+	   data that we don't want to filter in special ways. For example
+	   it is good to be able to search for Message-IDs. */
+	return strcasecmp(name, "Subject") == 0 ||
+		strcasecmp(name, "Comments") == 0 ||
+		strcasecmp(name, "Keywords") == 0;
+}
+
 static void fts_parse_mail_header(struct fts_mail_build_context *ctx,
 				  const struct message_block *raw_block)
 {
@@ -116,6 +145,11 @@
 	key.part = block->part;
 	key.hdr_name = hdr->name;
 
+	if (!header_has_language(key.hdr_name))
+		ctx->cur_user_lang = &fts_user_language_data;
+	else
+		ctx->cur_user_lang = NULL;
+
 	if (!fts_backend_update_set_build_key(ctx->update_ctx, &key))
 		return;
 
@@ -184,6 +218,7 @@
 	}
 	key.body_content_type = content_type;
 	key.body_content_disposition = ctx->content_disposition;
+	ctx->cur_user_lang = NULL;
 	if (!fts_backend_update_set_build_key(ctx->update_ctx, &key)) {
 		if (ctx->body_parser != NULL)
 			(void)fts_parser_deinit(&ctx->body_parser);
@@ -193,8 +228,104 @@
 }
 
 static int
-fts_build_body_block_full_words(struct fts_mail_build_context *ctx,
-				const unsigned char *data, size_t size, bool last)
+fts_build_add_tokens_with_filter(struct fts_mail_build_context *ctx,
+				 const unsigned char *data, size_t size)
+{
+	struct fts_tokenizer *tokenizer = ctx->update_ctx->backend->tokenizer;
+	struct fts_filter *filter = ctx->cur_user_lang->filter;
+	const char *token;
+	while ((token = fts_tokenizer_next(tokenizer, data, size)) != NULL) {
+		if (filter != NULL) {
+			token = fts_filter_filter(filter, token);
+			if (token == NULL)
+				continue;
+		}
+		if (fts_backend_update_build_more(ctx->update_ctx,
+						  (const void *)token,
+						  strlen(token)) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+static int
+fts_detect_language(struct fts_mail_build_context *ctx,
+		    const unsigned char *data, size_t size, bool last,
+		    const struct fts_language **lang_r)
+{
+	struct mail_user *user = ctx->update_ctx->backend->ns->user;
+	struct fts_language_list *lang_list = fts_user_get_language_list(user);
+	const struct fts_language *lang;
+
+	switch (fts_language_detect(lang_list, data, size, &lang)) {
+	case FTS_LANGUAGE_RESULT_SHORT:
+		/* save the input so far and try again later */
+		buffer_append(ctx->pending_input, data, size);
+		if (last) {
+			/* we've run out of data. use the default language. */
+			*lang_r = fts_language_list_get_first(lang_list);
+			return 1;
+		}
+		return 0;
+	case FTS_LANGUAGE_RESULT_UNKNOWN:
+		/* use the default language */
+		*lang_r = fts_language_list_get_first(lang_list);
+		return 1;
+	case FTS_LANGUAGE_RESULT_OK:
+		*lang_r = lang;
+		return 1;
+	case FTS_LANGUAGE_RESULT_ERROR:
+		/* internal language detection library failure
+		   (e.g. invalid config). don't index anything. */
+		return -1;
+	default:
+		i_unreached();
+	}
+}
+
+static int
+fts_build_tokenized(struct fts_mail_build_context *ctx,
+		    const unsigned char *data, size_t size, bool last)
+{
+	struct mail_user *user = ctx->update_ctx->backend->ns->user;
+	const struct fts_language *lang;
+	const char *error;
+	int ret;
+
+	if (ctx->cur_user_lang != NULL) {
+		/* we already have a language */
+	} else if ((ret = fts_detect_language(ctx, data, size, last, &lang)) < 0) {
+		return -1;
+	} else if (ret == 0) {
+		/* wait for more data */
+		return 0;
+	} else {
+		if (fts_user_language_get(user, lang, &ctx->cur_user_lang,
+					  &error) < 0) {
+			i_error("fts-dovecot: Can't index input because of invalid language '%s' config: %s",
+				lang->name, error);
+			return -1;
+		}
+		if (ctx->pending_input->used > 0) {
+			if (fts_build_add_tokens_with_filter(ctx,
+					ctx->pending_input->data,
+					ctx->pending_input->used) < 0)
+				return -1;
+			buffer_set_used_size(ctx->pending_input, 0);
+		}
+	}
+	if (fts_build_add_tokens_with_filter(ctx, data, size) < 0)
+		return -1;
+	if (last) {
+		if (fts_build_add_tokens_with_filter(ctx, NULL, 0) < 0)
+			return -1;
+	}
+	return 0;
+}
+
+static int
+fts_build_full_words(struct fts_mail_build_context *ctx,
+		     const unsigned char *data, size_t size, bool last)
 {
 	size_t i;
 
@@ -248,8 +379,11 @@
 			  const unsigned char *data, size_t size, bool last)
 {
 	if ((ctx->update_ctx->backend->flags &
-	     FTS_BACKEND_FLAG_BUILD_FULL_WORDS) != 0) {
-		return fts_build_body_block_full_words(ctx, data, size, last);
+	     FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0) {
+		return fts_build_tokenized(ctx, data, size, last);
+	} else if ((ctx->update_ctx->backend->flags &
+		    FTS_BACKEND_FLAG_BUILD_FULL_WORDS) != 0) {
+		return fts_build_full_words(ctx, data, size, last);
 	} else {
 		return fts_backend_update_build_more(ctx->update_ctx, data, size);
 	}
@@ -308,6 +442,8 @@
 	memset(&ctx, 0, sizeof(ctx));
 	ctx.update_ctx = update_ctx;
 	ctx.mail = mail;
+	if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0)
+		ctx.pending_input = buffer_create_dynamic(default_pool, 128);
 
 	prev_part = NULL;
 	parser = message_parser_init(pool_datastack_create(), input,
@@ -398,6 +534,8 @@
 	i_free(ctx.content_disposition);
 	if (ctx.word_buf != NULL)
 		buffer_free(&ctx.word_buf);
+	if (ctx.pending_input != NULL)
+		buffer_free(&ctx.pending_input);
 	return ret < 0 ? -1 : 1;
 }
 
diff -r 81e5b977e5c5 -r 914bdca67d1f src/plugins/fts/fts-plugin.c
--- a/src/plugins/fts/fts-plugin.c	Mon Apr 20 16:19:07 2015 +0300
+++ b/src/plugins/fts/fts-plugin.c	Mon Apr 20 16:22:36 2015 +0300
@@ -2,8 +2,10 @@
 
 #include "lib.h"
 #include "mail-storage-hooks.h"
+#include "fts-filter.h"
 #include "fts-parser.h"
 #include "fts-storage.h"
+#include "fts-user.h"
 #include "fts-plugin.h"
 
 #include <stdlib.h>
@@ -11,6 +13,7 @@
 const char *fts_plugin_version = DOVECOT_ABI_VERSION;


More information about the dovecot-cvs mailing list