dovecot-2.2: fts: Lowercase non-human language input while indexing.

dovecot at dovecot.org dovecot at dovecot.org
Sat May 9 11:43:00 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/fcc20dce3c83
changeset: 18585:fcc20dce3c83
user:      Timo Sirainen <tss at iki.fi>
date:      Sat May 09 14:41:05 2015 +0300
description:
fts: Lowercase non-human language input while indexing.

diffstat:

 src/plugins/fts/fts-build-mail.c |  29 +++++++++++++++++++++--------
 src/plugins/fts/fts-user.c       |  23 +++++++++++++++++++++++
 src/plugins/fts/fts-user.h       |   1 +
 3 files changed, 45 insertions(+), 8 deletions(-)

diffs (112 lines):

diff -r 75b4b312ea09 -r fcc20dce3c83 src/plugins/fts/fts-build-mail.c
--- a/src/plugins/fts/fts-build-mail.c	Sat May 09 14:26:42 2015 +0300
+++ b/src/plugins/fts/fts-build-mail.c	Sat May 09 14:41:05 2015 +0300
@@ -35,11 +35,6 @@
 	struct fts_user_language *cur_user_lang;
 };
 
-static struct fts_user_language fts_user_language_data = {
-	.lang = &fts_language_data,
-	.filter = NULL
-};
-
 static int fts_build_data(struct fts_mail_build_context *ctx,
 			  const unsigned char *data, size_t size, bool last);
 
@@ -127,6 +122,17 @@
 	i_free(buf);
 }
 
+static bool data_has_8bit(const unsigned char *data, size_t size)
+{
+	size_t i;
+
+	for (i = 0; i < size; i++) {
+		if ((data[i] & 0x80) != 0)
+			return TRUE;
+	}
+	return FALSE;
+}
+
 static void fts_build_mail_header(struct fts_mail_build_context *ctx,
 				  const struct message_block *block)
 {
@@ -145,10 +151,17 @@
 	key.part = block->part;
 	key.hdr_name = hdr->name;
 
-	if (!header_has_language(key.hdr_name))
-		ctx->cur_user_lang = &fts_user_language_data;
+	/* Headers that don't contain any human language will only be
+	   translated to lowercase - no stemming or other filtering. There's
+	   unfortunately no perfect way of detecting which headers contain
+	   human languages, so we have a list of some hardcoded header names
+	   and we'll also assume that if there's any 8bit content it's a human
+	   language. */
+	if (header_has_language(key.hdr_name) ||
+	    data_has_8bit(hdr->full_value, hdr->full_value_len))
+		ctx->cur_user_lang = NULL;
 	else
-		ctx->cur_user_lang = NULL;
+		ctx->cur_user_lang = fts_user_get_data_lang(ctx->update_ctx->backend->ns->user);
 
 	if (!fts_backend_update_set_build_key(ctx->update_ctx, &key))
 		return;
diff -r 75b4b312ea09 -r fcc20dce3c83 src/plugins/fts/fts-user.c
--- a/src/plugins/fts/fts-user.c	Sat May 09 14:26:42 2015 +0300
+++ b/src/plugins/fts/fts-user.c	Sat May 09 14:41:05 2015 +0300
@@ -16,6 +16,7 @@
 
 	struct fts_language_list *lang_list;
 	struct fts_tokenizer *index_tokenizer, *search_tokenizer;
+	struct fts_user_language *data_lang;
 	ARRAY_TYPE(fts_user_language) languages;
 };
 
@@ -269,6 +270,26 @@
 	return &fuser->languages;
 }
 
+struct fts_user_language *fts_user_get_data_lang(struct mail_user *user)
+{
+	struct fts_user *fuser = FTS_USER_CONTEXT(user);
+	struct fts_user_language *lang;
+	const char *error;
+
+	if (fuser->data_lang != NULL)
+		return fuser->data_lang;
+
+	lang = p_new(user->pool, struct fts_user_language, 1);
+	lang->lang = &fts_language_data;
+
+	if (fts_filter_create(fts_filter_lowercase, NULL, lang->lang, NULL,
+			      &lang->filter, &error) < 0)
+		i_unreached();
+	i_assert(lang->filter != NULL);
+	fuser->data_lang = lang;
+	return fuser->data_lang;
+}
+
 static void fts_user_free(struct fts_user *fuser)
 {
 	struct fts_user_language *const *user_langp;
@@ -280,6 +301,8 @@
 		if ((*user_langp)->filter != NULL)
 			fts_filter_unref(&(*user_langp)->filter);
 	}
+	if (fuser->data_lang != NULL && fuser->data_lang->filter != NULL)
+		fts_filter_unref(&fuser->data_lang->filter);
 
 	if (fuser->index_tokenizer != NULL)
 		fts_tokenizer_unref(&fuser->index_tokenizer);
diff -r 75b4b312ea09 -r fcc20dce3c83 src/plugins/fts/fts-user.h
--- a/src/plugins/fts/fts-user.h	Sat May 09 14:26:42 2015 +0300
+++ b/src/plugins/fts/fts-user.h	Sat May 09 14:41:05 2015 +0300
@@ -15,6 +15,7 @@
 struct fts_language_list *fts_user_get_language_list(struct mail_user *user);
 const ARRAY_TYPE(fts_user_language) *
 fts_user_get_all_languages(struct mail_user *user);
+struct fts_user_language *fts_user_get_data_lang(struct mail_user *user);
 
 int fts_mail_user_init(struct mail_user *user, const char **error_r);
 void fts_mail_user_deinit(struct mail_user *user);


More information about the dovecot-cvs mailing list