dovecot-2.2: fts: Lowercase non-human language input while indexing.
dovecot at dovecot.org
dovecot at dovecot.org
Sat May 9 11:43:00 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/fcc20dce3c83
changeset: 18585:fcc20dce3c83
user: Timo Sirainen <tss at iki.fi>
date: Sat May 09 14:41:05 2015 +0300
description:
fts: Lowercase non-human language input while indexing.
diffstat:
src/plugins/fts/fts-build-mail.c | 29 +++++++++++++++++++++--------
src/plugins/fts/fts-user.c | 23 +++++++++++++++++++++++
src/plugins/fts/fts-user.h | 1 +
3 files changed, 45 insertions(+), 8 deletions(-)
diffs (112 lines):
diff -r 75b4b312ea09 -r fcc20dce3c83 src/plugins/fts/fts-build-mail.c
--- a/src/plugins/fts/fts-build-mail.c Sat May 09 14:26:42 2015 +0300
+++ b/src/plugins/fts/fts-build-mail.c Sat May 09 14:41:05 2015 +0300
@@ -35,11 +35,6 @@
struct fts_user_language *cur_user_lang;
};
-static struct fts_user_language fts_user_language_data = {
- .lang = &fts_language_data,
- .filter = NULL
-};
-
static int fts_build_data(struct fts_mail_build_context *ctx,
const unsigned char *data, size_t size, bool last);
@@ -127,6 +122,17 @@
i_free(buf);
}
+static bool data_has_8bit(const unsigned char *data, size_t size)
+{
+ size_t i;
+
+ for (i = 0; i < size; i++) {
+ if ((data[i] & 0x80) != 0)
+ return TRUE;
+ }
+ return FALSE;
+}
+
static void fts_build_mail_header(struct fts_mail_build_context *ctx,
const struct message_block *block)
{
@@ -145,10 +151,17 @@
key.part = block->part;
key.hdr_name = hdr->name;
- if (!header_has_language(key.hdr_name))
- ctx->cur_user_lang = &fts_user_language_data;
+ /* Headers that don't contain any human language will only be
+ translated to lowercase - no stemming or other filtering. There's
+ unfortunately no perfect way of detecting which headers contain
+ human languages, so we have a list of some hardcoded header names
+ and we'll also assume that if there's any 8bit content it's a human
+ language. */
+ if (header_has_language(key.hdr_name) ||
+ data_has_8bit(hdr->full_value, hdr->full_value_len))
+ ctx->cur_user_lang = NULL;
else
- ctx->cur_user_lang = NULL;
+ ctx->cur_user_lang = fts_user_get_data_lang(ctx->update_ctx->backend->ns->user);
if (!fts_backend_update_set_build_key(ctx->update_ctx, &key))
return;
diff -r 75b4b312ea09 -r fcc20dce3c83 src/plugins/fts/fts-user.c
--- a/src/plugins/fts/fts-user.c Sat May 09 14:26:42 2015 +0300
+++ b/src/plugins/fts/fts-user.c Sat May 09 14:41:05 2015 +0300
@@ -16,6 +16,7 @@
struct fts_language_list *lang_list;
struct fts_tokenizer *index_tokenizer, *search_tokenizer;
+ struct fts_user_language *data_lang;
ARRAY_TYPE(fts_user_language) languages;
};
@@ -269,6 +270,26 @@
return &fuser->languages;
}
+struct fts_user_language *fts_user_get_data_lang(struct mail_user *user)
+{
+ struct fts_user *fuser = FTS_USER_CONTEXT(user);
+ struct fts_user_language *lang;
+ const char *error;
+
+ if (fuser->data_lang != NULL)
+ return fuser->data_lang;
+
+ lang = p_new(user->pool, struct fts_user_language, 1);
+ lang->lang = &fts_language_data;
+
+ if (fts_filter_create(fts_filter_lowercase, NULL, lang->lang, NULL,
+ &lang->filter, &error) < 0)
+ i_unreached();
+ i_assert(lang->filter != NULL);
+ fuser->data_lang = lang;
+ return fuser->data_lang;
+}
+
static void fts_user_free(struct fts_user *fuser)
{
struct fts_user_language *const *user_langp;
@@ -280,6 +301,8 @@
if ((*user_langp)->filter != NULL)
fts_filter_unref(&(*user_langp)->filter);
}
+ if (fuser->data_lang != NULL && fuser->data_lang->filter != NULL)
+ fts_filter_unref(&fuser->data_lang->filter);
if (fuser->index_tokenizer != NULL)
fts_tokenizer_unref(&fuser->index_tokenizer);
diff -r 75b4b312ea09 -r fcc20dce3c83 src/plugins/fts/fts-user.h
--- a/src/plugins/fts/fts-user.h Sat May 09 14:26:42 2015 +0300
+++ b/src/plugins/fts/fts-user.h Sat May 09 14:41:05 2015 +0300
@@ -15,6 +15,7 @@
struct fts_language_list *fts_user_get_language_list(struct mail_user *user);
const ARRAY_TYPE(fts_user_language) *
fts_user_get_all_languages(struct mail_user *user);
+struct fts_user_language *fts_user_get_data_lang(struct mail_user *user);
int fts_mail_user_init(struct mail_user *user, const char **error_r);
void fts_mail_user_deinit(struct mail_user *user);
More information about the dovecot-cvs
mailing list