dovecot-2.2: fts: Added FTS_BACKEND_FLAG_TOKENIZED_INPUT, which is implemented via lib-fts.
dovecot at dovecot.org
dovecot at dovecot.org
Mon Apr 20 13:24:08 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/914bdca67d1f
changeset: 18415:914bdca67d1f
user: Timo Sirainen <tss at iki.fi>
date: Mon Apr 20 16:22:36 2015 +0300
description:
fts: Added FTS_BACKEND_FLAG_TOKENIZED_INPUT, which is implemented via lib-fts.
diffstat:
src/plugins/fts/Makefile.am | 11 +-
src/plugins/fts/fts-api-private.h | 9 +-
src/plugins/fts/fts-build-mail.c | 148 ++++++++++++++++++++++++-
src/plugins/fts/fts-plugin.c | 5 +
src/plugins/fts/fts-search-args.c | 168 +++++++++++++++++++++++++++++
src/plugins/fts/fts-search-args.h | 7 +
src/plugins/fts/fts-storage.c | 40 +++++-
src/plugins/fts/fts-user.c | 218 ++++++++++++++++++++++++++++++++++++++
src/plugins/fts/fts-user.h | 22 +++
9 files changed, 612 insertions(+), 16 deletions(-)
diffs (truncated from 834 to 300 lines):
diff -r 81e5b977e5c5 -r 914bdca67d1f src/plugins/fts/Makefile.am
--- a/src/plugins/fts/Makefile.am Mon Apr 20 16:19:07 2015 +0300
+++ b/src/plugins/fts/Makefile.am Mon Apr 20 16:22:36 2015 +0300
@@ -4,6 +4,7 @@
AM_CPPFLAGS = \
-I$(top_srcdir)/src/lib \
-I$(top_srcdir)/src/lib-settings \
+ -I$(top_srcdir)/src/lib-fts \
-I$(top_srcdir)/src/lib-http \
-I$(top_srcdir)/src/lib-mail \
-I$(top_srcdir)/src/lib-index \
@@ -18,6 +19,8 @@
module_LTLIBRARIES = \
lib20_fts_plugin.la
+lib20_fts_plugin_la_LIBADD = ../../lib-fts/libfts.la
+
lib20_fts_plugin_la_SOURCES = \
fts-api.c \
fts-build-mail.c \
@@ -29,8 +32,10 @@
fts-parser-tika.c \
fts-plugin.c \
fts-search.c \
+ fts-search-args.c \
fts-search-serialize.c \
- fts-storage.c
+ fts-storage.c \
+ fts-user.c
pkginc_libdir=$(pkgincludedir)
pkginc_lib_HEADERS = \
@@ -44,8 +49,10 @@
doveadm-fts.h \
fts-build-mail.h \
fts-plugin.h \
+ fts-search-args.h \
fts-search-serialize.h \
- fts-storage.h
+ fts-storage.h \
+ fts-user.h
pkglibexec_PROGRAMS = xml2text
diff -r 81e5b977e5c5 -r 914bdca67d1f src/plugins/fts/fts-api-private.h
--- a/src/plugins/fts/fts-api-private.h Mon Apr 20 16:19:07 2015 +0300
+++ b/src/plugins/fts/fts-api-private.h Mon Apr 20 16:22:36 2015 +0300
@@ -61,7 +61,12 @@
/* Send only fully indexable words rather than randomly sized blocks */
FTS_BACKEND_FLAG_BUILD_FULL_WORDS = 0x04,
/* Fuzzy search works */
- FTS_BACKEND_FLAG_FUZZY_SEARCH = 0x08
+ FTS_BACKEND_FLAG_FUZZY_SEARCH = 0x08,
+ /* Tokenize all the input. update_build_more() will be called with a single
+ directly indexable token at a time. Searching will modify the search
+ args so that lookup() sees only tokens that can be directly
+ searched. */
+ FTS_BACKEND_FLAG_TOKENIZED_INPUT = 0x10
};
struct fts_backend {
@@ -71,6 +76,8 @@
struct fts_backend_vfuncs v;
struct mail_namespace *ns;
+ struct fts_tokenizer *tokenizer;
+
unsigned int updating:1;
};
diff -r 81e5b977e5c5 -r 914bdca67d1f src/plugins/fts/fts-build-mail.c
--- a/src/plugins/fts/fts-build-mail.c Mon Apr 20 16:19:07 2015 +0300
+++ b/src/plugins/fts/fts-build-mail.c Mon Apr 20 16:22:36 2015 +0300
@@ -10,6 +10,10 @@
#include "message-decoder.h"
#include "mail-storage.h"
#include "fts-parser.h"
+#include "fts-user.h"
+#include "fts-language.h"
+#include "fts-tokenizer.h"
+#include "fts-filter.h"
#include "fts-api-private.h"
#include "fts-build-mail.h"
@@ -27,7 +31,13 @@
char *content_type, *content_disposition;
struct fts_parser *body_parser;
- buffer_t *word_buf;
+ buffer_t *word_buf, *pending_input;
+ struct fts_user_language *cur_user_lang;
+};
+
+static struct fts_user_language fts_user_language_data = {
+ .lang = &fts_language_data,
+ .filter = NULL
};
static int fts_build_data(struct fts_mail_build_context *ctx,
@@ -62,6 +72,25 @@
i_strndup(hdr->full_value, hdr->full_value_len);
}
+static bool header_has_language(const char *name)
+{
+ /* FIXME: should email address headers be detected as different
languages? Those mainly contain people's names. */
+ /*if (message_header_is_address(name))
+ return TRUE;*/
+
+ /* Subject definitely contains language-specific data that can be
detected. Comments and Keywords headers could also contain such data, although
+ just about nobody uses those headers.
+
+ For now we assume that other headers contain non-language specific
+ data that we don't want to filter in special ways. For example
+ it is good to be able to search for Message-IDs. */
+ return strcasecmp(name, "Subject") == 0 ||
+ strcasecmp(name, "Comments") == 0 ||
+ strcasecmp(name, "Keywords") == 0;
+}
+
static void fts_parse_mail_header(struct fts_mail_build_context *ctx,
const struct message_block *raw_block)
{
@@ -116,6 +145,11 @@
key.part = block->part;
key.hdr_name = hdr->name;
+ if (!header_has_language(key.hdr_name))
+ ctx->cur_user_lang = &fts_user_language_data;
+ else
+ ctx->cur_user_lang = NULL;
+
if (!fts_backend_update_set_build_key(ctx->update_ctx, &key))
return;
@@ -184,6 +218,7 @@
}
key.body_content_type = content_type;
key.body_content_disposition = ctx->content_disposition;
+ ctx->cur_user_lang = NULL;
if (!fts_backend_update_set_build_key(ctx->update_ctx, &key)) {
if (ctx->body_parser != NULL)
(void)fts_parser_deinit(&ctx->body_parser);
@@ -193,8 +228,104 @@
}
static int
-fts_build_body_block_full_words(struct fts_mail_build_context *ctx,
- const unsigned char *data, size_t size, bool last)
+fts_build_add_tokens_with_filter(struct fts_mail_build_context *ctx,
+ const unsigned char *data, size_t size)
+{
+ struct fts_tokenizer *tokenizer = ctx->update_ctx->backend->tokenizer;
+ struct fts_filter *filter = ctx->cur_user_lang->filter;
+ const char *token;
+ while ((token = fts_tokenizer_next(tokenizer, data, size)) != NULL) {
+ if (filter != NULL) {
+ token = fts_filter_filter(filter, token);
+ if (token == NULL)
+ continue;
+ }
+ if (fts_backend_update_build_more(ctx->update_ctx,
+ (const void *)token,
+ strlen(token)) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+static int
+fts_detect_language(struct fts_mail_build_context *ctx,
+ const unsigned char *data, size_t size, bool last,
+ const struct fts_language **lang_r)
+{
+ struct mail_user *user = ctx->update_ctx->backend->ns->user;
+ struct fts_language_list *lang_list = fts_user_get_language_list(user);
+ const struct fts_language *lang;
+
+ switch (fts_language_detect(lang_list, data, size, &lang)) {
+ case FTS_LANGUAGE_RESULT_SHORT:
+ /* save the input so far and try again later */
+ buffer_append(ctx->pending_input, data, size);
+ if (last) {
+ /* we've run out of data. use the default language. */
+ *lang_r = fts_language_list_get_first(lang_list);
+ return 1;
+ }
+ return 0;
+ case FTS_LANGUAGE_RESULT_UNKNOWN:
+ /* use the default language */
+ *lang_r = fts_language_list_get_first(lang_list);
+ return 1;
+ case FTS_LANGUAGE_RESULT_OK:
+ *lang_r = lang;
+ return 1;
+ case FTS_LANGUAGE_RESULT_ERROR:
+ /* internal language detection library failure
+ (e.g. invalid config). don't index anything. */
+ return -1;
+ default:
+ i_unreached();
+ }
+}
+
+static int
+fts_build_tokenized(struct fts_mail_build_context *ctx,
+ const unsigned char *data, size_t size, bool last)
+{
+ struct mail_user *user = ctx->update_ctx->backend->ns->user;
+ const struct fts_language *lang;
+ const char *error;
+ int ret;
+
+ if (ctx->cur_user_lang != NULL) {
+ /* we already have a language */
+ } else if ((ret = fts_detect_language(ctx, data, size, last, &lang)) < 0) {
+ return -1;
+ } else if (ret == 0) {
+ /* wait for more data */
+ return 0;
+ } else {
+ if (fts_user_language_get(user, lang, &ctx->cur_user_lang,
+ &error) < 0) {
+ i_error("fts-dovecot: Can't index input because of invalid language '%s' config: %s",
+ lang->name, error);
+ return -1;
+ }
+ if (ctx->pending_input->used > 0) {
+ if (fts_build_add_tokens_with_filter(ctx,
+ ctx->pending_input->data,
+ ctx->pending_input->used) < 0)
+ return -1;
+ buffer_set_used_size(ctx->pending_input, 0);
+ }
+ }
+ if (fts_build_add_tokens_with_filter(ctx, data, size) < 0)
+ return -1;
+ if (last) {
+ if (fts_build_add_tokens_with_filter(ctx, NULL, 0) < 0)
+ return -1;
+ }
+ return 0;
+}
+
+static int
+fts_build_full_words(struct fts_mail_build_context *ctx,
+ const unsigned char *data, size_t size, bool last)
{
size_t i;
@@ -248,8 +379,11 @@
const unsigned char *data, size_t size, bool last)
{
if ((ctx->update_ctx->backend->flags &
- FTS_BACKEND_FLAG_BUILD_FULL_WORDS) != 0) {
- return fts_build_body_block_full_words(ctx, data, size, last);
+ FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0) {
+ return fts_build_tokenized(ctx, data, size, last);
+ } else if ((ctx->update_ctx->backend->flags &
+ FTS_BACKEND_FLAG_BUILD_FULL_WORDS) != 0) {
+ return fts_build_full_words(ctx, data, size, last);
} else {
return fts_backend_update_build_more(ctx->update_ctx, data, size);
}
@@ -308,6 +442,8 @@
memset(&ctx, 0, sizeof(ctx));
ctx.update_ctx = update_ctx;
ctx.mail = mail;
+ if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0)
+ ctx.pending_input = buffer_create_dynamic(default_pool, 128);
prev_part = NULL;
parser = message_parser_init(pool_datastack_create(), input,
@@ -398,6 +534,8 @@
i_free(ctx.content_disposition);
if (ctx.word_buf != NULL)
buffer_free(&ctx.word_buf);
+ if (ctx.pending_input != NULL)
+ buffer_free(&ctx.pending_input);
return ret < 0 ? -1 : 1;
}
diff -r 81e5b977e5c5 -r 914bdca67d1f src/plugins/fts/fts-plugin.c
--- a/src/plugins/fts/fts-plugin.c Mon Apr 20 16:19:07 2015 +0300
+++ b/src/plugins/fts/fts-plugin.c Mon Apr 20 16:22:36 2015 +0300
@@ -2,8 +2,10 @@
#include "lib.h"
#include "mail-storage-hooks.h"
+#include "fts-filter.h"
#include "fts-parser.h"
#include "fts-storage.h"
+#include "fts-user.h"
#include "fts-plugin.h"
#include <stdlib.h>
@@ -11,6 +13,7 @@
const char *fts_plugin_version = DOVECOT_ABI_VERSION;
More information about the dovecot-cvs mailing list