dovecot-2.2: fts: Added support for per-language tokenizer settings.
dovecot at dovecot.org
dovecot at dovecot.org
Thu Dec 3 12:24:25 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/0cb2c54fa452
changeset: 19451:0cb2c54fa452
user: Timo Sirainen <tss at iki.fi>
date: Thu Dec 03 14:24:06 2015 +0200
description:
fts: Added support for per-language tokenizer settings.
fts_tokenizers_<lang> now overrides fts_tokenizers setting.
fts_tokenizer_<name>_<lang> now overrides fts_tokenizer_<name> setting.
diffstat:
src/plugins/fts/fts-build-mail.c | 34 ++++++----
src/plugins/fts/fts-search-args.c | 86 ++++++++++++++++-----------
src/plugins/fts/fts-user.c | 119 ++++++++++++++++++++-----------------
src/plugins/fts/fts-user.h | 3 +-
4 files changed, 134 insertions(+), 108 deletions(-)
diffs (truncated from 445 to 300 lines):
diff -r be47ca42cbc4 -r 0cb2c54fa452 src/plugins/fts/fts-build-mail.c
--- a/src/plugins/fts/fts-build-mail.c Thu Dec 03 12:22:24 2015 +0200
+++ b/src/plugins/fts/fts-build-mail.c Thu Dec 03 14:24:06 2015 +0200
@@ -135,6 +135,18 @@
return FALSE;
}
+static void fts_mail_build_ctx_set_lang(struct fts_mail_build_context *ctx,
+ struct fts_user_language *user_lang)
+{
+ i_assert(user_lang != NULL); /* callers pass language lookup results in unchecked */
+ /* cur_user_lang supplies the tokenizer and filter used when adding tokens */
+ ctx->cur_user_lang = user_lang;
+ /* reset tokenizer between fields - just to be sure no state
+ leaks between fields (especially if previous indexing had
+ failed) */
+ fts_tokenizer_reset(user_lang->index_tokenizer);
+}
+
static void
fts_build_tokenized_hdr_update_lang(struct fts_mail_build_context *ctx,
const struct message_header_line *hdr)
@@ -148,8 +160,10 @@
if (header_has_language(hdr->name) ||
data_has_8bit(hdr->full_value, hdr->full_value_len))
ctx->cur_user_lang = NULL;
- else
- ctx->cur_user_lang = fts_user_get_data_lang(ctx->update_ctx->backend->ns->user);
+ else {
+ fts_mail_build_ctx_set_lang(ctx,
+ fts_user_get_data_lang(ctx->update_ctx->backend->ns->user));
+ }
}
static int fts_build_mail_header(struct fts_mail_build_context *ctx,
@@ -268,12 +282,11 @@
fts_build_add_tokens_with_filter(struct fts_mail_build_context *ctx,
const unsigned char *data, size_t size)
{
- struct fts_tokenizer *tokenizer;
+ struct fts_tokenizer *tokenizer = ctx->cur_user_lang->index_tokenizer;
struct fts_filter *filter = ctx->cur_user_lang->filter;
const char *token, *error;
int ret = 1, ret2;
- tokenizer = fts_user_get_index_tokenizer(ctx->update_ctx->backend->ns->user);
while (ret > 0) T_BEGIN {
ret = ret2 = fts_tokenizer_next(tokenizer, data, size, &token, &error);
if (ret2 > 0 && filter != NULL)
@@ -341,8 +354,7 @@
/* wait for more data */
return 0;
} else {
- ctx->cur_user_lang = fts_user_language_find(user, lang);
- i_assert(ctx->cur_user_lang != NULL);
+ fts_mail_build_ctx_set_lang(ctx, fts_user_language_find(user, lang));
if (ctx->pending_input->used > 0) {
if (fts_build_add_tokens_with_filter(ctx,
@@ -480,16 +492,8 @@
memset(&ctx, 0, sizeof(ctx));
ctx.update_ctx = update_ctx;
ctx.mail = mail;
- if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0) {
+ if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0)
ctx.pending_input = buffer_create_dynamic(default_pool, 128);
- /* reset tokenizer between mails - just to be sure no state
- leaks between mails (especially if previous indexing had
- failed) */
- struct fts_tokenizer *tokenizer;
-
- tokenizer = fts_user_get_index_tokenizer(update_ctx->backend->ns->user);
- fts_tokenizer_reset(tokenizer);
- }
prev_part = NULL;
parser = message_parser_init(pool_datastack_create(), input,
diff -r be47ca42cbc4 -r 0cb2c54fa452 src/plugins/fts/fts-search-args.c
--- a/src/plugins/fts/fts-search-args.c Thu Dec 03 12:22:24 2015 +0200
+++ b/src/plugins/fts/fts-search-args.c Thu Dec 03 14:24:06 2015 +0200
@@ -54,14 +54,14 @@
}
static int
-fts_backend_dovecot_expand_lang_tokens(const ARRAY_TYPE(fts_user_language) *languages,
- pool_t pool,
- struct mail_search_arg *parent_arg,
- const struct mail_search_arg *orig_arg,
- const char *orig_token, const char *token)
+fts_backend_dovecot_expand_tokens(struct fts_filter *filter,
+ pool_t pool,
+ struct mail_search_arg *parent_arg,
+ const struct mail_search_arg *orig_arg,
+ const char *orig_token, const char *token,
+ const char **error_r)
{
struct mail_search_arg *arg;
- struct fts_user_language *const *langp;
ARRAY_TYPE(const_string) tokens;
const char *token2, *error;
int ret;
@@ -73,15 +73,14 @@
array_append(&tokens, &token, 1);
/* add the word filtered */
- array_foreach(languages, langp) {
+ if (filter != NULL) {
token2 = t_strdup(token);
- ret = (*langp)->filter == NULL ? 1 :
- fts_filter_filter((*langp)->filter, &token2, &error);
+ ret = fts_filter_filter(filter, &token2, &error);
if (ret > 0) {
token2 = t_strdup(token2);
array_append(&tokens, &token2, 1);
} else if (ret < 0) {
- i_error("fts: Couldn't filter search tokens: %s", error);
+ *error_r = t_strdup_printf("Couldn't filter search token: %s", error);
return -1;
}
}
@@ -94,18 +93,50 @@
return 0;
}
+static int
+fts_backend_dovecot_tokenize_lang(struct fts_user_language *user_lang,
+ pool_t pool, struct mail_search_arg *and_arg,
+ struct mail_search_arg *orig_arg,
+ const char *orig_token, const char **error_r)
+{
+ unsigned int orig_token_len = strlen(orig_token);
+ const char *token, *error; /* error: written by whichever tokenizer call fails */
+ int ret;
+ /* Returns 0 on success, -1 on failure (tokenizer errors are reported via error_r). */
+ /* reset tokenizer between search args in case there's any state left
+ from some previous failure */
+ fts_tokenizer_reset(user_lang->search_tokenizer);
+ while ((ret = fts_tokenizer_next(user_lang->search_tokenizer,
+ (const void *)orig_token,
+ orig_token_len, &token, &error)) > 0) {
+ if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
+ and_arg, orig_arg, orig_token,
+ token, error_r) < 0)
+ return -1;
+ }
+ while (ret >= 0 &&
+ (ret = fts_tokenizer_final(user_lang->search_tokenizer, &token, &error)) > 0) {
+ if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
+ and_arg, orig_arg, orig_token,
+ token, error_r) < 0)
+ return -1;
+ }
+ if (ret < 0) { /* either tokenizer call failed; both store their message in error */
+ *error_r = t_strdup_printf("Couldn't tokenize search args: %s", error);
+ return -1;
+ }
+ return 0;
+}
+
static int fts_search_arg_expand(struct fts_backend *backend, pool_t pool,
struct mail_search_arg **argp)
{
const ARRAY_TYPE(fts_user_language) *languages;
+ struct fts_user_language *const *langp;
struct mail_search_arg *and_arg, *orig_arg = *argp;
- const char *error, *token, *orig_token = orig_arg->value.str;
- unsigned int orig_token_len = strlen(orig_token);
- struct fts_tokenizer *tokenizer;
- int ret;
+ const char *error, *orig_token = orig_arg->value.str;
languages = fts_user_get_all_languages(backend->ns->user);
- tokenizer = fts_user_get_search_tokenizer(backend->ns->user);
/* we want all the tokens found from the string to be found, so create
a parent AND and place all the filtered token alternatives under
@@ -115,27 +146,12 @@
and_arg->match_not = orig_arg->match_not;
and_arg->next = orig_arg->next;
- /* reset tokenizer between search args in case there's any state left
- from some previous failure */
- fts_tokenizer_reset(tokenizer);
- while ((ret = fts_tokenizer_next(tokenizer,
- (const void *)orig_token,
- orig_token_len, &token, &error)) > 0) {
- if (fts_backend_dovecot_expand_lang_tokens(languages, pool, and_arg,
- orig_arg, orig_token,
- token) < 0)
+ array_foreach(languages, langp) {
+ if (fts_backend_dovecot_tokenize_lang(*langp, pool, and_arg,
+ orig_arg, orig_token, &error) < 0) {
+ i_error("fts: %s", error);
return -1;
- }
- while (ret >= 0 &&
- (ret = fts_tokenizer_final(tokenizer, &token, &error)) > 0) {
- if (fts_backend_dovecot_expand_lang_tokens(languages, pool, and_arg,
- orig_arg, orig_token,
- token) < 0)
- return -1;
- }
- if (ret < 0) {
- i_error("fts: Couldn't tokenize search args: %s", error);
- return -1;
+ }
}
if (and_arg->value.subargs == NULL) {
diff -r be47ca42cbc4 -r 0cb2c54fa452 src/plugins/fts/fts-user.c
--- a/src/plugins/fts/fts-user.c Thu Dec 03 12:22:24 2015 +0200
+++ b/src/plugins/fts/fts-user.c Thu Dec 03 14:24:06 2015 +0200
@@ -16,7 +16,6 @@
int refcount;
struct fts_language_list *lang_list;
- struct fts_tokenizer *index_tokenizer, *search_tokenizer;
struct fts_user_language *data_lang;
ARRAY_TYPE(fts_user_language) languages;
};
@@ -148,6 +147,7 @@
static int
fts_user_create_tokenizer(struct mail_user *user,
+ const struct fts_language *lang,
struct fts_tokenizer **tokenizer_r, bool search,
const char **error_r)
{
@@ -158,11 +158,15 @@
unsigned int i;
int ret = 0;
- tokenizers_key = "fts_tokenizers";
+ tokenizers_key = t_strconcat("fts_tokenizers_", lang->name, NULL);
str = mail_user_plugin_getenv(user, tokenizers_key);
if (str == NULL) {
- *error_r = "fts_tokenizers setting is missing";
- return -1;
+ str = mail_user_plugin_getenv(user, "fts_tokenizers");
+ if (str == NULL) {
+ *error_r = t_strdup_printf("%s or fts_tokenizers setting must exist", tokenizers_key);
+ return -1;
+ }
+ tokenizers_key = "fts_tokenizers";
}
tokenizers = t_strsplit_spaces(str, " ");
@@ -177,8 +181,12 @@
}
tokenizer_set_name = t_str_replace(tokenizers[i], '-', '_');
- set_key = t_strdup_printf("fts_tokenizer_%s", tokenizer_set_name);
+ set_key = t_strdup_printf("fts_tokenizer_%s_%s", tokenizer_set_name, lang->name);
str = mail_user_plugin_getenv(user, set_key);
+ if (str == NULL) {
+ set_key = t_strdup_printf("fts_tokenizer_%s", tokenizer_set_name);
+ str = mail_user_plugin_getenv(user, set_key);
+ }
/* tell the tokenizers that we're tokenizing a search string
(instead of tokenizing indexed data) */
@@ -205,18 +213,20 @@
return 0;
}
-static int fts_user_init_tokenizers(struct mail_user *user,
- struct fts_user *fuser,
- const char **error_r)
+static int
+fts_user_language_init_tokenizers(struct mail_user *user,
+ struct fts_user_language *user_lang,
+ const char **error_r)
{
- if (fts_user_create_tokenizer(user, &fuser->index_tokenizer, FALSE,
+ if (fts_user_create_tokenizer(user, user_lang->lang,
+ &user_lang->index_tokenizer, FALSE,
error_r) < 0)
return -1;
- if (fts_user_create_tokenizer(user, &fuser->search_tokenizer, TRUE,
+ if (fts_user_create_tokenizer(user, user_lang->lang,
+ &user_lang->search_tokenizer, TRUE,
error_r) < 0)
return -1;
-
return 0;
}
@@ -234,35 +244,21 @@
return NULL;
}
-struct fts_tokenizer *fts_user_get_index_tokenizer(struct mail_user *user)
-{
- struct fts_user *fuser = FTS_USER_CONTEXT(user);
-
- return fuser->index_tokenizer;
-}
-
-struct fts_tokenizer *fts_user_get_search_tokenizer(struct mail_user *user)
More information about the dovecot-cvs
mailing list