dovecot-2.2: fts: Added support for per-language tokenizer settings.

dovecot at dovecot.org dovecot at dovecot.org
Thu Dec 3 12:24:25 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/0cb2c54fa452
changeset: 19451:0cb2c54fa452
user:      Timo Sirainen <tss at iki.fi>
date:      Thu Dec 03 14:24:06 2015 +0200
description:
fts: Added support for per-language tokenizer settings.
fts_tokenizers_<lang> now overrides fts_tokenizers setting.
fts_tokenizer_<name>_<lang> now overrides fts_tokenizer_<name> setting.

diffstat:

 src/plugins/fts/fts-build-mail.c  |   34 ++++++----
 src/plugins/fts/fts-search-args.c |   86 ++++++++++++++++-----------
 src/plugins/fts/fts-user.c        |  119 ++++++++++++++++++++-----------------
 src/plugins/fts/fts-user.h        |    3 +-
 4 files changed, 134 insertions(+), 108 deletions(-)

diffs (truncated from 445 to 300 lines):

diff -r be47ca42cbc4 -r 0cb2c54fa452 src/plugins/fts/fts-build-mail.c
--- a/src/plugins/fts/fts-build-mail.c	Thu Dec 03 12:22:24 2015 +0200
+++ b/src/plugins/fts/fts-build-mail.c	Thu Dec 03 14:24:06 2015 +0200
@@ -135,6 +135,18 @@
 	return FALSE;
 }
 
+static void fts_mail_build_ctx_set_lang(struct fts_mail_build_context *ctx,
+					struct fts_user_language *user_lang)
+{
+	i_assert(user_lang != NULL);
+
+	ctx->cur_user_lang = user_lang;
+	/* reset tokenizer between fields - just to be sure no state
+	   leaks between fields (especially if previous indexing had
+	   failed) */
+	fts_tokenizer_reset(user_lang->index_tokenizer);
+}
+
 static void
 fts_build_tokenized_hdr_update_lang(struct fts_mail_build_context *ctx,
 				    const struct message_header_line *hdr)
@@ -148,8 +160,10 @@
 	if (header_has_language(hdr->name) ||
 	    data_has_8bit(hdr->full_value, hdr->full_value_len))
 		ctx->cur_user_lang = NULL;
-	else
-		ctx->cur_user_lang = fts_user_get_data_lang(ctx->update_ctx->backend->ns->user);
+	else {
+		fts_mail_build_ctx_set_lang(ctx,
+			fts_user_get_data_lang(ctx->update_ctx->backend->ns->user));
+	}
 }
 
 static int fts_build_mail_header(struct fts_mail_build_context *ctx,
@@ -268,12 +282,11 @@
 fts_build_add_tokens_with_filter(struct fts_mail_build_context *ctx,
 				 const unsigned char *data, size_t size)
 {
-	struct fts_tokenizer *tokenizer;
+	struct fts_tokenizer *tokenizer = ctx->cur_user_lang->index_tokenizer;
 	struct fts_filter *filter = ctx->cur_user_lang->filter;
 	const char *token, *error;
 	int ret = 1, ret2;
 
-	tokenizer = fts_user_get_index_tokenizer(ctx->update_ctx->backend->ns->user);
 	while (ret > 0) T_BEGIN {
 		ret = ret2 = fts_tokenizer_next(tokenizer, data, size, &token, &error);
 		if (ret2 > 0 && filter != NULL)
@@ -341,8 +354,7 @@
 		/* wait for more data */
 		return 0;
 	} else {
-		ctx->cur_user_lang = fts_user_language_find(user, lang);
-		i_assert(ctx->cur_user_lang != NULL);
+		fts_mail_build_ctx_set_lang(ctx, fts_user_language_find(user, lang));
 
 		if (ctx->pending_input->used > 0) {
 			if (fts_build_add_tokens_with_filter(ctx,
@@ -480,16 +492,8 @@
 	memset(&ctx, 0, sizeof(ctx));
 	ctx.update_ctx = update_ctx;
 	ctx.mail = mail;
-	if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0) {
+	if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0)
 		ctx.pending_input = buffer_create_dynamic(default_pool, 128);
-		/* reset tokenizer between mails - just to be sure no state
-		   leaks between mails (especially if previous indexing had
-		   failed) */
-		struct fts_tokenizer *tokenizer;
-
-		tokenizer = fts_user_get_index_tokenizer(update_ctx->backend->ns->user);
-		fts_tokenizer_reset(tokenizer);
-	}
 
 	prev_part = NULL;
 	parser = message_parser_init(pool_datastack_create(), input,
diff -r be47ca42cbc4 -r 0cb2c54fa452 src/plugins/fts/fts-search-args.c
--- a/src/plugins/fts/fts-search-args.c	Thu Dec 03 12:22:24 2015 +0200
+++ b/src/plugins/fts/fts-search-args.c	Thu Dec 03 14:24:06 2015 +0200
@@ -54,14 +54,14 @@
 }
 
 static int
-fts_backend_dovecot_expand_lang_tokens(const ARRAY_TYPE(fts_user_language) *languages,
-				       pool_t pool,
-				       struct mail_search_arg *parent_arg,
-				       const struct mail_search_arg *orig_arg,
-				       const char *orig_token, const char *token)
+fts_backend_dovecot_expand_tokens(struct fts_filter *filter,
+				  pool_t pool,
+				  struct mail_search_arg *parent_arg,
+				  const struct mail_search_arg *orig_arg,
+				  const char *orig_token, const char *token,
+				  const char **error_r)
 {
 	struct mail_search_arg *arg;
-	struct fts_user_language *const *langp;
 	ARRAY_TYPE(const_string) tokens;
 	const char *token2, *error;
 	int ret;
@@ -73,15 +73,14 @@
 	array_append(&tokens, &token, 1);
 
 	/* add the word filtered */
-	array_foreach(languages, langp) {
+	if (filter != NULL) {
 		token2 = t_strdup(token);
-		ret = (*langp)->filter == NULL ? 1 :
-			fts_filter_filter((*langp)->filter, &token2, &error);
+		ret = fts_filter_filter(filter, &token2, &error);
 		if (ret > 0) {
 			token2 = t_strdup(token2);
 			array_append(&tokens, &token2, 1);
 		} else if (ret < 0) {
-			i_error("fts: Couldn't filter search tokens: %s", error);
+			*error_r = t_strdup_printf("Couldn't filter search token: %s", error);
 			return -1;
 		}
 	}
@@ -94,18 +93,50 @@
 	return 0;
 }
 
+static int
+fts_backend_dovecot_tokenize_lang(struct fts_user_language *user_lang,
+				  pool_t pool, struct mail_search_arg *and_arg,
+				  struct mail_search_arg *orig_arg,
+				  const char *orig_token, const char **error_r)
+{
+	unsigned int orig_token_len = strlen(orig_token);
+	const char *token, *error;
+	int ret;
+
+	/* reset tokenizer between search args in case there's any state left
+	   from some previous failure */
+	fts_tokenizer_reset(user_lang->search_tokenizer);
+	while ((ret = fts_tokenizer_next(user_lang->search_tokenizer,
+					 (const void *)orig_token,
+					 orig_token_len, &token, error_r)) > 0) {
+		if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
+						      and_arg, orig_arg, orig_token,
+						      token, error_r) < 0)
+			return -1;
+	}
+	while (ret >= 0 &&
+	       (ret = fts_tokenizer_final(user_lang->search_tokenizer, &token, &error)) > 0) {
+		if (fts_backend_dovecot_expand_tokens(user_lang->filter, pool,
+						      and_arg, orig_arg, orig_token,
+						      token, error_r) < 0)
+			return -1;
+	}
+	if (ret < 0) {
+		*error_r = t_strdup_printf("Couldn't tokenize search args: %s", error);
+		return -1;
+	}
+	return 0;
+}
+
 static int fts_search_arg_expand(struct fts_backend *backend, pool_t pool,
 				 struct mail_search_arg **argp)
 {
 	const ARRAY_TYPE(fts_user_language) *languages;
+	struct fts_user_language *const *langp;
 	struct mail_search_arg *and_arg, *orig_arg = *argp;
-	const char *error, *token, *orig_token = orig_arg->value.str;
-	unsigned int orig_token_len = strlen(orig_token);
-	struct fts_tokenizer *tokenizer;
-	int ret;
+	const char *error, *orig_token = orig_arg->value.str;
 
 	languages = fts_user_get_all_languages(backend->ns->user);
-	tokenizer = fts_user_get_search_tokenizer(backend->ns->user);
 
 	/* we want all the tokens found from the string to be found, so create
 	   a parent AND and place all the filtered token alternatives under
@@ -115,27 +146,12 @@
 	and_arg->match_not = orig_arg->match_not;
 	and_arg->next = orig_arg->next;
 
-	/* reset tokenizer between search args in case there's any state left
-	   from some previous failure */
-	fts_tokenizer_reset(tokenizer);
-	while ((ret = fts_tokenizer_next(tokenizer,
-					 (const void *)orig_token,
-					 orig_token_len, &token, &error)) > 0) {
-		if (fts_backend_dovecot_expand_lang_tokens(languages, pool, and_arg,
-							   orig_arg, orig_token,
-							   token) < 0)
+	array_foreach(languages, langp) {
+		if (fts_backend_dovecot_tokenize_lang(*langp, pool, and_arg,
+						      orig_arg, orig_token, &error) < 0) {
+			i_error("fts: %s", error);
 			return -1;
-	}
-	while (ret >= 0 &&
-	       (ret = fts_tokenizer_final(tokenizer, &token, &error)) > 0) {
-		if (fts_backend_dovecot_expand_lang_tokens(languages, pool, and_arg,
-							   orig_arg, orig_token,
-							   token) < 0)
-			return -1;
-	}
-	if (ret < 0) {
-		i_error("fts: Couldn't tokenize search args: %s", error);
-		return -1;
+		}
 	}
 
 	if (and_arg->value.subargs == NULL) {
diff -r be47ca42cbc4 -r 0cb2c54fa452 src/plugins/fts/fts-user.c
--- a/src/plugins/fts/fts-user.c	Thu Dec 03 12:22:24 2015 +0200
+++ b/src/plugins/fts/fts-user.c	Thu Dec 03 14:24:06 2015 +0200
@@ -16,7 +16,6 @@
 	int refcount;
 
 	struct fts_language_list *lang_list;
-	struct fts_tokenizer *index_tokenizer, *search_tokenizer;
 	struct fts_user_language *data_lang;
 	ARRAY_TYPE(fts_user_language) languages;
 };
@@ -148,6 +147,7 @@
 
 static int
 fts_user_create_tokenizer(struct mail_user *user,
+			  const struct fts_language *lang,
 			  struct fts_tokenizer **tokenizer_r, bool search,
 			  const char **error_r)
 {
@@ -158,11 +158,15 @@
 	unsigned int i;
 	int ret = 0;
 
-	tokenizers_key = "fts_tokenizers";
+	tokenizers_key = t_strconcat("fts_tokenizers_", lang->name, NULL);
 	str = mail_user_plugin_getenv(user, tokenizers_key);
 	if (str == NULL) {
-		*error_r = "fts_tokenizers setting is missing";
-		return -1;
+		str = mail_user_plugin_getenv(user, "fts_tokenizers");
+		if (str == NULL) {
+			*error_r = t_strdup_printf("%s or fts_tokenizers setting must exist", tokenizers_key);
+			return -1;
+		}
+		tokenizers_key = "fts_tokenizers";
 	}
 
 	tokenizers = t_strsplit_spaces(str, " ");
@@ -177,8 +181,12 @@
 		}
 
 		tokenizer_set_name = t_str_replace(tokenizers[i], '-', '_');
-		set_key = t_strdup_printf("fts_tokenizer_%s", tokenizer_set_name);
+		set_key = t_strdup_printf("fts_tokenizer_%s_%s", tokenizer_set_name, lang->name);
 		str = mail_user_plugin_getenv(user, set_key);
+		if (str == NULL) {
+			set_key = t_strdup_printf("fts_tokenizer_%s", tokenizer_set_name);
+			str = mail_user_plugin_getenv(user, set_key);
+		}
 
 		/* tell the tokenizers that we're tokenizing a search string
 		   (instead of tokenizing indexed data) */
@@ -205,18 +213,20 @@
 	return 0;
 }
 
-static int fts_user_init_tokenizers(struct mail_user *user,
-				    struct fts_user *fuser,
-				    const char **error_r)
+static int
+fts_user_language_init_tokenizers(struct mail_user *user,
+				  struct fts_user_language *user_lang,
+				  const char **error_r)
 {
-	if (fts_user_create_tokenizer(user, &fuser->index_tokenizer, FALSE,
+	if (fts_user_create_tokenizer(user, user_lang->lang,
+				      &user_lang->index_tokenizer, FALSE,
 	                              error_r) < 0)
 		return -1;
 
-	if (fts_user_create_tokenizer(user, &fuser->search_tokenizer, TRUE,
+	if (fts_user_create_tokenizer(user, user_lang->lang,
+				      &user_lang->search_tokenizer, TRUE,
 	                              error_r) < 0)
 		return -1;
-
 	return 0;
 }
 
@@ -234,35 +244,21 @@
 	return NULL;
 }
 
-struct fts_tokenizer *fts_user_get_index_tokenizer(struct mail_user *user)
-{
-	struct fts_user *fuser = FTS_USER_CONTEXT(user);
-
-	return fuser->index_tokenizer;
-}
-
-struct fts_tokenizer *fts_user_get_search_tokenizer(struct mail_user *user)


More information about the dovecot-cvs mailing list