Patch: enhancements for solr/tika integration
John Fawcett
john at voipsupport.it
Sun Jan 9 04:38:37 UTC 2022
Hi,
Here's a patch with some enhancements that I am applying locally for
solr/tika integration. Hopefully this can be considered for inclusion.
I've tested up to 2.3.16, and this patch applies against the latest version,
2.3.17.1. The contents are:
1. Allow username and password in tika configuration (as is already
supported for solr). For example:
fts_tika = https://user:password@tika_host:tika_port/tika/tika
This means that Tika can be more easily located on a different server from
Dovecot. Locating Solr and Tika on separate servers can isolate the
Dovecot server from issues that may happen on Solr or Tika (such as running
out of memory).
2. Introduce an optional configurable limit on the size of emails whose
message bodies will be indexed with solr via a new setting fts_max_size,
to be put in the plugin section. For example:
plugin {
...
fts_max_size = 2M
...
}
Message body indexing is skipped for messages that are larger than this
setting.
3. Introduce an optional configurable limit on the size of emails whose
attachments will be sent to tika for keyword extraction, via a new
setting fts_max_size_tika, to be put in the plugin section. For example:
plugin {
...
fts_max_size_tika = 1M
}
I noticed that some very large attachments are being sent to tika, and
these often do not have any useful content to index. For simplicity, the
check is done on message size, not on individual or total attachment size.
Here's the patch:
diff -ur dovecot-2.3.17.1-orig/src/plugins/fts/fts-build-mail.c
dovecot-2.3.17.1-new/src/plugins/fts/fts-build-mail.c
--- dovecot-2.3.17.1-orig/src/plugins/fts/fts-build-mail.c 2021-12-03
12:48:47.000000000 +0100
+++ dovecot-2.3.17.1-new/src/plugins/fts/fts-build-mail.c 2022-01-09
01:33:23.398341998 +0100
@@ -17,6 +17,7 @@
#include "fts-filter.h"
#include "fts-api-private.h"
#include "fts-build-mail.h"
+#include "settings-parser.h"
/* there are other characters as well, but this doesn't have to be
exact */
#define IS_WORD_WHITESPACE(c) \
@@ -25,6 +26,8 @@
wherever */
#define MAX_WORD_SIZE 1024
+static bool debug = FALSE;
+
struct fts_mail_build_context {
struct mail *mail;
struct fts_backend_update_context *update_ctx;
@@ -34,6 +37,7 @@
buffer_t *word_buf, *pending_input;
struct fts_user_language *cur_user_lang;
+ bool oversized_tika;
};
static int fts_build_data(struct fts_mail_build_context *ctx,
@@ -223,7 +227,7 @@
parser_context.user = mail_storage_get_user(storage);
parser_context.content_disposition = ctx->content_disposition;
-
+ parser_context.oversized_tika = ctx->oversized_tika;
if (fts_parser_init(&parser_context, &ctx->body_parser)) {
/* extract text using the the returned parser */
*binary_body_r = TRUE;
@@ -496,7 +500,32 @@
bool binary_body;
const char *error;
int ret;
-
+ uoff_t msg_size;
+ uoff_t fts_max_size = 0;
+ uoff_t fts_max_size_tika = 0;
+ const char * fts_max_size_setting;
+ const char * fts_max_size_tika_setting;
+ bool oversized_msg;
+ bool oversized_tika;
+
+ fts_max_size_setting =
mail_user_plugin_getenv(update_ctx->backend->ns->user, "fts_max_size");
+ if (fts_max_size_setting != NULL) {
+ if (debug) i_debug("fts_max_size %s",fts_max_size_setting);
+ if (settings_get_size(fts_max_size_setting,
&fts_max_size, &error) < 0) {
+ i_error("%s",error);
+ fts_max_size = 0;
+ }
+ if (debug) i_debug("fts_max_size (value)
%"PRIuUOFF_T,fts_max_size);
+ }
+ fts_max_size_tika_setting =
mail_user_plugin_getenv(update_ctx->backend->ns->user,
"fts_max_size_tika");
+ if (fts_max_size_tika_setting != NULL) {
+ if (debug) i_debug("fts_max_size_tika
%s",fts_max_size_tika_setting);
+ if (settings_get_size(fts_max_size_tika_setting,
&fts_max_size_tika, &error) < 0) {
+ i_error("%s",error);
+ fts_max_size_tika = 0;
+ }
+ if (debug) i_debug("fts_max_size_tika (value)
%"PRIuUOFF_T,fts_max_size_tika);
+ }
*may_need_retry_r = FALSE;
if (mail_get_stream_because(mail, NULL, NULL, "fts indexing",
&input) < 0) {
if (mail->expunged)
@@ -505,10 +534,21 @@
mailbox_get_last_internal_error(mail->box, NULL));
return -1;
}
-
+ oversized_msg = FALSE;
+ oversized_tika = FALSE;
+ i_stream_get_size(input,TRUE,&msg_size);
+ if (fts_max_size > 0 && msg_size > fts_max_size) {
+ i_info("Skipping message body indexing because size
%"PRIuUOFF_T" exceeds setting fts_max_size
%s",msg_size,fts_max_size_setting);
+ oversized_msg = TRUE;
+ }
+ if (fts_max_size_tika > 0 && msg_size > fts_max_size_tika) {
+ i_info("Skipping message attachment indexing because
size %"PRIuUOFF_T" exceeds setting fts_max_size_tika
%s",msg_size,fts_max_size_tika_setting);
+ oversized_tika = TRUE;
+ }
i_zero(&ctx);
ctx.update_ctx = update_ctx;
ctx.mail = mail;
+ ctx.oversized_tika = oversized_tika;
if ((update_ctx->backend->flags &
FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0)
ctx.pending_input = buffer_create_dynamic(default_pool,
128);
@@ -563,7 +603,7 @@
message_decoder_set_return_binary(decoder, TRUE);
body_part = TRUE;
} else {
- if (skip_body)
+ if (skip_body ||oversized_msg)
continue;
}
@@ -597,7 +637,7 @@
else
(void)fts_parser_deinit(&ctx.body_parser, NULL);
}
- if (ret == 0 && body_part && !skip_body && !body_added) {
+ if (ret == 0 && body_part && !skip_body && !oversized_msg &&
!body_added) {
/* make sure body is added even when it doesn't exist */
block.data = NULL; block.size = 0;
ret = fts_build_body_block(&ctx, &block, TRUE);
diff -ur dovecot-2.3.17.1-orig/src/plugins/fts/fts-parser.h
dovecot-2.3.17.1-new/src/plugins/fts/fts-parser.h
--- dovecot-2.3.17.1-orig/src/plugins/fts/fts-parser.h 2021-12-03
12:48:47.000000000 +0100
+++ dovecot-2.3.17.1-new/src/plugins/fts/fts-parser.h 2022-01-09
01:33:23.399341985 +0100
@@ -10,6 +10,7 @@
/* Can't be NULL */
const char *content_type;
const char *content_disposition;
+ bool oversized_tika;
};
struct fts_parser_vfuncs {
diff -ur dovecot-2.3.17.1-orig/src/plugins/fts/fts-parser-tika.c
dovecot-2.3.17.1-new/src/plugins/fts/fts-parser-tika.c
--- dovecot-2.3.17.1-orig/src/plugins/fts/fts-parser-tika.c 2021-12-03
12:48:47.000000000 +0100
+++ dovecot-2.3.17.1-new/src/plugins/fts/fts-parser-tika.c 2022-01-09
01:33:23.400341972 +0100
@@ -57,7 +57,7 @@
tuser = p_new(user->pool, struct fts_parser_tika_user, 1);
MODULE_CONTEXT_SET(user, fts_parser_tika_user_module, tuser);
- if (http_url_parse(url, NULL, 0, user->pool,
+ if (http_url_parse(url, NULL, HTTP_URL_ALLOW_USERINFO_PART,
user->pool,
&tuser->http_url, &error) < 0) {
i_error("fts_tika: Failed to parse HTTP url %s: %s",
url, error);
return -1;
@@ -148,6 +148,10 @@
if (tika_get_http_client_url(parser_context->user, &http_url) < 0)
return NULL;
+ if (parser_context->oversized_tika) {
+ i_info("skipping tika parser due to oversized message");
+ return NULL;
+ }
if (http_url->path == NULL)
http_url->path = "/";
@@ -159,6 +163,11 @@
http_url->host.name,
t_strconcat(http_url->path,
http_url->enc_query, NULL),
fts_tika_parser_response, parser);
+ if (http_url->user != NULL) {
+ http_client_request_set_auth_simple(
+ http_req, http_url->user, http_url->password);
+ }
+
http_client_request_set_port(http_req, http_url->port);
http_client_request_set_ssl(http_req, http_url->have_ssl);
if (parser_context->content_type != NULL)
John
More information about the dovecot
mailing list