Hi, here's a patch with some enhancements that I am applying locally for Solr/Tika integration. Hopefully this can be considered for inclusion. I've tested up to 2.3.16, and this patch applies against the latest version, 2.3.17.1. The contents are: 1. Allow a username and password in the Tika configuration (as is already supported for Solr). For example: fts_tika = https://user:password@tika_host:tika_port/tika/tika This means that Tika can be more easily located on a different server from Dovecot. Locating Solr and Tika on separate servers can isolate the Dovecot server from issues that may happen on Solr or Tika (like running out of memory). 2. Introduce an optional configurable limit on the size of emails whose message bodies will be indexed with Solr, via a new setting fts_max_size, to be put in the plugin section. For example: plugin { ... fts_max_size = 2M ... } Message body indexing is skipped for messages that are larger than this setting. 3. Introduce an optional configurable limit on the size of emails whose attachments will be sent to Tika for keyword extraction, via a new setting fts_max_size_tika, to be put in the plugin section. For example: plugin { ... fts_max_size_tika = 1M } I noticed that some very large attachments are being sent to Tika, and these often do not have any useful content to index. For simplicity, the check is done on the size of the whole message, not on the individual or total attachment size. 
Here's the patch: diff -ur dovecot-2.3.17.1-orig/src/plugins/fts/fts-build-mail.c dovecot-2.3.17.1-new/src/plugins/fts/fts-build-mail.c --- dovecot-2.3.17.1-orig/src/plugins/fts/fts-build-mail.c 2021-12-03 12:48:47.000000000 +0100 +++ dovecot-2.3.17.1-new/src/plugins/fts/fts-build-mail.c 2022-01-09 01:33:23.398341998 +0100 @@ -17,6 +17,7 @@ #include "fts-filter.h" #include "fts-api-private.h" #include "fts-build-mail.h" +#include "settings-parser.h" /* there are other characters as well, but this doesn't have to be exact */ #define IS_WORD_WHITESPACE(c) \ @@ -25,6 +26,8 @@ wherever */ #define MAX_WORD_SIZE 1024 +static bool debug = FALSE; + struct fts_mail_build_context { struct mail *mail; struct fts_backend_update_context *update_ctx; @@ -34,6 +37,7 @@ buffer_t *word_buf, *pending_input; struct fts_user_language *cur_user_lang; + bool oversized_tika; }; static int fts_build_data(struct fts_mail_build_context *ctx, @@ -223,7 +227,7 @@ parser_context.user = mail_storage_get_user(storage); parser_context.content_disposition = ctx->content_disposition; - + parser_context.oversized_tika = ctx->oversized_tika; if (fts_parser_init(&parser_context, &ctx->body_parser)) { /* extract text using the the returned parser */ *binary_body_r = TRUE; @@ -496,7 +500,32 @@ bool binary_body; const char *error; int ret; - + uoff_t msg_size; + uoff_t fts_max_size = 0; + uoff_t fts_max_size_tika = 0; + const char * fts_max_size_setting; + const char * fts_max_size_tika_setting; + bool oversized_msg; + bool oversized_tika; + + fts_max_size_setting = mail_user_plugin_getenv(update_ctx->backend->ns->user, "fts_max_size"); + if (fts_max_size_setting != NULL) { + if (debug) i_debug("fts_max_size %s",fts_max_size_setting); + if (settings_get_size(fts_max_size_setting, &fts_max_size, &error) < 0) { + i_error("%s",error); + fts_max_size = 0; + } + if (debug) i_debug("fts_max_size (value) %"PRIuUOFF_T,fts_max_size); + } + fts_max_size_tika_setting = 
mail_user_plugin_getenv(update_ctx->backend->ns->user, "fts_max_size_tika"); + if (fts_max_size_tika_setting != NULL) { + if (debug) i_debug("fts_max_size_tika %s",fts_max_size_tika_setting); + if (settings_get_size(fts_max_size_tika_setting, &fts_max_size_tika, &error) < 0) { + i_error("%s",error); + fts_max_size_tika = 0; + } + if (debug) i_debug("fts_max_size_tika (value) %"PRIuUOFF_T,fts_max_size_tika); + } *may_need_retry_r = FALSE; if (mail_get_stream_because(mail, NULL, NULL, "fts indexing", &input) < 0) { if (mail->expunged) @@ -505,10 +534,21 @@ mailbox_get_last_internal_error(mail->box, NULL)); return -1; } - + oversized_msg = FALSE; + oversized_tika = FALSE; + i_stream_get_size(input,TRUE,&msg_size); + if (fts_max_size > 0 && msg_size > fts_max_size) { + i_info("Skipping message body indexing because size %"PRIuUOFF_T" exceeds setting fts_max_size %s",msg_size,fts_max_size_setting); + oversized_msg = TRUE; + } + if (fts_max_size_tika > 0 && msg_size > fts_max_size_tika) { + i_info("Skipping message attachment indexing because size %"PRIuUOFF_T" exceeds setting fts_max_size_tika %s",msg_size,fts_max_size_tika_setting); + oversized_tika = TRUE; + } i_zero(&ctx); ctx.update_ctx = update_ctx; ctx.mail = mail; + ctx.oversized_tika = oversized_tika; if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0) ctx.pending_input = buffer_create_dynamic(default_pool, 128); @@ -563,7 +603,7 @@ message_decoder_set_return_binary(decoder, TRUE); body_part = TRUE; } else { - if (skip_body) + if (skip_body ||oversized_msg) continue; } @@ -597,7 +637,7 @@ else (void)fts_parser_deinit(&ctx.body_parser, NULL); } - if (ret == 0 && body_part && !skip_body && !body_added) { + if (ret == 0 && body_part && !skip_body && !oversized_msg && !body_added) { /* make sure body is added even when it doesn't exist */ block.data = NULL; block.size = 0; ret = fts_build_body_block(&ctx, &block, TRUE); diff -ur dovecot-2.3.17.1-orig/src/plugins/fts/fts-parser.h 
dovecot-2.3.17.1-new/src/plugins/fts/fts-parser.h --- dovecot-2.3.17.1-orig/src/plugins/fts/fts-parser.h 2021-12-03 12:48:47.000000000 +0100 +++ dovecot-2.3.17.1-new/src/plugins/fts/fts-parser.h 2022-01-09 01:33:23.399341985 +0100 @@ -10,6 +10,7 @@ /* Can't be NULL */ const char *content_type; const char *content_disposition; + bool oversized_tika; }; struct fts_parser_vfuncs { diff -ur dovecot-2.3.17.1-orig/src/plugins/fts/fts-parser-tika.c dovecot-2.3.17.1-new/src/plugins/fts/fts-parser-tika.c --- dovecot-2.3.17.1-orig/src/plugins/fts/fts-parser-tika.c 2021-12-03 12:48:47.000000000 +0100 +++ dovecot-2.3.17.1-new/src/plugins/fts/fts-parser-tika.c 2022-01-09 01:33:23.400341972 +0100 @@ -57,7 +57,7 @@ tuser = p_new(user->pool, struct fts_parser_tika_user, 1); MODULE_CONTEXT_SET(user, fts_parser_tika_user_module, tuser); - if (http_url_parse(url, NULL, 0, user->pool, + if (http_url_parse(url, NULL, HTTP_URL_ALLOW_USERINFO_PART, user->pool, &tuser->http_url, &error) < 0) { i_error("fts_tika: Failed to parse HTTP url %s: %s", url, error); return -1; @@ -148,6 +148,10 @@ if (tika_get_http_client_url(parser_context->user, &http_url) < 0) return NULL; + if (parser_context->oversized_tika) { + i_info("skipping tika parser due to oversized message"); + return NULL; + } if (http_url->path == NULL) http_url->path = "/"; @@ -159,6 +163,11 @@ http_url->host.name, t_strconcat(http_url->path, http_url->enc_query, NULL), fts_tika_parser_response, parser); + if (http_url->user != NULL) { + http_client_request_set_auth_simple( + http_req, http_url->user, http_url->password); + } + http_client_request_set_port(http_req, http_url->port); http_client_request_set_ssl(http_req, http_url->have_ssl); if (parser_context->content_type != NULL) John