Patch: enhancements for solr/tika integration

John Fawcett john at voipsupport.it
Sun Jan 9 04:38:37 UTC 2022


Hi

Here's a patch with some enhancements that I am applying locally for 
Solr/Tika integration. Hopefully this can be considered for inclusion. 
I've tested it on versions up to 2.3.16, and this patch applies against 
the latest version, 2.3.17.1. The contents are:

1. Allow username and password in tika configuration (as is already 
supported for solr). For example:

fts_tika = https://user:password@tika_host:tika_port/tika/tika

This means that Tika can more easily be hosted on a different server 
from Dovecot. Placing Solr and Tika on separate servers isolates the 
Dovecot server from problems that may occur on Solr or Tika (such as 
running out of memory).

2. Introduce an optional configurable limit on the size of emails whose 
message bodies will be indexed with Solr, via a new setting fts_max_size, 
to be put in the plugin section. For example:

plugin {

...

fts_max_size = 2M

...

}

Message body indexing is skipped for messages that are larger than this 
setting.

3. Introduce an optional configurable limit on the size of emails whose 
attachments will be sent to tika for keyword extraction, via a new 
setting fts_max_size_tika, to be put in the plugin section. For example:

plugin {

...

fts_max_size_tika = 1M

}
I noticed that some very large attachments are being sent to Tika, and 
these often do not have any useful content to index. For simplicity, the 
check is done on the message size, not on individual or total attachment size.

Here's the patch:

diff -ur dovecot-2.3.17.1-orig/src/plugins/fts/fts-build-mail.c 
dovecot-2.3.17.1-new/src/plugins/fts/fts-build-mail.c
--- dovecot-2.3.17.1-orig/src/plugins/fts/fts-build-mail.c 2021-12-03 
12:48:47.000000000 +0100
+++ dovecot-2.3.17.1-new/src/plugins/fts/fts-build-mail.c 2022-01-09 
01:33:23.398341998 +0100
@@ -17,6 +17,7 @@
  #include "fts-filter.h"
  #include "fts-api-private.h"
  #include "fts-build-mail.h"
+#include "settings-parser.h"

  /* there are other characters as well, but this doesn't have to be 
exact */
  #define IS_WORD_WHITESPACE(c) \
@@ -25,6 +26,8 @@
     wherever */
  #define MAX_WORD_SIZE 1024

+static bool debug = FALSE;
+
  struct fts_mail_build_context {
         struct mail *mail;
         struct fts_backend_update_context *update_ctx;
@@ -34,6 +37,7 @@

         buffer_t *word_buf, *pending_input;
         struct fts_user_language *cur_user_lang;
+       bool oversized_tika;
  };

  static int fts_build_data(struct fts_mail_build_context *ctx,
@@ -223,7 +227,7 @@
         parser_context.user = mail_storage_get_user(storage);
         parser_context.content_disposition = ctx->content_disposition;

-
+       parser_context.oversized_tika = ctx->oversized_tika;
         if (fts_parser_init(&parser_context, &ctx->body_parser)) {
                 /* extract text using the the returned parser */
                 *binary_body_r = TRUE;
@@ -496,7 +500,32 @@
         bool binary_body;
         const char *error;
         int ret;
-
+       uoff_t msg_size;
+       uoff_t fts_max_size = 0;
+       uoff_t fts_max_size_tika = 0;
+       const char * fts_max_size_setting;
+       const char * fts_max_size_tika_setting;
+       bool oversized_msg;
+       bool oversized_tika;
+
+       fts_max_size_setting = 
mail_user_plugin_getenv(update_ctx->backend->ns->user, "fts_max_size");
+       if (fts_max_size_setting != NULL) {
+               if (debug) i_debug("fts_max_size %s",fts_max_size_setting);
+               if (settings_get_size(fts_max_size_setting, 
&fts_max_size, &error) < 0) {
+                       i_error("%s",error);
+                       fts_max_size = 0;
+               }
+               if (debug) i_debug("fts_max_size (value) 
%"PRIuUOFF_T,fts_max_size);
+       }
+       fts_max_size_tika_setting = 
mail_user_plugin_getenv(update_ctx->backend->ns->user, 
"fts_max_size_tika");
+       if (fts_max_size_tika_setting != NULL) {
+               if (debug) i_debug("fts_max_size_tika 
%s",fts_max_size_tika_setting);
+               if (settings_get_size(fts_max_size_tika_setting, 
&fts_max_size_tika, &error) < 0) {
+                       i_error("%s",error);
+                       fts_max_size_tika = 0;
+               }
+               if (debug) i_debug("fts_max_size_tika (value) 
%"PRIuUOFF_T,fts_max_size_tika);
+       }
         *may_need_retry_r = FALSE;
         if (mail_get_stream_because(mail, NULL, NULL, "fts indexing", 
&input) < 0) {
                 if (mail->expunged)
@@ -505,10 +534,21 @@
mailbox_get_last_internal_error(mail->box, NULL));
                 return -1;
         }
-
+       oversized_msg = FALSE;
+       oversized_tika = FALSE;
+       i_stream_get_size(input,TRUE,&msg_size);
+       if (fts_max_size > 0 && msg_size > fts_max_size) {
+               i_info("Skipping message body indexing because size 
%"PRIuUOFF_T" exceeds setting fts_max_size 
%s",msg_size,fts_max_size_setting);
+               oversized_msg = TRUE;
+       }
+       if (fts_max_size_tika > 0 && msg_size > fts_max_size_tika) {
+               i_info("Skipping message attachment indexing because 
size %"PRIuUOFF_T" exceeds setting fts_max_size_tika 
%s",msg_size,fts_max_size_tika_setting);
+               oversized_tika = TRUE;
+       }
         i_zero(&ctx);
         ctx.update_ctx = update_ctx;
         ctx.mail = mail;
+       ctx.oversized_tika = oversized_tika;
         if ((update_ctx->backend->flags & 
FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0)
                 ctx.pending_input = buffer_create_dynamic(default_pool, 
128);

@@ -563,7 +603,7 @@
message_decoder_set_return_binary(decoder, TRUE);
                         body_part = TRUE;
                 } else {
-                       if (skip_body)
+                       if (skip_body ||oversized_msg)
                                 continue;
                 }

@@ -597,7 +637,7 @@
                 else
(void)fts_parser_deinit(&ctx.body_parser, NULL);
         }
-       if (ret == 0 && body_part && !skip_body && !body_added) {
+       if (ret == 0 && body_part && !skip_body && !oversized_msg && 
!body_added) {
                 /* make sure body is added even when it doesn't exist */
                 block.data = NULL; block.size = 0;
                 ret = fts_build_body_block(&ctx, &block, TRUE);
diff -ur dovecot-2.3.17.1-orig/src/plugins/fts/fts-parser.h 
dovecot-2.3.17.1-new/src/plugins/fts/fts-parser.h
--- dovecot-2.3.17.1-orig/src/plugins/fts/fts-parser.h  2021-12-03 
12:48:47.000000000 +0100
+++ dovecot-2.3.17.1-new/src/plugins/fts/fts-parser.h   2022-01-09 
01:33:23.399341985 +0100
@@ -10,6 +10,7 @@
         /* Can't be NULL */
         const char *content_type;
         const char *content_disposition;
+       bool oversized_tika;
  };

  struct fts_parser_vfuncs {
diff -ur dovecot-2.3.17.1-orig/src/plugins/fts/fts-parser-tika.c 
dovecot-2.3.17.1-new/src/plugins/fts/fts-parser-tika.c
--- dovecot-2.3.17.1-orig/src/plugins/fts/fts-parser-tika.c 2021-12-03 
12:48:47.000000000 +0100
+++ dovecot-2.3.17.1-new/src/plugins/fts/fts-parser-tika.c 2022-01-09 
01:33:23.400341972 +0100
@@ -57,7 +57,7 @@
         tuser = p_new(user->pool, struct fts_parser_tika_user, 1);
         MODULE_CONTEXT_SET(user, fts_parser_tika_user_module, tuser);

-       if (http_url_parse(url, NULL, 0, user->pool,
+       if (http_url_parse(url, NULL, HTTP_URL_ALLOW_USERINFO_PART, 
user->pool,
                            &tuser->http_url, &error) < 0) {
                 i_error("fts_tika: Failed to parse HTTP url %s: %s", 
url, error);
                 return -1;
@@ -148,6 +148,10 @@

         if (tika_get_http_client_url(parser_context->user, &http_url) < 0)
                 return NULL;
+       if (parser_context->oversized_tika) {
+               i_info("skipping tika parser due to oversized message");
+               return NULL;
+       }
         if (http_url->path == NULL)
                 http_url->path = "/";

@@ -159,6 +163,11 @@
                         http_url->host.name,
                         t_strconcat(http_url->path, 
http_url->enc_query, NULL),
                         fts_tika_parser_response, parser);
+        if (http_url->user != NULL) {
+                http_client_request_set_auth_simple(
+                        http_req, http_url->user, http_url->password);
+        }
+
         http_client_request_set_port(http_req, http_url->port);
         http_client_request_set_ssl(http_req, http_url->have_ssl);
         if (parser_context->content_type != NULL)

John



More information about the dovecot mailing list