[patch] enhancement for adding limits to solr and tika parsing

John Fawcett john at voipsupport.it
Mon Dec 7 15:27:45 EET 2020


Hi

As mentioned in a previous thread, the solr + tika combination has
caused me some issues due to attachment size. While tika seems to be
able to parse large attachments, the resulting volume of text can
overwhelm the solr server.

One solution would be to throw resources at the problem, but in my case
such large attachments don't contain anything worth indexing.
Additionally I don't want people to be able to randomly crash my solr
server by sending large compressed attachments that expand into huge
volumes for solr. It's also a safety feature to have sane limits on what
can be indexed.

Attached is a first attempt to address the problem. I did not find a way
to easily get actual attachment sizes, so I used a piece of information
that is already available - the overall message size. It may not be ideal,
but at least it introduces limits where none existed.

I have introduced two new parameters for the plugin section, for example:

plugin {

    fts_max_size = 2M
    fts_max_size_tika = 1M

}
They can be used separately or together. Both sizes refer to the overall
message size. The meaning is:

fts_max_size - do not parse message bodies if the message size exceeds
this value. A value of 0 indicates no limit.  If the message body is not
parsed, attachments are also not parsed.

fts_max_size_tika - do not parse message attachments with tika if the
message size exceeds this value. A value of 0 indicates no limit.

If using both settings it makes sense to have fts_max_size >
fts_max_size_tika: with a smaller fts_max_size, message bodies (including
attachments) are not indexed at all, so fts_max_size_tika will have no
effect.

The difference (fts_max_size - fts_max_size_tika) places an upper bound
on the size of the non-attachment body text that will be indexed.
However, any attachments over fts_max_size will automatically consume
this limit and no body text will be indexed for those messages. I've
only updated the tika parser, not the script parser, though the script
parser could potentially benefit from this approach.

The attached patch also includes the rolled-up patch for using basic
auth with the tika server and the previously posted patch (not mine) which
solves an assert when using solr and tika together.

John






-------------- next part --------------
diff -ur dovecot-2.3.11.3-orig/src/plugins/fts/fts-build-mail.c dovecot-2.3.11.3/src/plugins/fts/fts-build-mail.c
--- dovecot-2.3.11.3-orig/src/plugins/fts/fts-build-mail.c	2020-08-12 14:20:41.000000000 +0200
+++ dovecot-2.3.11.3/src/plugins/fts/fts-build-mail.c	2020-12-07 14:05:23.654217555 +0100
@@ -17,6 +17,7 @@
 #include "fts-filter.h"
 #include "fts-api-private.h"
 #include "fts-build-mail.h"
+#include "settings-parser.h"
 
 /* there are other characters as well, but this doesn't have to be exact */
 #define IS_WORD_WHITESPACE(c) \
@@ -34,6 +35,7 @@
 
 	buffer_t *word_buf, *pending_input;
 	struct fts_user_language *cur_user_lang;
+	bool oversized_tika;
 };
 
 static int fts_build_data(struct fts_mail_build_context *ctx,
@@ -236,7 +238,7 @@
 	parser_context.user = mail_storage_get_user(storage);
 	parser_context.content_disposition = ctx->content_disposition;
 
-	
+	parser_context.oversized_tika = ctx->oversized_tika;	
 	if (fts_parser_init(&parser_context, &ctx->body_parser)) {
 		/* extract text using the the returned parser */
 		*binary_body_r = TRUE;
@@ -488,7 +490,32 @@
 	bool binary_body;
 	const char *error;
 	int ret;
-
+	uoff_t msg_size;
+	uoff_t fts_max_size = 0;
+	uoff_t fts_max_size_tika = 0;
+	const char * fts_max_size_setting;
+	const char * fts_max_size_tika_setting;
+	bool oversized_msg;
+	bool oversized_tika;
+
+	fts_max_size_setting = mail_user_plugin_getenv(update_ctx->backend->ns->user, "fts_max_size");   
+	if (fts_max_size_setting != NULL) {
+		i_debug("fts_max_size %s",fts_max_size_setting);
+		if (settings_get_size(fts_max_size_setting, &fts_max_size, &error) < 0) {
+			i_error("%s",error);	
+			fts_max_size = 0;	
+		}
+		i_debug("fts_max_size (value) %"PRIuUOFF_T,fts_max_size);
+	}
+	fts_max_size_tika_setting = mail_user_plugin_getenv(update_ctx->backend->ns->user, "fts_max_size_tika");   
+	if (fts_max_size_tika_setting != NULL) {
+		i_debug("fts_max_size_tika %s",fts_max_size_tika_setting);
+		if (settings_get_size(fts_max_size_tika_setting, &fts_max_size_tika, &error) < 0) {
+			i_error("%s",error);	
+			fts_max_size_tika = 0;	
+		}
+		i_debug("fts_max_size_tika (value) %"PRIuUOFF_T,fts_max_size_tika);
+	}
 	*may_need_retry_r = FALSE;
 	if (mail_get_stream_because(mail, NULL, NULL, "fts indexing", &input) < 0) {
 		if (mail->expunged)
@@ -498,10 +525,21 @@
 			mailbox_get_last_internal_error(mail->box, NULL));
 		return -1;
 	}
-
+	oversized_msg = FALSE;
+	oversized_tika = FALSE;
+	i_stream_get_size(input,TRUE,&msg_size);
+	if (fts_max_size > 0 && msg_size > fts_max_size) {
+		i_info("Skipping message body indexing because size %"PRIuUOFF_T" exceeds setting fts_max_size %s",msg_size,fts_max_size_setting);
+		oversized_msg = TRUE;
+	}
+	if (fts_max_size_tika > 0 && msg_size > fts_max_size_tika) {
+		i_info("Skipping message attachment indexing because size %"PRIuUOFF_T" exceeds setting fts_max_size_tika %s",msg_size,fts_max_size_tika_setting);
+		oversized_tika = TRUE;
+	}
 	i_zero(&ctx);
 	ctx.update_ctx = update_ctx;
 	ctx.mail = mail;
+	ctx.oversized_tika = oversized_tika;
 	if ((update_ctx->backend->flags & FTS_BACKEND_FLAG_TOKENIZED_INPUT) != 0)
 		ctx.pending_input = buffer_create_dynamic(default_pool, 128);
 
@@ -556,7 +594,7 @@
 				message_decoder_set_return_binary(decoder, TRUE);
 			body_part = TRUE;
 		} else {
-			if (skip_body)
+			if (skip_body ||oversized_msg)
 				continue;
 		}
 
@@ -590,7 +628,7 @@
 		else
 			(void)fts_parser_deinit(&ctx.body_parser, NULL);
 	}
-	if (ret == 0 && body_part && !skip_body && !body_added) {
+	if (ret == 0 && body_part && !skip_body && !oversized_msg && !body_added) {
 		/* make sure body is added even when it doesn't exist */
 		block.data = NULL; block.size = 0;
 		ret = fts_build_body_block(&ctx, &block, TRUE);
diff -ur dovecot-2.3.11.3-orig/src/plugins/fts/fts-parser.h dovecot-2.3.11.3/src/plugins/fts/fts-parser.h
--- dovecot-2.3.11.3-orig/src/plugins/fts/fts-parser.h	2020-08-12 14:20:41.000000000 +0200
+++ dovecot-2.3.11.3/src/plugins/fts/fts-parser.h	2020-12-07 12:42:55.653635916 +0100
@@ -10,6 +10,7 @@
 	/* Can't be NULL */
 	const char *content_type;
 	const char *content_disposition;
+	bool oversized_tika;
 };
 
 struct fts_parser_vfuncs {
diff -ur dovecot-2.3.11.3-orig/src/plugins/fts/fts-parser-tika.c dovecot-2.3.11.3/src/plugins/fts/fts-parser-tika.c
--- dovecot-2.3.11.3-orig/src/plugins/fts/fts-parser-tika.c	2020-08-12 14:20:41.000000000 +0200
+++ dovecot-2.3.11.3/src/plugins/fts/fts-parser-tika.c	2020-12-07 13:01:33.732476038 +0100
@@ -57,7 +57,7 @@
 	tuser = p_new(user->pool, struct fts_parser_tika_user, 1);
 	MODULE_CONTEXT_SET(user, fts_parser_tika_user_module, tuser);
 
-	if (http_url_parse(url, NULL, 0, user->pool,
+	if (http_url_parse(url, NULL, HTTP_URL_ALLOW_USERINFO_PART, user->pool,
 			   &tuser->http_url, &error) < 0) {
 		i_error("fts_tika: Failed to parse HTTP url %s: %s", url, error);
 		return -1;
@@ -77,7 +77,8 @@
 		http_set.request_timeout_msecs = 60*1000;
 		http_set.ssl = &ssl_set;
 		http_set.debug = user->mail_debug;
-		tika_http_client = http_client_init(&http_set);
+		tika_http_client = http_client_init_private(&http_set);
 	}
 	*http_url_r = tuser->http_url;
 	return 0;
@@ -141,6 +142,10 @@
 
 	if (tika_get_http_client_url(parser_context->user, &http_url) < 0)
 		return NULL;
+	if (parser_context->oversized_tika) {
+		i_info("skipping tika parser due to oversized message");
+		return NULL;
+	}
 	if (http_url->path == NULL)
 		http_url->path = "/";
 
@@ -152,6 +157,11 @@
 			http_url->host.name,
 			t_strconcat(http_url->path, http_url->enc_query, NULL),
 			fts_tika_parser_response, parser);
+        if (http_url->user != NULL) {
+                http_client_request_set_auth_simple(
+                        http_req, http_url->user, http_url->password);
+        }
+
 	http_client_request_set_port(http_req, http_url->port);
 	http_client_request_set_ssl(http_req, http_url->have_ssl);
 	if (parser_context->content_type != NULL)
diff -ur dovecot-2.3.11.3-orig/src/plugins/fts-solr/solr-connection.c dovecot-2.3.11.3/src/plugins/fts-solr/solr-connection.c
--- dovecot-2.3.11.3-orig/src/plugins/fts-solr/solr-connection.c	2020-08-12 14:20:41.000000000 +0200
+++ dovecot-2.3.11.3/src/plugins/fts-solr/solr-connection.c	2020-11-15 18:34:13.657576104 +0100
@@ -103,7 +103,8 @@
 		http_set.ssl = ssl_client_set;
 		http_set.debug = solr_set->debug;
 		http_set.rawlog_dir = solr_set->rawlog_dir;
-		solr_http_client = http_client_init(&http_set);
+		solr_http_client = http_client_init_private(&http_set);
 	}
 
 	*conn_r = conn;


More information about the dovecot mailing list