dovecot-2.2: fts: Added fts-parser support for Tika
dovecot at dovecot.org
dovecot at dovecot.org
Thu Apr 17 08:30:16 UTC 2014
details: http://hg.dovecot.org/dovecot-2.2/rev/3db2ab503759
changeset: 17233:3db2ab503759
user: Timo Sirainen <tss at iki.fi>
date: Thu Apr 17 10:29:10 2014 +0200
description:
fts: Added fts-parser support for Tika
It can be enabled by setting:
plugin {
fts_tika = http://localhost:9998/tika/
}
diffstat:
src/plugins/fts/Makefile.am | 2 +
src/plugins/fts/fts-parser-tika.c | 224 ++++++++++++++++++++++++++++++++++++++
src/plugins/fts/fts-parser.c | 3 +-
src/plugins/fts/fts-parser.h | 1 +
4 files changed, 229 insertions(+), 1 deletions(-)
diffs (271 lines):
diff -r 45e7980f6507 -r 3db2ab503759 src/plugins/fts/Makefile.am
--- a/src/plugins/fts/Makefile.am Thu Apr 17 10:27:43 2014 +0200
+++ b/src/plugins/fts/Makefile.am Thu Apr 17 10:29:10 2014 +0200
@@ -4,6 +4,7 @@
AM_CPPFLAGS = \
-I$(top_srcdir)/src/lib \
-I$(top_srcdir)/src/lib-settings \
+ -I$(top_srcdir)/src/lib-http \
-I$(top_srcdir)/src/lib-mail \
-I$(top_srcdir)/src/lib-index \
-I$(top_srcdir)/src/lib-storage \
@@ -25,6 +26,7 @@
fts-parser.c \
fts-parser-html.c \
fts-parser-script.c \
+ fts-parser-tika.c \
fts-plugin.c \
fts-search.c \
fts-search-serialize.c \
diff -r 45e7980f6507 -r 3db2ab503759 src/plugins/fts/fts-parser-tika.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/fts/fts-parser-tika.c Thu Apr 17 10:29:10 2014 +0200
@@ -0,0 +1,224 @@
+/* Copyright (c) 2014 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "ioloop.h"
+#include "istream.h"
+#include "module-context.h"
+#include "http-url.h"
+#include "http-client.h"
+#include "message-parser.h"
+#include "mail-user.h"
+#include "fts-parser.h"
+
+#define TIKA_USER_CONTEXT(obj) \
+ MODULE_CONTEXT(obj, fts_parser_tika_user_module) /* fetches the struct fts_parser_tika_user stored for obj; tika_get_http_client_url() checks the result for NULL before use */
+
+struct fts_parser_tika_user { /* per-mail_user Tika state, stored via fts_parser_tika_user_module */
+ union mail_user_module_context module_ctx;
+ struct http_url *http_url; /* URL parsed from the fts_tika setting; while this is NULL, tika_get_http_client_url() returns -1 for the user */
+};
+
+struct tika_fts_parser { /* state of one Tika conversion (one HTTP PUT request and its response) */
+ struct fts_parser parser; /* kept as the first member: the vfuncs cast struct fts_parser * back to this struct */
+ struct mail_user *user; /* used by the response callback for mail_debug and for the URL shown in log lines */
+ struct http_client_request *http_req; /* PUT request created in fts_parser_tika_try_init() */
+
+ struct ioloop *ioloop; /* private ioloop, created in fts_parser_tika_more() the first time a read returns no data */
+ struct io *io; /* input watcher on payload, added together with ioloop */
+ struct istream *payload; /* response body (200) or an empty stream (204/422); NULL until the response callback has run */
+
+ bool http_req_finished; /* set by the response callback; fts_parser_tika_deinit() doesn't abort the request once this is set */
+ bool failed; /* set when sending/finishing the request fails or Tika replies with an unexpected status */
+};
+
+static struct http_client *tika_http_client = NULL; /* shared by all Tika parsers; created by tika_get_http_client_url(), destroyed by fts_parser_tika_unload() */
+static MODULE_CONTEXT_DEFINE_INIT(fts_parser_tika_user_module,
+ &mail_user_module_register); /* module context id under which struct fts_parser_tika_user is attached to a mail_user */
+
+/* Resolves the Tika server URL configured for the user.
+   Returns 0 and stores the parsed URL in *http_url_r when the fts_tika
+   plugin setting exists and parses as an HTTP URL. Returns -1 when the
+   setting is missing or cannot be parsed (the parse error is logged).
+   The per-user context is registered before parsing, so later calls for
+   the same user reuse the stored result instead of parsing again. The
+   shared tika_http_client is created here if it doesn't exist yet. */
+static int
+tika_get_http_client_url(struct mail_user *user, struct http_url **http_url_r)
+{
+	struct fts_parser_tika_user *tika_user = TIKA_USER_CONTEXT(user);
+	const char *tika_url, *parse_error;
+
+	tika_url = mail_user_plugin_getenv(user, "fts_tika");
+	if (tika_url == NULL) {
+		/* fts_tika disabled */
+		return -1;
+	}
+
+	if (tika_user != NULL) {
+		/* resolved by an earlier call; no stored URL means failure */
+		*http_url_r = tika_user->http_url;
+		if (*http_url_r == NULL)
+			return -1;
+		return 0;
+	}
+
+	/* attach the context first so that a parse failure is remembered */
+	tika_user = p_new(user->pool, struct fts_parser_tika_user, 1);
+	MODULE_CONTEXT_SET(user, fts_parser_tika_user_module, tika_user);
+
+	if (http_url_parse(tika_url, NULL, 0, user->pool,
+			   &tika_user->http_url, &parse_error) < 0) {
+		i_error("fts_tika: Failed to parse HTTP url %s: %s",
+			tika_url, parse_error);
+		return -1;
+	}
+
+	if (tika_http_client == NULL) {
+		struct http_client_settings client_set;
+
+		memset(&client_set, 0, sizeof(client_set));
+		client_set.max_idle_time_msecs = 100;
+		client_set.max_parallel_connections = 1;
+		client_set.max_pipelined_requests = 1;
+		client_set.max_redirects = 1;
+		client_set.max_attempts = 3;
+		client_set.debug = user->mail_debug;
+		tika_http_client = http_client_init(&client_set);
+	}
+	*http_url_r = tika_user->http_url;
+	return 0;
+}
+
+/* Response callback for the PUT request sent to Tika.
+   200: the response body becomes parser->payload (with an extra reference)
+        so that fts_parser_tika_more() can read the extracted text from it.
+        A 200 reply that carries no body is treated as empty output.
+   204, 422: Tika produced nothing for this part; parser->payload becomes
+        an empty stream and the reply is logged only when mail_debug is on.
+   other: logged as an error and the parser is marked failed.
+   In all cases the request is flagged as finished and the current ioloop
+   is stopped. */
+static void
+fts_tika_parser_response(const struct http_response *response,
+			 struct tika_fts_parser *parser)
+{
+	i_assert(parser->payload == NULL);
+
+	switch (response->status) {
+	case 200:
+		/* read response */
+		if (response->payload == NULL) {
+			/* reply without a body: there is no stream to
+			   reference, so hand out an empty one instead of
+			   passing NULL to i_stream_ref() */
+			parser->payload = i_stream_create_from_data("", 0);
+		} else {
+			i_stream_ref(response->payload);
+			parser->payload = response->payload;
+		}
+		break;
+	case 204: /* empty response */
+	case 422: /* Unprocessable Entity */
+		if (parser->user->mail_debug) {
+			i_debug("fts_tika: PUT %s failed: %s",
+				mail_user_plugin_getenv(parser->user, "fts_tika"),
+				response->reason);
+		}
+		parser->payload = i_stream_create_from_data("", 0);
+		break;
+	default:
+		i_error("fts_tika: PUT %s failed: %s",
+			mail_user_plugin_getenv(parser->user, "fts_tika"),
+			response->reason);
+		parser->failed = TRUE;
+		break;
+	}
+	parser->http_req_finished = TRUE;
+	io_loop_stop(current_ioloop);
+}
+
+/* Starts a Tika conversion for one message part.
+   Returns NULL when Tika can't be used for this user (fts_tika is unset
+   or its URL didn't parse). Otherwise returns a new parser that holds a
+   PUT request to the configured URL; the request body is supplied later
+   through fts_parser_tika_more().
+   content_type is sent as the Content-Type header. content_disposition
+   may be NULL, in which case no Content-Disposition header is sent. */
+static struct fts_parser *
+fts_parser_tika_try_init(struct mail_user *user, const char *content_type,
+			 const char *content_disposition)
+{
+	struct tika_fts_parser *parser;
+	struct http_url *http_url;
+	struct http_client_request *http_req;
+
+	if (tika_get_http_client_url(user, &http_url) < 0)
+		return NULL;
+
+	parser = i_new(struct tika_fts_parser, 1);
+	parser->parser.v = fts_parser_tika;
+	parser->user = user;
+
+	http_req = http_client_request(tika_http_client, "PUT",
+			http_url->host_name,
+			t_strconcat(http_url->path, http_url->enc_query, NULL),
+			fts_tika_parser_response, parser);
+	http_client_request_set_port(http_req, http_url->port);
+	http_client_request_set_ssl(http_req, http_url->have_ssl);
+	http_client_request_add_header(http_req, "Content-Type", content_type);
+	if (content_disposition != NULL) {
+		/* parts without a Content-Disposition header reach us with
+		   NULL here; a NULL header value must not be passed on */
+		http_client_request_add_header(http_req, "Content-Disposition",
+					       content_disposition);
+	}
+	http_client_request_add_header(http_req, "Accept", "text/plain");
+
+	parser->http_req = http_req;
+	return &parser->parser;
+}
+/* Performs one step of the Tika conversation.
+   While the caller passes input (block->size > 0), the bytes are sent as
+   request payload and block->size is reset to 0, i.e. nothing is returned
+   yet. When called with an empty block, the request is finished and the
+   response is handed back piece by piece in block->data/block->size.
+   block is left untouched (size 0) after a failure and at the end of the
+   response. */
+static void fts_parser_tika_more(struct fts_parser *_parser,
+ struct message_block *block)
+{
+ struct tika_fts_parser *parser = (struct tika_fts_parser *)_parser;
+ const unsigned char *data;
+ size_t size;
+ ssize_t ret;
+
+ if (block->size > 0) {
+ /* first we'll send everything to Tika */
+ if (!parser->failed &&
+ http_client_request_send_payload(&parser->http_req,
+ block->data,
+ block->size) < 0)
+ parser->failed = TRUE;
+ block->size = 0; /* input consumed, or skipped because the parser already failed */
+ return;
+ }
+
+ if (parser->payload == NULL) {
+ /* read the result from Tika */
+ if (!parser->failed &&
+ http_client_request_finish_payload(&parser->http_req) < 0)
+ parser->failed = TRUE;
+ if (!parser->failed && parser->payload == NULL)
+ http_client_wait(tika_http_client); /* no response seen yet; presumably returns once fts_tika_parser_response() has run - confirm against lib-http */
+ if (parser->failed)
+ return;
+ i_assert(parser->payload != NULL);
+ }
+ /* continue returning data from Tika */
+ while ((ret = i_stream_read_data(parser->payload, &data, &size, 0)) == 0) {
+ if (parser->failed)
+ return;
+ /* wait for more input from Tika: run a private ioloop that the
+    payload's input watcher stops via io_loop_stop().
+    NOTE(review): nothing in this function switches back to the
+    ioloop that was current before; the private one is only
+    destroyed in fts_parser_tika_deinit() - confirm that callers
+    tolerate it staying current in between. */
+ if (parser->ioloop == NULL) {
+ parser->ioloop = io_loop_create();
+ parser->io = io_add_istream(parser->payload, io_loop_stop,
+ current_ioloop);
+ } else {
+ io_loop_set_current(parser->ioloop);
+ }
+ io_loop_run(current_ioloop);
+ }
+ if (size > 0) {
+ i_assert(ret > 0);
+ block->data = data;
+ block->size = size;
+ i_stream_skip(parser->payload, size); /* everything read so far is handed to the caller; presumably the buffer stays valid until the next read - confirm */
+ } else {
+ /* finished */
+ i_assert(ret == -1);
+ }
+}
+
+/* Frees a Tika parser: removes the input watcher and destroys the private
+   ioloop if fts_parser_tika_more() created them, drops the reference to
+   the response stream, aborts the HTTP request unless the response
+   callback already flagged it finished, and frees the parser itself. */
+static void fts_parser_tika_deinit(struct fts_parser *_parser)
+{
+	struct tika_fts_parser *tika = (struct tika_fts_parser *)_parser;
+	bool abort_request;
+
+	if (tika->ioloop != NULL) {
+		/* the watcher was added together with the ioloop, so it is
+		   removed before the ioloop goes away */
+		io_remove(&tika->io);
+		io_loop_destroy(&tika->ioloop);
+	}
+	if (tika->payload != NULL)
+		i_stream_unref(&tika->payload);
+
+	/* FIXME: kludgy, http_req should be NULL here if we don't want to
+	   free it. requires lib-http changes. */
+	abort_request = tika->http_req != NULL && !tika->http_req_finished;
+	if (abort_request)
+		http_client_request_abort(&tika->http_req);
+	i_free(tika);
+}
+
+/* Unload hook of the Tika parser: destroys the shared HTTP client if one
+   was created by tika_get_http_client_url(). */
+static void fts_parser_tika_unload(void)
+{
+	if (tika_http_client == NULL) {
+		/* nothing to destroy */
+		return;
+	}
+	http_client_deinit(&tika_http_client);
+}
+
+struct fts_parser_vfuncs fts_parser_tika = { /* Tika parser entry points; positional initializer - NOTE(review): order must match struct fts_parser_vfuncs in fts-parser.h, which is not shown here */
+ fts_parser_tika_try_init, /* returns NULL unless fts_tika is configured with a parseable URL */
+ fts_parser_tika_more, /* sends input to Tika, then returns the response body */
+ fts_parser_tika_deinit, /* frees one parser */
+ fts_parser_tika_unload /* destroys the shared HTTP client */
+};
diff -r 45e7980f6507 -r 3db2ab503759 src/plugins/fts/fts-parser.c
--- a/src/plugins/fts/fts-parser.c Thu Apr 17 10:27:43 2014 +0200
+++ b/src/plugins/fts/fts-parser.c Thu Apr 17 10:29:10 2014 +0200
@@ -8,7 +8,8 @@
const struct fts_parser_vfuncs *parsers[] = {
&fts_parser_html,
- &fts_parser_script
+ &fts_parser_script,
+ &fts_parser_tika
};
bool fts_parser_init(struct mail_user *user,
diff -r 45e7980f6507 -r 3db2ab503759 src/plugins/fts/fts-parser.h
--- a/src/plugins/fts/fts-parser.h Thu Apr 17 10:27:43 2014 +0200
+++ b/src/plugins/fts/fts-parser.h Thu Apr 17 10:29:10 2014 +0200
@@ -20,6 +20,7 @@
extern struct fts_parser_vfuncs fts_parser_html;
extern struct fts_parser_vfuncs fts_parser_script;
+extern struct fts_parser_vfuncs fts_parser_tika;
bool fts_parser_init(struct mail_user *user,
const char *content_type, const char *content_disposition,
More information about the dovecot-cvs
mailing list