dovecot-2.2: lib-mail: Added message_snippet_generate() to produce a short text snippet of a mail.
dovecot at dovecot.org
dovecot at dovecot.org
Fri Jan 16 22:33:26 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/5211234206ea
changeset: 18157:5211234206ea
user: Timo Sirainen <tss at iki.fi>
date: Sat Jan 17 00:23:36 2015 +0200
description:
lib-mail: Added message_snippet_generate() to produce a short text snippet of a mail.
diffstat:
src/lib-mail/Makefile.am | 7 +
src/lib-mail/message-snippet.c | 136 ++++++++++++++++++++++++++++++++++++
src/lib-mail/message-snippet.h | 14 +++
src/lib-mail/test-message-snippet.c | 80 +++++++++++++++++++++
4 files changed, 237 insertions(+), 0 deletions(-)
diffs (280 lines):
diff -r d59753d9f5e9 -r 5211234206ea src/lib-mail/Makefile.am
--- a/src/lib-mail/Makefile.am Sat Jan 17 00:15:44 2015 +0200
+++ b/src/lib-mail/Makefile.am Sat Jan 17 00:23:36 2015 +0200
@@ -29,6 +29,7 @@
message-part-serialize.c \
message-search.c \
message-size.c \
+ message-snippet.c \
ostream-dot.c \
quoted-printable.c \
rfc2231-parser.c \
@@ -62,6 +63,7 @@
message-part-serialize.h \
message-search.h \
message-size.h \
+ message-snippet.h \
ostream-dot.h \
quoted-printable.h \
rfc2231-parser.h \
@@ -87,6 +89,7 @@
test-message-id \
test-message-parser \
test-message-part \
+ test-message-snippet \
test-ostream-dot \
test-quoted-printable \
test-rfc2231-parser
@@ -166,6 +169,10 @@
test_message_part_LDADD = message-part.lo message-parser.lo message-header-parser.lo message-size.lo rfc822-parser.lo rfc2231-parser.lo $(test_libs)
test_message_part_DEPENDENCIES = $(test_deps)
+test_message_snippet_SOURCES = test-message-snippet.c
+test_message_snippet_LDADD = message-snippet.lo mail-html2text.lo $(test_message_decoder_LDADD) message-parser.lo message-header-parser.lo message-header-decode.lo message-size.lo
+test_message_snippet_DEPENDENCIES = $(test_deps)
+
test_mail_html2text_SOURCES = test-mail-html2text.c
test_mail_html2text_LDADD = mail-html2text.lo $(test_libs)
test_mail_html2text_DEPENDENCIES = $(test_deps)
diff -r d59753d9f5e9 -r 5211234206ea src/lib-mail/message-snippet.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-mail/message-snippet.c Sat Jan 17 00:23:36 2015 +0200
@@ -0,0 +1,136 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "buffer.h"
+#include "str.h"
+#include "istream.h"
+#include "mail-html2text.h"
+#include "message-parser.h"
+#include "message-decoder.h"
+#include "message-snippet.h"
+
+enum snippet_state { /* line-scanner state, kept in the context across snippet_generate() calls */
+ /* at the start of a line: a '>' here begins a quoted line (plain text only) */
+ SNIPPET_STATE_NEWLINE = 0,
+ /* within normal text: characters are copied, whitespace runs are collapsed */
+ SNIPPET_STATE_NORMAL,
+ /* within a '>' quoted line: everything is skipped up to and including '\n' */
+ SNIPPET_STATE_QUOTED
+};
+
+struct snippet_context { /* working state for generating one snippet */
+ string_t *snippet; /* caller's output string; text is appended to it */
+ unsigned int chars_left; /* remaining budget, counted in UTF-8 characters */
+ enum snippet_state state; /* where in the current line the scanner is */
+ bool add_whitespace; /* a single ' ' is owed before the next visible character */
+ struct mail_html2text *html2text; /* set only for text/html parts, otherwise NULL */
+ buffer_t *plain_output; /* scratch buffer receiving html2text output; set together with html2text */
+};
+
+/* Feed one decoded block into the snippet. Returns FALSE once the character
+   budget is used up (the caller should stop), TRUE if more input is wanted. */
+static bool snippet_generate(struct snippet_context *ctx,
+			     const unsigned char *data, size_t size)
+{
+	unsigned int i, count, need;
+
+	if (ctx->html2text != NULL) { /* HTML: scan the converted plain text */
+		buffer_set_used_size(ctx->plain_output, 0);
+		mail_html2text_more(ctx->html2text, data, size, ctx->plain_output);
+		data = ctx->plain_output->data;
+		size = ctx->plain_output->used;
+	}
+	/* message-decoder should feed us only valid and complete UTF-8 input */
+	for (i = 0; i < size; i += count) {
+		count = 1;
+		switch (ctx->state) {
+		case SNIPPET_STATE_NEWLINE:
+			if (data[i] == '>' && ctx->html2text == NULL) {
+				ctx->state = SNIPPET_STATE_QUOTED;
+				break;
+			}
+			ctx->state = SNIPPET_STATE_NORMAL;
+			/* fallthrough */
+		case SNIPPET_STATE_NORMAL:
+			if (data[i] == '\r' || data[i] == '\n' ||
+			    data[i] == '\t' || data[i] == ' ') {
+				ctx->add_whitespace = TRUE; /* run collapses to one ' ' */
+				if (data[i] == '\n')
+					ctx->state = SNIPPET_STATE_NEWLINE;
+				break;
+			}
+			/* the pending ' ' and this character are added together or
+			   not at all: never exceed the limit or end with a space */
+			need = ctx->add_whitespace ? 2 : 1;
+			if (ctx->chars_left < need)
+				return FALSE;
+			ctx->chars_left -= need;
+			if (ctx->add_whitespace)
+				str_append_c(ctx->snippet, ' ');
+			ctx->add_whitespace = FALSE;
+			count = uni_utf8_char_bytes(data[i]);
+			i_assert(i + count <= size);
+			str_append_n(ctx->snippet, data + i, count);
+			break;
+		case SNIPPET_STATE_QUOTED:
+			if (data[i] == '\n')
+				ctx->state = SNIPPET_STATE_NEWLINE;
+			break;
+		}
+	}
+	return TRUE;
+}
+
+/* Parse and decode the mail in input, appending body text to snippet until
+   max_snippet_chars is used up. Returns 0, or -1 if the stream failed. */
+int message_snippet_generate(struct istream *input,
+			     unsigned int max_snippet_chars,
+			     string_t *snippet)
+{
+	struct snippet_context ctx;
+	struct message_block raw_block, block;
+	struct message_decoder_context *decoder;
+	struct message_parser_ctx *parser;
+	struct message_part *parts;
+	const char *content_type;
+	pool_t pool;
+	int ret;
+
+	memset(&ctx, 0, sizeof(ctx));
+	ctx.chars_left = max_snippet_chars;
+	ctx.snippet = snippet;
+	pool = pool_alloconly_create("message snippet", 1024);
+
+	parser = message_parser_init(pool_datastack_create(), input, 0, 0);
+	decoder = message_decoder_init(NULL, 0);
+	while ((ret = message_parser_parse_next_block(parser, &raw_block)) > 0) {
+		if (!message_decoder_decode_next_block(decoder, &raw_block, &block))
+			continue;
+		if (block.size != 0) {
+			if (!snippet_generate(&ctx, block.data, block.size))
+				break;
+			continue;
+		}
+		if (block.hdr != NULL)
+			continue;
+		/* end of headers: only text/ parts are usable. a missing
+		   Content-Type is handled like text/plain. */
+		content_type = message_decoder_current_content_type(decoder);
+		if (content_type == NULL)
+			continue;
+		if (strcasecmp(content_type, "text/html") == 0) {
+			ctx.html2text = mail_html2text_init(MAIL_HTML2TEXT_FLAG_SKIP_QUOTED);
+			ctx.plain_output = buffer_create_dynamic(pool, 1024);
+		} else if (strncasecmp(content_type, "text/", 5) != 0) {
+			break;
+		}
+	}
+	i_assert(ret != 0);
+	message_decoder_deinit(&decoder);
+	if (message_parser_deinit(&parser, &parts) < 0)
+		i_unreached();
+	if (ctx.html2text != NULL)
+		mail_html2text_deinit(&ctx.html2text);
+	pool_unref(&pool);
+	return input->stream_errno != 0 ? -1 : 0;
+}
diff -r d59753d9f5e9 -r 5211234206ea src/lib-mail/message-snippet.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-mail/message-snippet.h Sat Jan 17 00:23:36 2015 +0200
@@ -0,0 +1,14 @@
+#ifndef MESSAGE_SNIPPET_H
+#define MESSAGE_SNIPPET_H
+
+/* Generate a UTF-8 text snippet from the beginning of the given mail input
+ stream and append it to snippet. The stream is expected to start at the
+ headers of the MIME part whose snippet is being generated. max_snippet_chars
+ is a budget in characters, not bytes. Returns 0 if ok, -1 on I/O error.
+ Whitespace runs become a single space; '>' quoted plain-text lines are skipped.
+ Only Content-Type: text/ parts are supported; others add nothing to snippet. */
+int message_snippet_generate(struct istream *input,
+ unsigned int max_snippet_chars,
+ string_t *snippet);
+
+#endif
diff -r d59753d9f5e9 -r 5211234206ea src/lib-mail/test-message-snippet.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-mail/test-message-snippet.c Sat Jan 17 00:23:36 2015 +0200
@@ -0,0 +1,80 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "istream.h"
+#include "message-snippet.h"
+#include "test-common.h"
+
+static struct { /* table of snippet test cases */
+ const char *input; /* complete MIME part: headers, empty line, body */
+ unsigned int max_snippet_chars; /* limit passed to message_snippet_generate() */
+ const char *output; /* exact snippet expected */
+} tests[] = {
+ { "Content-Type: text/plain\n" /* cut at 12 characters, the space included */
+ "\n"
+ "1234567890 234567890",
+ 12,
+ "1234567890 2" },
+ { "Content-Type: text/plain\n" /* '>' quoted line dropped, newlines become spaces */
+ "\n"
+ "line1\n>quote2\nline2\n",
+ 100,
+ "line1 line2" },
+ { "Content-Type: text/plain\n" /* '>' only quotes at line start; whitespace-only lines and trailing whitespace vanish */
+ "\n"
+ "line1\n>quote2\n> quote3\n > line4\n\n \t\t \nline5\n \t ",
+ 100,
+ "line1 > line4 line5" },
+ { "Content-Type: text/plain; charset=utf-8\n" /* limit counts UTF-8 characters, not bytes */
+ "\n"
+ "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4\xC3\xA4",
+ 11,
+ "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4" },
+ { "Content-Type: text/plain; charset=utf-8\n" /* same text, quoted-printable encoded */
+ "Content-Transfer-Encoding: quoted-printable\n"
+ "\n"
+ "hyv=C3=A4=C3=A4 p=C3=A4iv=C3=A4=C3=A4",
+ 11,
+ "hyv\xC3\xA4\xC3\xA4 p\xC3\xA4iv\xC3\xA4" },
+
+ { "Content-Transfer-Encoding: quoted-printable\n" /* text/html: tags and <blockquote> content removed, the literal "> -foo" line is kept */
+ "Content-Type: text/html;\n"
+ " charset=utf-8\n"
+ "\n"
+ "<html><head><meta http-equiv=3D\"Content-Type\" content=3D\"text/html =\n"
+ "charset=3Dutf-8\"></head><body style=3D\"word-wrap: break-word; =\n"
+ "-webkit-nbsp-mode: space; -webkit-line-break: after-white-space;\" =\n"
+ "class=3D\"\">Hi,<div class=3D\"\"><br class=3D\"\"></div><div class=3D\"\">How =\n"
+ "is it going? <blockquote>quoted text is ignored</blockquote>\n"
+ "> -foo\n"
+ "</div><br =class=3D\"\"></body></html>=\n",
+ 100,
+ "Hi, How is it going? > -foo" },
+};
+
+static void test_message_snippet(void) /* run every tests[] entry through message_snippet_generate() */
+{
+ string_t *str = t_str_new(128); /* reused for all cases; emptied at the top of each iteration */
+ struct istream *input;
+ unsigned int i;
+
+ test_begin("message snippet");
+ for (i = 0; i < N_ELEMENTS(tests); i++) {
+ str_truncate(str, 0);
+ input = i_stream_create_from_data(tests[i].input, strlen(tests[i].input));
+ test_assert_idx(message_snippet_generate(input, tests[i].max_snippet_chars, str) == 0, i); /* generation must succeed */
+ test_assert_idx(strcmp(tests[i].output, str_c(str)) == 0, i); /* and produce exactly the expected text */
+ i_stream_destroy(&input);
+ }
+ test_end();
+}
+
+/* Test program entry point: run the snippet tests, return test_run()'s result. */
+int main(void)
+{
+	/* NULL-terminated list of test functions */
+	static void (*snippet_tests[])(void) = { test_message_snippet, NULL };
+
+	return test_run(snippet_tests);
+}
More information about the dovecot-cvs
mailing list