dovecot-1.2: Added message header (RFC 2047) encoder.

Wed Oct 28 04:47:30 EET 2009

details:   http://hg.dovecot.org/dovecot-1.2/rev/bd4a6f500c75
changeset: 9455:bd4a6f500c75
user:      Timo Sirainen <tss at iki.fi>
date:      Tue Oct 27 22:47:25 2009 -0400
description:
Added message header (RFC 2047) encoder.

diffstat:

3 files changed, 190 insertions(+)
src/lib-mail/Makefile.am             |    2 
src/lib-mail/message-header-encode.c |  173 ++++++++++++++++++++++++++++++++++
src/lib-mail/message-header-encode.h |   15 ++

diffs (215 lines):

diff -r 04d841ecb5bf -r bd4a6f500c75 src/lib-mail/Makefile.am

--- a/src/lib-mail/Makefile.am	Tue Oct 27 17:35:16 2009 -0400
+++ b/src/lib-mail/Makefile.am	Tue Oct 27 22:47:25 2009 -0400
@@ -11,6 +11,7 @@ libmail_a_SOURCES = \
 	message-date.c \
 	message-decoder.c \
 	message-header-decode.c \
+	message-header-encode.c \
 	message-header-parser.c \
 	message-id.c \
 	message-parser.c \
@@ -30,6 +31,7 @@ headers = \
 	message-date.h \
 	message-decoder.h \
 	message-header-decode.h \
+	message-header-encode.h \
 	message-header-parser.h \
 	message-id.h \
 	message-parser.h \
diff -r 04d841ecb5bf -r bd4a6f500c75 src/lib-mail/message-header-encode.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-mail/message-header-encode.c	Tue Oct 27 22:47:25 2009 -0400
@@ -0,0 +1,173 @@
+/* Copyright (c) 2009 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "base64.h"
+#include "message-header-encode.h"
+
+#define MIME_WRAPPER_LEN (strlen("=?utf-8?q?""?="))
+#define MIME_MAX_LINE_LEN 76
+
+#define IS_LWSP(c) \
+	((c) == ' ' || (c) == '\t' || (c) == '\n')
+
+static bool input_idx_need_encoding(const unsigned char *input, unsigned int i)
+{
+	if ((input[i] & 0x80) != 0)
+		return TRUE;
+
+	if (input[i] == '=' && input[i+1] == '?' &&
+	    (i == 0 || IS_LWSP(input[i-1])))
+		return TRUE;
+	return FALSE;
+}
+
+static unsigned int str_last_line_len(string_t *str)
+{
+	const unsigned char *data = str_data(str);
+	unsigned int i = str_len(str);
+
+	while (i > 0 && data[i-1] != '\n')
+		i--;
+	return str_len(str) - i;
+}
+
+void message_header_encode_q(const unsigned char *input, unsigned int len,
+			     string_t *output)
+{
+	unsigned int i, line_len, line_len_left;
+
+	line_len = str_last_line_len(output);
+	if (line_len >= MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - 3) {
+		str_append(output, "\n\t");
+		line_len = 1;
+	}
+
+	str_append(output, "=?utf-8?q?");
+	line_len_left = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - line_len;
+	for (i = 0; i < len; i++) {
+		if (line_len_left < 3) {
+			/* if we're not at the beginning of a character,
+			   go backwards until we are */
+			while ((input[i] & 0xc0) == 0x80) {
+				str_truncate(output, str_len(output)-3);
+				i--;
+			}
+			str_append(output, "?=\n\t=?utf-8?q?");
+			line_len_left = MIME_MAX_LINE_LEN -
+				MIME_WRAPPER_LEN - 1;
+		}
+		switch (input[i]) {
+		case ' ':
+			str_append_c(output, '_');
+			break;
+		case '=':
+		case '?':
+		case '_':
+			str_printfa(output, "=%2X", input[i]);
+			break;
+		default:
+			if (input[i] < 32 || (input[i] & 0x80) != 0) {
+				line_len_left -= 2;
+				str_printfa(output, "=%2X", input[i]);
+			} else {
+				str_append_c(output, input[i]);
+			}
+			break;
+		}
+		line_len_left--;
+	}
+	str_append(output, "?=");
+}
+
+void message_header_encode_b(const unsigned char *input, unsigned int len,
+			     string_t *output)
+{
+	unsigned int line_len, line_len_left, max;
+
+	line_len = str_last_line_len(output);
+	if (line_len >= MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN) {
+		str_append(output, "\n\t");
+		line_len = 1;
+	}
+
+	for (;;) {
+		line_len_left = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - line_len;
+		max = MAX_BASE64_DECODED_SIZE(line_len_left);
+		do {
+			max--;
+			if (max > len)
+				max = len;
+			else {
+				/* all of it doesn't fit. find a character where we
+				   can split it from. */
+				while (max > 0 && (input[max] & 0xc0) == 0x80)
+					max--;
+			}
+		} while (MAX_BASE64_ENCODED_SIZE(max) > line_len_left &&
+			 max > 0);
+
+		if (max > 0) {
+			str_append(output, "=?utf-8?b?");
+			base64_encode(input, max, output);
+			str_append(output, "?=");
+		}
+
+		input += max;
+		len -= max;
+
+		if (len == 0)
+			break;
+
+		str_append(output, "\n\t");
+		line_len = 1;
+	}
+}
+
+void message_header_encode(const char *_input, string_t *output)
+{
+	const unsigned char *input = (const unsigned char *)_input;
+	unsigned int i, first_idx, last_idx;
+	unsigned int enc_chars, enc_len, base64_len, q_len;
+	bool use_q;
+
+	/* find the first word that needs encoding */
+	for (i = 0; input[i] != '\0'; i++) {
+		if (input_idx_need_encoding(input, i))
+			break;
+	}
+	if (input[i] == '\0') {
+		/* no encoding necessary */
+		str_append(output, _input);
+		return;
+	}
+	first_idx = i;
+	while (first_idx > 0 && !IS_LWSP(input[first_idx-1]))
+		first_idx--;
+
+	/* find the last word that needs encoding */
+	last_idx = ++i; enc_chars = 1;
+	for (; input[i] != '\0'; i++) {
+		if (input_idx_need_encoding(input, i)) {
+			last_idx = i + 1;
+			enc_chars++;
+		}
+	}
+	while (input[last_idx] != '\0' && !IS_LWSP(input[last_idx]))
+		last_idx++;
+
+	/* figure out if we should use Q or B encoding. Prefer Q if it's not
+	   too much larger. */
+	enc_len = last_idx - first_idx;
+	base64_len = MAX_BASE64_ENCODED_SIZE(enc_len);
+	q_len = enc_len + enc_chars*3;
+	use_q = q_len*2/3 <= base64_len;
+
+	/* and do it */
+	str_append_n(output, input, first_idx);
+	if (use_q)
+		message_header_encode_q(input + first_idx, enc_len, output);
+	else
+		message_header_encode_b(input + first_idx, enc_len, output);
+	str_append(output, _input + last_idx);
+}
diff -r 04d841ecb5bf -r bd4a6f500c75 src/lib-mail/message-header-encode.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-mail/message-header-encode.h	Tue Oct 27 22:47:25 2009 -0400
@@ -0,0 +1,15 @@
+#ifndef MESSAGE_HEADER_ENCODE_H
+#define MESSAGE_HEADER_ENCODE_H
+
+/* Encode UTF-8 input into output wherever necessary. */
+void message_header_encode(const char *input, string_t *output);
+
+/* Encode the whole UTF-8 input using "Q" or "B" encoding into output.
+   The output is split into multiple lines if necessary. The first line length
+   is looked up from the output string. */
+void message_header_encode_q(const unsigned char *input, unsigned int len,
+			     string_t *output);
+void message_header_encode_b(const unsigned char *input, unsigned int len,
+			     string_t *output);
+
+#endif