dovecot-2.2: lib-mail: message_header_encode() now preserves folding whitespace

Sun May 11 19:20:02 UTC 2014

details:   http://hg.dovecot.org/dovecot-2.2/rev/e6c96db70cfb
changeset: 17368:e6c96db70cfb
user:      Timo Sirainen <tss at iki.fi>
date:      Sun May 11 22:15:08 2014 +0300
description:
lib-mail: message_header_encode() now preserves folding whitespace
This function could still use some cleaning up, but it is good enough for now.

It should also try to minimize the encoded words, rather than always encoding
everything between the first and the last words that have to be encoded.

diffstat:

 src/lib-mail/message-header-encode.c      |  134 +++++++++++++++++++++++------
 src/lib-mail/message-header-encode.h      |    7 +-
 src/lib-mail/test-message-header-encode.c |   21 ++++-
 3 files changed, 130 insertions(+), 32 deletions(-)

diffs (235 lines):

diff -r 009caac530e9 -r e6c96db70cfb src/lib-mail/message-header-encode.c

--- a/src/lib-mail/message-header-encode.c	Sun May 11 21:08:51 2014 +0300
+++ b/src/lib-mail/message-header-encode.c	Sun May 11 22:15:08 2014 +0300
@@ -14,17 +14,44 @@
 static bool input_idx_need_encoding(const unsigned char *input,
 				    unsigned int i, unsigned int len)
 {
-	/* 8bit chars */
-	if ((input[i] & 0x80) != 0)
-		return TRUE;
-	/* control chars */
-	if (input[i] < 32)
-		return TRUE;
-
-	/* <LWSP>=? */
-	if (input[i] == '=' && i+1 < len && input[i+1] == '?' &&
-	    (i == 0 || IS_LWSP(input[i-1])))
-		return TRUE;
+	switch (input[i]) {
+	case '\r':
+		if (i+1 == len || input[i+1] != '\n')
+			return TRUE;
+		i++;
+		/* fall through and verify the LF as well */
+	case '\n':
+		if (i+1 == len) {
+			/* trailing LF - we need to drop it */
+			return TRUE;
+		}
+		if (input[i+1] != '\t' && input[i+1] != ' ') {
+			/* LF not followed by whitespace - we need to
+			   add the whitespace */
+			return TRUE;
+		}
+		break;
+	case '\t':
+		/* TAB doesn't need to be encoded */
+		break;
+	case '=':
+		/* <LWSP>=? - we need to check backwards a bit to see if
+		   there is LWSP (note that we don't want to return TRUE for
+		   the LWSP itself yet, so we need to do this backwards
+		   check) */
+		if ((i == 0 || IS_LWSP(input[i-1])) && i+2 <= len &&
+		    memcmp(input + i, "=?", 2) == 0)
+			return TRUE;
+		break;
+	default:
+		/* 8bit chars */
+		if ((input[i] & 0x80) != 0)
+			return TRUE;
+		/* control chars */
+		if (input[i] < 32)
+			return TRUE;
+		break;
+	}
 	return FALSE;
 }
 
@@ -45,9 +72,9 @@
 	str_append(output, "=?utf-8?q?");
 	for (i = 0; i < len; i++) {
 		if (line_len_left < 3) {
-			/* if we're not at the beginning of a character,
+			/* if we're not at the beginning of an UTF8 character,
 			   go backwards until we are */
-			while ((input[i] & 0xc0) == 0x80) {
+			while (i > 0 && (input[i] & 0xc0) == 0x80) {
 				str_truncate(output, str_len(output)-3);
 				i--;
 			}
@@ -131,9 +158,11 @@
 void message_header_encode_data(const unsigned char *input, unsigned int len,
 				string_t *output)
 {
-	unsigned int i, first_idx, last_idx;
+	unsigned int i, j, first_line_len, cur_line_len, last_idx;
 	unsigned int enc_chars, enc_len, base64_len, q_len;
-	bool use_q;
+	const unsigned char *next_line_input;
+	unsigned int next_line_len;
+	bool use_q, cr;
 
 	/* find the first word that needs encoding */
 	for (i = 0; i < len; i++) {
@@ -145,13 +174,36 @@
 		str_append_data(output, input, len);
 		return;
 	}
-	first_idx = i;
-	while (first_idx > 0 && !IS_LWSP(input[first_idx-1]))
-		first_idx--;
+	/* go back to the beginning of the word so it is fully encoded */
+	if (input[i] != '\r' && input[i] != '\n') {
+		while (i > 0 && !IS_LWSP(input[i-1]))
+			i--;
+	}
+
+	/* write the prefix */
+	str_append_data(output, input, i);
+	first_line_len = j = i;
+	while (j > 0 && input[j-1] != '\n') j--;
+	if (j != 0)
+		first_line_len = j;
+
+	input += i;
+	len -= i;
+
+	/* we'll encode data only up to the next LF, the rest is handled
+	   recursively. */
+	next_line_input = memchr(input, '\n', len);
+	if (next_line_input != NULL) {
+		if (next_line_input != input && next_line_input[-1] == '\r')
+			next_line_input--;
+		cur_line_len = next_line_input - input;
+		next_line_len = len - cur_line_len;
+		len = cur_line_len;
+	}
 
 	/* find the last word that needs encoding */
-	last_idx = ++i; enc_chars = 1;
-	for (; i < len; i++) {
+	last_idx = 0; enc_chars = 0;
+	for (i = 0; i < len; i++) {
 		if (input_idx_need_encoding(input, i, len)) {
 			last_idx = i + 1;
 			enc_chars++;
@@ -162,19 +214,43 @@
 
 	/* figure out if we should use Q or B encoding. Prefer Q if it's not
 	   too much larger. */
-	enc_len = last_idx - first_idx;
+	enc_len = last_idx;
 	base64_len = MAX_BASE64_ENCODED_SIZE(enc_len);
 	q_len = enc_len + enc_chars*3;
 	use_q = q_len*2/3 <= base64_len;
 
 	/* and do it */
-	str_append_data(output, input, first_idx);
-	if (use_q) {
-		message_header_encode_q(input + first_idx, enc_len,
-					output, first_idx);
-	} else {
-		message_header_encode_b(input + first_idx, enc_len,
-					output, first_idx);
+	if (enc_len == 0)
+		;
+	else if (use_q)
+		message_header_encode_q(input, enc_len, output, first_line_len);
+	else
+		message_header_encode_b(input, enc_len, output, first_line_len);
+	str_append_data(output, input + last_idx, len - last_idx);
+
+	if (next_line_input != NULL) {
+		/* we're at [CR]LF */
+		i = 0;
+		if (next_line_input[0] == '\r') {
+			cr = TRUE;
+			i++;
+		}
+		i_assert(next_line_input[i] == '\n');
+		if (++i == next_line_len)
+			return; /* drop trailing [CR]LF */
+
+		if (cr)
+			str_append_c(output, '\r');
+		str_append_c(output, '\n');
+
+		if (next_line_input[i] == ' ' || next_line_input[i] == '\t') {
+			str_append_c(output, next_line_input[i]);
+			i++;
+		} else {
+			/* make it valid folding whitespace by adding a TAB */
+			str_append_c(output, '\t');
+		}
+		message_header_encode_data(next_line_input+i, next_line_len-i,
+					   output);
 	}
-	str_append_data(output, input + last_idx, len - last_idx);
 }
diff -r 009caac530e9 -r e6c96db70cfb src/lib-mail/message-header-encode.h
--- a/src/lib-mail/message-header-encode.h	Sun May 11 21:08:51 2014 +0300
+++ b/src/lib-mail/message-header-encode.h	Sun May 11 22:15:08 2014 +0300
@@ -2,14 +2,17 @@
 #define MESSAGE_HEADER_ENCODE_H
 
 /* Encode UTF-8 input into output wherever necessary using either Q or B
-   encoding depending on which takes less space (approximately). */
+   encoding depending on which takes less space (approximately). Folding
+   whitespace is preserved. Bare [CR]LF will be preserved by adding a TAB
+   after it to make it a valid folding whitespace. */
 void message_header_encode(const char *input, string_t *output);
 void message_header_encode_data(const unsigned char *input, unsigned int len,
 				string_t *output);
 
 /* Encode the whole UTF-8 input using "Q" or "B" encoding into output.
    The output is split into multiple lines if necessary (max 76 chars/line).
-   The first line's length is given as parameter. */
+   The first line's length is given as parameter. All the control characters
+   are encoded, including NUL, CR and LF. */
 void message_header_encode_q(const unsigned char *input, unsigned int len,
 			     string_t *output, unsigned int first_line_len);
 void message_header_encode_b(const unsigned char *input, unsigned int len,
diff -r 009caac530e9 -r e6c96db70cfb src/lib-mail/test-message-header-encode.c
--- a/src/lib-mail/test-message-header-encode.c	Sun May 11 21:08:51 2014 +0300
+++ b/src/lib-mail/test-message-header-encode.c	Sun May 11 22:15:08 2014 +0300
@@ -170,8 +170,27 @@
 		"a ää ä b", "a =?utf-8?b?w6TDpCDDpA==?= b",
 		"ä a ä", "=?utf-8?q?=C3=A4_a_=C3=A4?=",
 		"ää a ä", "=?utf-8?b?w6TDpCBhIMOk?=",
+		"=", "=",
+		"?", "?",
+		"a=?", "a=?",
+		"=?", "=?utf-8?q?=3D=3F?=",
+		"=?x", "=?utf-8?q?=3D=3Fx?=",
+		"a\n=?", "a\n\t=?utf-8?q?=3D=3F?=",
+		"a\t=?", "a\t=?utf-8?q?=3D=3F?=",
+		"a =?", "a =?utf-8?q?=3D=3F?=",
 		"foo\001bar", "=?utf-8?q?foo=01bar?=",
-		"\x01\x02\x03\x04\x05\x06\x07\x08", "=?utf-8?b?AQIDBAUGBwg=?="
+		"\x01\x02\x03\x04\x05\x06\x07\x08", "=?utf-8?b?AQIDBAUGBwg=?=",
+
+		"a\r\n b", "a\r\n b",
+		"a\r\n\tb", "a\r\n\tb",
+		"a\r\nb", "a\r\n\tb",
+		"a\n b", "a\n b",
+		"a\n  b", "a\n  b",
+		"a\nb", "a\n\tb",
+		"a\r\n", "a",
+		"a\n", "a",
+		"foo\n \001bar", "foo\n =?utf-8?q?=01bar?=",
+		"foo\001\n bar", "=?utf-8?q?foo=01?=\n bar"
 	};                          
 	string_t *str = t_str_new(128);
 	unsigned int i;