dovecot-2.1: uni_utf8_*(): Treat overlong UTF8 sequences as invalid.

Fri Jan 4 23:24:41 EET 2013

details:   http://hg.dovecot.org/dovecot-2.1/rev/7be4709aab5e
changeset: 14864:7be4709aab5e
user:      Timo Sirainen <tss at iki.fi>
date:      Fri Jan 04 23:24:26 2013 +0200
description:
uni_utf8_*(): Treat overlong UTF8 sequences as invalid.

diffstat:

 src/lib/Makefile.am    |   1 +
 src/lib/test-lib.c     |   1 +
 src/lib/test-lib.h     |   1 +
 src/lib/test-unichar.c |  24 ++++++++++++++++++++++++
 src/lib/unichar.c      |  28 ++++++++++++++--------------
 5 files changed, 41 insertions(+), 14 deletions(-)

diffs (125 lines):

diff -r 721aba9128b9 -r 7be4709aab5e src/lib/Makefile.am

--- a/src/lib/Makefile.am	Fri Jan 04 05:29:25 2013 +0200
+++ b/src/lib/Makefile.am	Fri Jan 04 23:24:26 2013 +0200
@@ -273,6 +273,7 @@
 	test-str-find.c \
 	test-str-sanitize.c \
 	test-time-util.c \
+	test-unichar.c \
 	test-utc-mktime.c \
 	test-var-expand.c
 
diff -r 721aba9128b9 -r 7be4709aab5e src/lib/test-lib.c
--- a/src/lib/test-lib.c	Fri Jan 04 05:29:25 2013 +0200
+++ b/src/lib/test-lib.c	Fri Jan 04 23:24:26 2013 +0200
@@ -30,6 +30,7 @@
 		test_str_find,
 		test_str_sanitize,
 		test_time_util,
+		test_unichar,
 		test_utc_mktime,
 		test_var_expand,
 		NULL
diff -r 721aba9128b9 -r 7be4709aab5e src/lib/test-lib.h
--- a/src/lib/test-lib.h	Fri Jan 04 05:29:25 2013 +0200
+++ b/src/lib/test-lib.h	Fri Jan 04 23:24:26 2013 +0200
@@ -29,6 +29,7 @@
 void test_str_find(void);
 void test_str_sanitize(void);
 void test_time_util(void);
+void test_unichar(void);
 void test_utc_mktime(void);
 void test_var_expand(void);
 
diff -r 721aba9128b9 -r 7be4709aab5e src/lib/test-unichar.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/test-unichar.c	Fri Jan 04 23:24:26 2013 +0200
@@ -0,0 +1,24 @@
+/* Copyright (c) 2007-2012 Dovecot authors, see the included COPYING file */
+
+#include "test-lib.h"
+#include "str.h"
+#include "unichar.h"
+
+void test_unichar(void)
+{
+	static const char *overlong_utf8 = "\xf8\x80\x95\x81\xa1";
+	unichar_t chr, chr2;
+	string_t *str = t_str_new(16);
+
+	test_begin("unichars");
+	for (chr = 0; chr <= 0x10ffff; chr++) {
+		str_truncate(str, 0);
+		uni_ucs4_to_utf8_c(chr, str);
+		test_assert(uni_utf8_str_is_valid(str_c(str)));
+		test_assert(uni_utf8_get_char(str_c(str), &chr2) > 0);
+		test_assert(chr2 == chr);
+	}
+	test_assert(!uni_utf8_str_is_valid(overlong_utf8));
+	test_assert(uni_utf8_get_char(overlong_utf8, &chr2) < 0);
+	test_end();
+}
diff -r 721aba9128b9 -r 7be4709aab5e src/lib/unichar.c
--- a/src/lib/unichar.c	Fri Jan 04 05:29:25 2013 +0200
+++ b/src/lib/unichar.c	Fri Jan 04 23:24:26 2013 +0200
@@ -37,8 +37,10 @@
 
 int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
 {
+	static unichar_t lowest_valid_chr_table[] =
+		{ 0, 0, 0x80, 0x800, 0x10000, 0x20000, 0x40000 };
 	const unsigned char *input = _input;
-	unichar_t chr;
+	unichar_t chr, lowest_valid_chr;
 	unsigned int i, len;
 	int ret;
 
@@ -75,10 +77,12 @@
 		return -1;
 	}
 
-	if (len <= max_len)
+	if (len <= max_len) {
+		lowest_valid_chr = lowest_valid_chr_table[len];
 		ret = 1;
-	else {
+	} else {
 		/* check first if the input is invalid before returning 0 */
+		lowest_valid_chr = 0;
 		ret = 0;
 		len = max_len;
 	}
@@ -91,6 +95,10 @@
 		chr <<= 6;
 		chr |= input[i] & 0x3f;
 	}
+	if (chr < lowest_valid_chr) {
+		/* overlong encoding */
+		return -1;
+	}
 
 	*chr_r = chr;
 	return ret;
@@ -340,19 +348,11 @@
 static inline unsigned int
 is_valid_utf8_seq(const unsigned char *input, unsigned int size)
 {
-	unsigned int i, len;
+	unichar_t chr;
 
-	len = uni_utf8_char_bytes(input[0]);
-	if (unlikely(len > size || len == 1))
+	if (uni_utf8_get_char_n(input, size, &chr) <= 0)
 		return 0;
-
-	/* the rest of the chars should be in 0x80..0xbf range.
-	   anything else is start of a sequence or invalid */
-	for (i = 1; i < len; i++) {
-		if (unlikely(input[i] < 0x80 || input[i] > 0xbf))
-			return 0;
-	}
-	return len;
+	return uni_utf8_char_bytes(input[0]);
 }
 
 static int uni_utf8_find_invalid_pos(const unsigned char *input, size_t size,