dovecot-2.2: liblib: Added uni_utf8_short_*() for handling UTF8 ...
dovecot at dovecot.org
dovecot at dovecot.org
Sat Jan 5 01:14:34 EET 2013
details: http://hg.dovecot.org/dovecot-2.2/rev/172295f5a78b
changeset: 15516:172295f5a78b
user: Timo Sirainen <tss at iki.fi>
date: Tue Nov 27 03:48:15 2012 +0200
description:
liblib: Added uni_utf8_short_*() for handling UTF8 data where [56]-byte sequences are invalid.
diffstat:
src/lib/unichar.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
src/lib/unichar.h | 7 ++++++
2 files changed, 71 insertions(+), 0 deletions(-)
diffs (86 lines):
diff -r bebe54e1d640 -r 172295f5a78b src/lib/unichar.c
--- a/src/lib/unichar.c Tue Nov 27 02:41:53 2012 +0200
+++ b/src/lib/unichar.c Tue Nov 27 03:48:15 2012 +0200
@@ -420,3 +420,67 @@
return uni_utf8_find_invalid_pos(data, size, &i) == 0;
}
+
+/* Scan input for the first position that is not acceptable "short" UTF-8:
+   a non-ASCII byte at which is_valid_utf8_seq() returns either 0 or a
+   sequence length above 4.
+
+   Returns 0 if no such position exists (*pos_r is left untouched), or -1
+   with *pos_r set to the offset where the offending sequence starts. */
+static int
+uni_utf8_short_find_invalid_pos(const unsigned char *input, size_t size,
+				size_t *pos_r)
+{
+	size_t pos = 0, seq_len;
+
+	while (pos < size) {
+		if (input[pos] < 0x80) {
+			/* 7-bit byte, nothing to verify */
+			pos++;
+			continue;
+		}
+
+		seq_len = is_valid_utf8_seq(input + pos, size - pos);
+		if (unlikely(seq_len == 0 || seq_len > 4)) {
+			*pos_r = pos;
+			return -1;
+		}
+		pos += seq_len;
+	}
+	return 0;
+}
+
+/* Produce a sanitized copy of input in which every sequence that is not
+   acceptable "short" UTF-8 (is_valid_utf8_seq() returns 0 or a length above
+   4) is substituted via output_add_replacement_char().
+
+   Returns TRUE without writing anything to buf when the scan finds nothing
+   invalid, so the caller can use input as is. Otherwise returns FALSE after
+   appending the sanitized copy of the whole input to buf. */
+bool uni_utf8_short_get_valid_data(const unsigned char *input, size_t size,
+				   buffer_t *buf)
+{
+	size_t i, len;
+
+	if (uni_utf8_short_find_invalid_pos(input, size, &i) == 0)
+		return TRUE;
+
+	/* broken utf-8 input. The scan above already accepted everything in
+	   [0, i), so copy that prefix first; without this the output would
+	   begin at the first broken character and lose the valid data in
+	   front of it. */
+	buffer_append(buf, input, i);
+
+	/* copy the rest, skipping the broken characters */
+	while (i < size) {
+		if (input[i] < 0x80) {
+			buffer_append_c(buf, input[i++]);
+			continue;
+		}
+
+		len = is_valid_utf8_seq(input + i, size-i);
+		if (len == 0 || len > 4) {
+			/* a sequence longer than 4 bytes is skipped as a
+			   whole; when no length is known (len == 0) advance
+			   a single byte and try again from there */
+			i += I_MAX(len, 1);
+			output_add_replacement_char(buf);
+			continue;
+		}
+		buffer_append(buf, input + i, len);
+		i += len;
+	}
+	return FALSE;
+}
+
+/* Returns TRUE if the NUL-terminated str contains no position that
+   uni_utf8_short_find_invalid_pos() reports as invalid. */
+bool uni_utf8_short_str_is_valid(const char *str)
+{
+	const unsigned char *data = (const unsigned char *)str;
+	size_t invalid_pos;
+
+	/* only the verdict matters here; the reported offset is discarded */
+	if (uni_utf8_short_find_invalid_pos(data, strlen(str),
+					    &invalid_pos) != 0)
+		return FALSE;
+	return TRUE;
+}
+
+/* Returns TRUE if the size bytes at data contain no position that
+   uni_utf8_short_find_invalid_pos() reports as invalid. The length is given
+   explicitly, so data does not have to be NUL-terminated. */
+bool uni_utf8_short_data_is_valid(const unsigned char *data, size_t size)
+{
+	size_t i;
+
+	/* This has to be the _short_ scanner: uni_utf8_find_invalid_pos() is
+	   the one used by the non-short uni_utf8_data_is_valid(), and calling
+	   it here would make this function no different from that one. */
+	return uni_utf8_short_find_invalid_pos(data, size, &i) == 0;
+}
diff -r bebe54e1d640 -r 172295f5a78b src/lib/unichar.h
--- a/src/lib/unichar.h Tue Nov 27 02:41:53 2012 +0200
+++ b/src/lib/unichar.h Tue Nov 27 03:48:15 2012 +0200
@@ -88,4 +88,11 @@
/* Returns TRUE if data contains only valid UTF-8 input. */
bool uni_utf8_data_is_valid(const unsigned char *data, size_t size);
+/* Same as the non-short variants, but assume 5-byte and 6-byte UTF8
+   sequences are illegal. */
+/* Returns TRUE, leaving buf untouched, if input needed no changes.
+   Otherwise returns FALSE after appending the sanitized output to buf. */
+bool uni_utf8_short_get_valid_data(const unsigned char *input, size_t size,
+ buffer_t *buf);
+/* str must be NUL-terminated; its length is taken with strlen(). */
+bool uni_utf8_short_str_is_valid(const char *str);
+/* data is size bytes long and need not be NUL-terminated. */
+bool uni_utf8_short_data_is_valid(const unsigned char *data, size_t size);
+
#endif
More information about the dovecot-cvs
mailing list