dovecot: Replace invalid UTF8 input with a replacement character.
dovecot at dovecot.org
dovecot at dovecot.org
Tue Jan 22 09:35:10 EET 2008
details: http://hg.dovecot.org/dovecot/rev/6f014a866f38
changeset: 7185:6f014a866f38
user: Timo Sirainen <tss at iki.fi>
date: Tue Jan 22 09:31:59 2008 +0200
description:
Replace invalid UTF8 input with a replacement character.
diffstat:
2 files changed, 27 insertions(+), 4 deletions(-)
src/lib/unichar.c | 19 +++++++++++++++++++
src/lib/unichar.h | 12 ++++++++----
diffs (81 lines):
diff -r 7416737df8b8 -r 6f014a866f38 src/lib/unichar.c
--- a/src/lib/unichar.c Tue Jan 22 08:49:24 2008 +0200
+++ b/src/lib/unichar.c Tue Jan 22 09:31:59 2008 +0200
@@ -260,6 +260,22 @@ static bool uni_ucs4_decompose_multi_utf
return TRUE;
}
+static void output_add_replacement_char(buffer_t *output)
+{
+ /* 0xfffd */
+ static const unsigned char replacement_utf8[] = { 0xef, 0xbf, 0xbd };
+#define REPLACEMENT_UTF8_LEN 3
+
+ if (output->used >= REPLACEMENT_UTF8_LEN &&
+ memcmp(CONST_PTR_OFFSET(output->data,
+ output->used - REPLACEMENT_UTF8_LEN),
+ replacement_utf8, REPLACEMENT_UTF8_LEN) == 0) {
+ /* don't add the replacement char multiple times */
+ return;
+ }
+ buffer_append(output, replacement_utf8, REPLACEMENT_UTF8_LEN);
+}
+
int uni_utf8_to_decomposed_titlecase(const void *_input, size_t max_len,
buffer_t *output)
{
@@ -273,6 +289,7 @@ int uni_utf8_to_decomposed_titlecase(con
/* invalid input. try the next byte. */
ret = -1;
input++; max_len--;
+ output_add_replacement_char(output);
continue;
}
bytes = uni_utf8_char_bytes(*input);
@@ -327,6 +344,7 @@ broken:
/* broken utf-8 input - skip the broken characters */
buffer_append(buf, input, i++);
+ output_add_replacement_char(buf);
while (i < size) {
if (input[i] < 0x80) {
buffer_append_c(buf, input[i++]);
@@ -336,6 +354,7 @@ broken:
len = is_valid_utf8_seq(input + i, size-i);
if (len == 0) {
i++;
+ output_add_replacement_char(buf);
continue;
}
buffer_append(buf, input + i, len);
diff -r 7416737df8b8 -r 6f014a866f38 src/lib/unichar.h
--- a/src/lib/unichar.h Tue Jan 22 08:49:24 2008 +0200
+++ b/src/lib/unichar.h Tue Jan 22 09:31:59 2008 +0200
@@ -1,5 +1,8 @@
#ifndef UNICHAR_H
#define UNICHAR_H
+
+/* Character used to replace invalid input. */
+#define UNICODE_REPLACEMENT_CHAR 0xfffd
typedef uint32_t unichar_t;
ARRAY_DEFINE_TYPE(unichars, unichar_t);
@@ -37,13 +40,14 @@ unichar_t uni_ucs4_to_titlecase(unichar_
/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
output buffer. Returns 0 if ok, -1 if input was invalid. This generates
- output that's compatible with i;unicode-casemap comparator. */
+ output that's compatible with i;unicode-casemap comparator. Invalid input
+ is replaced with unicode replacement character (0xfffd). */
int uni_utf8_to_decomposed_titlecase(const void *input, size_t max_len,
buffer_t *output);
-/* If input contains only valid UTF-8 characters, return TRUE. If input
- contains invalid UTF-8 characters, write only the valid ones to buf and
- return FALSE. */
+/* If input contains only valid UTF-8 characters, return TRUE without updating
+ buf. If input contains invalid UTF-8 characters, replace them with unicode
+ replacement character (0xfffd), write the output to buf and return FALSE. */
bool uni_utf8_get_valid_data(const unsigned char *input, size_t size,
buffer_t *buf);
More information about the dovecot-cvs mailing list