dovecot-1.2: fts-solr: Replace characters not valid for XML with...
dovecot at dovecot.org
dovecot at dovecot.org
Fri Aug 20 22:38:37 EEST 2010
details: http://hg.dovecot.org/dovecot-1.2/rev/cf0da2cd31fb
changeset: 9604:cf0da2cd31fb
user: Timo Sirainen <tss at iki.fi>
date: Fri Aug 20 20:38:26 2010 +0100
description:
fts-solr: Replace characters not valid for XML with replacement char.
diffstat:
src/plugins/fts-solr/fts-backend-solr.c | 32 ++++++++++++++++++++++++++++++++
1 files changed, 32 insertions(+), 0 deletions(-)
diffs (64 lines):
diff -r 5efba9f9f0a7 -r cf0da2cd31fb src/plugins/fts-solr/fts-backend-solr.c
--- a/src/plugins/fts-solr/fts-backend-solr.c Fri Aug 20 20:37:31 2010 +0100
+++ b/src/plugins/fts-solr/fts-backend-solr.c Fri Aug 20 20:38:26 2010 +0100
@@ -4,6 +4,7 @@
#include "array.h"
#include "str.h"
#include "strescape.h"
+#include "unichar.h"
#include "mail-storage-private.h"
#include "mail-namespace.h"
#include "solr-connection.h"
@@ -74,9 +75,25 @@
return name;
}
+static bool is_valid_xml_char(unichar_t chr)
+{
+ /* Returns TRUE if chr is a valid character in XML:
+
+ #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] |
+ [#x10000-#x10FFFF] (range ends are inclusive)
+ Surrogates (#xD800-#xDFFF) and #xFFFE, #xFFFF are rejected.
+ This function gets called only for #x80 and higher */
+ if (chr > 0xd7ff && chr < 0xe000)
+ return FALSE;
+ if (chr > 0xfffd && chr < 0x10000)
+ return FALSE;
+ return chr <= 0x10ffff;
+}
+
static void
xml_encode_data(string_t *dest, const unsigned char *data, unsigned int len)
{
+ unichar_t chr;
unsigned int i;
for (i = 0; i < len; i++) {
@@ -101,11 +118,26 @@
/* SOLR doesn't like control characters.
replace them with spaces. */
str_append_c(dest, ' ');
+ } else if (data[i] >= 0x80) {
+ /* decode the sequence starting at data[i] and make sure it is
+ valid for XML so we don't get XML parser errors */
+ unsigned int char_len =
+ uni_utf8_char_bytes(data[i]);
+ if (i + char_len <= len &&
+ uni_utf8_get_char_n(data + i, char_len, &chr) > 0 &&
+ is_valid_xml_char(chr))
+ str_append_n(dest, data + i, char_len);
+ else {
+ str_append_n(dest, utf8_replacement_char,
+ UTF8_REPLACEMENT_CHAR_LEN);
+ }
+ /* the loop's i++ consumes the first byte; skip the rest here */
+ i += char_len - 1;
} else {
str_append_c(dest, data[i]);
}
break;
}
}
}
More information about the dovecot-cvs
mailing list