[Dovecot] search and UTF-8 normalization forms (NFD)
Florian Zeitz
florob at babelmonkeys.de
Sat May 11 18:13:54 EEST 2013
Am 10.05.2013 15:24, schrieb Florian Zeitz:
> Could you elaborate a bit why you think i;unicode-casemap does not
> handle this case?
>
> Is it only applied to the query, but not the header, or vice versa?
> It seems to me that Step 2 should map both inputs to LATIN CAPITAL
> LETTER U + COMBINING DIAERESIS.
>
> Regards,
> Florian
>
So... I had a look at this. It turns out that the current implementation of
Unicode decomposition (Step 2(b) in i;unicode-casemap) in Dovecot is
broken: it only handles decomposition mappings that include a tag
(e.g. <super>), so untagged (canonical) mappings such as
U+00FC -> U+0075 U+0308 are never applied.
I've attached an hg export that fixes this.
-------------- next part --------------
# HG changeset patch
# User Florian Zeitz <florob at babelmonkeys.de>
# Date 1368284892 -7200
# Sat May 11 17:08:12 2013 +0200
# Node ID 91f175781d9b75f1617ca5ba50dd58860ef0ae13
# Parent 62874b472dc6e5c30fe7fbc64c1bf868e08bf482
liblib: Fix Unicode decomposition
diff --git a/src/lib/test-unichar.c b/src/lib/test-unichar.c
--- a/src/lib/test-unichar.c
+++ b/src/lib/test-unichar.c
@@ -2,11 +2,15 @@
#include "test-lib.h"
#include "str.h"
+#include "buffer.h"
#include "unichar.h"
void test_unichar(void)
{
- static const char *overlong_utf8 = "\xf8\x80\x95\x81\xa1";
+ static const char overlong_utf8[] = "\xf8\x80\x95\x81\xa1";
+ static const char collate_in[] = "\xc3\xbc \xc2\xb3";
+ static const char collate_exp[] = "U\xcc\x88 3";
+ buffer_t *collate_out;
unichar_t chr, chr2;
string_t *str = t_str_new(16);
@@ -18,6 +22,13 @@
test_assert(uni_utf8_get_char(str_c(str), &chr2) > 0);
test_assert(chr2 == chr);
}
+
+ collate_out = buffer_create_dynamic(default_pool, 32);
+ uni_utf8_to_decomposed_titlecase(collate_in, sizeof(collate_in),
+ collate_out);
+ test_assert(!strcmp(collate_out->data, collate_exp));
+ buffer_free(&collate_out);
+
test_assert(!uni_utf8_str_is_valid(overlong_utf8));
test_assert(uni_utf8_get_char(overlong_utf8, &chr2) < 0);
test_end();
diff --git a/src/lib/unichar.c b/src/lib/unichar.c
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -287,7 +287,7 @@
static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output)
{
- const uint16_t *value;
+ const uint32_t *value;
unsigned int idx;
if (chr < multidecomp_keys[0] || chr > 0xffff)
diff --git a/src/lib/unicodemap.pl b/src/lib/unicodemap.pl
--- a/src/lib/unicodemap.pl
+++ b/src/lib/unicodemap.pl
@@ -30,14 +30,14 @@
push @titlecase32_keys, $code;
push @titlecase32_values, $value;
}
- } elsif ($decomp =~ /\<[^>]*> (.+)/) {
+ } elsif ($decomp =~ /(?:\<[^>]*> )?(.+)/) {
# decompositions
my $decomp_codes = $1;
if ($decomp_codes =~ /^([0-9A-Z]*)$/i) {
# unicharacter decomposition. use separate lists for this
my $value = eval("0x$1");
- if ($value > 0xffff) {
- print STDERR "Error: We've assumed decomposition codes are max. 16bit\n";
+ if ($value > 0xffffffff) {
+ print STDERR "Error: We've assumed decomposition codes are max. 32bit\n";
exit 1;
}
if ($code <= 0xff) {
@@ -61,8 +61,8 @@
foreach my $dcode (split(" ", $decomp_codes)) {
my $value = eval("0x$dcode");
- if ($value > 0xffff) {
- print STDERR "Error: We've assumed decomposition codes are max. 16bit\n";
+ if ($value > 0xffffffff) {
+ print STDERR "Error: We've assumed decomposition codes are max. 32bit\n";
exit 1;
}
push @multidecomp_values, $value;
@@ -78,7 +78,7 @@
my $last = $#list;
my $n = 0;
foreach my $key (@list) {
- printf("0x%04x", $key);
+ printf("0x%05x", $key);
last if ($n == $last);
print ",";
@@ -137,7 +137,7 @@
print_list(\@uni16_decomp_keys);
print "\n};\n";
-print "static const uint16_t uni16_decomp_values[] = {\n\t";
+print "static const uint32_t uni16_decomp_values[] = {\n\t";
print_list(\@uni16_decomp_values);
print "\n};\n";
@@ -145,7 +145,7 @@
print_list(\@uni32_decomp_keys);
print "\n};\n";
-print "static const uint16_t uni32_decomp_values[] = {\n\t";
+print "static const uint32_t uni32_decomp_values[] = {\n\t";
print_list(\@uni32_decomp_values);
print "\n};\n";
@@ -157,6 +157,6 @@
print_list(\@multidecomp_offsets);
print "\n};\n";
-print "static const uint16_t multidecomp_values[] = {\n\t";
+print "static const uint32_t multidecomp_values[] = {\n\t";
print_list(\@multidecomp_values);
print "\n};\n";
More information about the dovecot mailing list