# HG changeset patch
# User Florian Zeitz
# Date 1368284892 -7200
#      Sat May 11 17:08:12 2013 +0200
# Node ID 91f175781d9b75f1617ca5ba50dd58860ef0ae13
# Parent  62874b472dc6e5c30fe7fbc64c1bf868e08bf482
liblib: Fix Unicode decomposition

diff --git a/src/lib/test-unichar.c b/src/lib/test-unichar.c
--- a/src/lib/test-unichar.c
+++ b/src/lib/test-unichar.c
@@ -2,11 +2,15 @@
 
 #include "test-lib.h"
 #include "str.h"
+#include "buffer.h"
 #include "unichar.h"
 
 void test_unichar(void)
 {
-	static const char *overlong_utf8 = "\xf8\x80\x95\x81\xa1";
+	static const char overlong_utf8[] = "\xf8\x80\x95\x81\xa1";
+	static const char collate_in[] = "\xc3\xbc \xc2\xb3";
+	static const char collate_exp[] = "U\xcc\x88 3";
+	buffer_t *collate_out;
 	unichar_t chr, chr2;
 	string_t *str = t_str_new(16);
 
@@ -18,6 +22,13 @@
 		test_assert(uni_utf8_get_char(str_c(str), &chr2) > 0);
 		test_assert(chr2 == chr);
 	}
+
+	collate_out = buffer_create_dynamic(default_pool, 32);
+	uni_utf8_to_decomposed_titlecase(collate_in, sizeof(collate_in),
+					 collate_out);
+	test_assert(!strcmp(collate_out->data, collate_exp));
+	buffer_free(&collate_out);
+
 	test_assert(!uni_utf8_str_is_valid(overlong_utf8));
 	test_assert(uni_utf8_get_char(overlong_utf8, &chr2) < 0);
 	test_end();
diff --git a/src/lib/unichar.c b/src/lib/unichar.c
--- a/src/lib/unichar.c
+++ b/src/lib/unichar.c
@@ -287,7 +287,7 @@
 
 static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output)
 {
-	const uint16_t *value;
+	const uint32_t *value;
 	unsigned int idx;
 
 	if (chr < multidecomp_keys[0] || chr > 0xffff)
diff --git a/src/lib/unicodemap.pl b/src/lib/unicodemap.pl
--- a/src/lib/unicodemap.pl
+++ b/src/lib/unicodemap.pl
@@ -30,14 +30,14 @@
       push @titlecase32_keys, $code;
       push @titlecase32_values, $value;
     }
-  } elsif ($decomp =~ /\<[^>]*> (.+)/) {
+  } elsif ($decomp =~ /(?:\<[^>]*> )?(.+)/) {
     # decompositions
     my $decomp_codes = $1;
     if ($decomp_codes =~ /^([0-9A-Z]*)$/i) {
       # unicharacter decomposition. use separate lists for this
       my $value = eval("0x$1");
-      if ($value > 0xffff) {
-        print STDERR "Error: We've assumed decomposition codes are max. 16bit\n";
+      if ($value > 0xffffffff) {
+        print STDERR "Error: We've assumed decomposition codes are max. 32bit\n";
         exit 1;
       }
       if ($code <= 0xff) {
@@ -61,8 +61,8 @@
 
       foreach my $dcode (split(" ", $decomp_codes)) {
         my $value = eval("0x$dcode");
-        if ($value > 0xffff) {
-          print STDERR "Error: We've assumed decomposition codes are max. 16bit\n";
+        if ($value > 0xffffffff) {
+          print STDERR "Error: We've assumed decomposition codes are max. 32bit\n";
           exit 1;
         }
         push @multidecomp_values, $value;
@@ -78,7 +78,7 @@
   my $last = $#list;
   my $n = 0;
   foreach my $key (@list) {
-    printf("0x%04x", $key);
+    printf("0x%05x", $key);
     last if ($n == $last);
     print ",";
 
@@ -137,7 +137,7 @@
 print_list(\@uni16_decomp_keys);
 print "\n};\n";
 
-print "static const uint16_t uni16_decomp_values[] = {\n\t";
+print "static const uint32_t uni16_decomp_values[] = {\n\t";
 print_list(\@uni16_decomp_values);
 print "\n};\n";
 
@@ -145,7 +145,7 @@
 print_list(\@uni32_decomp_keys);
 print "\n};\n";
 
-print "static const uint16_t uni32_decomp_values[] = {\n\t";
+print "static const uint32_t uni32_decomp_values[] = {\n\t";
 print_list(\@uni32_decomp_values);
 print "\n};\n";
 
@@ -157,6 +157,6 @@
 print_list(\@multidecomp_offsets);
 print "\n};\n";
 
-print "static const uint16_t multidecomp_values[] = {\n\t";
+print "static const uint32_t multidecomp_values[] = {\n\t";
 print_list(\@multidecomp_values);
 print "\n};\n";