dovecot: Added uni_ucs4_to_titlecase() and uni_utf8_to_decompose...
dovecot at dovecot.org
dovecot at dovecot.org
Fri Jul 20 17:39:16 EEST 2007
details: http://hg.dovecot.org/dovecot/rev/04b9eb27283c
changeset: 6129:04b9eb27283c
user: Timo Sirainen <tss at iki.fi>
date: Fri Jul 20 17:25:16 2007 +0300
description:
Added uni_ucs4_to_titlecase() and uni_utf8_to_decomposed_titlecase(). They
use a unicodemap.c file generated from UnicodeData.txt.
diffstat:
5 files changed, 273 insertions(+)
.hgignore | 1
src/lib/Makefile.am | 8 ++
src/lib/unichar.c | 121 ++++++++++++++++++++++++++++++++++++++++++++
src/lib/unichar.h | 9 +++
src/lib/unicodemap.pl | 134 +++++++++++++++++++++++++++++++++++++++++++++++++
diffs (truncated from 318 to 300 lines):
diff -r 6d2bee707053 -r 04b9eb27283c .hgignore
--- a/.hgignore Fri Jul 20 17:21:53 2007 +0300
+++ b/.hgignore Fri Jul 20 17:25:16 2007 +0300
@@ -53,6 +53,7 @@ src/dict/dict
src/dict/dict
src/imap-login/imap-login
src/imap/imap
+src/lib/unicodemap.c
src/lib-dict/dict-drivers-register.c
src/lib-sql/sql-drivers-register.c
src/lib-storage/register/mail-storage-register.c
diff -r 6d2bee707053 -r 04b9eb27283c src/lib/Makefile.am
--- a/src/lib/Makefile.am Fri Jul 20 17:21:53 2007 +0300
+++ b/src/lib/Makefile.am Fri Jul 20 17:25:16 2007 +0300
@@ -1,4 +1,12 @@ noinst_LIBRARIES = liblib.a
noinst_LIBRARIES = liblib.a
+
+BUILT_SOURCES = unicodemap.c
+
+EXTRA_DIST = unicodemap.c
+
+unicodemap.c:
+ test -f UnicodeData.txt || wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
+ perl unicodemap.pl < UnicodeData.txt > $@
liblib_a_SOURCES = \
backtrace-string.c \
diff -r 6d2bee707053 -r 04b9eb27283c src/lib/unichar.c
--- a/src/lib/unichar.c Fri Jul 20 17:21:53 2007 +0300
+++ b/src/lib/unichar.c Fri Jul 20 17:25:16 2007 +0300
@@ -2,7 +2,13 @@
#include "lib.h"
#include "buffer.h"
+#include "bsearch-insert-pos.h"
#include "unichar.h"
+
+#include "unicodemap.c"
+
+#define HANGUL_FIRST 0xac00
+#define HANGUL_LAST 0xd7a3
static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
@@ -164,3 +170,118 @@ unsigned int uni_utf8_strlen_n(const voi
}
return len;
}
+
+static bool uint16_find(const uint16_t *data, unsigned int count,
+ uint16_t value, unsigned int *idx_r)
+{ /* 16-bit lookup helper: the whole body is the BINARY_NUMBER_SEARCH macro. */
+ BINARY_NUMBER_SEARCH(data, count, value, idx_r); /* No return is written here, so the macro has to supply it. NOTE(review): presumably a binary search of data[0..count) for value that stores the position in *idx_r - confirm in bsearch-insert-pos.h. */
+}
+
+static bool uint32_find(const uint32_t *data, unsigned int count,
+ uint32_t value, unsigned int *idx_r)
+{ /* 32-bit lookup helper: the whole body is the BINARY_NUMBER_SEARCH macro. */
+ BINARY_NUMBER_SEARCH(data, count, value, idx_r); /* No return is written here, so the macro has to supply it. NOTE(review): presumably a binary search of data[0..count) for value that stores the position in *idx_r - confirm in bsearch-insert-pos.h. */
+}
+
+/* Map a code point to its titlecase form. Code points up to 0xffff are looked
+   up in the 16-bit tables and larger ones in the 32-bit tables. A code point
+   without a table entry is returned unchanged. */
+unichar_t uni_ucs4_to_titlecase(unichar_t chr)
+{
+	unsigned int pos;
+
+	if (chr > 0xffff) {
+		if (uint32_find(titlecase32_keys, N_ELEMENTS(titlecase32_keys),
+				chr, &pos))
+			return titlecase32_values[pos];
+		return chr;
+	}
+
+	if (uint16_find(titlecase16_keys, N_ELEMENTS(titlecase16_keys),
+			chr, &pos))
+		return titlecase16_values[pos];
+	return chr;
+}
+
+/* Replace *chr with its single-character decomposition if the decomposition
+   tables have an entry for it. Returns TRUE when *chr was replaced and FALSE
+   when there is no entry, in which case *chr is left untouched. */
+static bool uni_ucs4_decompose_uni(unichar_t *chr)
+{
+	unsigned int pos;
+
+	if (*chr > 0xffff) {
+		/* keys above 16 bits live in their own table */
+		if (!uint32_find(uni32_decomp_keys,
+				 N_ELEMENTS(uni32_decomp_keys), *chr, &pos))
+			return FALSE;
+		*chr = uni32_decomp_values[pos];
+		return TRUE;
+	}
+
+	if (!uint16_find(uni16_decomp_keys,
+			 N_ELEMENTS(uni16_decomp_keys), *chr, &pos))
+		return FALSE;
+	*chr = uni16_decomp_values[pos];
+	return TRUE;
+}
+
+/* Split a precomposed Hangul syllable arithmetically into a leading jamo, a
+   vowel jamo and an optional trailing jamo, and hand each of them together
+   with output to uni_ucs4_to_utf8_c(). No range check is done here: the
+   caller must only pass code points within HANGUL_FIRST..HANGUL_LAST. */
+static void uni_ucs4_decompose_hangul_utf8(unichar_t chr, buffer_t *output)
+{
+#define SBase HANGUL_FIRST
+#define LBase 0x1100
+#define VBase 0x1161
+#define TBase 0x11A7
+#define LCount 19
+#define VCount 21
+#define TCount 28
+#define NCount (VCount * TCount)
+	unsigned int syllable_idx = chr - SBase;
+	unsigned int lead_idx = syllable_idx / NCount;
+	unsigned int vowel_idx = (syllable_idx % NCount) / TCount;
+	unsigned int trail_idx = syllable_idx % TCount;
+
+	uni_ucs4_to_utf8_c(LBase + lead_idx, output);
+	uni_ucs4_to_utf8_c(VBase + vowel_idx, output);
+	/* a trail index of 0 means the syllable has no trailing jamo */
+	if (trail_idx != 0)
+		uni_ucs4_to_utf8_c(TBase + trail_idx, output);
+}
+
+/* Look up the multi-character decomposition of chr and pass every code point
+   of it, together with output, to uni_ucs4_to_utf8_c(). Returns TRUE if a
+   decomposition was found, FALSE otherwise (nothing is written then). */
+static bool uni_ucs4_decompose_multi_utf8(unichar_t chr, buffer_t *output)
+{
+	const uint16_t *cur;
+	unsigned int pos;
+
+	/* the key table holds 16-bit keys only */
+	if (chr > 0xffff ||
+	    !uint16_find(multidecomp_keys, N_ELEMENTS(multidecomp_keys),
+			 chr, &pos))
+		return FALSE;
+
+	/* each decomposition is a run of values terminated by 0 */
+	cur = &multidecomp_values[multidecomp_offsets[pos]];
+	while (*cur != 0) {
+		uni_ucs4_to_utf8_c(*cur, output);
+		cur++;
+	}
+	return TRUE;
+}
+
+/* Walk the UTF-8 input (at most max_len bytes, stopping early at a NUL byte),
+   titlecase every character and write its decomposed form to output.
+   Returns 0 on success, or -1 as soon as uni_utf8_get_char_n() rejects the
+   input; output keeps whatever was written up to that point. */
+int uni_utf8_to_decomposed_titlecase(const void *_input, size_t max_len,
+				     buffer_t *output)
+{
+	const unsigned char *p = _input;
+	unsigned int char_len;
+	unichar_t chr;
+
+	for (; max_len > 0 && *p != '\0'; p += char_len, max_len -= char_len) {
+		if (uni_utf8_get_char_n(p, max_len, &chr) <= 0) {
+			/* invalid input */
+			return -1;
+		}
+		char_len = uni_utf8_char_bytes(*p);
+
+		chr = uni_ucs4_to_titlecase(chr);
+		if (chr >= HANGUL_FIRST && chr <= HANGUL_LAST) {
+			uni_ucs4_decompose_hangul_utf8(chr, output);
+			continue;
+		}
+
+		if (uni_ucs4_decompose_uni(&chr)) {
+			/* chr now holds its single-character replacement */
+			uni_ucs4_to_utf8_c(chr, output);
+		} else if (!uni_ucs4_decompose_multi_utf8(chr, output)) {
+			/* no decomposition of either kind: write it as is */
+			uni_ucs4_to_utf8_c(chr, output);
+		}
+	}
+	return 0;
+}
diff -r 6d2bee707053 -r 04b9eb27283c src/lib/unichar.h
--- a/src/lib/unichar.h Fri Jul 20 17:21:53 2007 +0300
+++ b/src/lib/unichar.h Fri Jul 20 17:25:16 2007 +0300
@@ -31,4 +31,13 @@ static inline unsigned int uni_utf8_char
return uni_utf8_non1_bytes[(uint8_t)chr - (192 + 2)];
}
+/* Return given character in titlecase. */
+unichar_t uni_ucs4_to_titlecase(unichar_t chr);
+
+/* Convert UTF-8 input to titlecase and decompose the titlecase characters to
+ output buffer. Returns 0 if ok, -1 if input was invalid. This generates
+ output that's compatible with i;unicode-casemap comparator. */
+int uni_utf8_to_decomposed_titlecase(const void *input, size_t max_len,
+ buffer_t *output);
+
#endif
diff -r 6d2bee707053 -r 04b9eb27283c src/lib/unicodemap.pl
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib/unicodemap.pl Fri Jul 20 17:25:16 2007 +0300
@@ -0,0 +1,134 @@
+#!/usr/bin/env perl
+use strict;
+
+# Parallel key/value lists that are later printed as C arrays. The "16" lists
+# hold entries that fit in 16 bits, the "32" lists hold the rest.
+my (@titlecase16_keys, @titlecase16_values);
+my (@titlecase32_keys, @titlecase32_values);
+my (@uni16_decomp_keys, @uni16_decomp_values);
+my (@uni32_decomp_keys, @uni32_decomp_values);
+# For multi-character decompositions, $multidecomp_offsets[i] is the index in
+# @multidecomp_values where the 0-terminated decomposition of
+# $multidecomp_keys[i] starts.
+my (@multidecomp_keys, @multidecomp_offsets, @multidecomp_values);
+
+# Read UnicodeData.txt, one ';'-separated record per line. Field 0 is the code
+# point, field 5 the decomposition and field 14 the titlecase mapping.
+# All numbers are parsed with hex() and never with eval(): the input file may
+# have been downloaded, so its contents must not be executed as Perl code.
+# Violated size assumptions exit with a non-zero status so that the build
+# fails instead of silently using a truncated output file.
+while (<>) {
+	chomp $_;
+	my @arr = split(";");
+	my $code = hex($arr[0]);
+	my $decomp = $arr[5];
+	my $titlecode = $arr[14];
+
+	if ($titlecode ne "") {
+		# titlecase mapping
+		my $value = hex($titlecode);
+		if ($value == $code) {
+			# the same character, ignore
+		} elsif ($code <= 0xffff && $value <= 0xffff) {
+			push @titlecase16_keys, $code;
+			push @titlecase16_values, $value;
+		} else {
+			push @titlecase32_keys, $code;
+			push @titlecase32_values, $value;
+		}
+	} elsif ($decomp =~ /\<[^>]*> (.+)/) {
+		# decompositions
+		my $decomp_codes = $1;
+		if ($decomp_codes =~ /^([0-9A-Z]*)$/i) {
+			# unicharacter decomposition. use separate lists for this
+			my $value = hex($1);
+			if ($value > 0xffff) {
+				print STDERR "We've assumed decomposition codes are max. 16bit\n";
+				exit 1;
+			}
+			if ($code <= 0xffff) {
+				push @uni16_decomp_keys, $code;
+				push @uni16_decomp_values, $value;
+			} else {
+				push @uni32_decomp_keys, $code;
+				push @uni32_decomp_values, $value;
+			}
+		} else {
+			# multicharacter decomposition.
+			if ($code > 0xffff) {
+				print STDERR "We've assumed multi-decomposition key codes are max. 16bit\n";
+				exit 1;
+			}
+
+			push @multidecomp_keys, $code;
+			push @multidecomp_offsets, scalar(@multidecomp_values);
+
+			foreach my $dcode (split(" ", $decomp_codes)) {
+				my $value = hex($dcode);
+				if ($value > 0xffff) {
+					print STDERR "We've assumed decomposition codes are max. 16bit\n";
+					exit 1;
+				}
+				push @multidecomp_values, $value;
+			}
+			# terminator that the C side uses to find the end
+			push @multidecomp_values, 0;
+		}
+	}
+}
+
+# Print the numbers in the referenced array as "0x%04x" literals separated by
+# commas, eight per line, with continuation lines starting with a tab. Nothing
+# is printed after the final element and nothing at all for an empty list.
+sub print_list {
+	my @items = @{$_[0]};
+
+	for my $i (0 .. $#items) {
+		printf("0x%04x", $items[$i]);
+		last if ($i == $#items);
+
+		print ",";
+		# break the line after every eighth value
+		print(((($i + 1) % 8) == 0) ? "\n\t" : " ");
+	}
+}
+
+print "/* This file is automatically generated by unicodemap.pl from UnicodeData.txt
+
+ NOTE: decompositions for characters having titlecase characters
+ are not included, because we first translate everything to titlecase */\n";
+
+print "static uint16_t titlecase16_keys[] = {\n\t";
+print_list(\@titlecase16_keys);
+print "\n};\n";
+
+print "static uint16_t titlecase16_values[] = {\n\t";
+print_list(\@titlecase16_values);
+print "\n};\n";
+
+print "static uint32_t titlecase32_keys[] = {\n\t";
+print_list(\@titlecase32_keys);
+print "\n};\n";
+
+print "static uint32_t titlecase32_values[] = {\n\t";
+print_list(\@titlecase32_values);
+print "\n};\n";
+
+print "static uint16_t uni16_decomp_keys[] = {\n\t";
+print_list(\@uni16_decomp_keys);
+print "\n};\n";
+
+print "static uint16_t uni16_decomp_values[] = {\n\t";
+print_list(\@uni16_decomp_values);
+print "\n};\n";
+
+print "static uint32_t uni32_decomp_keys[] = {\n\t";
More information about the dovecot-cvs
mailing list