[dovecot-cvs] dovecot: Rewrote some code and cleaned up the API
dovecot at dovecot.org
dovecot at dovecot.org
Mon Jun 11 04:37:54 EEST 2007
details: http://hg.dovecot.org/dovecot/rev/8101787cdd1c
changeset: 5683:8101787cdd1c
user: Timo Sirainen <tss at iki.fi>
date: Mon Jun 11 04:37:29 2007 +0300
description:
Rewrote some code and cleaned up the API
diffstat:
2 files changed, 122 insertions(+), 189 deletions(-)
src/lib/unichar.c | 289 +++++++++++++++++++----------------------------------
src/lib/unichar.h | 22 ++--
diffs (truncated from 387 to 300 lines):
diff -r ff5ba9cb6cd0 -r 8101787cdd1c src/lib/unichar.c
--- a/src/lib/unichar.c Mon Jun 11 02:27:55 2007 +0300
+++ b/src/lib/unichar.c Mon Jun 11 04:37:29 2007 +0300
@@ -1,49 +1,15 @@
-/* Copyright (C) 2005 Timo Sirainen */
-
-/* Contains code from GLIB:
- *
- * Copyright (C) 1999 Tom Tromey
- * Copyright (C) 2000 Red Hat, Inc.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
+/* Copyright (C) 2005-2007 Timo Sirainen */
#include "lib.h"
#include "buffer.h"
#include "unichar.h"
-#define UTF8_LENGTH(Char) \
- ((Char) < 0x80 ? 1 : \
- ((Char) < 0x800 ? 2 : \
- ((Char) < 0x10000 ? 3 : \
- ((Char) < 0x200000 ? 4 : \
- ((Char) < 0x4000000 ? 5 : 6)))))
-
-static const char utf8_skip_data[256] = {
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
- 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};
-const char *const uni_utf8_skip = utf8_skip_data;
+const uint8_t *const uni_utf8_non1_bytes = utf8_non1_bytes;
unsigned int uni_strlen(const unichar_t *str)
{
@@ -54,146 +20,71 @@ unsigned int uni_strlen(const unichar_t
return len;
}
-unichar_t uni_utf8_get_char(const char *input)
+int uni_utf8_get_char(const char *input, unichar_t *chr_r)
{
- return uni_utf8_get_char_len((const unsigned char *)input, (size_t)-1);
+ return uni_utf8_get_char_n((const unsigned char *)input, (size_t)-1,
+ chr_r);
}
-unichar_t uni_utf8_get_char_len(const unsigned char *input, size_t max_len)
+int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
{
- unsigned int i, len;
- unichar_t wc = *input;
+ const unsigned char *input = _input;
+ unichar_t chr;
+ unsigned int i, len;
+ int ret;
- i_assert(max_len > 0);
+ i_assert(max_len > 0);
- if (wc < 0x80)
- {
- return wc;
- }
- else if (wc < 0xc0)
- {
- return (unichar_t)-1;
- }
- else if (wc < 0xe0)
- {
- len = 2;
- wc &= 0x1f;
- }
- else if (wc < 0xf0)
- {
- len = 3;
- wc &= 0x0f;
- }
- else if (wc < 0xf8)
- {
- len = 4;
- wc &= 0x07;
- }
- else if (wc < 0xfc)
- {
- len = 5;
- wc &= 0x03;
- }
- else if (wc < 0xfe)
- {
- len = 6;
- wc &= 0x01;
- }
- else
- {
- return (unichar_t)-1;
- }
-
- if (max_len != (size_t)-1 && len > max_len)
- {
- for (i = 1; i < max_len; i++)
- {
- if ((input[i] & 0xc0) != 0x80)
- return (unichar_t)-1;
- }
- return (unichar_t)-2;
- }
-
- for (i = 1; i < len; ++i)
- {
- if ((input[i] & 0xc0) != 0x80)
- {
- if (input[i] != '\0')
- return (unichar_t)-1;
- else
- return (unichar_t)-2;
+ if (*input < 0x80) {
+ *chr_r = *input;
+ return 1;
}
- wc <<= 6;
- wc |= (input[i] & 0x3f);
- }
+ /* first byte has len highest bits set, followed by zero bit.
+ the rest of the bits are used as the highest bits of the value. */
+ chr = *input;
+ len = uni_utf8_char_bytes(*input);
+ switch (len) {
+ case 2:
+ chr &= 0x1f;
+ break;
+ case 3:
+ chr &= 0x0f;
+ break;
+ case 4:
+ chr &= 0x07;
+ break;
+ case 5:
+ chr &= 0x03;
+ break;
+ case 6:
+ chr &= 0x01;
+ break;
+ default:
+ /* only 7bit chars should have len==1 */
+ i_assert(len == 1);
+ return -1;
+ }
- if (UTF8_LENGTH(wc) != len)
- return (unichar_t)-1;
-
- return wc;
-}
+ if (len <= max_len)
+ ret = 1;
+ else {
+ /* check first if the input is invalid before returning 0 */
+ ret = 0;
+ len = max_len;
+ }
-/**
- * g_unichar_to_utf8:
- * @c: a ISO10646 character code
- * @outbuf: output buffer, must have at least 6 bytes of space.
- * If %NULL, the length will be computed and returned
- * and nothing will be written to @outbuf.
- *
- * Converts a single character to UTF-8.
- *
- * Return value: number of bytes written
- **/
-static int
-g_unichar_to_utf8(unichar_t c, char *outbuf)
-{
- unsigned int len = 0;
- int first;
- int i;
+ /* the following bytes must all be 10xxxxxx */
+ for (i = 1; i < len; i++) {
+ if ((input[i] & 0xc0) != 0x80)
+ return input[i] == '\0' ? 0 : -1;
- if (c < 0x80)
- {
- first = 0;
- len = 1;
- }
- else if (c < 0x800)
- {
- first = 0xc0;
- len = 2;
- }
- else if (c < 0x10000)
- {
- first = 0xe0;
- len = 3;
- }
- else if (c < 0x200000)
- {
- first = 0xf0;
- len = 4;
- }
- else if (c < 0x4000000)
- {
- first = 0xf8;
- len = 5;
- }
- else
- {
- first = 0xfc;
- len = 6;
- }
+ chr <<= 6;
+ chr |= input[i] & 0x3f;
+ }
- if (outbuf)
- {
- for (i = len - 1; i > 0; --i)
- {
- outbuf[i] = (c & 0x3f) | 0x80;
- c >>= 6;
- }
- outbuf[0] = c | first;
- }
-
- return len;
+ *chr_r = chr;
+ return ret;
}
int uni_utf8_to_ucs4(const char *input, buffer_t *output)
@@ -201,12 +92,11 @@ int uni_utf8_to_ucs4(const char *input,
unichar_t chr;
while (*input != '\0') {
- chr = uni_utf8_get_char(input);
- if (chr & 0x80000000) {
+ if (uni_utf8_get_char(input, &chr) <= 0) {
/* invalid input */
return -1;
}
- input = uni_utf8_next_char(input);
+ input += uni_utf8_char_bytes(*input);
buffer_append(output, &chr, sizeof(chr));
}
@@ -215,24 +105,59 @@ int uni_utf8_to_ucs4(const char *input,
void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output)
{
- void *buf;
- int char_len;
-
- for (; *input != '\0' && len > 0; input++, len--) {
- buf = buffer_append_space_unsafe(output, 6);
- char_len = g_unichar_to_utf8(*input, buf);
- buffer_set_used_size(output, output->used - 6 + char_len);
- }
+ for (; *input != '\0' && len > 0; input++, len--)
+ uni_ucs4_to_utf8_c(*input, output);
}
-unsigned int uni_utf8_strlen_n(const void *input, size_t size)
+void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output)
{
- const uint8_t *data = (const uint8_t *)input;
+ unsigned char first;
+ int bitpos;
+
+ if (chr < 0x80) {
+ buffer_append_c(output, chr);
+ return;
+ }
More information about the dovecot-cvs mailing list