[dovecot-cvs] dovecot: Rewrote some code and cleaned up the API

Mon Jun 11 04:37:54 EEST 2007

details:   http://hg.dovecot.org/dovecot/rev/8101787cdd1c
changeset: 5683:8101787cdd1c
user:      Timo Sirainen <tss at iki.fi>
date:      Mon Jun 11 04:37:29 2007 +0300
description:
Rewrote some code and cleaned up the API

diffstat:

2 files changed, 122 insertions(+), 189 deletions(-)
src/lib/unichar.c |  289 +++++++++++++++++++----------------------------------
src/lib/unichar.h |   22 ++--

diffs (truncated from 387 to 300 lines):

diff -r ff5ba9cb6cd0 -r 8101787cdd1c src/lib/unichar.c

--- a/src/lib/unichar.c	Mon Jun 11 02:27:55 2007 +0300
+++ b/src/lib/unichar.c	Mon Jun 11 04:37:29 2007 +0300
@@ -1,49 +1,15 @@
-/* Copyright (C) 2005 Timo Sirainen */
-
-/* Contains code from GLIB:
- *
- * Copyright (C) 1999 Tom Tromey
- * Copyright (C) 2000 Red Hat, Inc.
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
+/* Copyright (C) 2005-2007 Timo Sirainen */
 
 #include "lib.h"
 #include "buffer.h"
 #include "unichar.h"
 
-#define UTF8_LENGTH(Char)              \
-  ((Char) < 0x80 ? 1 :                 \
-   ((Char) < 0x800 ? 2 :               \
-    ((Char) < 0x10000 ? 3 :            \
-     ((Char) < 0x200000 ? 4 :          \
-      ((Char) < 0x4000000 ? 5 : 6)))))
-
-static const char utf8_skip_data[256] = {
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+static const uint8_t utf8_non1_bytes[256 - 192 - 2] = {
+	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+	3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
 };
 
-const char *const uni_utf8_skip = utf8_skip_data;
+const uint8_t *const uni_utf8_non1_bytes = utf8_non1_bytes;
 
 unsigned int uni_strlen(const unichar_t *str)
 {
@@ -54,146 +20,71 @@ unsigned int uni_strlen(const unichar_t 
 	return len;
 }
 
-unichar_t uni_utf8_get_char(const char *input)
+int uni_utf8_get_char(const char *input, unichar_t *chr_r)
 {
-	return uni_utf8_get_char_len((const unsigned char *)input, (size_t)-1);
+	return uni_utf8_get_char_n((const unsigned char *)input, (size_t)-1,
+				   chr_r);
 }
 
-unichar_t uni_utf8_get_char_len(const unsigned char *input, size_t max_len)
+int uni_utf8_get_char_n(const void *_input, size_t max_len, unichar_t *chr_r)
 {
-  unsigned int i, len;
-  unichar_t wc = *input;
+	const unsigned char *input = _input;
+	unichar_t chr;
+	unsigned int i, len;
+	int ret;
 
-  i_assert(max_len > 0);
+	i_assert(max_len > 0);
 
-  if (wc < 0x80)
-    {
-      return wc;
-    }
-  else if (wc < 0xc0)
-    {
-      return (unichar_t)-1;
-    }
-  else if (wc < 0xe0)
-    {
-      len = 2;
-      wc &= 0x1f;
-    }
-  else if (wc < 0xf0)
-    {
-      len = 3;
-      wc &= 0x0f;
-    }
-  else if (wc < 0xf8)
-    {
-      len = 4;
-      wc &= 0x07;
-    }
-  else if (wc < 0xfc)
-    {
-      len = 5;
-      wc &= 0x03;
-    }
-  else if (wc < 0xfe)
-    {
-      len = 6;
-      wc &= 0x01;
-    }
-  else
-    {
-      return (unichar_t)-1;
-    }
-
-  if (max_len != (size_t)-1 && len > max_len)
-    {
-      for (i = 1; i < max_len; i++)
-	{
-	  if ((input[i] & 0xc0) != 0x80)
-	    return (unichar_t)-1;
-	}
-      return (unichar_t)-2;
-    }
-
-  for (i = 1; i < len; ++i)
-    {
-      if ((input[i] & 0xc0) != 0x80)
-	{
-	  if (input[i] != '\0')
-	    return (unichar_t)-1;
-	  else
-	    return (unichar_t)-2;
+	if (*input < 0x80) {
+		*chr_r = *input;
+		return 1;
 	}
 
-      wc <<= 6;
-      wc |= (input[i] & 0x3f);
-    }
+	/* The first byte has its `len' highest bits set, followed by a zero
+	   bit. Its remaining bits are the highest bits of the value. */
+	chr = *input;
+	len = uni_utf8_char_bytes(*input);
+	switch (len) {
+	case 2:
+		chr &= 0x1f;
+		break;
+	case 3:
+		chr &= 0x0f;
+		break;
+	case 4:
+		chr &= 0x07;
+		break;
+	case 5:
+		chr &= 0x03;
+		break;
+	case 6:
+		chr &= 0x01;
+		break;
+	default:
+		/* len==1 is valid only for 7bit chars, so this byte is invalid */
+		i_assert(len == 1);
+		return -1;
+	}
 
-  if (UTF8_LENGTH(wc) != len)
-    return (unichar_t)-1;
-  
-  return wc;
-}
+	if (len <= max_len)
+		ret = 1;
+	else {
+		/* check first if the input is invalid before returning 0 */
+		ret = 0;
+		len = max_len;
+	}
 
-/**
- * g_unichar_to_utf8:
- * @c: a ISO10646 character code
- * @outbuf: output buffer, must have at least 6 bytes of space.
- *       If %NULL, the length will be computed and returned
- *       and nothing will be written to @outbuf.
- * 
- * Converts a single character to UTF-8.
- * 
- * Return value: number of bytes written
- **/
-static int
-g_unichar_to_utf8(unichar_t c, char *outbuf)
-{
-  unsigned int len = 0;
-  int first;
-  int i;
+	/* the following bytes must all be 10xxxxxx */
+	for (i = 1; i < len; i++) {
+		if ((input[i] & 0xc0) != 0x80)
+			return input[i] == '\0' ? 0 : -1;
 
-  if (c < 0x80)
-    {
-      first = 0;
-      len = 1;
-    }
-  else if (c < 0x800)
-    {
-      first = 0xc0;
-      len = 2;
-    }
-  else if (c < 0x10000)
-    {
-      first = 0xe0;
-      len = 3;
-    }
-   else if (c < 0x200000)
-    {
-      first = 0xf0;
-      len = 4;
-    }
-  else if (c < 0x4000000)
-    {
-      first = 0xf8;
-      len = 5;
-    }
-  else
-    {
-      first = 0xfc;
-      len = 6;
-    }
+		chr <<= 6;
+		chr |= input[i] & 0x3f;
+	}
 
-  if (outbuf)
-    {
-      for (i = len - 1; i > 0; --i)
-	{
-	  outbuf[i] = (c & 0x3f) | 0x80;
-	  c >>= 6;
-	}
-      outbuf[0] = c | first;
-    }
-
-  return len;
+	*chr_r = chr;
+	return ret;
 }
 
 int uni_utf8_to_ucs4(const char *input, buffer_t *output)
@@ -201,12 +92,11 @@ int uni_utf8_to_ucs4(const char *input, 
 	unichar_t chr;
 
 	while (*input != '\0') {
-		chr = uni_utf8_get_char(input);
-		if (chr & 0x80000000) {
+		if (uni_utf8_get_char(input, &chr) <= 0) {
 			/* invalid input */
 			return -1;
 		}
-                input = uni_utf8_next_char(input);
+                input += uni_utf8_char_bytes(*input);
 
 		buffer_append(output, &chr, sizeof(chr));
 	}
@@ -215,24 +105,59 @@ int uni_utf8_to_ucs4(const char *input, 
 
 void uni_ucs4_to_utf8(const unichar_t *input, size_t len, buffer_t *output)
 {
-	void *buf;
-	int char_len;
-
-	for (; *input != '\0' && len > 0; input++, len--) {
-		buf = buffer_append_space_unsafe(output, 6);
-		char_len = g_unichar_to_utf8(*input, buf);
-		buffer_set_used_size(output, output->used - 6 + char_len);
-	}
+	for (; *input != '\0' && len > 0; input++, len--)
+		uni_ucs4_to_utf8_c(*input, output);
 }
 
-unsigned int uni_utf8_strlen_n(const void *input, size_t size)
+void uni_ucs4_to_utf8_c(unichar_t chr, buffer_t *output)
 {
-	const uint8_t *data = (const uint8_t *)input;
+	unsigned char first;
+	int bitpos;
+
+	if (chr < 0x80) {
+		buffer_append_c(output, chr);
+		return;
+	}