[dovecot-cvs] dovecot/src/plugins/fts-squat squat-trie.c, 1.9, 1.10 squat-trie.h, 1.3, 1.4

tss at dovecot.org tss at dovecot.org
Wed Dec 13 13:46:25 UTC 2006


Update of /var/lib/cvs/dovecot/src/plugins/fts-squat
In directory talvi:/tmp/cvs-serv28927

Modified Files:
	squat-trie.c squat-trie.h 
Log Message:
Previously only ASCII text was actually indexed. UTF-8 input is now
converted to UTF-16 before being indexed.



Index: squat-trie.c
===================================================================
RCS file: /var/lib/cvs/dovecot/src/plugins/fts-squat/squat-trie.c,v
retrieving revision 1.9
retrieving revision 1.10
diff -u -d -r1.9 -r1.10
--- squat-trie.c	13 Dec 2006 13:08:28 -0000	1.9
+++ squat-trie.c	13 Dec 2006 13:46:23 -0000	1.10
@@ -10,6 +10,7 @@
 #include "read-full.h"
 #include "write-full.h"
 #include "mmap-util.h"
+#include "unichar.h"
 #include "squat-uidlist.h"
 #include "squat-trie.h"
 
@@ -19,8 +20,8 @@
 #include <fcntl.h>
 #include <ctype.h>
 
-/* normalization changes 0..32 -> 0 */
-#define MAX_8BIT_CHAR_COUNT (256 - 32)
+/* 8bit character counter holds only 255, so we can't use 256. */
+#define MAX_8BIT_CHAR_COUNT 255
 
 #define FAST_8BIT_LEVEL 2
 
@@ -228,9 +229,10 @@
 	return value;
 }
 
-static const void *data_normalize(const void *data, size_t size, buffer_t *dest)
+static const uint16_t *
+data_normalize(const void *data, size_t size, buffer_t *dest)
 {
-	const uint8_t *src = data;
+	const unsigned char *src = data;
 	size_t i;
 
 	buffer_set_used_size(dest, 0);
@@ -239,12 +241,24 @@
 
 		if (src[i] <= 32)
 			chr = 0;
-		else if (src[i] > 'z')
-			chr = src[i] - 32 - 26;
-		else
+		else if (src[i] <= 'z')
 			chr = i_toupper(src[i]) - 32;
+		else if (src[i] < 128)
+			chr = src[i] - 32 - 26;
+		else {
+			/* UTF-8 input */
+			unichar_t uchr;
+
+			/* FIXME: can we do anything better than just
+			   truncate with >16bit values? */
+			uchr = uni_utf8_get_char_len(src+i, size-i);
+			uchr -= 32 - 26;
+			chr = uchr < (uint16_t)-1 ? uchr : 0;
+			i += uni_utf8_skip[src[i] & 0xff] - 1;
+		}
 		buffer_append(dest, &chr, sizeof(chr));
 	}
+
 	return dest->data;
 }
 
@@ -389,7 +403,7 @@
 	chars8_offset = p - trie->const_mmap_base;
 	chars8_size = chars8_count * (sizeof(uint8_t) + sizeof(uint32_t));
 
-	if (chars8_count > 256 ||
+	if (chars8_count > MAX_8BIT_CHAR_COUNT ||
 	    chars8_offset + chars8_size > trie->mmap_size) {
 		squat_trie_set_corrupted(trie, "trie offset broken");
 		return -1;
@@ -900,7 +914,7 @@
 
 	if (level <= FAST_8BIT_LEVEL) {
 		uint8_t *chars;
-		unsigned int chars16_count = chr >= 256 ? 1 : 0;
+		unsigned int chars16_count = chr >= MAX_8BIT_CHAR_COUNT ? 1 : 0;
 
 		node = i_malloc(sizeof(*node) +
 				ALIGN(MAX_8BIT_CHAR_COUNT) +
@@ -920,7 +934,7 @@
 			chrp = (uint16_t *)&chars[i];
 			*chrp = chr;
 		}
-	} else if (chr < 256) {
+	} else if (chr < MAX_8BIT_CHAR_COUNT) {
 		uint8_t *chrp;
 
 		idx_offset += ALIGN(sizeof(*chrp));
@@ -963,7 +977,7 @@
 		node->chars_16bit_count * idx_size;
 	old_size = sizeof(*node) + old_size_8bit + old_size_16bit;
 
-	if (chr < 256) {
+	if (chr < MAX_8BIT_CHAR_COUNT) {
 		new_idx_offset = sizeof(*node) +
 			ALIGN(node->chars_8bit_count + sizeof(uint8_t));
 		new_size = new_idx_offset + old_size_16bit +
@@ -976,7 +990,7 @@
 	}
 
 	new_node = t_buffer_get(new_size);
-	if (chr < 256) {
+	if (chr < MAX_8BIT_CHAR_COUNT) {
 		hole1_pos = sizeof(*node) + char_idx;
 		old_idx_offset = sizeof(*node) + ALIGN(node->chars_8bit_count);
 	} else {
@@ -988,7 +1002,7 @@
 	hole2_pos = old_idx_offset + idx_size * char_idx;
 
 	memcpy(new_node, node, hole1_pos);
-	if (chr < 256) {
+	if (chr < MAX_8BIT_CHAR_COUNT) {
 		uint8_t *chrp = PTR_OFFSET(new_node, hole1_pos);
 		*chrp = chr;
 		new_node->chars_8bit_count++;
@@ -1034,10 +1048,9 @@
 	bool modified = FALSE;
 	int ret;
 
-	if (*data < 256) {
+	if (*data < MAX_8BIT_CHAR_COUNT) {
 		unsigned int count;
 
-		i_assert(*data < MAX_8BIT_CHAR_COUNT);
 		if (node == NULL) {
 			ctx->node_count++;
 			node = *parent = node_alloc(*data, level);
@@ -1132,7 +1145,7 @@
 	if (node == NULL)
 		return 0;
 
-	if (*data < 256) {
+	if (*data < MAX_8BIT_CHAR_COUNT) {
 		if (level <= FAST_8BIT_LEVEL)
 			char_idx = *data;
 		else {
@@ -1229,7 +1242,7 @@
 }
 
 int squat_trie_build_more(struct squat_trie_build_context *ctx, uint32_t uid,
-			  const void *data, size_t size)
+			  const unsigned char *data, size_t size)
 {
 	const uint16_t *str;
 	uint16_t buf[(BLOCK_SIZE-1)*2];
@@ -1287,10 +1300,10 @@
 			}
 		}
 	}
-
 	ctx->prev_added_size = I_MIN(size, BLOCK_SIZE-1);
 	memcpy(ctx->prev_added, str + i,
 	       sizeof(ctx->prev_added[0]) * ctx->prev_added_size);
+
 	t_pop();
 	return 0;
 }
@@ -1549,6 +1562,7 @@
 static void squat_trie_compress_chars8(struct trie_node *node)
 {
 	uint8_t *chars = NODE_CHARS8(node);
+	uint16_t *chars16, *old_chars16 = NODE_CHARS16(node, 0);
 	struct trie_node **child_src = NODE_CHILDREN8(node);
 	struct trie_node **child_dest;
 	unsigned int i, j, old_count;
@@ -1566,6 +1580,13 @@
 		if (child_src[i] != NULL)
 			child_dest[j++] = child_src[i];
 	}
+
+	if (node->chars_16bit_count > 0) {
+		chars16 = NODE_CHARS16(node, 0);
+		memmove(chars16, old_chars16,
+			ALIGN(sizeof(*chars16) * node->chars_16bit_count) +
+			sizeof(*child_src) * node->chars_16bit_count);
+	}
 }
 
 static void squat_trie_compress_chars16(struct trie_node *node)
@@ -1646,6 +1667,11 @@
 	bool need_char_compress = FALSE;
 
 	for (i = 0; i < count; i++) {
+		if (children[i] == NULL) {
+			need_char_compress = TRUE;
+			continue;
+		}
+
 		child_idx = POINTER_CAST_TO(children[i], size_t);
 		i_assert((child_idx & 1) != 0);
 		child_idx &= ~1;

Index: squat-trie.h
===================================================================
RCS file: /var/lib/cvs/dovecot/src/plugins/fts-squat/squat-trie.h,v
retrieving revision 1.3
retrieving revision 1.4
diff -u -d -r1.3 -r1.4
--- squat-trie.h	6 Dec 2006 23:43:15 -0000	1.3
+++ squat-trie.h	13 Dec 2006 13:46:23 -0000	1.4
@@ -18,7 +18,7 @@
 struct squat_trie_build_context *
 squat_trie_build_init(struct squat_trie *trie, uint32_t *last_uid_r);
 int squat_trie_build_more(struct squat_trie_build_context *ctx, uint32_t uid,
-			  const void *data, size_t size);
+			  const unsigned char *data, size_t size);
 int squat_trie_build_deinit(struct squat_trie_build_context *ctx);
 
 int squat_trie_compress(struct squat_trie *trie,



More information about the dovecot-cvs mailing list