dovecot-2.2: lib-fts: Optimization for tr29 - we don't need to track last_size explicitly

dovecot at dovecot.org dovecot at dovecot.org
Mon Jun 1 18:31:45 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/3ff93cabcac3
changeset: 18769:3ff93cabcac3
user:      Timo Sirainen <tss at iki.fi>
date:      Mon Jun 01 21:08:27 2015 +0300
description:
lib-fts: Optimization for tr29 - we don't need to track last_size explicitly

diffstat:

 src/lib-fts/fts-tokenizer-generic-private.h |   1 -
 src/lib-fts/fts-tokenizer-generic.c         |  20 +++++++++++---------
 2 files changed, 11 insertions(+), 10 deletions(-)

diffs (60 lines):

diff -r 6e459e8c3a5b -r 3ff93cabcac3 src/lib-fts/fts-tokenizer-generic-private.h
--- a/src/lib-fts/fts-tokenizer-generic-private.h	Mon Jun 01 18:35:58 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic-private.h	Mon Jun 01 21:08:27 2015 +0300
@@ -43,7 +43,6 @@
 	enum boundary_algorithm algorithm;
 	enum letter_type prev_letter;
 	enum letter_type prev_prev_letter;
-	size_t last_size; /* Bytes in latest utf8 character. */
 	buffer_t *token;
 };
 
diff -r 6e459e8c3a5b -r 3ff93cabcac3 src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c	Mon Jun 01 18:35:58 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c	Mon Jun 01 21:08:27 2015 +0300
@@ -184,7 +184,6 @@
 
 	tok->prev_letter = LETTER_TYPE_NONE;
 	tok->prev_prev_letter = LETTER_TYPE_NONE;
-	tok->last_size = 0;
 	buffer_set_used_size(tok->token, 0);
 }
 
@@ -552,17 +551,21 @@
 fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
                                          const char **token_r)
 {
-	size_t end_skip = 0;
-	ssize_t len;
+	const unsigned char *data = tok->token->data;
+	ssize_t len = tok->token->used;
 
-	if (is_one_past_end(tok))
-		end_skip = tok->last_size;
+	if (is_one_past_end(tok)) {
+		/* delete the last character */
+		while ((data[len-1] & 0x80) != 0)
+			len--;
+		i_assert(len > 0);
+		len--;
+	}
 
 	tok->prev_prev_letter = LETTER_TYPE_NONE;
 	tok->prev_letter = LETTER_TYPE_NONE;
 
-	len = tok->token->used - end_skip;
-	*token_r = len == 0 ? "" : fts_uni_strndup(tok->token->data, len);
+	*token_r = len == 0 ? "" : fts_uni_strndup(data, len);
 	buffer_set_used_size(tok->token, 0);
 	return len > 0;
 }
@@ -629,8 +632,7 @@
 		char_start_i = i;
 		if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
 			i_unreached();
-		tok->last_size = uni_utf8_char_bytes(data[i]);
-		i += tok->last_size - 1; /* Utf8 bytes > 1, for() handles the 1 byte increment. */
+		i += uni_utf8_char_bytes(data[i]) - 1; /* Utf8 bytes > 1, for() handles the 1 byte increment. */
 		lt = letter_type(c);
 		if (tok->prev_letter == LETTER_TYPE_NONE && is_nonword(lt)) {
 			/* TODO: test that start_skip works with multibyte utf8 chars */


More information about the dovecot-cvs mailing list