dovecot-2.2: lib-fts: Optimized truncation of partial trailing UTF-8 characters in tokenizers.

Tue Jun 2 19:03:32 UTC 2015

details:   http://hg.dovecot.org/dovecot-2.2/rev/633ab1934f1f
changeset: 18804:633ab1934f1f
user:      Timo Sirainen <tss at iki.fi>
date:      Tue Jun 02 22:01:07 2015 +0300
description:
lib-fts: Optimized truncation of partial trailing UTF-8 characters in tokenizers.

diffstat:

 src/lib-fts/fts-tokenizer-generic.c |  42 +++++++++++++++++++++++++-----------
 1 files changed, 29 insertions(+), 13 deletions(-)

diffs (101 lines):

diff -r f227c2318e02 -r 633ab1934f1f src/lib-fts/fts-tokenizer-generic.c

--- a/src/lib-fts/fts-tokenizer-generic.c	Tue Jun 02 21:56:29 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c	Tue Jun 02 22:01:07 2015 +0300
@@ -90,43 +90,56 @@
 	i_free(tok);
 }
 
-static const char *fts_uni_strndup(const unsigned char *data, size_t size)
+static void
+fts_tokenizer_delete_trailing_partial_char(const unsigned char *data,
+					   size_t *len)
 {
 	size_t pos;
+	unsigned int char_bytes;
 
-	/* if input is truncated with a partial UTF-8 character, drop it */
-	(void)uni_utf8_partial_strlen_n(data, size, &pos);
-	i_assert(pos > 0);
-	return t_strndup(data, pos);
+	/* the token is truncated - make sure the last character
+	   exists entirely in the token */
+	for (pos = *len-1; pos > 0; pos--) {
+		if ((data[pos] & 0x80) == 0 ||
+		    ((data[pos] & (0x80|0x40)) == (0x80|0x40)))
+			break;
+	}
+	char_bytes = uni_utf8_char_bytes(data[pos]);
+	if (char_bytes != *len-pos) {
+		i_assert(char_bytes > *len-pos);
+		*len = pos;
+	}
 }
 
 static bool
 fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
                                            const char **token_r)
 {
-	const unsigned char *data;
+	const unsigned char *data = tok->token->data;
 	size_t len = tok->token->used;
 
-	if (len > 0 && tok->untruncated_length <= tok->max_length) {
+	if (tok->untruncated_length <= tok->max_length) {
 		/* Remove the trailing apostrophe - it was made
 		   into U+0027 earlier. There can be only a single such
 		   apostrophe, because otherwise the token would have already
 		   been split. We also want to remove the trailing apostrophe
 		   only if it's the the last character in the nontruncated
 		   token - a truncated token may end with apostrophe. */
-		data = tok->token->data;
-		if (data[len-1] == '\'') {
+		if (len > 0 && data[len-1] == '\'') {
 			len--;
 			i_assert(len > 0 && data[len-1] != '\'');
 		}
+	} else {
+		fts_tokenizer_delete_trailing_partial_char(data, &len);
 	}
+	i_assert(len <= tok->max_length);
 
 	*token_r = len == 0 ? "" :
-		fts_uni_strndup(tok->token->data, len);
+		t_strndup(tok->token->data, len);
 	buffer_set_used_size(tok->token, 0);
 	tok->untruncated_length = 0;
 	tok->prev_letter = LETTER_TYPE_NONE;
-	return (*token_r)[0] != '\0';
+	return len > 0;
 }
 
 static bool uint32_find(const uint32_t *data, unsigned int count,
@@ -541,7 +554,7 @@
                                          const char **token_r)
 {
 	const unsigned char *data = tok->token->data;
-	ssize_t len = tok->token->used;
+	size_t len = tok->token->used;
 
 	if (is_one_past_end(tok) &&
 	    tok->untruncated_length <= tok->max_length) {
@@ -551,16 +564,19 @@
 			len--;
 		i_assert(len > 0);
 		len--;
+	} else if (tok->untruncated_length > tok->max_length) {
+		fts_tokenizer_delete_trailing_partial_char(data, &len);
 	}
 	/* we're skipping all non-token chars at the beginning of the word,
 	   so by this point we must have something here - even if we just
 	   deleted the last character */
 	i_assert(len > 0);
+	i_assert(len <= tok->max_length);
 
 	tok->prev_prev_letter = LETTER_TYPE_NONE;
 	tok->prev_letter = LETTER_TYPE_NONE;
 
-	*token_r = fts_uni_strndup(data, len);
+	*token_r = t_strndup(data, len);
 	buffer_set_used_size(tok->token, 0);
 	tok->untruncated_length = 0;
 }