dovecot-2.2: lib-fts: Improve use of max_length in tr29 tokenizer

Sat May 9 08:32:35 UTC 2015

details:   http://hg.dovecot.org/dovecot-2.2/rev/b0a934361563
changeset: 18560:b0a934361563
user:      Teemu Huovila <teemu.huovila at dovecot.fi>
date:      Sat May 09 11:17:03 2015 +0300
description:
lib-fts: Improve use of max_length in tr29 tokenizer

diffstat:

 src/lib-fts/fts-tokenizer-generic.c |  17 +++++++++--------
 src/lib-fts/test-fts-tokenizer.c    |   5 +++--
 2 files changed, 12 insertions(+), 10 deletions(-)

diffs (69 lines):

diff -r 2048dade16e7 -r b0a934361563 src/lib-fts/fts-tokenizer-generic.c

--- a/src/lib-fts/fts-tokenizer-generic.c	Sat May 09 11:16:22 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c	Sat May 09 11:17:03 2015 +0300
@@ -469,11 +469,14 @@
                                          const char **token_r)
 {
 	size_t end_skip = 0;
+	ssize_t len;
 
 	if (is_one_past_end(tok))
 		end_skip = tok->last_size;
 
-	*token_r = t_strndup(tok->token->data, tok->token->used - end_skip);
+	len = I_MIN(tok->token->used, tok->max_length) - end_skip;
+	i_assert(len > 0);
+	*token_r = t_strndup(tok->token->data, len);
 	buffer_set_used_size(tok->token, 0);
 	tok->prev_prev_letter = LETTER_TYPE_NONE;
 	tok->prev_letter = LETTER_TYPE_NONE;
@@ -525,7 +528,7 @@
 		(struct generic_fts_tokenizer *)_tok;
 
 	unichar_t c;
-	size_t i, char_start_i, start_skip = 0, len;
+	size_t i, char_start_i, start_skip = 0;
 	enum letter_type lt;
 
 	/* TODO: Process 8bit chars separately, to speed things up. */
@@ -542,17 +545,15 @@
 			continue;
 		}
 		if (uni_found_word_boundary(tok, lt)) {
-			len =  I_MIN(char_start_i, tok->max_length);
-			i_assert(len >= start_skip && size >= start_skip);
+			i_assert(char_start_i >= start_skip && size >= start_skip);
 			buffer_append(tok->token, data + start_skip,
-			              len - start_skip);
+			              char_start_i - start_skip);
 			*skip_r = i + 1;
 			return fts_tokenizer_generic_tr29_current_token(tok, token_r);
 		}
 	}
-	len =  I_MIN(i, tok->max_length);
-	i_assert(len >= start_skip && size >= start_skip);
-	buffer_append(tok->token, data + start_skip, len - start_skip);
+	i_assert(i >= start_skip && size >= start_skip);
+	buffer_append(tok->token, data + start_skip, i - start_skip);
 	*skip_r = i;
 
 	if (size == 0 && tok->token->used > 0) {
diff -r 2048dade16e7 -r b0a934361563 src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c	Sat May 09 11:16:22 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c	Sat May 09 11:17:03 2015 +0300
@@ -126,12 +126,13 @@
 {
 	static const unsigned char input[] =
 		"hello world\r\n\nAnd there\twas: text "
-		"galore, and more.\n\n (\"Hello world\")3.14 3,14 last 1.";
+		"galore, and more.\n\n (\"Hello world\")3.14 3,14 last"
+		" longlonglongabcdefghijklmnopqrstuvwxyz 1.";
 	static const char *const expected_output[] = {
 		"hello", "world", "And",
 		"there", "was", "text", "galore",
 		"and", "more", "Hello", "world", "3.14",
-		"3,14", "last", "1", NULL
+		"3,14", "last", "longlonglongabcdefghijklmnopqr", "1", NULL
 	};
 	const struct fts_tokenizer *tok_class;
 	struct fts_tokenizer *tok;