dovecot-2.2: lib-fts: tokenizers - Fixed removal of trailing character in truncated tokens.

Mon Jun 1 19:01:30 UTC 2015

details:   http://hg.dovecot.org/dovecot-2.2/rev/78d473873e12
changeset: 18781:78d473873e12
user:      Timo Sirainen <tss at iki.fi>
date:      Mon Jun 01 21:58:30 2015 +0300
description:
lib-fts: tokenizers - Fixed removal of trailing character in truncated tokens.
If the token is truncated, we don't want to remove the trailing character
since it's not actually there.

Also, we don't want to remove trailing apostrophes from a truncated word,
because they aren't actually at the end of the (untruncated) token.
This doesn't make a big difference, but it's slightly more correct.

diffstat:

 src/lib-fts/fts-tokenizer-generic-private.h |   1 +
 src/lib-fts/fts-tokenizer-generic.c         |   9 +++++++--
 src/lib-fts/test-fts-tokenizer.c            |  24 ++++++++++++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)

diffs (113 lines):

diff -r b6510dfd396f -r 78d473873e12 src/lib-fts/fts-tokenizer-generic-private.h

--- a/src/lib-fts/fts-tokenizer-generic-private.h	Mon Jun 01 21:51:33 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic-private.h	Mon Jun 01 21:58:30 2015 +0300
@@ -43,6 +43,7 @@
 	enum boundary_algorithm algorithm;
 	enum letter_type prev_letter;
 	enum letter_type prev_prev_letter;
+	size_t untruncated_length;
 	buffer_t *token;
 };
 
diff -r b6510dfd396f -r 78d473873e12 src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c	Mon Jun 01 21:51:33 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c	Mon Jun 01 21:58:30 2015 +0300
@@ -107,7 +107,7 @@
 	const unsigned char *data;
 	size_t len = tok->token->used;
 
-	if (len > 0) {
+	if (len > 0 && tok->untruncated_length <= tok->max_length) {
 		/* Remove the trailing apostrophe - it was made
 		   into U+0027 earlier. There can be only a single such
 		   apostrophe, because otherwise the token would have already
@@ -124,6 +124,7 @@
 	*token_r = len == 0 ? "" :
 		fts_uni_strndup(tok->token->data, len);
 	buffer_set_used_size(tok->token, 0);
+	tok->untruncated_length = 0;
 	tok->prev_letter = LETTER_TYPE_NONE;
 	return (*token_r)[0] != '\0';
 }
@@ -176,6 +177,7 @@
 
 	tok->prev_letter = LETTER_TYPE_NONE;
 	tok->prev_prev_letter = LETTER_TYPE_NONE;
+	tok->untruncated_length = 0;
 	buffer_set_used_size(tok->token, 0);
 }
 
@@ -184,6 +186,7 @@
 {
 	buffer_append(tok->token, data,
 		      I_MIN(size, tok->max_length - tok->token->used));
+	tok->untruncated_length += size;
 }
 
 static int
@@ -541,7 +544,8 @@
 	const unsigned char *data = tok->token->data;
 	ssize_t len = tok->token->used;
 
-	if (is_one_past_end(tok)) {
+	if (is_one_past_end(tok) &&
+	    tok->untruncated_length <= tok->max_length) {
 		/* delete the last character */
 		while ((data[len-1] & 0x80) != 0)
 			len--;
@@ -558,6 +562,7 @@
 
 	*token_r = fts_uni_strndup(data, len);
 	buffer_set_used_size(tok->token, 0);
+	tok->untruncated_length = 0;
 }
 
 struct letter_fn {
diff -r b6510dfd396f -r 78d473873e12 src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c	Mon Jun 01 21:51:33 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c	Mon Jun 01 21:58:30 2015 +0300
@@ -32,6 +32,14 @@
 	"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
 
 	"'1234567890123456789012345678ä,"
+	"123456789012345678901234567x'ä,"
+	"1234567890123456789012345678x're,"
+	"1234567890123456789012345678x',"
+	"1234567890123456789012345678x'',"
+	"12345678901234567890123456789x',"
+	"12345678901234567890123456789x'',"
+	"123456789012345678901234567890x',"
+	"123456789012345678901234567890x'',"
 
 	/* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
 	   U+205A(e2 81 9a) and U+205F(e2 81 9f) */
@@ -139,6 +147,14 @@
 		"word", "pre", "post", NULL,
 
 		"1234567890123456789012345678ä",
+		"123456789012345678901234567x'",
+		"1234567890123456789012345678x'",
+		"1234567890123456789012345678x",
+		"1234567890123456789012345678x",
+		"12345678901234567890123456789x",
+		"12345678901234567890123456789x",
+		"123456789012345678901234567890",
+		"123456789012345678901234567890",
 
 		"hello", "world", "And",
 		"there", "was", "text", "galore",
@@ -183,6 +199,14 @@
 		"word", "pre", "post", NULL,
 
 		"1234567890123456789012345678ä",
+		"123456789012345678901234567x'",
+		"1234567890123456789012345678x'",
+		"1234567890123456789012345678x",
+		"1234567890123456789012345678x",
+		"12345678901234567890123456789x",
+		"12345678901234567890123456789x",
+		"123456789012345678901234567890",
+		"123456789012345678901234567890",
 
 		"hello", "world", "And",
 		"there", "was", "text", "galore",