dovecot-2.2: lib-fts: Fixed tr29 tokenizer to delete last character correctly when it's preceded by non-ASCII

dovecot at dovecot.org dovecot at dovecot.org
Tue Jun 2 17:52:47 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/97b5c5e88540
changeset: 18796:97b5c5e88540
user:      Timo Sirainen <tss at iki.fi>
date:      Tue Jun 02 20:50:23 2015 +0300
description:
lib-fts: Fixed tr29 tokenizer to delete last character correctly when it's preceded by non-ASCII

diffstat:

 src/lib-fts/fts-tokenizer-generic.c |  3 ++-
 src/lib-fts/test-fts-tokenizer.c    |  6 +++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diffs (43 lines):

diff -r 0bcd3e9e77d4 -r 97b5c5e88540 src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c	Tue Jun 02 19:59:45 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c	Tue Jun 02 20:50:23 2015 +0300
@@ -546,7 +546,8 @@
 	if (is_one_past_end(tok) &&
 	    tok->untruncated_length <= tok->max_length) {
 		/* delete the last character */
-		while ((data[len-1] & 0x80) != 0)
+		while ((data[len-1] & 0x80) != 0 &&
+		       ((data[len-1] & (0x80|0x40)) != (0x80|0x40)))
 			len--;
 		i_assert(len > 0);
 		len--;
diff -r 0bcd3e9e77d4 -r 97b5c5e88540 src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c	Tue Jun 02 19:59:45 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c	Tue Jun 02 20:50:23 2015 +0300
@@ -17,7 +17,7 @@
 
 static const char *test_inputs[] = {
 	/* generic things and word truncation: */
-	"hello world\r\n\nAnd there\twas: text galore, "
+	"hello world\r\n\nAnd there\twas: text galor\xC3\xA9\xE2\x80\xA7 "
 	"abc@example.com, "
 	"Bar Baz <bar@example.org>, "
 	"foo@domain "
@@ -137,7 +137,7 @@
 {
 	static const char *const expected_output[] = {
 		"hello", "world", "And",
-		"there", "was", "text", "galore",
+		"there", "was", "text", "galor\xC3\xA9",
 		"abc", "example", "com", "Bar", "Baz",
 		"bar", "example", "org", "foo", "domain",
 		"1234567890123456789012345678ä",
@@ -194,7 +194,7 @@
 {
 	static const char *const expected_output[] = {
 		"hello", "world", "And",
-		"there", "was", "text", "galore",
+		"there", "was", "text", "galor\xC3\xA9",
 		"abc", "example", "com", "Bar", "Baz",
 		"bar", "example", "org", "foo", "domain",
 		"1234567890123456789012345678ä",


More information about the dovecot-cvs mailing list