dovecot-2.2: lib-fts: tokenizers - don't include removed apostrophes as part of the token size

dovecot at dovecot.org dovecot at dovecot.org
Mon Jun 1 19:01:24 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/e3f9e4c8a338
changeset: 18778:e3f9e4c8a338
user:      Timo Sirainen <tss at iki.fi>
date:      Mon Jun 01 21:48:59 2015 +0300
description:
lib-fts: tokenizers - don't include removed apostrophes as part of the token size

diffstat:

 src/lib-fts/fts-tokenizer-generic.c |  13 +++++++++++++
 src/lib-fts/test-fts-tokenizer.c    |   6 ++++++
 2 files changed, 19 insertions(+), 0 deletions(-)

diffs (53 lines):

diff -r f44961c66a48 -r e3f9e4c8a338 src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c	Mon Jun 01 21:35:39 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c	Mon Jun 01 21:48:59 2015 +0300
@@ -180,6 +180,19 @@
 	size_t append_len, pos = 0, appended = 0;
 	unichar_t c;
 
+	if (size == 0)
+		return;
+	if (data[0] == '\'' && tok->token->used == 0) {
+		/* Skip apostrophes in the beginning of the token.
+		   We need to do it here so that we don't truncate the
+		   token too early. */
+		data++;
+		size--;
+		if (size == 0)
+			return;
+		i_assert(data[0] != '\'');
+	}
+
 	i_assert(tok->max_length >= tok->token->used);
 	append_len = I_MIN(size, tok->max_length - tok->token->used);
 
diff -r f44961c66a48 -r e3f9e4c8a338 src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c	Mon Jun 01 21:35:39 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c	Mon Jun 01 21:48:59 2015 +0300
@@ -31,6 +31,8 @@
 
 	"' ' '' ''' 'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
 
+	"'1234567890123456789012345678ä,"
+
 	/* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
 	   U+205A(e2 81 9a) and U+205F(e2 81 9f) */
 	"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
@@ -136,6 +138,8 @@
 		"quoted", "text", "word", "hlo", "words", "you're", "bad",
 		"word", "pre", "post", NULL,
 
+		"1234567890123456789012345678ä",
+
 		"hello", "world", "And",
 		"there", "was", "text", "galore",
 		"and", "more", NULL,
@@ -178,6 +182,8 @@
 		"quoted", "text", "word", "hlo", "words", "you're", "bad",
 		"word", "pre", "post", NULL,
 
+		"1234567890123456789012345678ä",
+
 		"hello", "world", "And",
 		"there", "was", "text", "galore",
 		"and", "more", NULL,


More information about the dovecot-cvs mailing list