dovecot-2.2: lib-fts: simple tokenizer cleanup - make prev_letter updating more explicit.

Mon Jun 1 18:32:01 UTC 2015

details:   http://hg.dovecot.org/dovecot-2.2/rev/33547eaa0cac
changeset: 18774:33547eaa0cac
user:      Timo Sirainen <tss at iki.fi>
date:      Mon Jun 01 21:27:09 2015 +0300
description:
lib-fts: simple tokenizer cleanup - make prev_letter updating more explicit.
Until now the update was hidden inside one of the functions, which meant
prev_letter wasn't updated consistently when a word break was found. It didn't
actually matter what prev_letter was at that point, but now the behavior is
more consistent.

diffstat:

 src/lib-fts/fts-tokenizer-generic.c |  32 +++++++++++++++++++-------------
 1 files changed, 19 insertions(+), 13 deletions(-)

diffs (62 lines):

diff -r b239f075147b -r 33547eaa0cac src/lib-fts/fts-tokenizer-generic.c

--- a/src/lib-fts/fts-tokenizer-generic.c	Mon Jun 01 21:19:47 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c	Mon Jun 01 21:27:09 2015 +0300
@@ -163,18 +163,14 @@
 	return FALSE;
 }
 
-static bool
-fts_apostrophe_word_break(struct generic_fts_tokenizer *tok, unichar_t c)
+static inline bool
+fts_simple_is_word_break(struct generic_fts_tokenizer *tok,
+			 unichar_t c, bool apostrophe)
 {
-	if (IS_APOSTROPHE(c)) {
-		if (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
-			return TRUE;
-		else
-			tok->prev_letter = LETTER_TYPE_SINGLE_QUOTE;
-	} else {
-		tok->prev_letter = LETTER_TYPE_NONE;
-	}
-	return FALSE;
+	if (apostrophe)
+		return tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE;
+	else
+		return fts_ascii_word_break(c) || fts_uni_word_break(c);
 }
 
 static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
@@ -223,13 +219,15 @@
 	size_t i, start = 0;
 	unsigned int char_size;
 	unichar_t c;
+	bool apostrophe;
 
 	for (i = 0; i < size; i += char_size) {
 		if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
 			i_unreached();
 		char_size = uni_utf8_char_bytes(data[i]);
-		if (fts_ascii_word_break(data[i]) || fts_uni_word_break(c) ||
-		    fts_apostrophe_word_break(tok, c)) {
+
+		apostrophe = IS_APOSTROPHE(c);
+		if (fts_simple_is_word_break(tok, c, apostrophe)) {
 			tok_append_truncated(tok, data + start, i - start);
 			if (tok->token->used > 0 &&
 			    fts_tokenizer_generic_simple_current_token(tok, token_r)) {
@@ -237,6 +235,14 @@
 				return 1;
 			}
 			start = i + char_size;
+			/* it doesn't actually matter at this point whether
+			   subsequent apostrophes are handled by prefix
+			   skipping or by ignoring empty tokens - they will be
+			   dropped in any case. */
+			tok->prev_letter = LETTER_TYPE_NONE;
+		} else {
+			tok->prev_letter = apostrophe ?
+				LETTER_TYPE_SINGLE_QUOTE : LETTER_TYPE_NONE;
 		}
 	}
 	/* word boundary not found yet */