dovecot-2.2: lib-fts: simple tokenizer cleanup - make prev_letter updating more explicit.
dovecot at dovecot.org
dovecot at dovecot.org
Mon Jun 1 18:32:01 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/33547eaa0cac
changeset: 18774:33547eaa0cac
user: Timo Sirainen <tss at iki.fi>
date: Mon Jun 01 21:27:09 2015 +0300
description:
lib-fts: simple tokenizer cleanup - make prev_letter updating more explicit.
Previously the update was hidden inside one of the functions, which left
prev_letter in an inconsistent state when a word break was found. The value
of prev_letter didn't actually matter at that point, but the behavior is now
more consistent.
diffstat:
src/lib-fts/fts-tokenizer-generic.c | 32 +++++++++++++++++++-------------
1 files changed, 19 insertions(+), 13 deletions(-)
diffs (62 lines):
diff -r b239f075147b -r 33547eaa0cac src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c Mon Jun 01 21:19:47 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c Mon Jun 01 21:27:09 2015 +0300
@@ -163,18 +163,14 @@
return FALSE;
}
-static bool
-fts_apostrophe_word_break(struct generic_fts_tokenizer *tok, unichar_t c)
+static inline bool
+fts_simple_is_word_break(struct generic_fts_tokenizer *tok,
+ unichar_t c, bool apostrophe)
{
- if (IS_APOSTROPHE(c)) {
- if (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
- return TRUE;
- else
- tok->prev_letter = LETTER_TYPE_SINGLE_QUOTE;
- } else {
- tok->prev_letter = LETTER_TYPE_NONE;
- }
- return FALSE;
+ if (apostrophe)
+ return tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE;
+ else
+ return fts_ascii_word_break(c) || fts_uni_word_break(c);
}
static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
@@ -223,13 +219,15 @@
size_t i, start = 0;
unsigned int char_size;
unichar_t c;
+ bool apostrophe;
for (i = 0; i < size; i += char_size) {
if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
i_unreached();
char_size = uni_utf8_char_bytes(data[i]);
- if (fts_ascii_word_break(data[i]) || fts_uni_word_break(c) ||
- fts_apostrophe_word_break(tok, c)) {
+
+ apostrophe = IS_APOSTROPHE(c);
+ if (fts_simple_is_word_break(tok, c, apostrophe)) {
tok_append_truncated(tok, data + start, i - start);
if (tok->token->used > 0 &&
fts_tokenizer_generic_simple_current_token(tok, token_r)) {
@@ -237,6 +235,14 @@
return 1;
}
start = i + char_size;
+ /* it doesn't actually matter at this point whether
+ subsequent apostrophes are handled by prefix
+ skipping or by ignoring empty tokens - they will be
+ dropped in any case. */
+ tok->prev_letter = LETTER_TYPE_NONE;
+ } else {
+ tok->prev_letter = apostrophe ?
+ LETTER_TYPE_SINGLE_QUOTE : LETTER_TYPE_NONE;
}
}
/* word boundary not found yet */
More information about the dovecot-cvs
mailing list