dovecot-2.2: lib-fts: tr29 cleanup - consistently call valid chars "token" and "non-token" chars
dovecot at dovecot.org
dovecot at dovecot.org
Mon Jun 1 18:32:01 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/62b201a1ee06
changeset: 18772:62b201a1ee06
user: Timo Sirainen <tss at iki.fi>
date: Mon Jun 01 21:16:35 2015 +0300
description:
lib-fts: tr29 cleanup - consistently refer to valid chars as "token" chars and to all other chars as "non-token" chars,
instead of mixing the terms word/token/text.
diffstat:
src/lib-fts/fts-tokenizer-generic.c | 14 +++++++-------
1 files changed, 7 insertions(+), 7 deletions(-)
diffs (46 lines):
diff -r 04ea590951c1 -r 62b201a1ee06 src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c Mon Jun 01 21:11:55 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c Mon Jun 01 21:16:35 2015 +0300
@@ -513,9 +513,8 @@
TODO: Does this "reverse approach" include too much in "whitespace"?
TODO: Possibly use is_word_break()?
*/
-static bool is_nonword(enum letter_type lt)
+static bool is_nontoken(enum letter_type lt)
{
-
if (lt == LETTER_TYPE_REGIONAL_INDICATOR || lt == LETTER_TYPE_KATAKANA ||
lt == LETTER_TYPE_HEBREW_LETTER || lt == LETTER_TYPE_ALETTER ||
lt == LETTER_TYPE_NUMERIC)
@@ -561,7 +560,7 @@
i_assert(len > 0);
len--;
}
- /* we're skipping all non-text at the beginning of the word,
+ /* we're skipping all non-token chars at the beginning of the word,
so by this point we must have something here - even if we just
deleted the last character */
i_assert(len > 0);
@@ -596,7 +595,7 @@
* No word boundary at Start-Of-Text or End-of-Text (Wb1 and WB2).
* Break just once, not before and after.
* Break at MidNumLet, except apostrophes (diverging from WB6/WB7).
- * Other things also (e.g. is_nonword(), not really pure tr29. Meant
+ * Other things also (e.g. is_nontoken(), not really pure tr29. Meant
to assist in finding individual words.
*/
static bool
@@ -637,9 +636,10 @@
i_unreached();
i += uni_utf8_char_bytes(data[i]);
lt = letter_type(c);
- if (tok->prev_letter == LETTER_TYPE_NONE && is_nonword(lt)) {
- /* TODO: test that start_skip works with multibyte utf8 chars */
- start_skip = i; /* Skip non-token chars at start of data */
+ if (tok->prev_letter == LETTER_TYPE_NONE && is_nontoken(lt)) {
+ /* Skip non-token chars at the beginning of token */
+ i_assert(tok->token->used == 0);
+ start_skip = i;
continue;
}
if (uni_found_word_boundary(tok, lt)) {
More information about the dovecot-cvs mailing list