dovecot-2.2: lib-fts: Fix simple tokenizer apostrophe handling.
dovecot at dovecot.org
dovecot at dovecot.org
Thu May 21 10:39:16 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/6c655ce3b857
changeset: 18732:6c655ce3b857
user: Teemu Huovila <teemu.huovila at dovecot.fi>
date: Thu May 21 06:29:15 2015 -0400
description:
lib-fts: Fix simple tokenizer apostrophe handling.
Apostrophes and quotation marks are now treated as word breaks,
except U+0027 between non-word-break characters. The characters
U+2019 and U+FF07 are transformed to U+0027 before processing.
diffstat:
src/lib-fts/fts-tokenizer-generic-private.h | 3 +-
src/lib-fts/fts-tokenizer-generic.c | 110 ++++++++++++++++++++-------
src/lib-fts/test-fts-tokenizer.c | 9 ++
src/lib-fts/word-properties.pl | 2 +-
4 files changed, 93 insertions(+), 31 deletions(-)
diffs (254 lines):
diff -r 5ca59cffbf2f -r 6c655ce3b857 src/lib-fts/fts-tokenizer-generic-private.h
--- a/src/lib-fts/fts-tokenizer-generic-private.h Thu May 21 06:17:32 2015 -0400
+++ b/src/lib-fts/fts-tokenizer-generic-private.h Thu May 21 06:29:15 2015 -0400
@@ -40,8 +40,7 @@
struct fts_tokenizer tokenizer;
unsigned int max_length;
enum boundary_algorithm algorithm;
- enum letter_type prev_letter; /* These two are basically the
- state of the parsing. */
+ enum letter_type prev_letter;
enum letter_type prev_prev_letter;
size_t last_size; /* Bytes in latest utf8 character. */
buffer_t *token;
diff -r 5ca59cffbf2f -r 6c655ce3b857 src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c Thu May 21 06:17:32 2015 -0400
+++ b/src/lib-fts/fts-tokenizer-generic.c Thu May 21 06:29:15 2015 -0400
@@ -11,7 +11,12 @@
#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30
-static unsigned char fts_ascii_word_boundaries[128] = {
+#define IS_NONASCII_APOSTROPHE(c) \
+ ((c) == 0x2019 || (c) == 0xFF07)
+#define IS_APOSTROPHE(c) \
+ ((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
+
+static unsigned char fts_ascii_word_breaks[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 16-31 */
@@ -95,34 +100,60 @@
return t_strndup(data, pos);
}
-static void
+static bool
fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
const char **token_r)
{
- *token_r = fts_uni_strndup(tok->token->data, tok->token->used);
+ const unsigned char *data;
+ size_t start = 0, len;
+
+ /* clean trailing and starting apostrophes. they were all made
+ into U+0027 earlier. */
+ data = tok->token->data;
+ len = tok->token->used;
+ while (len > 0 && data[len - 1] == '\'')
+ len--;
+ while (start < len && data[start] == '\'')
+ start++;
+
+ *token_r = len - start == 0 ? "" :
+ fts_uni_strndup(CONST_PTR_OFFSET(tok->token->data, start),
+ len - start);
buffer_set_used_size(tok->token, 0);
+ return (*token_r)[0] != '\0';
}
-/* TODO: This is duplicated from unichar.c */
static bool uint32_find(const uint32_t *data, unsigned int count,
uint32_t value, unsigned int *idx_r)
{
BINARY_NUMBER_SEARCH(data, count, value, idx_r);
}
-static bool is_word_break(unichar_t c)
+static bool fts_ascii_word_break(unsigned char c)
+{
+ if (c < 0x80)
+ return fts_ascii_word_breaks[c] != 0;
+ return FALSE;
+}
+
+static bool fts_uni_word_break(unichar_t c)
{
unsigned int idx;
+ /* Override some apostrophes, which get special treatment. */
+ if (IS_APOSTROPHE(c))
+ return FALSE;
+
/* Unicode General Punctuation, including deprecated characters. */
if (c >= 0x2000 && c <= 0x206f)
return TRUE;
-
/* From word-break-data.c, which is generated from PropList.txt. */
if (uint32_find(White_Space, N_ELEMENTS(White_Space), c, &idx))
return TRUE;
if (uint32_find(Dash, N_ELEMENTS(Dash), c, &idx))
return TRUE;
+ if (uint32_find(Quotation_Mark, N_ELEMENTS(Quotation_Mark), c, &idx))
+ return TRUE;
if (uint32_find(Terminal_Punctuation, N_ELEMENTS(Terminal_Punctuation), c, &idx))
return TRUE;
if (uint32_find(STerm, N_ELEMENTS(STerm), c, &idx))
@@ -133,17 +164,17 @@
}
static bool
-data_is_word_boundary(const unsigned char *data, size_t size, size_t *i)
+fts_apostrophe_word_break(struct generic_fts_tokenizer *tok, unichar_t c)
{
- unichar_t c;
-
- if (data[*i] < 0x80)
- return fts_ascii_word_boundaries[data[*i]] != 0;
- /* unicode punctuation? */
- if (uni_utf8_get_char_n(data + *i, size - *i, &c) <= 0)
- i_unreached();
- *i += uni_utf8_char_bytes(data[*i]) - 1;
- return is_word_break(c);
+ if (IS_APOSTROPHE(c)) {
+ if (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
+ return TRUE;
+ else
+ tok->prev_letter = LETTER_TYPE_SINGLE_QUOTE;
+ } else {
+ tok->prev_letter = LETTER_TYPE_NONE;
+ }
+ return FALSE;
}
static void fts_tokenizer_generic_reset(struct fts_tokenizer *_tok)
@@ -160,10 +191,26 @@
static void tok_append_truncated(struct generic_fts_tokenizer *tok,
const unsigned char *data, size_t size)
{
+ size_t append_len, pos = 0, appended = 0;
+ unichar_t c;
+
i_assert(tok->max_length >= tok->token->used);
+ append_len = I_MIN(size, tok->max_length - tok->token->used);
- buffer_append(tok->token, data,
- I_MIN(size, tok->max_length - tok->token->used));
+ /* Append only one kind of apostrophes. Simplifies things when returning
+ token. */
+ while (pos < append_len) {
+ if (uni_utf8_get_char_n(data + pos, size - pos, &c) <= 0)
+ i_unreached();
+ if (IS_NONASCII_APOSTROPHE(c)) {
+ buffer_append(tok->token, data, pos);
+ buffer_append_c(tok->token, '\'');
+ appended = pos + 1;
+ }
+ pos += uni_utf8_char_bytes(data[pos]);
+ }
+ if (appended < append_len)
+ buffer_append(tok->token, data + appended, append_len - appended);
}
static int
@@ -175,21 +222,27 @@
struct generic_fts_tokenizer *tok =
(struct generic_fts_tokenizer *)_tok;
size_t i, char_start_i, len, start = 0;
+ unsigned int char_size;
+ unichar_t c;
- for (i = 0; i < size; i++) {
+ for (i = 0; i < size; i += char_size) {
char_start_i = i;
- if (data_is_word_boundary(data, size, &i)) {
+ if (uni_utf8_get_char_n(data + i, size - i, &c) <= 0)
+ i_unreached();
+ char_size = uni_utf8_char_bytes(data[i]);
+ if (fts_ascii_word_break(data[i]) || fts_uni_word_break(c) ||
+ fts_apostrophe_word_break(tok, c)) {
len = char_start_i - start;
tok_append_truncated(tok, data + start, len);
if (tok->token->used == 0) {
- /* no text read yet */
- start = i + 1;
+ start = i + char_size;
continue;
}
- /* word boundary found - return a new token */
- *skip_r = i + 1;
- fts_tokenizer_generic_simple_current_token(tok, token_r);
- return 1;
+
+ if (fts_tokenizer_generic_simple_current_token(tok, token_r)) {
+ *skip_r = i + char_size;
+ return 1;
+ }
}
}
/* word boundary not found yet */
@@ -199,9 +252,10 @@
/* return the last token */
if (size == 0 && tok->token->used > 0) {
- fts_tokenizer_generic_simple_current_token(tok, token_r);
- return 1;
+ if (fts_tokenizer_generic_simple_current_token(tok, token_r))
+ return 1;
}
+
return 0;
}
diff -r 5ca59cffbf2f -r 6c655ce3b857 src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c Thu May 21 06:17:32 2015 -0400
+++ b/src/lib-fts/test-fts-tokenizer.c Thu May 21 06:29:15 2015 -0400
@@ -29,6 +29,8 @@
"1.",
+ "'quoted text' 'word' 'hlo words' you're bad'''word '''pre post'''",
+
/* whitespace: with Unicode(utf8) U+FF01(ef bc 81)(U+2000(e2 80 80) and
U+205A(e2 81 9a) and U+205F(e2 81 9f) */
"hello\xEF\xBC\x81world\r\nAnd\xE2\x80\x80there\twas: text "
@@ -99,6 +101,7 @@
outi++;
}
test_assert_idx(expected_output[outi] == NULL, outi);
+
return outi+1;
}
@@ -130,6 +133,9 @@
"1", NULL,
+ "quoted", "text", "word", "hlo", "words", "you're", "bad",
+ "word", "pre", "post", NULL,
+
"hello", "world", "And",
"there", "was", "text", "galore",
"and", "more", NULL,
@@ -169,6 +175,9 @@
"1", NULL,
+ "quoted", "text", "word", "hlo", "words", "you're", "bad",
+ "word", "pre", "post", NULL,
+
"hello", "world", "And",
"there", "was", "text", "galore",
"and", "more", NULL,
diff -r 5ca59cffbf2f -r 6c655ce3b857 src/lib-fts/word-properties.pl
--- a/src/lib-fts/word-properties.pl Thu May 21 06:17:32 2015 -0400
+++ b/src/lib-fts/word-properties.pl Thu May 21 06:29:15 2015 -0400
@@ -8,7 +8,7 @@
@categories = qw(CR LF Newline Extend Regional_Indicator Format Katakana Hebrew_Letter ALetter
Single_Quote Double_Quote MidNumLet MidLetter MidNum Numeric ExtendNumLet);
} elsif ($which eq 'breaks') {
- @categories = qw(White_Space Dash Terminal_Punctuation STerm Pattern_White_Space);
+ @categories = qw(White_Space Dash Quotation_Mark Terminal_Punctuation STerm Pattern_White_Space);
} else {
die "specify 'boundaries' or 'breaks'";
}
More information about the dovecot-cvs
mailing list