dovecot-2.2: lib-fts: Add Unicode TR29 rule WB5a setting to tokenizer.
dovecot at dovecot.org
dovecot at dovecot.org
Mon Aug 17 10:46:08 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/f7f6ec738683
changeset: 18944:f7f6ec738683
user: Teemu Huovila <teemu.huovila at dovecot.fi>
date: Mon Aug 17 13:18:03 2015 +0300
description:
lib-fts: Add Unicode TR29 rule WB5a setting to tokenizer.
Splits a contracted prefix from its base word,
e.g. "l'homme" -> "l" "homme". Combined with a language-specific stopword list,
unnecessary contractions can thus be filtered away.
This is disabled by default and only works with the TR29 algorithm.
Enable it with "fts_tokenizer_generic = algorithm=tr29 wb5a=yes".
diffstat:
src/lib-fts/fts-common.h | 37 ++++++++++++++++-
src/lib-fts/fts-tokenizer-generic-private.h | 5 ++
src/lib-fts/fts-tokenizer-generic.c | 64 +++++++++++++++++++++++++--
src/lib-fts/test-fts-tokenizer.c | 65 ++++++++++++++++++++++++++++-
4 files changed, 163 insertions(+), 8 deletions(-)
diffs (truncated from 334 to 300 lines):
diff -r 0994a6619380 -r f7f6ec738683 src/lib-fts/fts-common.h
--- a/src/lib-fts/fts-common.h Mon Aug 17 13:15:11 2015 +0300
+++ b/src/lib-fts/fts-common.h Mon Aug 17 13:18:03 2015 +0300
@@ -6,5 +6,40 @@
((c) == 0x2019 || (c) == 0xFF07)
#define IS_APOSTROPHE(c) \
((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
-
+#define IS_WB5A_APOSTROPHE(c) \
+ ((c) == 0x0027 || (c) == 0x2019)
+/* The letter h is included because it is an exception in French.
+ A, E, H, I, O, U, Y, a, e, h, i, o, u, y */
+#define IS_ASCII_VOWEL(c) \
+ ((c) == 0x0041 || (c) == 0x0045 || (c) == 0x0048 || (c) == 0x0049 || \
+ (c) == 0x004F || (c) == 0x0055 || (c) == 0x0059 || (c) == 0x0061 || \
+ (c) == 0x0065 || (c) == 0x0068 || (c) == 0x0069 || (c) == 0x006F || \
+ (c) == 0x0075 || (c) == 0x0079)
+#define IS_NONASCII_VOWEL(c) \
+ /*latin capital letter a with grave, acute and circumflex*/ \
+ ((c) == 0x00C0 || (c) == 0x00C1 || (c) == 0x00C2 || \
+ /* latin capital letter e with grave, acute and circumflex */ \
+ (c) == 0x00C8 || (c) == 0x00C9 || (c) == 0x00CA || \
+ /* latin capital letter i with grave, acute and circumflex */ \
+ (c) == 0x00CC || (c) == 0x00CD || (c) == 0x00CE || \
+ /* latin capital letter o with grave, acute and circumflex */ \
+ (c) == 0x00D2 || (c) == 0x00D3 || (c) == 0x00D4 || \
+ /* latin capital letter u with grave, acute and circumflex */ \
+ (c) == 0x00D9 || (c) == 0x00DA || (c) == 0x00DB || \
+ /* latin capital letter y with acute */ \
+ (c) == 0x00DD || \
+ /* latin small letter a with grave, acute and circumflex */ \
+ (c) == 0x00E0 || (c) == 0x00E1 || (c) == 0x00E2 || \
+ /* latin small letter e with grave, acute and circumflex */ \
+ (c) == 0x00E8 || (c) == 0x00E9 || (c) == 0x00EA || \
+ /* latin small letter i with grave, acute and circumflex */ \
+ (c) == 0x00EC || (c) == 0x00ED || (c) == 0x00EE || \
+ /* latin small letter o with grave, acute and circumflex */ \
+ (c) == 0x00F2 || (c) == 0x00F3 || (c) == 0x00F4 || \
+ /* latin small letter u with grave, acute and circumflex */ \
+ (c) == 0x00F9 || (c) == 0x00FA || (c) == 0x00FB || \
+ /* latin small letter y with acute */ \
+ (c) == 0x00FD )
+#define IS_VOWEL(c) \
+ (IS_ASCII_VOWEL(c) || IS_NONASCII_VOWEL(c))
#endif
diff -r 0994a6619380 -r f7f6ec738683 src/lib-fts/fts-tokenizer-generic-private.h
--- a/src/lib-fts/fts-tokenizer-generic-private.h Mon Aug 17 13:15:11 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic-private.h Mon Aug 17 13:18:03 2015 +0300
@@ -40,6 +40,11 @@
struct generic_fts_tokenizer {
struct fts_tokenizer tokenizer;
unsigned int max_length;
+ bool wb5a; /* TR29 rule for prefix separation
+ in e.g. French or Italian. */
+ bool seen_wb5a;
+ unichar_t prev_letter_c;
+ unichar_t letter_c;
enum boundary_algorithm algorithm;
enum letter_type prev_letter;
enum letter_type prev_prev_letter;
diff -r 0994a6619380 -r f7f6ec738683 src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c Mon Aug 17 13:15:11 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c Mon Aug 17 13:18:03 2015 +0300
@@ -2,6 +2,7 @@
#include "lib.h"
#include "buffer.h"
+#include "str.h"
#include "unichar.h"
#include "bsearch-insert-pos.h"
#include "fts-common.h"
@@ -11,6 +12,7 @@
#include "word-break-data.c"
#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30
+#define FTS_WB5A_PREFIX_MAX_LENGTH 3 /* Including apostrophe */
static unsigned char fts_ascii_word_breaks[128] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
@@ -32,6 +34,7 @@
struct generic_fts_tokenizer *tok;
unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
+ bool wb5a = FALSE;
unsigned int i;
for (i = 0; settings[i] != NULL; i += 2) {
@@ -57,12 +60,22 @@
} else if (strcmp(key, "search") == 0) {
/* tokenizing a search string -
makes no difference to us */
+ } else if (strcasecmp(key, "wb5a") == 0) {
+ if (strcasecmp(value, "no") == 0)
+ wb5a = FALSE;
+ else
+ wb5a = TRUE;
} else {
*error_r = t_strdup_printf("Unknown setting: %s", key);
return -1;
}
}
+ if (wb5a && algo != BOUNDARY_ALGORITHM_TR29) {
+ *error_r = "Can not use WB5a for algorithms other than TR29.";
+ return -1;
+ }
+
tok = i_new(struct generic_fts_tokenizer, 1);
if (algo == BOUNDARY_ALGORITHM_TR29)
tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
@@ -70,6 +83,7 @@
tok->tokenizer.v = &generic_tokenizer_vfuncs_simple;
tok->max_length = max_length;
tok->algorithm = algo;
+ tok->wb5a = wb5a;
tok->token = buffer_create_dynamic(default_pool, 64);
*tokenizer_r = &tok->tokenizer;
@@ -369,6 +383,14 @@
static bool letter_aletter(struct generic_fts_tokenizer *tok)
{
+
+ /* WB5a */
+ if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH)
+ if (IS_WB5A_APOSTROPHE(tok->prev_letter_c) && IS_VOWEL(tok->letter_c)) {
+ tok->seen_wb5a = TRUE;
+ return TRUE;
+ }
+
/* WB5 */
if (tok->prev_letter == LETTER_TYPE_ALETTER)
return FALSE;
@@ -489,8 +511,8 @@
return TRUE; /* Any / Any */
}
+
static bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
-
{
return TRUE; /* Any / Any */
}
@@ -498,11 +520,17 @@
static void
add_prev_letter(struct generic_fts_tokenizer *tok, enum letter_type lt)
{
- if(tok->prev_letter != LETTER_TYPE_NONE) {
+ if(tok->prev_letter != LETTER_TYPE_NONE)
tok->prev_prev_letter = tok->prev_letter;
- tok->prev_letter = lt;
- } else
- tok->prev_letter = lt;
+ tok->prev_letter = lt;
+}
+
+static void
+add_letter_c(struct generic_fts_tokenizer *tok, unichar_t c)
+{
+ if(tok->letter_c != 0)
+ tok->prev_letter_c = tok->letter_c;
+ tok->letter_c = c;
}
/*
@@ -544,6 +572,7 @@
return FALSE;
}
+
static void
fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
const char **token_r)
@@ -569,12 +598,23 @@
tok->prev_prev_letter = LETTER_TYPE_NONE;
tok->prev_letter = LETTER_TYPE_NONE;
-
*token_r = t_strndup(data, len);
buffer_set_used_size(tok->token, 0);
tok->untruncated_length = 0;
}
+static void wb5a_reinsert(struct generic_fts_tokenizer *tok)
+{
+ string_t *utf8_str = t_str_new(6);
+
+ uni_ucs4_to_utf8_c(tok->letter_c, utf8_str);
+ buffer_insert(tok->token, 0, str_data(utf8_str), str_len(utf8_str));
+ tok->prev_letter = letter_type(tok->letter_c);
+ tok->letter_c = 0;
+ tok->prev_letter_c = 0;
+ tok->seen_wb5a = FALSE;
+}
+
struct letter_fn {
bool (*fn)(struct generic_fts_tokenizer *tok);
};
@@ -599,6 +639,8 @@
(ALetter | Hebrew_Letter) and MidNumLetQ (MidNumLet | Single_Quote).
Adaptions:
+ * Added optional WB5a as a configurable option. The prefix that is cut off
+ is at most FTS_WB5A_PREFIX_MAX_LENGTH long, including the apostrophe.
* No word boundary at Start-Of-Text or End-of-Text (WB1 and WB2).
* Break just once, not before and after.
* Break at MidNumLet, except apostrophes (diverging from WB6/WB7).
@@ -644,12 +686,22 @@
i += char_size;
lt = letter_type(c);
+ /* The WB5a break is detected only when the "after
+ break" char is inspected. That char needs to be
+ reinserted as the "previous char". */
+ if (tok->seen_wb5a)
+ wb5a_reinsert(tok);
+
if (tok->prev_letter == LETTER_TYPE_NONE && is_nontoken(lt)) {
/* Skip non-token chars at the beginning of token */
i_assert(tok->token->used == 0);
start_pos = i;
continue;
}
+
+ if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH)
+ add_letter_c(tok, c);
+
if (uni_found_word_boundary(tok, lt)) {
i_assert(char_start_i >= start_pos && size >= start_pos);
tok_append_truncated(tok, data + start_pos,
diff -r 0994a6619380 -r f7f6ec738683 src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c Mon Aug 17 13:15:11 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c Mon Aug 17 13:18:03 2015 +0300
@@ -52,8 +52,10 @@
"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",
/* TR29 MidNumLet U+FF0E at end: u+FF0E is EF BC 8E */
- "hello world\xEF\xBC\x8E"
+ "hello world\xEF\xBC\x8E",
+ /* TR29 WB5a */
+ "l\xE2\x80\x99homme l\xE2\x80\x99humanit\xC3\xA9 d\xE2\x80\x99immixtions qu\xE2\x80\x99il aujourd'hui que'euq"
};
static void test_fts_tokenizer_find(void)
@@ -172,6 +174,8 @@
"hello", "world", NULL,
+ "l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
+
NULL
};
struct fts_tokenizer *tok;
@@ -229,6 +233,7 @@
"hello", "world", NULL,
+ "l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
NULL
};
struct fts_tokenizer *tok;
@@ -241,6 +246,63 @@
test_end();
}
+const char *const tr29_settings_wb5a[] = {"algorithm", "tr29", "wb5a", "yes", NULL};
+
+/* TODO: U+206F is in "Format" and therefore currently not word break.
+ This definitely needs to be remapped. */
+static void test_fts_tokenizer_generic_tr29_wb5a(void)
+{
+ static const char *const expected_output[] = {
+ "hello", "world", "And",
+ "there", "was", "text", "galor\xC3\xA9",
+ "abc", "example", "com", "Bar", "Baz",
+ "bar", "example", "org", "foo", "domain",
+ "1234567890123456789012345678ä",
+ "12345678901234567890123456789",
+ "123456789012345678901234567890",
+ "and", "longlonglongabcdefghijklmnopqr",
+ "more", "Hello", "world", "3", "14", "3,14", "last", NULL,
+
+ "1", NULL,
+
+ "quoted", "text", "word", "hlo", "words", "you're", "bad",
+ "word", "pre", "post", NULL,
+
+ "1234567890123456789012345678ä",
+ "123456789012345678901234567x'",
+ "1234567890123456789012345678x'",
+ "1234567890123456789012345678x",
+ "1234567890123456789012345678x",
+ "12345678901234567890123456789x",
+ "12345678901234567890123456789x",
+ "123456789012345678901234567890",
+ "123456789012345678901234567890",
+
+ "quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
+ "word", "pre", "post", NULL,
More information about the dovecot-cvs
mailing list