dovecot-2.2: lib-fts: Add Unicode TR29 rule WB5a setting to toke...

Mon Aug 17 10:46:08 UTC 2015

details:   http://hg.dovecot.org/dovecot-2.2/rev/f7f6ec738683
changeset: 18944:f7f6ec738683
user:      Teemu Huovila <teemu.huovila at dovecot.fi>
date:      Mon Aug 17 13:18:03 2015 +0300
description:
lib-fts: Add Unicode TR29 rule WB5a setting to tokenizer.
Splits a contracted prefix from its base word.
E.g. "l'homme" -> "l" "homme". Together with a language-specific stopword list,
unnecessary contractions can thus be filtered away.

This is disabled by default and only works with the TR29 algorithm.
Enable it with "fts_tokenizer_generic = algorithm=tr29 wb5a=yes"

diffstat:

 src/lib-fts/fts-common.h                    |  37 ++++++++++++++++-
 src/lib-fts/fts-tokenizer-generic-private.h |   5 ++
 src/lib-fts/fts-tokenizer-generic.c         |  64 +++++++++++++++++++++++++--
 src/lib-fts/test-fts-tokenizer.c            |  65 ++++++++++++++++++++++++++++-
 4 files changed, 163 insertions(+), 8 deletions(-)

diffs (truncated from 334 to 300 lines):

diff -r 0994a6619380 -r f7f6ec738683 src/lib-fts/fts-common.h

--- a/src/lib-fts/fts-common.h	Mon Aug 17 13:15:11 2015 +0300
+++ b/src/lib-fts/fts-common.h	Mon Aug 17 13:18:03 2015 +0300
@@ -6,5 +6,40 @@
 	((c) == 0x2019 || (c) == 0xFF07)
 #define IS_APOSTROPHE(c) \
 	((c) == 0x0027 || IS_NONASCII_APOSTROPHE(c))
-
+#define IS_WB5A_APOSTROPHE(c) \
+	((c) == 0x0027 || (c) == 0x2019)
+/* H and h are treated as vowels because French elides before them (l'homme).
+   A, E, H, I, O, U, Y, a, e, h, i, o, u, y */
+#define IS_ASCII_VOWEL(c) \
+	((c) == 0x0041 || (c) == 0x0045 || (c) == 0x0048 || (c) == 0x0049 || \
+	 (c) == 0x004F || (c) == 0x0055 || (c) == 0x0059 || (c) == 0x0061 || \
+	 (c) == 0x0065 || (c) == 0x0068 || (c) == 0x0069 || (c) == 0x006F || \
+	 (c) == 0x0075 || (c) == 0x0079)
+#define IS_NONASCII_VOWEL(c) \
+	/*latin capital letter a with grave, acute and circumflex*/ \
+	((c) == 0x00C0 || (c) == 0x00C1 || (c) == 0x00C2 || \
+	 /* latin capital letter e with grave, acute and circumflex */ \
+	 (c) == 0x00C8 || (c) == 0x00C9 || (c) == 0x00CA || \
+	 /*  latin capital letter i with grave, acute and circumflex */ \
+	 (c) == 0x00CC || (c) == 0x00CD || (c) == 0x00CE || \
+	 /*  latin capital letter o with grave, acute and circumflex */ \
+	 (c) == 0x00D2 || (c) == 0x00D3 || (c) == 0x00D4 || \
+	 /* latin capital letter u with grave, acute and circumflex */ \
+	 (c) == 0x00D9 || (c) == 0x00DA || (c) == 0x00DB || \
+	 /* latin capital letter y with acute */ \
+	 (c) == 0x00DD || \
+	 /* latin small letter a with grave, acute and circumflex */ \
+	 (c) == 0x00E0 || (c) == 0x00E1 || (c) == 0x00E2 || \
+	 /* latin small letter e with grave, acute and circumflex */ \
+	 (c) == 0x00E8 || (c) == 0x00E9 || (c) == 0x00EA || \
+	 /* latin small letter i with grave, acute and circumflex */ \
+	 (c) == 0x00EC || (c) == 0x00ED || (c) == 0x00EE || \
+	 /* latin small letter o with grave, acute and circumflex */ \
+	 (c) == 0x00F2 || (c) == 0x00F3 || (c) == 0x00F4 || \
+	 /* latin small letter u with grave, acute and circumflex */ \
+	 (c) == 0x00F9 || (c) == 0x00FA || (c) == 0x00FB || \
+	 /* latin small letter y with acute */ \
+	 (c) == 0x00FD )
+#define IS_VOWEL(c) \
+	(IS_ASCII_VOWEL(c) || IS_NONASCII_VOWEL(c))
 #endif
diff -r 0994a6619380 -r f7f6ec738683 src/lib-fts/fts-tokenizer-generic-private.h
--- a/src/lib-fts/fts-tokenizer-generic-private.h	Mon Aug 17 13:15:11 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic-private.h	Mon Aug 17 13:18:03 2015 +0300
@@ -40,6 +40,11 @@
 struct generic_fts_tokenizer {
 	struct fts_tokenizer tokenizer;
 	unsigned int max_length;
+	bool wb5a; /* TR29 rule for prefix separation
+	              in e.g. French or Italian. */
+	bool seen_wb5a;
+	unichar_t prev_letter_c;
+	unichar_t letter_c;
 	enum boundary_algorithm algorithm;
 	enum letter_type prev_letter;
 	enum letter_type prev_prev_letter;
diff -r 0994a6619380 -r f7f6ec738683 src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c	Mon Aug 17 13:15:11 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c	Mon Aug 17 13:18:03 2015 +0300
@@ -2,6 +2,7 @@
 
 #include "lib.h"
 #include "buffer.h"
+#include "str.h"
 #include "unichar.h"
 #include "bsearch-insert-pos.h"
 #include "fts-common.h"
@@ -11,6 +12,7 @@
 #include "word-break-data.c"
 
 #define FTS_DEFAULT_TOKEN_MAX_LENGTH 30
+#define FTS_WB5A_PREFIX_MAX_LENGTH 3 /* Including apostrophe */
 
 static unsigned char fts_ascii_word_breaks[128] = {
 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0-15 */
@@ -32,6 +34,7 @@
 	struct generic_fts_tokenizer *tok;
 	unsigned int max_length = FTS_DEFAULT_TOKEN_MAX_LENGTH;
 	enum boundary_algorithm algo = BOUNDARY_ALGORITHM_SIMPLE;
+	bool wb5a = FALSE;
 	unsigned int i;
 
 	for (i = 0; settings[i] != NULL; i += 2) {
@@ -57,12 +60,22 @@
 		} else if (strcmp(key, "search") == 0) {
 			/* tokenizing a search string -
 			   makes no difference to us */
+		} else if (strcasecmp(key, "wb5a") == 0) {
+			if (strcasecmp(value, "no") == 0)
+				wb5a = FALSE;
+			else
+				wb5a = TRUE;
 		} else {
 			*error_r = t_strdup_printf("Unknown setting: %s", key);
 			return -1;
 		}
 	}
 
+	if (wb5a && algo != BOUNDARY_ALGORITHM_TR29) {
+		*error_r = "Can not use WB5a for algorithms other than TR29.";
+		return -1;
+	}
+
 	tok = i_new(struct generic_fts_tokenizer, 1);
 	if (algo == BOUNDARY_ALGORITHM_TR29)
 		tok->tokenizer.v = &generic_tokenizer_vfuncs_tr29;
@@ -70,6 +83,7 @@
 		tok->tokenizer.v = &generic_tokenizer_vfuncs_simple;
 	tok->max_length = max_length;
 	tok->algorithm = algo;
+	tok->wb5a = wb5a;
 	tok->token = buffer_create_dynamic(default_pool, 64);
 
 	*tokenizer_r = &tok->tokenizer;
@@ -369,6 +383,14 @@
 
 static bool letter_aletter(struct generic_fts_tokenizer *tok)
 {
+
+	/* WB5a */
+	if (tok->wb5a && tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH)
+		if (IS_WB5A_APOSTROPHE(tok->prev_letter_c) && IS_VOWEL(tok->letter_c)) {
+			tok->seen_wb5a = TRUE;
+			return TRUE;
+		}
+
 	/* WB5 */
 	if (tok->prev_letter == LETTER_TYPE_ALETTER)
 		return FALSE;
@@ -489,8 +511,8 @@
 
        return TRUE; /* Any / Any */
 }
+
 static bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
-
 {
 	return TRUE; /* Any / Any */
 }
@@ -498,11 +520,17 @@
 static void
 add_prev_letter(struct generic_fts_tokenizer *tok, enum letter_type lt)
 {
-	if(tok->prev_letter != LETTER_TYPE_NONE) {
+	if(tok->prev_letter != LETTER_TYPE_NONE)
 		tok->prev_prev_letter = tok->prev_letter;
-		tok->prev_letter = lt;
-	} else
-		tok->prev_letter = lt;
+	tok->prev_letter = lt;
+}
+
+static void
+add_letter_c(struct generic_fts_tokenizer *tok, unichar_t c)
+{
+	if(tok->letter_c != 0)
+		tok->prev_letter_c = tok->letter_c;
+	tok->letter_c = c;
 }
 
 /*
@@ -544,6 +572,7 @@
 
 	return FALSE;
 }
+
 static void
 fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
                                          const char **token_r)
@@ -569,12 +598,23 @@
 
 	tok->prev_prev_letter = LETTER_TYPE_NONE;
 	tok->prev_letter = LETTER_TYPE_NONE;
-
 	*token_r = t_strndup(data, len);
 	buffer_set_used_size(tok->token, 0);
 	tok->untruncated_length = 0;
 }
 
+static void wb5a_reinsert(struct generic_fts_tokenizer *tok)
+{
+	string_t *utf8_str = t_str_new(6);
+
+	uni_ucs4_to_utf8_c(tok->letter_c, utf8_str);
+	buffer_insert(tok->token, 0, str_data(utf8_str), str_len(utf8_str));
+	tok->prev_letter = letter_type(tok->letter_c);
+	tok->letter_c = 0;
+	tok->prev_letter_c = 0;
+	tok->seen_wb5a = FALSE;
+}
+
 struct letter_fn {
 	bool (*fn)(struct generic_fts_tokenizer *tok);
 };
@@ -599,6 +639,8 @@
   (ALetter | Hebrew_Letter) and MidNumLetQ (MidNumLet | Single_Quote).
 
   Adaptions:
+  * Added WB5a as a configurable option. The prefix that is cut off is at
+   most FTS_WB5A_PREFIX_MAX_LENGTH chars, including the apostrophe.
   * No word boundary at Start-Of-Text or End-of-Text (Wb1 and WB2).
   * Break just once, not before and after.
   * Break at MidNumLet, except apostrophes (diverging from WB6/WB7).
@@ -644,12 +686,22 @@
 		i += char_size;
 		lt = letter_type(c);
 
+		/* The WB5a break is detected only when the "after
+		   break" char is inspected. That char needs to be
+		   reinserted as the "previous char". */
+		if (tok->seen_wb5a)
+			wb5a_reinsert(tok);
+
 		if (tok->prev_letter == LETTER_TYPE_NONE && is_nontoken(lt)) {
 			/* Skip non-token chars at the beginning of token */
 			i_assert(tok->token->used == 0);
 			start_pos = i;
 			continue;
 		}
+
+		if (tok->wb5a &&  tok->token->used <= FTS_WB5A_PREFIX_MAX_LENGTH)
+			add_letter_c(tok, c);
+
 		if (uni_found_word_boundary(tok, lt)) {
 			i_assert(char_start_i >= start_pos && size >= start_pos);
 			tok_append_truncated(tok, data + start_pos,
diff -r 0994a6619380 -r f7f6ec738683 src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c	Mon Aug 17 13:15:11 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c	Mon Aug 17 13:18:03 2015 +0300
@@ -52,8 +52,10 @@
 	"galore\xE2\x81\x9F""and\xE2\x81\x9Amore.\n\n",
 
 	/* TR29 MinNumLet U+FF0E at end: u+FF0E is EF BC 8E  */
-	"hello world\xEF\xBC\x8E"
+	"hello world\xEF\xBC\x8E",
 
+	/* TR29 WB5a */
+	"l\xE2\x80\x99homme l\xE2\x80\x99humanit\xC3\xA9 d\xE2\x80\x99immixtions qu\xE2\x80\x99il aujourd'hui que'euq"
 };
 
 static void test_fts_tokenizer_find(void)
@@ -172,6 +174,8 @@
 
 		"hello", "world", NULL,
 
+		"l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
+
 		NULL
 	};
 	struct fts_tokenizer *tok;
@@ -229,6 +233,7 @@
 
 		"hello", "world", NULL,
 
+		"l'homme", "l'humanit\xC3\xA9", "d'immixtions", "qu'il", "aujourd'hui", "que'euq", NULL,
 		NULL
 	};
 	struct fts_tokenizer *tok;
@@ -241,6 +246,63 @@
 	test_end();
 }
 
+const char *const tr29_settings_wb5a[] = {"algorithm", "tr29", "wb5a", "yes", NULL};
+
+/* TODO: U+206F is in "Format" and is therefore currently not a word break.
+   This definitely needs to be remapped. */
+static void test_fts_tokenizer_generic_tr29_wb5a(void)
+{
+	static const char *const expected_output[] = {
+		"hello", "world", "And",
+		"there", "was", "text", "galor\xC3\xA9",
+		"abc", "example", "com", "Bar", "Baz",
+		"bar", "example", "org", "foo", "domain",
+		"1234567890123456789012345678ä",
+		"12345678901234567890123456789",
+		"123456789012345678901234567890",
+		"and", "longlonglongabcdefghijklmnopqr",
+		"more", "Hello", "world", "3", "14", "3,14", "last", NULL,
+
+		"1", NULL,
+
+		"quoted", "text", "word", "hlo", "words", "you're", "bad",
+		"word", "pre", "post", NULL,
+
+		"1234567890123456789012345678ä",
+		"123456789012345678901234567x'",
+		"1234567890123456789012345678x'",
+		"1234567890123456789012345678x",
+		"1234567890123456789012345678x",
+		"12345678901234567890123456789x",
+		"12345678901234567890123456789x",
+		"123456789012345678901234567890",
+		"123456789012345678901234567890",
+
+		"quoted", "text", "word", "hlo", "words", "you're789012345678901234567890", "bad",
+		"word", "pre", "post", NULL,