dovecot-2.2: lib-fts: Change TR29 tokenizer to break at full stop (and others).

dovecot at dovecot.org dovecot at dovecot.org
Mon Jun 1 17:50:14 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/8973a5837b48
changeset: 18766:8973a5837b48
user:      Teemu Huovila <teemu.huovila at dovecot.fi>
date:      Mon Jun 01 18:35:58 2015 +0300
description:
lib-fts: Change TR29 tokenizer to break at full stop (and others).
Diverge from the TR29 rules and always break at MidNumLet letters.
This fixes tokenizing first.last@domain.tld email addresses.

diffstat:

 src/lib-fts/fts-tokenizer-generic-private.h |   1 +
 src/lib-fts/fts-tokenizer-generic.c         |  55 ++++++++++++++--------------
 2 files changed, 28 insertions(+), 28 deletions(-)

diffs (137 lines):

diff -r bb1522e10108 -r 8973a5837b48 src/lib-fts/fts-tokenizer-generic-private.h
--- a/src/lib-fts/fts-tokenizer-generic-private.h	Fri May 29 21:39:33 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic-private.h	Mon Jun 01 18:35:58 2015 +0300
@@ -25,6 +25,7 @@
 	LETTER_TYPE_EXTENDNUMLET,
 	LETTER_TYPE_SOT,
 	LETTER_TYPE_EOT,
+	LETTER_TYPE_APOSTROPHE, /* Own modification to TR29 */
 	LETTER_TYPE_OTHER /* WB14 "any" */
 };
 
diff -r bb1522e10108 -r 8973a5837b48 src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c	Fri May 29 21:39:33 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c	Mon Jun 01 18:35:58 2015 +0300
@@ -268,6 +268,8 @@
 {
 	unsigned int idx;
 
+	if (IS_APOSTROPHE(c))
+		return LETTER_TYPE_APOSTROPHE;
 	if (uint32_find(CR, N_ELEMENTS(CR), c, &idx))
 		return LETTER_TYPE_CR;
 	if (uint32_find(LF, N_ELEMENTS(LF), c, &idx))
@@ -349,10 +351,10 @@
 	if (tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
 		return FALSE;
 
-	/* WB7 WB7c */
+	/* WB7 WB7c, except MidNumLet */
 	if (tok->prev_prev_letter == LETTER_TYPE_HEBREW_LETTER &&
 	    (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
-	     tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
+	     tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
 	     tok->prev_letter == LETTER_TYPE_MIDLETTER ||
 	     tok->prev_letter == LETTER_TYPE_DOUBLE_QUOTE))
 		return FALSE;
@@ -374,10 +376,10 @@
 	if (tok->prev_letter == LETTER_TYPE_ALETTER)
 		return FALSE;
 
-	/* WB7 */
+	/* WB7, except MidNumLet */
 	if (tok->prev_prev_letter == LETTER_TYPE_ALETTER &&
 	    (tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE ||
-	     tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
+	     tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
 	     tok->prev_letter == LETTER_TYPE_MIDLETTER))
 		return FALSE;
 
@@ -416,18 +418,11 @@
 	return TRUE; /* Any / Any */
 }
 
-static bool letter_midnumlet(struct generic_fts_tokenizer *tok)
+static bool letter_midnumlet(struct generic_fts_tokenizer *tok ATTR_UNUSED)
 {
-	/* WB6 */
-	if (tok->prev_letter == LETTER_TYPE_ALETTER ||
-	    tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
-		return FALSE;
 
-	/* WB12 */
-	if (tok->prev_letter == LETTER_TYPE_NUMERIC)
-		return FALSE;
-
-	return TRUE; /* Any / Any */
+	/* Break at MidNumLet, non-conformant with WB6/WB7 */
+	return TRUE;
 }
 
 static bool letter_midletter(struct generic_fts_tokenizer *tok)
@@ -488,6 +483,15 @@
 	return TRUE; /* Any / Any */
 }
 
+static bool letter_apostrophe(struct generic_fts_tokenizer *tok)
+{
+
+       if (tok->prev_letter == LETTER_TYPE_ALETTER ||
+           tok->prev_letter == LETTER_TYPE_HEBREW_LETTER)
+               return FALSE;
+
+       return TRUE; /* Any / Any */
+}
 static bool letter_other(struct generic_fts_tokenizer *tok ATTR_UNUSED)
 
 {
@@ -536,12 +540,14 @@
 	/* WB6/7 false positive detected at one past end. */
 	if (tok->prev_letter == LETTER_TYPE_MIDLETTER ||
 	    tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
+	    tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
 	    tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE )
 		return TRUE;
 
 	/* WB12/12 false positive detected at one past end. */
 	if (tok->prev_letter == LETTER_TYPE_MIDNUM ||
 	    tok->prev_letter == LETTER_TYPE_MIDNUMLET ||
+	    tok->prev_letter == LETTER_TYPE_APOSTROPHE ||
 	    tok->prev_letter == LETTER_TYPE_SINGLE_QUOTE)
 		return TRUE;
 
@@ -577,7 +583,7 @@
 	{letter_single_quote}, {letter_double_quote},
 	{letter_midnumlet}, {letter_midletter}, {letter_midnum},
 	{letter_numeric}, {letter_extendnumlet}, {letter_panic},
-	{letter_panic}, {letter_other}
+	{letter_panic}, {letter_apostrophe}, {letter_other}
 };
 
 /*
@@ -585,19 +591,12 @@
   #29, but tailored for FTS purposes.
   http://www.unicode.org/reports/tr29/
 
-  Adaptions: No word boundary at Start-Of-Text or End-of-Text (Wb1 and
-  WB2). Break just once, not before and after.  Other things also
-  (e.g. is_nonword(), not really pure tr29. Meant to assist in finding
-  individual words.
-
-  TODO: If this letter_fns based approach is too kludgy, do a FSM with function
-  pointers and transition tables.
-
-  TODO: Alternative idea: Replace everything with a super simplistic
-  "lt != ALETTER, HEBREW, NUMERIC, ... --> word break"
-
-  TODO: Rules get split up over several functions. Is it too
-  confusing?
+  Adaptions:
+  * No word boundary at Start-Of-Text or End-of-Text (WB1 and WB2).
+  * Break just once, not before and after.
+  * Break at MidNumLet, except apostrophes (diverging from WB6/WB7).
+  * Other things also (e.g. is_nonword()); not really pure TR29. Meant
+  to assist in finding individual words.
 */
 static bool
 uni_found_word_boundary(struct generic_fts_tokenizer *tok, enum letter_type lt)


More information about the dovecot-cvs mailing list