dovecot-2.2: lib-fts: Add prefixing contraction filter.

dovecot at dovecot.org dovecot at dovecot.org
Mon Aug 31 10:35:14 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/4b8c7440cf4f
changeset: 19043:4b8c7440cf4f
user:      Teemu Huovila <teemu.huovila at dovecot.fi>
date:      Mon Aug 31 13:33:26 2015 +0300
description:
lib-fts: Add prefixing contraction filter.
Strips prefixed contractions from words, e.g. "l'homme" -> "homme".
Tokens to be filtered must be lower case. Only French is supported in
this initial version.

diffstat:

 src/lib-fts/Makefile.am               |   1 +
 src/lib-fts/fts-filter-contractions.c |  84 +++++++++++++++++++++++++++++++++++
 src/lib-fts/fts-filter-private.h      |   2 +-
 src/lib-fts/fts-filter.c              |   1 +
 src/lib-fts/fts-filter.h              |   3 +
 src/lib-fts/test-fts-filter.c         |  68 ++++++++++++++++++++++++++-
 6 files changed, 154 insertions(+), 5 deletions(-)

diffs (255 lines):

diff -r e1c7fcd62813 -r 4b8c7440cf4f src/lib-fts/Makefile.am
--- a/src/lib-fts/Makefile.am	Sun Aug 30 01:00:30 2015 +0300
+++ b/src/lib-fts/Makefile.am	Mon Aug 31 13:33:26 2015 +0300
@@ -62,6 +62,7 @@
 
 libfts_la_SOURCES = \
 	fts-filter.c \
+	fts-filter-contractions.c \
 	fts-filter-english-possessive.c \
 	fts-filter-lowercase.c \
 	fts-filter-normalizer-icu.c \
diff -r e1c7fcd62813 -r 4b8c7440cf4f src/lib-fts/fts-filter-contractions.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/fts-filter-contractions.c	Mon Aug 31 13:33:26 2015 +0300
@@ -0,0 +1,84 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "fts-language.h"
+#include "fts-filter-private.h"
+#include "fts-common.h"
+#include "unichar.h"
+
+/* Create a contractions filter: no settings are accepted and only "fr" is
+   supported. Returns 0 and sets *filter_r, or -1 and sets *error_r. */
+static int
+fts_filter_contractions_create(const struct fts_language *lang,
+			       const char *const *settings,
+			       struct fts_filter **filter_r,
+			       const char **error_r)
+{
+	struct fts_filter *new_filter;
+
+	if (settings[0] != NULL) {
+		*error_r = t_strdup_printf("Unknown setting: %s", settings[0]);
+		return -1;
+	} else if (strcmp(lang->name, "fr") != 0) {
+		*error_r = t_strdup_printf("Unsupported language: %s", lang->name);
+		return -1;
+	}
+	new_filter = i_new(struct fts_filter, 1);
+	*new_filter = *fts_filter_contractions;
+	new_filter->token = str_new(default_pool, 64);
+	*filter_r = new_filter;
+	return 0;
+}
+
+/* Strips a French contraction prefix ("qu" or one of c, d, l, m, n, s, t)
+   and the apostrophe that follows it from the start of a lower-case token,
+   e.g. "l'homme" -> "homme"; any other token is left untouched. Returns 1
+   with *_token pointing at the word to keep, or 0 when nothing remains
+   after the apostrophe (e.g. "qu'"). */
+static int
+fts_filter_contractions_filter(struct fts_filter *filter ATTR_UNUSED,
+			    const char **_token,
+			    const char **error_r ATTR_UNUSED)
+{
+	int char_size, pos = 0;
+	unichar_t apostrophe;
+	const char *token = *_token;
+
+	switch (token[pos]) {
+	case 'q':
+		pos++;
+		if (token[pos] != 'u') /* also true at end of token */
+			break;
+		/* fall through */
+	case 'c': case 'd': case 'l': case 'm':
+	case 'n': case 's': case 't':
+		pos++;
+		if (token[pos] == '\0')
+			break;
+		char_size = uni_utf8_get_char(token + pos, &apostrophe);
+		/* assumes apostrophe is only set when char_size > 0 */
+		if (char_size > 0 && IS_APOSTROPHE(apostrophe)) {
+			pos += char_size;
+			*_token = token + pos;
+			if (token[pos] == '\0') /* nothing left */
+				return 0;
+		}
+		break;
+	default:
+		/* no contraction prefix, keep the token as is */
+		break;
+	}
+	return 1;
+}
+
+static const struct fts_filter fts_filter_contractions_real = {
+	.class_name = "contractions", /* name looked up by fts_filter_find() */
+	.v = {
+		fts_filter_contractions_create,
+		fts_filter_contractions_filter,
+		NULL /* NOTE(review): presumably the destroy callback - confirm */
+	}
+};
+/* Public handle for the contractions filter class (see fts-filter.h). */
+const struct fts_filter *fts_filter_contractions = &fts_filter_contractions_real;
diff -r e1c7fcd62813 -r 4b8c7440cf4f src/lib-fts/fts-filter-private.h
--- a/src/lib-fts/fts-filter-private.h	Sun Aug 30 01:00:30 2015 +0300
+++ b/src/lib-fts/fts-filter-private.h	Mon Aug 31 13:33:26 2015 +0300
@@ -3,7 +3,7 @@
 
 #include "fts-filter.h"
 
-#define FTS_FILTER_CLASSES_NR 3
+#define FTS_FILTER_CLASSES_NR 6
 
 /*
  API that stemming providers (classes) must provide: The create()
diff -r e1c7fcd62813 -r 4b8c7440cf4f src/lib-fts/fts-filter.c
--- a/src/lib-fts/fts-filter.c	Sun Aug 30 01:00:30 2015 +0300
+++ b/src/lib-fts/fts-filter.c	Mon Aug 31 13:33:26 2015 +0300
@@ -21,6 +21,7 @@
 	fts_filter_register(fts_filter_normalizer_icu);
 	fts_filter_register(fts_filter_lowercase);
 	fts_filter_register(fts_filter_english_possessive);
+	fts_filter_register(fts_filter_contractions);
 }
 
 void fts_filters_deinit(void)
diff -r e1c7fcd62813 -r 4b8c7440cf4f src/lib-fts/fts-filter.h
--- a/src/lib-fts/fts-filter.h	Sun Aug 30 01:00:30 2015 +0300
+++ b/src/lib-fts/fts-filter.h	Mon Aug 31 13:33:26 2015 +0300
@@ -35,6 +35,9 @@
 /* Removes <'s> suffix from words. */
 extern const struct fts_filter *fts_filter_english_possessive;
 
+/* Removes prefixing contractions (French only), e.g. "l'homme" -> "homme". */
+extern const struct fts_filter *fts_filter_contractions;
+
 /* Register all built-in filters. */
 void fts_filters_init(void);
 void fts_filters_deinit(void);
diff -r e1c7fcd62813 -r 4b8c7440cf4f src/lib-fts/test-fts-filter.c
--- a/src/lib-fts/test-fts-filter.c	Sun Aug 30 01:00:30 2015 +0300
+++ b/src/lib-fts/test-fts-filter.c	Mon Aug 31 13:33:26 2015 +0300
@@ -12,6 +12,7 @@
 
 static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
 static struct fts_language english_language = { .name = "en" };
+static struct fts_language french_language = { .name = "fr" };
 
 static void test_fts_filter_find(void)
 {
@@ -20,6 +21,65 @@
 	test_assert(fts_filter_find("snowball") == fts_filter_stemmer_snowball);
 	test_assert(fts_filter_find("normalizer-icu") == fts_filter_normalizer_icu);
 	test_assert(fts_filter_find("lowercase") == fts_filter_lowercase);
+	test_assert(fts_filter_find("contractions") == fts_filter_contractions);
+	test_end();
+}
+
+
+/* Creating the contractions filter for a non-French language must fail. */
+static void test_fts_filter_contractions_fail(void)
+{
+	struct fts_filter *filter;
+	const char *error = NULL;
+
+	test_begin("fts filter contractions, unsupported language");
+	test_assert(fts_filter_create(fts_filter_contractions, NULL, &english_language, NULL, &filter, &error) != 0);
+	test_assert(error != NULL);
+	test_end();
+}
+
+/* Table-driven check of the French contractions filter (input -> output). */
+static void test_fts_filter_contractions_fr(void)
+{
+	struct {
+		const char *input;
+		const char *output;
+	} tests[] = {
+		{ "foo", "foo" },
+		{ "you're", "you're" },
+		{ "l'homme", "homme" },
+		{ "l\xE2\x80\x99homme", "homme" }, /* U+2019 apostrophe */
+		{ "aujourd'hui", "aujourd'hui" },
+		{ "qu\xE2\x80\x99il", "il" },
+		{ "qu'il", "il" },
+		{ "du'il", "du'il" },
+		{ "que", "que" },
+		{ "'foobar'", "'foobar'" },
+		{ "foo'bar", "foo'bar" },
+		{ "a'foo", "a'foo" },
+		{ "cu'", "cu'" },
+		{ "qu", "qu" },
+		{ "d", "d" },
+		{ "qu'", NULL } /* nothing left after the prefix */
+	};
+	struct fts_filter *filter;
+	const char *error;
+	const char *token;
+	unsigned int i;
+	int ret;
+
+	test_begin("fts filter contractions, French");
+	test_assert(fts_filter_create(fts_filter_contractions, NULL, &french_language, NULL, &filter, &error) == 0);
+	for (i = 0; i < N_ELEMENTS(tests); i++) {
+		token = tests[i].input;
+		ret = fts_filter_filter(filter, &token, &error);
+		test_assert_idx(ret >= 0, i);
+		if (ret > 0)
+			test_assert_idx(tests[i].output != NULL && strcmp(token, tests[i].output) == 0, i);
+		else if (ret == 0)
+			test_assert_idx(token == NULL && tests[i].output == NULL, i);
+	}
+	fts_filter_unref(&filter);
 	test_end();
 }
 
@@ -151,7 +211,6 @@
 
 static void test_fts_filter_stopwords_fra(void)
 {
-	const struct fts_language french = { .name = "fr" };
 	struct fts_filter *filter;
 	const char *error;
 	int ret;
@@ -167,7 +226,7 @@
 	const char *token;
 
 	test_begin("fts filter stopwords, French");
-	test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french, stopword_settings, &filter, &error) == 0);
+	test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french_language, stopword_settings, &filter, &error) == 0);
 
 	ip = input;
 	op = output;
@@ -245,7 +304,6 @@
 {
 	struct fts_filter *stemmer;
 	const char *error;
-	struct fts_language language = { .name = "fr" };
 	const char *token = NULL;
 	const char * const tokens[] = {
 		"Tous", "les", "\xC3\xAAtres", "humains", "naissent",
@@ -258,7 +316,7 @@
 	const char * const *bpp;
 
 	test_begin("fts filter stem French");
-	test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &language, NULL, &stemmer, &error) == 0);
+	test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &french_language, NULL, &stemmer, &error) == 0);
 	bpp = bases;
 	for (tpp=tokens; *tpp != NULL; tpp++) {
 		token = *tpp;
@@ -627,6 +685,8 @@
 {
 	static void (*test_functions[])(void) = {
 		test_fts_filter_find,
+		test_fts_filter_contractions_fail,
+		test_fts_filter_contractions_fr,
 		test_fts_filter_lowercase,
 		test_fts_filter_stopwords_eng,
 		test_fts_filter_stopwords_fin,


More information about the dovecot-cvs mailing list