dovecot-2.2: lib-fts: Add prefixing contraction filter.
dovecot at dovecot.org
dovecot at dovecot.org
Mon Aug 31 10:35:14 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/4b8c7440cf4f
changeset: 19043:4b8c7440cf4f
user: Teemu Huovila <teemu.huovila at dovecot.fi>
date: Mon Aug 31 13:33:26 2015 +0300
description:
lib-fts: Add prefixing contraction filter.
Filters away prefixing contracted words, e.g. "l'homme" -> "homme".
Tokens to be filtered must be lower case. Only supports French in
this initial version.
diffstat:
src/lib-fts/Makefile.am | 1 +
src/lib-fts/fts-filter-contractions.c | 84 +++++++++++++++++++++++++++++++++++
src/lib-fts/fts-filter-private.h | 2 +-
src/lib-fts/fts-filter.c | 1 +
src/lib-fts/fts-filter.h | 3 +
src/lib-fts/test-fts-filter.c | 68 ++++++++++++++++++++++++++-
6 files changed, 154 insertions(+), 5 deletions(-)
diffs (255 lines):
diff -r e1c7fcd62813 -r 4b8c7440cf4f src/lib-fts/Makefile.am
--- a/src/lib-fts/Makefile.am Sun Aug 30 01:00:30 2015 +0300
+++ b/src/lib-fts/Makefile.am Mon Aug 31 13:33:26 2015 +0300
@@ -62,6 +62,7 @@
libfts_la_SOURCES = \
fts-filter.c \
+ fts-filter-contractions.c \
fts-filter-english-possessive.c \
fts-filter-lowercase.c \
fts-filter-normalizer-icu.c \
diff -r e1c7fcd62813 -r 4b8c7440cf4f src/lib-fts/fts-filter-contractions.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/fts-filter-contractions.c Mon Aug 31 13:33:26 2015 +0300
@@ -0,0 +1,84 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "fts-language.h"
+#include "fts-filter-private.h"
+#include "fts-common.h"
+#include "unichar.h"
+
+static int /* returns 0 with *filter_r set on success, -1 with *error_r set on failure */
+fts_filter_contractions_create(const struct fts_language *lang,
+ const char *const *settings,
+ struct fts_filter **filter_r,
+ const char **error_r)
+{
+ struct fts_filter *filter;
+
+ if (settings[0] != NULL) { /* no settings are accepted; NOTE(review): settings itself is dereferenced unchecked - presumably the caller never passes NULL here, confirm */
+ *error_r = t_strdup_printf("Unknown setting: %s", settings[0]);
+ return -1;
+ }
+ if (strcmp(lang->name, "fr") != 0) { /* only the language named "fr" is accepted */
+ *error_r = t_strdup_printf("Unsupported language: %s", lang->name);
+ return -1;
+ }
+
+ filter = i_new(struct fts_filter, 1);
+ *filter = *fts_filter_contractions; /* start from a copy of the struct the exported class pointer refers to */
+ filter->token = str_new(default_pool, 64); /* NOTE(review): the filter callback in this file never reads or writes this buffer */
+ *filter_r = filter;
+ return 0;
+}
+
+static int /* returns 0 when nothing follows the stripped prefix, otherwise 1 (token possibly advanced) */
+fts_filter_contractions_filter(struct fts_filter *filter ATTR_UNUSED,
+ const char **_token,
+ const char **error_r ATTR_UNUSED)
+{
+ int char_size, pos = 0; /* pos is a byte offset into token */
+ unichar_t apostrophe; /* character decoded right after the candidate prefix */
+ const char *token = *_token;
+
+ switch (token[pos]) {
+ case 'q':
+ pos++;
+ if (token[pos] == '\0' || token[pos] != 'u') /* 'q' only qualifies as the start of a "qu" prefix */
+ break;
+ /* otherwise fall through */
+ case 'c':
+ case 'd':
+ case 'l':
+ case 'm':
+ case 'n':
+ case 's':
+ case 't':
+ pos++; /* step past the prefix letter (the 'u' when coming from the 'q' case) */
+ if (token[pos] == '\0')
+ break;
+ char_size = uni_utf8_get_char(token + pos, &apostrophe); /* NOTE(review): return value is used unchecked - confirm what happens on invalid UTF-8 input */
+ if (IS_APOSTROPHE(apostrophe)) {
+ pos += char_size;
+ *_token = token + pos; /* strip prefix and apostrophe by advancing the caller's pointer; no copy is made */
+ }
+ if (token[pos] == '\0') /* nothing left */
+ return 0;
+ break;
+ default:
+ /* do nothing */
+ break;
+ }
+
+ return 1;
+}
+
+static const struct fts_filter fts_filter_contractions_real = { /* class definition copied into each instance by the create callback */
+ .class_name = "contractions",
+ .v = {
+ fts_filter_contractions_create,
+ fts_filter_contractions_filter,
+ NULL /* third callback slot is left unset */
+ }
+};
+
+const struct fts_filter *fts_filter_contractions = &fts_filter_contractions_real; /* exported handle to the class definition above */
diff -r e1c7fcd62813 -r 4b8c7440cf4f src/lib-fts/fts-filter-private.h
--- a/src/lib-fts/fts-filter-private.h Sun Aug 30 01:00:30 2015 +0300
+++ b/src/lib-fts/fts-filter-private.h Mon Aug 31 13:33:26 2015 +0300
@@ -3,7 +3,7 @@
#include "fts-filter.h"
-#define FTS_FILTER_CLASSES_NR 3
+#define FTS_FILTER_CLASSES_NR 6
/*
API that stemming providers (classes) must provide: The create()
diff -r e1c7fcd62813 -r 4b8c7440cf4f src/lib-fts/fts-filter.c
--- a/src/lib-fts/fts-filter.c Sun Aug 30 01:00:30 2015 +0300
+++ b/src/lib-fts/fts-filter.c Mon Aug 31 13:33:26 2015 +0300
@@ -21,6 +21,7 @@
fts_filter_register(fts_filter_normalizer_icu);
fts_filter_register(fts_filter_lowercase);
fts_filter_register(fts_filter_english_possessive);
+ fts_filter_register(fts_filter_contractions);
}
void fts_filters_deinit(void)
diff -r e1c7fcd62813 -r 4b8c7440cf4f src/lib-fts/fts-filter.h
--- a/src/lib-fts/fts-filter.h Sun Aug 30 01:00:30 2015 +0300
+++ b/src/lib-fts/fts-filter.h Mon Aug 31 13:33:26 2015 +0300
@@ -35,6 +35,9 @@
/* Removes <'s> suffix from words. */
extern const struct fts_filter *fts_filter_english_possessive;
+/* Removes prefixing contractions from words. */
+extern const struct fts_filter *fts_filter_contractions;
+
/* Register all built-in filters. */
void fts_filters_init(void);
void fts_filters_deinit(void);
diff -r e1c7fcd62813 -r 4b8c7440cf4f src/lib-fts/test-fts-filter.c
--- a/src/lib-fts/test-fts-filter.c Sun Aug 30 01:00:30 2015 +0300
+++ b/src/lib-fts/test-fts-filter.c Mon Aug 31 13:33:26 2015 +0300
@@ -12,6 +12,7 @@
static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
static struct fts_language english_language = { .name = "en" };
+static struct fts_language french_language = { .name = "fr" };
static void test_fts_filter_find(void)
{
@@ -20,6 +21,65 @@
test_assert(fts_filter_find("snowball") == fts_filter_stemmer_snowball);
test_assert(fts_filter_find("normalizer-icu") == fts_filter_normalizer_icu);
test_assert(fts_filter_find("lowercase") == fts_filter_lowercase);
+ test_assert(fts_filter_find("contractions") == fts_filter_contractions);
+ test_end();
+}
+
+
+static void test_fts_filter_contractions_fail(void) /* creating the filter for the "en" language must fail and report an error */
+{
+
+ struct fts_filter *filter;
+ const char *error;
+
+ test_begin("fts filter contractions, unsupported language");
+ test_assert(fts_filter_create(fts_filter_contractions, NULL, &english_language, NULL, &filter, &error) != 0);
+ test_assert(error != NULL);
+ test_end();
+}
+
+static void test_fts_filter_contractions_fr(void) /* table-driven check of the French contraction filter */
+{
+ struct {
+ const char *input;
+ const char *output; /* expected token after filtering; NULL when the token is expected to be dropped */
+ } tests[] = {
+ { "foo", "foo" },
+ { "you're", "you're" },
+ { "l'homme", "homme" },
+ { "l\xE2\x80\x99homme", "homme" }, /* apostrophe written as U+2019 in UTF-8 */
+ { "aujourd'hui", "aujourd'hui" },
+ { "qu\xE2\x80\x99il", "il" },
+ { "qu'il", "il" },
+ { "du'il", "du'il" }, /* "qu" is the only two-letter prefix that is stripped */
+ { "que", "que" },
+ { "'foobar'", "'foobar'" },
+ { "foo'bar", "foo'bar" },
+ { "a'foo", "a'foo" },
+ { "cu'", "cu'" },
+ { "qu", "qu" },
+ { "d", "d" },
+ { "qu'", NULL } /* nothing remains after the prefix and apostrophe */
+ };
+ struct fts_filter *filter;
+ const char *error;
+ const char *token;
+ unsigned int i;
+ int ret;
+
+ test_begin("fts filter contractions, French");
+ test_assert(fts_filter_create(fts_filter_contractions, NULL, &french_language, NULL, &filter, &error) == 0);
+
+ for (i = 0; i < N_ELEMENTS(tests); i++) {
+ token = tests[i].input;
+ ret = fts_filter_filter(filter, &token, &error);
+ test_assert(ret >= 0);
+ if (ret > 0)
+ test_assert_idx(strcmp(token, tests[i].output) == 0, i);
+ else if (ret == 0)
+ test_assert_idx(token == NULL && tests[i].output == NULL, i); /* NOTE(review): presumably fts_filter_filter() resets token to NULL on a 0 return - confirm in fts-filter.c */
+ }
+ fts_filter_unref(&filter);
test_end();
}
@@ -151,7 +211,6 @@
static void test_fts_filter_stopwords_fra(void)
{
- const struct fts_language french = { .name = "fr" };
struct fts_filter *filter;
const char *error;
int ret;
@@ -167,7 +226,7 @@
const char *token;
test_begin("fts filter stopwords, French");
- test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french, stopword_settings, &filter, &error) == 0);
+ test_assert(fts_filter_create(fts_filter_stopwords, NULL, &french_language, stopword_settings, &filter, &error) == 0);
ip = input;
op = output;
@@ -245,7 +304,6 @@
{
struct fts_filter *stemmer;
const char *error;
- struct fts_language language = { .name = "fr" };
const char *token = NULL;
const char * const tokens[] = {
"Tous", "les", "\xC3\xAAtres", "humains", "naissent",
@@ -258,7 +316,7 @@
const char * const *bpp;
test_begin("fts filter stem French");
- test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &language, NULL, &stemmer, &error) == 0);
+ test_assert(fts_filter_create(fts_filter_stemmer_snowball, NULL, &french_language, NULL, &stemmer, &error) == 0);
bpp = bases;
for (tpp=tokens; *tpp != NULL; tpp++) {
token = *tpp;
@@ -627,6 +685,8 @@
{
static void (*test_functions[])(void) = {
test_fts_filter_find,
+ test_fts_filter_contractions_fail,
+ test_fts_filter_contractions_fr,
test_fts_filter_lowercase,
test_fts_filter_stopwords_eng,
test_fts_filter_stopwords_fin,
More information about the dovecot-cvs
mailing list