dovecot-2.2: lib-fts: Added "english-possessive" filter.
dovecot at dovecot.org
dovecot at dovecot.org
Tue Jun 2 22:07:23 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/c909977ec1a1
changeset: 18818:c909977ec1a1
user: Timo Sirainen <tss at iki.fi>
date: Wed Jun 03 01:04:49 2015 +0300
description:
lib-fts: Added "english-possessive" filter.
diffstat:
src/lib-fts/Makefile.am | 1 +
src/lib-fts/fts-filter-english-possessive.c | 47 +++++++++++++++++++++++
src/lib-fts/fts-filter.c | 1 +
src/lib-fts/fts-filter.h | 3 +
src/lib-fts/test-fts-filter.c | 57 +++++++++++++++++++++++++++++
5 files changed, 109 insertions(+), 0 deletions(-)
diffs (160 lines):
diff -r ff79a2178fd4 -r c909977ec1a1 src/lib-fts/Makefile.am
--- a/src/lib-fts/Makefile.am Wed Jun 03 01:04:07 2015 +0300
+++ b/src/lib-fts/Makefile.am Wed Jun 03 01:04:49 2015 +0300
@@ -62,6 +62,7 @@
libfts_la_SOURCES = \
fts-filter.c \
+ fts-filter-english-possessive.c \
fts-filter-lowercase.c \
fts-filter-normalizer-icu.c \
fts-filter-stopwords.c \
diff -r ff79a2178fd4 -r c909977ec1a1 src/lib-fts/fts-filter-english-possessive.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/fts-filter-english-possessive.c Wed Jun 03 01:04:49 2015 +0300
@@ -0,0 +1,47 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unichar.h"
+#include "fts-common.h"
+#include "fts-filter-private.h"
+
+static unichar_t get_ending_utf8_char(const char *str, unsigned int *end_pos)
+{
+ unichar_t c;
+
+ while (!UTF8_IS_START_SEQ(str[*end_pos])) {
+ i_assert(*end_pos > 0);
+ *end_pos -= 1;
+ }
+ if (uni_utf8_get_char(str + *end_pos, &c) <= 0)
+ i_unreached();
+ return c;
+}
+
+static int
+fts_filter_english_possessive_filter(struct fts_filter *filter ATTR_UNUSED,
+ const char **token,
+ const char **error_r ATTR_UNUSED)
+{
+ unsigned int len = strlen(*token);
+ unichar_t c;
+
+ if (len > 1 && ((*token)[len-1] == 's' || (*token)[len-1] == 'S')) {
+ len -= 2;
+ c = get_ending_utf8_char(*token, &len);
+ if (IS_APOSTROPHE(c))
+ *token = t_strndup(*token, len);
+ }
+ return 1;
+}
+
+static const struct fts_filter fts_filter_english_possessive_real = {
+ .class_name = "english-possessive",
+ .v = {
+ NULL,
+ fts_filter_english_possessive_filter,
+ NULL
+ }
+};
+
+const struct fts_filter *fts_filter_english_possessive = &fts_filter_english_possessive_real;
diff -r ff79a2178fd4 -r c909977ec1a1 src/lib-fts/fts-filter.c
--- a/src/lib-fts/fts-filter.c Wed Jun 03 01:04:07 2015 +0300
+++ b/src/lib-fts/fts-filter.c Wed Jun 03 01:04:49 2015 +0300
@@ -20,6 +20,7 @@
fts_filter_register(fts_filter_stemmer_snowball);
fts_filter_register(fts_filter_normalizer_icu);
fts_filter_register(fts_filter_lowercase);
+ fts_filter_register(fts_filter_english_possessive);
}
void fts_filters_deinit(void)
diff -r ff79a2178fd4 -r c909977ec1a1 src/lib-fts/fts-filter.h
--- a/src/lib-fts/fts-filter.h Wed Jun 03 01:04:07 2015 +0300
+++ b/src/lib-fts/fts-filter.h Wed Jun 03 01:04:49 2015 +0300
@@ -32,6 +32,9 @@
/* Lowecases the input. Currently only ASCII data is lowercased. */
extern const struct fts_filter *fts_filter_lowercase;
+/* Removes <'s> suffix from words. */
+extern const struct fts_filter *fts_filter_english_possessive;
+
/* Register all built-in filters. */
void fts_filters_init(void);
void fts_filters_deinit(void);
diff -r ff79a2178fd4 -r c909977ec1a1 src/lib-fts/test-fts-filter.c
--- a/src/lib-fts/test-fts-filter.c Wed Jun 03 01:04:07 2015 +0300
+++ b/src/lib-fts/test-fts-filter.c Wed Jun 03 01:04:49 2015 +0300
@@ -572,6 +572,62 @@
#endif
#endif
+static void test_fts_filter_english_possessive(void)
+{
+ struct fts_filter *norm = NULL;
+ const char *input[] = {
+ "foo'",
+
+ "foo's",
+ "fooä's",
+ "foo'S",
+ "foos'S",
+ "foo's's",
+ "foo'ss",
+
+ "foo\xE2\x80\x99s",
+ "fooä\xE2\x80\x99s",
+ "foo\xE2\x80\x99S",
+ "foos\xE2\x80\x99S",
+ "foo\xE2\x80\x99s\xE2\x80\x99s",
+ "foo\xE2\x80\x99ss"
+ };
+ const char *expected_output[] = {
+ "foo'",
+
+ "foo",
+ "fooä",
+ "foo",
+ "foos",
+ "foo's",
+ "foo'ss",
+
+ "foo",
+ "fooä",
+ "foo",
+ "foos",
+ "foo\xE2\x80\x99s",
+ "foo\xE2\x80\x99ss"
+ };
+ const char *error = NULL;
+ const char *token = NULL;
+ unsigned int i;
+
+ test_begin("fts filter english possessive");
+
+ T_BEGIN {
+ test_assert(fts_filter_create(fts_filter_english_possessive, NULL, NULL, NULL, &norm, &error) == 0);
+ for (i = 0; i < N_ELEMENTS(input); i++) {
+ token = input[i];
+ test_assert_idx(fts_filter_filter(norm, &token, &error) == 1, i);
+ test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i);
+ }
+ fts_filter_unref(&norm);
+ } T_END;
+ test_assert(norm == NULL);
+ test_end();
+}
+
/* TODO: Functions to test 1. ref-unref pairs 2. multiple registers +
an unregister + find */
@@ -600,6 +656,7 @@
test_fts_filter_normalizer_stopwords_stemmer_eng,
#endif
#endif
+ test_fts_filter_english_possessive,
NULL
};
int ret;
More information about the dovecot-cvs mailing list