dovecot-2.2: lib-fts: Added "english-possessive" filter.

Tue Jun 2 22:07:23 UTC 2015

details:   http://hg.dovecot.org/dovecot-2.2/rev/c909977ec1a1
changeset: 18818:c909977ec1a1
user:      Timo Sirainen <tss at iki.fi>
date:      Wed Jun 03 01:04:49 2015 +0300
description:
lib-fts: Added "english-possessive" filter.

diffstat:

 src/lib-fts/Makefile.am                     |   1 +
 src/lib-fts/fts-filter-english-possessive.c |  47 +++++++++++++++++++++++
 src/lib-fts/fts-filter.c                    |   1 +
 src/lib-fts/fts-filter.h                    |   3 +
 src/lib-fts/test-fts-filter.c               |  57 +++++++++++++++++++++++++++++
 5 files changed, 109 insertions(+), 0 deletions(-)

diffs (160 lines):

diff -r ff79a2178fd4 -r c909977ec1a1 src/lib-fts/Makefile.am

--- a/src/lib-fts/Makefile.am	Wed Jun 03 01:04:07 2015 +0300
+++ b/src/lib-fts/Makefile.am	Wed Jun 03 01:04:49 2015 +0300
@@ -62,6 +62,7 @@
 
 libfts_la_SOURCES = \
 	fts-filter.c \
+	fts-filter-english-possessive.c \
 	fts-filter-lowercase.c \
 	fts-filter-normalizer-icu.c \
 	fts-filter-stopwords.c \
diff -r ff79a2178fd4 -r c909977ec1a1 src/lib-fts/fts-filter-english-possessive.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/fts-filter-english-possessive.c	Wed Jun 03 01:04:49 2015 +0300
@@ -0,0 +1,47 @@
+/* Copyright (c) 2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unichar.h"
+#include "fts-common.h"
+#include "fts-filter-private.h"
+
+static unichar_t get_ending_utf8_char(const char *str, unsigned int *end_pos)
+{ /* Returns the character that contains byte str[*end_pos]; *end_pos is moved back to that character's first byte. */
+	unichar_t c;
+
+	while (!UTF8_IS_START_SEQ(str[*end_pos])) { /* walk backwards until a UTF-8 start byte is found */
+		i_assert(*end_pos > 0); /* never step before the beginning of str */
+		*end_pos -= 1;
+	}
+	if (uni_utf8_get_char(str + *end_pos, &c) <= 0)
+		i_unreached(); /* a decode failure is treated as impossible here (input presumably validated earlier - confirm) */
+	return c;
+}
+
+static int
+fts_filter_english_possessive_filter(struct fts_filter *filter ATTR_UNUSED,
+				     const char **token,
+				     const char **error_r ATTR_UNUSED)
+{ /* Drops a trailing <apostrophe>s / <apostrophe>S from *token; other tokens are left untouched. Always returns 1. */
+	unsigned int len = strlen(*token);
+	unichar_t c;
+
+	if (len > 1 && ((*token)[len-1] == 's' || (*token)[len-1] == 'S')) {
+		len -= 2; /* index of the last byte preceding the final 's' (safe because len > 1) */
+		c = get_ending_utf8_char(*token, &len); /* len now indexes the first byte of that preceding character */
+		if (IS_APOSTROPHE(c))
+			*token = t_strndup(*token, len); /* keep only the bytes before the apostrophe */
+	}
+	return 1;
+}
+
+static const struct fts_filter fts_filter_english_possessive_real = { /* filter definition for the "english-possessive" class */
+	.class_name = "english-possessive",
+	.v = {
+		NULL, /* no callback for this slot (presumably create - confirm in fts-filter-private.h) */
+		fts_filter_english_possessive_filter,
+		NULL /* no callback for this slot (presumably destroy - confirm in fts-filter-private.h) */
+	}
+};
+
+const struct fts_filter *fts_filter_english_possessive = &fts_filter_english_possessive_real; /* exported handle, declared extern in fts-filter.h */
diff -r ff79a2178fd4 -r c909977ec1a1 src/lib-fts/fts-filter.c
--- a/src/lib-fts/fts-filter.c	Wed Jun 03 01:04:07 2015 +0300
+++ b/src/lib-fts/fts-filter.c	Wed Jun 03 01:04:49 2015 +0300
@@ -20,6 +20,7 @@
 	fts_filter_register(fts_filter_stemmer_snowball);
 	fts_filter_register(fts_filter_normalizer_icu);
 	fts_filter_register(fts_filter_lowercase);
+	fts_filter_register(fts_filter_english_possessive);
 }
 
 void fts_filters_deinit(void)
diff -r ff79a2178fd4 -r c909977ec1a1 src/lib-fts/fts-filter.h
--- a/src/lib-fts/fts-filter.h	Wed Jun 03 01:04:07 2015 +0300
+++ b/src/lib-fts/fts-filter.h	Wed Jun 03 01:04:49 2015 +0300
@@ -32,6 +32,9 @@
 /* Lowecases the input. Currently only ASCII data is lowercased. */
 extern const struct fts_filter *fts_filter_lowercase;
 
+/* Removes <'s> suffix from words. */
+extern const struct fts_filter *fts_filter_english_possessive;
+
 /* Register all built-in filters. */
 void fts_filters_init(void);
 void fts_filters_deinit(void);
diff -r ff79a2178fd4 -r c909977ec1a1 src/lib-fts/test-fts-filter.c
--- a/src/lib-fts/test-fts-filter.c	Wed Jun 03 01:04:07 2015 +0300
+++ b/src/lib-fts/test-fts-filter.c	Wed Jun 03 01:04:49 2015 +0300
@@ -572,6 +572,62 @@
 #endif
 #endif
 
+static void test_fts_filter_english_possessive(void)
+{ /* Feeds every input[i] through the english-possessive filter and checks the result equals expected_output[i]. */
+	struct fts_filter *norm = NULL;
+	const char *input[] = { /* must stay index-aligned with expected_output[] below */
+		"foo'", /* apostrophe with no trailing s */
+
+		"foo's", /* ASCII apostrophe cases */
+		"fooä's", /* multi-byte character right before the apostrophe */
+		"foo'S", /* upper-case S */
+		"foos'S",
+		"foo's's", /* doubled suffix */
+		"foo'ss", /* apostrophe is not directly before the final s */
+
+		"foo\xE2\x80\x99s", /* same cases using U+2019 (UTF-8 bytes E2 80 99) as the apostrophe */
+		"fooä\xE2\x80\x99s",
+		"foo\xE2\x80\x99S",
+		"foos\xE2\x80\x99S",
+		"foo\xE2\x80\x99s\xE2\x80\x99s",
+		"foo\xE2\x80\x99ss"
+	};
+	const char *expected_output[] = {
+		"foo'", /* unchanged */
+
+		"foo",
+		"fooä",
+		"foo",
+		"foos",
+		"foo's", /* only the last suffix is removed */
+		"foo'ss", /* unchanged */
+
+		"foo",
+		"fooä",
+		"foo",
+		"foos",
+		"foo\xE2\x80\x99s", /* only the last suffix is removed */
+		"foo\xE2\x80\x99ss" /* unchanged */
+	};
+	const char *error = NULL;
+	const char *token = NULL;
+	unsigned int i;
+
+	test_begin("fts filter english possessive");
+
+	T_BEGIN {
+		test_assert(fts_filter_create(fts_filter_english_possessive, NULL, NULL, NULL, &norm, &error) == 0);
+		for (i = 0; i < N_ELEMENTS(input); i++) {
+			token = input[i]; /* the filter may repoint token at a new string */
+			test_assert_idx(fts_filter_filter(norm, &token, &error) == 1, i); /* filter is expected to return 1 for every input */
+			test_assert_idx(null_strcmp(token, expected_output[i]) == 0, i);
+		}
+		fts_filter_unref(&norm); /* expected to reset norm to NULL, verified after T_END */
+	} T_END;
+	test_assert(norm == NULL);
+	test_end();
+}
+
 /* TODO: Functions to test 1. ref-unref pairs 2. multiple registers +
   an unregister + find */
 
@@ -600,6 +656,7 @@
 		test_fts_filter_normalizer_stopwords_stemmer_eng,
 #endif
 #endif
+		test_fts_filter_english_possessive,
 		NULL
 	};
 	int ret;