dovecot-2.2: lib-fts: Add Swedish (sv) to supported languages.

dovecot at dovecot.org dovecot at dovecot.org
Tue Nov 17 09:56:21 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/c7f9d3d8f278
changeset: 19372:c7f9d3d8f278
user:      Teemu Huovila <teemu.huovila at dovecot.fi>
date:      Tue Nov 17 11:43:28 2015 +0200
description:
lib-fts: Add Swedish (sv) to supported languages.

diffstat:

 src/lib-fts/Makefile.am         |    3 +-
 src/lib-fts/fts-language.c      |    1 +
 src/lib-fts/stopwords_sv.txt    |  131 ++++++++++++++++++++++++++++++++++++++++
 src/lib-fts/test-fts-filter.c   |   53 ++++++++++++++++
 src/lib-fts/test-fts-language.c |   27 ++++++++
 5 files changed, 214 insertions(+), 1 deletions(-)

diffs (280 lines):

diff -r 01bd19136b2c -r c7f9d3d8f278 src/lib-fts/Makefile.am
--- a/src/lib-fts/Makefile.am	Tue Nov 17 11:42:59 2015 +0200
+++ b/src/lib-fts/Makefile.am	Tue Nov 17 11:43:28 2015 +0200
@@ -14,7 +14,8 @@
 dist_stopwords_DATA = \
 	stopwords_en.txt \
 	stopwords_fi.txt \
-	stopwords_fr.txt
+	stopwords_fr.txt \
+	stopwords_sv.txt
 
 BUILT_SOURCES = word-boundary-data.c word-break-data.c
 
diff -r 01bd19136b2c -r c7f9d3d8f278 src/lib-fts/fts-language.c
--- a/src/lib-fts/fts-language.c	Tue Nov 17 11:42:59 2015 +0200
+++ b/src/lib-fts/fts-language.c	Tue Nov 17 11:43:28 2015 +0200
@@ -41,6 +41,7 @@
 	{ "pt" },
 	{ "ro" },
 	{ "ru" },
+	{ "sv" }
 };
 
 const struct fts_language fts_language_data = {
diff -r 01bd19136b2c -r c7f9d3d8f278 src/lib-fts/stopwords_sv.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/stopwords_sv.txt	Tue Nov 17 11:43:28 2015 +0200
@@ -0,0 +1,131 @@
+ | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt
+ | This file is distributed under the BSD License.
+ | See http://snowball.tartarus.org/license.php
+ | Also see http://www.opensource.org/licenses/bsd-license.html
+ |  - Encoding was converted to UTF-8.
+ |  - This notice was added.
+
+ | A Swedish stop word list. Comments begin with vertical bar. Each stop
+ | word is at the start of a line.
+
+ | This is a ranked list (commonest to rarest) of stopwords derived from
+ | a large text sample.
+
+ | Swedish stop words occasionally exhibit homonym clashes. For example
+ |  så = so, but also seed. These are indicated clearly below.
+
+och            | and
+det            | it, this/that
+att            | to (with infinitive)
+i              | in, at
+en             | a
+jag            | I
+hon            | she
+som            | who, that
+han            | he
+på             | on
+den            | it, this/that
+med            | with
+var            | where, each
+sig            | him(self) etc
+för            | for
+så             | so (also: seed)
+till           | to
+är             | is
+men            | but
+ett            | a
+om             | if; around, about
+hade           | had
+de             | they, these/those
+av             | of
+icke           | not, no
+mig            | me
+du             | you
+henne          | her
+då             | then, when
+sin            | his
+nu             | now
+har            | have
+inte           | inte någon = no one
+hans           | his
+honom          | him
+skulle         | 'sake'
+hennes         | her
+där            | there
+min            | my
+man            | one (pronoun)
+ej             | nor
+vid            | at, by, on (also: vast)
+kunde          | could
+något          | some etc
+från           | from, off
+ut             | out
+när            | when
+efter          | after, behind
+upp            | up
+vi             | we
+dem            | them
+vara           | be
+vad            | what
+över           | over
+än             | than
+dig            | you
+kan            | can
+sina           | his
+här            | here
+ha             | have
+mot            | towards
+alla           | all
+under          | under (also: wonder)
+någon          | some etc
+eller          | or (else)
+allt           | all
+mycket         | much
+sedan          | since
+ju             | why
+denna          | this/that
+själv          | myself, yourself etc
+detta          | this/that
+åt             | to
+utan           | without
+varit          | was
+hur            | how
+ingen          | no
+mitt           | my
+ni             | you
+bli            | to be, become
+blev           | from bli
+oss            | us
+din            | thy
+dessa          | these/those
+några          | some etc
+deras          | their
+blir           | from bli
+mina           | my
+samma          | (the) same
+vilken         | who, that
+er             | you, your
+sådan          | such a
+vår            | our
+blivit         | from bli
+dess           | its
+inom           | within
+mellan         | between
+sådant         | such a
+varför         | why
+varje          | each
+vilka          | who, that
+ditt           | thy
+vem            | who
+vilket         | who, that
+sitta          | his
+sådana         | such a
+vart           | each
+dina           | thy
+vars           | whose
+vårt           | our
+våra           | our
+ert            | your
+era            | your
+vilkas         | whose
+
diff -r 01bd19136b2c -r c7f9d3d8f278 src/lib-fts/test-fts-filter.c
--- a/src/lib-fts/test-fts-filter.c	Tue Nov 17 11:42:59 2015 +0200
+++ b/src/lib-fts/test-fts-filter.c	Tue Nov 17 11:43:28 2015 +0200
@@ -13,6 +13,7 @@
 static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
 static struct fts_language english_language = { .name = "en" };
 static struct fts_language french_language = { .name = "fr" };
+static struct fts_language swedish_language = { .name = "sv" };
 
 static void test_fts_filter_find(void)
 {
@@ -653,6 +654,57 @@
 	test_assert(normalizer == NULL);
 	test_end();
 }
+
+static void test_fts_filter_stopwords_normalizer_stemmer_sv(void)
+{
+	int ret;
+	struct fts_filter *normalizer;
+	struct fts_filter *stemmer;
+	struct fts_filter *filter;
+	const char *error;
+	const char *token = NULL;
+	const char * const tokens[] = {
+		"Enär", "erkännandet", "av", "det", "inneboende", "värdet",
+		"hos", "alla", "medlemmar", "av", "människosläktet", "och",
+		"av", "deras", "lika", "och", "oförytterliga", "rättigheter",
+		"är", "grundvalen", "för", "frihet", "rättvisa", "och", "fred",
+		"i", "världen",	NULL};
+	const char * const bases[] = {
+		"enar", "erkan", NULL, NULL, "inneboend", "vardet", "hos", NULL,
+		"medlemm", NULL, "manniskoslaktet", NULL, NULL, NULL, "lik",
+		NULL, "oforytter", "ratt", NULL, "grundval", NULL, "frihet",
+		"rattvis", NULL, "fred", NULL, "varld", NULL};
+	const char * const *tpp;
+	const char * const *bpp;
+
+	test_begin("fts filters with stopwords, default normalizer and stemming chained, Swedish");
+
+
+	test_assert(fts_filter_create(fts_filter_stopwords, NULL, &swedish_language, stopword_settings, &filter, &error) == 0);
+	test_assert(fts_filter_create(fts_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0);
+	test_assert(fts_filter_create(fts_filter_stemmer_snowball, normalizer, &swedish_language, NULL, &stemmer, &error) == 0);
+
+	bpp = bases;
+	for (tpp = tokens; *tpp != NULL; tpp++) {
+		token = *tpp;
+		ret = fts_filter_filter(stemmer, &token, &error);
+		if (ret <= 0) {
+			test_assert(ret == 0);
+			test_assert(*bpp == NULL);
+		} else {
+			test_assert(*bpp != NULL);
+			test_assert(strcmp(*bpp, token)  == 0);
+		}
+		bpp++;
+	}
+	fts_filter_unref(&stemmer);
+	fts_filter_unref(&normalizer);
+	fts_filter_unref(&filter);
+	test_assert(stemmer == NULL);
+	test_assert(filter == NULL);
+	test_assert(normalizer == NULL);
+	test_end();
+}
 #endif
 #endif
 
@@ -741,6 +793,7 @@
 		test_fts_filter_normalizer_invalid_id,
 #ifdef HAVE_FTS_STEMMER
 		test_fts_filter_normalizer_stopwords_stemmer_eng,
+		test_fts_filter_stopwords_normalizer_stemmer_sv,
 #endif
 #endif
 		test_fts_filter_english_possessive,
diff -r 01bd19136b2c -r c7f9d3d8f278 src/lib-fts/test-fts-language.c
--- a/src/lib-fts/test-fts-language.c	Tue Nov 17 11:42:59 2015 +0200
+++ b/src/lib-fts/test-fts-language.c	Tue Nov 17 11:43:28 2015 +0200
@@ -134,6 +134,32 @@
 	test_end();
 }
 
+/* Detect Swedish */
+static void test_fts_language_detect_swedish(void)
+{
+	struct fts_language_list *lp = NULL;
+	const struct fts_language *lang_r = NULL;
+	const unsigned char swedish[]  =
+		"Artikel 1."\
+		"Alla m\xC3\xA4nniskor \xC3\xA4ro f\xC3\xB6""dda fria och lika"\
+		" i v\xC3\xA4rde och r\xC3\xA4ttigheter. De \xC3\xA4ro "\
+		"utrustade med f\xC3\xB6rnuft och samvete och b\xC3\xB6ra "\
+		"handla gentemot varandra i en anda av broderskap.";
+
+
+
+	const char names[] = "fi, de, sv, fr, en";
+	const char *unknown, *error;
+	test_begin("fts language detect Swedish");
+	test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+	test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+	test_assert(fts_language_detect(lp, swedish, sizeof(swedish)-1, &lang_r)
+	            == FTS_LANGUAGE_RESULT_OK);
+	test_assert(strcmp(lang_r->name, "sv") == 0);
+	fts_language_list_deinit(&lp);
+	test_end();
+}
+
 /* Detect Finnish as English */
 static void test_fts_language_detect_finnish_as_english(void)
 {
@@ -212,6 +238,7 @@
 		test_fts_language_detect_english,
 		test_fts_language_detect_french,
 		test_fts_language_detect_german,
+		test_fts_language_detect_swedish,
 		test_fts_language_detect_finnish_as_english,
 		test_fts_language_detect_na,
 		test_fts_language_detect_unknown,


More information about the dovecot-cvs mailing list