dovecot-2.2: lib-fts: Add Swedish (sv) to supported languages.
dovecot at dovecot.org
dovecot at dovecot.org
Tue Nov 17 09:56:21 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/c7f9d3d8f278
changeset: 19372:c7f9d3d8f278
user: Teemu Huovila <teemu.huovila at dovecot.fi>
date: Tue Nov 17 11:43:28 2015 +0200
description:
lib-fts: Add Swedish (sv) to supported languages.
diffstat:
src/lib-fts/Makefile.am | 3 +-
src/lib-fts/fts-language.c | 1 +
src/lib-fts/stopwords_sv.txt | 131 ++++++++++++++++++++++++++++++++++++++++
src/lib-fts/test-fts-filter.c | 53 ++++++++++++++++
src/lib-fts/test-fts-language.c | 27 ++++++++
5 files changed, 214 insertions(+), 1 deletions(-)
diffs (280 lines):
diff -r 01bd19136b2c -r c7f9d3d8f278 src/lib-fts/Makefile.am
--- a/src/lib-fts/Makefile.am Tue Nov 17 11:42:59 2015 +0200
+++ b/src/lib-fts/Makefile.am Tue Nov 17 11:43:28 2015 +0200
@@ -14,7 +14,8 @@
dist_stopwords_DATA = \
stopwords_en.txt \
stopwords_fi.txt \
- stopwords_fr.txt
+ stopwords_fr.txt \
+ stopwords_sv.txt
BUILT_SOURCES = word-boundary-data.c word-break-data.c
diff -r 01bd19136b2c -r c7f9d3d8f278 src/lib-fts/fts-language.c
--- a/src/lib-fts/fts-language.c Tue Nov 17 11:42:59 2015 +0200
+++ b/src/lib-fts/fts-language.c Tue Nov 17 11:43:28 2015 +0200
@@ -41,6 +41,7 @@
{ "pt" },
{ "ro" },
{ "ru" },
+ { "sv" }
};
const struct fts_language fts_language_data = {
diff -r 01bd19136b2c -r c7f9d3d8f278 src/lib-fts/stopwords_sv.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/stopwords_sv.txt Tue Nov 17 11:43:28 2015 +0200
@@ -0,0 +1,131 @@
+ | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt
+ | This file is distributed under the BSD License.
+ | See http://snowball.tartarus.org/license.php
+ | Also see http://www.opensource.org/licenses/bsd-license.html
+ | - Encoding was converted to UTF-8.
+ | - This notice was added.
+
+ | A Swedish stop word list. Comments begin with vertical bar. Each stop
+ | word is at the start of a line.
+
+ | This is a ranked list (commonest to rarest) of stopwords derived from
+ | a large text sample.
+
+ | Swedish stop words occasionally exhibit homonym clashes. For example
+ | så = so, but also seed. These are indicated clearly below.
+
+och | and
+det | it, this/that
+att | to (with infinitive)
+i | in, at
+en | a
+jag | I
+hon | she
+som | who, that
+han | he
+på | on
+den | it, this/that
+med | with
+var | where, each
+sig | him(self) etc
+för | for
+så | so (also: seed)
+till | to
+är | is
+men | but
+ett | a
+om | if; around, about
+hade | had
+de | they, these/those
+av | of
+icke | not, no
+mig | me
+du | you
+henne | her
+då | then, when
+sin | his
+nu | now
+har | have
+inte | inte någon = no one
+hans | his
+honom | him
+skulle | 'sake'
+hennes | her
+där | there
+min | my
+man | one (pronoun)
+ej | nor
+vid | at, by, on (also: vast)
+kunde | could
+något | some etc
+från | from, off
+ut | out
+när | when
+efter | after, behind
+upp | up
+vi | we
+dem | them
+vara | be
+vad | what
+över | over
+än | than
+dig | you
+kan | can
+sina | his
+här | here
+ha | have
+mot | towards
+alla | all
+under | under (also: wonder)
+någon | some etc
+eller | or (else)
+allt | all
+mycket | much
+sedan | since
+ju | why
+denna | this/that
+själv | myself, yourself etc
+detta | this/that
+åt | to
+utan | without
+varit | was
+hur | how
+ingen | no
+mitt | my
+ni | you
+bli | to be, become
+blev | from bli
+oss | us
+din | thy
+dessa | these/those
+några | some etc
+deras | their
+blir | from bli
+mina | my
+samma | (the) same
+vilken | who, that
+er | you, your
+sådan | such a
+vår | our
+blivit | from bli
+dess | its
+inom | within
+mellan | between
+sådant | such a
+varför | why
+varje | each
+vilka | who, that
+ditt | thy
+vem | who
+vilket | who, that
+sitta | his
+sådana | such a
+vart | each
+dina | thy
+vars | whose
+vårt | our
+våra | our
+ert | your
+era | your
+vilkas | whose
+
diff -r 01bd19136b2c -r c7f9d3d8f278 src/lib-fts/test-fts-filter.c
--- a/src/lib-fts/test-fts-filter.c Tue Nov 17 11:42:59 2015 +0200
+++ b/src/lib-fts/test-fts-filter.c Tue Nov 17 11:43:28 2015 +0200
@@ -13,6 +13,7 @@
static const char *const stopword_settings[] = {"stopwords_dir", TEST_STOPWORDS_DIR, NULL};
static struct fts_language english_language = { .name = "en" };
static struct fts_language french_language = { .name = "fr" };
+static struct fts_language swedish_language = { .name = "sv" };
static void test_fts_filter_find(void)
{
@@ -653,6 +654,57 @@
test_assert(normalizer == NULL);
test_end();
}
+
+static void test_fts_filter_stopwords_normalizer_stemmer_sv(void)
+{
+ int ret;
+ struct fts_filter *normalizer;
+ struct fts_filter *stemmer;
+ struct fts_filter *filter;
+ const char *error;
+ const char *token = NULL;
+ const char * const tokens[] = {
+ "Enär", "erkännandet", "av", "det", "inneboende", "värdet",
+ "hos", "alla", "medlemmar", "av", "människosläktet", "och",
+ "av", "deras", "lika", "och", "oförytterliga", "rättigheter",
+ "är", "grundvalen", "för", "frihet", "rättvisa", "och", "fred",
+ "i", "världen", NULL};
+ const char * const bases[] = {
+ "enar", "erkan", NULL, NULL, "inneboend", "vardet", "hos", NULL,
+ "medlemm", NULL, "manniskoslaktet", NULL, NULL, NULL, "lik",
+ NULL, "oforytter", "ratt", NULL, "grundval", NULL, "frihet",
+ "rattvis", NULL, "fred", NULL, "varld", NULL};
+ const char * const *tpp;
+ const char * const *bpp;
+
+ test_begin("fts filters with stopwords, default normalizer and stemming chained, Swedish");
+
+
+ test_assert(fts_filter_create(fts_filter_stopwords, NULL, &swedish_language, stopword_settings, &filter, &error) == 0);
+ test_assert(fts_filter_create(fts_filter_normalizer_icu, filter, NULL, NULL, &normalizer, &error) == 0);
+ test_assert(fts_filter_create(fts_filter_stemmer_snowball, normalizer, &swedish_language, NULL, &stemmer, &error) == 0);
+
+ bpp = bases;
+ for (tpp = tokens; *tpp != NULL; tpp++) {
+ token = *tpp;
+ ret = fts_filter_filter(stemmer, &token, &error);
+ if (ret <= 0) {
+ test_assert(ret == 0);
+ test_assert(*bpp == NULL);
+ } else {
+ test_assert(*bpp != NULL);
+ test_assert(strcmp(*bpp, token) == 0);
+ }
+ bpp++;
+ }
+ fts_filter_unref(&stemmer);
+ fts_filter_unref(&normalizer);
+ fts_filter_unref(&filter);
+ test_assert(stemmer == NULL);
+ test_assert(filter == NULL);
+ test_assert(normalizer == NULL);
+ test_end();
+}
#endif
#endif
@@ -741,6 +793,7 @@
test_fts_filter_normalizer_invalid_id,
#ifdef HAVE_FTS_STEMMER
test_fts_filter_normalizer_stopwords_stemmer_eng,
+ test_fts_filter_stopwords_normalizer_stemmer_sv,
#endif
#endif
test_fts_filter_english_possessive,
diff -r 01bd19136b2c -r c7f9d3d8f278 src/lib-fts/test-fts-language.c
--- a/src/lib-fts/test-fts-language.c Tue Nov 17 11:42:59 2015 +0200
+++ b/src/lib-fts/test-fts-language.c Tue Nov 17 11:43:28 2015 +0200
@@ -134,6 +134,32 @@
test_end();
}
+/* Detect Swedish */
+static void test_fts_language_detect_swedish(void)
+{
+ struct fts_language_list *lp = NULL;
+ const struct fts_language *lang_r = NULL;
+ const unsigned char swedish[] =
+ "Artikel 1."\
+ "Alla m\xC3\xA4nniskor \xC3\xA4ro f\xC3\xB6""dda fria och lika"\
+ " i v\xC3\xA4rde och r\xC3\xA4ttigheter. De \xC3\xA4ro "\
+ "utrustade med f\xC3\xB6rnuft och samvete och b\xC3\xB6ra "\
+ "handla gentemot varandra i en anda av broderskap.";
+
+
+
+ const char names[] = "fi, de, sv, fr, en";
+ const char *unknown, *error;
+ test_begin("fts language detect Swedish");
+ test_assert(fts_language_list_init(settings, &lp, &error) == 0);
+ test_assert(fts_language_list_add_names(lp, names, &unknown) == TRUE);
+ test_assert(fts_language_detect(lp, swedish, sizeof(swedish)-1, &lang_r)
+ == FTS_LANGUAGE_RESULT_OK);
+ test_assert(strcmp(lang_r->name, "sv") == 0);
+ fts_language_list_deinit(&lp);
+ test_end();
+}
+
/* Detect Finnish as English */
static void test_fts_language_detect_finnish_as_english(void)
{
@@ -212,6 +238,7 @@
test_fts_language_detect_english,
test_fts_language_detect_french,
test_fts_language_detect_german,
+ test_fts_language_detect_swedish,
test_fts_language_detect_finnish_as_english,
test_fts_language_detect_na,
test_fts_language_detect_unknown,
More information about the dovecot-cvs
mailing list