dovecot-2.2: Initial import for lib-fts.
dovecot at dovecot.org
dovecot at dovecot.org
Mon Apr 20 13:24:08 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/81e5b977e5c5
changeset: 18414:81e5b977e5c5
user: Timo Sirainen <tss at iki.fi>
date: Mon Apr 20 16:19:07 2015 +0300
description:
Initial import for lib-fts.
Parts of what this code does were already implemented internally by
fts-lucene. lib-fts is intended to be usable for all the FTS backends. The
APIs are still going to change a bit, but hopefully not after the v2.2.17
release.
Mostly written by Teemu Huovila.
diffstat:
.hgignore | 4 +
configure.ac | 17 +
src/Makefile.am | 1 +
src/lib-fts/Makefile.am | 111 +++++
src/lib-fts/fts-filter-normalizer.c | 318 ++++++++++++++
src/lib-fts/fts-filter-private.h | 31 +
src/lib-fts/fts-filter-stemmer-snowball.c | 124 +++++
src/lib-fts/fts-filter-stopwords.c | 153 +++++++
src/lib-fts/fts-filter.c | 109 +++++
src/lib-fts/fts-filter.h | 61 ++
src/lib-fts/fts-language.c | 271 ++++++++++++
src/lib-fts/fts-language.h | 56 ++
src/lib-fts/fts-tokenizer-address.c | 356 ++++++++++++++++
src/lib-fts/fts-tokenizer-generic-private.h | 80 +++
src/lib-fts/fts-tokenizer-generic.c | 596 ++++++++++++++++++++++++++++
src/lib-fts/fts-tokenizer-private.h | 43 ++
src/lib-fts/fts-tokenizer.c | 173 ++++++++
src/lib-fts/fts-tokenizer.h | 60 ++
src/lib-fts/stopwords_en.txt | 54 ++
src/lib-fts/stopwords_fr.txt | 178 ++++++++
src/lib-fts/test-fts-filter.c | 551 +++++++++++++++++++++++++
src/lib-fts/test-fts-language.c | 228 ++++++++++
src/lib-fts/test-fts-tokenizer.c | 532 ++++++++++++++++++++++++
src/lib-fts/udhr_fra.txt | 217 ++++++++++
src/lib-fts/word-boundary-data.sh | 99 ++++
src/lib-fts/word-break-data.sh | 77 +++
26 files changed, 4500 insertions(+), 0 deletions(-)
diffs (truncated from 4636 to 300 lines):
diff -r 68c5e0db61db -r 81e5b977e5c5 .hgignore
--- a/.hgignore Mon Apr 20 15:27:02 2015 +0300
+++ b/.hgignore Mon Apr 20 16:19:07 2015 +0300
@@ -78,6 +78,10 @@
src/ipc/ipc
src/lib/unicodemap.c
src/lib/UnicodeData.txt
+src/lib-fts/PropList.txt
+src/lib-fts/WordBreakProperty.txt
+src/lib-fts/word-boundary-data.c
+src/lib-fts/word-break-data.c
src/lib-dict/dict-drivers-register.c
src/lib-sql/sql-drivers-register.c
src/lib-storage/register/mail-storage-register.c
diff -r 68c5e0db61db -r 81e5b977e5c5 configure.ac
--- a/configure.ac Mon Apr 20 15:27:02 2015 +0300
+++ b/configure.ac Mon Apr 20 16:19:07 2015 +0300
@@ -169,6 +169,11 @@
TEST_WITH(textcat, $withval),
want_textcat=auto)
+AC_ARG_WITH(normalizer,
+AS_HELP_STRING([--with-normalizer], [Build lib-fts with ICU normalization support (auto)]),
+ want_fts_normalizer=$withval,
+ want_fts_normalizer=auto)
+
AC_ARG_WITH(solr,
AS_HELP_STRING([--with-solr], [Build with Solr full text search support]),
TEST_WITH(solr, $withval),
@@ -2786,6 +2791,17 @@
AM_CONDITIONAL(BUILD_FTS_TEXTCAT, test "$have_fts_textcat" = "yes")
AM_CONDITIONAL(BUILD_FTS_EXTTEXTCAT, test "$have_fts_exttextcat" = "yes")
+if test "$want_fts_normalizer" != "no"; then # ICU normalizer for lib-fts: "auto" (default) or "yes"
+  if test "$PKG_CONFIG" != "" && $PKG_CONFIG --exists icu-i18n 2>/dev/null; then
+    PKG_CHECK_MODULES(LIBFTS_NORMALIZER, icu-i18n) # sets LIBFTS_NORMALIZER_CFLAGS/_LIBS, the names src/lib-fts/Makefile.am uses
+    have_fts_normalizer=yes
+    AC_DEFINE(HAVE_LIBFTS_NORMALIZER,, Define if you want ICU normalization support for FTS) # name tested by fts-filter-normalizer.c
+  elif test "$want_fts_normalizer" = "yes"; then # explicitly requested but ICU is missing: fail instead of silently disabling
+    AC_MSG_ERROR([Can't build with normalizer support: libicu-i18n not found])
+  fi
+fi
+AM_CONDITIONAL(BUILD_FTS_NORMALIZER, test "$have_fts_normalizer" = "yes")
+
if test $have_lucene = no; then
not_fts="$not_fts lucene"
fi
@@ -2857,6 +2873,7 @@
src/lib-dict/Makefile
src/lib-dns/Makefile
src/lib-fs/Makefile
+src/lib-fts/Makefile
src/lib-http/Makefile
src/lib-imap/Makefile
src/lib-imap-storage/Makefile
diff -r 68c5e0db61db -r 81e5b977e5c5 src/Makefile.am
--- a/src/Makefile.am Mon Apr 20 15:27:02 2015 +0300
+++ b/src/Makefile.am Mon Apr 20 16:19:07 2015 +0300
@@ -19,6 +19,7 @@
SUBDIRS = \
$(LIBDOVECOT_SUBDIRS) \
lib-dovecot \
+ lib-fts \
lib-imap-client \
lib-imap-urlauth \
lib-compression \
diff -r 68c5e0db61db -r 81e5b977e5c5 src/lib-fts/Makefile.am
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/Makefile.am Mon Apr 20 16:19:07 2015 +0300
@@ -0,0 +1,111 @@
+noinst_LTLIBRARIES = libfts.la
+
+AM_CPPFLAGS = \
+ -I$(top_srcdir)/src/lib \
+ -I$(top_srcdir)/src/lib-test \
+ $(LIBEXTTEXTCAT_CFLAGS) \
+ $(LIBFTS_NORMALIZER_CFLAGS) \
+ -DUDHRDIR=\""$(top_srcdir)/src/lib-fts"\" \
+ -DDATADIR=\"$(pkgdatadir)\" \
+ -DTEST_TEXTCAT_DIR=\""$(top_srcdir)/ext/libexttextcat/langclass"\" \
+ -DTEST_STOPWORDS_DIR=\""$(top_srcdir)/src/lib-fts"\"
+
+stopwordsdir = $(datadir)/${PACKAGE_TARNAME}/stopwords
+dist_stopwords_DATA = stopwords_en.txt stopwords_fr.txt
+
+BUILT_SOURCES = word-boundary-data.c word-break-data.c
+
+EXTRA_DIST = \
+ WordBreakProperty.txt \
+ word-boundary-data.sh \
+ word-boundary-data.c \
+ word-break-data.sh \
+ word-break-data.c
+
+WordBreakProperty.txt:
+ test -f WordBreakProperty.txt || wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
+$(srcdir)/word-boundary-data.c: word-boundary-data.sh WordBreakProperty.txt
+ $(srcdir)/word-boundary-data.sh < WordBreakProperty.txt > $@
+
+PropList.txt:
+ test -f PropList.txt || wget http://www.unicode.org/Public/UNIDATA/PropList.txt
+$(srcdir)/word-break-data.c: word-break-data.sh PropList.txt
+ $(srcdir)/word-break-data.sh < PropList.txt > $@
+
+
+if BUILD_FTS_STEMMER
+STEMMER_LIBS = -lstemmer
+endif
+
+if BUILD_FTS_EXTTEXTCAT
+TEXTCAT_LIBS = $(LIBEXTTEXTCAT_LIBS)
+else
+if BUILD_FTS_TEXTCAT
+TEXTCAT_LIBS = -ltextcat
+endif
+endif
+
+if BUILD_FTS_NORMALIZER
+NORMALIZER_LIBS = $(LIBFTS_NORMALIZER_LIBS)
+endif
+
+libfts_la_LIBADD = \
+ $(STEMMER_LIBS) \
+ $(TEXTCAT_LIBS) \
+ $(NORMALIZER_LIBS)
+
+libfts_la_SOURCES = \
+ fts-filter.c \
+ fts-filter-normalizer.c \
+ fts-filter-stopwords.c \
+ fts-filter-stemmer-snowball.c \
+ fts-language.c \
+ fts-tokenizer.c \
+ fts-tokenizer-address.c \
+ fts-tokenizer-generic.c
+
+noinst_HEADERS = \
+ fts-filter.h \
+ fts-filter-private.h \
+ fts-language.h \
+ fts-tokenizer.h \
+ fts-tokenizer-private.h \
+ fts-tokenizer-generic-private.h
+
+test_programs = \
+ test-fts-filter \
+ $(TEST_FTS_LANGUAGE) \
+ $(TEST_FTS_NORMALIZER) \
+ test-fts-tokenizer
+
+noinst_PROGRAMS = $(test_programs)
+
+test_libs = \
+ ../lib-test/libtest.la \
+ ../lib/liblib.la
+test_deps = $(noinst_LTLIBRARIES) $(test_libs)
+
+filter_deps = \
+ fts-filter.lo fts-filter-stopwords.lo \
+ fts-filter-stemmer-snowball.lo fts-filter-normalizer.lo
+
+test_fts_filter_SOURCES = test-fts-filter.c
+test_fts_filter_LDADD = $(filter_deps) $(test_libs) $(STEMMER_LIBS) $(TEXTCAT_LIBS) $(NORMALIZER_LIBS)
+test_fts_filter_DEPENDENCIES = $(test_deps) $(filter_deps)
+
+if BUILD_FTS_EXTTEXTCAT
+TEST_FTS_LANGUAGE = test-fts-language
+test_fts_language_SOURCES = test-fts-language.c
+test_fts_language_LDADD = fts-language.lo $(test_libs) $(TEXTCAT_LIBS)
+test_fts_language_DEPENDENCIES = $(test_deps)
+endif
+
+test_fts_tokenizer_SOURCES = test-fts-tokenizer.c
+test_fts_tokenizer_LDADD = fts-tokenizer.lo fts-tokenizer-generic.lo fts-tokenizer-address.lo $(test_libs)
+test_fts_tokenizer_DEPENDENCIES = $(test_deps)
+
+check: check-am check-test
+check-test: all-am
+ for bin in $(test_programs); do \
+ if ! $(RUN_TEST) ./$$bin; then exit 1; fi; \
+ done
diff -r 68c5e0db61db -r 81e5b977e5c5 src/lib-fts/fts-filter-normalizer.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/fts-filter-normalizer.c Mon Apr 20 16:19:07 2015 +0300
@@ -0,0 +1,318 @@
+/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unichar.h" /* unicode replacement char */
+#include "fts-filter.h"
+#include "fts-filter-private.h"
+#include "fts-language.h"
+
+#ifdef HAVE_LIBFTS_NORMALIZER
+
+#include <unicode/utrans.h>
+#include <unicode/uenum.h>
+#include <unicode/ustring.h>
+#include <unicode/ucnv.h>
+#include <stdlib.h>
+
+struct fts_filter_normalizer {
+ struct fts_filter filter; /* generic fts_filter part, kept as the first member */
+ const char *error; /* NOTE(review): presumably the most recent error message; its use is not visible here */
+ pool_t pool; /* NOTE(review): presumably the pool this filter is allocated from - confirm */
+ UTransliterator *transliterator; /* libicu transliterator handle, see <unicode/utrans.h> */
+};
+
+/* Stores a description of a failed libicu call in *error_r.
+
+   error_r may be NULL, in which case nothing is done. Nothing is done
+   either when err is not a failure code, so *error_r keeps its previous
+   value. func is the name of the libicu function that returned err.
+
+   The message is allocated with t_strdup_printf(). It carries no trailing
+   newline, like the i_panic() messages elsewhere in this file: whoever
+   logs the error decides about line termination. */
+static void
+icu_error(const char **error_r, const UErrorCode err, const char *func)
+{
+	if (error_r == NULL || !U_FAILURE(err))
+		return;
+
+	*error_r = t_strdup_printf("Lib ICU function %s failed: %s",
+				   func, u_errorName(err));
+}
+
+/* Formats a printf-style message with t_strdup_vprintf() and stores the
+   result in *error_r. A NULL error_r means the caller is not interested in
+   the message, so nothing is formatted. */
+static void ATTR_FORMAT(2, 3)
+fts_filter_normalizer_error(const char **error_r, const char *format, ...)
+{
+	va_list ap;
+
+	if (error_r != NULL) {
+		va_start(ap, format);
+		*error_r = t_strdup_vprintf(format, ap);
+		va_end(ap);
+	}
+}
+
+/* Helper to create UTF16, which libicu wants as input.
+
+   src is a NUL-terminated UTF-8 string. Byte sequences in src that are
+   not valid UTF-8 are converted to UNICODE_REPLACEMENT_CHAR instead of
+   being treated as a fatal error, which mirrors what make_utf8() does in
+   the other direction.
+
+   On input, if *dst_uchars_r > 0, it indicates the number of UChar
+   sized units that should be allocated for the text. However, the
+   function will not use the number, if the text will not fit in that
+   amount.
+
+   On return *dst points to the NUL-terminated UTF-16 text, allocated
+   with t_malloc(), and *dst_uchars_r contains the number of UChar sized
+   units allocated for it, including the terminator. NOT the number of
+   bytes nor the length of the text.
+
+   Always returns 0. There is no error return: unexpected libicu
+   failures end in i_panic(). */
+static int make_uchar(const char *src, UChar **dst, int32_t *dst_uchars_r)
+{
+	UErrorCode err = U_ZERO_ERROR;
+	size_t src_bytes = strlen(src);
+	int32_t len;
+	int32_t ustr_len = 0;
+	int32_t ustr_len_actual = 0;
+	UChar *retp = NULL;
+	int32_t alloc_uchars = 0;
+
+	i_assert(dst_uchars_r != NULL);
+	/* libicu takes the source length as int32_t; don't silently
+	   truncate a larger size_t. */
+	i_assert(src_bytes <= INT32_MAX);
+	len = (int32_t)src_bytes;
+
+	/* Check length required for encoded dst. */
+	retp = u_strFromUTF8WithSub(NULL, 0, &ustr_len, src, len,
+				    UNICODE_REPLACEMENT_CHAR, NULL, &err);
+
+	/* When preflighting a successful call returns a buffer overflow
+	   error. */
+	if (U_BUFFER_OVERFLOW_ERROR != err && U_FAILURE(err)) {
+		i_panic("Failed to estimate allocation size with lib ICU"
+			" u_strFromUTF8(): %s",u_errorName(err));
+	}
+	i_assert(NULL == retp);
+
+	err = U_ZERO_ERROR;
+	if (*dst_uchars_r > 0 && *dst_uchars_r > ustr_len)
+		alloc_uchars = *dst_uchars_r;
+	else
+		alloc_uchars = ustr_len;
+	/* The increment below must not overflow. */
+	i_assert(alloc_uchars < INT32_MAX);
+	alloc_uchars++; /* room for the terminating NUL UChar */
+	*dst = t_malloc(alloc_uchars * sizeof(UChar));
+	*dst_uchars_r = alloc_uchars;
+	retp = u_strFromUTF8WithSub(*dst, alloc_uchars, &ustr_len_actual,
+				    src, len, UNICODE_REPLACEMENT_CHAR,
+				    NULL, &err);
+
+	if (U_FAILURE(err))
+		i_panic("Lib ICU u_strFromUTF8 failed: %s", u_errorName(err));
+	i_assert(retp == *dst);
+	/* Both passes see the same input, so the preflighted length must
+	   match the converted length. */
+	i_assert(ustr_len == ustr_len_actual);
+	return 0;
+}
+
+static int make_utf8(const UChar *src, char **dst, const char **error_r)
+{
+ char *retp = NULL;
+ int32_t dsize = 0;
+ int32_t dsize_actual = 0;
+ int32_t sub_num = 0;
+ UErrorCode err = U_ZERO_ERROR;
+ int32_t usrc_len = u_strlen(src); /* libicu selects different codepaths
+ depending if srclen -1 or not */
+
+ retp = u_strToUTF8WithSub(NULL, 0, &dsize, src, usrc_len,
+ UNICODE_REPLACEMENT_CHAR, &sub_num, &err);
+
+ /* Preflighting can cause buffer overflow to be reported */
+ if (U_BUFFER_OVERFLOW_ERROR != err && U_FAILURE(err)) {
More information about the dovecot-cvs
mailing list