dovecot-2.2: Initial import for lib-fts.

Mon Apr 20 13:24:08 UTC 2015

details:   http://hg.dovecot.org/dovecot-2.2/rev/81e5b977e5c5
changeset: 18414:81e5b977e5c5
user:      Timo Sirainen <tss at iki.fi>
date:      Mon Apr 20 16:19:07 2015 +0300
description:
Initial import for lib-fts.
Parts of what this code does were already implemented internally by
fts-lucene. lib-fts is intended to be usable for all the FTS backends. The
APIs are still going to change a bit, but hopefully not after v2.2.17
release.

Mostly written by Teemu Huovila.

diffstat:

 .hgignore                                   |    4 +
 configure.ac                                |   17 +
 src/Makefile.am                             |    1 +
 src/lib-fts/Makefile.am                     |  111 +++++
 src/lib-fts/fts-filter-normalizer.c         |  318 ++++++++++++++
 src/lib-fts/fts-filter-private.h            |   31 +
 src/lib-fts/fts-filter-stemmer-snowball.c   |  124 +++++
 src/lib-fts/fts-filter-stopwords.c          |  153 +++++++
 src/lib-fts/fts-filter.c                    |  109 +++++
 src/lib-fts/fts-filter.h                    |   61 ++
 src/lib-fts/fts-language.c                  |  271 ++++++++++++
 src/lib-fts/fts-language.h                  |   56 ++
 src/lib-fts/fts-tokenizer-address.c         |  356 ++++++++++++++++
 src/lib-fts/fts-tokenizer-generic-private.h |   80 +++
 src/lib-fts/fts-tokenizer-generic.c         |  596 ++++++++++++++++++++++++++++
 src/lib-fts/fts-tokenizer-private.h         |   43 ++
 src/lib-fts/fts-tokenizer.c                 |  173 ++++++++
 src/lib-fts/fts-tokenizer.h                 |   60 ++
 src/lib-fts/stopwords_en.txt                |   54 ++
 src/lib-fts/stopwords_fr.txt                |  178 ++++++++
 src/lib-fts/test-fts-filter.c               |  551 +++++++++++++++++++++++++
 src/lib-fts/test-fts-language.c             |  228 ++++++++++
 src/lib-fts/test-fts-tokenizer.c            |  532 ++++++++++++++++++++++++
 src/lib-fts/udhr_fra.txt                    |  217 ++++++++++
 src/lib-fts/word-boundary-data.sh           |   99 ++++
 src/lib-fts/word-break-data.sh              |   77 +++
 26 files changed, 4500 insertions(+), 0 deletions(-)

diffs (truncated from 4636 to 300 lines):

diff -r 68c5e0db61db -r 81e5b977e5c5 .hgignore

--- a/.hgignore	Mon Apr 20 15:27:02 2015 +0300
+++ b/.hgignore	Mon Apr 20 16:19:07 2015 +0300
@@ -78,6 +78,10 @@
 src/ipc/ipc
 src/lib/unicodemap.c
 src/lib/UnicodeData.txt
+src/lib-fts/PropList.txt
+src/lib-fts/WordBreakProperty.txt
+src/lib-fts/word-boundary-data.c
+src/lib-fts/word-break-data.c
 src/lib-dict/dict-drivers-register.c
 src/lib-sql/sql-drivers-register.c
 src/lib-storage/register/mail-storage-register.c
diff -r 68c5e0db61db -r 81e5b977e5c5 configure.ac
--- a/configure.ac	Mon Apr 20 15:27:02 2015 +0300
+++ b/configure.ac	Mon Apr 20 16:19:07 2015 +0300
@@ -169,6 +169,11 @@
   TEST_WITH(textcat, $withval),
   want_textcat=auto)
 
+AC_ARG_WITH(normalizer,
+AS_HELP_STRING([--with-normalizer], [Build lib-fts with ICU normalization support (auto)]),
+  want_fts_normalizer=$withval,
+  want_fts_normalizer=auto)
+
 AC_ARG_WITH(solr,
 AS_HELP_STRING([--with-solr], [Build with Solr full text search support]),
   TEST_WITH(solr, $withval),
@@ -2786,6 +2791,17 @@
 AM_CONDITIONAL(BUILD_FTS_TEXTCAT, test "$have_fts_textcat" = "yes")
 AM_CONDITIONAL(BUILD_FTS_EXTTEXTCAT, test "$have_fts_exttextcat" = "yes")
 
+if test "$want_fts_normalizer" != "no"; then
+  if test "$PKG_CONFIG" != "" && $PKG_CONFIG --exists icu-i18n 2>/dev/null; then
+    PKG_CHECK_MODULES(LIBFTS_NORMALIZER, icu-i18n)
+    have_fts_normalizer=yes
+    AC_DEFINE(HAVE_LIBFTS_NORMALIZER,, Define if you want ICU normalization support for FTS)
+  elif test "$want_fts_normalizer" = "yes"; then
+    AC_ERROR([Can't build with normalizer support: libicu-i18n not found])
+  fi
+fi
+AM_CONDITIONAL(BUILD_FTS_NORMALIZER, test "$have_fts_normalizer" = "yes")
+
 if test $have_lucene = no; then
   not_fts="$not_fts lucene"
 fi
@@ -2857,6 +2873,7 @@
 src/lib-dict/Makefile
 src/lib-dns/Makefile
 src/lib-fs/Makefile
+src/lib-fts/Makefile
 src/lib-http/Makefile
 src/lib-imap/Makefile
 src/lib-imap-storage/Makefile
diff -r 68c5e0db61db -r 81e5b977e5c5 src/Makefile.am
--- a/src/Makefile.am	Mon Apr 20 15:27:02 2015 +0300
+++ b/src/Makefile.am	Mon Apr 20 16:19:07 2015 +0300
@@ -19,6 +19,7 @@
 SUBDIRS = \
 	$(LIBDOVECOT_SUBDIRS) \
 	lib-dovecot \
+	lib-fts \
 	lib-imap-client \
 	lib-imap-urlauth \
 	lib-compression \
diff -r 68c5e0db61db -r 81e5b977e5c5 src/lib-fts/Makefile.am
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/Makefile.am	Mon Apr 20 16:19:07 2015 +0300
@@ -0,0 +1,111 @@
+noinst_LTLIBRARIES = libfts.la
+
+AM_CPPFLAGS = \
+	-I$(top_srcdir)/src/lib \
+	-I$(top_srcdir)/src/lib-test \
+	$(LIBEXTTEXTCAT_CFLAGS) \
+	$(LIBFTS_NORMALIZER_CFLAGS) \
+	-DUDHRDIR=\""$(top_srcdir)/src/lib-fts"\" \
+	-DDATADIR=\"$(pkgdatadir)\" \
+	-DTEST_TEXTCAT_DIR=\""$(top_srcdir)/ext/libexttextcat/langclass"\" \
+	-DTEST_STOPWORDS_DIR=\""$(top_srcdir)/src/lib-fts"\"
+
+stopwordsdir = $(datadir)/${PACKAGE_TARNAME}/stopwords
+dist_stopwords_DATA = stopwords_en.txt stopwords_fr.txt
+
+BUILT_SOURCES = word-boundary-data.c word-break-data.c
+
+EXTRA_DIST = \
+	WordBreakProperty.txt \
+	word-boundary-data.sh \
+	word-boundary-data.c \
+	word-break-data.sh \
+	word-break-data.c
+
+WordBreakProperty.txt:
+	test -f WordBreakProperty.txt || wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
+$(srcdir)/word-boundary-data.c: word-boundary-data.sh WordBreakProperty.txt
+	$(srcdir)/word-boundary-data.sh < WordBreakProperty.txt > $@
+
+PropList.txt:
+	test -f PropList.txt || wget http://www.unicode.org/Public/UNIDATA/PropList.txt
+$(srcdir)/word-break-data.c: word-break-data.sh PropList.txt
+	$(srcdir)/word-break-data.sh < PropList.txt > $@
+
+
+if BUILD_FTS_STEMMER
+STEMMER_LIBS = -lstemmer
+endif
+
+if BUILD_FTS_EXTTEXTCAT
+TEXTCAT_LIBS = $(LIBEXTTEXTCAT_LIBS)
+else
+if BUILD_FTS_TEXTCAT
+TEXTCAT_LIBS = -ltextcat
+endif
+endif
+
+if BUILD_FTS_NORMALIZER
+NORMALIZER_LIBS = $(LIBFTS_NORMALIZER_LIBS)
+endif
+
+libfts_la_LIBADD = \
+	$(STEMMER_LIBS) \
+	$(TEXTCAT_LIBS) \
+	$(NORMALIZER_LIBS)
+
+libfts_la_SOURCES = \
+	fts-filter.c \
+	fts-filter-normalizer.c \
+	fts-filter-stopwords.c \
+	fts-filter-stemmer-snowball.c \
+	fts-language.c \
+	fts-tokenizer.c \
+	fts-tokenizer-address.c \
+	fts-tokenizer-generic.c
+
+noinst_HEADERS = \
+	fts-filter.h \
+	fts-filter-private.h \
+	fts-language.h \
+	fts-tokenizer.h \
+	fts-tokenizer-private.h \
+	fts-tokenizer-generic-private.h
+
+test_programs = \
+	test-fts-filter \
+	$(TEST_FTS_LANGUAGE) \
+	$(TEST_FTS_NORMALIZER) \
+	test-fts-tokenizer
+
+noinst_PROGRAMS = $(test_programs)
+
+test_libs = \
+	../lib-test/libtest.la \
+	../lib/liblib.la
+test_deps = $(noinst_LTLIBRARIES) $(test_libs)
+
+filter_deps = \
+	fts-filter.lo fts-filter-stopwords.lo \
+	fts-filter-stemmer-snowball.lo fts-filter-normalizer.lo
+
+test_fts_filter_SOURCES = test-fts-filter.c
+test_fts_filter_LDADD = $(filter_deps) $(test_libs) $(STEMMER_LIBS) $(TEXTCAT_LIBS) $(NORMALIZER_LIBS)
+test_fts_filter_DEPENDENCIES = $(test_deps) $(filter_deps)
+
+if BUILD_FTS_EXTTEXTCAT
+TEST_FTS_LANGUAGE = test-fts-language
+test_fts_language_SOURCES = test-fts-language.c
+test_fts_language_LDADD = fts-language.lo $(test_libs) $(TEXTCAT_LIBS)
+test_fts_language_DEPENDENCIES = $(test_deps)
+endif
+
+test_fts_tokenizer_SOURCES = test-fts-tokenizer.c
+test_fts_tokenizer_LDADD = fts-tokenizer.lo fts-tokenizer-generic.lo fts-tokenizer-address.lo $(test_libs)
+test_fts_tokenizer_DEPENDENCIES = $(test_deps)
+
+check: check-am check-test
+check-test: all-am
+	for bin in $(test_programs); do \
+	  if ! $(RUN_TEST) ./$$bin; then exit 1; fi; \
+	done
diff -r 68c5e0db61db -r 81e5b977e5c5 src/lib-fts/fts-filter-normalizer.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/fts-filter-normalizer.c	Mon Apr 20 16:19:07 2015 +0300
@@ -0,0 +1,318 @@
+/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unichar.h" /* unicode replacement char */
+#include "fts-filter.h"
+#include "fts-filter-private.h"
+#include "fts-language.h"
+
+#ifdef HAVE_LIBFTS_NORMALIZER
+
+#include <unicode/utrans.h>
+#include <unicode/uenum.h>
+#include <unicode/ustring.h>
+#include <unicode/ucnv.h>
+#include <stdlib.h>
+
+/* Per-instance state of the normalizer filter. */
+struct fts_filter_normalizer {
+	/* Generic filter part.
+	   NOTE(review): presumably has to remain the first member so that a
+	   struct fts_filter pointer can be converted back to this struct -
+	   confirm against the create/destroy code. */
+	struct fts_filter filter;
+	const char *error;
+	pool_t pool;
+	/* ICU transliterator handle (type from <unicode/utrans.h>) */
+	UTransliterator *transliterator;
+};
+
+/* Describe a failed ICU call in *error_r. func is the name of the ICU
+   function that produced err. Nothing is written when error_r is NULL or
+   when err is not a failure code. */
+static void
+icu_error(const char **error_r, const UErrorCode err, const char *func)
+{
+	const char *reason;
+
+	if (error_r == NULL || !U_FAILURE(err))
+		return;
+
+	reason = u_errorName(err);
+	*error_r = t_strdup_printf("Lib ICU function %s failed: %s\n",
+				   func, reason);
+}
+
+/* Thin wrapper for vprintf */
+static void ATTR_FORMAT(2, 3)
+fts_filter_normalizer_error(const char **error_r, const char *format, ...)
+{
+	va_list args;
+
+	if (error_r == NULL)
+		return;
+
+	va_start(args, format);
+	*error_r = t_strdup_vprintf(format, args);
+	va_end(args);
+}
+
+/* Convert the NUL-terminated UTF-8 string src into UTF-16 (UChar), the
+   input format libicu wants. The buffer is obtained with t_malloc() and
+   stored in *dst.
+
+   *dst_uchars_r is used in both directions:
+    - On input, a value larger than the converted length asks for that
+      many UChar units to be allocated instead of the minimum. A value
+      that is too small (or not positive) is ignored.
+    - On return it holds the number of UChar units allocated for *dst,
+      including one extra unit for the terminator. It is NOT a byte count
+      and NOT the length of the text.
+
+   Always returns 0; an ICU conversion failure ends in i_panic(). */
+static int make_uchar(const char *src, UChar **dst, int32_t *dst_uchars_r)
+{
+	UErrorCode status = U_ZERO_ERROR;
+	int32_t src_len = strlen(src);
+	int32_t ustr_len = 0;
+	int32_t ustr_len_actual = 0;
+	int32_t capacity;
+	UChar *retp;
+
+	i_assert(dst_uchars_r != NULL);
+
+	/* Preflight call: with no destination buffer only the required
+	   length is reported in ustr_len. */
+	retp = u_strFromUTF8(NULL, 0, &ustr_len, src, src_len, &status);
+
+	/* A successful preflight reports U_BUFFER_OVERFLOW_ERROR, so only
+	   other failures are fatal. */
+	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
+		i_panic("Failed to estimate allocation size with lib ICU"
+		        " u_strFromUTF8(): %s",u_errorName(status));
+	}
+	i_assert(NULL == retp);
+
+	/* Use the caller's size hint only when it exceeds what is needed. */
+	capacity = ustr_len;
+	if (*dst_uchars_r > 0 && *dst_uchars_r > ustr_len)
+		capacity = *dst_uchars_r;
+	capacity++; /* one extra UChar for the terminating NUL */
+
+	*dst = t_malloc(capacity * sizeof(UChar));
+	*dst_uchars_r = capacity;
+
+	/* Real conversion into the freshly allocated buffer. */
+	status = U_ZERO_ERROR;
+	retp = u_strFromUTF8(*dst, capacity, &ustr_len_actual,
+			     src, src_len, &status);
+	if (U_FAILURE(status))
+		i_panic("Lib ICU u_strFromUTF8 failed: %s", u_errorName(status));
+
+	i_assert(retp == *dst);
+	i_assert(ustr_len == ustr_len_actual);
+	return 0;
+}
+
+static int make_utf8(const UChar *src, char **dst, const char **error_r)
+{
+	char *retp = NULL;
+	int32_t dsize = 0;
+	int32_t dsize_actual = 0;
+	int32_t sub_num = 0;
+	UErrorCode err = U_ZERO_ERROR;
+	int32_t usrc_len = u_strlen(src); /* libicu selects different codepaths
+	                                     depending if srclen -1 or not */
+
+	retp = u_strToUTF8WithSub(NULL, 0, &dsize, src, usrc_len,
+	                          UNICODE_REPLACEMENT_CHAR, &sub_num, &err);
+
+	/* Preflighting can cause buffer overflow to be reported */
+	if (U_BUFFER_OVERFLOW_ERROR != err && U_FAILURE(err)) {