dovecot-2.2: lib-fts: Renamed normalizer to icu-normalizer, incl...
dovecot at dovecot.org
dovecot at dovecot.org
Tue Apr 21 16:38:05 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/41abc9c7eca2
changeset: 18440:41abc9c7eca2
user: Timo Sirainen <tss at iki.fi>
date: Tue Apr 21 19:36:27 2015 +0300
description:
lib-fts: Renamed normalizer to icu-normalizer, including the source code.
diffstat:
src/lib-fts/Makefile.am | 4 +-
src/lib-fts/fts-filter-normalizer-icu.c | 318 ++++++++++++++++++++++++++++++++
src/lib-fts/fts-filter-normalizer.c | 318 --------------------------------
src/lib-fts/fts-filter.c | 2 +-
src/lib-fts/fts-filter.h | 4 +-
5 files changed, 323 insertions(+), 323 deletions(-)
diffs (truncated from 691 to 300 lines):
diff -r b179bbd226e5 -r 41abc9c7eca2 src/lib-fts/Makefile.am
--- a/src/lib-fts/Makefile.am Tue Apr 21 19:31:14 2015 +0300
+++ b/src/lib-fts/Makefile.am Tue Apr 21 19:36:27 2015 +0300
@@ -60,7 +60,7 @@
libfts_la_SOURCES = \
fts-filter.c \
- fts-filter-normalizer.c \
+ fts-filter-normalizer-icu.c \
fts-filter-stopwords.c \
fts-filter-stemmer-snowball.c \
fts-language.c \
@@ -90,7 +90,7 @@
filter_deps = \
fts-filter.lo fts-filter-stopwords.lo \
- fts-filter-stemmer-snowball.lo fts-filter-normalizer.lo
+ fts-filter-stemmer-snowball.lo fts-filter-normalizer-icu.lo
test_fts_filter_SOURCES = test-fts-filter.c
test_fts_filter_LDADD = $(filter_deps) $(test_libs) $(STEMMER_LIBS) $(TEXTCAT_LIBS) $(NORMALIZER_LIBS)
diff -r b179bbd226e5 -r 41abc9c7eca2 src/lib-fts/fts-filter-normalizer-icu.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/fts-filter-normalizer-icu.c Tue Apr 21 19:36:27 2015 +0300
@@ -0,0 +1,318 @@
+/* Copyright (c) 2014-2015 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "unichar.h" /* unicode replacement char */
+#include "fts-filter.h"
+#include "fts-filter-private.h"
+#include "fts-language.h"
+
+#ifdef HAVE_LIBICU
+
+#include <unicode/utrans.h>
+#include <unicode/uenum.h>
+#include <unicode/ustring.h>
+#include <unicode/ucnv.h>
+#include <stdlib.h>
+
+struct fts_filter_normalizer { /* per-instance state of the ICU normalizer filter */
+ struct fts_filter filter; /* kept first: this file casts struct fts_filter * to this struct */
+ const char *error; /* non-NULL once filtering has failed; the filter function then returns NULL */
+ pool_t pool; /* pool this struct is allocated from; unreferenced on destroy */
+ UTransliterator *transliterator; /* opened on create, closed on destroy when non-NULL */
+};
+
+static void
+icu_error(const char **error_r, const UErrorCode err, const char *func)
+{
+ if (error_r == NULL)
+ return;
+
+ if (U_FAILURE(err)) {
+ *error_r = t_strdup_printf("Lib ICU function %s failed: %s\n",
+ func, u_errorName(err));
+ }
+}
+
+/* Formats a printf-style message with t_strdup_vprintf() and stores it
+   in *error_r. A NULL error_r turns the call into a no-op. */
+static void ATTR_FORMAT(2, 3)
+fts_filter_normalizer_icu_error(const char **error_r, const char *format, ...)
+{
+	va_list ap;
+
+	if (error_r != NULL) {
+		va_start(ap, format);
+		*error_r = t_strdup_vprintf(format, ap);
+		va_end(ap);
+	}
+}
+
+/* Converts the NUL-terminated UTF-8 string src into UTF-16, the input
+   format libicu works on, and stores the new buffer in *dst.
+
+   *dst_uchars_r is both input and output. On input a positive value
+   requests at least that many UChar units for the text; the request is
+   ignored when the converted text needs more. On return it holds the
+   number of UChar units allocated for *dst (the text plus one unit for
+   the terminator), NOT a byte count and NOT the text length.
+
+   Always returns 0: every libicu conversion failure ends in i_panic(). */
+static int make_uchar(const char *src, UChar **dst, int32_t *dst_uchars_r)
+{
+	UErrorCode status = U_ZERO_ERROR;
+	int32_t src_bytes = strlen(src);
+	int32_t needed = 0;
+	int32_t written = 0;
+	int32_t capacity;
+	UChar *ret;
+
+	i_assert(dst_uchars_r != NULL);
+
+	/* Preflight: ask libicu how many UChars the text needs. A
+	   successful preflight reports a buffer overflow, so that one
+	   code is not treated as a failure. */
+	ret = u_strFromUTF8(NULL, 0, &needed, src, src_bytes, &status);
+	if (U_FAILURE(status) && status != U_BUFFER_OVERFLOW_ERROR) {
+		i_panic("Failed to estimate allocation size with lib ICU"
+			" u_strFromUTF8(): %s",u_errorName(status));
+	}
+	i_assert(ret == NULL);
+
+	/* Honour the caller's size hint only if the text fits in it. */
+	capacity = needed;
+	if (*dst_uchars_r > 0 && *dst_uchars_r > needed)
+		capacity = *dst_uchars_r;
+	capacity++; /* room for null bytes(2) */
+
+	*dst = t_malloc(capacity * sizeof(UChar));
+	*dst_uchars_r = capacity;
+
+	status = U_ZERO_ERROR;
+	ret = u_strFromUTF8(*dst, capacity, &written, src, src_bytes, &status);
+	if (U_FAILURE(status))
+		i_panic("Lib ICU u_strFromUTF8 failed: %s", u_errorName(status));
+	i_assert(ret == *dst);
+	i_assert(needed == written);
+	return 0;
+}
+
+/* Converts the NUL-terminated UTF-16 string src to UTF-8 and stores the
+   newly allocated, NUL-terminated result in *dst.
+
+   Returns 0 on success. Returns -1 and sets *error_r (when non-NULL) if
+   libicu had to substitute UNICODE_REPLACEMENT_CHAR for ill-formed
+   input. Any other libicu failure ends in i_panic(). */
+static int make_utf8(const UChar *src, char **dst, const char **error_r)
+{
+	char *retp = NULL;
+	int32_t dsize = 0;
+	int32_t dsize_actual = 0;
+	int32_t sub_num = 0;
+	UErrorCode err = U_ZERO_ERROR;
+	int32_t usrc_len = u_strlen(src); /* libicu selects different codepaths
+					     depending if srclen -1 or not */
+
+	/* Preflight to learn the required number of bytes. */
+	retp = u_strToUTF8WithSub(NULL, 0, &dsize, src, usrc_len,
+				  UNICODE_REPLACEMENT_CHAR, &sub_num, &err);
+
+	/* Preflighting can cause buffer overflow to be reported */
+	if (U_BUFFER_OVERFLOW_ERROR != err && U_FAILURE(err)) {
+		i_panic("Failed to estimate allocation size with lib ICU"
+			" u_strToUTF8WithSub(): %s",u_errorName(err));
+	}
+	/* Substitutions counted while preflighting are deliberately not
+	   asserted on: asserting here would abort the process before the
+	   error return below could report ill-formed input. */
+	i_assert(NULL == retp);
+
+	dsize++; /* room for '\0' byte */
+	*dst = t_malloc(dsize);
+	err = U_ZERO_ERROR;
+	sub_num = 0;
+	retp = u_strToUTF8WithSub(*dst, dsize, &dsize_actual, src, usrc_len,
+				  UNICODE_REPLACEMENT_CHAR, &sub_num, &err);
+	if (U_FAILURE(err))
+		i_panic("Lib ICU u_strToUTF8WithSub() failed: %s",
+			u_errorName(err));
+	if (dsize_actual >= dsize) {
+		i_panic("Produced UTF8 string length (%d) does not fit in "
+			"preflighted(%d). Buffer overflow?",
+			dsize_actual, dsize);
+	}
+	if (0 != sub_num) {
+		fts_filter_normalizer_icu_error(error_r, "UTF8 string not well formed."
+						" Substitutions (%d) were made.", sub_num);
+		return -1;
+	}
+	i_assert(retp == *dst);
+
+	return 0;
+}
+
+/* A language is supported whenever it is given and has a name. */
+static bool fts_filter_normalizer_icu_supports(const struct fts_language *lang)
+{
+	return lang != NULL && lang->name != NULL;
+}
+
+/* Closes the transliterator, if one was opened, and drops the reference
+   to the filter's pool. */
+static void fts_filter_normalizer_icu_destroy(struct fts_filter *filter)
+{
+	struct fts_filter_normalizer *normalizer =
+		(struct fts_filter_normalizer *)filter;
+	UTransliterator *trans = normalizer->transliterator;
+
+	if (trans != NULL)
+		utrans_close(trans);
+	pool_unref(&normalizer->pool);
+}
+
+/* Creates a normalizer filter backed by a libicu transliterator.
+
+   settings is a NULL-terminated list of key, value pairs. The only
+   recognized key is "id", whose value replaces the default
+   transliterator id; any other key is an error.
+
+   Returns 0 and stores the new filter in *filter_r on success. Returns
+   -1 on failure, describing the problem in *error_r when error_r is
+   non-NULL. */
+static int
+fts_filter_normalizer_icu_create(const struct fts_language *lang ATTR_UNUSED,
+				 const char *const *settings,
+				 struct fts_filter **filter_r,
+				 const char **error_r)
+{
+	struct fts_filter_normalizer *np;
+	pool_t pp;
+	UErrorCode err = U_ZERO_ERROR;
+	UParseError perr;
+	UChar *id_uchar = NULL;
+	int32_t id_len_uchar = 0;
+	unsigned int i;
+	const char *id = "Any-Lower; NFKD; [: Nonspacing Mark :] Remove; NFC";
+
+	memset(&perr, 0, sizeof(perr));
+
+	for (i = 0; settings[i] != NULL; i += 2) {
+		const char *key = settings[i], *value = settings[i+1];
+
+		if (strcmp(key, "id") == 0) {
+			id = value;
+		} else {
+			/* Goes through the helper so that a NULL error_r
+			   is tolerated like on every other error path. */
+			fts_filter_normalizer_icu_error(error_r,
+				"Unknown setting: %s", key);
+			return -1;
+		}
+	}
+
+	/* Convert the id before allocating anything, so that a conversion
+	   failure needs no cleanup and is reported, not ignored. */
+	if (make_uchar(id, &id_uchar, &id_len_uchar) < 0) {
+		fts_filter_normalizer_icu_error(error_r,
+			"Failed to convert transliterator id to UChar: %s", id);
+		return -1;
+	}
+
+	pp = pool_alloconly_create(MEMPOOL_GROWING"fts_filter_normalizer",
+				   sizeof(struct fts_filter_normalizer));
+	np = p_new(pp, struct fts_filter_normalizer, 1);
+	np->pool = pp;
+	np->filter = *fts_filter_normalizer_icu;
+
+	np->transliterator = utrans_openU(id_uchar, u_strlen(id_uchar),
+					  UTRANS_FORWARD, NULL, 0,
+					  &perr, &err);
+	if (U_FAILURE(err)) {
+		/* perr.line and perr.offset are int32_t, hence %d. */
+		if (perr.line >= 1) {
+			fts_filter_normalizer_icu_error(error_r,
+				"Failed to open transliterator for id: %s. "
+				"Lib ICU error: %s. "
+				"Parse error on line %d offset %d.",
+				id, u_errorName(err), perr.line, perr.offset);
+		} else {
+			fts_filter_normalizer_icu_error(error_r,
+				"Failed to open transliterator for id: %s. "
+				"Lib ICU error: %s.", id, u_errorName(err));
+		}
+		fts_filter_normalizer_icu_destroy(&np->filter);
+		return -1;
+	}
+	*filter_r = &np->filter;
+	return 0;
+}
+
+/* Runs token through the filter's transliterator.
+
+   Returns the normalized token as a UTF-8 string, or NULL on failure.
+   Once a failure has been recorded in the filter's error field, every
+   later call returns NULL without doing any work. */
+/* TODO: delay errors until _deinit() and return some other values? */
+static const char *
+fts_filter_normalizer_icu_filter(struct fts_filter *filter, const char *token)
+{
+	UErrorCode err = U_ZERO_ERROR;
+	UChar *utext = NULL;
+	int32_t utext_cap = 0;
+	int32_t utext_len = -1;
+	int32_t utext_limit;
+	char *normalized = NULL;
+	struct fts_filter_normalizer *np =
+		(struct fts_filter_normalizer *)filter;
+
+	/* TODO: fix error handling */
+	if (np->error != NULL)
+		return NULL;
+
+	if (make_uchar(token, &utext, &utext_cap) < 0) {
+		fts_filter_normalizer_icu_error(&np->error, "Conversion to UChar failed");
+		return NULL;
+	}
+	/*
+	   TODO: Some problems here. How much longer can the result
+	   be, than the source? Can it be calculated? Preflighted?
+	*/
+	utext_limit = u_strlen(utext);
+	utrans_transUChars(np->transliterator, utext, &utext_len,
+			   utext_cap, 0, &utext_limit, &err);
+
+	/* Data did not fit into utext. A result that fills the buffer
+	   exactly also counts: it leaves no room for the terminator that
+	   make_utf8() relies on when it calls u_strlen(). */
+	if (utext_len >= utext_cap || err == U_BUFFER_OVERFLOW_ERROR) {
+
+		/* This is a crude retry fix... Make a new utext of the
+		   size utrans_transUChars indicated */
+		utext_len++; /* room for '\0' bytes(2) */
+		utext_cap = utext_len;
+		if (make_uchar(token, &utext, &utext_cap) < 0) {
+			fts_filter_normalizer_icu_error(&np->error, "Conversion to UChar failed");
+			return NULL;
+		}
+		/* make_uchar() adds one unit for the terminator on top of
+		   the requested amount, so the capacity it reports may
+		   exceed the request but must never be smaller. */
+		i_assert(utext_cap >= utext_len);
+		utext_limit = u_strlen(utext);
+		utext_len = -1;
+		err = U_ZERO_ERROR;
+		utrans_transUChars(np->transliterator, utext,
+				   &utext_len, utext_cap, 0,
+				   &utext_limit, &err);
+	}
+
+	if (U_FAILURE(err)) {
+		icu_error(&np->error, err, "utrans_transUChars()");
+		return NULL;
+	}
+
+	if (make_utf8(utext, &normalized, &np->error) < 0)
+		return NULL;
+
+	return normalized;
+}
+
+#else
+
More information about the dovecot-cvs
mailing list