dovecot-2.2: fts-lucene: Support normalize setting also without snowball. Added no_snowball setting.
dovecot at dovecot.org
dovecot at dovecot.org
Sun Jun 9 03:10:55 EEST 2013
details: http://hg.dovecot.org/dovecot-2.2/rev/7e54af474ea4
changeset: 16485:7e54af474ea4
user: Timo Sirainen <tss at iki.fi>
date: Sun Jun 09 03:10:43 2013 +0300
description:
fts-lucene: Support the normalize setting also without snowball. Added a no_snowball setting.
Snowball stemming seems to convert and break words down rather annoyingly.
diffstat:
src/plugins/fts-lucene/fts-lucene-plugin.c | 9 ++---
src/plugins/fts-lucene/fts-lucene-plugin.h | 1 +
src/plugins/fts-lucene/lucene-wrapper.cc | 43 ++++++++++++++++++++++++-----
3 files changed, 40 insertions(+), 13 deletions(-)
diffs (126 lines):
diff -r 1f3f21081ee5 -r 7e54af474ea4 src/plugins/fts-lucene/fts-lucene-plugin.c
--- a/src/plugins/fts-lucene/fts-lucene-plugin.c Sun Jun 09 02:49:48 2013 +0300
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.c Sun Jun 09 03:10:43 2013 +0300
@@ -30,6 +30,8 @@
set->whitespace_chars = p_strdup(user->pool, *tmp + 17);
} else if (strcmp(*tmp, "normalize") == 0) {
set->normalize = TRUE;
+ } else if (strcmp(*tmp, "no_snowball") == 0) {
+ set->no_snowball = TRUE;
} else {
i_error("fts_lucene: Invalid setting: %s", *tmp);
return -1;
@@ -51,11 +53,6 @@
"but Dovecot built without stemmer support");
return -1;
}
- if (set->normalize) {
- i_error("fts_lucene: normalize not currently supported "
- "without stemmer support");
- return -1;
- }
#else
if (set->default_language == NULL)
set->default_language = "english";
@@ -80,6 +77,8 @@
crc = crc32_str_more(crc, set->whitespace_chars);
if (set->normalize)
crc = crc32_str_more(crc, "n");
+ if (set->no_snowball)
+ crc = crc32_str_more(crc, "s");
return crc;
}
diff -r 1f3f21081ee5 -r 7e54af474ea4 src/plugins/fts-lucene/fts-lucene-plugin.h
--- a/src/plugins/fts-lucene/fts-lucene-plugin.h Sun Jun 09 02:49:48 2013 +0300
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.h Sun Jun 09 03:10:43 2013 +0300
@@ -13,6 +13,7 @@
const char *textcat_conf, *textcat_dir;
const char *whitespace_chars;
bool normalize;
+ bool no_snowball;
};
struct fts_lucene_user {
diff -r 1f3f21081ee5 -r 7e54af474ea4 src/plugins/fts-lucene/lucene-wrapper.cc
--- a/src/plugins/fts-lucene/lucene-wrapper.cc Sun Jun 09 02:49:48 2013 +0300
+++ b/src/plugins/fts-lucene/lucene-wrapper.cc Sun Jun 09 03:10:43 2013 +0300
@@ -67,6 +67,7 @@
IndexWriter *writer;
IndexSearcher *searcher;
+ buffer_t *normalizer_buf;
Analyzer *default_analyzer, *cur_analyzer;
ARRAY(struct lucene_analyzer) analyzers;
@@ -118,13 +119,20 @@
index->set.default_language = "";
}
#ifdef HAVE_LUCENE_STEMMER
- index->default_analyzer =
- _CLNEW snowball::SnowballAnalyzer(index->normalizer,
- index->set.default_language);
-#else
- index->default_analyzer = _CLNEW standard::StandardAnalyzer();
- i_assert(index->normalizer == NULL);
+ if (!set->no_snowball) {
+ index->default_analyzer =
+ _CLNEW snowball::SnowballAnalyzer(index->normalizer,
+ index->set.default_language);
+ }
#endif
+ else {
+ index->default_analyzer = _CLNEW standard::StandardAnalyzer();
+ if (index->normalizer != NULL) {
+ index->normalizer_buf =
+ buffer_create_dynamic(default_pool, 1024);
+ }
+ }
+
i_array_init(&index->analyzers, 32);
textcat_refcount++;
@@ -155,6 +163,8 @@
textcat = NULL;
}
_CLDELETE(index->default_analyzer);
+ if (index->normalizer_buf != NULL)
+ buffer_free(&index->normalizer_buf);
i_free(index->path);
i_free(index);
}
@@ -517,6 +527,13 @@
index->doc->add(*_CLNEW Field(_T("box"), index->mailbox_guid, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
}
+ if (index->normalizer_buf != NULL) {
+ buffer_set_used_size(index->normalizer_buf, 0);
+ index->normalizer(data, size, index->normalizer_buf);
+ data = (const unsigned char *)index->normalizer_buf->data;
+ size = index->normalizer_buf->used;
+ }
+
datasize = uni_utf8_strlen_n(data, size) + 1;
wchar_t dest[datasize];
lucene_utf8_n_to_tchar(data, size, dest, datasize);
@@ -1055,8 +1072,18 @@
lucene_get_query_str(struct lucene_index *index,
const TCHAR *key, const char *str, bool fuzzy)
{
- const TCHAR *wvalue = t_lucene_utf8_to_tchar(index, str, TRUE);
- Analyzer *analyzer = guess_analyzer(index, str, strlen(str));
+ const TCHAR *wvalue;
+ Analyzer *analyzer;
+
+ if (index->normalizer_buf != NULL) {
+ buffer_set_used_size(index->normalizer_buf, 0);
+ index->normalizer(str, strlen(str), index->normalizer_buf);
+ buffer_append_c(index->normalizer_buf, '\0');
+ str = (const char *)index->normalizer_buf->data;
+ }
+
+ wvalue = t_lucene_utf8_to_tchar(index, str, TRUE);
+ analyzer = guess_analyzer(index, str, strlen(str));
if (analyzer == NULL)
analyzer = index->default_analyzer;
More information about the dovecot-cvs mailing list