dovecot-2.2: fts-lucene: Optionally use lib-fts instead of CLucene's own analyzers.
dovecot at dovecot.org
dovecot at dovecot.org
Tue Apr 21 18:31:15 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/1ba63410b902
changeset: 18445:1ba63410b902
user: Timo Sirainen <tss at iki.fi>
date: Tue Apr 21 21:29:42 2015 +0300
description:
fts-lucene: Optionally use lib-fts instead of CLucene's own analyzers.
Adding use_libfts to the fts_lucene setting (e.g. fts_lucene = use_libfts) enables this.
diffstat:
src/plugins/fts-lucene/fts-backend-lucene.c | 7 +++-
src/plugins/fts-lucene/fts-lucene-plugin.c | 5 ++
src/plugins/fts-lucene/fts-lucene-plugin.h | 1 +
src/plugins/fts-lucene/lucene-wrapper.cc | 53 ++++++++++++++++++++--------
4 files changed, 49 insertions(+), 17 deletions(-)
diffs (195 lines):
diff -r 3f24c8cab32d -r 1ba63410b902 src/plugins/fts-lucene/fts-backend-lucene.c
--- a/src/plugins/fts-lucene/fts-backend-lucene.c Tue Apr 21 21:28:41 2015 +0300
+++ b/src/plugins/fts-lucene/fts-backend-lucene.c Tue Apr 21 21:29:42 2015 +0300
@@ -161,8 +161,13 @@
*error_r = "Invalid fts_lucene settings";
return -1;
}
+ /* fts already checked that index exists */
- /* fts already checked that index exists */
+ if (fuser->set.use_libfts) {
+ /* change our flags so we get proper input */
+ _backend->flags &= ~FTS_BACKEND_FLAG_FUZZY_SEARCH;
+ _backend->flags |= FTS_BACKEND_FLAG_TOKENIZED_INPUT;
+ }
return 0;
}
diff -r 3f24c8cab32d -r 1ba63410b902 src/plugins/fts-lucene/fts-lucene-plugin.c
--- a/src/plugins/fts-lucene/fts-lucene-plugin.c Tue Apr 21 21:28:41 2015 +0300
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.c Tue Apr 21 21:29:42 2015 +0300
@@ -34,6 +34,8 @@
set->no_snowball = TRUE;
} else if (strcmp(*tmp, "mime_parts") == 0) {
set->mime_parts = TRUE;
+ } else if (strcmp(*tmp, "use_libfts") == 0) {
+ set->use_libfts = TRUE;
} else {
i_error("fts_lucene: Invalid setting: %s", *tmp);
return -1;
@@ -73,6 +75,9 @@
{
uint32_t crc;
+ if (set->use_libfts)
+ return crc32_str("l");
+
/* checksum is always different when compiling with/without stemmer */
crc = set->default_language == NULL ? 0 :
crc32_str(set->default_language);
diff -r 3f24c8cab32d -r 1ba63410b902 src/plugins/fts-lucene/fts-lucene-plugin.h
--- a/src/plugins/fts-lucene/fts-lucene-plugin.h Tue Apr 21 21:28:41 2015 +0300
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.h Tue Apr 21 21:29:42 2015 +0300
@@ -15,6 +15,7 @@
bool normalize;
bool no_snowball;
bool mime_parts;
+ bool use_libfts;
};
struct fts_lucene_user {
diff -r 3f24c8cab32d -r 1ba63410b902 src/plugins/fts-lucene/lucene-wrapper.cc
--- a/src/plugins/fts-lucene/lucene-wrapper.cc Tue Apr 21 21:28:41 2015 +0300
+++ b/src/plugins/fts-lucene/lucene-wrapper.cc Tue Apr 21 21:29:42 2015 +0300
@@ -77,6 +77,7 @@
Document *doc;
uint32_t prev_uid, prev_part_idx;
+ bool no_analyzer;
};
struct rescan_context {
@@ -124,6 +125,9 @@
/* this is valid only for doveadm dump, so it doesn't matter */
index->set.default_language = "";
}
+ if (index->set.use_libfts) {
+ index->default_analyzer = _CLNEW KeywordAnalyzer();
+ } else
#ifdef HAVE_FTS_STEMMER
if (set == NULL || !set->no_snowball) {
index->default_analyzer =
@@ -198,7 +202,7 @@
const char *whitespace_chars = index->set.whitespace_chars;
unsigned int i;
- if (*whitespace_chars == '\0')
+ if (*whitespace_chars == '\0' || index->set.use_libfts)
return;
for (i = 0; i < len; i++) {
@@ -224,8 +228,7 @@
}
static const wchar_t *
-t_lucene_utf8_to_tchar(struct lucene_index *index,
- const char *str, bool translate)
+t_lucene_utf8_to_tchar(struct lucene_index *index, const char *str)
{
ARRAY_TYPE(unichars) dest_arr;
const unichar_t *chars;
@@ -540,10 +543,13 @@
return 0;
try {
- index->writer->addDocument(index->doc,
- index->cur_analyzer != NULL ?
- index->cur_analyzer :
- index->default_analyzer);
+ CL_NS(analysis)::Analyzer *analyzer = NULL;
+
+ if (!index->set.use_libfts) {
+ analyzer = index->cur_analyzer != NULL ?
+ index->cur_analyzer : index->default_analyzer;
+ }
+ index->writer->addDocument(index->doc, analyzer);
} catch (CLuceneError &err) {
lucene_handle_error(index, err, "IndexWriter::addDocument()");
ret = -1;
@@ -578,7 +584,7 @@
index->doc->add(*_CLNEW Field(_T("box"), index->mailbox_guid, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
}
- if (index->normalizer_buf != NULL) {
+ if (index->normalizer_buf != NULL && !index->set.use_libfts) {
buffer_set_used_size(index->normalizer_buf, 0);
index->normalizer(data, size, index->normalizer_buf);
data = (const unsigned char *)index->normalizer_buf->data;
@@ -594,6 +600,8 @@
lucene_utf8_n_to_tchar(data, size, dest, datasize);
lucene_data_translate(index, dest, datasize-1);
+ int token_flag = index->set.use_libfts ?
+ Field::INDEX_UNTOKENIZED : Field::INDEX_TOKENIZED;
if (hdr_name != NULL) {
/* hdr_name should be ASCII, but don't break in case it isn't */
hdr_name = t_str_lcase(hdr_name);
@@ -601,15 +609,16 @@
wchar_t wname[namesize];
lucene_utf8_n_to_tchar((const unsigned char *)hdr_name,
strlen(hdr_name), wname, namesize);
- index->doc->add(*_CLNEW Field(_T("hdr"), wname, Field::STORE_NO | Field::INDEX_TOKENIZED));
- index->doc->add(*_CLNEW Field(_T("hdr"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
+ if (index->set.use_libfts)
+ index->doc->add(*_CLNEW Field(_T("hdr"), wname, Field::STORE_NO | token_flag));
+ index->doc->add(*_CLNEW Field(_T("hdr"), dest, Field::STORE_NO | token_flag));
if (fts_header_want_indexed(hdr_name))
- index->doc->add(*_CLNEW Field(wname, dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
+ index->doc->add(*_CLNEW Field(wname, dest, Field::STORE_NO | token_flag));
} else if (size > 0) {
- if (index->cur_analyzer == NULL)
+ if (index->cur_analyzer == NULL && !index->set.use_libfts)
index->cur_analyzer = guess_analyzer(index, data, size);
- index->doc->add(*_CLNEW Field(_T("body"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
+ index->doc->add(*_CLNEW Field(_T("body"), dest, Field::STORE_NO | token_flag));
}
i_free(dest_free);
return 0;
@@ -1130,6 +1139,18 @@
const TCHAR *wvalue;
Analyzer *analyzer;
+ if (index->set.use_libfts) {
+ const wchar_t *wstr = t_lucene_utf8_to_tchar(index, str);
+ Term* tm = _CLNEW Term(key, wstr);
+ Query* ret;
+ if (fuzzy)
+ ret = _CLNEW FuzzyQuery( tm );
+ else
+ ret = _CLNEW TermQuery( tm );
+ _CLDECDELETE(tm);
+ return ret;
+ }
+
if (index->normalizer_buf != NULL) {
buffer_set_used_size(index->normalizer_buf, 0);
index->normalizer(str, strlen(str), index->normalizer_buf);
@@ -1137,7 +1158,7 @@
str = (const char *)index->normalizer_buf->data;
}
- wvalue = t_lucene_utf8_to_tchar(index, str, TRUE);
+ wvalue = t_lucene_utf8_to_tchar(index, str);
analyzer = guess_analyzer(index, str, strlen(str));
if (analyzer == NULL)
analyzer = index->default_analyzer;
@@ -1194,7 +1215,7 @@
return false;
q = lucene_get_query(index,
- t_lucene_utf8_to_tchar(index, t_str_lcase(arg->hdr_field_name), FALSE),
+ t_lucene_utf8_to_tchar(index, t_str_lcase(arg->hdr_field_name)),
arg);
break;
default:
@@ -1236,7 +1257,7 @@
case SEARCH_HEADER:
case SEARCH_HEADER_ADDRESS:
case SEARCH_HEADER_COMPRESS_LWSP:
- if (*arg->value.str == '\0') {
+ if (*arg->value.str == '\0' && !index->set.use_libfts) {
/* checking potential existence of the header name */
q = lucene_get_query_str(index, _T("hdr"),
t_str_lcase(arg->hdr_field_name), FALSE);
More information about the dovecot-cvs
mailing list