dovecot-2.2: fts-lucene: Optionally use lib-fts instead of CLucene's own analyzers.

Tue Apr 21 18:31:15 UTC 2015

details:   http://hg.dovecot.org/dovecot-2.2/rev/1ba63410b902
changeset: 18445:1ba63410b902
user:      Timo Sirainen <tss at iki.fi>
date:      Tue Apr 21 21:29:42 2015 +0300
description:
fts-lucene: Optionally use lib-fts instead of CLucene's own analyzers.
fts_lucene = use_libfts enables this.

diffstat:

 src/plugins/fts-lucene/fts-backend-lucene.c |   7 +++-
 src/plugins/fts-lucene/fts-lucene-plugin.c  |   5 ++
 src/plugins/fts-lucene/fts-lucene-plugin.h  |   1 +
 src/plugins/fts-lucene/lucene-wrapper.cc    |  53 ++++++++++++++++++++--------
 4 files changed, 49 insertions(+), 17 deletions(-)

diffs (195 lines):

diff -r 3f24c8cab32d -r 1ba63410b902 src/plugins/fts-lucene/fts-backend-lucene.c

--- a/src/plugins/fts-lucene/fts-backend-lucene.c	Tue Apr 21 21:28:41 2015 +0300
+++ b/src/plugins/fts-lucene/fts-backend-lucene.c	Tue Apr 21 21:29:42 2015 +0300
@@ -161,8 +161,13 @@
 		*error_r = "Invalid fts_lucene settings";
 		return -1;
 	}
+	/* fts already checked that index exists */
 
-	/* fts already checked that index exists */
+	if (fuser->set.use_libfts) {
+		/* change our flags so we get proper input */
+		_backend->flags &= ~FTS_BACKEND_FLAG_FUZZY_SEARCH;
+		_backend->flags |= FTS_BACKEND_FLAG_TOKENIZED_INPUT;
+	}
 	return 0;
 }
 
diff -r 3f24c8cab32d -r 1ba63410b902 src/plugins/fts-lucene/fts-lucene-plugin.c
--- a/src/plugins/fts-lucene/fts-lucene-plugin.c	Tue Apr 21 21:28:41 2015 +0300
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.c	Tue Apr 21 21:29:42 2015 +0300
@@ -34,6 +34,8 @@
 			set->no_snowball = TRUE;
 		} else if (strcmp(*tmp, "mime_parts") == 0) {
 			set->mime_parts = TRUE;
+		} else if (strcmp(*tmp, "use_libfts") == 0) {
+			set->use_libfts = TRUE;
 		} else {
 			i_error("fts_lucene: Invalid setting: %s", *tmp);
 			return -1;
@@ -73,6 +75,9 @@
 {
 	uint32_t crc;
 
+	if (set->use_libfts)
+		return crc32_str("l");
+
 	/* checksum is always different when compiling with/without stemmer */
 	crc = set->default_language == NULL ? 0 :
 		crc32_str(set->default_language);
diff -r 3f24c8cab32d -r 1ba63410b902 src/plugins/fts-lucene/fts-lucene-plugin.h
--- a/src/plugins/fts-lucene/fts-lucene-plugin.h	Tue Apr 21 21:28:41 2015 +0300
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.h	Tue Apr 21 21:29:42 2015 +0300
@@ -15,6 +15,7 @@
 	bool normalize;
 	bool no_snowball;
 	bool mime_parts;
+	bool use_libfts;
 };
 
 struct fts_lucene_user {
diff -r 3f24c8cab32d -r 1ba63410b902 src/plugins/fts-lucene/lucene-wrapper.cc
--- a/src/plugins/fts-lucene/lucene-wrapper.cc	Tue Apr 21 21:28:41 2015 +0300
+++ b/src/plugins/fts-lucene/lucene-wrapper.cc	Tue Apr 21 21:29:42 2015 +0300
@@ -77,6 +77,7 @@
 
 	Document *doc;
 	uint32_t prev_uid, prev_part_idx;
+	bool no_analyzer;
 };
 
 struct rescan_context {
@@ -124,6 +125,9 @@
 		/* this is valid only for doveadm dump, so it doesn't matter */
 		index->set.default_language = "";
 	}
+	if (index->set.use_libfts) {
+		index->default_analyzer = _CLNEW KeywordAnalyzer();
+	} else
 #ifdef HAVE_FTS_STEMMER
 	if (set == NULL || !set->no_snowball) {
 		index->default_analyzer =
@@ -198,7 +202,7 @@
 	const char *whitespace_chars = index->set.whitespace_chars;
 	unsigned int i;
 
-	if (*whitespace_chars == '\0')
+	if (*whitespace_chars == '\0' || index->set.use_libfts)
 		return;
 
 	for (i = 0; i < len; i++) {
@@ -224,8 +228,7 @@
 }
 
 static const wchar_t *
-t_lucene_utf8_to_tchar(struct lucene_index *index,
-		       const char *str, bool translate)
+t_lucene_utf8_to_tchar(struct lucene_index *index, const char *str)
 {
 	ARRAY_TYPE(unichars) dest_arr;
 	const unichar_t *chars;
@@ -540,10 +543,13 @@
 		return 0;
 
 	try {
-		index->writer->addDocument(index->doc,
-					   index->cur_analyzer != NULL ?
-					   index->cur_analyzer :
-					   index->default_analyzer);
+		CL_NS(analysis)::Analyzer *analyzer = NULL;
+
+		if (!index->set.use_libfts) {
+			analyzer = index->cur_analyzer != NULL ?
+				index->cur_analyzer : index->default_analyzer;
+		}
+		index->writer->addDocument(index->doc, analyzer);
 	} catch (CLuceneError &err) {
 		lucene_handle_error(index, err, "IndexWriter::addDocument()");
 		ret = -1;
@@ -578,7 +584,7 @@
 		index->doc->add(*_CLNEW Field(_T("box"), index->mailbox_guid, Field::STORE_YES | Field::INDEX_UNTOKENIZED));
 	}
 
-	if (index->normalizer_buf != NULL) {
+	if (index->normalizer_buf != NULL && !index->set.use_libfts) {
 		buffer_set_used_size(index->normalizer_buf, 0);
 		index->normalizer(data, size, index->normalizer_buf);
 		data = (const unsigned char *)index->normalizer_buf->data;
@@ -594,6 +600,8 @@
 	lucene_utf8_n_to_tchar(data, size, dest, datasize);
 	lucene_data_translate(index, dest, datasize-1);
 
+	int token_flag = index->set.use_libfts ?
+		Field::INDEX_UNTOKENIZED : Field::INDEX_TOKENIZED;
 	if (hdr_name != NULL) {
 		/* hdr_name should be ASCII, but don't break in case it isn't */
 		hdr_name = t_str_lcase(hdr_name);
@@ -601,15 +609,16 @@
 		wchar_t wname[namesize];
 		lucene_utf8_n_to_tchar((const unsigned char *)hdr_name,
 				       strlen(hdr_name), wname, namesize);
-		index->doc->add(*_CLNEW Field(_T("hdr"), wname, Field::STORE_NO | Field::INDEX_TOKENIZED));
-		index->doc->add(*_CLNEW Field(_T("hdr"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
+		if (!index->set.use_libfts) /* header-name lookups in "hdr" are only done without lib-fts */
+			index->doc->add(*_CLNEW Field(_T("hdr"), wname, Field::STORE_NO | token_flag));
+		index->doc->add(*_CLNEW Field(_T("hdr"), dest, Field::STORE_NO | token_flag));
 
 		if (fts_header_want_indexed(hdr_name))
-			index->doc->add(*_CLNEW Field(wname, dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
+			index->doc->add(*_CLNEW Field(wname, dest, Field::STORE_NO | token_flag));
 	} else if (size > 0) {
-		if (index->cur_analyzer == NULL)
+		if (index->cur_analyzer == NULL && !index->set.use_libfts)
 			index->cur_analyzer = guess_analyzer(index, data, size);
-		index->doc->add(*_CLNEW Field(_T("body"), dest, Field::STORE_NO | Field::INDEX_TOKENIZED));
+		index->doc->add(*_CLNEW Field(_T("body"), dest, Field::STORE_NO | token_flag));
 	}
 	i_free(dest_free);
 	return 0;
@@ -1130,6 +1139,18 @@
 	const TCHAR *wvalue;
 	Analyzer *analyzer;
 
+	if (index->set.use_libfts) {
+		const wchar_t *wstr = t_lucene_utf8_to_tchar(index, str);
+		Term* tm = _CLNEW Term(key, wstr);
+		Query* ret;
+		if (fuzzy)
+			ret = _CLNEW FuzzyQuery( tm );
+		else
+			ret = _CLNEW TermQuery( tm );
+		_CLDECDELETE(tm);
+		return ret;
+	}
+
 	if (index->normalizer_buf != NULL) {
 		buffer_set_used_size(index->normalizer_buf, 0);
 		index->normalizer(str, strlen(str), index->normalizer_buf);
@@ -1137,7 +1158,7 @@
 		str = (const char *)index->normalizer_buf->data;
 	}
 
-	wvalue = t_lucene_utf8_to_tchar(index, str, TRUE);
+	wvalue = t_lucene_utf8_to_tchar(index, str);
 	analyzer = guess_analyzer(index, str, strlen(str));
 	if (analyzer == NULL)
 		analyzer = index->default_analyzer;
@@ -1194,7 +1215,7 @@
 			return false;
 
 		q = lucene_get_query(index,
-				     t_lucene_utf8_to_tchar(index, t_str_lcase(arg->hdr_field_name), FALSE),
+				     t_lucene_utf8_to_tchar(index, t_str_lcase(arg->hdr_field_name)),
 				     arg);
 		break;
 	default:
@@ -1236,7 +1257,7 @@
 	case SEARCH_HEADER:
 	case SEARCH_HEADER_ADDRESS:
 	case SEARCH_HEADER_COMPRESS_LWSP:
-		if (*arg->value.str == '\0') {
+		if (*arg->value.str == '\0' && !index->set.use_libfts) {
 			/* checking potential existence of the header name */
 			q = lucene_get_query_str(index, _T("hdr"),
 				t_str_lcase(arg->hdr_field_name), FALSE);