dovecot-2.2: fts-lucene: Added "normalize" option to put data through normalize().
dovecot at dovecot.org
dovecot at dovecot.org
Sat Sep 15 03:49:37 EEST 2012
details: http://hg.dovecot.org/dovecot-2.2/rev/07ac1dbcc033
changeset: 15054:07ac1dbcc033
user: Timo Sirainen <tss at iki.fi>
date: Sat Sep 15 03:49:23 2012 +0300
description:
fts-lucene: Added "normalize" option to put data through normalize().
diffstat:
src/plugins/fts-lucene/Snowball.cc | 30 +++++++++++++++++++++++-------
src/plugins/fts-lucene/SnowballAnalyzer.h | 7 ++++++-
src/plugins/fts-lucene/SnowballFilter.h | 3 ++-
src/plugins/fts-lucene/fts-lucene-plugin.c | 9 +++++++++
src/plugins/fts-lucene/fts-lucene-plugin.h | 1 +
src/plugins/fts-lucene/lucene-wrapper.cc | 12 ++++++++++--
6 files changed, 51 insertions(+), 11 deletions(-)
diffs (217 lines):
diff -r c976a9c01613 -r 07ac1dbcc033 src/plugins/fts-lucene/Snowball.cc
--- a/src/plugins/fts-lucene/Snowball.cc Sat Sep 15 03:12:20 2012 +0300
+++ b/src/plugins/fts-lucene/Snowball.cc Sat Sep 15 03:49:23 2012 +0300
@@ -26,8 +26,9 @@
CL_NS_DEF2(analysis,snowball)
/** Builds the named analyzer with no stop words. */
- SnowballAnalyzer::SnowballAnalyzer(const char* language) {
+ SnowballAnalyzer::SnowballAnalyzer(normalizer_func_t *normalizer, const char* language) {
this->language = strdup(language);
+ this->normalizer = normalizer;
stopSet = NULL;
prevstream = NULL;
}
@@ -67,7 +68,7 @@
result = _CLNEW CL_NS(analysis)::LowerCaseFilter(result, true);
if (stopSet != NULL)
result = _CLNEW CL_NS(analysis)::StopFilter(result, true, stopSet);
- result = _CLNEW SnowballFilter(result, language, true);
+ result = _CLNEW SnowballFilter(result, normalizer, language, true);
return result;
}
@@ -87,10 +88,11 @@
* @param in the input tokens to stem
* @param name the name of a stemmer
*/
- SnowballFilter::SnowballFilter(TokenStream* in, const char* language, bool deleteTS):
+ SnowballFilter::SnowballFilter(TokenStream* in, normalizer_func_t *normalizer, const char* language, bool deleteTS):
TokenFilter(in,deleteTS)
{
stemmer = sb_stemmer_new(language, NULL); //use utf8 encoding
+ this->normalizer = normalizer;
if ( stemmer == NULL ){
_CLTHROWA(CL_ERR_IllegalArgument, "language not available for stemming\n"); //todo: richer error
@@ -120,10 +122,24 @@
int stemmedLen=sb_stemmer_length(stemmer);
- unsigned int tchartext_size = uni_utf8_strlen_n(stemmed, stemmedLen) + 1;
- TCHAR tchartext[tchartext_size];
- lucene_utf8_n_to_tchar(stemmed,stemmedLen,tchartext,tchartext_size);
- token->set(tchartext,token->startOffset(), token->endOffset(), token->type());
+ if (normalizer == NULL) {
+ unsigned int tchartext_size =
+ uni_utf8_strlen_n(stemmed, stemmedLen) + 1;
+ TCHAR tchartext[tchartext_size];
+ lucene_utf8_n_to_tchar(stemmed, stemmedLen, tchartext, tchartext_size);
+ token->set(tchartext,token->startOffset(), token->endOffset(), token->type());
+ } else T_BEGIN {
+ buffer_t *norm_buf = buffer_create_dynamic(pool_datastack_create(),
+ stemmedLen);
+ normalizer(stemmed, stemmedLen, norm_buf);
+
+ unsigned int tchartext_size =
+ uni_utf8_strlen_n(norm_buf->data, norm_buf->used) + 1;
+ TCHAR tchartext[tchartext_size];
+ lucene_utf8_n_to_tchar((const unsigned char *)norm_buf->data,
+ norm_buf->used, tchartext, tchartext_size);
+ token->set(tchartext,token->startOffset(), token->endOffset(), token->type());
+ } T_END;
return token;
}
diff -r c976a9c01613 -r 07ac1dbcc033 src/plugins/fts-lucene/SnowballAnalyzer.h
--- a/src/plugins/fts-lucene/SnowballAnalyzer.h Sat Sep 15 03:12:20 2012 +0300
+++ b/src/plugins/fts-lucene/SnowballAnalyzer.h Sat Sep 15 03:49:23 2012 +0300
@@ -7,6 +7,10 @@
#ifndef _lucene_analysis_snowball_analyser_
#define _lucene_analysis_snowball_analyser_
+extern "C" {
+#include "lib.h"
+#include "unichar.h"
+};
#include "CLucene/analysis/AnalysisHeader.h"
CL_CLASS_DEF(util,BufferedReader)
@@ -21,12 +25,13 @@
*/
class CLUCENE_CONTRIBS_EXPORT SnowballAnalyzer: public Analyzer {
char* language;
+ normalizer_func_t *normalizer;
CLTCSetList* stopSet;
TokenStream *prevstream;
public:
/** Builds the named analyzer with no stop words. */
- SnowballAnalyzer(const char* language="english");
+ SnowballAnalyzer(normalizer_func_t *normalizer, const char* language="english");
/** Builds the named analyzer with the given stop words.
*/
diff -r c976a9c01613 -r 07ac1dbcc033 src/plugins/fts-lucene/SnowballFilter.h
--- a/src/plugins/fts-lucene/SnowballFilter.h Sat Sep 15 03:12:20 2012 +0300
+++ b/src/plugins/fts-lucene/SnowballFilter.h Sat Sep 15 03:49:23 2012 +0300
@@ -22,6 +22,7 @@
*/
class CLUCENE_CONTRIBS_EXPORT SnowballFilter: public TokenFilter {
struct sb_stemmer * stemmer;
+ normalizer_func_t *normalizer;
public:
/** Construct the named stemming filter.
@@ -29,7 +30,7 @@
* @param in the input tokens to stem
* @param name the name of a stemmer
*/
- SnowballFilter(TokenStream* in, const char* language, bool deleteTS);
+ SnowballFilter(TokenStream* in, normalizer_func_t *normalizer, const char* language, bool deleteTS);
~SnowballFilter();
diff -r c976a9c01613 -r 07ac1dbcc033 src/plugins/fts-lucene/fts-lucene-plugin.c
--- a/src/plugins/fts-lucene/fts-lucene-plugin.c Sat Sep 15 03:12:20 2012 +0300
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.c Sat Sep 15 03:49:23 2012 +0300
@@ -28,6 +28,8 @@
set->textcat_dir = p_strdup(user->pool, *tmp + 12);
} else if (strncmp(*tmp, "whitespace_chars=", 17) == 0) {
set->whitespace_chars = p_strdup(user->pool, *tmp + 17);
+ } else if (strcmp(*tmp, "normalize") == 0) {
+ set->normalize = TRUE;
} else {
i_error("fts_lucene: Invalid setting: %s", *tmp);
return -1;
@@ -49,6 +51,11 @@
"but Dovecot built without stemmer support");
return -1;
}
+ if (set->normalize) {
+ i_error("fts_lucene: normalize not currently supported "
+ "without stemmer support");
+ return -1;
+ }
#else
if (set->default_language == NULL)
set->default_language = "english";
@@ -71,6 +78,8 @@
crc = set->default_language == NULL ? 0 :
crc32_str(set->default_language);
crc = crc32_str_more(crc, set->whitespace_chars);
+ if (set->normalize)
+ crc = crc32_str_more(crc, "n");
return crc;
}
diff -r c976a9c01613 -r 07ac1dbcc033 src/plugins/fts-lucene/fts-lucene-plugin.h
--- a/src/plugins/fts-lucene/fts-lucene-plugin.h Sat Sep 15 03:12:20 2012 +0300
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.h Sat Sep 15 03:49:23 2012 +0300
@@ -12,6 +12,7 @@
const char *default_language;
const char *textcat_conf, *textcat_dir;
const char *whitespace_chars;
+ bool normalize;
};
struct fts_lucene_user {
diff -r c976a9c01613 -r 07ac1dbcc033 src/plugins/fts-lucene/lucene-wrapper.cc
--- a/src/plugins/fts-lucene/lucene-wrapper.cc Sat Sep 15 03:12:20 2012 +0300
+++ b/src/plugins/fts-lucene/lucene-wrapper.cc Sat Sep 15 03:49:23 2012 +0300
@@ -10,6 +10,7 @@
#include "mail-index.h"
#include "mail-search.h"
#include "mail-namespace.h"
+#include "mailbox-list-private.h"
#include "mail-storage.h"
#include "fts-expunge-log.h"
#include "fts-lucene-plugin.h"
@@ -58,6 +59,7 @@
char *path;
struct mailbox_list *list;
struct fts_lucene_settings set;
+ normalizer_func_t *normalizer;
wchar_t mailbox_guid[MAILBOX_GUID_HEX_LENGTH + 1];
@@ -107,6 +109,8 @@
index = i_new(struct lucene_index, 1);
index->path = i_strdup(path);
index->list = list;
+ index->normalizer = !set->normalize ? NULL :
+ list->ns->user->default_normalizer;
if (set != NULL)
index->set = *set;
else {
@@ -115,9 +119,11 @@
}
#ifdef HAVE_LUCENE_STEMMER
index->default_analyzer =
- _CLNEW snowball::SnowballAnalyzer(index->set.default_language);
+ _CLNEW snowball::SnowballAnalyzer(index->normalizer,
+ index->set.default_language);
#else
index->default_analyzer = _CLNEW standard::StandardAnalyzer();
+ i_assert(index->normalizer == NULL);
#endif
i_array_init(&index->analyzers, 32);
textcat_refcount++;
@@ -397,6 +403,7 @@
#ifdef HAVE_LUCENE_TEXTCAT
static Analyzer *get_analyzer(struct lucene_index *index, const char *lang)
{
+ normalizer_func_t *normalizer = index->normalizer;
const struct lucene_analyzer *a;
struct lucene_analyzer new_analyzer;
Analyzer *analyzer;
@@ -408,7 +415,8 @@
memset(&new_analyzer, 0, sizeof(new_analyzer));
new_analyzer.lang = i_strdup(lang);
- new_analyzer.analyzer = _CLNEW snowball::SnowballAnalyzer(lang);
+ new_analyzer.analyzer =
+ _CLNEW snowball::SnowballAnalyzer(normalizer, lang);
array_append_i(&index->analyzers.arr, &new_analyzer, 1);
return new_analyzer.analyzer;
}
More information about the dovecot-cvs mailing list