dovecot-2.1: fts-lucene: Added whitespace_chars subsetting to ft...
dovecot at dovecot.org
dovecot at dovecot.org
Fri Nov 4 19:25:09 EET 2011
details: http://hg.dovecot.org/dovecot-2.1/rev/6d483a22134e
changeset: 13646:6d483a22134e
user: Timo Sirainen <tss at iki.fi>
date: Fri Nov 04 19:35:30 2011 +0200
description:
fts-lucene: Added whitespace_chars subsetting to fts_lucene.
A value of "@." could be useful so that user@domain.tld allows searching
user, domain and tld separately instead of requiring the whole string to
match.
diffstat:
src/plugins/fts-lucene/fts-lucene-plugin.c | 12 +++++++--
src/plugins/fts-lucene/fts-lucene-plugin.h | 1 +
src/plugins/fts-lucene/lucene-wrapper.cc | 35 ++++++++++++++++++++++++-----
3 files changed, 39 insertions(+), 9 deletions(-)
diffs (129 lines):
diff -r b6e5cf112b3e -r 6d483a22134e src/plugins/fts-lucene/fts-lucene-plugin.c
--- a/src/plugins/fts-lucene/fts-lucene-plugin.c Fri Nov 04 18:50:24 2011 +0200
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.c Fri Nov 04 19:35:30 2011 +0200
@@ -26,6 +26,8 @@
set->textcat_conf = p_strdup(user->pool, *tmp + 13);
} else if (strncmp(*tmp, "textcat_dir=", 12) == 0) {
set->textcat_dir = p_strdup(user->pool, *tmp + 12);
+ } else if (strncmp(*tmp, "whitespace_chars=", 17) == 0) {
+ set->whitespace_chars = p_strdup(user->pool, *tmp + 17);
} else {
i_error("fts_lucene: Invalid setting: %s", *tmp);
return -1;
@@ -39,6 +41,8 @@
i_error("fts_lucene: textcat_dir set, but textcat_conf unset");
return -1;
}
+ if (set->whitespace_chars == NULL)
+ set->whitespace_chars = "";
#ifndef HAVE_LUCENE_STEMMER
if (set->default_language != NULL) {
i_error("fts_lucene: default_language set, "
@@ -61,9 +65,11 @@
uint32_t fts_lucene_settings_checksum(const struct fts_lucene_settings *set)
{
- /* only the default language change matters */
- return set->default_language == NULL ? 0 :
- crc32_str(set->default_language);
+ uint32_t crc;
+
+ crc = crc32_str(set->default_language);
+ crc = crc32_str_more(crc, set->whitespace_chars);
+ return crc;
}
static void fts_lucene_mail_user_created(struct mail_user *user)
diff -r b6e5cf112b3e -r 6d483a22134e src/plugins/fts-lucene/fts-lucene-plugin.h
--- a/src/plugins/fts-lucene/fts-lucene-plugin.h Fri Nov 04 18:50:24 2011 +0200
+++ b/src/plugins/fts-lucene/fts-lucene-plugin.h Fri Nov 04 19:35:30 2011 +0200
@@ -11,6 +11,7 @@
struct fts_lucene_settings {
const char *default_language;
const char *textcat_conf, *textcat_dir;
+ const char *whitespace_chars;
};
struct fts_lucene_user {
diff -r b6e5cf112b3e -r 6d483a22134e src/plugins/fts-lucene/lucene-wrapper.cc
--- a/src/plugins/fts-lucene/lucene-wrapper.cc Fri Nov 04 18:50:24 2011 +0200
+++ b/src/plugins/fts-lucene/lucene-wrapper.cc Fri Nov 04 19:35:30 2011 +0200
@@ -143,6 +143,21 @@
i_free(index);
}
+static void lucene_data_translate(struct lucene_index *index,
+ wchar_t *data, unsigned int len)
+{
+ const char *whitespace_chars = index->set.whitespace_chars;
+ unsigned int i;
+
+ if (*whitespace_chars == '\0')
+ return;
+
+ for (i = 0; i < len; i++) {
+ if (strchr(whitespace_chars, data[i]) != NULL)
+ data[i] = ' ';
+ }
+}
+
void lucene_utf8_n_to_tchar(const unsigned char *src, size_t srcsize,
wchar_t *dest, size_t destsize)
{
@@ -159,10 +174,14 @@
dest[destsize-1] = 0;
}
-static const wchar_t *t_lucene_utf8_to_tchar(const char *str)
+static const wchar_t *
+t_lucene_utf8_to_tchar(struct lucene_index *index,
+ const char *str, bool translate)
{
ARRAY_TYPE(unichars) dest_arr;
- const unichar_t *ret;
+ const unichar_t *chars;
+ wchar_t *ret;
+ unsigned int len;
i_assert(sizeof(wchar_t) == sizeof(unichar_t));
@@ -170,8 +189,11 @@
if (uni_utf8_to_ucs4(str, &dest_arr) < 0)
i_unreached();
(void)array_append_space(&dest_arr);
- ret = array_idx(&dest_arr, 0);
- return (const wchar_t *)ret;
+
+ chars = array_get_modifiable(&dest_arr, &len);
+ ret = (wchar_t *)chars;
+ lucene_data_translate(index, ret, len - 1);
+ return ret;
}
void lucene_index_select_mailbox(struct lucene_index *index,
@@ -478,6 +500,7 @@
datasize = uni_utf8_strlen_n(data, size) + 1;
wchar_t dest[datasize];
lucene_utf8_n_to_tchar(data, size, dest, datasize);
+ lucene_data_translate(index, dest, datasize);
if (hdr_name != NULL) {
/* hdr_name should be ASCII, but don't break in case it isn't */
@@ -1010,7 +1033,7 @@
lucene_get_query_str(struct lucene_index *index,
const TCHAR *key, const char *str, bool fuzzy)
{
- const TCHAR *wvalue = t_lucene_utf8_to_tchar(str);
+ const TCHAR *wvalue = t_lucene_utf8_to_tchar(index, str, TRUE);
Analyzer *analyzer = guess_analyzer(index, str, strlen(str));
if (analyzer == NULL)
analyzer = index->default_analyzer;
@@ -1067,7 +1090,7 @@
}
q = lucene_get_query(index,
- t_lucene_utf8_to_tchar(arg->hdr_field_name),
+ t_lucene_utf8_to_tchar(index, arg->hdr_field_name, FALSE),
arg);
break;
default:
More information about the dovecot-cvs
mailing list