dovecot-2.2: fts: Create tokenizers differently
dovecot at dovecot.org
dovecot at dovecot.org
Sat May 9 08:32:07 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/ae0458c63761
changeset: 18549:ae0458c63761
user: Teemu Huovila <teemu.huovila at dovecot.fi>
date: Sat May 09 11:02:22 2015 +0300
description:
fts: Create tokenizers differently
Create tokenizers earlier. Create separate tokenizers for search
and indexing. Enable configuration of tokenizers. Add some helpers
in fts-tokenizer.h api. Change tokenizer unit tests to match
those changes.
lib-fts: Refactor lib-fts settings a bit
Turned address tokenizer settings into "boolean" values. Changed
have_parent to "no_parent" and added "search" setting. Added
documentation in fts-tokenizer.h. Change unit tests accordingly.
diffstat:
src/lib-fts/fts-tokenizer-address.c | 35 ++++++-----
src/lib-fts/fts-tokenizer.c | 19 ++++++
src/lib-fts/fts-tokenizer.h | 32 ++++++++--
src/lib-fts/test-fts-tokenizer.c | 22 +++----
src/plugins/fts/fts-api-private.h | 2 -
src/plugins/fts/fts-build-mail.c | 6 +-
src/plugins/fts/fts-plugin.c | 3 +
src/plugins/fts/fts-search-args.c | 7 +-
src/plugins/fts/fts-storage.c | 14 ----
src/plugins/fts/fts-user.c | 106 ++++++++++++++++++++++++++++++++++-
src/plugins/fts/fts-user.h | 3 +-
11 files changed, 188 insertions(+), 61 deletions(-)
diffs (truncated from 599 to 300 lines):
diff -r abbd71252175 -r ae0458c63761 src/lib-fts/fts-tokenizer-address.c
--- a/src/lib-fts/fts-tokenizer-address.c Sat May 09 10:53:25 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-address.c Sat May 09 11:02:22 2015 +0300
@@ -5,8 +5,8 @@
#include "buffer.h"
#include "fts-tokenizer-private.h"
-/* Return not only our tokens, but also data for parent to process.*/
-#define FTS_DEFAULT_HAVE_PARENT 1
+#define FTS_DEFAULT_NO_PARENT FALSE
+#define FTS_DEFAULT_SEARCH FALSE
enum email_address_parser_state {
EMAIL_ADDRESS_PARSER_STATE_NONE = 0,
@@ -21,8 +21,8 @@
string_t *last_word;
string_t *parent_data; /* Copy of input data between tokens.
TODO: could be buffer_t maybe */
- unsigned int have_parent; /* Setting for stand-alone usage.
- Might be superfluous. */
+ bool no_parent;
+ bool search;
};
/*
@@ -85,18 +85,17 @@
const char **error_r)
{
struct email_address_fts_tokenizer *tok;
- unsigned int have_parent = FTS_DEFAULT_HAVE_PARENT;
+ bool no_parent = FTS_DEFAULT_NO_PARENT;
+ bool search = FTS_DEFAULT_SEARCH;
unsigned int i;
for (i = 0; settings[i] != NULL; i += 2) {
- const char *key = settings[i], *value = settings[i+1];
+ const char *key = settings[i];
- if (strcmp(key, "have_parent") == 0) {
- if (str_to_uint(value, &have_parent) < 0 ) {
- *error_r = t_strdup_printf(
- "Invalid parent setting: %s", value);
- return -1;
- }
+ if (strcmp(key, "no_parent") == 0) {
+ no_parent = TRUE;
+ }else if (strcmp(key, "search") == 0) {
+ search = TRUE;
} else {
*error_r = t_strdup_printf("Unknown setting: %s", key);
return -1;
@@ -107,7 +106,8 @@
tok->tokenizer = *fts_tokenizer_email_address;
tok->last_word = str_new(default_pool, 128);
tok->parent_data = str_new(default_pool, 128);
- tok->have_parent = have_parent;
+ tok->no_parent = no_parent;
+ tok->search = search;
*tokenizer_r = &tok->tokenizer;
return 0;
}
@@ -134,6 +134,9 @@
fts_tokenizer_address_parent_data(struct email_address_fts_tokenizer *tok)
{
const char *ret;
+ /* TODO: search option removes address from data here. */
+ if (tok->search && tok->state >= EMAIL_ADDRESS_PARSER_STATE_DOMAIN)
+ i_debug("Would remove current token");
ret = t_strdup(str_c(tok->parent_data));
str_truncate(tok->parent_data, 0);
@@ -250,7 +253,7 @@
fts_tokenizer_address_update_parent(struct email_address_fts_tokenizer *tok,
const unsigned char *data, size_t size)
{
- if (tok->have_parent > 0)
+ if (!tok->no_parent)
str_append_n(tok->parent_data, data, size);
}
static const char *
@@ -273,7 +276,7 @@
/* end of data, output lingering tokens. first the parents data, then
possibly our token, if complete enough */
if (size == 0) {
- if (tok->have_parent > 0 && str_len(tok->parent_data) > 0)
+ if (!tok->no_parent && str_len(tok->parent_data) > 0)
return fts_tokenizer_address_parent_data(tok);
if (tok->state == EMAIL_ADDRESS_PARSER_STATE_DOMAIN
@@ -328,7 +331,7 @@
*skip_r = pos + local_skip;
fts_tokenizer_address_update_parent(tok, data+pos,
local_skip);
- if (tok->have_parent > 0)
+ if (!tok->no_parent)
return fts_tokenizer_address_parent_data(tok);
else {
return fts_tokenizer_address_current_token(tok);
diff -r abbd71252175 -r ae0458c63761 src/lib-fts/fts-tokenizer.c
--- a/src/lib-fts/fts-tokenizer.c Sat May 09 10:53:25 2015 +0300
+++ b/src/lib-fts/fts-tokenizer.c Sat May 09 11:02:22 2015 +0300
@@ -10,6 +10,20 @@
ARRAY(struct fts_tokenizer) fts_tokenizer_classes;
+void fts_tokenizers_init(void)
+{
+ if (!array_is_created(&fts_tokenizer_classes)) {
+ fts_tokenizer_register(fts_tokenizer_generic);
+ fts_tokenizer_register(fts_tokenizer_email_address);
+ }
+}
+
+void fts_tokenizers_deinit(void)
+{
+ if (array_is_created(&fts_tokenizer_classes))
+ array_free(&fts_tokenizer_classes);
+}
+
/* private */
void fts_tokenizer_register(const struct fts_tokenizer *tok_class)
{
@@ -47,6 +61,11 @@
return NULL;
}
+const char *fts_tokenizer_name(const struct fts_tokenizer *tok)
+{
+ return tok->name;
+}
+
int fts_tokenizer_create(const struct fts_tokenizer *tok_class,
struct fts_tokenizer *parent,
const char *const *settings,
diff -r abbd71252175 -r ae0458c63761 src/lib-fts/fts-tokenizer.h
--- a/src/lib-fts/fts-tokenizer.h Sat May 09 10:53:25 2015 +0300
+++ b/src/lib-fts/fts-tokenizer.h Sat May 09 11:02:22 2015 +0300
@@ -3,7 +3,9 @@
/*
Settings are given in the form of a const char * const *settings =
- {"key, "value", "key2", "value2", NULL} array of string pairs.
+ {"key", "value", "key2", "value2", NULL} array of string pairs. Some
+ keys, like "no_parent" and "search", are a sort of boolean and the
+ value does not matter, just mentioning the key enables the functionality.
The array has to be NULL terminated.
*/
/* Email address header tokenizer that returns "user at domain.org" input as
@@ -13,15 +15,21 @@
allows doing an explicit "user at domain" search, which returns only mails
matching that exact address (instead of e.g. a mail with both user at domain2
and user2 at domain words). */
-/* Settings: "have_parent", Return not only our tokens, but also data
- for parent to process. Defaults to 1. Should normally not need to
- be changed. */
+/* Settings:
+ "no_parent", Return only our tokens, no data for parent to process.
+ Defaults to disabled. Should normally not be needed.
+
+ "search" Remove addresses from parent data stream, so they are not processed
+ further. Defaults to disabled. Enable by defining the keyword (and any
+ value). */
extern const struct fts_tokenizer *fts_tokenizer_email_address;
#define FTS_TOKENIZER_EMAIL_ADDRESS_NAME "email-address"
/* Generic email content tokenizer. Cuts text into tokens. */
-/* Settings: "maxlen" Maximum length of token, before an arbitary cut
- off is made. Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH.
+/* Settings:
+ "maxlen" Maximum length of token, before an arbitrary cut off is made.
+ Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH.
+
"algorithm", accepted values are "simple" or "tr29". Defines the
method for looking for word boundaries. Simple is faster and will
work for many texts, especially those using latin alphabets, but
@@ -35,9 +43,18 @@
extern const struct fts_tokenizer *fts_tokenizer_generic;
#define FTS_TOKENIZER_GENERIC_NAME "generic"
+/*
+ Tokenizing workflow, find --> create --> filter --> destroy.
+ Do init before first use and deinit after all done.
+ */
+
+/* Register all built-in tokenizers. */
+void fts_tokenizers_init(void);
+void fts_tokenizers_deinit(void);
+
const struct fts_tokenizer *fts_tokenizer_find(const char *name);
-/* Create a new tokenizer. The settings is an array of key,value pairs. */
+/* Create a new tokenizer. The settings are described above. */
int fts_tokenizer_create(const struct fts_tokenizer *tok_class,
struct fts_tokenizer *parent,
const char *const *settings,
@@ -57,4 +74,5 @@
fts_tokenizer_next(struct fts_tokenizer *tok,
const unsigned char *data, size_t size);
+const char *fts_tokenizer_name(const struct fts_tokenizer *tok);
#endif
diff -r abbd71252175 -r ae0458c63761 src/lib-fts/test-fts-tokenizer.c
--- a/src/lib-fts/test-fts-tokenizer.c Sat May 09 10:53:25 2015 +0300
+++ b/src/lib-fts/test-fts-tokenizer.c Sat May 09 11:02:22 2015 +0300
@@ -25,7 +25,7 @@
const char *token, *error;
test_begin("fts tokenizer generic simple");
- fts_tokenizer_register(fts_tokenizer_generic);
+ fts_tokenizers_init();
tok_class = fts_tokenizer_find(FTS_TOKENIZER_GENERIC_NAME);
test_assert(fts_tokenizer_create(tok_class, NULL, NULL, &tok, &error) == 0);
while ((token = fts_tokenizer_next(tok, input, sizeof(input)-1)) != NULL) {
@@ -38,7 +38,7 @@
}
test_assert(*eopp == NULL);
fts_tokenizer_unref(&tok);
- fts_tokenizer_unregister(fts_tokenizer_generic);
+ fts_tokenizers_deinit();
test_end();
}
@@ -267,7 +267,7 @@
"abc at example.com", "bar at example.org",
"foo at domain", "foo at domain", "bar at example.org", NULL
};
- const char *const settings[] = {"have_parent", "0", NULL};
+ const char *const settings[] = {"no_parent", "foo", NULL};
struct fts_tokenizer *tok;
const char * const *eopp = expected_output;
const char *token, *error;
@@ -305,7 +305,7 @@
"abc at example.com", "bar at example.org",
"foo at domain", NULL
};
- const char *const settings[] = {"have_parent", "0", NULL};
+ const char *const settings[] = {"no_parent", "0", NULL};
struct fts_tokenizer *tok;
const char * const *eopp = expected_output;
const char *token, *error;
@@ -346,7 +346,7 @@
struct fts_tokenizer *tok;
const char * const *eopp = expected_output;
const char *token, *error;
- const char *const settings[] = {"have_parent", "0", NULL};
+ const char *const settings[] = {"no_parent", "abc", NULL};
unsigned int i, step, step_max = 10;
test_begin("fts tokenizer email address, input random length");
@@ -390,8 +390,7 @@
unsigned int i;
test_begin("fts tokenizer email address + parent, input one character at a time");
- fts_tokenizer_register(fts_tokenizer_generic);
- fts_tokenizer_register(fts_tokenizer_email_address);
+ fts_tokenizers_init();
test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
@@ -411,8 +410,7 @@
test_assert(*eopp == NULL);
fts_tokenizer_unref(&tok);
fts_tokenizer_unref(&gen_tok);
- fts_tokenizer_unregister(fts_tokenizer_generic);
- fts_tokenizer_unregister(fts_tokenizer_email_address);
+ fts_tokenizers_deinit();
test_end();
}
@@ -437,8 +435,7 @@
unsigned int i;
test_begin("fts tokenizer email address + parent, input one line at a time");
- fts_tokenizer_register(fts_tokenizer_generic);
- fts_tokenizer_register(fts_tokenizer_email_address);
+ fts_tokenizers_init();
test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, NULL, &gen_tok, &error) == 0);
test_assert(fts_tokenizer_create(fts_tokenizer_email_address, gen_tok, NULL, &tok, &error) == 0);
@@ -457,8 +454,7 @@
test_assert(*eopp == NULL);
fts_tokenizer_unref(&tok);
fts_tokenizer_unref(&gen_tok);
- fts_tokenizer_unregister(fts_tokenizer_generic);
- fts_tokenizer_unregister(fts_tokenizer_email_address);
+ fts_tokenizers_deinit();
test_end();
}
diff -r abbd71252175 -r ae0458c63761 src/plugins/fts/fts-api-private.h
--- a/src/plugins/fts/fts-api-private.h Sat May 09 10:53:25 2015 +0300
+++ b/src/plugins/fts/fts-api-private.h Sat May 09 11:02:22 2015 +0300
@@ -76,8 +76,6 @@
struct fts_backend_vfuncs v;
struct mail_namespace *ns;
- struct fts_tokenizer *tokenizer;
-
unsigned int updating:1;
};
More information about the dovecot-cvs
mailing list