dovecot-2.2: fts: Change tokenizer API to be able to return errors

dovecot at dovecot.org dovecot at dovecot.org
Sat May 9 08:32:19 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/7fe766887394
changeset: 18551:7fe766887394
user:      Teemu Huovila <teemu.huovila at dovecot.fi>
date:      Sat May 09 11:05:04 2015 +0300
description:
fts: Change tokenizer API to be able to return errors

Modify fts_tokenizer_next() to return an integer status code: 1 if a
token was returned in *token_r, 0 if more input is needed, and -1 on
error.

diffstat:

 src/lib-fts/fts-tokenizer-address.c |  36 +++++++------
 src/lib-fts/fts-tokenizer-generic.c |  48 +++++++++---------
 src/lib-fts/fts-tokenizer-private.h |   5 +-
 src/lib-fts/fts-tokenizer.c         |  49 +++++++++---------
 src/lib-fts/fts-tokenizer.h         |  19 +++++--
 src/lib-fts/test-fts-tokenizer.c    |  94 ++++++++++++++++++++----------------
 src/plugins/fts/fts-build-mail.c    |   5 +-
 src/plugins/fts/fts-search-args.c   |  12 ++--
 8 files changed, 144 insertions(+), 124 deletions(-)

diffs (truncated from 682 to 300 lines):

diff -r cebe8be92034 -r 7fe766887394 src/lib-fts/fts-tokenizer-address.c
--- a/src/lib-fts/fts-tokenizer-address.c	Sat May 09 11:03:21 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-address.c	Sat May 09 11:05:04 2015 +0300
@@ -122,25 +122,27 @@
 	i_free(tok);
 }
 
-static const char *
-fts_tokenizer_address_current_token(struct email_address_fts_tokenizer *tok)
+static int
+fts_tokenizer_address_current_token(struct email_address_fts_tokenizer *tok,
+                                    const char **token_r)
 {
 	tok->tokenizer.skip_parents = TRUE;
 	tok->state = EMAIL_ADDRESS_PARSER_STATE_NONE;
-	return t_strdup(str_c(tok->last_word));
+	*token_r = t_strdup(str_c(tok->last_word));
+	return 1;
 }
 
-static const char *
-fts_tokenizer_address_parent_data(struct email_address_fts_tokenizer *tok)
+static int
+fts_tokenizer_address_parent_data(struct email_address_fts_tokenizer *tok,
+                                  const char **token_r)
 {
-	const char *ret;
 	/* TODO: search option removes address from data here. */
 	if (tok->search && tok->state >= EMAIL_ADDRESS_PARSER_STATE_DOMAIN)
 		i_debug("Would remove current token");
 
-	ret = t_strdup(str_c(tok->parent_data));
+	*token_r = t_strdup(str_c(tok->parent_data));
 	str_truncate(tok->parent_data, 0);
-	return ret;
+	return 1;
 }
 
 /* Used to rewind past characters that can not be the start of a new localpart.
@@ -256,10 +258,10 @@
 	if (!tok->no_parent)
 		str_append_n(tok->parent_data, data, size);
 }
-static const char *
+static int
 fts_tokenizer_email_address_next(struct fts_tokenizer *_tok,
-				 const unsigned char *data, size_t size,
-				 size_t *skip_r)
+                                 const unsigned char *data, size_t size,
+                                 size_t *skip_r, const char **token_r)
 {
 	struct email_address_fts_tokenizer *tok =
 		(struct email_address_fts_tokenizer *)_tok;
@@ -270,18 +272,18 @@
 
 	if (tok->state == EMAIL_ADDRESS_PARSER_STATE_COMPLETE) {
 		*skip_r = pos;
-		return fts_tokenizer_address_current_token(tok);
+		return fts_tokenizer_address_current_token(tok, token_r);
 	}
 
 	/* end of data, output lingering tokens. first the parents data, then
 	   possibly our token, if complete enough */
 	if (size == 0) {
 		if (!tok->no_parent && str_len(tok->parent_data) > 0)
-		    return fts_tokenizer_address_parent_data(tok);
+			return fts_tokenizer_address_parent_data(tok, token_r);
 
 		if (tok->state == EMAIL_ADDRESS_PARSER_STATE_DOMAIN
 		    && chars_after_at(tok) > 0)
-			return fts_tokenizer_address_current_token(tok);
+			return fts_tokenizer_address_current_token(tok, token_r);
 	}
 
 	/* 1) regular input data OR
@@ -332,9 +334,9 @@
 			fts_tokenizer_address_update_parent(tok, data+pos,
 			                                    local_skip);
 			if (!tok->no_parent)
-				return fts_tokenizer_address_parent_data(tok);
+				return fts_tokenizer_address_parent_data(tok, token_r);
 			else {
-				return fts_tokenizer_address_current_token(tok);
+				return fts_tokenizer_address_current_token(tok, token_r);
 			}
 		default:
 			i_unreached();
@@ -342,7 +344,7 @@
 
 	}
 	*skip_r = pos;
-	return NULL;
+	return 0;
 }
 
 static const struct fts_tokenizer_vfuncs email_address_tokenizer_vfuncs = {
diff -r cebe8be92034 -r 7fe766887394 src/lib-fts/fts-tokenizer-generic.c
--- a/src/lib-fts/fts-tokenizer-generic.c	Sat May 09 11:03:21 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-generic.c	Sat May 09 11:05:04 2015 +0300
@@ -82,14 +82,13 @@
 	i_free(tok);
 }
 
-static const char *
-fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok)
+static int
+fts_tokenizer_generic_simple_current_token(struct generic_fts_tokenizer *tok,
+                                           const char **token_r)
 {
-	const char *ret;
-
-	ret = t_strndup(tok->token->data, tok->token->used);
+	*token_r = t_strndup(tok->token->data, tok->token->used);
 	buffer_set_used_size(tok->token, 0);
-	return ret;
+	return 1;
 }
 
 /* TODO: This is duplicated from unichar.c */
@@ -135,10 +134,10 @@
 	return is_word_break(c);
 }
 
-static const char *
+static int
 fts_tokenizer_generic_next_simple(struct fts_tokenizer *_tok,
-			   const unsigned char *data, size_t size,
-			   size_t *skip_r)
+                                  const unsigned char *data, size_t size,
+                                  size_t *skip_r, const char **token_r)
 {
 	struct generic_fts_tokenizer *tok =
 		(struct generic_fts_tokenizer *)_tok;
@@ -157,7 +156,7 @@
 			}
 			/* word boundary found - return a new token */
 			*skip_r = i + 1;
-			return fts_tokenizer_generic_simple_current_token(tok);
+			return fts_tokenizer_generic_simple_current_token(tok, token_r);
 		}
 	}
 	/* word boundary not found yet */
@@ -168,9 +167,9 @@
 
 	if (size == 0 && tok->token->used > 0) {
 		/* return the last token */
-		return fts_tokenizer_generic_simple_current_token(tok);
+		return fts_tokenizer_generic_simple_current_token(tok, token_r);
 	}
-	return NULL;
+	return 0;
 }
 
 /* TODO: Arrange array searches roughly in order of likelyhood of a match.
@@ -464,20 +463,20 @@
 
 	return FALSE;
 }
-static const char *
-fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok)
+static int
+fts_tokenizer_generic_tr29_current_token(struct generic_fts_tokenizer *tok,
+                                         const char **token_r)
 {
-	const char *ret;
 	size_t end_skip = 0;
 
 	if (is_one_past_end(tok))
 		end_skip = tok->last_size;
 
-	ret = t_strndup(tok->token->data, tok->token->used - end_skip);
+	*token_r = t_strndup(tok->token->data, tok->token->used - end_skip);
 	buffer_set_used_size(tok->token, 0);
 	tok->prev_prev_letter = LETTER_TYPE_NONE;
 	tok->prev_letter = LETTER_TYPE_NONE;
-	return ret;
+	return 1;
 }
 /*
   Find word boundaries in input text. Based on Unicode standard annex
@@ -516,10 +515,10 @@
 	return FALSE;
 }
 
-static const char *
+static int
 fts_tokenizer_generic_next_tr29(struct fts_tokenizer *_tok,
 			   const unsigned char *data, size_t size,
-			   size_t *skip_r)
+                                size_t *skip_r, const char **token_r)
 {
 	struct generic_fts_tokenizer *tok =
 		(struct generic_fts_tokenizer *)_tok;
@@ -547,7 +546,7 @@
 			buffer_append(tok->token, data + start_skip,
 			              len - start_skip);
 			*skip_r = i + 1;
-			return fts_tokenizer_generic_tr29_current_token(tok);
+			return fts_tokenizer_generic_tr29_current_token(tok, token_r);
 		}
 	}
 	len =  I_MIN(i, tok->max_length);
@@ -558,16 +557,17 @@
 	if (size == 0 && tok->token->used > 0) {
 		/* return the last token */
 		*skip_r = 0;
-		return fts_tokenizer_generic_tr29_current_token(tok);
+		return fts_tokenizer_generic_tr29_current_token(tok, token_r);
 	}
-	return NULL;
+	return 0;
 }
 
-static const char *
+static int
 fts_tokenizer_generic_next(struct fts_tokenizer *_tok ATTR_UNUSED,
 			   const unsigned char *data ATTR_UNUSED,
                            size_t size ATTR_UNUSED,
-			   size_t *skip_r ATTR_UNUSED)
+                           size_t *skip_r ATTR_UNUSED,
+                           const char **token_r ATTR_UNUSED)
 {
 	i_unreached();
 }
diff -r cebe8be92034 -r 7fe766887394 src/lib-fts/fts-tokenizer-private.h
--- a/src/lib-fts/fts-tokenizer-private.h	Sat May 09 11:03:21 2015 +0300
+++ b/src/lib-fts/fts-tokenizer-private.h	Sat May 09 11:05:04 2015 +0300
@@ -10,9 +10,8 @@
 		      struct fts_tokenizer **tokenizer_r, const char **error_r);
 	void (*destroy)(struct fts_tokenizer *tok);
 
-	const char *(*next)(struct fts_tokenizer *tok,
-			    const unsigned char *data, size_t size,
-			    size_t *skip_r);
+	int (*next)(struct fts_tokenizer *tok, const unsigned char *data,
+	            size_t size, size_t *skip_r, const char **token_r);
 };
 
 enum fts_tokenizer_parent_state {
diff -r cebe8be92034 -r 7fe766887394 src/lib-fts/fts-tokenizer.c
--- a/src/lib-fts/fts-tokenizer.c	Sat May 09 11:03:21 2015 +0300
+++ b/src/lib-fts/fts-tokenizer.c	Sat May 09 11:05:04 2015 +0300
@@ -120,11 +120,12 @@
 	tok->v->destroy(tok);
 }
 
-static const char *
+static int
 fts_tokenizer_next_self(struct fts_tokenizer *tok,
-			const unsigned char *data, size_t size)
+                        const unsigned char *data, size_t size,
+                        const char **token_r)
 {
-	const char *token;
+	int ret = 0;
 	size_t skip = 0;
 
 	i_assert(tok->prev_reply_finished ||
@@ -132,60 +133,60 @@
 
 	if (tok->prev_reply_finished) {
 		/* whole new data */
-		token = tok->v->next(tok, data, size, &skip);
+		ret = tok->v->next(tok, data, size, &skip, token_r);
 	} else {
 		/* continuing previous data */
 		i_assert(tok->prev_skip <= size);
-		token = tok->v->next(tok, data + tok->prev_skip,
-				     size - tok->prev_skip, &skip);
+		ret = tok->v->next(tok, data + tok->prev_skip,
+		                   size - tok->prev_skip, &skip, token_r);
 	}
 
-	if (token != NULL) {
+	if (ret > 0) {
 		i_assert(skip <= size - tok->prev_skip);
 		tok->prev_data = data;
 		tok->prev_size = size;
 		tok->prev_skip = tok->prev_skip + skip;
 		tok->prev_reply_finished = FALSE;
-	} else {
+	} else if (ret == 0) {
 		/* we need a new data block */
 		tok->prev_data = NULL;
 		tok->prev_size = 0;
 		tok->prev_skip = 0;
 		tok->prev_reply_finished = TRUE;
 	}
-	return token;
+	return ret;
 }
 
-const char *
+int
 fts_tokenizer_next(struct fts_tokenizer *tok,
-		   const unsigned char *data, size_t size)
+                   const unsigned char *data, size_t size, const char **token_r)
 {
-	const char *token;
+	int ret;


More information about the dovecot-cvs mailing list