dovecot-2.1: fts: Added FTS_BACKEND_FLAG_BUILD_SHORT_UTF8 to require sending only short UTF8 data to backend.

dovecot at dovecot.org dovecot at dovecot.org
Tue Nov 27 03:49:46 EET 2012


details:   http://hg.dovecot.org/dovecot-2.1/rev/01550514f189
changeset: 14807:01550514f189
user:      Timo Sirainen <tss at iki.fi>
date:      Tue Nov 27 03:49:25 2012 +0200
description:
fts: Added FTS_BACKEND_FLAG_BUILD_SHORT_UTF8 to require sending only short UTF8 data to backend.

diffstat:

 src/plugins/fts/fts-api-private.h |   4 +++-
 src/plugins/fts/fts-build-mail.c  |   7 +++++--
 src/plugins/fts/fts-parser.c      |  24 ++++++++++++++++++------
 src/plugins/fts/fts-parser.h      |   3 ++-
 4 files changed, 28 insertions(+), 10 deletions(-)

diffs (116 lines):

diff -r 172295f5a78b -r 01550514f189 src/plugins/fts/fts-api-private.h
--- a/src/plugins/fts/fts-api-private.h	Tue Nov 27 03:48:15 2012 +0200
+++ b/src/plugins/fts/fts-api-private.h	Tue Nov 27 03:49:25 2012 +0200
@@ -59,7 +59,9 @@
 	/* Send only fully indexable words rather than randomly sized blocks */
 	FTS_BACKEND_FLAG_BUILD_FULL_WORDS	= 0x04,
 	/* Fuzzy search works */
-	FTS_BACKEND_FLAG_FUZZY_SEARCH		= 0x08
+	FTS_BACKEND_FLAG_FUZZY_SEARCH		= 0x08,
+	/* Don't allow 5-byte or 6-byte UTF8 sequences */
+	FTS_BACKEND_FLAG_BUILD_SHORT_UTF8	= 0x10
 };
 
 struct fts_backend {
diff -r 172295f5a78b -r 01550514f189 src/plugins/fts/fts-build-mail.c
--- a/src/plugins/fts/fts-build-mail.c	Tue Nov 27 03:48:15 2012 +0200
+++ b/src/plugins/fts/fts-build-mail.c	Tue Nov 27 03:49:25 2012 +0200
@@ -144,6 +144,7 @@
 	struct mail_storage *storage;
 	const char *content_type;
 	struct fts_backend_build_key key;
+	bool require_short_utf8;
 
 	i_assert(ctx->body_parser == NULL);
 
@@ -158,9 +159,11 @@
 		return FALSE;
 	}
 
-	
+	require_short_utf8 = (ctx->update_ctx->backend->flags &
+			      FTS_BACKEND_FLAG_BUILD_SHORT_UTF8) != 0;
+
 	storage = mailbox_get_storage(ctx->mail->box);
-	if (fts_parser_init(mail_storage_get_user(storage),
+	if (fts_parser_init(mail_storage_get_user(storage), require_short_utf8,
 			    content_type, ctx->content_disposition,
 			    &ctx->body_parser)) {
 		/* extract text using the the returned parser */
diff -r 172295f5a78b -r 01550514f189 src/plugins/fts/fts-parser.c
--- a/src/plugins/fts/fts-parser.c	Tue Nov 27 03:48:15 2012 +0200
+++ b/src/plugins/fts/fts-parser.c	Tue Nov 27 03:49:25 2012 +0200
@@ -11,7 +11,7 @@
 	&fts_parser_script
 };
 
-bool fts_parser_init(struct mail_user *user,
+bool fts_parser_init(struct mail_user *user, bool require_short_utf8,
 		     const char *content_type, const char *content_disposition,
 		     struct fts_parser **parser_r)
 {
@@ -20,8 +20,10 @@
 	for (i = 0; i < N_ELEMENTS(parsers); i++) {
 		*parser_r = parsers[i]->try_init(user, content_type,
 						 content_disposition);
-		if (*parser_r != NULL)
+		if (*parser_r != NULL) {
+			(*parser_r)->require_short_utf8 = require_short_utf8;
 			return TRUE;
+		}
 	}
 	return FALSE;
 }
@@ -56,11 +58,15 @@
 
 void fts_parser_more(struct fts_parser *parser, struct message_block *block)
 {
+	bool valid_utf8;
+
 	if (parser->v.more != NULL)
 		parser->v.more(parser, block);
 
-	if (!uni_utf8_data_is_valid(block->data, block->size) ||
-	    data_has_nuls(block->data, block->size)) {
+	valid_utf8 = parser->require_short_utf8 ?
+		uni_utf8_short_data_is_valid(block->data, block->size) :
+		uni_utf8_data_is_valid(block->data, block->size);
+	if (!valid_utf8 || data_has_nuls(block->data, block->size)) {
 		/* output isn't valid UTF-8. make it. */
 		if (parser->utf8_output == NULL) {
 			parser->utf8_output =
@@ -68,8 +74,14 @@
 		} else {
 			buffer_set_used_size(parser->utf8_output, 0);
 		}
-		(void)uni_utf8_get_valid_data(block->data, block->size,
-					      parser->utf8_output);
+		if (parser->require_short_utf8) {
+			(void)uni_utf8_short_get_valid_data(block->data,
+							    block->size,
+							    parser->utf8_output);
+		} else {
+			(void)uni_utf8_get_valid_data(block->data, block->size,
+						      parser->utf8_output);
+		}
 		replace_nul_bytes(parser->utf8_output);
 		block->data = parser->utf8_output->data;
 		block->size = parser->utf8_output->used;
diff -r 172295f5a78b -r 01550514f189 src/plugins/fts/fts-parser.h
--- a/src/plugins/fts/fts-parser.h	Tue Nov 27 03:48:15 2012 +0200
+++ b/src/plugins/fts/fts-parser.h	Tue Nov 27 03:49:25 2012 +0200
@@ -15,12 +15,13 @@
 struct fts_parser {
 	struct fts_parser_vfuncs v;
 	buffer_t *utf8_output;
+	bool require_short_utf8;
 };
 
 extern struct fts_parser_vfuncs fts_parser_html;
 extern struct fts_parser_vfuncs fts_parser_script;
 
-bool fts_parser_init(struct mail_user *user,
+bool fts_parser_init(struct mail_user *user, bool require_short_utf8,
 		     const char *content_type, const char *content_disposition,
 		     struct fts_parser **parser_r);
 struct fts_parser *fts_parser_text_init(void);


More information about the dovecot-cvs mailing list