dovecot-2.2: lib-fts: Various fixes and cleanups to stopwords fi...

dovecot at dovecot.org dovecot at dovecot.org
Sat May 9 09:30:52 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/adf9eb277499
changeset: 18565:adf9eb277499
user:      Timo Sirainen <tss at iki.fi>
date:      Sat May 09 12:01:42 2015 +0300
description:
lib-fts: Various fixes and cleanups to stopwords filter.
Most importantly words added to hash table needs to be allocated from the
pool and not data stack.

diffstat:

 src/lib-fts/fts-filter-stopwords.c |  82 ++++++++++++-------------------------
 1 files changed, 27 insertions(+), 55 deletions(-)

diffs (132 lines):

diff -r 92ee245b1406 -r adf9eb277499 src/lib-fts/fts-filter-stopwords.c
--- a/src/lib-fts/fts-filter-stopwords.c	Sat May 09 12:04:56 2015 +0300
+++ b/src/lib-fts/fts-filter-stopwords.c	Sat May 09 12:01:42 2015 +0300
@@ -11,9 +11,9 @@
 #include "fts-filter-private.h"
 
 #define STOPWORDS_FILE_FORMAT "%s/stopwords_%s.txt"
-/* TODO: Configure special characters */
-static const char stopwords_eol_comment = '|';
-static const char stopwords_comment = '#';
+
+#define STOPWORDS_COMMENT_CHAR1 '|'
+#define STOPWORDS_COMMENT_CHAR2 '#'
 
 struct fts_filter_stopwords {
 	struct fts_filter filter;
@@ -32,41 +32,33 @@
 	return TRUE;
 }
 
-static int fts_filter_stopwords_read_list(struct fts_filter_stopwords *filter)
+static int fts_filter_stopwords_read_list(struct fts_filter_stopwords *filter,
+					  const char **error_r)
 {
 	struct istream *input;
-	const char *line;
-	const char **words;
-	const char *list_path = NULL;
+	const char *line, **words, *path;
 	int ret = 0;
 
-	list_path = t_strdup_printf(STOPWORDS_FILE_FORMAT,
-	                            filter->stopwords_dir, filter->lang->name);
+	path = t_strdup_printf(STOPWORDS_FILE_FORMAT,
+			       filter->stopwords_dir, filter->lang->name);
 
-	input = i_stream_create_file(list_path, IO_BLOCK_SIZE);
-	while ((line =  i_stream_read_next_line(input)) != NULL) {
+	input = i_stream_create_file(path, IO_BLOCK_SIZE);
+	while ((line = i_stream_read_next_line(input)) != NULL) T_BEGIN {
+		line = t_strcut(line, STOPWORDS_COMMENT_CHAR1);
+		line = t_strcut(line, STOPWORDS_COMMENT_CHAR2);
 
-		if (uni_utf8_strlen(line) < 1)
-			continue;
-		if (strchr(line, stopwords_comment) != NULL)
-			continue; /* TODO: support eol hashed comments */
-		if (strchr(line, stopwords_eol_comment)!= NULL) {
-			line = t_strcut(line, stopwords_eol_comment);
-			if (line == NULL || strcmp(line, "") == 0)
-			    continue;
+		words = t_strsplit_spaces(line, " \t");
+		for (; *words != NULL; words++) {
+			const char *word = p_strdup(filter->pool, *words);
+			hash_table_insert(filter->stopwords, word, word);
 		}
-		words = t_strsplit_spaces(line, " \t");
-		while (*words != NULL) {
-			hash_table_insert(filter->stopwords, *words, *words);
-			words++;
-		}
+	} T_END;
+
+	if (input->stream_errno != 0) {
+		*error_r = t_strdup_printf("Failed to read stopword list %s: %s",
+					   path, i_stream_get_error(input));
+		ret = -1;
 	}
-	/*
-	   TODO: How to detect non-existing file?
-	   TODO: istream error handling and reporting (i_error()?).
-	 */
-	if (input->stream_errno != 0)
-		ret = -1;
 	i_stream_destroy(&input);
 	return ret;
 }
@@ -74,10 +66,10 @@
 static void fts_filter_stopwords_destroy(struct fts_filter *filter)
 {
 	struct fts_filter_stopwords *sp = (struct fts_filter_stopwords *)filter;
+
 	if (hash_table_is_created(sp->stopwords))
 		hash_table_destroy(&sp->stopwords);
 	pool_unref(&sp->pool);
-	return;
 }
 
 static int
@@ -117,38 +109,18 @@
 }
 
 static int
-fts_filter_stopwords_create_stopwords(struct fts_filter_stopwords *sp,
-				      const char **error_r)
-{
-	int ret;
-
-	hash_table_create(&sp->stopwords, sp->pool, 0, str_hash, strcmp);
-	ret = fts_filter_stopwords_read_list(sp);
-	if (ret < 0) {
-		*error_r = t_strdup_printf("Failed to read stopword list %s",
-					   sp->stopwords_dir);
-	}
-	return ret;
-}
-
-static int
 fts_filter_stopwords_filter(struct fts_filter *filter, const char **token,
 			    const char **error_r)
 {
-	const char *stopword;
 	struct fts_filter_stopwords *sp =
 		(struct fts_filter_stopwords *) filter;
 
-	if (!hash_table_is_created(sp->stopwords))
-		if (fts_filter_stopwords_create_stopwords(sp, error_r) < 0)
+	if (!hash_table_is_created(sp->stopwords)) {
+		hash_table_create(&sp->stopwords, sp->pool, 0, str_hash, strcmp);
+		if (fts_filter_stopwords_read_list(sp, error_r) < 0)
 			return -1;
-	stopword = hash_table_lookup(sp->stopwords, *token);
-	if (stopword != NULL) {
-		*token = NULL;
-		return 0;
 	}
-	else
-		return 1;
+	return hash_table_lookup(sp->stopwords, *token) == NULL ? 1 : 0;
 }
 
 const struct fts_filter_vfuncs stopwords_filter_vfuncs = {


More information about the dovecot-cvs mailing list