dovecot-2.2: lib-fts: Reverted e80969ea8684 which replaced .sh scripts with awk

Tue May 12 09:22:58 UTC 2015

details:   http://hg.dovecot.org/dovecot-2.2/rev/2cfb80f7785e
changeset: 18643:2cfb80f7785e
user:      Timo Sirainen <tss at iki.fi>
date:      Tue May 12 12:20:56 2015 +0300
description:
lib-fts: Reverted e80969ea8684 which replaced .sh scripts with awk
Bugs in older awk versions (used at least by Debian squeeze & wheezy) caused
awk to crash while processing the script.

diffstat:

 src/lib-fts/Makefile.am            |   12 ++--
 src/lib-fts/word-boundary-data.awk |  103 -------------------------------------
 src/lib-fts/word-boundary-data.sh  |   99 +++++++++++++++++++++++++++++++++++
 src/lib-fts/word-break-data.awk    |  102 ------------------------------------
 src/lib-fts/word-break-data.sh     |   77 +++++++++++++++++++++++++++
 5 files changed, 182 insertions(+), 211 deletions(-)

diffs (truncated from 427 to 300 lines):

diff -r 7d52d6595f5e -r 2cfb80f7785e src/lib-fts/Makefile.am

--- a/src/lib-fts/Makefile.am	Mon May 11 22:38:38 2015 +0300
+++ b/src/lib-fts/Makefile.am	Tue May 12 12:20:56 2015 +0300
@@ -22,20 +22,20 @@
 	udhr_fra.txt \
 	PropList.txt \
 	WordBreakProperty.txt \
-	word-boundary-data.awk \
+	word-boundary-data.sh \
 	word-boundary-data.c \
-	word-break-data.awk \
+	word-break-data.sh \
 	word-break-data.c
 
 WordBreakProperty.txt:
 	test -f WordBreakProperty.txt || wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
-$(srcdir)/word-boundary-data.c: word-boundary-data.awk WordBreakProperty.txt
-	$(AWK) -f $(srcdir)/word-boundary-data.awk < WordBreakProperty.txt > $@.tmp && mv $@.tmp $@
+$(srcdir)/word-boundary-data.c: word-boundary-data.sh WordBreakProperty.txt
+	$(srcdir)/word-boundary-data.sh < WordBreakProperty.txt > $@.tmp && mv $@.tmp $@
 
 PropList.txt:
 	test -f PropList.txt || wget http://www.unicode.org/Public/UNIDATA/PropList.txt
-$(srcdir)/word-break-data.c: word-break-data.awk PropList.txt
-	$(AWK) -f $(srcdir)/word-break-data.awk < PropList.txt > $@.tmp && mv $@.tmp $@
+$(srcdir)/word-break-data.c: word-break-data.sh PropList.txt
+	$(srcdir)/word-break-data.sh < PropList.txt > $@.tmp && mv $@.tmp $@
 
 
 if BUILD_FTS_STEMMER
diff -r 7d52d6595f5e -r 2cfb80f7785e src/lib-fts/word-boundary-data.awk
--- a/src/lib-fts/word-boundary-data.awk	Mon May 11 22:38:38 2015 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,103 +0,0 @@
-#!/usr/bin/awk -f
-
-#
-# converts strings to hex numbers (gawk's strtonum function)
-# adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
-#
-function mystrtonum(str) {
-	# adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
-	if (str ~ /^0[xX][[:xdigit:]]+$/) {
-		str = substr(str, 3) # lop off leading 0x
-		n = length(str)
-		ret = 0
-		for (i = 1; i <= n; i++) {
-			c = substr(str, i, 1)
-			c = tolower(c)
-			# index() returns 0 if c not in string,
-			# includes c == "0"
-			k = index("123456789abcdef", c)
-			ret = ret * 16 + k
-		}
-	} else {
-		ret = "NOT-A-HEX-NUMBER"
-	}
-	return ret
-}
-
-#
-# expand number ranges (from..to) to sequences of numbers (emulate seq function)
-#
-function add_hexrange (start, end) {
-	from = mystrtonum("0x"start)
-	to   = mystrtonum("0x"end)
-	for ( i=from; i<=to; i++ )
-		temp[i] = i
-	result = temp[from]
-	for ( i=from+1; i<=to; i++ )
-		result = result " " temp[i]
-	return result
-}
-
-#
-# initialization stuff (define categories of intrest in input file)
-#
-BEGIN {
-	FS = " "
-	ncategories = split("CR LF Newline Extend Regional_Indicator Format Katakana Hebrew_Letter ALetter \
-		Single_Quote Double_Quote MidNumLet MidLetter MidNum Numeric ExtendNumLet", array_names)
-}
-
-#
-# evaluate every line in input read from <stdin>
-#
-{ 
-	# skip comments and empty lines
-	if ( $0 !~ /^#/ && NF != 0 ) {
-		# cycle over array_names and do the math
-		for (category in array_names) {
-			# identify categories of interest (attention: relies on leading '; ' and trailing ' #' anchors,
-			# might be suited regex preferable!)
-			if ( $0 ~ "; "array_names[category]" #" ) {
-				# distinguish beetween single numbers and number ranges (from..to)
-				if ( $1 ~ /\.\./ ) {
-					split($1, bounderies, "\.")
-					array[category] = array[category] " " add_hexrange(bounderies[1], bounderies[3])
-				} else {
-					array[category] = array[category] " " mystrtonum("0x"$1)
-				}
-			}
-		}
-	}
-}
-
-#
-# format output to <stdout>
-#
-END {
-	print "/* This file is automatically generated by word-boundary-data.awk from WordBreakProperty.txt */"
-	for (category=1; category<=ncategories; category++) {
-		n = split(array[category], integers)
-		print "static const uint32_t "array_names[category]"[]= {"
-		if (n == 1) {
-			# split puts '0' into integers if arraysize equals to 1, thus:
-			printf("\t0x%05X", array[category])
-		} else {
-			for ( i=1; i<=n; i++) {
-				if ( i == 1 ) {
-					printf("\t0x%05X, ", integers[i])
-				} else if ( (i-1)%8 == 0 ) {
-					if ( i != n ) {
-						printf("\n\t0x%05X, ", integers[i])
-					} else {
-						printf("\n\t0x%05X", integers[i])
-					}
-				} else if ( i != n ) {
-					printf("0x%05X, ", integers[i])
-				} else {
-					printf("0x%05X", integers[i])
-				}
-			}
-		}
-		print "\n};"
-	}
-}
diff -r 7d52d6595f5e -r 2cfb80f7785e src/lib-fts/word-boundary-data.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/word-boundary-data.sh	Tue May 12 12:20:56 2015 +0300
@@ -0,0 +1,99 @@
+#!/bin/bash
+# TODO: Should perhaps be written in perl/python/awk
+# FIXME: The runtime is a bit long.
+
+#Array names match category names in data file.
+declare -a CR
+declare -a LF
+declare -a Newline
+declare -a Extend
+declare -a Regional_Indicator
+declare -a Format
+declare -a Katakana
+declare -a Hebrew_Letter
+declare -a ALetter
+declare -a Single_Quote
+declare -a Double_Quote
+declare -a MidNumLet
+declare -a MidLetter
+declare -a MidNum
+declare -a Numeric
+declare -a ExtendNumLet
+
+WIDTH=5
+
+add_hexrange () {
+
+    array_name="$1"
+    from="$2"
+    to="$3"
+
+    eval "$array_name+=($(seq -s ' ' "0x$from" "0x$to"))"
+}
+
+print_c_array () {
+
+    array_name="$1"
+    eval "array=("\${$array_name[@]}")"
+    array_length=${#array[@]}
+    i=1
+
+    printf "static const uint32_t %s[]= {\n\t" "$array_name"
+
+    for val in "${array[@]}" ; do
+        printf "0x%0${WIDTH}X" "$val"
+        if [  $i -lt $array_length ]; then
+            echo -n ", "
+            if [ $(($i%8)) -eq 0 ]; then
+                echo -ne "\n\t"
+            fi
+            i=$((i+1))
+        else
+            break
+        fi
+    done
+
+   echo -ne "\n};\n"
+}
+#read everything except comments.
+while read -s -a line; do
+    [ -z "${line[0]}" ] && continue #ignore empty lines
+
+     case "${line[0]}" in \#*) continue ;; esac #ignore comments
+
+    value="${line[0]}"
+    category="${line[2]}"
+
+    case "$value" in
+        *..*)
+            start=`echo "$value" | cut -d . -f 1`
+            end=`echo "$value" | cut -d . -f 3`
+            add_hexrange "$category" "$start" "$end"
+            ;;
+        *)
+            value=`printf "%05X" $((16#$value))`
+            eval "$category+=(0x\$value)"
+            ;;
+        esac;
+
+done
+
+printf "/* This file is automatically generated by %s from WordBreakProperty.txt */\n" "$0"
+
+print_c_array CR
+print_c_array LF
+print_c_array Newline
+print_c_array Extend
+print_c_array Regional_Indicator
+print_c_array Format
+print_c_array Katakana
+print_c_array Hebrew_Letter
+print_c_array ALetter
+print_c_array Single_Quote
+print_c_array Double_Quote
+print_c_array MidNumLet
+print_c_array MidLetter
+print_c_array MidNum
+print_c_array Numeric
+print_c_array ExtendNumLet
+
diff -r 7d52d6595f5e -r 2cfb80f7785e src/lib-fts/word-break-data.awk
--- a/src/lib-fts/word-break-data.awk	Mon May 11 22:38:38 2015 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,102 +0,0 @@
-#!/usr/bin/awk -f
-
-#
-# converts strings to hex numbers (gawk's strtonum function)
-# adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
-#
-function mystrtonum(str) {
-	# adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
-	if (str ~ /^0[xX][[:xdigit:]]+$/) {
-		str = substr(str, 3) # lop off leading 0x
-		n = length(str)
-		ret = 0
-		for (i = 1; i <= n; i++) {
-			c = substr(str, i, 1)
-			c = tolower(c)
-			# index() returns 0 if c not in string,
-			# includes c == "0"
-			k = index("123456789abcdef", c)
-			ret = ret * 16 + k
-		}
-	} else {
-		ret = "NOT-A-HEX-NUMBER"
-	}
-	return ret
-}
-
-#
-# expand number ranges (from..to) to sequences of numbers (emulate seq function)
-#
-function add_hexrange (start, end) {
-	from = mystrtonum("0x"start)
-	to   = mystrtonum("0x"end)
-	for ( i=from; i<=to; i++ )
-		temp[i] = i
-	result = temp[from]
-	for ( i=from+1; i<=to; i++ )
-		result = result " " temp[i]
-	return result
-}
-
-#
-# initialization stuff (define categories of intrest in input file)
-#
-BEGIN {
-	FS = " "
-	ncategories = split("White_Space Dash Terminal_Punctuation STerm Pattern_White_Space", array_names)
-}
-
-#
-# evaluate every line in input read from <stdin>
-#
-{ 
-	# skip comments and empty lines
-	if ( $0 !~ /^#/ && NF != 0 ) {
-		# cycle over array_names and do the math
-		for (category in array_names) {