dovecot-2.2: lib-fts: Replaced word-boundary/break-data.sh with ...

Mon May 11 19:39:17 UTC 2015

details:   http://hg.dovecot.org/dovecot-2.2/rev/e80969ea8684
changeset: 18641:e80969ea8684
user:      Timo Sirainen <tss at iki.fi>
date:      Mon May 11 22:37:21 2015 +0300
description:
lib-fts: Replaced word-boundary/break-data.sh with more portable awk scripts
Patch by Michael Grimm.

diffstat:

 src/lib-fts/Makefile.am            |   12 ++--
 src/lib-fts/word-boundary-data.awk |  103 +++++++++++++++++++++++++++++++++++++
 src/lib-fts/word-boundary-data.sh  |   99 -----------------------------------
 src/lib-fts/word-break-data.awk    |  102 ++++++++++++++++++++++++++++++++++++
 src/lib-fts/word-break-data.sh     |   77 ---------------------------
 5 files changed, 211 insertions(+), 182 deletions(-)

diffs (truncated from 427 to 300 lines):

diff -r 3725c601dbaf -r e80969ea8684 src/lib-fts/Makefile.am

--- a/src/lib-fts/Makefile.am	Mon May 11 21:55:42 2015 +0300
+++ b/src/lib-fts/Makefile.am	Mon May 11 22:37:21 2015 +0300
@@ -22,20 +22,20 @@
 	udhr_fra.txt \
 	PropList.txt \
 	WordBreakProperty.txt \
-	word-boundary-data.sh \
+	word-boundary-data.awk \
 	word-boundary-data.c \
-	word-break-data.sh \
+	word-break-data.awk \
 	word-break-data.c
 
 WordBreakProperty.txt:
 	test -f WordBreakProperty.txt || wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
-$(srcdir)/word-boundary-data.c: word-boundary-data.sh WordBreakProperty.txt
-	$(srcdir)/word-boundary-data.sh < WordBreakProperty.txt > $@
+$(srcdir)/word-boundary-data.c: word-boundary-data.awk WordBreakProperty.txt # generated via a temp file so a failed awk run never leaves a truncated target
+	$(AWK) -f $(srcdir)/word-boundary-data.awk < WordBreakProperty.txt > $@.tmp && mv -f $@.tmp $@
 
 PropList.txt:
 	test -f PropList.txt || wget http://www.unicode.org/Public/UNIDATA/PropList.txt
-$(srcdir)/word-break-data.c: word-break-data.sh PropList.txt
-	$(srcdir)/word-break-data.sh < PropList.txt > $@
+$(srcdir)/word-break-data.c: word-break-data.awk PropList.txt # generated via a temp file so a failed awk run never leaves a truncated target
+	$(AWK) -f $(srcdir)/word-break-data.awk < PropList.txt > $@.tmp && mv -f $@.tmp $@
 
 
 if BUILD_FTS_STEMMER
diff -r 3725c601dbaf -r e80969ea8684 src/lib-fts/word-boundary-data.awk
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/word-boundary-data.awk	Mon May 11 22:37:21 2015 +0300
@@ -0,0 +1,103 @@
+#!/usr/bin/awk -f
+
+#
+# converts hex strings to numbers (portable stand-in for gawk's strtonum function)
+# adapted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
+#
+function mystrtonum(str,    n, ret, i, c, k) {
+	# Returns the value of the "0x"-prefixed hex string str, or the string "NOT-A-HEX-NUMBER".
+	# n, ret, i, c and k are locals (extra parameters), so same-named script variables survive.
+	if (str ~ /^0[xX][0-9A-Fa-f]+$/) {	# explicit range: not every awk knows [[:xdigit:]]
+		str = substr(str, 3) # lop off leading 0x
+		n = length(str)
+		ret = 0
+		for (i = 1; i <= n; i++) {
+			c = tolower(substr(str, i, 1))
+			# index() returns 0 if c is not in the string, which
+			# conveniently covers c == "0"
+			k = index("123456789abcdef", c)
+			ret = ret * 16 + k
+		}
+	} else {
+		ret = "NOT-A-HEX-NUMBER"
+	}
+	return ret
+}
+
+#
+# expand number ranges (from..to) to sequences of numbers (emulate seq function)
+#
+function add_hexrange (start, end,    from, to, i, result) {
+	# Returns the code points start..end (hex strings without "0x") as one
+	# space-separated string of numbers; from, to, i and result are locals.
+	from = mystrtonum("0x"start)
+	to   = mystrtonum("0x"end)
+	result = from
+	for ( i=from+1; i<=to; i++ )
+		result = result " " i
+	return result
+}
+
+#
+# initialization stuff (define categories of interest in input file)
+#
+BEGIN {
+	FS = " "	# fields are separated by runs of blanks
+	ncategories = split("CR LF Newline Extend Regional_Indicator Format Katakana Hebrew_Letter ALetter " \
+		"Single_Quote Double_Quote MidNumLet MidLetter MidNum Numeric ExtendNumLet", array_names)
+}
+
+#
+# evaluate every line in input read from <stdin>
+#
+{
+	# skip comments and empty lines; data lines are expected as "<code>[..<code>] ; <Category> # ..."
+	if ( $0 !~ /^#/ && NF != 0 ) {
+		# append the line's code point(s) to the list of each matching category
+		for (category in array_names) {
+			# the "; " and " #" anchors keep e.g. Extend from also matching ExtendNumLet
+			if ( $0 ~ "; "array_names[category]" #" ) {
+				# distinguish between single numbers and ranges (from..to); "[.][.]" is an
+				# unambiguous ERE for the separator ("\." is an undefined string escape)
+				if ( $1 ~ /\.\./ ) {
+					split($1, bounderies, "[.][.]")
+					array[category] = array[category] " " add_hexrange(bounderies[1], bounderies[2])
+				} else {
+					array[category] = array[category] " " mystrtonum("0x"$1)
+				}
+			}
+		}
+	}
+}
+
+#
+# format output to <stdout>
+#
+END {
+	# Emit one C array per category, in the order the names were given to
+	# split() in BEGIN, with eight zero-padded hex values per row.
+	print "/* This file is automatically generated by word-boundary-data.awk from WordBreakProperty.txt */"
+	for (category=1; category<=ncategories; category++) {
+		# array[category] is the space-separated list collected by the
+		# main rule; n is the number of values in it
+		n = split(array[category], integers)
+		print "static const uint32_t "array_names[category]"[]= {"
+		for ( i=1; i<=n; i++ ) {
+			# every row starts with a tab; rows after the first are
+			# preceded by the newline that ends the previous row
+			if ( i == 1 )
+				lead = "\t"
+			else if ( (i-1)%8 == 0 )
+				lead = "\n\t"
+			else
+				lead = ""
+			# all values but the last are followed by a separator
+			if ( i != n )
+				sep = ", "
+			else
+				sep = ""
+			printf("%s0x%05X%s", lead, integers[i], sep)
+		}
+		print "\n};"
+	}
+}
diff -r 3725c601dbaf -r e80969ea8684 src/lib-fts/word-boundary-data.sh
--- a/src/lib-fts/word-boundary-data.sh	Mon May 11 21:55:42 2015 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,99 +0,0 @@
-#!/bin/bash
-# TODO: Should perhaps be written in perl/python/awk
-# FIXME: The runtime is a bit long.
-
-#Array names match category names in data file.
-declare -a CR
-declare -a LF
-declare -a Newline
-declare -a Extend
-declare -a Regional_Indicator
-declare -a Format
-declare -a Katakana
-declare -a Hebrew_Letter
-declare -a ALetter
-declare -a Single_Quote
-declare -a Double_Quote
-declare -a MidNumLet
-declare -a MidLetter
-declare -a MidNum
-declare -a Numeric
-declare -a ExtendNumLet
-
-WIDTH=5
-
-add_hexrange () {
-
-    array_name="$1"
-    from="$2"
-    to="$3"
-
-    eval "$array_name+=($(seq -s ' ' "0x$from" "0x$to"))"
-}
-
-print_c_array () {
-
-    array_name="$1"
-    eval "array=("\${$array_name[@]}")"
-    array_length=${#array[@]}
-    i=1
-
-    printf "static const uint32_t %s[]= {\n\t" "$array_name"
-
-    for val in "${array[@]}" ; do
-        printf "0x%0${WIDTH}X" "$val"
-        if [  $i -lt $array_length ]; then
-            echo -n ", "
-            if [ $(($i%8)) -eq 0 ]; then
-                echo -ne "\n\t"
-            fi
-            i=$((i+1))
-        else
-            break
-        fi
-    done
-
-   echo -ne "\n};\n"
-}
-#read everything except comments.
-while read -s -a line; do
-    [ -z "${line[0]}" ] && continue #ignore empty lines
-
-     case "${line[0]}" in \#*) continue ;; esac #ignore comments
-
-    value="${line[0]}"
-    category="${line[2]}"
-
-    case "$value" in
-        *..*)
-            start=`echo "$value" | cut -d . -f 1`
-            end=`echo "$value" | cut -d . -f 3`
-            add_hexrange "$category" "$start" "$end"
-            ;;
-        *)
-            value=`printf "%05X" $((16#$value))`
-            eval "$category+=(0x\$value)"
-            ;;
-        esac;
-
-done
-
-printf "/* This file is automatically generated by %s from WordBreakProperty.txt */\n" "$0"
-
-print_c_array CR
-print_c_array LF
-print_c_array Newline
-print_c_array Extend
-print_c_array Regional_Indicator
-print_c_array Format
-print_c_array Katakana
-print_c_array Hebrew_Letter
-print_c_array ALetter
-print_c_array Single_Quote
-print_c_array Double_Quote
-print_c_array MidNumLet
-print_c_array MidLetter
-print_c_array MidNum
-print_c_array Numeric
-print_c_array ExtendNumLet
-
diff -r 3725c601dbaf -r e80969ea8684 src/lib-fts/word-break-data.awk
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/word-break-data.awk	Mon May 11 22:37:21 2015 +0300
@@ -0,0 +1,102 @@
+#!/usr/bin/awk -f
+
+#
+# converts hex strings to numbers (portable stand-in for gawk's strtonum function)
+# adapted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
+#
+function mystrtonum(str,    n, ret, i, c, k) {
+	# Returns the value of the "0x"-prefixed hex string str, or the string "NOT-A-HEX-NUMBER".
+	# n, ret, i, c and k are locals (extra parameters), so same-named script variables survive.
+	if (str ~ /^0[xX][0-9A-Fa-f]+$/) {	# explicit range: not every awk knows [[:xdigit:]]
+		str = substr(str, 3) # lop off leading 0x
+		n = length(str)
+		ret = 0
+		for (i = 1; i <= n; i++) {
+			c = tolower(substr(str, i, 1))
+			# index() returns 0 if c is not in the string, which
+			# conveniently covers c == "0"
+			k = index("123456789abcdef", c)
+			ret = ret * 16 + k
+		}
+	} else {
+		ret = "NOT-A-HEX-NUMBER"
+	}
+	return ret
+}
+
+#
+# expand number ranges (from..to) to sequences of numbers (emulate seq function)
+#
+function add_hexrange (start, end,    from, to, i, result) {
+	# Returns the code points start..end (hex strings without "0x") as one
+	# space-separated string of numbers; from, to, i and result are locals.
+	from = mystrtonum("0x"start)
+	to   = mystrtonum("0x"end)
+	result = from
+	for ( i=from+1; i<=to; i++ )
+		result = result " " i
+	return result
+}
+
+#
+# initialization stuff (define categories of interest in input file)
+#
+BEGIN {
+	FS = " "	# fields are separated by runs of blanks
+	ncategories = split("White_Space Dash Terminal_Punctuation STerm Pattern_White_Space", array_names)	# array_names[1..ncategories] holds the names; split() returns their count
+}
+
+#
+# evaluate every line in input read from <stdin>
+#
+{ 
+	# skip comments and empty lines
+	if ( $0 !~ /^#/ && NF != 0 ) {
+		# cycle over array_names and do the math
+		for (category in array_names) {