dovecot-2.2: lib-fts: Reverted e80969ea8684 which replaced .sh scripts with awk
dovecot at dovecot.org
dovecot at dovecot.org
Tue May 12 09:22:58 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/2cfb80f7785e
changeset: 18643:2cfb80f7785e
user: Timo Sirainen <tss at iki.fi>
date: Tue May 12 12:20:56 2015 +0300
description:
lib-fts: Reverted e80969ea8684 which replaced .sh scripts with awk
Bugs in older awk versions (used at least by Debian squeeze & wheezy) caused
awk to crash while processing the script.
diffstat:
src/lib-fts/Makefile.am | 12 ++--
src/lib-fts/word-boundary-data.awk | 103 -------------------------------------
src/lib-fts/word-boundary-data.sh | 99 +++++++++++++++++++++++++++++++++++
src/lib-fts/word-break-data.awk | 102 ------------------------------------
src/lib-fts/word-break-data.sh | 77 +++++++++++++++++++++++++++
5 files changed, 182 insertions(+), 211 deletions(-)
diffs (truncated from 427 to 300 lines):
diff -r 7d52d6595f5e -r 2cfb80f7785e src/lib-fts/Makefile.am
--- a/src/lib-fts/Makefile.am Mon May 11 22:38:38 2015 +0300
+++ b/src/lib-fts/Makefile.am Tue May 12 12:20:56 2015 +0300
@@ -22,20 +22,20 @@
udhr_fra.txt \
PropList.txt \
WordBreakProperty.txt \
- word-boundary-data.awk \
+ word-boundary-data.sh \
word-boundary-data.c \
- word-break-data.awk \
+ word-break-data.sh \
word-break-data.c
WordBreakProperty.txt:
test -f WordBreakProperty.txt || wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
-$(srcdir)/word-boundary-data.c: word-boundary-data.awk WordBreakProperty.txt
- $(AWK) -f $(srcdir)/word-boundary-data.awk < WordBreakProperty.txt > $@.tmp && mv $@.tmp $@
+$(srcdir)/word-boundary-data.c: word-boundary-data.sh WordBreakProperty.txt
+ $(srcdir)/word-boundary-data.sh < WordBreakProperty.txt > $@.tmp && mv $@.tmp $@
PropList.txt:
test -f PropList.txt || wget http://www.unicode.org/Public/UNIDATA/PropList.txt
-$(srcdir)/word-break-data.c: word-break-data.awk PropList.txt
- $(AWK) -f $(srcdir)/word-break-data.awk < PropList.txt > $@.tmp && mv $@.tmp $@
+$(srcdir)/word-break-data.c: word-break-data.sh PropList.txt
+ $(srcdir)/word-break-data.sh < PropList.txt > $@.tmp && mv $@.tmp $@
if BUILD_FTS_STEMMER
diff -r 7d52d6595f5e -r 2cfb80f7785e src/lib-fts/word-boundary-data.awk
--- a/src/lib-fts/word-boundary-data.awk Mon May 11 22:38:38 2015 +0300
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,103 +0,0 @@
-#!/usr/bin/awk -f
-
-#
-# converts strings to hex numbers (gawk's strtonum function)
-# adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
-#
-function mystrtonum(str) {
- # adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
- if (str ~ /^0[xX][[:xdigit:]]+$/) {
- str = substr(str, 3) # lop off leading 0x
- n = length(str)
- ret = 0
- for (i = 1; i <= n; i++) {
- c = substr(str, i, 1)
- c = tolower(c)
- # index() returns 0 if c not in string,
- # includes c == "0"
- k = index("123456789abcdef", c)
- ret = ret * 16 + k
- }
- } else {
- ret = "NOT-A-HEX-NUMBER"
- }
- return ret
-}
-
-#
-# expand number ranges (from..to) to sequences of numbers (emulate seq function)
-#
-function add_hexrange (start, end) {
- from = mystrtonum("0x"start)
- to = mystrtonum("0x"end)
- for ( i=from; i<=to; i++ )
- temp[i] = i
- result = temp[from]
- for ( i=from+1; i<=to; i++ )
- result = result " " temp[i]
- return result
-}
-
-#
-# initialization stuff (define categories of intrest in input file)
-#
-BEGIN {
- FS = " "
- ncategories = split("CR LF Newline Extend Regional_Indicator Format Katakana Hebrew_Letter ALetter \
- Single_Quote Double_Quote MidNumLet MidLetter MidNum Numeric ExtendNumLet", array_names)
-}
-
-#
-# evaluate every line in input read from <stdin>
-#
-{
- # skip comments and empty lines
- if ( $0 !~ /^#/ && NF != 0 ) {
- # cycle over array_names and do the math
- for (category in array_names) {
- # identify categories of interest (attention: relies on leading '; ' and trailing ' #' anchors,
- # might be suited regex preferable!)
- if ( $0 ~ "; "array_names[category]" #" ) {
- # distinguish beetween single numbers and number ranges (from..to)
- if ( $1 ~ /\.\./ ) {
- split($1, bounderies, "\.")
- array[category] = array[category] " " add_hexrange(bounderies[1], bounderies[3])
- } else {
- array[category] = array[category] " " mystrtonum("0x"$1)
- }
- }
- }
- }
-}
-
-#
-# format output to <stdout>
-#
-END {
- print "/* This file is automatically generated by word-boundary-data.awk from WordBreakProperty.txt */"
- for (category=1; category<=ncategories; category++) {
- n = split(array[category], integers)
- print "static const uint32_t "array_names[category]"[]= {"
- if (n == 1) {
- # split puts '0' into integers if arraysize equals to 1, thus:
- printf("\t0x%05X", array[category])
- } else {
- for ( i=1; i<=n; i++) {
- if ( i == 1 ) {
- printf("\t0x%05X, ", integers[i])
- } else if ( (i-1)%8 == 0 ) {
- if ( i != n ) {
- printf("\n\t0x%05X, ", integers[i])
- } else {
- printf("\n\t0x%05X", integers[i])
- }
- } else if ( i != n ) {
- printf("0x%05X, ", integers[i])
- } else {
- printf("0x%05X", integers[i])
- }
- }
- }
- print "\n};"
- }
-}
diff -r 7d52d6595f5e -r 2cfb80f7785e src/lib-fts/word-boundary-data.sh
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/word-boundary-data.sh Tue May 12 12:20:56 2015 +0300
@@ -0,0 +1,99 @@
+#!/bin/bash
+# TODO: Should perhaps be written in perl/python/awk
+# FIXME: The runtime is a bit long.
+
+#Array names match category names in data file.
+declare -a CR
+declare -a LF
+declare -a Newline
+declare -a Extend
+declare -a Regional_Indicator
+declare -a Format
+declare -a Katakana
+declare -a Hebrew_Letter
+declare -a ALetter
+declare -a Single_Quote
+declare -a Double_Quote
+declare -a MidNumLet
+declare -a MidLetter
+declare -a MidNum
+declare -a Numeric
+declare -a ExtendNumLet
+
+WIDTH=5
+
+add_hexrange () {
+
+ array_name="$1"
+ from="$2"
+ to="$3"
+
+ eval "$array_name+=($(seq -s ' ' "0x$from" "0x$to"))"
+}
+
+print_c_array () {
+
+ array_name="$1"
+ eval "array=("\${$array_name[@]}")"
+ array_length=${#array[@]}
+ i=1
+
+ printf "static const uint32_t %s[]= {\n\t" "$array_name"
+
+ for val in "${array[@]}" ; do
+ printf "0x%0${WIDTH}X" "$val"
+ if [ $i -lt $array_length ]; then
+ echo -n ", "
+ if [ $(($i%8)) -eq 0 ]; then
+ echo -ne "\n\t"
+ fi
+ i=$((i+1))
+ else
+ break
+ fi
+ done
+
+ echo -ne "\n};\n"
+}
+#read everything except comments.
+while read -s -a line; do
+ [ -z "${line[0]}" ] && continue #ignore empty lines
+
+ case "${line[0]}" in \#*) continue ;; esac #ignore comments
+
+ value="${line[0]}"
+ category="${line[2]}"
+
+ case "$value" in
+ *..*)
+ start=`echo "$value" | cut -d . -f 1`
+ end=`echo "$value" | cut -d . -f 3`
+ add_hexrange "$category" "$start" "$end"
+ ;;
+ *)
+ value=`printf "%05X" $((16#$value))`
+ eval "$category+=(0x\$value)"
+ ;;
+ esac;
+
+done
+
+printf "/* This file is automatically generated by %s from WordBreakProperty.txt */\n" "$0"
+
+print_c_array CR
+print_c_array LF
+print_c_array Newline
+print_c_array Extend
+print_c_array Regional_Indicator
+print_c_array Format
+print_c_array Katakana
+print_c_array Hebrew_Letter
+print_c_array ALetter
+print_c_array Single_Quote
+print_c_array Double_Quote
+print_c_array MidNumLet
+print_c_array MidLetter
+print_c_array MidNum
+print_c_array Numeric
+print_c_array ExtendNumLet
+
diff -r 7d52d6595f5e -r 2cfb80f7785e src/lib-fts/word-break-data.awk
--- a/src/lib-fts/word-break-data.awk Mon May 11 22:38:38 2015 +0300
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,102 +0,0 @@
-#!/usr/bin/awk -f
-
-#
-# converts strings to hex numbers (gawk's strtonum function)
-# adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
-#
-function mystrtonum(str) {
- # adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
- if (str ~ /^0[xX][[:xdigit:]]+$/) {
- str = substr(str, 3) # lop off leading 0x
- n = length(str)
- ret = 0
- for (i = 1; i <= n; i++) {
- c = substr(str, i, 1)
- c = tolower(c)
- # index() returns 0 if c not in string,
- # includes c == "0"
- k = index("123456789abcdef", c)
- ret = ret * 16 + k
- }
- } else {
- ret = "NOT-A-HEX-NUMBER"
- }
- return ret
-}
-
-#
-# expand number ranges (from..to) to sequences of numbers (emulate seq function)
-#
-function add_hexrange (start, end) {
- from = mystrtonum("0x"start)
- to = mystrtonum("0x"end)
- for ( i=from; i<=to; i++ )
- temp[i] = i
- result = temp[from]
- for ( i=from+1; i<=to; i++ )
- result = result " " temp[i]
- return result
-}
-
-#
-# initialization stuff (define categories of intrest in input file)
-#
-BEGIN {
- FS = " "
- ncategories = split("White_Space Dash Terminal_Punctuation STerm Pattern_White_Space", array_names)
-}
-
-#
-# evaluate every line in input read from <stdin>
-#
-{
- # skip comments and empty lines
- if ( $0 !~ /^#/ && NF != 0 ) {
- # cycle over array_names and do the math
- for (category in array_names) {
More information about the dovecot-cvs
mailing list