dovecot-2.2: lib-fts: Replaced word-boundary/break-data.sh with ...
dovecot at dovecot.org
dovecot at dovecot.org
Mon May 11 19:39:17 UTC 2015
details: http://hg.dovecot.org/dovecot-2.2/rev/e80969ea8684
changeset: 18641:e80969ea8684
user: Timo Sirainen <tss at iki.fi>
date: Mon May 11 22:37:21 2015 +0300
description:
lib-fts: Replaced word-boundary/break-data.sh with more portable awk scripts
Patch by Michael Grimm.
diffstat:
src/lib-fts/Makefile.am | 12 ++--
src/lib-fts/word-boundary-data.awk | 103 +++++++++++++++++++++++++++++++++++++
src/lib-fts/word-boundary-data.sh | 99 -----------------------------------
src/lib-fts/word-break-data.awk | 102 ++++++++++++++++++++++++++++++++++++
src/lib-fts/word-break-data.sh | 77 ---------------------------
5 files changed, 211 insertions(+), 182 deletions(-)
diffs (truncated from 427 to 300 lines):
diff -r 3725c601dbaf -r e80969ea8684 src/lib-fts/Makefile.am
--- a/src/lib-fts/Makefile.am Mon May 11 21:55:42 2015 +0300
+++ b/src/lib-fts/Makefile.am Mon May 11 22:37:21 2015 +0300
@@ -22,20 +22,20 @@
udhr_fra.txt \
PropList.txt \
WordBreakProperty.txt \
- word-boundary-data.sh \
+ word-boundary-data.awk \
word-boundary-data.c \
- word-break-data.sh \
+ word-break-data.awk \
word-break-data.c
WordBreakProperty.txt:
test -f WordBreakProperty.txt || wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
-$(srcdir)/word-boundary-data.c: word-boundary-data.sh WordBreakProperty.txt
- $(srcdir)/word-boundary-data.sh < WordBreakProperty.txt > $@
+$(srcdir)/word-boundary-data.c: word-boundary-data.awk WordBreakProperty.txt
+ $(AWK) -f $(srcdir)/word-boundary-data.awk < WordBreakProperty.txt > $@
PropList.txt:
test -f PropList.txt || wget http://www.unicode.org/Public/UNIDATA/PropList.txt
-$(srcdir)/word-break-data.c: word-break-data.sh PropList.txt
- $(srcdir)/word-break-data.sh < PropList.txt > $@
+$(srcdir)/word-break-data.c: word-break-data.awk PropList.txt
+ $(AWK) -f $(srcdir)/word-break-data.awk < PropList.txt > $@
if BUILD_FTS_STEMMER
diff -r 3725c601dbaf -r e80969ea8684 src/lib-fts/word-boundary-data.awk
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/word-boundary-data.awk Mon May 11 22:37:21 2015 +0300
@@ -0,0 +1,103 @@
+#!/usr/bin/awk -f
+
+#
+# converts hexadecimal strings to numbers (emulates gawk's strtonum function)
+# adapted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
+#
+function mystrtonum(str) {
+ # adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
+ if (str ~ /^0[xX][[:xdigit:]]+$/) {
+ str = substr(str, 3) # lop off leading 0x
+ n = length(str)
+ ret = 0
+ for (i = 1; i <= n; i++) {
+ c = substr(str, i, 1)
+ c = tolower(c)
+ # index() returns 0 if c not in string,
+ # includes c == "0"
+ k = index("123456789abcdef", c)
+ ret = ret * 16 + k
+ }
+ } else {
+ ret = "NOT-A-HEX-NUMBER"
+ }
+ return ret
+}
+
+#
+# expand number ranges (from..to) to sequences of numbers (emulate seq function)
+#
+function add_hexrange (start, end) {
+ from = mystrtonum("0x"start)
+ to = mystrtonum("0x"end)
+ for ( i=from; i<=to; i++ )
+ temp[i] = i
+ result = temp[from]
+ for ( i=from+1; i<=to; i++ )
+ result = result " " temp[i]
+ return result
+}
+
+#
+# initialization stuff (define categories of interest in input file)
+#
+BEGIN {
+ FS = " "
+ ncategories = split("CR LF Newline Extend Regional_Indicator Format Katakana Hebrew_Letter ALetter \
+ Single_Quote Double_Quote MidNumLet MidLetter MidNum Numeric ExtendNumLet", array_names)
+}
+
+#
+# evaluate every line in input read from <stdin>
+#
+{
+ # skip comments and empty lines
+ if ( $0 !~ /^#/ && NF != 0 ) {
+ # cycle over array_names and do the math
+ for (category in array_names) {
+ # identify categories of interest (attention: relies on the leading '; ' and trailing ' #' anchors;
+ # a more precise regex might be preferable!)
+ if ( $0 ~ "; "array_names[category]" #" ) {
+ # distinguish between single numbers and number ranges (from..to)
+ if ( $1 ~ /\.\./ ) {
+ split($1, bounderies, "\.")
+ array[category] = array[category] " " add_hexrange(bounderies[1], bounderies[3])
+ } else {
+ array[category] = array[category] " " mystrtonum("0x"$1)
+ }
+ }
+ }
+ }
+}
+
+#
+# format output to <stdout>
+#
+END {
+ print "/* This file is automatically generated by word-boundary-data.awk from WordBreakProperty.txt */"
+ for (category=1; category<=ncategories; category++) {
+ n = split(array[category], integers)
+ print "static const uint32_t "array_names[category]"[]= {"
+ if (n == 1) {
+ # split puts '0' into integers if arraysize equals to 1, thus:
+ printf("\t0x%05X", array[category])
+ } else {
+ for ( i=1; i<=n; i++) {
+ if ( i == 1 ) {
+ printf("\t0x%05X, ", integers[i])
+ } else if ( (i-1)%8 == 0 ) {
+ if ( i != n ) {
+ printf("\n\t0x%05X, ", integers[i])
+ } else {
+ printf("\n\t0x%05X", integers[i])
+ }
+ } else if ( i != n ) {
+ printf("0x%05X, ", integers[i])
+ } else {
+ printf("0x%05X", integers[i])
+ }
+ }
+ }
+ print "\n};"
+ }
+}
diff -r 3725c601dbaf -r e80969ea8684 src/lib-fts/word-boundary-data.sh
--- a/src/lib-fts/word-boundary-data.sh Mon May 11 21:55:42 2015 +0300
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,99 +0,0 @@
-#!/bin/bash
-# TODO: Should perhaps be written in perl/python/awk
-# FIXME: The runtime is a bit long.
-
-#Array names match category names in data file.
-declare -a CR
-declare -a LF
-declare -a Newline
-declare -a Extend
-declare -a Regional_Indicator
-declare -a Format
-declare -a Katakana
-declare -a Hebrew_Letter
-declare -a ALetter
-declare -a Single_Quote
-declare -a Double_Quote
-declare -a MidNumLet
-declare -a MidLetter
-declare -a MidNum
-declare -a Numeric
-declare -a ExtendNumLet
-
-WIDTH=5
-
-add_hexrange () {
-
- array_name="$1"
- from="$2"
- to="$3"
-
- eval "$array_name+=($(seq -s ' ' "0x$from" "0x$to"))"
-}
-
-print_c_array () {
-
- array_name="$1"
- eval "array=("\${$array_name[@]}")"
- array_length=${#array[@]}
- i=1
-
- printf "static const uint32_t %s[]= {\n\t" "$array_name"
-
- for val in "${array[@]}" ; do
- printf "0x%0${WIDTH}X" "$val"
- if [ $i -lt $array_length ]; then
- echo -n ", "
- if [ $(($i%8)) -eq 0 ]; then
- echo -ne "\n\t"
- fi
- i=$((i+1))
- else
- break
- fi
- done
-
- echo -ne "\n};\n"
-}
-#read everything except comments.
-while read -s -a line; do
- [ -z "${line[0]}" ] && continue #ignore empty lines
-
- case "${line[0]}" in \#*) continue ;; esac #ignore comments
-
- value="${line[0]}"
- category="${line[2]}"
-
- case "$value" in
- *..*)
- start=`echo "$value" | cut -d . -f 1`
- end=`echo "$value" | cut -d . -f 3`
- add_hexrange "$category" "$start" "$end"
- ;;
- *)
- value=`printf "%05X" $((16#$value))`
- eval "$category+=(0x\$value)"
- ;;
- esac;
-
-done
-
-printf "/* This file is automatically generated by %s from WordBreakProperty.txt */\n" "$0"
-
-print_c_array CR
-print_c_array LF
-print_c_array Newline
-print_c_array Extend
-print_c_array Regional_Indicator
-print_c_array Format
-print_c_array Katakana
-print_c_array Hebrew_Letter
-print_c_array ALetter
-print_c_array Single_Quote
-print_c_array Double_Quote
-print_c_array MidNumLet
-print_c_array MidLetter
-print_c_array MidNum
-print_c_array Numeric
-print_c_array ExtendNumLet
-
diff -r 3725c601dbaf -r e80969ea8684 src/lib-fts/word-break-data.awk
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/word-break-data.awk Mon May 11 22:37:21 2015 +0300
@@ -0,0 +1,102 @@
+#!/usr/bin/awk -f
+
+#
+# converts hexadecimal strings to numbers (emulates gawk's strtonum function)
+# adapted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
+#
+function mystrtonum(str) {
+ # adopted from http://www.gnu.org/software/gawk/manual/html_node/Strtonum-Function.html#Strtonum-Function
+ if (str ~ /^0[xX][[:xdigit:]]+$/) {
+ str = substr(str, 3) # lop off leading 0x
+ n = length(str)
+ ret = 0
+ for (i = 1; i <= n; i++) {
+ c = substr(str, i, 1)
+ c = tolower(c)
+ # index() returns 0 if c not in string,
+ # includes c == "0"
+ k = index("123456789abcdef", c)
+ ret = ret * 16 + k
+ }
+ } else {
+ ret = "NOT-A-HEX-NUMBER"
+ }
+ return ret
+}
+
+#
+# expand number ranges (from..to) to sequences of numbers (emulate seq function)
+#
+function add_hexrange (start, end) {
+ from = mystrtonum("0x"start)
+ to = mystrtonum("0x"end)
+ for ( i=from; i<=to; i++ )
+ temp[i] = i
+ result = temp[from]
+ for ( i=from+1; i<=to; i++ )
+ result = result " " temp[i]
+ return result
+}
+
+#
+# initialization stuff (define categories of interest in input file)
+#
+BEGIN {
+ FS = " "
+ ncategories = split("White_Space Dash Terminal_Punctuation STerm Pattern_White_Space", array_names)
+}
+
+#
+# evaluate every line in input read from <stdin>
+#
+{
+ # skip comments and empty lines
+ if ( $0 !~ /^#/ && NF != 0 ) {
+ # cycle over array_names and do the math
+ for (category in array_names) {
More information about the dovecot-cvs
mailing list