dovecot-2.2: lib-fts: autogenerate C arrays using perl

dovecot at dovecot.org dovecot at dovecot.org
Tue May 12 13:15:23 UTC 2015


details:   http://hg.dovecot.org/dovecot-2.2/rev/58d7234a6658
changeset: 18646:58d7234a6658
user:      Phil Carmody <phil at dovecot.fi>
date:      Tue May 12 16:12:29 2015 +0300
description:
lib-fts: autogenerate C arrays using perl

The sh script had bashisms, the awk script crashed mawk, so let's try perl...

Signed-off-by: Phil Carmody <phil at dovecot.fi>

diffstat:

 src/lib-fts/Makefile.am           |  11 +--
 src/lib-fts/word-boundary-data.sh |  99 ---------------------------------------
 src/lib-fts/word-break-data.sh    |  77 ------------------------------
 src/lib-fts/word-properties.pl    |  34 +++++++++++++
 4 files changed, 39 insertions(+), 182 deletions(-)

diffs (252 lines):

diff -r 0cbb125046a5 -r 58d7234a6658 src/lib-fts/Makefile.am
--- a/src/lib-fts/Makefile.am	Tue May 12 12:45:34 2015 +0300
+++ b/src/lib-fts/Makefile.am	Tue May 12 16:12:29 2015 +0300
@@ -21,21 +21,20 @@
 EXTRA_DIST = \
 	udhr_fra.txt \
 	PropList.txt \
+	word-properties.pl \
 	WordBreakProperty.txt \
-	word-boundary-data.sh \
 	word-boundary-data.c \
-	word-break-data.sh \
 	word-break-data.c
 
 WordBreakProperty.txt:
 	test -f WordBreakProperty.txt || wget http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
-$(srcdir)/word-boundary-data.c: word-boundary-data.sh WordBreakProperty.txt
-	bash $(srcdir)/word-boundary-data.sh < WordBreakProperty.txt > $@.tmp && mv $@.tmp $@
+$(srcdir)/word-boundary-data.c: word-properties.pl WordBreakProperty.txt
+	perl $(srcdir)/word-properties.pl boundaries WordBreakProperty.txt > $@.tmp && mv $@.tmp $@
 
 PropList.txt:
 	test -f PropList.txt || wget http://www.unicode.org/Public/UNIDATA/PropList.txt
-$(srcdir)/word-break-data.c: word-break-data.sh PropList.txt
-	bash $(srcdir)/word-break-data.sh < PropList.txt > $@.tmp && mv $@.tmp $@
+$(srcdir)/word-break-data.c: word-properties.pl PropList.txt
+	perl $(srcdir)/word-properties.pl breaks PropList.txt > $@.tmp && mv $@.tmp $@
 
 
 if BUILD_FTS_STEMMER
diff -r 0cbb125046a5 -r 58d7234a6658 src/lib-fts/word-boundary-data.sh
--- a/src/lib-fts/word-boundary-data.sh	Tue May 12 12:45:34 2015 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,99 +0,0 @@
-#!/bin/bash
-# TODO: Should perhaps be written in perl/python/awk
-# FIXME: The runtime is a bit long.
-
-#Array names match category names in data file.
-declare -a CR
-declare -a LF
-declare -a Newline
-declare -a Extend
-declare -a Regional_Indicator
-declare -a Format
-declare -a Katakana
-declare -a Hebrew_Letter
-declare -a ALetter
-declare -a Single_Quote
-declare -a Double_Quote
-declare -a MidNumLet
-declare -a MidLetter
-declare -a MidNum
-declare -a Numeric
-declare -a ExtendNumLet
-
-WIDTH=5
-
-add_hexrange () {
-
-    array_name="$1"
-    from="$2"
-    to="$3"
-
-    eval "$array_name+=($(seq -s ' ' "0x$from" "0x$to"))"
-}
-
-print_c_array () {
-
-    array_name="$1"
-    eval "array=("\${$array_name[@]}")"
-    array_length=${#array[@]}
-    i=1
-
-    printf "static const uint32_t %s[]= {\n\t" "$array_name"
-
-    for val in "${array[@]}" ; do
-        printf "0x%0${WIDTH}X" "$val"
-        if [  $i -lt $array_length ]; then
-            echo -n ", "
-            if [ $(($i%8)) -eq 0 ]; then
-                echo -ne "\n\t"
-            fi
-            i=$((i+1))
-        else
-            break
-        fi
-    done
-
-   echo -ne "\n};\n"
-}
-#read everything except comments.
-while read -s -a line; do
-    [ -z "${line[0]}" ] && continue #ignore empty lines
-
-     case "${line[0]}" in \#*) continue ;; esac #ignore comments
-
-    value="${line[0]}"
-    category="${line[2]}"
-
-    case "$value" in
-        *..*)
-            start=`echo "$value" | cut -d . -f 1`
-            end=`echo "$value" | cut -d . -f 3`
-            add_hexrange "$category" "$start" "$end"
-            ;;
-        *)
-            value=`printf "%05X" $((16#$value))`
-            eval "$category+=(0x\$value)"
-            ;;
-        esac;
-
-done
-
-printf "/* This file is automatically generated by %s from WordBreakProperty.txt */\n" "$0"
-
-print_c_array CR
-print_c_array LF
-print_c_array Newline
-print_c_array Extend
-print_c_array Regional_Indicator
-print_c_array Format
-print_c_array Katakana
-print_c_array Hebrew_Letter
-print_c_array ALetter
-print_c_array Single_Quote
-print_c_array Double_Quote
-print_c_array MidNumLet
-print_c_array MidLetter
-print_c_array MidNum
-print_c_array Numeric
-print_c_array ExtendNumLet
-
diff -r 0cbb125046a5 -r 58d7234a6658 src/lib-fts/word-break-data.sh
--- a/src/lib-fts/word-break-data.sh	Tue May 12 12:45:34 2015 +0300
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,77 +0,0 @@
-#!/bin/bash
-
-#Array names match category names in data file.
-array_names="White_Space Dash Terminal_Punctuation STerm Pattern_White_Space"
-declare -a White_Space
-declare -a Dash
-declare -a Terminal_Punctuation
-declare -a STerm
-declare -a Pattern_White_Space
-#TODO include Pattern_Syntax?
-
-WIDTH=5
-
-add_hexrange () {
-
-    array_name="$1"
-    from="$2"
-    to="$3"
-
-    eval "$array_name+=($(seq -s ' ' "0x$from" "0x$to"))"
-}
-
-print_c_array () {
-
-    array_name="$1"
-    eval "array=("\${$array_name[@]}")"
-    array_length=${#array[@]}
-    i=1
-
-    printf "static const uint32_t %s[]= {\n\t" "$array_name"
-
-    for val in "${array[@]}" ; do
-        printf "0x%0${WIDTH}X" "$val"
-        if [  $i -lt $array_length ]; then
-            echo -n ", "
-            if [ $(($i%8)) -eq 0 ]; then
-                echo -ne "\n\t"
-            fi
-            i=$((i+1))
-        else
-            break
-        fi
-    done
-
-   echo -ne "\n};\n"
-}
-#read everything except comments.
-while read -s -a line; do
-    [ -z "${line[0]}" ] && continue #ignore empty lines
-
-     case "${line[0]}" in \#*) continue ;; esac #ignore comments
-
-    value="${line[0]}"
-    category="${line[2]}"
-
-   case "$array_names" in
-       *"$category"*)
-           case "$value" in
-               *..*)
-                   start=`echo "$value" | cut -d . -f 1`
-                   end=`echo "$value" | cut -d . -f 3`
-                   add_hexrange "$category" "$start" "$end"
-                   ;;
-               *)
-                   value=`printf "%05X" $((16#$value))`
-                   eval "$category+=(0x\$value)"
-                   ;;
-           esac
-           ;;
-   esac
-done
-printf "/* This file is automatically generated by %s from PropList.txt */\n" "$0"
-
-for name in $array_names; do
-    print_c_array "$name"
-done
-
diff -r 0cbb125046a5 -r 58d7234a6658 src/lib-fts/word-properties.pl
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/lib-fts/word-properties.pl	Tue May 12 16:12:29 2015 +0300
@@ -0,0 +1,34 @@
+#!/usr/bin/env perl
+# Usage: word-properties.pl boundaries|breaks DATAFILE   (C uint32_t arrays are written to stdout)
+use strict;
+use warnings;
+my @categories;
+my $which = @ARGV ? shift(@ARGV) : '';  # no argument: reach the die below without an undef warning
+if ($which eq 'boundaries') {
+    @categories = qw(CR LF Newline Extend Regional_Indicator Format Katakana Hebrew_Letter ALetter
+		    Single_Quote Double_Quote MidNumLet MidLetter MidNum Numeric ExtendNumLet);
+} elsif ($which eq 'breaks') {
+    @categories = qw(White_Space Dash Terminal_Punctuation STerm Pattern_White_Space);
+} else {
+    die "specify 'boundaries' or 'breaks'";
+}
+my $catregexp=join('|', @categories);
+my %catlists = map { $_ => []; } (@categories);
+# Data lines look like "0041..005A    ; ALetter # ..."; a lone code point has no "..XXXX" part.
+while(<>) {
+    next if (m/^#/ or m/^\s*$/);
+    push(@{$catlists{$3}}, defined($2) ? (hex($1)..hex($2)) : hex($1))
+	if (m/([[:xdigit:]]+)(?:\.\.([[:xdigit:]]+))?\s+; ($catregexp) #/)
+}
+# An empty list means a wrong or incompatible data file; fail before emitting a zero-length C array.
+foreach(@categories) { die "no $_ code points found in $ARGV\n" unless (@{$catlists{$_}}); }
+print "/* This file is automatically generated by word-properties.pl from $ARGV */\n";
+foreach(@categories) {
+    my $arref=$catlists{$_};
+    print "static const uint32_t ${_}[]= {\n";
+    while(scalar(@$arref)) {  # splice consumes the list eight values per output line
+	print("\t", join(", ", map { sprintf("0x%05X", $_); } splice(@$arref, 0, 8)));
+	print(scalar(@$arref) ? ", \n" : "\n");
+    }
+    print("};\n");
+}


More information about the dovecot-cvs mailing list