character data generation

2018-03-08 00:02:35 +01:00 · 2018-03-08 00:02:35 +01:00 · a352730004
parent 41f72ab4d7
commit a352730004
3 changed files with 237 additions and 0 deletions
--- a/newlib/libc/ctype/mkcaseconv
+++ b/newlib/libc/ctype/mkcaseconv
@ -0,0 +1,128 @@
+#! /bin/sh -f
+
+# generate a table for Unicode case conversion; entries:
+# struct caseconv_entry defined in towctrans_l.c
+
+# take the first readable copy: current directory first, then the system UCD
+UnicodeData=
+for candidate in UnicodeData.txt /usr/share/unicode/ucd/UnicodeData.txt
+do	if [ -r "$candidate" ]
+	then	UnicodeData=$candidate
+		break
+	fi
+done
+if [ -z "$UnicodeData" ]
+then	echo UnicodeData.txt not found >&2
+	exit 1
+fi
+
+# run the tools of the pipeline below in the C locale
+LC_ALL=C
+export LC_ALL
+
+# true selects the branch below that merges entries into compact ranges
+compact=true
+
+#0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
+#0061;LATIN SMALL LETTER A;Ll;0;L;;;;;N;;;0041;;0041
+#0130;LATIN CAPITAL LETTER I WITH DOT ABOVE;Lu;0;L;0049 0307;;;;N;LATIN CAPITAL LETTER I DOT;;;0069;
+#01C4;LATIN CAPITAL LETTER DZ WITH CARON;Lu;0;L;<compat> 0044 017D;;;;N;LATIN CAPITAL LETTER D Z HACEK;;;01C6;01C5
+#01C5;LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON;Lt;0;L;<compat> 0044 017E;;;;N;LATIN LETTER CAPITAL D SMALL Z HACEK;;01C4;01C6;01C5
+#01C6;LATIN SMALL LETTER DZ WITH CARON;Ll;0;L;<compat> 0064 017E;;;;N;LATIN SMALL LETTER D Z HACEK;;01C4;;01C5
+
+# Extract the case mappings from UnicodeData.txt (sample lines above).
+# tr strips carriage returns.  Each of the three sed substitutions matches a
+# complete 15-field line and requires one of the last three fields to be
+# non-empty (field 13, 14 and 15 respectively); a matching line is rewritten to
+#   src CODE upper "F13" lower "F14" title "F15"
+# "t" then skips the remaining commands so the rewritten line is printed;
+# lines that match none of the three forms are deleted by the final "d".
+tr -d '\015' < $UnicodeData |
+sed \
+-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);\([^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
+-e t \
+-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);\([^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
+-e t \
+-e 's,^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;]*\);\([^;][^;]*\)$,src \1 upper "\2" lower "\3" title "\4",' \
+-e t \
+-e d |
+(#src 01C5 upper "01C4" lower "01C6" title "01C5"
+if $compact
+then
+  (
+  cat <<\/EOS
+  # src CODE upper "U" lower "L" title "T"
+  # Invoked once per line of the extracted mapping list, so that
+  #   $1 = code point (hex), $3 = upper mapping, $5 = lower mapping;
+  # either mapping may be empty, and the title mapping ($7) is not used here.
+  # tohi / tolo are the distances from the code point to its upper / lower
+  # mapping (0 when there is none).  At most one line is printed:
+  #   "  0xCODE TOUP d"   upper mapping only, at distance d
+  #   "  0xCODE TOLO d"   lower mapping only, at distance d
+  #   "  0xCODE TO1 EVENCAP|ODDCAP"  the partner is the adjacent code point;
+  #                       the tag tells whether the capital of the pair has
+  #                       an even or an odd code (last hex digit of CODE)
+  #   "  0xCODE TOBOTH 0" lower is CODE+1 and upper is CODE-1
+  #   "'#error' U+CODE ..." any combination that is not expected; the quotes
+  #                       keep "#error" from starting a comment when the line
+  #                       is later fed to sh as an argument of "item"
+  src () {
+    if [ -n "$3" ]
+    then	tohi=$(( 0x0$3 - 0x0$1 ))
+    else	tohi=0
+    fi
+    if [ -n "$5" ]
+    then	tolo=$(( 0x0$5 - 0x0$1 ))
+    else	tolo=0
+    fi
+    case "$tolo.$tohi" in
+    # no effective mapping in either direction: print nothing
+    0.0)	true;;
+    # upper mapping only
+    0.*)
+	case "$1.$tohi" in
+	*[02468ACE].1)	echo "'#error' U+$1 ODDSML";;
+	*[02468ACE].-1)	echo "  0x$1 TO1 ODDCAP";;
+	*[13579BDF].1)	echo "'#error' U+$1 EVENSML";;
+	*[13579BDF].-1)	echo "  0x$1 TO1 EVENCAP";;
+	*)		echo "  0x$1 TOUP $tohi";;
+	esac;;
+    # lower mapping only
+    *.0)
+	case "$1.$tolo" in
+	*[02468ACE].1)	echo "  0x$1 TO1 EVENCAP";;
+	*[02468ACE].-1)	echo "'#error' U+$1 EVENSML";;
+	*[13579BDF].1)	echo "  0x$1 TO1 ODDCAP";;
+	*[13579BDF].-1)	echo "'#error' U+$1 ODDSML";;
+	*)		echo "  0x$1 TOLO $tolo";;
+	esac;;
+    # both mappings present: only the "between its two neighbours" form is known
+    *)	case "$tolo.$tohi" in
+	1.-1)		echo "  0x$1 TOBOTH 0";;
+	*)		echo "'#error' U+$1";;
+	esac;;
+    esac
+  }
+/EOS
+  cat
+  ) | sh |
+  uniq -f1 --group=append | sed -e "s,^$,range," -e t -e "s,^,item ," |
+  (
+  cat <<\/EOS
+  # state of the run being collected: first code of the run, number of
+  # codes in it minus one (-1 = nothing collected), and the cap for diff
+  first=
+  diff=-1
+  max=255
+  # range: print the collected run, if any, as one table row and reset
+  range () {
+	# invariant: $diff == $(($last - $first)); negative means no items yet
+	[ "$diff" -lt 0 ] || echo "  {$first, $diff, $v2, $v3},"
+	first=
+	diff=-1
+  }
+  # item CODE V2 V3
+  # Add one entry to the run being collected, where $1 is the code point
+  # (0x...) and $2/$3 are the two value columns shared by the whole run.
+  # The run is flushed through range first when it has reached $max + 1
+  # entries or when CODE does not directly follow the previous code.
+  # A "#error" line is passed through unchanged and does not touch the run.
+  item () {
+	# POSIX test only defines "=" for string equality; "==" is rejected by
+	# e.g. dash, and this code is piped to plain sh
+	if [ "$1" = "#error" ]
+	then	echo "$*"
+		return
+	fi
+
+	if [ $diff -eq $max ]
+	then	range
+	elif [ -n "$first" ]
+	then	if [ $(( $1 )) -ne $(( ${last-0} + 1 )) ]
+		then	range
+		fi
+	fi
+
+	# no run open (at the start or right after range): this entry opens one
+	if [ -z "$first" ]
+	then	first=$1
+		v2=$2
+		v3=$3
+	fi
+
+	last=$1
+	diff=$(( $diff + 1 ))
+  }
+/EOS
+  cat
+  ) | sh
+elif false
+then
+  # disabled variant (the condition is the constant "false"): like the else
+  # branch below, but each "0xA - 0xB" difference is turned into a shell
+  # arithmetic expansion and every line is wrapped in echo "..." and run
+  # through sh, so the table receives the computed numbers
+  sed -e 's/src \([^ ]*\) upper "\([^ ]*\)" lower "\([^ ]*\)" title "\([^ ]*\)"/  {0x\1, 0x\2 - 0x\1, 0x\3 - 0x\1},/' \
+      -e 's/0x - 0x[^ ,}]*/0/g' -e 's/0x}/0}/' \
+      -e 's/\(0x[0-9A-F][0-9A-F]*\) - \(0x[0-9A-F][0-9A-F]*\)/$((`printf %d \1` - `printf %d \2`))/g' \
+      -e 's/^/echo "/' -e 's/$/"/' |
+  sh
+else
+  # non-compact output: one row {0xCODE, 0xUPPER - 0xCODE, 0xLOWER - 0xCODE}
+  # per input line with the differences left as expressions; a difference
+  # whose mapping is empty ("0x - 0x...") is replaced by 0
+  sed -e 's/src \([^ ]*\) upper "\([^ ]*\)" lower "\([^ ]*\)" title "\([^ ]*\)"/  {0x\1, 0x\2 - 0x\1, 0x\3 - 0x\1},/' \
+      -e 's/0x - 0x[^ ,}]*/0/g' -e 's/0x}/0}/'
+fi
+) > caseconv.t
--- a/newlib/libc/ctype/mkcategories
+++ b/newlib/libc/ctype/mkcategories
@ -0,0 +1,69 @@
+#! /bin/sh
+
+# generate table of Unicode character category ranges;
+# note: undefined characters between two characters of the same category
+# are associated to the same category, e.g.
+#0A0A;GURMUKHI LETTER UU;Lo
+#0A0B..0A0E           -> Lo
+#0A0F;GURMUKHI LETTER EE;Lo
+
+# take the first readable copy: current directory first, then the system UCD
+UnicodeData=
+for candidate in UnicodeData.txt /usr/share/unicode/ucd/UnicodeData.txt
+do	if [ -r "$candidate" ]
+	then	UnicodeData=$candidate
+		break
+	fi
+done
+if [ -z "$UnicodeData" ]
+then	echo UnicodeData.txt not found >&2
+	exit 1
+fi
+
+# category folding switches used as commands further down;
+# the code assumes foldall=false, foldcase=true
+foldall=false
+foldcase=true
+
+(
+cat <<\/EOS
+first=
+# item CODE CATEGORY: append CODE (hex, no prefix) to the run being
+# collected, closing that run first if CODE does not directly follow it
+item () {
+	if [ -n "$first" ] && [ $(( 0x$1 )) -ne $(( 0x${last-0} + 1 )) ]
+	then	range
+	fi
+
+	# no run open (at the start or right after range): CODE opens one
+	[ -n "$first" ] || {
+		first=$1
+		val=$2
+	}
+
+	last=$1
+}
+# range: print the run collected by item as one table row
+# {CAT_<category>, <first code>, <last - first>} and mark the run as closed
+range () {
+	# earlier row layouts, kept for reference:
+#	echo "    {0x$first, 0x$last, CAT_$val},"
+#	echo "    {0x$first, $((0x$last - 0x$first)), CAT_$val},"
+#	echo "    {0x$first | (CAT_$val << 24), $((0x$last - 0x$first))},"
+	printf '    {CAT_%s, 0x%s, %s},\n' "$val" "$first" "$((0x$last - 0x$first))"
+	first=
+}
+/EOS
+
+# Turn UnicodeData.txt into calls of the functions defined above:
+#  1. optionally fold categories:
+#     foldall  - merge related categories into combined names (LC, Cfs, Lmo,
+#                P, S, Zlp, X, M);
+#     foldcase - rename Lu to LC on lines whose field 14 is non-empty and
+#                Ll to LC on lines whose field 13 is non-empty, and drop the
+#                lines of category Co;
+#     otherwise the data passes through unchanged;
+#  2. reduce each line to "CODE<TAB>CATEGORY" (fields 1 and 3);
+#  3. uniq -f1 --group=append compares lines ignoring the code column and
+#     emits an empty line after each group of equal categories; the last
+#     sed turns that empty line into "range" and every other line into
+#     "item CODE CATEGORY".
+cat "$UnicodeData" |
+if $foldall
+then sed -e "s,;L[lu];,;LC;," -e "s,;C[fs];,;Cfs;," \
+	 -e "s,;L[mo];,;Lmo;," -e "s,;Nl;,;Lmo;," \
+	 -e "s,;P.;,;P;,"  -e "s,;No;,;P;," \
+	 -e "s,;S.;,;S;," -e "s,;Z[lp];,;Zlp;," \
+	 -e "s,;C[no];,;X;," -e "s,;M[cen];,;M;,"
+elif $foldcase
+then
+# fold Lu/Ll to LC only if lower/upper conversion is available
+ sed -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;]*\);\([^;][^;]*\);.*/ s/;Lu;/;LC;/' \
+     -e '/^\([^;]*\);[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;[^;]*;\([^;][^;]*\);\([^;]*\);.*/ s/;Ll;/;LC;/' \
+     -e '/;Co;/ d'
+else cat
+fi |
+sed -e "s,^\([^;]*\);[^;]*;\([^;]*\);.*,\1	\2," |
+uniq -f1 --group=append | sed -e "s,^$,range," -e t -e "s,^,item ,"
+) | sh > categories.t
+
+# list every distinct CAT_* name used in categories.t, one "  CAT_xx," per line
+sed -e "s/.*\(CAT_[A-Za-z]*\).*/  \1,/" < categories.t |
+sort -u > categories.cat
+
--- a/newlib/libc/ctype/mkunidata
+++ b/newlib/libc/ctype/mkunidata
@ -0,0 +1,40 @@
+#! /bin/sh
+
+echo generating Unicode character properties data for newlib/libc/ctype
+
+# Work in the directory that holds this script: the generators are started
+# as ./mk* and use data files relative to the current directory.  The path
+# is quoted (it may contain blanks), and the script stops if the directory
+# cannot be entered instead of generating files somewhere else.
+cd "$(dirname "$0")" || exit 9
+
+#############################################################################
+# checks and (with option -u) download
+
+# -u: download the data files; otherwise require the unicode-ucd package
+case "$1" in
+-u)
+	#WGET="wget -N -t 1 --timeout=55"
+	# The command line has to be quoted: unquoted, "WGET=curl" is merely a
+	# temporary environment assignment for a command named "-R", WGET stays
+	# empty and the loop below would try to execute the URL itself.
+	WGET="curl -R -O --connect-timeout 55"
+
+	echo downloading data from unicode.org
+	for data in UnicodeData.txt
+	do	if [ -f "$data" ]
+		then	# -z FILE: only transfer if the remote copy is newer
+			$WGET -z "$data" "http://unicode.org/Public/UNIDATA/$data"
+		else	$WGET "http://unicode.org/Public/UNIDATA/$data"
+		fi
+	done
+	;;
+*)	echo checking package unicode-ucd
+	grep unicode-ucd /etc/setup/installed.db || exit 9
+	;;
+esac
+
+# a data file that is not readable here is linked from the system UCD
+# directory; give up with status 9 if the link cannot be created
+for data in UnicodeData.txt
+do	if test ! -r $data
+	then	ln -s /usr/share/unicode/ucd/$data . || exit 9
+	fi
+done
+
+#############################################################################
+# table generation
+
+# run the two generator scripts that sit next to this one
+echo 'generating character category table for isw*.c'
+sh ./mkcategories
+
+echo 'generating case conversion table for tow*.c'
+sh ./mkcaseconv
+
+#############################################################################
+# end