#!/bin/bash
#
# Path: evaluation/benchmarks/dgsh/sequential (1 file changed, +62 -0)
2+
# Pin the C locale so that sorting gives the same order on every machine
LC_ALL=C
export LC_ALL
5+
# Turn the lines on standard input into a ranked frequency list.
# Input:   lines on standard input; only the first whitespace-delimited
#          field of each line is used as the key
# Outputs: one "COUNT KEY" line per distinct key on standard output,
#          sorted in reverse numeric order
ranked_frequency()
{
  # Tally every key, then dump the whole table once input is exhausted
  awk '
    { seen[$1] += 1 }
    END {
      for (key in seen)
        print seen[key], key
    }
  ' |
    # Use the standard sort here
    sort -r -n
}
13+
14+ # Convert standard input to a ranked frequency list of specified n-grams
15+ ngram ()
16+ {
17+ local N=$1
18+
19+ perl -ne ' for ($i = 0; $i < length($_) - ' $N ' ; $i++) {
20+ print substr($_, $i, ' $N ' ), "\n";
21+ }' |
22+ ranked_frequency
23+ }
24+
# Temporary files live in one private directory that is removed on exit:
#   file1 - verbatim copy of standard input
#   file2 - the input split into one word per line
#   file3 - absolute character frequencies, reused for the relative report
tmpdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$tmpdir"' EXIT
file1=$tmpdir/input
file2=$tmpdir/words
file3=$tmpdir/charfreq

# Standard input is needed more than once, so save it first
cat > "$file1"

# Split input one word per line
tr -cs a-zA-Z '\n' < "$file1" > "$file2"

# Digram frequency
echo "Digram frequency"
ngram 2 < "$file2"

# Trigram frequency
echo "Trigram frequency"
ngram 3 < "$file2"

# Word frequency
echo "Word frequency"
ranked_frequency < "$file2"

# Store number of characters (as counted by wc -c) to use in awk below
nchars=$(wc -c < "$file1")

# Character frequency: put every character on a line of its own, then rank
echo "Character frequency"
sed 's/./&\
/g' < "$file1" |
  # Print absolute counts and keep a copy for the relative report
  ranked_frequency | tee "$file3"

# Print relative
echo "Relative character frequency"
# nchars is quoted so that a wc which pads its output with blanks cannot
# split the -v assignment into two arguments.
# OFMT controls how print renders the computed non-integer percentages.
awk -v NCHARS="$nchars" 'BEGIN {
    OFMT = "%.2g%%"}
  {print $1, $2, $1 / NCHARS * 100}' "$file3"
You can’t perform that action at this time.
0 commit comments