forked from ocaml/ocaml
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck-typo
executable file
·457 lines (411 loc) · 15.8 KB
/
check-typo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
#!/bin/sh
#**************************************************************************
#* *
#* OCaml *
#* *
#* Damien Doligez, projet Gallium, INRIA Rocquencourt *
#* *
#* Copyright 2012 Institut National de Recherche en Informatique et *
#* en Automatique. *
#* *
#* All rights reserved. This file is distributed under the terms of *
#* the GNU Lesser General Public License version 2.1, with the *
#* special exception on linking described in the file LICENSE. *
#* *
#**************************************************************************
# check-typo - Check typographic conventions on OCaml sources.
# This program will check files for the following rules:
# - absence of TAB characters (tab)
# - absence of non-ASCII characters (non-ascii)
# - absence of non-printing ASCII characters (non-printing)
# - absence of white space at end of line (white-at-eol)
# - absence of empty lines at end of file (white-at-eof)
# - presence of a LF character at the end of the file (missing-lf)
# - maximum line length of 80 characters (long-line)
# - maximum line length of 132 characters (very-long-line)
# - presence of a copyright header (missing-header)
# - absence of a leftover "$Id" string (svn-keyword)
# Exceptions are handled with git attributes: "typo.*".
# Its value for a given file is a comma-separated list of rule names,
# which lists the rules that should be disabled for this file.
# The rule names are the ones shown above in parentheses.
# Built-in exception:
# - Any file git identifies as binary
# is automatically exempt from all the rules.
# ASCII characters are bytes from 0 to 127. Any other byte is
# flagged as a non-ASCII character.
# For the purpose of this tool, printing ASCII characters are:
# - the non-white printable ASCII characters (33 to 126)
# - TAB (09)
# - LF (10)
# - SPC (32)
# Anything else is flagged as a non-printing ASCII character.
# This program will recursively explore the files and directories given
# on the command line (or by default the current directory), and check
# every file therein for compliance to the rules.
# Directories named .git (and their contents) are always ignored.
# This program ignores any file that is not under git control, unless
# explicitly given on the command line.
# If a directory has the git attribute "typo.prune" then it and its contents are
# ignored.
# You can ignore a rule by giving the option -<rule> on the command
# line (before any file names).
# Files which include the utf8 rule will be validated using grep and line-length
# computations will take UTF-8 sequences into account. As a special case, UTF-8
# sequences are always allowed in the copyright headers.
# First prevent i18n from messing up everything.
export LC_ALL=C
OCAML_CT_CAT=${OCAML_CT_CAT:-cat}
OCAML_CT_LS_FILES=${OCAML_CT_LS_FILES:-git ls-files}
OCAML_CT_HEAD=${OCAML_CT_HEAD:-HEAD}
OCAML_CT_AWK=${OCAML_CT_AWK:-awk}
if [ -z "${OCAML_CT_GIT_INDEX+x}" ] ; then
OCAML_CT_GIT_INDEX=
else
OCAML_CT_GIT_INDEX="GIT_INDEX_FILE=$OCAML_CT_GIT_INDEX"
fi
# The output of processing the attributes should be whitespace-separated with
# - the "typo." prefix dropped
# - unset/false keys not present
# - set/true keys present
# - "may" keys present, suffixed by a question mark
#
# for example,
# typo.long-line: set
# typo.missing-header: may
# typo.very-long-line: false
# should result in "long-line missing-header?"
get_attrs() {
env $OCAML_CT_GIT_INDEX git check-attr --all $OCAML_CT_CA_FLAG "$1" \
| grep -o " typo\\..*$" | sed "s/ typo\\.//g" \
| grep -v ": unset" | grep -v ": false" \
| sed "s/: set//g" | sed "s/: true//g" | sed "s/: may/?/g"
}
# empty if the path is *not* pruned
check_prune() {
env $OCAML_CT_GIT_INDEX git check-attr typo.prune $OCAML_CT_CA_FLAG "$1" \
| grep -v ': unspecified$' | grep -v ': false$'
}
# Special case for recursive call from the find command (see IGNORE_DIRS).
case "$1" in
--check-prune)
case $2 in
.git|.git/*)
echo "INFO: pruned path $2 (.git)" >&2
exit 0;;
esac
if test -n "$(check_prune "$2")"; then
echo "INFO: pruned path $2 (typo.prune)" >&2
exit 0
fi
exit 3;;
esac
case "$1" in
--get-attrs)
get_attrs "$2"
exit 0;;
esac
usage () {
echo "usage: check-typo {-<rule>} [--] {<file-or-dir>}" >&2
exit 2
}
check_script () {
if [ "$($OCAML_CT_CAT "$OCAML_CT_PREFIX$1" \
| sed -ne '1s/^#!.*/#!/p')" != '#!' ] ; then
# These files are listed manually, rather than via gitattributes,
# because the list should never expand, and it should not be trivial to
# expand (the unix-execvpe test is an ultra-special-case!)
f=${1#./}
if [ "$f" != "boot/ocamlc" ] && [ "$f" != "boot/ocamllex" ] && \
[ "$f" != "testsuite/tests/lib-unix/unix-execvpe/subdir/script2" ] ; then
echo "$1 shouldn't be executable; either:"
echo " - Add a #! line"
echo " - Run chmod -x $1 (on Unix)"
echo " - Run git update-index --chmod=-x $1 (on Windows)"
echo "You may wish to check your core.fileMode setting"
EXIT_CODE=1
fi
fi
}
userrules=''
while : ; do
case "$1" in
-help|--help) usage;;
-*) userrules="${1#-} $userrules"; shift;;
--) shift; break;;
*) break;;
esac
done
IGNORE_DIRS="
-name .git -prune -o
-type d -exec $0 --check-prune {} ; -prune -o
"
# `-type d`: simple files (not directories) are not pruned during the
# "find" invocation but below (look for "check_prune") for performance
# reasons: most files outside pruned directories are not pruned, so it
# is faster to optimistically run check-typo on them (and maybe get
# out in the middle) than to first check then run.
TEST_AWK='BEGIN {if ("a{1}" ~ /a{1}/) exit 0}'
if $OCAML_CT_AWK "$TEST_AWK" ; then
TEST_AWK='BEGIN {if ("a" ~ /a{1}/) exit 0}'
if $OCAML_CT_AWK --re-interval "$TEST_AWK" 2>/dev/null ; then
OCAML_CT_AWK="$OCAML_CT_AWK --re-interval"
else
echo "This script requires interval support in regexes ({m} notation)">&2
echo "Please install a version of awk (e.g. gawk) which supports this">&2
exit 2
fi
fi
EXIT_CODE=0
( case $# in
0) find . $IGNORE_DIRS -type f -print;;
*) for i in "$@"; do find "$i" $IGNORE_DIRS -type f -print; done;;
esac
) | (
while read f; do
if test -n "$(check_prune "$f")"; then continue; fi
if $(git check-ignore -q "$f"); then continue; fi
case `$OCAML_CT_LS_FILES "$f" 2>&1` in
"") path_in_index=false;;
*) path_in_index=true;;
esac
case "$*" in
*$f*) is_cmd_line=true;;
*) is_cmd_line=false;;
esac
if [ -z "$OCAML_CT_PREFIX" ] ; then
if [ -x "$f" ] ; then
check_script "$f"
fi
else
if git ls-files -s "$f" | grep -q "^100755" ; then
check_script "$f"
fi
fi
if $path_in_index || $is_cmd_line; then :; else continue; fi
attr_rules=''
if $path_in_index; then
# Below is a git plumbing command to detect whether git regards a
# particular file as binary. This takes into account .gitattributes, but
# also works if the file has been automatically detected as binary by git.
# EMPTY is the hash of the empty tree (which is specially known to git -
# it is automatically included in every repository) as a way to get
# `diff-tree` to print the whole tree state; its `--numstat` output then
# prints a summary where two dashes in the first two columns indicates a
# binary file.
# (See https://git-scm.com/docs/git-diff-tree#_other_diff_formats and
# the documentation for the --numstat option. Commands designated as
# "plumbing" commands in git have stable output intended for parsing)
EMPTY=`git hash-object -t tree /dev/null`
git diff-tree --numstat $EMPTY $OCAML_CT_HEAD -- "$f" \
| grep -q "^-[[:blank:]]-" && continue
attr_rules=$(get_attrs "$f")
fi
rules="$userrules"
# remove newlines, ensure spaces at boundary
rules=" $(echo $rules) "
attr_rules=" $(echo $attr_rules) "
if test -n "$(echo "$rules $attr_rules" | grep " utf8 ")"
then
# grep -a is used to force the file to be considered as text and -x
# requires the entire line to match. This specifically detects the
# presence of lines containing malformed UTF-8. It may be tested using
# https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
if $OCAML_CT_CAT "$OCAML_CT_PREFIX$f" \
| LC_ALL=en_US.UTF8 grep -qaxv '.*' ; then
echo "File \"$f\" is not correctly encoded in UTF-8"
exit 2
fi
fi
if ! \
($OCAML_CT_CAT "$OCAML_CT_PREFIX$f" | tr -d '\r'; echo) \
| $OCAML_CT_AWK -v rules="$rules" -v attr_rules="$attr_rules" -v file="$f" \
'
function is_err(name) {
return ((rules attr_rules) !~ (" " name "[\\? ]"));
}
function report_err(name, msg) {
printf ("%s:%d.%d:", file, NR, RSTART + RLENGTH);
printf (" [%s] %s\n", name, msg);
got_errors = 1;
}
function err(name, msg) {
++ counts[name];
if (is_err(name) && counts[name] <= 10) {
report_err(name, msg);
if (counts[name] == 10){
printf ("WARNING: too many [%s] in this file.", name);
printf (" Others will not be reported.\n");
}
}
}
function err_if(guard, name, msg) {
if (is_err(guard)) {
err(name, msg);
} else {
++ counts[name];
}
}
function more_columns(str, limit, c){
c = 0;
for (i = 1; i <= length(str); i++){
if (substr(str, i, 1) == "\t"){
c = int((c + 8) / 8) * 8;
}else{
++ c;
}
}
return c > limit;
}
function utf8_decode(str) {
if (is_err("utf8")) {
return str;
} else {
# This script assumes that the UTF-8 has been externally validated
t = str;
gsub(/[\300-\367][\200-\277]+/, "?", t);
if (t != str) {
++ counts["utf8"];
}
return t;
}
}
BEGIN { state = "(first line)"; }
match($0, /\t/) {
err("tab", "TAB character(s)");
t = utf8_decode($0);
if (more_columns(t, 80)){
RSTART=81;
RLENGTH = 0;
err_if("very-long-line", "long-line", "line is over 80 columns");
}
if (more_columns(t, 132)){
RSTART=133;
RLENGTH = 0;
err("very-long-line", "line is over 132 columns");
}
}
match($0, /[\200-\377]/) \
&& state != "authors" && state != "copyright" {
if (is_err("utf8")) {
err("non-ascii", "non-ASCII character(s)");
if (header_utf8 && !is_err("non-ascii")) {
err("non-ascii-utf8", \
"non-ASCII character(s) AND UTF-8 encountered");
}
} else {
++ counts["utf8"];
}
}
match($0, /[^\t\200-\377 -~]/) {
err("non-printing", "non-printing ASCII character(s)");
}
match($0, /[ \t]+$/) {
err("white-at-eol", "whitespace at end of line");
}
match($0, /\$Id(: .*)?\$/) {
err("svn-keyword", "SVN keyword marker");
}
$0 !~ /\t/ && length($0) > 80 {
t = utf8_decode($0);
sub(/https?:[A-Za-z0-9._~:\/?#\[\]@!$&\047()*+,;=%-]{73,}$/, "", t);
if (length(t) > 80) {
RSTART = 81;
RLENGTH = 0;
err_if("very-long-line", "long-line", "line is over 80 columns");
}
}
$0 !~ /\t/ && length($0) > 132 {
RSTART = 133;
RLENGTH = 0;
t = utf8_decode($0);
if (length(t) > 132) {
err("very-long-line", "line is over 132 columns");
}
}
# Record that the header contained UTF-8 sequences
match($0, /[\300-\367][\200-\277]+/) \
&& (state == "authors" || state == "copyright") {
header_utf8 = 1;
if (counts["non-ascii"] > 0 && is_err("non-ascii")) {
err("non-ascii-utf8", \
"non-ASCII character(s) AND UTF-8 encountered");
}
}
# Header-recognition automaton. Read this from bottom to top.
# Valid UTF-8 chars are recognised in copyright and authors
# TODO: ensure all files are valid UTF-8 before awking them.
# Note that this code also assumes that combining characters are NOT
# used (i.e. that every Unicode code-point corresponds to exactly
# one displayed character, i.e. no Camels and no including
# weird-and-wonderful ways of encoded accented letters).
state == "close" && $0 ~ /\*{74}/ { state = "OK"; }
state == "close" { state = "(last line)"; }
state == "blurb" && $0 ~ /\* {72}\*/ { state = "close"; }
state == "blurb" && $0 ~ /\/LICENSE/ { state = "(license path)" }
state == "blurb1" && $0 ~ /\* All rights reserved. .{47} \*/ \
{ state = "blurb"; }
state == "blurb1" { state = "(blurb line 1)"; }
state == "copyright" && $0 ~ /\* {72}\*/ { state = "blurb1"; }
state == "copyright" \
&& $0 !~ /\* Copyright [0-9]{4}([\300-\367][\200-\277]+|.){54} \*/ \
&& $0 !~ /\* ([\300-\367][\200-\277]+|.){66} \*/ \
{ state = "(copyright lines)"; }
state == "authors" && $0 ~ /\* {72}\*/ { state = "copyright"; }
state == "authors" \
&& $0 !~ /\* ([\300-\367][\200-\277]+|.){70} \*/ \
{ state = "(authors)"; }
state == "blank2" && $0 ~ /\* {72}\*/ { state = "authors"; }
state == "blank2" { state = "(blank line 2)"; }
state == "title" && $0 ~ /\* {33}OCaml {34}\*/ { state = "blank2"; }
state == "title" { state = "(title line)"; }
state == "blank1" && $0 ~ /\* {72}\*/ { state = "title"; }
state == "blank1" { state = "(blank line 1)"; }
state == "(first line)" && NR < 4 && $0 ~ /\*{74}/ { state = "blank1"; }
{
prev_line = last_line;
last_line = $0;
}
END {
if (match(last_line, /.+/)){
err("missing-lf", "missing linefeed at EOF");
prev_line = last_line;
++ NR;
empty_file = 0;
}else{
empty_file = NR == 1;
}
if (!empty_file && match(prev_line, /^$/)){
err("white-at-eof", "empty line(s) at EOF");
}
if (state != "OK"){
if (NR >= 10){
NR = 1;
RSTART = 1;
RLENGTH = 0;
err("missing-header", sprintf("bad copyright header %s", state));
}else{
counts["missing-header"] = 1;
}
}
split(attr_rules, r, "[? ]");
for (i in r){
name = r[i];
if (name != "" && !counts[name]){
NR = 1;
RSTART = 1;
RLENGTH = 0;
if (attr_rules !~ (" " name "\\? ")) {
report_err(name,
sprintf("attribute is unused", name));
}
}
}
exit got_errors;
}
' ; then
EXIT_CODE=1
fi
done
exit $EXIT_CODE
)