forked from squid-cache/squid
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathupdate-contributors.pl
executable file
·383 lines (311 loc) · 11.3 KB
/
update-contributors.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#!/usr/bin/perl -w
#
## Copyright (C) 1996-2023 The Squid Software Foundation and contributors
##
## Squid software is distributed under GPLv2+ license and includes
## contributions from numerous individuals and organizations.
## Please see the COPYING and CONTRIBUTORS files for details.
##
use strict;
use warnings;
use Getopt::Long;
# Reads (presumed to be previously vetted) CONTRIBUTORS file.
# Reads untrusted CONTIBUTORS-like new input (without the preamble).
# Reports and ignores invalid new contributor entries.
# Reports and ignores valid new entries already covered by CONTRIBUTORS.
# Prints CONTRIBUTORS preamble, vetted entries, and imported new contributors
# using CONTRIBUTORS file format.
my $VettedLinesIn = 0;
my $NewLinesIn = 0;
my $LinesOut = 0;
my $SkippedBanned = 0;
my $SkippedAlreadyVetted = 0;
my $SkippedNewDuplicates = 0;
my $SkippedEmptyLines = 0;
my $SkippedBadLines = 0;
# Brief display by default.
# Use --quiet for no output
# Use -v or --verbose for more details, repeating them for even more details.
my $VerboseOutput = 1;
GetOptions(
'quiet' => sub { $VerboseOutput = 0 },
'verbose+' => \$VerboseOutput, 'v+' => \$VerboseOutput,
) or die("$0: Bad command line arguments\n");
my @VettedContributors = ();
my @NewContributors = ();
my %Problems = ();
exit &main();
# whether the new entry is already sufficiently represented by the vetted one
sub similarToVetted
{
my ($c, $vetted) = @_;
# It is not critical (and is probably impossible) to get this right for
# every single use case. When the script gets it wrong, a human can always
# update CONTRIBUTORS manually. Rare mistakes are not a big deal.
# same email is enough, regardless of name differences
if (defined($c->{email}) && defined($vetted->{email})) {
my $diff = &caseCmp($c->{email}, $vetted->{email});
return 1 if $diff == 0;
}
# same name is enough, regardless of email differences
if (defined($c->{name}) && defined($vetted->{name})) {
my $diff = &caseCmp($c->{name}, $vetted->{name});
return 1 if $diff == 0;
}
return 0;
}
# ensures final, stable order while guaranteeing no duplicates
sub cmpContributorsForPrinting
{
my ($l, $r) = @_;
my $diff = &cmpContributors($l, $r);
return $diff if $diff;
# now case-sensitively
$diff = &contributorToString($l) cmp &contributorToString($r);
return $diff if $diff;
die("duplicates in output");
}
# case-insensitive comparison
# for list stability, use cmpContributorsForPrinting() when ordering entries
sub cmpContributors
{
my ($l, $r) = @_;
# Compare based on the first field (name or, if nameless, email)
# Do not use &contributorToString() on nameless entries because the
# leading "<" in such entries will group them all together. We want
# nameless entries to use email (without brackets) for this comparison.
my $lRep = defined($l->{name}) ? $l->{name} : $l->{email};
my $rRep = defined($r->{name}) ? $r->{name} : $r->{email};
die() unless defined($lRep) && defined($rRep);
my $diff = &caseCmp($lRep, $rRep);
return $diff if $diff;
# nameless entries go after (matching) named entries
return -1 if defined($l->{name}) && !defined($r->{name});
return +1 if !defined($l->{name}) && defined($r->{name});
return 0 if !defined($l->{name}) && !defined($r->{name});
# we are left with the same-name entries
die() unless defined($l->{name}) && defined($r->{name});
# email-less entries go after (same-name) with-email entries
return -1 if defined($l->{email}) && !defined($r->{email});
return +1 if !defined($l->{email}) && defined($r->{email});
return 0 if !defined($l->{email}) && !defined($r->{email});
# we are left with same-name entries with emails
return &caseCmp($l->{email}, $r->{email});
}
# whether the given entry is (better) represented by the other one
sub worseThan
{
my ($l, $r) = @_;
return 1 if &cmpContributors($l, $r) == 0; # pure duplicate
return 1 if !defined($l->{name}) && defined($r->{email}) &&
$l->{email} eq $r->{email};
return 1 if !defined($l->{email}) && defined($r->{name}) &&
$l->{name} eq $r->{name};
return 0;
}
# whether the entry should be excluded based on some out-of-band rules
sub isManuallyExcluded
{
my ($c) = @_;
return lc(contributorToString($c)) =~ /squidadm/; # a known bot
}
sub contributorToString
{
my ($c) = @_;
if (defined($c->{name}) && defined($c->{email})) {
return sprintf("%s <%s>", $c->{name}, $c->{email});
}
if (defined $c->{name}) {
return $c->{name};
}
die() unless defined $c->{email};
return sprintf("<%s>", $c->{email});
}
sub printContributors
{
foreach my $c (sort { &cmpContributorsForPrinting($a, $b) } (@VettedContributors, @NewContributors)) {
my $entry = &contributorToString($c);
die() unless defined $entry && length $entry;
&lineOut(" $entry\n");
}
}
# convert an unvetted/raw input line into a {name, email, ...} object
sub parseContributor
{
s/^\s+|\s+$//g; # trim
my $trimmedRaw = $_;
s/\s+/ /g; # canonical space characters
die() unless length $_;
return "entry with strange characters" if /[^-,_+'" a-zA-Z0-9@<>().]/;
my $name = undef();
my $email = undef();
if (s/\s*<(.*)>$//) {
$email = $1 if length $1;
return "multiple emails" if defined($email) && $email =~ /,/;
return "suspicious email" if defined($email) && !&isEmail($email);
}
# convert: name@example.com <>
# into: <name@example.com>
if (!defined($email) && &isEmail($_)) {
$email = $_;
$_ = '';
}
$name = $_ if length $_;
if (defined($name)) {
return "name that looks like email" if $name =~ /@|<|\sat\s|^unknown$/;
# strip paired surrounding quotes
if ($name =~ /^'\s*(.*)\s*'$/ || $name =~ /^"\s*(.*)\s*"$/) {
$name = $1;
}
}
return "nameless, email-less entry" if !defined($name) && !defined($email);
return {
name => $name,
email => $email,
raw => $trimmedRaw,
};
}
# Handle CONTRIBUTORS file, printing preamble and loading vetted entries. The
# parsing rules here are a lot more relaxed because we know that this vetted
# content might contain manual entries that violate our automated rules.
sub loadVettedContributors
{
my ($vettedFilename) = @_;
open(IF, "<$vettedFilename") or die("Cannot open $vettedFilename: $!\n");
while (<IF>) {
my $original = $_;
++$VettedLinesIn;
if (s/^\S// || s/^\s*$//) {
# preamble and its terminator (a more-or-less empty line)
&lineOut($original);
next;
}
chomp;
s/^\s+|\s+$//g; # trim
my $trimmedRaw = $_;
my ($name, $email);
if (s/\s*<(.+)>$//) {
$email = $1;
}
if (length $_) {
$name = $_;
die("Malformed vetted entry name: ", $name) if $name =~ /[@<>]/;
}
die("Malformed $vettedFilename entry:", $original) if !defined($name) && !defined($email);
push @VettedContributors, {
name => $name,
email => $email,
raw => $trimmedRaw,
};
}
close(IF) or die();
die() unless @VettedContributors;
}
# import contributor (name, email) pairs from CONTRIBUTOR-like input
# skip unwanted entries where the decision can be made w/o knowing all entries
sub loadCandidates
{
while (<>) {
++$NewLinesIn;
my $original = $_;
chomp;
s/^\s+|\s+$//g; # trim
if (!length $_) {
++$SkippedEmptyLines;
next;
}
my $c = &parseContributor();
die() unless $c;
if (!ref($c)) {
¬eProblem("Skipping %s: %s", $c, $original);
++$SkippedBadLines;
next;
}
die(ref($c)) unless ref($c) eq 'HASH';
if (&isManuallyExcluded($c)) {
¬eProblem("Skipping banned entry: %s\n", $c->{raw}) if ($VerboseOutput > 0);
++$SkippedBanned;
next;
}
if (my ($vettedC) = grep { &similarToVetted($c, $_) } @VettedContributors) {
if ($VerboseOutput > 1) {
¬eProblem("Skipping already vetted:\n %s\n %s\n", $vettedC->{raw}, $c->{raw})
unless &contributorToString($vettedC) eq &contributorToString($c);
}
++$SkippedAlreadyVetted;
next;
}
push @NewContributors, $c;
}
}
sub pruneCandidates
{
my @ngContributors = ();
while (@NewContributors) {
my $c = pop @NewContributors;
if (my ($otherC) = grep { &worseThan($c, $_) } (@VettedContributors, @NewContributors, @ngContributors)) {
if ($VerboseOutput > 0) {
¬eProblem("Skipping very similar:\n %s\n %s\n", $otherC->{raw}, $c->{raw})
unless &contributorToString($otherC) eq &contributorToString($c);
}
++$SkippedNewDuplicates;
next;
}
push @ngContributors, $c;
}
@NewContributors = @ngContributors;
}
sub lineOut {
print(@_);
++$LinesOut;
}
# report the given problem, once
sub noteProblem {
my $format = shift;
my $problem = sprintf($format, @_);
return if exists $Problems{$problem};
$Problems{$problem} = undef();
print(STDERR $problem);
}
sub isEmail
{
my ($raw) = @_;
return $raw =~ /^\S+@\S+[.]\S+$/;
}
sub caseCmp
{
my ($l, $r) = @_;
return (uc $l) cmp (uc $r);
}
sub main
{
&loadVettedContributors("CONTRIBUTORS");
&loadCandidates();
&pruneCandidates();
my $loadedNewContributors = scalar @NewContributors;
die("$NewLinesIn != $SkippedEmptyLines + $SkippedBadLines + $SkippedBanned + $SkippedAlreadyVetted + $SkippedNewDuplicates + $loadedNewContributors; stopped")
unless $NewLinesIn == $SkippedEmptyLines + $SkippedBadLines + $SkippedBanned + $SkippedAlreadyVetted + $SkippedNewDuplicates + $loadedNewContributors;
&printContributors();
if ($VerboseOutput > 1) {
printf(STDERR "Vetted lines in: %4d\n", $VettedLinesIn);
printf(STDERR "Updated lines out: %4d\n", $LinesOut);
printf(STDERR "\n");
}
if ($VerboseOutput > 2) {
printf(STDERR "New lines in: %4d\n", $NewLinesIn);
printf(STDERR "Skipped empty lines: %4d\n", $SkippedEmptyLines) unless ($SkippedEmptyLines == 0);
printf(STDERR "Skipped duplicates: %4d\n", $SkippedNewDuplicates) unless ($SkippedNewDuplicates == 0);
}
if ($VerboseOutput > 1) {
printf(STDERR "Skipped banned: %4d\n", $SkippedBanned) unless ($SkippedBanned == 0);
printf(STDERR "Skipped similar: %4d\n", $SkippedAlreadyVetted) unless ($SkippedAlreadyVetted == 0);
}
if ($VerboseOutput > 0) {
printf(STDERR "Skipped bad lines: %4d\n", $SkippedBadLines) unless ($SkippedBadLines == 0);
printf(STDERR "\n");
printf(STDERR "Vetted contributors: %3d\n", scalar @VettedContributors) if ($VerboseOutput > 1);
printf(STDERR "New contributors: %3d\n", scalar @NewContributors) unless (scalar @NewContributors == 0);
printf(STDERR "Contributors out: %3d\n", @VettedContributors + @NewContributors) if ($VerboseOutput > 1);
}
return 0;
}