-
Notifications
You must be signed in to change notification settings - Fork 0
/
toadaptxt.pl
88 lines (80 loc) · 1.96 KB
/
toadaptxt.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
binmode STDIN, ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";
# Both WORDLIST and FREQLIST should have words, bigrams, trigrams, etc.
if ($#ARGV != 2) {
die "Usage: perl toadaptxt.pl xx WORDLIST FREQLIST";
}
# don't include more than this number of words/phrases in inclusion file
my $max = 100000;
my %dict;
open(DICT, "<:utf8", $ARGV[1]) or die "Could not open clean word list: $!";
while (<DICT>) {
chomp;
$dict{$_}++;
}
close DICT;
my $count = 0;
# don't include words with freq lower than cutoff (so 0 == no effect)
# it's set automatically in loop below...
my $cutoff = 0;
my %freq;
open(FREQ, "<:utf8", $ARGV[2]) or die "Could not open frequency list: $!";
while (<FREQ>) {
chomp;
(my $c, my $w) = /^ *([0-9]+) (.+)$/;
next if ($w =~ /^[htn]-/); # handled by so-called "elision rules"
next if ($w =~ /^[bdm]'[aeiouáéíóúAEIOUÁÉÍÓÚ]/);
next if ($w =~ /[.]/);
my $lowered = lcfirst($w);
if (exists($dict{$lowered})) {
$w = $lowered;
}
else {
next unless exists($dict{$w});
}
if ($w =~ /\p{Ll}.*\p{Lu}/) { # hÉireann -> héireann
# tAcht -> tacht, but that's ok; idea is for end users to correct, as
# lame as that is...
$w = lc($w);
}
if (exists($freq{$w})) {
$freq{$w} += $c;
}
else {
$freq{$w} = $c;
$count++;
if ($count == $max) {
print STDERR "Cutoff set to $c\n";
$cutoff = $c;
}
}
}
close FREQ;
open(INCLUSION, ">:utf8", "$ARGV[0]_inclusion-utf8.txt") or die "Could not open inclusion file: $!";
$count = 0;
for my $k (sort keys %freq) {
next if ($freq{$k} <= $cutoff);
next if ($k =~ / /);
last if ($count >= $max);
print INCLUSION "$k\n";
$count++;
}
close INCLUSION;
open(CORPUS, ">:utf8", "$ARGV[0]_corpus-utf8.txt") or die "Could not open corpus file: $!";
$count = 0;
for my $k (sort keys %freq) {
next if ($freq{$k} <= $cutoff);
last if ($count >= $max);
my $num = $freq{$k};
for (1..$num) {
print CORPUS "$k ,\n";
}
$count++;
}
close CORPUS;
exit 0;