-
Notifications
You must be signed in to change notification settings - Fork 1
/
dict-po-builder.pl
141 lines (131 loc) · 4.01 KB
/
dict-po-builder.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/perl
# HACKED VERSION of enwn2gawn.pl....
# Used to output PO files to be translated by humans for the dictionary
use strict;
use warnings;
use locale;
# %wnga maps a sense_key to an array ref of ga (Irish) words; see makewn2ga.pl.
# Each line of wn2ga.txt is expected to look like:
#     sense_key|word1,word2,...
my %wnga;
open(my $wnga_fh, "<", "wn2ga.txt") or die "Could not open wn2ga.txt: $!\n";
while (my $line = <$wnga_fh>) {
    chomp $line;
    # Only trust the captures when the match succeeded; a blank or malformed
    # line would otherwise store an entry under an undefined key.
    my ($sk, $focail) = $line =~ /^([^|]+)\|(.+)$/
        or do {
            warn "Skipping malformed line $. of wn2ga.txt\n";
            next;
        };
    $wnga{$sk} = [ split /,/, $focail ];
}
close $wnga_fh;
# freq data used for sorting Irish words within a synset.
# %gafreq maps a word to its count; each line of roget.txt is expected to be
#     <count> <word>
# where <count> is a run of decimal digits and <word> is the rest of the line.
my %gafreq;
open(my $roget_fh, "<", "roget.txt") or die "Could not open roget.txt: $!\n";
while (my $line = <$roget_fh>) {
    chomp $line;
    # Skip lines that do not match rather than storing an undefined key/value.
    my ($cnt, $word) = $line =~ /^([0-9]+) (.+)$/
        or do {
            warn "Skipping malformed line $. of roget.txt\n";
            next;
        };
    $gafreq{$word} = $cnt;
}
close $roget_fh;
# just need index.sense for the adjective lookups.
# %adjlookup maps "lemma|synset_offset" to the full sense_key, but only for
# sense keys whose ss_type field is 5.  Each line of index.sense is expected
# to be:
#     sense_key offset(8 digits) sense_number count
my %adjlookup;
open(my $sense_fh, "<", "index.sense") or die "Could not open index.sense: $!\n";
while (my $line = <$sense_fh>) {
    chomp $line;
    # The sense number and count columns are validated by the pattern but
    # are not needed here, so they are not captured.
    my ($sense_key, $offset) = $line =~ /^([^ ]+) ([0-9]{8}) [0-9]+ [0-9]+$/
        or do {
            warn "Skipping malformed line $. of index.sense\n";
            next;
        };
    # sense_key starts with lemma%ss_type:lex_filenum:lex_id
    my ($lemma, $ss_type) = $sense_key =~ /^([^%]+)%([1-5]):[0-9][0-9]:[0-9][0-9]/
        or do {
            warn "Skipping unparseable sense key on line $. of index.sense\n";
            next;
        };
    if ($ss_type == 5) {
        $adjlookup{"$lemma|$offset"} = $sense_key;
    }
}
close $sense_fh;
# Translates the one-letter ss_type found in the data files into the digit
# used in the ss_type position of a sense key (needed to rebuild sense keys).
my %pos_codes;
@pos_codes{ 'n', 'v', 'a', 'r', 's' } = ( '1', '2', '3', '4', '5' );
# Score table for the synset currently being processed: word => numeric score.
my %irish_words;

# Comparator for sort(): orders keys of %irish_words by descending score,
# breaking ties between equal scores with an ascending string comparison.
sub my_sort {
    my $score_a = $irish_words{$a};
    my $score_b = $irish_words{$b};
    return $score_a == $score_b
        ? $a cmp $b
        : $score_b <=> $score_a;
}
# process_data_file($file)
#
# Reads the synset records in $file (one of the data.* files) and writes a PO
# template named "ga-$file.pot".  For every synset that has at least one Irish
# word in %wnga, one PO entry is emitted:
#
#     # <Irish words, best-scored first>
#     # <English lemmas of the synset>
#     msgctxt "<synset_offset> <ss_type>"
#     msgid "<gloss without the quoted examples>"
#     msgstr ""
#
# Reads the file-level tables %wnga, %gafreq, %adjlookup and %pos_codes, and
# uses %irish_words (shared with my_sort) as per-synset scratch space.
# Dies if either file cannot be opened or the output cannot be closed.
sub process_data_file
{
    my ($file) = @_;
    open(my $data_fh, "<", $file) or die "Could not open $file: $!\n";
    my $outputfile = "ga-$file.pot";
    open(my $pot_fh, ">", $outputfile) or die "Could not open $outputfile: $!\n";
    print {$pot_fh} "msgid \"\"\nmsgstr \"Content-Type: text/plain; charset=UTF-8\\n\"\n\n";

    while (my $line = <$data_fh>) {
        chomp $line;
        next if $line =~ /^ /;    # lines starting with a space are not synset records

        # Skip anything that is not a well-formed synset record instead of
        # carrying undefined fields through the rest of the loop.
        my ($synset_offset, $lex_filenum, $ss_type, $w_cnt, $rest) =
            $line =~ /^([0-9]{8}) ([0-9][0-9]) ([nvasr]) ([0-9a-f][0-9a-f]) (.+)$/
            or next;

        my $word_count = hex($w_cnt);
        my $pos = $ss_type;
        $pos = 'a' if ($pos eq 's');
        $pos = 'adv' if ($pos eq 'r');
        %irish_words = ();
        my @lemmas;

        for my $word_num (1 .. $word_count) {
            # Each word is "<lemma> <lex_id hex digit> ".  Stop if the record
            # has fewer words than announced; $1/$2 would otherwise hold the
            # captures of an earlier, unrelated match.
            unless ($rest =~ s/^([^ ]+) ([0-9a-z]) //) {
                warn "Synset $synset_offset in $file: could not read word $word_num of $word_count\n";
                last;
            }
            my ($lemma, $lex_id_hex) = ($1, $2);
            $lemma =~ s/\([a-z]+\)$//; # (s) or (a) only: "syntactic marker"
            push @lemmas, $lemma;

            my $sense_key;
            if ($ss_type eq 's') {
                $sense_key = $adjlookup{"\L$lemma"."|$synset_offset"};
            }
            else {
                # for non-adjs, rebuild the sense_key just from data in data.*
                my $ss_num_type = $pos_codes{$ss_type};
                my $lex_id = sprintf("%02d", hex($lex_id_hex));
                $sense_key = "\L$lemma".'%'.$ss_num_type.':'.$lex_filenum.':'.$lex_id.'::';
                # should be same as adjlookup as in 's' case
            }
            next unless (defined($sense_key) && exists($wnga{$sense_key}));

            foreach my $stored (@{$wnga{$sense_key}}) {
                # Work on a copy: the loop variable aliases the element kept
                # in %wnga, so editing it in place would corrupt the table.
                my $ir = $stored;
                $ir =~ s/\+/\/$pos+/ unless ($ir =~ / /); # to match gafreq
                $irish_words{$ir}++;
            }
        }

        next unless (scalar(keys %irish_words) > 0);

        my $enstring = join(', ', @lemmas);
        $enstring =~ s/_/ /g;

        foreach my $ir (keys %irish_words) {
            # Entries look like "<freqkey>+<tot>+..."; the occurrence count is
            # scaled by 12/<tot> and boosted by the log of the corpus frequency.
            if ($ir =~ m/^([^+]+)\+([0-9]+)\+/ && $2 > 0) {
                my ($freqkey, $tot) = ($1, $2);
                $irish_words{$ir} *= 12; # tune up as gafreq corpus grows!
                $irish_words{$ir} /= $tot;
                if (exists($gafreq{$freqkey})) {
                    $irish_words{$ir} += log($gafreq{$freqkey}+1);
                }
            }
            else {
                # No usable "+<tot>+" field: keep the raw occurrence count
                # rather than dividing by a stale or zero value.
                warn "Synset $synset_offset in $file: cannot parse weight of '$ir'\n";
            }
        }

        my @ranked;
        foreach my $ir (sort my_sort keys %irish_words) {
            my $display = $ir;
            $display =~ s/\+.*//;
            $display =~ s/\/.*//;
            push @ranked, $display;
        }
        my $gastring = join(', ', @ranked);

        print {$pot_fh} "# $gastring\n";
        print {$pot_fh} "# $enstring\n";
        print {$pot_fh} "msgctxt \"$synset_offset $ss_type\"\n";

        # What is left of $rest is "<pointers> | <gloss>"; keep the gloss,
        # drop everything from the first '; "' on, and trim trailing blanks.
        $rest =~ s/^[^|]+\| //;
        $rest =~ s/; ".+$//;
        $rest =~ s/ +$//;
        # PO strings use C-style escapes: escape backslashes first so the
        # backslashes added for the quotes are not doubled.
        $rest =~ s/\\/\\\\/g;
        $rest =~ s/"/\\"/g;
        print {$pot_fh} "msgid \"$rest\"\nmsgstr \"\"\n\n";
    }

    close $data_fh;
    # Buffered write errors only surface at close, so check it.
    close $pot_fh or die "Could not close $outputfile: $!\n";
}
# Build one PO template per data file, in this fixed order.
foreach my $data_file ('data.adj', 'data.adv', 'data.noun', 'data.verb') {
    process_data_file($data_file);
}