-
Notifications
You must be signed in to change notification settings - Fork 0
/
builder
executable file
·43 lines (36 loc) · 1.04 KB
/
builder
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
#!/usr/bin/perl
# Used in makefile to make veicteoir.db
#
# Not installed as part of package, only used for development
use strict;
use warnings;
use utf8;
use Lingua::GA::Gramadoir;
# Both output streams carry decoded text, so encode them as UTF-8.
binmode $_, ":utf8" for ( \*STDOUT, \*STDERR );
# STDIN is deliberately left as raw bytes (compare
# gramadoir/gr/bin/abairti-utf and gram-xx.pl): the gramadoir module
# expects bytes, even though those bytes are really a UTF-8 stream, and
# relies on its "input_encoding" option to do the conversion.
binmode STDIN, ":bytes";
# Irish tokenizer works sufficiently well on English sentences too.
# Called as Class->new rather than with indirect-object syntax, which the
# parser can misread.
my $gr = Lingua::GA::Gramadoir->new(input_encoding => 'utf-8');
# token => concatenated " tag" entries, one per occurrence seen on STDIN
my %alltokes;
# set of words that must never be indexed (keys are the words themselves)
my %stopwords;
# Load the stop list from ./stoplist, one word per line.
# A lexical handle avoids a package-global bareword, the strict
# :encoding(UTF-8) layer validates the input, and $! reports why the open
# failed.
open(my $stops, "<:encoding(UTF-8)", 'stoplist')
    or die "Could not open stop word file: $!\n";
while (my $word = <$stops>) {
    chomp $word;
    $stopwords{$word}++;
}
close $stops;
# Read "tag sentence" lines from STDIN and record, for every token of at
# least two characters that is not a stop word, the tag of each sentence
# it occurs in (a tag is appended once per occurrence).
while (my $line = <STDIN>) {
    chomp $line;
    my ( $tag, $sentence ) = $line =~ m/^([^ ]+) (.*)$/;
    # A line without the "tag<space>sentence" shape would otherwise feed
    # undef to the tokenizer and append an undefined tag to every token.
    # Only the line number is reported: the line itself is raw bytes and
    # STDERR is a UTF-8 encoding handle.
    unless (defined $tag) {
        warn "Skipping malformed input line $.\n";
        next;
    }
    my $tokes = $gr->tokenize($sentence);
    foreach my $toke (@$tokes) {
        # ignore single-character tokens
        next unless $toke =~ /../;
        # fold ASCII and Irish accented capitals to lower case
        (my $word = $toke) =~ tr/A-ZÁÉÍÓÚ/a-záéíóú/;
        $alltokes{$word} .= " $tag" unless exists $stopwords{$word};
    }
}
# Write one line per token: the token followed by its space-separated tags.
# Keys are sorted because Perl randomizes hash order per process; sorting
# makes the generated file identical from one build to the next.
foreach my $toke (sort keys %alltokes) {
    print "$toke$alltokes{$toke}\n";
}