# analyze-transliterations.py
import re
import time
import io
import sys
import argparse
from collections import defaultdict
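# usage: python analyze-transliterations.py -m2m <m2mFilename>
# the input file is read as utf8, one named entity per line: whitespace-separated source tokens, a tab, then whitespace-separated target tokens. a '<space>' token marks a word boundary. the script prints the average number of words and of tokens ("chars") per named entity for the source and the target side.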
# token that marks a word boundary inside one side of an m2m line
SPACE_TOKEN = u'<space>'


def summarizeM2m(lines):
    """Count words and tokens over an iterable of m2m-formatted lines.

    Each non-blank line must hold exactly two tab-separated columns (source,
    then target); each column is a whitespace-separated token sequence. Every
    token counts as one "char" (including '<space>' tokens), and each side
    has one word plus one more per '<space>' token.

    Returns a tuple
    (linesCount, totalSrcWords, totalTgtWords, totalSrcChars, totalTgtChars).

    Raises ValueError, naming the offending line number, when a non-blank
    line does not have exactly two tab-separated columns.
    """
    totalSrcWords, totalTgtWords = 0, 0
    totalSrcChars, totalTgtChars = 0, 0
    linesCount = 0
    for lineNumber, line in enumerate(lines, 1):
        if not line.strip():
            # tolerate blank lines (e.g. an empty last line) instead of
            # failing on the tab split below
            continue
        columns = line.rstrip('\r\n').split('\t')
        if len(columns) != 2:
            raise ValueError(
                'line {0}: expected 2 tab-separated columns, found {1}'.format(
                    lineNumber, len(columns)))
        srcChars, tgtChars = columns[0].split(), columns[1].split()
        linesCount += 1
        # a side always has one word, plus one per '<space>' separator token
        totalSrcWords += 1 + srcChars.count(SPACE_TOKEN)
        totalTgtWords += 1 + tgtChars.count(SPACE_TOKEN)
        totalSrcChars += len(srcChars)
        totalTgtChars += len(tgtChars)
    return (linesCount, totalSrcWords, totalTgtWords, totalSrcChars, totalTgtChars)


def main(argv=None):
    """Parse the command line, read the m2m file and print the averages.

    argv is the argument list to parse; None means the process arguments.
    """
    # parse/validate arguments
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-m2m", "--m2mFilename", type=str, required=True, help="filename that contains transliterations in the input m2m format")
    args = argParser.parse_args(argv)

    # the with-block closes the file even if a malformed line raises
    with io.open(args.m2mFilename, encoding='utf8') as m2mFile:
        (linesCount, totalSrcWords, totalTgtWords,
         totalSrcChars, totalTgtChars) = summarizeM2m(m2mFile)

    if linesCount == 0:
        # averages are undefined without any entity; avoid dividing by zero
        sys.exit('no named entities found in {0}'.format(args.m2mFilename))

    # single-argument print(...) emits the same text under python 2 and 3
    print('avg src words per named entity = {0}'.format(1.0 * totalSrcWords / linesCount))
    print('avg tgt words per named entity = {0}'.format(1.0 * totalTgtWords / linesCount))
    print('')
    print('avg src chars per named entity = {0}'.format(1.0 * totalSrcChars / linesCount))
    print('avg tgt chars per named entity = {0}'.format(1.0 * totalTgtChars / linesCount))
    print('')


if __name__ == '__main__':
    main()