-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Waleed Ammar
committed
Oct 30, 2013
1 parent
49395e6
commit 724ed90
Showing
8 changed files
with
208 additions
and
60 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import re | ||
import time | ||
import io | ||
import sys | ||
import argparse | ||
from collections import defaultdict | ||
|
||
# parse/validate arguments
def _parseArguments(argv=None):
    """Parse the command-line options of the corpus splitter.

    argv -- list of argument strings; None lets argparse read sys.argv[1:].
    Returns the argparse namespace with delimiter, firstFilename,
    secondFilename, inputFilename, input_encoding and output_encoding.
    """
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-d", "--delimiter", type=str, default=' ||| ', help="delimiter between the two sides, defaults to ' ||| '")
    argParser.add_argument("-1", "--firstFilename", type=str, help="src output file")
    argParser.add_argument("-2", "--secondFilename", type=str, help="tgt output file")
    argParser.add_argument("-i", "--inputFilename", type=str)
    argParser.add_argument("-ie", "--input_encoding", type=str, default='utf8')
    argParser.add_argument("-oe", "--output_encoding", type=str, default='utf8')
    return argParser.parse_args(argv)


def main(argv=None):
    """Split a two-sided corpus file into one output file per side.

    Every input line is stripped and split on the delimiter. Lines that do not
    yield exactly two sides are skipped silently. A line with an empty side
    prints an error carrying the 1-based input line number and exits with
    status 1. Otherwise each stripped side is written, newline-terminated, to
    its own output file.

    argv -- list of argument strings; None lets argparse read sys.argv[1:].
    """
    args = _parseArguments(argv)

    # The input is opened first so that a missing or unreadable input does not
    # truncate existing output files; `with` closes everything on the error
    # exit as well.
    with io.open(args.inputFilename, encoding=args.input_encoding, mode='r') as inputFile, \
            io.open(args.firstFilename, encoding=args.output_encoding, mode='w') as firstFile, \
            io.open(args.secondFilename, encoding=args.output_encoding, mode='w') as secondFile:
        for lineNumber, line in enumerate(inputFile, 1):
            splits = line.strip().split(args.delimiter)
            if len(splits) != 2:
                continue
            # NOTE(review): because the whole line is stripped before the
            # split, a line with an empty side next to a delimiter that begins
            # or ends with whitespace (such as the default) no longer contains
            # the delimiter and is skipped above instead of being reported
            # below. Confirm whether that is intended.
            firstLine, secondLine = [side.strip() for side in splits]
            if not firstLine or not secondLine:
                print('error: line {0} has an empty side'.format(lineNumber))
                sys.exit(1)
            firstFile.write(u'{0}\n'.format(firstLine))
            secondFile.write(u'{0}\n'.format(secondLine))


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import argparse | ||
import io | ||
import gzip | ||
from collections import defaultdict | ||
from math import log | ||
|
||
# Command line: the Brown clusters file to read, the probability file to
# write, and the encoding shared by both.
argParser = argparse.ArgumentParser()
argParser.add_argument("-brown", help="(input) output of brown clusters file")
argParser.add_argument("-prob", help="(output) log p(word|class) file")
argParser.add_argument("-encoding", default="utf8", help="input and output file encoding, defaults to utf8")
args = argParser.parse_args()

# Pass 1: add up the counts of every class. Each line is expected to hold
# three tab-separated fields: class, word, count.
marginals = defaultdict(int)
with io.open(args.brown, encoding=args.encoding) as brown_file:
    for record in brown_file:
        cluster, _, frequency = record.split('\t')
        marginals[cluster] += int(frequency)

# Pass 2: re-read the same file and emit "class<TAB>word<TAB>log(count / class total)"
# for every line.
with io.open(args.brown, encoding=args.encoding) as brown_file:
    with io.open(args.prob, encoding=args.encoding, mode='w') as prob_file:
        for record in brown_file:
            cluster, token, frequency = record.split('\t')
            # multiplying by 1.0 forces true division under Python 2
            relativeFrequency = int(frequency) * 1.0 / marginals[cluster]
            prob_file.write(u'{0}\t{1}\t{2}\n'.format(cluster, token, log(relativeFrequency)))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import re | ||
import time | ||
import io | ||
import sys | ||
import argparse | ||
from collections import defaultdict | ||
|
||
# parse/validate arguments
def _parseArguments(argv=None):
    """Parse the command-line options of the line-wise file merger.

    argv -- list of argument strings; None lets argparse read sys.argv[1:].
    Returns the argparse namespace; thirdFilename is '' when no third file
    was given.
    """
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-d", "--delimiter", default='\t', help="delimiter placed between the merged lines, defaults to a tab character")
    argParser.add_argument("-1", "--firstFilename")
    argParser.add_argument("-2", "--secondFilename")
    argParser.add_argument("-3", "--thirdFilename", default='')
    argParser.add_argument("-o", "--outputFilename")
    argParser.add_argument("-ie", "--input_encoding", default='utf8')
    argParser.add_argument("-oe", "--output_encoding", default='utf8')
    return argParser.parse_args(argv)


def _pasteLines(firstFile, secondFile, thirdFile, outputFile, delimiter):
    """Write one delimiter-joined output line per line of firstFile.

    thirdFile may be None, in which case only two columns are written.
    Prints an error with the 1-based line number and exits with status 1 when
    the second or third file runs out of lines before the first one does.
    """
    for lineNumber, firstLine in enumerate(firstFile, 1):
        # readline() returns an empty string only at end of file; a blank
        # line still carries its newline character.
        secondLine = secondFile.readline()
        if not secondLine:
            print('error: second file is shorter than first file at line {0}'.format(lineNumber))
            sys.exit(1)
        columns = [firstLine.strip(), secondLine.strip()]
        if thirdFile is not None:
            thirdLine = thirdFile.readline()
            if not thirdLine:
                print('error: third file is shorter than first file at line {0}'.format(lineNumber))
                sys.exit(1)
            columns.append(thirdLine.strip())
        outputFile.write(u'{0}\n'.format(delimiter.join(columns)))


def main(argv=None):
    """Merge two or three files line by line into a delimited output file.

    Line i of the output is the stripped i-th lines of the input files joined
    with the delimiter. The first file determines how many lines are written.

    argv -- list of argument strings; None lets argparse read sys.argv[1:].
    """
    args = _parseArguments(argv)

    thirdFile = None
    with io.open(args.firstFilename, encoding=args.input_encoding, mode='r') as firstFile, \
            io.open(args.secondFilename, encoding=args.input_encoding, mode='r') as secondFile:
        try:
            if args.thirdFilename:
                thirdFile = io.open(args.thirdFilename, encoding=args.input_encoding, mode='r')
            # The output is opened last so that an unreadable input does not
            # truncate an existing output file.
            with io.open(args.outputFilename, encoding=args.output_encoding, mode='w') as outputFile:
                _pasteLines(firstFile, secondFile, thirdFile, outputFile, args.delimiter)
        finally:
            # the optional third file is closed on success and on error exit
            if thirdFile is not None:
                thirdFile.close()


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
import re | ||
import time | ||
import io | ||
import sys | ||
import argparse | ||
from collections import defaultdict | ||
|
||
# parse/validate arguments
def _parseArguments(argv=None):
    """Parse the command-line options of the long-line pruner.

    argv -- list of argument strings; None lets argparse read sys.argv[1:].
    Returns the argparse namespace; `tokens` is already converted to int, so
    a missing or non-numeric limit is rejected before any file is touched.
    """
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-tokens", type=int, required=True, help="prune line if it has more than this many tokens")
    argParser.add_argument("-in", "--input_filename", type=str, help="input filename")
    argParser.add_argument("-out", "--output_filename", type=str, help="output filename")
    argParser.add_argument("-ie", "--input_encoding", type=str, default='utf8')
    argParser.add_argument("-oe", "--output_encoding", type=str, default='utf8')
    return argParser.parse_args(argv)


def main(argv=None):
    """Copy the input file to the output file, dropping over-long lines.

    A line is kept, unchanged, when it has at most `-tokens`
    whitespace-separated tokens; otherwise it is dropped. The number of
    dropped lines is printed once the copy is finished.

    argv -- list of argument strings; None lets argparse read sys.argv[1:].
    """
    args = _parseArguments(argv)
    maxTokens = args.tokens

    prunedCount = 0
    # The input is opened first so that a missing input does not truncate an
    # existing output file; both handles are closed by the `with` statement.
    with io.open(args.input_filename, encoding=args.input_encoding, mode='r') as inputFile, \
            io.open(args.output_filename, encoding=args.output_encoding, mode='w') as outputFile:
        for line in inputFile:
            if len(line.split()) <= maxTokens:
                outputFile.write(line)
            else:
                prunedCount += 1

    print('{0} lines pruned out'.format(prunedCount))


if __name__ == '__main__':
    main()
This file was deleted.
Oops, something went wrong.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import re | ||
import time | ||
import io | ||
import sys | ||
import argparse | ||
from collections import defaultdict | ||
|
||
# parse/validate arguments
def _parseArguments(argv=None):
    """Parse the command-line options of the train/dev/test splitter.

    argv -- list of argument strings; None lets argparse read sys.argv[1:].
    Returns the argparse namespace.
    """
    argParser = argparse.ArgumentParser()
    argParser.add_argument("-ratio", "--ratio", type=str, help="train:dev:test ratio e.g. 1000:1:1")
    argParser.add_argument("-corpus-src", "--corpusSrcFilename", type=str, help="input corpus file (src side)")
    argParser.add_argument("-corpus-tgt", "--corpusTgtFilename", type=str, help="input corpus file (tgt side)")
    argParser.add_argument("-train-src", "--trainSrcFilename", type=str, help="output train filename (src side)")
    argParser.add_argument("-train-tgt", "--trainTgtFilename", type=str, help="output train filename (tgt side)")
    argParser.add_argument("-dev-src", "--devSrcFilename", type=str, help="output dev filename (src side)")
    argParser.add_argument("-dev-tgt", "--devTgtFilename", type=str, help="output dev filename (tgt side)")
    argParser.add_argument("-test-src", "--testSrcFilename", type=str, help="output test filename (src side)")
    argParser.add_argument("-test-tgt", "--testTgtFilename", type=str, help="output test filename (tgt side)")
    return argParser.parse_args(argv)


def _parseRatio(ratio):
    """Convert a 'train:dev:test' string such as '1000:1:1' into three ints.

    Prints an error and exits with status 1 when a size is negative or when
    the sizes add up to zero. A string that does not consist of exactly three
    integers raises ValueError.
    """
    trainSize, devSize, testSize = [int(size) for size in ratio.split(':')]
    if min(trainSize, devSize, testSize) < 0 or trainSize + devSize + testSize <= 0:
        print('error: sizes don\'t make sense')
        sys.exit(1)
    return trainSize, devSize, testSize


def main(argv=None):
    """Deal a parallel corpus out into train, dev and test files.

    The corpus is consumed in cycles of train+dev+test sentence pairs: within
    each cycle the first `train` pairs go to the train files, the next `dev`
    pairs to the dev files and the remaining `test` pairs to the test files.
    All files are read and written as utf8.

    argv -- list of argument strings; None lets argparse read sys.argv[1:].
    """
    args = _parseArguments(argv)

    # The ratio is validated before any file is opened so that a bad ratio
    # cannot truncate existing output files.
    trainSize, devSize, testSize = _parseRatio(args.ratio)
    cycleLength = trainSize + devSize + testSize

    handles = []

    def openCorpus(filename, mode):
        # remember every handle so the finally clause can close all of them
        handle = io.open(filename, encoding='utf8', mode=mode)
        handles.append(handle)
        return handle

    try:
        srcInput = openCorpus(args.corpusSrcFilename, 'r')
        tgtInput = openCorpus(args.corpusTgtFilename, 'r')
        srcTrain = openCorpus(args.trainSrcFilename, 'w')
        tgtTrain = openCorpus(args.trainTgtFilename, 'w')
        srcDev = openCorpus(args.devSrcFilename, 'w')
        tgtDev = openCorpus(args.devTgtFilename, 'w')
        srcTest = openCorpus(args.testSrcFilename, 'w')
        tgtTest = openCorpus(args.testTgtFilename, 'w')

        position = 0  # 1-based position of the current pair within its cycle
        for srcLine in srcInput:
            # NOTE(review): when the tgt corpus has fewer lines than the src
            # corpus, readline() returns '' and nothing is written on the tgt
            # side, leaving the outputs misaligned. Confirm whether this
            # should be an error.
            tgtLine = tgtInput.readline()
            position += 1
            if position <= trainSize:
                srcOutput, tgtOutput = srcTrain, tgtTrain
            elif position <= trainSize + devSize:
                srcOutput, tgtOutput = srcDev, tgtDev
            else:
                srcOutput, tgtOutput = srcTest, tgtTest
            srcOutput.write(srcLine)
            tgtOutput.write(tgtLine)
            # Start a new cycle whichever portion received the last pair, so
            # that ratios with an empty test (or dev) share keep cycling.
            if position == cycleLength:
                position = 0
    finally:
        for handle in handles:
            handle.close()


if __name__ == '__main__':
    main()