Skip to content

Commit

Permalink
commonly used scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
Waleed Ammar committed Oct 30, 2013
1 parent 49395e6 commit 724ed90
Show file tree
Hide file tree
Showing 8 changed files with 208 additions and 60 deletions.
37 changes: 37 additions & 0 deletions horizontal-split-parallel-corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import re
import time
import io
import sys
import argparse
from collections import defaultdict

# Split a single-file parallel corpus into two line-aligned files.
# Every input line is expected to look like "<src><delimiter><tgt>"; the src
# side goes to --firstFilename and the tgt side to --secondFilename.

# parse/validate arguments
argParser = argparse.ArgumentParser()
argParser.add_argument("-d", "--delimiter", type=str, default=' ||| ', help="delimiter defaults to ' ||| '")
argParser.add_argument("-1", "--firstFilename", type=str, help="src output file")
argParser.add_argument("-2", "--secondFilename", type=str, help="tgt output file")
argParser.add_argument("-i", "--inputFilename", type=str)
argParser.add_argument("-ie", "--input_encoding", type=str, default='utf8')
argParser.add_argument("-oe", "--output_encoding", type=str, default='utf8')
args = argParser.parse_args()

# The input is opened first so that a missing input file does not truncate the
# outputs; the with-statement closes all three files even on the error exit.
with io.open(args.inputFilename, encoding=args.input_encoding, mode='r') as inputFile, \
        io.open(args.firstFilename, encoding=args.output_encoding, mode='w') as firstFile, \
        io.open(args.secondFilename, encoding=args.output_encoding, mode='w') as secondFile:
    # lineNumber is the 1-based position in the input file, so the error
    # message stays accurate even when earlier malformed lines were skipped.
    for lineNumber, line in enumerate(inputFile, 1):
        splits = line.strip().split(args.delimiter)
        # Lines that do not contain exactly one delimiter are dropped from
        # both outputs, which keeps the two files aligned.
        if len(splits) != 2:
            continue
        firstLine, secondLine = splits
        if not firstLine or not secondLine:
            print('error: line {0} has an empty side'.format(lineNumber))
            sys.exit(1)
        firstFile.write(u'{0}\n'.format(firstLine.strip()))
        secondFile.write(u'{0}\n'.format(secondLine.strip()))
12 changes: 10 additions & 2 deletions lowercase.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,17 @@
import io
import sys
import nltk
import argparse

inputFile = io.open(sys.argv[1], encoding='utf8', mode='r')
outputFile = io.open(sys.argv[2], encoding='utf8', mode='w')
argParser = argparse.ArgumentParser()
argParser.add_argument("-input")
argParser.add_argument("-output")
argParser.add_argument("-ie", "--input_encoding", type=str, default='utf8')
argParser.add_argument("-oe", "--output_encoding", type=str, default='utf8')
args = argParser.parse_args()

inputFile = io.open(args.input, encoding=args.input_encoding, mode='r')
outputFile = io.open(args.output, encoding=args.output_encoding, mode='w')

for line in inputFile:
tokens = line.strip().split()
Expand Down
24 changes: 24 additions & 0 deletions normalize-brown-cluster-emission-probs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import argparse
import io
import gzip
from collections import defaultdict
from math import log

# Turn a brown-clusters output file (class <TAB> word <TAB> count per line)
# into a file of log p(word|class), using the same three-column layout.

# command-line options
argParser = argparse.ArgumentParser()
argParser.add_argument("-brown", help="(input) output of brown clusters file")
argParser.add_argument("-prob", help="(output) log p(word|class) file")
argParser.add_argument("-encoding", default="utf8", help="input and output file encoding, defaults to utf8")
args = argParser.parse_args()

# Pass 1: accumulate the total count of each class.
marginals = defaultdict(int)
with io.open(args.brown, encoding=args.encoding) as brown_file:
    for record in brown_file:
        cluster, token, freq = record.split('\t')
        marginals[cluster] = marginals[cluster] + int(freq)

# Pass 2: re-read the counts and write each one normalised by its class total.
with io.open(args.brown, encoding=args.encoding) as brown_file:
    with io.open(args.prob, encoding=args.encoding, mode='w') as prob_file:
        for record in brown_file:
            cluster, token, freq = record.split('\t')
            # "* 1.0" forces float division regardless of interpreter version
            share = int(freq) * 1.0 / marginals[cluster]
            entry = u'{0}\t{1}\t{2}\n'.format(cluster, token, log(share))
            prob_file.write(entry)
44 changes: 44 additions & 0 deletions paste.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import re
import time
import io
import sys
import argparse
from collections import defaultdict

# Paste two (optionally three) files side by side: output line i is line i of
# every input file, each stripped of surrounding whitespace and joined with
# the delimiter. The number of output lines is driven by the first file.

# parse/validate arguments
argParser = argparse.ArgumentParser()
argParser.add_argument("-d", "--delimiter", default='\t', help="delimiter defaults to \t")
argParser.add_argument("-1", "--firstFilename")
argParser.add_argument("-2", "--secondFilename")
argParser.add_argument("-3", "--thirdFilename", default='')
argParser.add_argument("-o", "--outputFilename")
argParser.add_argument("-ie", "--input_encoding", default='utf8')
argParser.add_argument("-oe", "--output_encoding", default='utf8')
args = argParser.parse_args()

firstFile = io.open(args.firstFilename, encoding=args.input_encoding, mode='r')
secondFile = io.open(args.secondFilename, encoding=args.input_encoding, mode='r')
# the third file is optional; None means "only paste two columns"
thirdFile = io.open(args.thirdFilename, encoding=args.input_encoding, mode='r') if args.thirdFilename else None
outputFile = io.open(args.outputFilename, encoding=args.output_encoding, mode='w')

try:
    # counter is the 0-based index of the current line of the first file
    for counter, firstLine in enumerate(firstFile):
        # readline() returns an empty string only at end of file; a blank
        # line still carries its newline character.
        secondLine = secondFile.readline()
        if thirdFile is not None:
            thirdLine = thirdFile.readline()
        if not secondLine:
            print('error: second file is shorter than first file at line {0}'.format(counter))
            sys.exit(1)
        elif thirdFile is not None and not thirdLine:
            print('error: third file is shorter than first file at line {0}'.format(counter))
            sys.exit(1)
        outputFile.write(u'{0}'.format(firstLine.strip()))
        outputFile.write(u'{0}{1}'.format(args.delimiter, secondLine.strip()))
        if thirdFile is not None:
            outputFile.write(u'{0}{1}'.format(args.delimiter, thirdLine.strip()))
        outputFile.write(u'\n')
finally:
    # runs on normal completion and on the error exits above, so every
    # handle (including the optional third file) is released and flushed
    firstFile.close()
    secondFile.close()
    if thirdFile is not None:
        thirdFile.close()
    outputFile.close()
27 changes: 27 additions & 0 deletions prune-long-lines.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import re
import time
import io
import sys
import argparse
from collections import defaultdict

# Copy the input file to the output file, dropping every line that has more
# than -tokens whitespace-separated tokens, then report how many were dropped.

# parse/validate arguments
argParser = argparse.ArgumentParser()
# type=int lets argparse reject a non-numeric limit up front; the option is
# required because there is no sensible default threshold.
argParser.add_argument("-tokens", type=int, required=True, help="prune line if it has more than this many tokens")
argParser.add_argument("-in", "--input_filename", type=str, help="input filename")
argParser.add_argument("-out", "--output_filename", type=str, help="output filename")
argParser.add_argument("-ie", "--input_encoding", type=str, default='utf8')
argParser.add_argument("-oe", "--output_encoding", type=str, default='utf8')
args = argParser.parse_args()

# number of lines that exceeded the token limit
counter = 0
# The input is opened first so a missing input file does not truncate the
# output; both handles are closed when the with-block ends.
with io.open(args.input_filename, encoding=args.input_encoding, mode='r') as inputFile, \
        io.open(args.output_filename, encoding=args.output_encoding, mode='w') as of:
    for line in inputFile:
        if len(line.split()) <= args.tokens:
            # lines are written back unchanged, newline included
            of.write(line)
        else:
            counter += 1

print('{0} lines pruned out'.format(counter))
58 changes: 0 additions & 58 deletions split-parallel-corpus.py

This file was deleted.

File renamed without changes.
66 changes: 66 additions & 0 deletions vertical-split-parallel-corpus.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import re
import time
import io
import sys
import argparse
from collections import defaultdict

# Split a line-aligned parallel corpus (src file + tgt file) into train/dev/test
# portions. Lines are dealt out in repeating cycles of train+dev+test lines:
# within each cycle the first trainSize lines go to train, the next devSize
# lines to dev and the remaining testSize lines to test.

# parse/validate arguments
argParser = argparse.ArgumentParser()
argParser.add_argument("-ratio", "--ratio", type=str, help="train:dev:test ratio e.g. 1000:1:1")
argParser.add_argument("-corpus-src", "--corpusSrcFilename", type=str, help="input corpus file (src side)")
argParser.add_argument("-corpus-tgt", "--corpusTgtFilename", type=str, help="input corpus file (tgt side)")
argParser.add_argument("-train-src", "--trainSrcFilename", type=str, help="output train filename (src side)")
argParser.add_argument("-train-tgt", "--trainTgtFilename", type=str, help="output train filename (tgt side)")
argParser.add_argument("-dev-src", "--devSrcFilename", type=str, help="output dev filename (src side)")
argParser.add_argument("-dev-tgt", "--devTgtFilename", type=str, help="output dev filename (tgt side)")
argParser.add_argument("-test-src", "--testSrcFilename", type=str, help="output test filename (src side)")
argParser.add_argument("-test-tgt", "--testTgtFilename", type=str, help="output test filename (tgt side)")
# same encoding switches as the sibling scripts; the defaults keep the
# previous hard-coded utf8 behaviour
argParser.add_argument("-ie", "--input_encoding", type=str, default='utf8')
argParser.add_argument("-oe", "--output_encoding", type=str, default='utf8')
args = argParser.parse_args()

ratio = args.ratio # 1000:1:1

# Validate the ratio before any output file is opened (and truncated).
[trainSize, devSize, testSize] = [int(size) for size in ratio.split(':')]
cycleLength = trainSize + devSize + testSize
if min(trainSize, devSize, testSize) < 0 or cycleLength == 0:
    print('error: sizes don\'t make sense')
    sys.exit(1)

# The with-statement closes every file even when the script exits early.
with io.open(args.corpusSrcFilename, encoding=args.input_encoding, mode='r') as srcInput, \
        io.open(args.corpusTgtFilename, encoding=args.input_encoding, mode='r') as tgtInput, \
        io.open(args.trainSrcFilename, encoding=args.output_encoding, mode='w') as srcTrain, \
        io.open(args.trainTgtFilename, encoding=args.output_encoding, mode='w') as tgtTrain, \
        io.open(args.devSrcFilename, encoding=args.output_encoding, mode='w') as srcDev, \
        io.open(args.devTgtFilename, encoding=args.output_encoding, mode='w') as tgtDev, \
        io.open(args.testSrcFilename, encoding=args.output_encoding, mode='w') as srcTest, \
        io.open(args.testTgtFilename, encoding=args.output_encoding, mode='w') as tgtTest:
    for lineIndex, srcLine in enumerate(srcInput):
        tgtLine = tgtInput.readline()
        # readline() returns an empty string only at end of file; going on
        # would pair src lines with blank tgt lines.
        if not tgtLine:
            print('error: tgt corpus is shorter than src corpus at line {0}'.format(lineIndex + 1))
            sys.exit(1)
        # Position inside the current cycle. Using a modulo (instead of
        # resetting a counter inside the test branch) keeps the cycle going
        # when testSize or devSize is 0.
        position = lineIndex % cycleLength
        if position < trainSize:
            srcTrain.write(srcLine)
            tgtTrain.write(tgtLine)
        elif position < trainSize + devSize:
            srcDev.write(srcLine)
            tgtDev.write(tgtLine)
        else:
            srcTest.write(srcLine)
            tgtTest.write(tgtLine)

0 comments on commit 724ed90

Please sign in to comment.