Skip to content

Commit 10f068f

Browse files
committed
Added cleanup script
1 parent 692c0a6 commit 10f068f

File tree

1 file changed

+36
-0
lines changed

1 file changed

+36
-0
lines changed

clean.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""Skip-thought dataset preprocessor.
2+
3+
Every file in the input directory will be overwritten to a clean version where
4+
each line is a sentence with letters, numbers, hyphens and/or apostrophes.
5+
6+
Usage: python clean.py [output_dir (defaults to ./books)]
7+
"""
8+
9+
import os
10+
import re
11+
import string
12+
import sys
13+
14+
15+
def sentences(s):
    """Convert a string of text to lines of cleaned sentences.

    The text is split on periods. Within each piece, every character other
    than an ASCII letter, digit, space, apostrophe or hyphen is replaced by a
    space, runs of spaces are collapsed to one, and surrounding spaces are
    stripped. Pieces that end up empty (for example after a trailing period
    or inside "...") are dropped so the result contains no blank lines.

    Args:
        s: The raw text to clean.

    Returns:
        The cleaned sentences joined by newlines, one sentence per line.
    """
    # Compile once, outside the loop; the trailing '-' in the class is a
    # literal hyphen, not a range.
    disallowed = re.compile(r"[^A-Za-z0-9 '-]")
    space_run = re.compile(r"[ ]+")
    lines = []
    for piece in s.split('.'):
        cleaned = space_run.sub(" ", disallowed.sub(" ", piece)).strip()
        # Skip pieces with no content so no blank lines are written.
        if cleaned:
            lines.append(cleaned)
    return '\n'.join(lines)
24+
25+
26+
# Directory whose files are cleaned in place; the first command-line
# argument overrides the default.
output_dir = './books'
if len(sys.argv) > 1:
    output_dir = sys.argv[1]

filenames = os.listdir(output_dir)
for filename in filenames:
    path = os.path.join(output_dir, filename)
    # os.listdir also returns subdirectories; only regular files can be read
    # and rewritten, so skip everything else instead of failing mid-run.
    if not os.path.isfile(path):
        continue
    with open(path) as f:
        contents = f.read()
    # Clean before reopening for writing: mode 'w' truncates the file, so a
    # failure while cleaning must not leave it empty.
    cleaned = sentences(contents)
    with open(path, 'w') as f:
        f.write(cleaned)
36+

0 commit comments

Comments
 (0)