File tree Expand file tree Collapse file tree 1 file changed +36
-0
lines changed Expand file tree Collapse file tree 1 file changed +36
-0
lines changed Original file line number Diff line number Diff line change
1
+ """Skip-thought dataset preprocessor.
2
+
3
+ Every file in the given directory is overwritten in place with a cleaned version where
4
+ each line is a sentence with letters, numbers, hyphens and/or apostrophes.
5
+
6
+ Usage: python preprocess.py [output_dir (defaults to ./books)]
7
+ """
8
+
9
+ import os
10
+ import re
11
+ import string
12
+ import sys
13
+
14
+
15
def sentences(s):
    """Convert a string of text to lines of cleaned sentences.

    The text is split on full stops. Within each piece, every character
    other than ASCII letters, digits, spaces, apostrophes and hyphens is
    replaced by a space, and runs of spaces are collapsed to a single one.

    Args:
        s: Raw text to clean.

    Returns:
        The cleaned sentences joined by newlines, one sentence per line.
        Pieces that are empty after cleaning are omitted, so the result
        has no blank lines; it is '' when nothing survives cleaning.
    """
    lines = []
    for piece in s.split('.'):
        # Whitelist filter: anything outside the allowed set (including
        # newlines and tabs inside a sentence) becomes a space.
        cleaned = re.sub(r"[^A-Za-z0-9 '-]", " ", piece)
        cleaned = re.sub(r"[ ]+", " ", cleaned).strip()
        # A trailing '.', an ellipsis or a punctuation-only fragment leaves
        # nothing behind; skip it rather than emit a blank line.
        if cleaned:
            lines.append(cleaned)
    # Join with a bare newline so no line starts with a stray space.
    return '\n'.join(lines)
24
+
25
+
26
# Directory whose files are cleaned in place; the first command-line
# argument, when given, overrides the default.
output_dir = './books'
if len(sys.argv) > 1:
    output_dir = sys.argv[1]

filenames = os.listdir(output_dir)
for filename in filenames:
    path = os.path.join(output_dir, filename)
    # os.listdir also returns sub-directories; opening one raises and would
    # abort the run part-way through, so only regular files are processed.
    if not os.path.isfile(path):
        continue
    # Read the whole file before reopening it for writing: opening with 'w'
    # truncates the file immediately.
    with open(path) as f:
        contents = f.read()
    with open(path, 'w') as f:
        f.write(sentences(contents))
36
+
You can’t perform that action at this time.
0 commit comments