
Commit d4f701c

Moved all data to a parent directory
1 parent: 9902634

File tree

5 files changed (+29, -10 lines)

.gitignore
README.md
clean.py
smashwords.py
train.py


.gitignore

Lines changed: 1 addition & 2 deletions
@@ -12,7 +12,6 @@ Icon?
 
 # Skip Thoughts
 word2vecModel*
-books/
-books_tf/
+data/
 output/
 

README.md

Lines changed: 21 additions & 3 deletions
@@ -9,24 +9,32 @@ This code is written for python 3.6. To download, clone this repository:
 git clone https://github.com/danielwatson6/skip-thoughts.git
 ```
 
+### Obtaining the training data
+
 To obtain the training data, navigate to [https://www.smashwords.com/] and navigate the website to restrict the books to obtain to the desired categories (e.g. only free books of >=20,000 word length, of a certain genre, etc.). The resulting URL in the browser with the paginated list of books can be passed to this script to download all books in English that are available in text file format:
 ```bash
-python smashwords.py [URL] [SAVE_DIRECTORY (defaults to ./books)]
+python smashwords.py [URL] [SAVE_DIRECTORY (defaults to data/books)]
 # Example: python smashwords.py https://www.smashwords.com/books/category/1/newest/0/free/medium
 ```
 
 We use Google's pre-trained 300-dimensional word vectors, which can be downloaded [here](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing) (this link just mirrors the download link of the [official website](https://code.google.com/archive/p/word2vec/)). The general model is of course independent of what word vectors are fed.
 
+### Preprocessing
+
 To clean the training data, there is a script provided that will textually normalize (sentences are extracted and only alphanumerics and apostrophes are kept) all the files in the input directory, convert the words to unique integer IDs according to the provided word vector model, and save them in the TensorFlow binary format. See the help page for further details:
 ```bash
 python clean.py --help
 ```
 
+### Training the model
+
 To train the model, change hyperparameters, or get sentence embeddings, see the help page of the training script:
 ```bash
 python train.py --help
 ```
 
+### Running a trained model
+
 To use the model in any python script, follow this basic pattern:
 ```python
 import tensorflow as tf
@@ -43,13 +51,23 @@ with graph.as_default():
   model = SkipThoughts(word2vec_model, **kwargs)
 
 with tf.Session(graph=graph):
-  # Restore the model only once:
-  model.restore(save_dir)  # pass in the directory where the .ckpt files live.
+  # Restore the model only once.
+  # Here, `save_dir` is the directory where the .ckpt files live. Typically
+  # this would be "output/mymodel" where --model_name=mymodel in train.py.
+  model.restore(save_dir)
 
   # Run the model like this as many times as desired.
   print(model.encode(sentence_strings))
 ```
 
+### Evaluating a trained model
+
+We provide an evaluation script to test the quality of the sentence vectors
+produced by the trained model.
+
+
+
+
 ## Dependencies
 
 All the dependencies are listed in the `requirements.txt` file. They can be installed with `pip` as follows:
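One usage note beyond the diff itself: the README pattern above leaves `word2vec_model` and `save_dir` undefined. The following is a minimal sketch of how they could be filled in, assuming gensim is used to load the downloaded word vectors and that the import path `skip_thoughts` and the checkpoint directory `output/mymodel` are placeholders; the repository's actual module name, file formats, and constructor hyperparameters may differ.

```python
import os

import tensorflow as tf
from gensim.models import KeyedVectors

# Hypothetical import path; the class name comes from the README pattern.
from skip_thoughts import SkipThoughts

# Assumption: the pre-trained vectors sit at the scripts' default path "word2vecModel"
# and are in binary word2vec format.
word2vec_model = KeyedVectors.load_word2vec_format("word2vecModel", binary=True)

# Matches the README comment: checkpoints live in output/[model_name].
save_dir = os.path.join("output", "mymodel")

graph = tf.Graph()
with graph.as_default():
    # Pass the same hyperparameters used during training (the README's **kwargs).
    model = SkipThoughts(word2vec_model)

with tf.Session(graph=graph):
    model.restore(save_dir)
    print(model.encode(["A sentence to embed.", "Another sentence."]))
```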

clean.py

Lines changed: 2 additions & 2 deletions
@@ -21,9 +21,9 @@
                     help="Keep only the n most common words of the training data.")
 parser.add_argument('--max_length', type=int, default=40,
                     help="Truncate input and output sentences to maximum length n.")
-parser.add_argument('--input', type=str, default="books",
+parser.add_argument('--input', type=str, default="data/books",
                     help="Path to the directory containing the text files.")
-parser.add_argument('--output', type=str, default="books_tf",
+parser.add_argument('--output', type=str, default="data/books_tf",
                     help="Path to the directory that will contain the TFRecord files.")
 parser.add_argument('--embeddings_path', type=str, default="./word2vecModel",
                     help="Path to the pre-trained word embeddings model.")

smashwords.py

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 """Book scraping script for smashwords.com.
 
-Usage: python smashwords.py [scrape_link] [output_dir (defaults to ./books)]
+Usage: python smashwords.py [scrape_link] [output_dir (defaults to data/books)]
 """
 
 import os
@@ -25,7 +25,7 @@ def to_filename(s):
 
 if __name__ == '__main__':
 
-  write_dir = 'books'
+  write_dir = 'data/books'
   if len(sys.argv) > 2:
     write_dir = sys.argv[2]
 

train.py

Lines changed: 3 additions & 1 deletion
@@ -1,3 +1,5 @@
+"""Script for training the skip-thoughts model."""
+
 import argparse
 import itertools
 import os
@@ -50,7 +52,7 @@
 # Configuration args
 parser.add_argument('--embeddings_path', type=str, default="word2vecModel",
                     help="Path to the pre-trained word embeddings model.")
-parser.add_argument('--input', type=str, default="books_tf",
+parser.add_argument('--input', type=str, default="data/books_tf",
                     help="Path to the directory containing the dataset TFRecord files.")
 parser.add_argument('--model_name', type=str, default="default",
                     help="Will save/restore model in ./output/[model_name].")
