update data_utils

carpedm20 · Jan 23, 2016 · 440964f · 440964f
1 parent 15c5e7b
commit 440964f
Show file tree

Hide file tree

Showing 4 changed files with 30 additions and 17 deletions.
diff --git a/README.md b/README.md
@@ -14,7 +14,13 @@ Prerequisites
 Usage
 -----
 
-First, you need to download [DeepMind Q&A Dataset](https://github.com/deepmind/rc-data) from [here](https://github.com/deepmind/rc-data) or [here](http://cs.nyu.edu/~kcho/DMQA/).
+First, you need to download [DeepMind Q&A Dataset](https://github.com/deepmind/rc-data) from [here](http://cs.nyu.edu/~kcho/DMQA/), save `cnn.tgz` and `dailymail.tgz` into the repo, and run:
+
+    $ ./unzip.sh cnn.tgz dailymail.tgz
+
+Then run the pre-processing code with:
+
+    $ python data_utils.py data cnn
 
 To train a model with `cnn` dataset:
 

diff --git a/data_utils.py b/data_utils.py
@@ -20,9 +20,10 @@
 from __future__ import division
 from __future__ import print_function
 
-import gzip
 import os
 import re
+import sys
+import gzip
 import tarfile
 from tqdm import *
 from glob import glob
@@ -265,4 +266,14 @@ def prepare_data(data_dir, dataset, vocab_size):
   questions_to_token_ids(train_path, vocab_fname, vocab_size)
 
 if __name__ == '__main__':
-  prepare_data('data', 'cnn', 1000000)
+  if len(sys.argv) < 3:  # DATA_DIR and DATASET_NAME are required; VOCAB_SIZE is optional
+    print(" [*] usage: python data_utils.py DATA_DIR DATASET_NAME [VOCAB_SIZE]")
+  else:
+    data_dir = sys.argv[1]
+    dataset_name = sys.argv[2]
+    if len(sys.argv) > 3:
+      vocab_size = int(sys.argv[3])  # argv entries are strings; keep the same type as the int default
+    else:
+      vocab_size = 100000  # default used when VOCAB_SIZE is omitted
+
+    prepare_data(data_dir, dataset_name, vocab_size)
diff --git a/main.py b/main.py
@@ -14,7 +14,7 @@
 flags.DEFINE_float("learning_rate", 0.0002, "Learning rate of for adam [0.0002]")
 flags.DEFINE_string("model", "LSTM", "The type of model to train and test [LSTM, Attentive, Impatient]")
 flags.DEFINE_string("data_dir", "data", "The name of data directory [data]")
-flags.DEFINE_string("dataset", "cnn", "The name of dataset [cnn, dailymail]")
+flags.DEFINE_string("dataset", "small", "The name of dataset [cnn, dailymail]")  # NOTE(review): default "small" is not one of the choices listed in the help text — confirm it is intended
 flags.DEFINE_string("checkpoint_dir", "checkpoint", "Directory name to save the checkpoints [checkpoint]")
 flags.DEFINE_boolean("forward_only", False, "True for forward only, False for training [False]")
 FLAGS = flags.FLAGS

diff --git a/unzip.sh b/unzip.sh
@@ -4,16 +4,12 @@ if [ ! -d ./data ]; then
   mkdir -p ./data
 fi
 
-echo "Unzip cnn.tgz..."
-if [ type "pigz" &> /dev/null ]; then
-  tar -xvf -C data/ | pigz > cnn.tgz
-else
-  tar -xzvf cnn.tgz -C data/
-fi
-
-echo "Unzip cnn.tgz..."
-if [ type "pigz" &> /dev/null ]; then
-  tar -xvf -C data/ | pigz > dailymail.tgz
-else
-  tar -xzvf dailymail.tgz -C data/
-fi
+for file in "$@"; do  # extract every archive given on the command line into data/
+  if command -v pigz > /dev/null 2>&1; then  # POSIX lookup; no stderr noise when pigz is absent
+    echo "Unzip $file with pigz..."
+    tar -I pigz -xvf "$file" -C data/  # quoted so paths with spaces or globs stay one argument
+  else
+    echo "Unzip $file..."
+    tar -xvf "$file" -C data/
+  fi
+done