This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 2167370

artitw authored and afrozenator committed
Fix bAbi data generator and readme (#1235)
* Fix bAbi data generator and readme
* Fix bAbi hparams deletion
* Fix bAbi hparams: delete unnecessary keys
* Fix bAbi hparams: clean keys
* bAbi hparams: delete keys
1 parent abf63bf commit 2167370

File tree

* README.md
* tensor2tensor/data_generators/babi_qa.py

2 files changed: +24 -6 lines changed


README.md

Lines changed: 11 additions & 0 deletions
@@ -47,6 +47,7 @@ pip install tensor2tensor && t2t-trainer \
 ### Contents
 
 * [Suggested Datasets and Models](#suggested-datasets-and-models)
+  * [Story, Question and Answer](#story-question-and-answer)
   * [Image Classification](#image-classification)
   * [Image Generation](#image-generation)
   * [Language Modeling](#language-modeling)
@@ -78,6 +79,16 @@ hyperparameters that we know works well in our setup. We usually
 run either on Cloud TPUs or on 8-GPU machines; you might need
 to modify the hyperparameters if you run on a different setup.
 
+### Story, Question and Answer
+
+For answering questions based on a story, use
+
+* the [bAbi][1] data-set: `--problem=babi_qa_concat_task1_1k`
+
+You can choose the bAbi task from the range [1,20] and the subset from 1k or 10k. To combine test data from all tasks into a single test set, use `--problem=babi_qa_concat_all_tasks_10k`
+
+[1] https://research.fb.com/downloads/babi/
+
 ### Image Classification
 
 For image classification, we have a number of standard data-sets:
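
For reference, a minimal sketch of how the problem names described in the README text are assembled from a task number and subset size. The `babi_problem_name` helper is hypothetical (it is not part of tensor2tensor); it only mirrors the `babi_qa_concat_task1_1k` / `babi_qa_concat_all_tasks_10k` naming shown above.

```python
# Hypothetical helper, not part of tensor2tensor: builds the --problem name
# described in the README from a bAbi task number (1-20) and subset (1k/10k).
def babi_problem_name(task, subset="1k"):
  assert 1 <= task <= 20, "bAbi tasks are numbered 1 through 20"
  assert subset in ("1k", "10k"), "subsets are 1k or 10k"
  return "babi_qa_concat_task%d_%s" % (task, subset)

print(babi_problem_name(1))          # babi_qa_concat_task1_1k
print(babi_problem_name(17, "10k"))  # babi_qa_concat_task17_10k
```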

tensor2tensor/data_generators/babi_qa.py

Lines changed: 13 additions & 6 deletions
@@ -34,10 +34,9 @@
 import os
 import shutil
 import tarfile
-
+import requests
 import six
 
-from tensor2tensor.data_generators import generator_utils
 from tensor2tensor.data_generators import problem
 from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators import text_problems
@@ -109,8 +108,12 @@ def _prepare_babi_data(tmp_dir, data_dir):
   if not tf.gfile.Exists(data_dir):
     tf.gfile.MakeDirs(data_dir)
 
-  # TODO(dehghani@): find a solution for blocking user-agent (download)
-  file_path = generator_utils.maybe_download(tmp_dir, _TAR, _URL)
+  file_path = os.path.join(tmp_dir, _TAR)
+  headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
+  resp = requests.get(_URL, headers=headers)
+  with open(file_path, 'wb') as f:
+    f.write(resp.content)
+
   tar = tarfile.open(file_path)
   tar.extractall(tmp_dir)
   tar.close()
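
As a standalone illustration of the pattern this hunk adopts (fetching the tarball with an explicit browser-like User-Agent via `requests` instead of `generator_utils.maybe_download`), here is a minimal sketch. The `fetch_and_extract` name and the shortened header string are assumptions for the example, not code from the repository.

```python
import os
import tarfile

import requests


def fetch_and_extract(url, tar_name, tmp_dir):
  """Download a tarball with a browser-like User-Agent and unpack it."""
  file_path = os.path.join(tmp_dir, tar_name)
  # Some hosts reject the default python-requests User-Agent, so send a
  # browser-like one instead (any common browser string works here).
  headers = {"User-Agent": "Mozilla/5.0"}
  resp = requests.get(url, headers=headers)
  resp.raise_for_status()
  with open(file_path, "wb") as f:
    f.write(resp.content)
  with tarfile.open(file_path) as tar:
    tar.extractall(tmp_dir)
  return file_path
```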
@@ -449,8 +452,12 @@ def preprocess_example(self, example, unused_mode, unused_model_hparams):
   def hparams(self, defaults, unused_model_hparams):
     super(BabiQaConcat, self).hparams(defaults, unused_model_hparams)
     p = defaults
-    del p.modality['context']
-    del p.vocab_size['context']
+
+    if 'context' in p.modality:
+      del p.modality['context']
+
+    if 'context' in p.vocab_size:
+      del p.vocab_size['context']
 
 
 def _problems_to_register():
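
The hparams fix above boils down to guarding each `del` so that a missing 'context' key no longer raises. A plain-dict sketch of the same pattern; only the `modality` and `vocab_size` names mirror the diff, nothing else from tensor2tensor is assumed.

```python
def drop_context(modality, vocab_size):
  # An unconditional `del d['context']` raises KeyError when the key was
  # never set, which is what the two original statements did.
  if "context" in modality:
    del modality["context"]
  if "context" in vocab_size:
    del vocab_size["context"]
  # An equivalent one-liner per field: modality.pop("context", None)


drop_context({"inputs": "symbol"}, {"inputs": 32000})  # missing key: no error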
