feedback.

vincentqb · vincentqb · commit 6efedf545a0a · 2020-10-26T18:00:39.000-04:00
diff --git a/test/torchaudio_unittest/datasets/speechcommands_test.py b/test/torchaudio_unittest/datasets/speechcommands_test.py
@@ -96,15 +96,16 @@ def setUpClass(cls):
                             utterance,
                         )
                         cls.samples.append(sample)
-                        label_filename = os.path.join(label, filename)
-                        if 2 <= j < 4:
+                        if j < 2:
+                            cls.train_samples.append(sample)
+                        elif j < 4:
+                            label_filename = os.path.join(label, filename)
                             valid.write(f'{label_filename}\n')
                             cls.valid_samples.append(sample)
-                        elif 4 <= j < 6:
+                        elif j < 6:
+                            label_filename = os.path.join(label, filename)
                             test.write(f'{label_filename}\n')
                             cls.test_samples.append(sample)
-                        else:
-                            cls.train_samples.append(sample)
 
     def testSpeechCommands(self):
         dataset = speechcommands.SPEECHCOMMANDS(self.root_dir)
@@ -141,7 +142,6 @@ def testSpeechCommandsSubsetTrain(self):
 
     def testSpeechCommandsSubsetValid(self):
         dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="validation")
-        print(dataset._path)
 
         num_samples = 0
         for i, (data, sample_rate, label, speaker_id, utterance_number) in enumerate(
@@ -156,9 +156,8 @@ def testSpeechCommandsSubsetValid(self):
 
         assert num_samples == len(self.valid_samples)
 
-    def testSpeechCommandsSubset(self):
+    def testSpeechCommandsSubsetTest(self):
         dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="testing")
-        print(dataset._path)
 
         num_samples = 0
         for i, (data, sample_rate, label, speaker_id, utterance_number) in enumerate(
diff --git a/torchaudio/datasets/speechcommands.py b/torchaudio/datasets/speechcommands.py
@@ -20,6 +20,8 @@
     "https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz":
     "6b74f3901214cb2c2934e98196829835",
 }
+VALIDATION_LIST = "validation_list.txt"
+TESTING_LIST = "testing_list.txt"
 
 
 def load_speechcommands_item(filepath: str, path: str) -> Tuple[Tensor, int, str, str, int]:
@@ -90,29 +92,22 @@ def __init__(self,
                     download_url(url, root, hash_value=checksum, hash_type="md5")
                 extract_archive(archive, self._path)
 
-        walker = walk_files(self._path, suffix=".wav", prefix=True)
-        walker = filter(lambda w: HASH_DIVIDER in w and EXCEPT_FOLDER not in w, walker)
-
-        if subset in ["training", "validation"]:
-            filepath = os.path.join(self._path, "validation_list.txt")
-            with open(filepath) as f:
-                validation_list = [os.path.join(self._path, l.strip()) for l in f.readlines()]
-
-        if subset in ["training", "testing"]:
-            filepath = os.path.join(self._path, "testing_list.txt")
-            with open(filepath) as f:
-                testing_list = [os.path.join(self._path, l.strip()) for l in f.readlines()]
+        def load_list(filename):
+            filepath = os.path.join(self._path, filename)
+            with open(filepath) as fileobj:
+                return [os.path.join(self._path, line.strip()) for line in fileobj]
 
         if subset == "validation":
-            walker = validation_list
+            self._walker = load_list(VALIDATION_LIST)
         elif subset == "testing":
-            walker = testing_list
+            self._walker = load_list(TESTING_LIST)
         elif subset == "training":
-            walker = filter(
-                lambda w: not (w in validation_list or w in testing_list), walker
-            )
-
-        self._walker = list(walker)
+            excludes = load_list(VALIDATION_LIST) + load_list(TESTING_LIST)
+            walker = walk_files(self._path, suffix=".wav", prefix=True)
+            self._walker = [w for w in walker if HASH_DIVIDER in w and EXCEPT_FOLDER not in w and w not in excludes]
+        else:
+            walker = walk_files(self._path, suffix=".wav", prefix=True)
+            self._walker = [w for w in walker if HASH_DIVIDER in w and EXCEPT_FOLDER not in w]
 
     def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, int]:
         """Load the n-th sample from the dataset.