Skip to content

Make directory traversal order alphabetical and breadth-first #814

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 23, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions test/datasets/libritts_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,14 @@ def setUpClass(cls):

def test_libritts(self):
dataset = LIBRITTS(self.root_dir)
samples = list(dataset)
samples.sort(key=lambda s: s[4])

n_ites = 0
for i, (waveform,
sample_rate,
original_text,
normalized_text,
speaker_id,
chapter_id,
utterance_id) in enumerate(samples):
utterance_id) in enumerate(dataset):

expected_ids = self.utterance_ids[i]
expected_data = self.data[i]
Expand All @@ -69,3 +67,5 @@ def test_libritts(self):
assert original_text == self.original_text
assert normalized_text == self.normalized_text
assert utterance_id == f'{"_".join(str(u) for u in expected_ids[-4:])}'
n_ites += 1
assert n_ites == len(self.utterance_ids)
47 changes: 47 additions & 0 deletions test/datasets/utils_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import os
from pathlib import Path

from torchaudio.datasets import utils as dataset_utils

from ..common_utils import (
TempDirMixin,
TorchaudioTestCase,
)


class TestWalkFiles(TempDirMixin, TorchaudioTestCase):
    """Check that ``walk_files`` yields files in a deterministic, sorted order.

    ``setUp`` builds a three-level tree of empty ``.txt`` files and records
    every created path in ``self.expected``, in the order the walk is expected
    to yield them: the files of a directory first (alphabetically), then each
    of its subdirectories (alphabetically), recursively.
    """
    root = None
    expected = None

    def _add_file(self, *parts):
        """Create an empty file at the temp path built from ``parts`` and record it.

        The path is appended to ``self.expected``, so the call order of this
        helper defines the traversal order the test asserts.
        """
        path = self.get_temp_path(*parts)
        self.expected.append(path)
        Path(path).touch()

    def setUp(self):
        """Populate the temporary directory tree and the expected path list."""
        self.root = self.get_temp_path()
        self.expected = []

        # level 1
        for filename in ['a.txt', 'b.txt', 'c.txt']:
            self._add_file(filename)

        # level 2
        for dir1 in ['d1', 'd2', 'd3']:
            for filename in ['d.txt', 'e.txt', 'f.txt']:
                self._add_file(dir1, filename)
            # level 3: nested under the current level-2 directory, after its own files
            for dir2 in ['d1', 'd2', 'd3']:
                for filename in ['g.txt', 'h.txt', 'i.txt']:
                    self._add_file(dir1, dir2, filename)

    def test_walk_files(self):
        """walk_files should traverse files in alphabetical order"""
        found = [
            os.path.join(self.root, path)
            for path in dataset_utils.walk_files(self.root, '.txt', prefix=True)
        ]
        # A single list comparison checks both the order and the number of
        # yielded files, and reports a readable failure if either is off.
        assert found == self.expected
7 changes: 4 additions & 3 deletions test/datasets/yesno_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,12 @@ def setUpClass(cls):

def test_yesno(self):
dataset = yesno.YESNO(self.root_dir)
samples = list(dataset)
samples.sort(key=lambda s: s[2])
for i, (waveform, sample_rate, label) in enumerate(samples):
n_ite = 0
for i, (waveform, sample_rate, label) in enumerate(dataset):
expected_label = self.labels[i]
expected_data = self.data[i]
self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
assert sample_rate == 8000
assert label == expected_label
n_ite += 1
assert n_ite == len(self.data)
8 changes: 7 additions & 1 deletion torchaudio/datasets/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,13 @@ def walk_files(root: str,

root = os.path.expanduser(root)

for dirpath, _, files in os.walk(root):
for dirpath, dirs, files in os.walk(root):
dirs.sort()
# `dirs` is the very list that os.walk consults to decide which subdirectories to visit next, so
# sorting it in-place here makes os.walk traverse the subdirectories in alphabetical order
# see also
# https://stackoverflow.com/questions/6670029/can-i-force-python3s-os-walk-to-visit-directories-in-alphabetical-order-how#comment71993866_6670926
files.sort()
for f in files:
if f.endswith(suffix):

Expand Down