Skip to content

Commit 381f123

Browse files
authored
Fix to maintain directory structure for single files and empty dirs (#224)
* Fix for issue 214
* Fix path conversion on Windows and Linux
* Remove superfluous assignment
* Fix empty directories not being downloaded
* Fix for test failure for single file to single file downloads
* Fix for empty directory correct relative path
* Added tests for single file download and empty dir download
* Fix for python 2.7 os.makedirs
* Updated History.rst
* Updated version number
1 parent f50819a commit 381f123

File tree

6 files changed

+50
-8
lines changed

6 files changed

+50
-8
lines changed

HISTORY.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
Release History
44
===============
55

6+
0.0.25 (2018-07-26)
7+
+++++++++++++++++++
8+
* Fixed downloading of empty directories and of directory structures that contain only a single file
9+
610
0.0.24 (2018-07-16)
711
+++++++++++++++++++
812
* Retry policy implemented for all operations, default being Exponential Retry Policy

azure/datalake/store/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# license information.
77
# --------------------------------------------------------------------------
88

9-
__version__ = "0.0.24"
9+
__version__ = "0.0.25"
1010

1111
from .core import AzureDLFileSystem
1212
from .multithread import ADLDownloader

azure/datalake/store/core.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def __init__(self, token=None, **kwargs):
6868
self.kwargs = kwargs
6969
self.connect()
7070
self.dirs = {}
71+
self._emptyDirs = []
7172
AzureDLFileSystem._singleton[0] = self
7273

7374
@classmethod
@@ -179,11 +180,20 @@ def info(self, path, invalidate_cache=True, expected_error_code=None):
179180

180181
def _walk(self, path, invalidate_cache=True):
181182
fi = list(self._ls(path, invalidate_cache))
183+
self._emptyDirs = []
182184
for apath in fi:
183185
if apath['type'] == 'DIRECTORY':
184-
fi.extend(self._ls(apath['name'], invalidate_cache))
186+
sub_elements = self._ls(apath['name'], invalidate_cache)
187+
if not sub_elements:
188+
self._emptyDirs.append(apath)
189+
else:
190+
fi.extend(sub_elements)
185191
return [f for f in fi if f['type'] == 'FILE']
186192

193+
def _empty_dirs_to_add(self):
194+
""" Returns directories found empty during walk. Only for internal use"""
195+
return self._emptyDirs
196+
187197
def walk(self, path='', details=False, invalidate_cache=True):
188198
""" Get all files below given path
189199
"""

azure/datalake/store/multithread.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -194,16 +194,17 @@ def _setup(self):
194194
rfiles = self.client._adlfs.walk(self.rpath, details=True, invalidate_cache=True)
195195
else:
196196
rfiles = self.client._adlfs.glob(self.rpath, details=True, invalidate_cache=True)
197-
if len(rfiles) > 1:
198-
local_rel_rpath = str(AzureDLPath(self.rpath).trim().globless_prefix)
199-
file_pairs = [(os.path.join(self.lpath, os.path.relpath(f['name'] +'.inprogress', local_rel_rpath)), f)
200-
for f in rfiles]
201-
elif len(rfiles) == 1:
197+
198+
if len(rfiles) == 1 and os.path.abspath(rfiles[0]['name']) == os.path.abspath(self.rpath):
202199
if os.path.exists(self.lpath) and os.path.isdir(self.lpath):
203200
file_pairs = [(os.path.join(self.lpath, os.path.basename(rfiles[0]['name'] + '.inprogress')),
204201
rfiles[0])]
205202
else:
206203
file_pairs = [(self.lpath, rfiles[0])]
204+
elif len(rfiles) >= 1:
205+
local_rel_rpath = str(AzureDLPath(self.rpath).trim().globless_prefix)
206+
file_pairs = [(os.path.join(self.lpath, os.path.relpath(f['name'] +'.inprogress', local_rel_rpath)), f)
207+
for f in rfiles]
207208
else:
208209
raise ValueError('No files to download')
209210

@@ -242,6 +243,14 @@ def touch(self, src, dst):
242243
with open(dst, 'wb'):
243244
pass
244245

246+
for empty_directory in self.client._adlfs._empty_dirs_to_add():
247+
local_rel_rpath = str(AzureDLPath(self.rpath).trim().globless_prefix)
248+
path = os.path.join(self.lpath, os.path.relpath(empty_directory['name'], local_rel_rpath))
249+
try:
250+
os.makedirs(path)
251+
except OSError as e:
252+
if e.errno != errno.EEXIST:
253+
raise
245254
self.client.run(nthreads, monitor, before_start=touch)
246255

247256
def active(self):

azure/datalake/store/transfer.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,11 +489,12 @@ def run(self, nthreads=None, monitor=True, before_start=None):
489489
self._nthreads = nthreads or self._nthreads
490490
self._ffutures = {}
491491
self._cfutures = {}
492+
492493
for src, dst in self._files:
493494
if before_start:
494495
before_start(self._adlfs, src, dst)
495496
self._start(src, dst)
496-
before_start = None
497+
497498
if monitor:
498499
self.monitor()
499500
has_errors = False

tests/test_multithread.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,10 @@ def setup_tree(azure):
4646
for filename in ['x.csv', 'y.csv', 'z.txt']:
4747
with azure.open(test_dir / directory / filename, 'wb') as f:
4848
f.write(b'123456')
49+
azure.mkdir(test_dir / 'data/empty')
50+
azure.mkdir(test_dir / 'data/single/single')
51+
with azure.open(test_dir / 'data/single/single'/ 'single.txt', 'wb') as f:
52+
f.write(b'123456')
4953
try:
5054
yield
5155
finally:
@@ -132,6 +136,20 @@ def test_download_single_to_dir(tempdir, azure):
132136
if os.path.isfile(fname):
133137
os.remove(fname)
134138

139+
@my_vcr.use_cassette
140+
def test_download_empty_directory(tempdir, azure):
141+
with setup_tree(azure):
142+
down = ADLDownloader(azure, test_dir, tempdir, 1, 2 ** 24, overwrite=True)
143+
dirname = os.path.join(tempdir, 'data/empty')
144+
assert os.path.isdir(dirname)
145+
146+
@my_vcr.use_cassette
147+
def test_download_single_file_in_directory(tempdir, azure):
148+
with setup_tree(azure):
149+
down = ADLDownloader(azure, test_dir, tempdir, 1, 2 ** 24, overwrite=True)
150+
dirname = os.path.join(tempdir, 'data/single/single')
151+
assert os.path.isdir(dirname)
152+
assert os.path.isfile(os.path.join(dirname,'single.txt'))
135153

136154
@my_vcr.use_cassette
137155
def test_download_many(tempdir, azure):

0 commit comments

Comments
 (0)