@@ -23,7 +23,7 @@
 
 
 # local imports
-from .exceptions import DatalakeBadOffsetException
+from .exceptions import DatalakeBadOffsetException, DatalakeIncompleteTransferException
 from .exceptions import FileNotFoundError, PermissionError
 from .lib import DatalakeRESTInterface
 from .utils import ensure_writable, read_block
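
The new import gives callers a typed failure mode for truncated reads; it is raised by the `read()` change further down. A hypothetical caller-side sketch — the absolute module path is the package's public layout, but `adl` and the file path are assumptions, not part of this diff:

```python
# Hypothetical usage; assumes `adl` is an already-constructed AzureDLFileSystem.
from azure.datalake.store.exceptions import DatalakeIncompleteTransferException

try:
    with adl.open('/data/part-0000', 'rb') as f:
        payload = f.read()
except DatalakeIncompleteTransferException:
    # Surface as a data-integrity failure rather than silently truncating.
    raise
```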
@@ -195,17 +195,31 @@ def info(self, path, invalidate_cache=True, expected_error_code=None): |
 
         raise FileNotFoundError(path)
 
-    def _walk(self, path, invalidate_cache=True):
-        fi = list(self._ls(path, invalidate_cache))
+    def _walk(self, path, invalidate_cache=True, include_dirs=False):
+        ret = list(self._ls(path, invalidate_cache))
         self._emptyDirs = []
-        for apath in fi:
-            if apath['type'] == 'DIRECTORY':
-                sub_elements = self._ls(apath['name'], invalidate_cache)
+        current_subdirs = [f for f in ret if f['type'] != 'FILE']
+        while current_subdirs:
+            dirs_below_current_level = []
+            for apath in current_subdirs:
+                try:
+                    sub_elements = self._ls(apath['name'], invalidate_cache)
+                except FileNotFoundError:
+                    # Folder may have been deleted while the walk was in progress; infrequent, so we take the linear hit
+                    ret.remove(apath)
+                    continue
                 if not sub_elements:
                     self._emptyDirs.append(apath)
                 else:
-                    fi.extend(sub_elements)
-        return [f for f in fi if f['type'] == 'FILE']
+                    ret.extend(sub_elements)
+                    dirs_below_current_level.extend([f for f in sub_elements if f['type'] != 'FILE'])
+            current_subdirs = dirs_below_current_level
+
+        if include_dirs:
+            return ret
+        else:
+            return [f for f in ret if f['type'] == 'FILE']
+
 
     def _empty_dirs_to_add(self):
         """ Returns directories found empty during walk. Only for internal use"""
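
The traversal change is easier to review stripped of class plumbing: the old code extended the very list it was iterating over, while the new code keeps an explicit per-level frontier and tolerates directories deleted mid-walk. A minimal sketch of the same pattern, assuming a hypothetical `list_dir` callable with the contract of `self._ls` (returns entry dicts with `'name'`, `'type'`, `'length'`; raises `FileNotFoundError` for vanished paths):

```python
# Level-by-level walk, outside the class; `list_dir` stands in for self._ls.
def walk(list_dir, path, include_dirs=False):
    ret = list(list_dir(path))
    current_subdirs = [f for f in ret if f['type'] != 'FILE']
    while current_subdirs:
        next_level = []
        for d in current_subdirs:
            try:
                children = list_dir(d['name'])
            except FileNotFoundError:
                ret.remove(d)  # directory vanished mid-walk; drop it
                continue
            ret.extend(children)
            next_level.extend(c for c in children if c['type'] != 'FILE')
        current_subdirs = next_level
    return ret if include_dirs else [f for f in ret if f['type'] == 'FILE']
```

Each pass visits only the frontier discovered in the previous pass, so the loop terminates even though `ret` keeps growing.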
@@ -240,9 +254,31 @@ def du(self, path, total=False, deep=False, invalidate_cache=True): |
         return {p['name']: p['length'] for p in files}
 
     def df(self, path):
-        """ Resource summary of path """
+        """ Resource summary of path
+        Parameters
+        ----------
+        path: str
+            Location
+        """
         path = AzureDLPath(path).trim()
-        return self.azure.call('GETCONTENTSUMMARY', path.as_posix())['ContentSummary']
+        current_path_info = self.info(path, invalidate_cache=False)
+        if current_path_info['type'] == 'FILE':
+            return {'directoryCount': 0, 'fileCount': 1, 'length': current_path_info['length'], 'quota': -1,
+                    'spaceConsumed': current_path_info['length'], 'spaceQuota': -1}
+        else:
+            all_files_and_dirs = self._walk(path, include_dirs=True)
+            dir_count = 1  # start at 1: _walk does not return the top-level directory itself
+            length = file_count = 0
+            for item in all_files_and_dirs:
+                length += item['length']
+                if item['type'] == 'FILE':
+                    file_count += 1
+                else:
+                    dir_count += 1
+
+            return {'directoryCount': dir_count, 'fileCount': file_count, 'length': length, 'quota': -1,
+                    'spaceConsumed': length, 'spaceQuota': -1}
+
 
     def chmod(self, path, mod):
         """ Change access mode of path
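
With the `GETCONTENTSUMMARY` call replaced by a client-side walk, `df` now returns the same dict shape for both files and directories, with the quota fields pinned to -1. A hedged usage sketch; `adl`, the path, and the numbers shown are illustrative assumptions:

```python
# Assumes `adl` is an AzureDLFileSystem instance; the values are made up.
summary = adl.df('/data/logs')
# -> {'directoryCount': 4, 'fileCount': 102, 'length': 81930,
#     'quota': -1, 'spaceConsumed': 81930, 'spaceQuota': -1}
print('%d files in %d dirs, %d bytes' % (
    summary['fileCount'], summary['directoryCount'], summary['length']))
```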
@@ -858,14 +894,18 @@ def read(self, length=-1): |
             length = self.size
         if self.closed:
             raise ValueError('I/O operation on closed file.')
-
+        flag = 0
         out = b""
         while length > 0:
             self._read_blocksize()
             data_read = self.cache[self.loc - self.start:
                                    min(self.loc - self.start + length, self.end - self.start)]
             if not data_read:  # Check to catch possible server errors. Ideally shouldn't happen.
-                break
+                flag += 1
+                if flag >= 5:
+                    raise DatalakeIncompleteTransferException('Could not read data: {}. '
+                                                              'Repeated zero byte reads. '
+                                                              'Possible file corruption'.format(self.path))
             out += data_read
             self.loc += len(data_read)
             length -= len(data_read)
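
The guard converts a silent truncation (the old `break`) into a loud failure after five empty reads; an empty `data_read` leaves `length` unchanged, so the loop spins until the counter trips. The same pattern in isolation, where `fetch_chunk` is a hypothetical stand-in for the cache refill:

```python
# Bounded zero-read guard; `fetch_chunk(n)` returns up to n bytes,
# or b"" on a transient server-side fault.
def read_exactly(fetch_chunk, n, max_zero_reads=5):
    out = b""
    zero_reads = 0
    while n > 0:
        data = fetch_chunk(n)
        if not data:
            zero_reads += 1
            if zero_reads >= max_zero_reads:
                raise IOError('repeated zero-byte reads; possible corruption')
            continue
        out += data
        n -= len(data)
    return out
```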