Skip to content

Commit db7a6ec

Browse files
committed
fix: performance of _ls_tree
1 parent 619ffd0 commit db7a6ec

File tree

1 file changed: +31 -40 lines changed

src/huggingface_hub/hf_file_system.py

Lines changed: 31 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -24,7 +24,7 @@
24 24
REPO_TYPES_URL_PREFIXES,
25 25
)
26 26
from .file_download import hf_hub_url, http_get
27-
from .hf_api import HfApi, LastCommitInfo, RepoFile
27+
from .hf_api import HfApi, LastCommitInfo, RepoFile, RepoFolder
28 28
from .utils import (
29 29
EntryNotFoundError,
30 30
HFValidationError,
@@ -375,29 +375,14 @@ def _ls_tree(
375 375
revision=resolved_path.revision,
376 376
repo_type=resolved_path.repo_type,
377 377
)
378+
378 379
for path_info in tree:
379-
if isinstance(path_info, RepoFile):
380-
cache_path_info = {
381-
"name": root_path + "/" + path_info.path,
382-
"size": path_info.size,
383-
"type": "file",
384-
"blob_id": path_info.blob_id,
385-
"lfs": path_info.lfs,
386-
"last_commit": path_info.last_commit,
387-
"security": path_info.security,
388-
}
389-
else:
390-
cache_path_info = {
391-
"name": root_path + "/" + path_info.path,
392-
"size": 0,
393-
"type": "directory",
394-
"tree_id": path_info.tree_id,
395-
"last_commit": path_info.last_commit,
396-
}
380+
cache_path_info = self._make_cache_path_info(root_path, path_info, should_copy=False)
397 381
parent_path = self._parent(cache_path_info["name"])
398 382
self.dircache.setdefault(parent_path, []).append(cache_path_info)
399-
out.append(cache_path_info)
400-
return copy.deepcopy(out) # copy to not let users modify the dircache
383+
out_cache_path_info = self._make_cache_path_info(root_path, path_info)
384+
out.append(out_cache_path_info)
385+
return out
401 386

402 387
def glob(self, path, **kwargs):
403 388
# Set expand_info=False by default to get a x10 speed boost
@@ -540,28 +525,34 @@ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None,
540 525
path_in_repo="",
541 526
_raw_revision=resolved_path._raw_revision,
542 527
).unresolve()
543-
if isinstance(path_info, RepoFile):
544-
out = {
545-
"name": root_path + "/" + path_info.path,
546-
"size": path_info.size,
547-
"type": "file",
548-
"blob_id": path_info.blob_id,
549-
"lfs": path_info.lfs,
550-
"last_commit": path_info.last_commit,
551-
"security": path_info.security,
552-
}
553-
else:
554-
out = {
555-
"name": root_path + "/" + path_info.path,
556-
"size": 0,
557-
"type": "directory",
558-
"tree_id": path_info.tree_id,
559-
"last_commit": path_info.last_commit,
560-
}
528+
out = self._make_path_info(root_path, path_info)
561 529
if not expand_info:
562 530
out = {k: out[k] for k in ["name", "size", "type"]}
563 531
assert out is not None
564-
return copy.deepcopy(out) # copy to not let users modify the dircache
532+
return out
533+
534+
def _make_cache_path_info(
535+
self, root_path: str, path_info: RepoFile | RepoFolder, shallow_copy: bool = True
536+
) -> Dict[str, Any]:
537+
return (
538+
{
539+
"name": root_path + "/" + path_info.path,
540+
"size": path_info.size,
541+
"type": "file",
542+
"blob_id": path_info.blob_id,
543+
"lfs": copy.copy(path_info.lfs) if shallow_copy else path_info.lfs,
544+
"last_commit": copy.copy(path_info.last_commit) if shallow_copy else path_info.last_commit,
545+
"security": copy.copy(path_info.security) if shallow_copy else path_info.security,
546+
}
547+
if isinstance(path_info, RepoFile)
548+
else {
549+
"name": root_path + "/" + path_info.path,
550+
"size": 0,
551+
"type": "directory",
552+
"tree_id": path_info.tree_id,
553+
"last_commit": copy.copy(path_info.last_commit) if shallow_copy else path_info.last_commit,
554+
}
555+
)
565 556

566 557
def exists(self, path, **kwargs):
567 558
"""Is there a file at the given path"""

0 commit comments

Comments
 (0)