Commit 6490cc9

feat: increase file scanning performance (#486)
* feat: increase file scanning performance
* fix: correct typo in comment
* refactor: use `continue` in place of nested `ifs`
1 parent dfa4079 · commit 6490cc9
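
The `continue`-based refactor mentioned in the commit message turns nested `if`/`elif` checks into flat guard clauses: each disqualifying condition ends the loop iteration immediately, so only files that pass every filter reach the remaining code. A minimal sketch of that pattern, detached from the TagStudio code (all names here are illustrative, not the project's API):

from pathlib import Path

def scan(root: Path, allowed_exts: set[str], known: dict[Path, int]) -> list[Path]:
    """Collect files under root that pass every filter and are not yet known."""
    new_files: list[Path] = []
    for f in root.glob("**/*"):
        # Guard clauses: skip early instead of nesting the "happy path" in ifs.
        if f.is_dir():
            continue
        if f.suffix.lower() not in allowed_exts:
            continue
        if known.get(f) is not None:
            continue
        new_files.append(f)
    return new_files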

File tree (2 files changed, +59 / -51 lines)

tagstudio/src/core/library.py
tagstudio/src/qt/modals/drop_import.py

tagstudio/src/core/library.py

Lines changed: 54 additions & 45 deletions
@@ -737,7 +737,9 @@ def _map_filenames_to_entry_ids(self):
         """Maps a full filepath to its corresponding Entry's ID."""
         self.filename_to_entry_id_map.clear()
         for entry in self.entries:
-            self.filename_to_entry_id_map[(entry.path / entry.filename)] = entry.id
+            self.filename_to_entry_id_map[
+                (self.library_dir / entry.path / entry.filename)
+            ] = entry.id

         # def _map_filenames_to_entry_ids(self):
         # """Maps the file paths of entries to their index in the library list."""

@@ -884,59 +886,71 @@ def refresh_dir(self) -> Generator:

         # Scans the directory for files, keeping track of:
         # - Total file count
-        # - Files without library entries
-        # for type in TYPES:
-        start_time = time.time()
+        # - Files without Library entries
+        start_time_total = time.time()
+        start_time_loop = time.time()
+        ext_set = set(self.ext_list)  # Should be slightly faster
         for f in self.library_dir.glob("**/*"):
+            end_time_loop = time.time()
+            # Yield output every 1/30 of a second
+            if (end_time_loop - start_time_loop) > 0.034:
+                yield self.dir_file_count
+                start_time_loop = time.time()
             try:
+                # Skip this file if it should be excluded
+                ext: str = f.suffix.lower()
+                if (ext in ext_set and self.is_exclude_list) or (
+                    ext not in ext_set and not self.is_exclude_list
+                ):
+                    continue
+
+                # Finish if the file/path is already mapped in the Library
+                if self.filename_to_entry_id_map.get(f) is not None:
+                    # No other checks are required.
+                    self.dir_file_count += 1
+                    continue
+
+                # If the file is new, check for validity
                 if (
-                    "$RECYCLE.BIN" not in f.parts
-                    and TS_FOLDER_NAME not in f.parts
-                    and "tagstudio_thumbs" not in f.parts
-                    and not f.is_dir()
+                    "$RECYCLE.BIN" in f.parts
+                    or TS_FOLDER_NAME in f.parts
+                    or "tagstudio_thumbs" in f.parts
+                    or f.is_dir()
                 ):
-                    if f.suffix.lower() not in self.ext_list and self.is_exclude_list:
-                        self.dir_file_count += 1
-                        file = f.relative_to(self.library_dir)
-                        if file not in self.filename_to_entry_id_map:
-                            self.files_not_in_library.append(file)
-                    elif f.suffix.lower() in self.ext_list and not self.is_exclude_list:
-                        self.dir_file_count += 1
-                        file = f.relative_to(self.library_dir)
-                        try:
-                            _ = self.filename_to_entry_id_map[file]
-                        except KeyError:
-                            # print(file)
-                            self.files_not_in_library.append(file)
+                    continue
+
+                # Add the validated new file to the Library
+                self.dir_file_count += 1
+                self.files_not_in_library.append(f)
+
             except PermissionError:
-                logging.info(
-                    f"The File/Folder {f} cannot be accessed, because it requires higher permission!"
-                )
-            end_time = time.time()
-            # Yield output every 1/30 of a second
-            if (end_time - start_time) > 0.034:
-                yield self.dir_file_count
-                start_time = time.time()
-        # Sorts the files by date modified, descending.
+                logging.info(f'[LIBRARY] Cannot access "{f}": PermissionError')
+
+        yield self.dir_file_count
+        end_time_total = time.time()
+        logging.info(
+            f"[LIBRARY] Scanned directories in {(end_time_total - start_time_total):.3f} seconds"
+        )
+        # Sorts the files by date modified, descending
         if len(self.files_not_in_library) <= 150000:
             try:
                 if platform.system() == "Windows" or platform.system() == "Darwin":
                     self.files_not_in_library = sorted(
                         self.files_not_in_library,
-                        key=lambda t: -(self.library_dir / t).stat().st_birthtime,  # type: ignore[attr-defined]
+                        key=lambda t: -(t).stat().st_birthtime,  # type: ignore[attr-defined]
                     )
                 else:
                     self.files_not_in_library = sorted(
                         self.files_not_in_library,
-                        key=lambda t: -(self.library_dir / t).stat().st_ctime,
+                        key=lambda t: -(t).stat().st_ctime,
                     )
             except (FileExistsError, FileNotFoundError):
-                print(
-                    "[LIBRARY] [ERROR] Couldn't sort files, some were moved during the scanning/sorting process."
+                logging.info(
+                    "[LIBRARY][ERROR] Couldn't sort files, some were moved during the scanning/sorting process."
                 )
                 pass
         else:
-            print(
+            logging.info(
                 "[LIBRARY][INFO] Not bothering to sort files because there's OVER 150,000! Better sorting methods will be added in the future."
             )

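The `ext_set = set(self.ext_list)` line carries most of the speed-up in this loop: set membership is a hash lookup rather than a linear scan of the extension list, and the test runs once per file in the `glob` walk, while the `continue`-based skips keep excluded and already-known files from reaching the more expensive checks. A rough, self-contained way to see the set-versus-list difference (the list size and probe here are illustrative, not measurements from TagStudio):

import timeit

# Hypothetical extension list of a few hundred entries, like ext_list might hold.
ext_list = [f".ext{i}" for i in range(300)]
ext_set = set(ext_list)

probe = ".ext299"  # worst case for the list: the last element

list_time = timeit.timeit(lambda: probe in ext_list, number=100_000)
set_time = timeit.timeit(lambda: probe in ext_set, number=100_000)

# The set lookup is O(1) on average; the list lookup is O(n).
print(f"list membership: {list_time:.4f}s  set membership: {set_time:.4f}s")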

@@ -957,7 +971,7 @@ def remove_entry(self, entry_id: int) -> None:
         # Step [1/2]:
         # Remove this Entry from the Entries list.
         entry = self.get_entry(entry_id)
-        path = entry.path / entry.filename
+        path = self.library_dir / entry.path / entry.filename
         # logging.info(f'Removing path: {path}')

         del self.filename_to_entry_id_map[path]
@@ -1087,8 +1101,8 @@ def refresh_dupe_files(self, results_filepath: str | Path):
                 )
             )
         for match in matches:
-            file_1 = files[match[0]].relative_to(self.library_dir)
-            file_2 = files[match[1]].relative_to(self.library_dir)
+            file_1 = files[match[0]]
+            file_2 = files[match[1]]

             if (
                 file_1 in self.filename_to_entry_id_map.keys()
@@ -1289,8 +1303,7 @@ def add_new_files_as_entries(self) -> list[int]:
         """Adds files from the `files_not_in_library` list to the Library as Entries. Returns list of added indices."""
         new_ids: list[int] = []
         for file in self.files_not_in_library:
-            path = Path(file)
-            # print(os.path.split(file))
+            path = Path(*file.parts[len(self.library_dir.parts) :])
             entry = Entry(
                 id=self._next_entry_id, filename=path.name, path=path.parent, fields=[]
             )
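
Because `files_not_in_library` now holds absolute paths, the entry's stored path must be made library-relative again before the `Entry` is built. `Path(*file.parts[len(self.library_dir.parts):])` does that by dropping the leading components that belong to the library root; for files under the library it behaves like `relative_to`. A minimal sketch with made-up paths:

from pathlib import Path

library_dir = Path("/home/user/Photos")               # hypothetical library root
file = library_dir / "trips" / "2023" / "beach.jpg"   # absolute path from the scan

# Drop the components that belong to the library root.
path = Path(*file.parts[len(library_dir.parts):])
print(path)                                    # trips/2023/beach.jpg
print(path == file.relative_to(library_dir))   # True

# The Entry then stores the relative directory and the file name separately.
print(path.parent, path.name)                  # trips/2023 beach.jpg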

@@ -1301,8 +1314,6 @@
         self.files_not_in_library.clear()
         return new_ids

-        self.files_not_in_library.clear()
-
     def get_entry(self, entry_id: int) -> Entry:
         """Returns an Entry object given an Entry ID."""
         return self.entries[self._entry_id_to_index_map[int(entry_id)]]
@@ -1323,9 +1334,7 @@ def get_entry_id_from_filepath(self, filename: Path):
         """Returns an Entry ID given the full filepath it points to."""
         try:
             if self.entries:
-                return self.filename_to_entry_id_map[
-                    Path(filename).relative_to(self.library_dir)
-                ]
+                return self.filename_to_entry_id_map[filename]
         except KeyError:
             return -1

tagstudio/src/qt/modals/drop_import.py

Lines changed: 5 additions & 6 deletions
@@ -106,6 +106,7 @@ def copy_files(self):
                 continue

             dest_file = self.get_relative_path(file)
+            full_dest_path: Path = self.driver.lib.library_dir / dest_file

             if file in self.duplicate_files:
                 duplicated_files_progress += 1
@@ -115,14 +116,12 @@
                 if self.choice == 2:  # rename
                     new_name = self.get_renamed_duplicate_filename_in_lib(dest_file)
                     dest_file = dest_file.with_name(new_name)
-                    self.driver.lib.files_not_in_library.append(dest_file)
+                    self.driver.lib.files_not_in_library.append(full_dest_path)
                 else:  # override is simply copying but not adding a new entry
-                    self.driver.lib.files_not_in_library.append(dest_file)
+                    self.driver.lib.files_not_in_library.append(full_dest_path)

-            (self.driver.lib.library_dir / dest_file).parent.mkdir(
-                parents=True, exist_ok=True
-            )
-            shutil.copyfile(file, self.driver.lib.library_dir / dest_file)
+            (full_dest_path).parent.mkdir(parents=True, exist_ok=True)
+            shutil.copyfile(file, full_dest_path)

             fileCount += 1
             yield [fileCount, duplicated_files_progress]
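
The `drop_import.py` change is the same idea seen from the caller's side: compute the full destination path once, then reuse it for the not-in-library list, the directory creation, and the copy. A minimal standalone sketch of that pattern (the function, parameters, and paths are illustrative, not TagStudio's API):

import shutil
from pathlib import Path

def import_file(src: Path, library_dir: Path, rel_dest: Path, pending: list[Path]) -> None:
    """Copy src into the library and remember its full destination path."""
    full_dest_path = library_dir / rel_dest    # computed once, reused below

    pending.append(full_dest_path)             # picked up later as a new library file
    full_dest_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(src, full_dest_path)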
