Duplicate Entry Handling (Fixes #179) #204


Merged · 8 commits · May 29, 2024
146 changes: 65 additions & 81 deletions tagstudio/src/core/library.py
@@ -5,23 +5,18 @@
"""The Library object and related methods for TagStudio."""

import datetime
import glob
import logging
import os
import sys
import time
import traceback
import typing
import xml.etree.ElementTree as ET
import ujson

from enum import Enum
from pathlib import Path
from typing import cast, Generator

from typing_extensions import Self


from src.core.json_typing import JsonCollation, JsonEntry, JsonLibary, JsonTag
from src.core.utils.str import strip_punctuation
from src.core.utils.web import strip_web_protocol
@@ -89,7 +84,7 @@ def __repr__(self) -> str:
return self.__str__()

def __eq__(self, __value: object) -> bool:
Contributor

At this point there's no type check in this method. I'm not sure how that's better than casting the variable, but if you want to disable type checking, you can add this decorator to the method declaration rather than adding an ignore on each line with an error.

Suggested change
- def __eq__(self, __value: object) -> bool:
+ @typing.no_type_check
+ def __eq__(self, __value: object) -> bool:

Member Author

Thanks for the info on the @typing.no_type_check decorator, I'll definitely use that if needed going forward 👍

The casting itself was causing me the following error when invoking the __eq__ method:

Traceback (most recent call last):
  File "F:\GitHub\TagStudio/tagstudio\src\qt\helpers\custom_runnable.py", line 18, in run
    self.function()
  File "F:\GitHub\TagStudio/tagstudio\src\qt\modals\merge_dupe_entries.py", line 44, in <lambda>
    r = CustomRunnable(lambda: iterator.run())
                               ^^^^^^^^^^^^^^
  File "F:\GitHub\TagStudio/tagstudio\src\qt\helpers\function_iterator.py", line 20, in run
    for i in self.iterable():
  File "F:\GitHub\TagStudio/tagstudio\src\core\library.py", line 1043, in merge_dupe_entries
    self.entries.remove(id_to_entry_map[id])
  File "F:\GitHub\TagStudio/tagstudio\src\core\library.py", line 92, in __eq__
    int(self.id) == int(__value.id)
                        ^^^^^^^^^^
AttributeError: type object 'object' has no attribute 'id'

I can reproduce the error by comparing any two Entry objects:

entry_1 = Entry(1, "file_2", "path_2", [])
entry_2 = Entry(2, "file_2", "path_2", [])
logging.info(entry_1 == entry_2)

It looks like the object cast is just stripping the Entry's attributes down and turning it back into a bare object, right? I'm not sure how the cast is intended to function here.

Contributor

Right, my bad. It should be this instead:

__value = cast(Self, __value)

- __value = cast(Self, object)
+ __value = cast(Self, __value)
return (
int(self.id) == int(__value.id)
and self.filename == __value.filename
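For context on why the original cast blew up: typing.cast is a runtime no-op that simply returns its second argument, so cast(Self, object) evaluates to the builtin object class itself, which is exactly what the traceback shows. A minimal sketch with a hypothetical stripped-down Entry stub:

from typing import cast

# Hypothetical stripped-down stub; the real Entry has more fields.
class Entry:
    def __init__(self, id: int) -> None:
        self.id = id

e = Entry(1)
ok = cast(Entry, e)        # no-op at runtime: `ok` is the same object as `e`
bad = cast(Entry, object)  # also a no-op: `bad` is now the builtin *class* `object`

print(ok.id)  # 1
try:
    print(bad.id)
except AttributeError as err:
    print(err)  # type object 'object' has no attribute 'id'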
@@ -343,7 +338,6 @@ def __init__(self) -> None:
# Maps the filenames of entries in the Library to their entry's index in the self.entries list.
# Used for O(1) lookup of a file based on the current index (page number - 1) of the image being looked at.
# That filename can then be used to provide quick lookup to image metadata entries in the Library.

self.filename_to_entry_id_map: dict[Path, int] = {}
# A list of file extensions to be ignored by TagStudio.
self.default_ext_blacklist: list = ["json", "xmp", "aae"]
@@ -420,7 +414,7 @@ def __init__(self) -> None:
{"id": 30, "name": "Comments", "type": "text_box"},
]

- def create_library(self, path) -> int:
+ def create_library(self, path: Path) -> int:
"""
Creates a TagStudio library in the given directory.\n
Return Codes:\n
@@ -657,6 +651,7 @@ def open_library(self, path: str | Path) -> int:
)
self.entries.append(e)
self._map_entry_id_to_index(e, -1)

end_time = time.time()
logging.info(
f"[LIBRARY] Entries loaded in {(end_time - start_time):.3f} seconds"
@@ -946,42 +941,34 @@ def refresh_dupe_entries(self):
`dupe_entries = tuple(int, list[int])`
"""

# self.dupe_entries.clear()
# known_files: set = set()
# for entry in self.entries:
# full_path = os.path.normpath(f'{self.library_dir}/{entry.path}/{entry.filename}')
# if full_path in known_files:
# self.dupe_entries.append(full_path)
# else:
# known_files.add(full_path)

self.dupe_entries.clear()
checked = set()
remaining: list[Entry] = list(self.entries)
for p, entry_p in enumerate(self.entries, start=0):
if p not in checked:
matched: list[int] = []
for c, entry_c in enumerate(remaining, start=0):
if (
entry_p.path == entry_c.path
and entry_p.filename == entry_c.filename
and c != p
):
matched.append(c)
checked.add(c)
if matched:
self.dupe_entries.append((p, matched))
sys.stdout.write(
f"\r[LIBRARY] Entry [{p}/{len(self.entries)-1}]: Has Duplicate(s): {matched}"
)
sys.stdout.flush()
else:
sys.stdout.write(
f"\r[LIBRARY] Entry [{p}/{len(self.entries)-1}]: Has No Duplicates"
)
sys.stdout.flush()
checked.add(p)
print("")
registered: dict = {}  # Path -> list[int] of entry IDs

# Registered: filename : list[ALL entry IDs pointing to this filename]
# Dupe Entries: primary ID : list of [every OTHER entry ID pointing]

for i, e in enumerate(self.entries):
file: Path = Path() / e.path / e.filename
# If this unique filepath has not been marked as checked,
if not registered.get(file, None):
# Register the filepath as having been checked, and include
# its entry ID as the first entry in the corresponding list.
registered[file] = [e.id]
# Else, if the filepath has already been seen in another entry,
else:
# Add this new entry ID to the list of entry ID(s) pointing to
# the same file.
registered[file].append(e.id)
yield i - 1 # The -1 waits for the next step to finish

for k, v in registered.items():
if len(v) > 1:
self.dupe_entries.append((v[0], v[1:]))
# logging.info(f"DUPLICATE FOUND: {(v[0], v[1:])}")
# for id in v:
# logging.info(f"\t{(Path()/self.get_entry(id).path/self.get_entry(id).filename)}")

yield len(self.entries)
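To make the two shapes concrete (IDs hypothetical): if entries 3, 7, and 9 all resolve to the same file, registered maps that path to all three IDs, and the resulting dupe_entries item keeps the first ID as the primary with the rest as its duplicates. A minimal sketch:

from pathlib import Path

# Hypothetical: three entry IDs that resolved to the same file on disk.
registered: dict[Path, list[int]] = {Path("photos/cat.jpg"): [3, 7, 9]}

dupe_entries = [(ids[0], ids[1:]) for ids in registered.values() if len(ids) > 1]
print(dupe_entries)  # [(3, [7, 9])]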

def merge_dupe_entries(self):
"""
@@ -991,35 +978,36 @@ def merge_dupe_entries(self):
`dupe_entries = tuple(int, list[int])`
"""

print("[LIBRARY] Mirroring Duplicate Entries...")
logging.info("[LIBRARY] Mirroring Duplicate Entries...")
id_to_entry_map: dict = {}

for dupe in self.dupe_entries:
# Store the id to entry relationship as the library one is about to
# be destroyed.
# NOTE: This is not a good solution, but will be upended by the
# database migration soon anyways.
for id in dupe[1]:
id_to_entry_map[id] = self.get_entry(id)
self.mirror_entry_fields([dupe[0]] + dupe[1])

# print('Consolidating Entries...')
# for dupe in self.dupe_entries:
# for index in dupe[1]:
# print(f'Consolidating Duplicate: {(self.entries[index].path + os.pathsep + self.entries[index].filename)}')
# self.entries.remove(self.entries[index])
# self._map_filenames_to_entry_indices()

- print(
+ logging.info(
"[LIBRARY] Consolidating Entries... (This may take a while for larger libraries)"
)
unique: list[Entry] = []
for i, e in enumerate(self.entries):
if e not in unique:
unique.append(e)
# print(f'[{i}/{len(self.entries)}] Appending: {(e.path + os.pathsep + e.filename)[0:32]}...')
sys.stdout.write(
f"\r[LIBRARY] [{i}/{len(self.entries)}] Appending Unique Entry..."
)
else:
sys.stdout.write(
f"\r[LIBRARY] [{i}/{len(self.entries)}] Consolidating Duplicate: {e.path / e.filename}..."
)
print("")
# [unique.append(x) for x in self.entries if x not in unique]
self.entries = unique
for i, dupe in enumerate(self.dupe_entries):
for id in dupe[1]:
# NOTE: Instead of using self.remove_entry(id), I'm bypassing it
# because it's currently inefficient in how it needs to remap
# every ID to every list index. I'm recreating the steps it
# takes but in a batch-friendly way here.
# NOTE: Couldn't use get_entry(id) because that relies on the
# entry's index in the list, which is currently being messed up.
logging.info(f"[LIBRARY] Removing Unneeded Entry {id}")
self.entries.remove(id_to_entry_map[id])
yield i - 1 # The -1 waits for the next step to finish

self._entry_id_to_index_map.clear()
for i, e in enumerate(self.entries, start=0):
self._map_entry_id_to_index(e, i)
self._map_filenames_to_entry_ids()
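Note that both refresh_dupe_entries and merge_dupe_entries are now generators that yield progress indices, so a caller has to exhaust them for any work to happen. A minimal sketch of driving them directly, without the Qt progress machinery (assuming lib is an opened Library instance):

# Hypothetical driver; `lib` is assumed to be an opened Library.
for step in lib.refresh_dupe_entries():
    print(f"\rScanning entries... {step}", end="", flush=True)

if lib.dupe_entries:
    for step in lib.merge_dupe_entries():
        print(f"\rMerging duplicates... {step}", end="", flush=True)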

def refresh_dupe_files(self, results_filepath: str | Path):
@@ -1028,6 +1016,7 @@ def refresh_dupe_files(self, results_filepath: str | Path):
A duplicate file is defined as an identical or near-identical file as determined
by a DupeGuru results file.
"""

full_results_path: Path = Path(results_filepath)
if self.library_dir not in full_results_path.parents:
full_results_path = self.library_dir / full_results_path
@@ -1066,8 +1055,6 @@
self.dupe_files.append(
(files[match[0]], files[match[1]], match[2])
)
# self.dupe_files.append((files[match[0]], files[match[1]], match[2]))

print("")

for dupe in self.dupe_files:
@@ -1143,19 +1130,16 @@ def fix_missing_files(self):
print(f"[LIBRARY] Fixed {self.get_entry(id).filename}")
# (int, str)

# Consolidate new matches with existing unlinked entries.
self.refresh_dupe_entries()
if self.dupe_entries:
self.merge_dupe_entries()

# Remap filenames to entry IDs.
self._map_filenames_to_entry_ids()
# TODO: the type here doesn't match, but I can't reproduce calling this
self.remove_missing_matches(fixed_indices)

# for i in fixed_indices:
# # print(json_dump[i])
# del self.missing_matches[i]

# with open(matched_json_filepath, "w") as outfile:
# outfile.flush()
# json.dump({}, outfile, indent=4)
# print(f'Re-saved to disk at {matched_json_filepath}')

def _match_missing_file(self, file: str) -> list[Path]:
"""
Tries to find missing entry files within the library directory.
@@ -1295,7 +1279,7 @@ def get_entry_from_index(self, index: int) -> Entry | None:
return None

# @deprecated('Use new Entry ID system.')
- def get_entry_id_from_filepath(self, filename):
+ def get_entry_id_from_filepath(self, filename: Path):
"""Returns an Entry ID given the full filepath it points to."""
try:
if self.entries:
36 changes: 1 addition & 35 deletions tagstudio/src/qt/modals/delete_unlinked.py
@@ -32,7 +32,7 @@ def __init__(self, library: "Library", driver: "QtDriver"):
super().__init__()
self.lib = library
self.driver = driver
- self.setWindowTitle(f"Delete Unlinked Entries")
+ self.setWindowTitle("Delete Unlinked Entries")
self.setWindowModality(Qt.WindowModality.ApplicationModal)
self.setMinimumSize(500, 400)
self.root_layout = QVBoxLayout(self)
@@ -81,20 +81,6 @@ def refresh_list(self):
self.model.appendRow(QStandardItem(str(i)))

def delete_entries(self):
# pb = QProgressDialog('', None, 0, len(self.lib.missing_files))
# # pb.setMaximum(len(self.lib.missing_files))
# pb.setFixedSize(432, 112)
# pb.setWindowFlags(pb.windowFlags() & ~Qt.WindowType.WindowCloseButtonHint)
# pb.setWindowTitle('Deleting Entries')
# pb.setWindowModality(Qt.WindowModality.ApplicationModal)
# pb.show()

# r = CustomRunnable(lambda: self.lib.ref(pb))
# r.done.connect(lambda: self.done.emit())
# # r.done.connect(lambda: self.model.clear())
# QThreadPool.globalInstance().start(r)
# # r.run()

iterator = FunctionIterator(self.lib.remove_missing_files)

pw = ProgressWidget(
@@ -119,23 +105,3 @@ def delete_entries(self):
r = CustomRunnable(lambda: iterator.run())
QThreadPool.globalInstance().start(r)
r.done.connect(lambda: (pw.hide(), pw.deleteLater(), self.done.emit()))
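For readers unfamiliar with the pattern above: the yielding library method is handed to a FunctionIterator, which is run off the main thread via CustomRunnable. A plain-Python stand-in with hypothetical names, sketching the shape of that contract:

from typing import Callable, Generator

def run_iterator(fn: Callable[[], Generator[int, None, None]],
                 on_progress: Callable[[int], None]) -> None:
    # Stand-in for FunctionIterator.run(): pull each yielded progress
    # index from the generator and forward it to a callback.
    for value in fn():
        on_progress(value)

# Hypothetical usage mirroring delete_entries() above:
# run_iterator(lib.remove_missing_files, progress_widget.setValue)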

# def delete_entries_runnable(self):
# deleted = []
# for i, missing in enumerate(self.lib.missing_files):
# # pb.setValue(i)
# # pb.setLabelText(f'Deleting {i}/{len(self.lib.missing_files)} Unlinked Entries')
# try:
# id = self.lib.get_entry_id_from_filepath(missing)
# logging.info(f'Removing Entry ID {id}:\n\t{missing}')
# self.lib.remove_entry(id)
# self.driver.purge_item_from_navigation(ItemType.ENTRY, id)
# deleted.append(missing)
# except KeyError:
# logging.info(
# f'{ERROR} \"{id}\" was reported as missing, but is not in the file_to_entry_id map.')
# yield i
# for d in deleted:
# self.lib.missing_files.remove(d)
# # self.driver.filter_items('')
# # self.done.emit()