Skip to content

feat(parity): migrate json libraries to sqlite #604

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 38 commits into from
Nov 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
c0a01fd
feat(ui): add PagedPanel widget
CyanVoxel Nov 14, 2024
f766223
feat(ui): add MigrationModal widget
CyanVoxel Nov 14, 2024
c13904c
feat: add basic json to sql conversion
CyanVoxel Nov 17, 2024
1888a20
fix: choose `poolclass` based on file or memory db
CyanVoxel Nov 17, 2024
45f24f8
feat: migrate tag colors from json to sql
CyanVoxel Nov 18, 2024
29d4eea
feat: migrate entry fields from json to sql
CyanVoxel Nov 20, 2024
4b8bd32
set default `is_new` case
CyanVoxel Nov 21, 2024
ecabba9
fix: limit correct tag query
CyanVoxel Nov 21, 2024
186c90f
feat: migrate tag aliases and subtags from json to sql
CyanVoxel Nov 21, 2024
7e82534
add migration timer
CyanVoxel Nov 21, 2024
a810c70
fix(tests): fix broken tests
CyanVoxel Nov 23, 2024
2159caf
rename methods, add docstrings
CyanVoxel Nov 23, 2024
86098ba
revert tag id search, split tag name search
CyanVoxel Nov 24, 2024
0ffa4b5
fix: use correct type in sidecar macro
CyanVoxel Nov 24, 2024
3df8cfd
tests: add json migration tests
CyanVoxel Nov 24, 2024
6103e15
fix: drop leading dot from json extensions
CyanVoxel Nov 25, 2024
642c856
add special characters to json db test
CyanVoxel Nov 25, 2024
a8bfc9d
tests: add file path and entry field parity checks
CyanVoxel Nov 26, 2024
a40c6fe
fix(ui): tag manager no longer starts empty
CyanVoxel Nov 26, 2024
4d01baf
fix: read old windows paths as posix
CyanVoxel Nov 26, 2024
19daa68
tests: add posix + windows paths to json library
CyanVoxel Nov 26, 2024
d37d408
tests: add subtag, alias, and shorthand parity tests
CyanVoxel Nov 26, 2024
7a2131b
tests: ensure no none values in parity checks
CyanVoxel Nov 26, 2024
5d60797
tests: add tag color test, use tag id in tag tests
CyanVoxel Nov 27, 2024
001019f
tests: fix and optimize tests
CyanVoxel Nov 27, 2024
4cfe23d
tests: add discrepancy tracker
CyanVoxel Nov 27, 2024
4ac893d
refactor: reduce duplicate UI code
CyanVoxel Nov 27, 2024
f81be7a
fix: load non-sequential entry ids
CyanVoxel Nov 27, 2024
bde75ab
fix(ui): sort tags in the preview panel
CyanVoxel Nov 27, 2024
c9d585f
tests(fix): prioritize `None` check over equality
CyanVoxel Nov 28, 2024
6879b84
fix(tests): fix multi "same tag field type" tests
CyanVoxel Nov 29, 2024
2c52057
ui: increase height of migration modal
CyanVoxel Nov 29, 2024
21a0068
feat: add progress bar to migration ui
CyanVoxel Nov 30, 2024
f37632c
fix(ui): sql values update earlier
CyanVoxel Nov 30, 2024
e2a81e1
refactor: use `get_color_from_str` in test
CyanVoxel Nov 30, 2024
6651085
refactor: migrate tags before aliases and subtags
CyanVoxel Nov 30, 2024
29b8585
remove unused assertion
CyanVoxel Nov 30, 2024
4ec0b8c
refactor: use `json_migration_req` flag
CyanVoxel Nov 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ coverage.xml
.hypothesis/
.pytest_cache/
cover/
tagstudio/tests/fixtures/library/*

# Translations
*.mo
Expand Down Expand Up @@ -248,11 +247,14 @@ compile_commands.json
# Ignore all local history of files
.history
.ionide
# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python,qt

# TagStudio
.TagStudio
!*/tests/**/.TagStudio
tagstudio/tests/fixtures/library/*
tagstudio/tests/fixtures/json_library/.TagStudio/*.sqlite
TagStudio.ini
# End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,python,qt

.envrc
.direnv
Expand Down
7 changes: 7 additions & 0 deletions tagstudio/src/core/library/alchemy/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,13 @@ class TagColor(enum.IntEnum):
COOL_GRAY = 36
OLIVE = 37

@staticmethod
def get_color_from_str(color_name: str) -> "TagColor":
    """Map a legacy JSON color-name string to a TagColor member.

    Normalizes the name by uppercasing and replacing spaces with
    underscores (e.g. "cool gray" -> COOL_GRAY), then looks the member
    up by name instead of scanning every enum value.

    Args:
        color_name: Human-readable color name from the JSON library.

    Returns:
        The matching TagColor, or TagColor.DEFAULT for unknown names.
    """
    try:
        return TagColor[color_name.upper().replace(" ", "_")]
    except KeyError:
        return TagColor.DEFAULT


class SearchMode(enum.IntEnum):
"""Operational modes for item searching."""
Expand Down
201 changes: 171 additions & 30 deletions tagstudio/src/core/library/alchemy/library.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import re
import shutil
import time
import unicodedata
from dataclasses import dataclass
from datetime import UTC, datetime
Expand All @@ -9,9 +10,11 @@
from uuid import uuid4

import structlog
from humanfriendly import format_timespan
from sqlalchemy import (
URL,
Engine,
NullPool,
and_,
create_engine,
delete,
Expand All @@ -29,6 +32,7 @@
make_transient,
selectinload,
)
from src.core.library.json.library import Library as JsonLibrary # type: ignore

from ...constants import (
BACKUP_FOLDER_NAME,
Expand Down Expand Up @@ -122,6 +126,7 @@ class LibraryStatus:
success: bool
library_path: Path | None = None
message: str | None = None
json_migration_req: bool = False


class Library:
Expand All @@ -132,7 +137,8 @@ class Library:
engine: Engine | None
folder: Folder | None

FILENAME: str = "ts_library.sqlite"
SQL_FILENAME: str = "ts_library.sqlite"
JSON_FILENAME: str = "ts_library.json"

def close(self):
if self.engine:
Expand All @@ -141,32 +147,119 @@ def close(self):
self.storage_path = None
self.folder = None

def migrate_json_to_sqlite(self, json_lib: JsonLibrary):
"""Migrate JSON library data to the SQLite database."""
logger.info("Starting Library Conversion...")
start_time = time.time()
folder: Folder = Folder(path=self.library_dir, uuid=str(uuid4()))

# Tags
for tag in json_lib.tags:
self.add_tag(
Tag(
id=tag.id,
name=tag.name,
shorthand=tag.shorthand,
color=TagColor.get_color_from_str(tag.color),
)
)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems a bit weird to me, why are the sub tags and aliases added before the tags that their foreign keys point to?

I haven't tested this, but I suspect that, because every operation in those loops is its own DB Transaction, should TS crash after the Aliases, but before the Tags have been added, that there would be database inconsistencies stemming from foreign keys in the Aliases and Subtags pointing to tags that don't exist. This might not matter if the migration just restarts after such a crash (or killing by the user), but seems like it is just asking for problems.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might've been some negative reinforcement during testing that got me to believe that this was needed, but swapping it to do the tags first seems to work just fine (and logically makes more sense).

While the user shouldn't have the option to complete the migration without the full DB migrated, it's still a great observation.


# Tag Aliases
for tag in json_lib.tags:
for alias in tag.aliases:
self.add_alias(name=alias, tag_id=tag.id)

# Tag Subtags
for tag in json_lib.tags:
for subtag_id in tag.subtag_ids:
self.add_subtag(parent_id=tag.id, child_id=subtag_id)

# Entries
self.add_entries(
[
Entry(
path=entry.path / entry.filename,
folder=folder,
fields=[],
id=entry.id + 1, # JSON IDs start at 0 instead of 1
)
for entry in json_lib.entries
]
)
for entry in json_lib.entries:
for field in entry.fields:
for k, v in field.items():
self.add_entry_field_type(
entry_ids=(entry.id + 1), # JSON IDs start at 0 instead of 1
field_id=self.get_field_name_from_id(k),
value=v,
)

# Preferences
self.set_prefs(LibraryPrefs.EXTENSION_LIST, [x.strip(".") for x in json_lib.ext_list])
self.set_prefs(LibraryPrefs.IS_EXCLUDE_LIST, json_lib.is_exclude_list)

end_time = time.time()
logger.info(f"Library Converted! ({format_timespan(end_time-start_time)})")

def get_field_name_from_id(self, field_id: int) -> "_FieldID | None":
    """Resolve a legacy JSON field id to its _FieldID enum member.

    Args:
        field_id: Numeric field id stored in the old JSON library.

    Returns:
        The matching _FieldID, or None when no member has that id.
        (Original annotation claimed a bare _FieldID but returned None
        on a miss; the annotation now reflects the actual contract.)
    """
    return next((f for f in _FieldID if f.value.id == field_id), None)

def open_library(self, library_dir: Path, storage_path: str | None = None) -> LibraryStatus:
is_new: bool = True
if storage_path == ":memory:":
self.storage_path = storage_path
is_new = True
return self.open_sqlite_library(library_dir, is_new)
else:
self.verify_ts_folders(library_dir)
self.storage_path = library_dir / TS_FOLDER_NAME / self.FILENAME
is_new = not self.storage_path.exists()
self.storage_path = library_dir / TS_FOLDER_NAME / self.SQL_FILENAME

if self.verify_ts_folder(library_dir) and (is_new := not self.storage_path.exists()):
json_path = library_dir / TS_FOLDER_NAME / self.JSON_FILENAME
if json_path.exists():
return LibraryStatus(
success=False,
library_path=library_dir,
message="[JSON] Legacy v9.4 library requires conversion to v9.5+",
json_migration_req=True,
)

return self.open_sqlite_library(library_dir, is_new)

def open_sqlite_library(
self, library_dir: Path, is_new: bool, add_default_data: bool = True
) -> LibraryStatus:
connection_string = URL.create(
drivername="sqlite",
database=str(self.storage_path),
)
# NOTE: File-based databases should use NullPool to create new DB connection in order to
# keep connections on separate threads, which prevents the DB files from being locked
# even after a connection has been closed.
# SingletonThreadPool (the default for :memory:) should still be used for in-memory DBs.
# More info can be found on the SQLAlchemy docs:
# https://docs.sqlalchemy.org/en/20/changelog/migration_07.html
# Under -> sqlite-the-sqlite-dialect-now-uses-nullpool-for-file-based-databases
poolclass = None if self.storage_path == ":memory:" else NullPool

logger.info("opening library", library_dir=library_dir, connection_string=connection_string)
self.engine = create_engine(connection_string)
logger.info(
"Opening SQLite Library", library_dir=library_dir, connection_string=connection_string
)
self.engine = create_engine(connection_string, poolclass=poolclass)
with Session(self.engine) as session:
make_tables(self.engine)

tags = get_default_tags()
try:
session.add_all(tags)
session.commit()
except IntegrityError:
# default tags may exist already
session.rollback()
if add_default_data:
tags = get_default_tags()
try:
session.add_all(tags)
session.commit()
except IntegrityError:
# default tags may exist already
session.rollback()

# dont check db version when creating new library
if not is_new:
Expand Down Expand Up @@ -217,7 +310,6 @@ def open_library(self, library_dir: Path, storage_path: str | None = None) -> Li
db_version=db_version.value,
expected=LibraryPrefs.DB_VERSION.default,
)
# TODO - handle migration
return LibraryStatus(
success=False,
message=(
Expand Down Expand Up @@ -352,8 +444,12 @@ def tags(self) -> list[Tag]:

return list(tags_list)

def verify_ts_folders(self, library_dir: Path) -> None:
"""Verify/create folders required by TagStudio."""
def verify_ts_folder(self, library_dir: Path) -> bool:
"""Verify/create folders required by TagStudio.

Returns:
bool: True if path exists, False if it needed to be created.
"""
if library_dir is None:
raise ValueError("No path set.")

Expand All @@ -364,6 +460,8 @@ def verify_ts_folders(self, library_dir: Path) -> None:
if not full_ts_path.exists():
logger.info("creating library directory", dir=full_ts_path)
full_ts_path.mkdir(parents=True, exist_ok=True)
return False
return True

def add_entries(self, items: list[Entry]) -> list[int]:
"""Add multiple Entry records to the Library."""
Expand Down Expand Up @@ -505,21 +603,23 @@ def search_library(

def search_tags(
self,
search: FilterState,
name: str,
) -> list[Tag]:
"""Return a list of Tag records matching the query."""
tag_limit = 100

with Session(self.engine) as session:
query = select(Tag)
query = query.options(
selectinload(Tag.subtags),
selectinload(Tag.aliases),
)
).limit(tag_limit)

if search.tag:
if name:
query = query.where(
or_(
Tag.name.icontains(search.tag),
Tag.shorthand.icontains(search.tag),
Tag.name.icontains(name),
Tag.shorthand.icontains(name),
)
)

Expand All @@ -529,7 +629,7 @@ def search_tags(

logger.info(
"searching tags",
search=search,
search=name,
statement=str(query),
results=len(res),
)
Expand Down Expand Up @@ -692,7 +792,7 @@ def add_entry_field_type(
*,
field: ValueType | None = None,
field_id: _FieldID | str | None = None,
value: str | datetime | list[str] | None = None,
value: str | datetime | list[int] | None = None,
) -> bool:
logger.info(
"add_field_to_entry",
Expand Down Expand Up @@ -725,8 +825,11 @@ def add_entry_field_type(

if value:
assert isinstance(value, list)
for tag in value:
field_model.tags.add(Tag(name=tag))
with Session(self.engine) as session:
for tag_id in list(set(value)):
tag = session.scalar(select(Tag).where(Tag.id == tag_id))
field_model.tags.add(tag)
session.flush()

elif field.type == FieldTypeEnum.DATETIME:
field_model = DatetimeField(
Expand Down Expand Up @@ -758,6 +861,28 @@ def add_entry_field_type(
)
return True

def tag_from_strings(self, strings: list[str] | str) -> list[int]:
    """Get-or-create Tags by name and return their database ids.

    Accepts a single name or a list of names. Existing tags are looked
    up by exact name; missing ones are created.

    Args:
        strings: One tag name, or a list of tag names.

    Returns:
        The Tag ids corresponding to each resolved/created name.
    """
    # TODO: Port over tag searching with aliases fallbacks
    # and context clue ranking for string searches.
    if isinstance(strings, str):
        strings = [strings]

    tag_ids: list[int] = []
    with Session(self.engine) as session:
        for name in strings:
            tag = session.scalar(select(Tag).where(Tag.name == name))
            if tag is None:
                # BUG FIX: Session.add() returns None, so the original
                # `new = session.add(...)` / `if new:` branch never
                # recorded ids of newly created tags. Add the tag and
                # flush so the primary key is assigned before reading it.
                tag = Tag(name=name)
                session.add(tag)
                session.flush()
            tag_ids.append(tag.id)
        session.commit()
    return tag_ids

def add_tag(
self,
tag: Tag,
Expand Down Expand Up @@ -850,7 +975,7 @@ def save_library_backup_to_disk(self) -> Path:
target_path = self.library_dir / TS_FOLDER_NAME / BACKUP_FOLDER_NAME / filename

shutil.copy2(
self.library_dir / TS_FOLDER_NAME / self.FILENAME,
self.library_dir / TS_FOLDER_NAME / self.SQL_FILENAME,
target_path,
)

Expand All @@ -877,15 +1002,15 @@ def get_alias(self, tag_id: int, alias_id: int) -> TagAlias:

return alias

def add_subtag(self, base_id: int, new_tag_id: int) -> bool:
if base_id == new_tag_id:
def add_subtag(self, parent_id: int, child_id: int) -> bool:
if parent_id == child_id:
return False

# open session and save as parent tag
with Session(self.engine) as session:
subtag = TagSubtag(
parent_id=base_id,
child_id=new_tag_id,
parent_id=parent_id,
child_id=child_id,
)

try:
Expand All @@ -897,6 +1022,22 @@ def add_subtag(self, base_id: int, new_tag_id: int) -> bool:
logger.exception("IntegrityError")
return False

def add_alias(self, name: str, tag_id: int) -> bool:
    """Attach a new alias `name` to the tag identified by `tag_id`.

    Args:
        name: The alias text to add.
        tag_id: Id of the tag the alias belongs to.

    Returns:
        True when the alias row was committed; False when the insert
        violated a database constraint (logged, transaction rolled back).
    """
    with Session(self.engine) as session:
        try:
            session.add(TagAlias(name=name, tag_id=tag_id))
            session.commit()
        except IntegrityError:
            session.rollback()
            logger.exception("IntegrityError")
            return False
    return True

def remove_subtag(self, base_id: int, remove_tag_id: int) -> bool:
with Session(self.engine) as session:
p_id = base_id
Expand Down
Loading