Skip to content

feat: add .ts_ignore pattern ignoring system #897

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions nix/package/default.nix
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
ujson,
vtf2img,
wrapGAppsHook,
wcmatch,

withJXLSupport ? false,
}:
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ dependencies = [
"vtf2img==0.1.0",
"toml==0.10.2",
"pydantic==2.9.2",
"wcmatch==10.*",
]

[project.optional-dependencies]
Expand Down
1 change: 1 addition & 0 deletions src/tagstudio/core/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
TS_FOLDER_NAME: str = ".TagStudio"
BACKUP_FOLDER_NAME: str = "backups"
COLLAGE_FOLDER_NAME: str = "collages"
IGNORE_NAME: str = ".ts_ignore"
THUMB_CACHE_NAME: str = "thumbs"

FONT_SAMPLE_TEXT: str = (
Expand Down
32 changes: 31 additions & 1 deletion src/tagstudio/core/library/alchemy/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
ValueType,
)
from tagstudio.core.library.alchemy.visitors import SQLBoolExpressionBuilder
from tagstudio.core.library.ignore import Ignore
from tagstudio.core.library.json.library import Library as JsonLibrary
from tagstudio.qt.translations import Translations

Expand All @@ -92,6 +93,7 @@

logger = structlog.get_logger(__name__)


TAG_CHILDREN_QUERY = text("""
-- Note for this entire query that tag_parents.child_id is the parent id and tag_parents.parent_id is the child id due to bad naming
WITH RECURSIVE ChildTags AS (
Expand Down Expand Up @@ -865,6 +867,7 @@ def search_library(
"""
assert isinstance(search, FilterState)
assert self.engine
assert self.library_dir

with Session(self.engine, expire_on_commit=False) as session:
statement = select(Entry)
Expand All @@ -877,6 +880,7 @@ def search_library(
f"SQL Expression Builder finished ({format_timespan(end_time - start_time)})"
)

# TODO: Convert old extension lists to new .ts_ignore format
extensions = self.prefs(LibraryPrefs.EXTENSION_LIST)
is_exclude_list = self.prefs(LibraryPrefs.IS_EXCLUDE_LIST)

Expand All @@ -886,11 +890,37 @@ def search_library(
statement = statement.where(Entry.suffix.in_(extensions))

statement = statement.distinct(Entry.id)
ignore_patterns: list[str] = Ignore.get_patterns(self.library_dir)

# Add glob pattern filters with exclusion patterns allowing for overrides.
statement = statement.filter(
and_(
or_(
or_(
*[
Entry.path.op("GLOB")(p.lstrip("!"))
for p in ignore_patterns
if p.startswith("!")
]
),
and_(
*[
Entry.path.op("NOT GLOB")(p)
for p in ignore_patterns
if not p.startswith("!")
]
),
)
)
)

# TODO: This query will become unnecessary once this method returns unlimited IDs and
# it becomes the frontend's responsibility (once again) to split and display them.
start_time = time.time()
query_count = select(func.count()).select_from(statement.alias("entries"))
count_all: int = session.execute(query_count).scalar() or 0
end_time = time.time()
logger.info(f"finished counting ({format_timespan(end_time - start_time)})")
logger.info(f"[Library] Finished counting ({format_timespan(end_time - start_time)})")

sort_on: ColumnExpressionArgument = Entry.id
match search.sorting_mode:
Expand Down
149 changes: 149 additions & 0 deletions src/tagstudio/core/library/ignore.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
# Copyright (C) 2025 Travis Abendshien (CyanVoxel).
# Licensed under the GPL-3.0 License.
# Created for TagStudio: https://github.com/CyanVoxel/TagStudio

from copy import deepcopy
from pathlib import Path

import structlog
from wcmatch import glob, pathlib

from tagstudio.core.constants import IGNORE_NAME, TS_FOLDER_NAME
from tagstudio.core.singleton import Singleton

logger = structlog.get_logger()

PATH_GLOB_FLAGS = glob.GLOBSTARLONG | glob.DOTGLOB | glob.NEGATE | pathlib.MATCHBASE


def _ignore_to_glob(ignore_patterns: list[str]) -> list[str]:
    """Convert .gitignore-like patterns to explicit glob syntax.

    Every input pattern is kept as-is. In addition:

    * A pattern that does not start with "**/", "*/" or "/" gets three derived variants:
      one prefixed with "**/", that variant with a trailing "/**", "/*" or "/" removed,
      and that second variant with its leading "**/" (or "*/") removed again.
    * Every pattern (input or derived) that does not already end in "/**" gets a companion
      pattern ending in "/**".

    A leading "!" (exclusion marker) is carried over to every derived pattern.

    Args:
        ignore_patterns (list[str]): The .gitignore-like patterns to convert.

    Returns:
        list[str]: The expanded patterns with duplicates removed, in first-seen order so that
            the result is reproducible between runs.
    """
    # Work on a copy so the caller's list is never touched.
    source_patterns: list[str] = deepcopy(ignore_patterns)
    additional_patterns: list[str] = []

    # Mimic implicit .gitignore syntax behavior for the SQLite GLOB function.
    for pattern in source_patterns:
        # Temporarily remove any exclusion character before processing
        exclusion_char = "!" if pattern.startswith("!") else ""
        gp = pattern[1:] if exclusion_char else pattern

        if gp.startswith(("**/", "*/", "/")):
            continue

        # Create a version of a prefix-less pattern that starts with "**/"
        gp = "**/" + gp
        additional_patterns.append(exclusion_char + gp)

        gp = gp.removesuffix("/**").removesuffix("/*").removesuffix("/")
        additional_patterns.append(exclusion_char + gp)

        gp = gp.removeprefix("**/").removeprefix("*/")
        additional_patterns.append(exclusion_char + gp)

    base_patterns = source_patterns + additional_patterns

    # Add "/**" suffix to suffix-less patterns to match implicit .gitignore behavior.
    # Built as a separate list rather than appended to the list being iterated.
    suffixed_patterns = [
        pattern.removesuffix("/*").removesuffix("/") + "/**"
        for pattern in base_patterns
        if not pattern.endswith("/**")
    ]

    # dict.fromkeys() de-duplicates while keeping insertion order, unlike set().
    glob_patterns = list(dict.fromkeys(base_patterns + suffixed_patterns))

    logger.info("[Ignore]", glob_patterns=glob_patterns)
    return glob_patterns


# Patterns ignored in every library, already expanded to explicit glob syntax.
GLOBAL_IGNORE = _ignore_to_glob(
    [
        # --- TagStudio's own data folder ---
        TS_FOLDER_NAME,
        # --- Trash and recycle bin folders ---
        ".Trash",
        ".Trash-*",
        ".Trashes",
        "$RECYCLE.BIN",
        # --- Files and folders generated by the operating system ---
        "._*",
        ".DS_Store",
        ".fseventsd",
        ".Spotlight-V100",
        ".TemporaryItems",
        "System Volume Information",
    ]
)


class Ignore(metaclass=Singleton):
    """Class for processing and managing glob-like file ignore patterns.

    Patterns come from the global ignore set plus the library's optional .ts_ignore file.
    The processed result is cached and only rebuilt when the file's path, its modification
    time, or the requested ``include_global`` value changes.
    """

    # (path, mtime) of the .ts_ignore file the cached patterns were built from, if any.
    _last_loaded: tuple[Path, float] | None = None
    # The ``include_global`` value the cached patterns were built with, if any.
    _last_include_global: bool | None = None
    # Patterns produced by the most recent call to get_patterns().
    _patterns: list[str] = []

    @staticmethod
    def get_patterns(library_dir: Path, include_global: bool = True) -> list[str]:
        """Get the ignore patterns for the given library directory.

        Args:
            library_dir (Path): The path of the library to load patterns from.
            include_global (bool): Flag for including the global ignore set.
                In most scenarios, this should be True.

        Returns:
            list[str]: The global patterns (if requested) combined with the processed
                contents of the library's .ts_ignore file, if that file exists.
        """
        # Copy so the cached/returned list never aliases the module-level constant.
        patterns: list[str] = list(GLOBAL_IGNORE) if include_global else []
        ts_ignore_path = library_dir / TS_FOLDER_NAME / IGNORE_NAME

        # A single stat() replaces exists() + stat(), so the file cannot disappear in between.
        try:
            mtime = ts_ignore_path.stat().st_mtime
        except (FileNotFoundError, NotADirectoryError):
            logger.info(
                "[Ignore] No .ts_ignore file found",
                path=ts_ignore_path,
            )
            Ignore._last_loaded = None
            Ignore._last_include_global = None
            Ignore._patterns = patterns

            return Ignore._patterns

        # Process the .ts_ignore file if the previous result is non-existent or outdated.
        # The include_global flag is part of the cache key: the cached list was built with
        # or without the global set, so a different flag needs a rebuild.
        loaded = (ts_ignore_path, mtime)
        previous = Ignore._last_loaded
        last_mtime = previous[1] if previous else None
        if previous != loaded or Ignore._last_include_global != include_global:
            logger.info(
                "[Ignore] Processing the .ts_ignore file...",
                library=library_dir,
                last_mtime=last_mtime,
                new_mtime=mtime,
            )
            Ignore._patterns = _ignore_to_glob(patterns + Ignore._load_ignore_file(ts_ignore_path))
        else:
            logger.info(
                "[Ignore] No updates to the .ts_ignore detected",
                library=library_dir,
                last_mtime=last_mtime,
                new_mtime=mtime,
            )
        Ignore._last_loaded = loaded
        Ignore._last_include_global = include_global

        return Ignore._patterns

    @staticmethod
    def _load_ignore_file(path: Path) -> list[str]:
        """Load and process the .ts_ignore file into a list of glob patterns.

        Args:
            path (Path): The path of the .ts_ignore file.

        Returns:
            list[str]: The stripped, non-blank, non-comment lines of the file, or an empty
                list if the file does not exist.
        """
        try:
            with open(path, encoding="utf8") as f:
                stripped_lines = (line_raw.strip() for line_raw in f)
                # Ignore blank lines and comments
                return [line for line in stripped_lines if line and not line.startswith("#")]
        except (FileNotFoundError, NotADirectoryError):
            return []
24 changes: 13 additions & 11 deletions src/tagstudio/core/utils/missing_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
from pathlib import Path

import structlog
from wcmatch import pathlib

from tagstudio.core.library.alchemy.library import Library
from tagstudio.core.library.alchemy.models import Entry
from tagstudio.core.utils.refresh_dir import GLOBAL_IGNORE_SET
from tagstudio.core.library.ignore import PATH_GLOB_FLAGS, Ignore

logger = structlog.get_logger()

Expand All @@ -25,7 +26,9 @@ def missing_file_entries_count(self) -> int:

def refresh_missing_files(self) -> Iterator[int]:
"""Track the number of entries that point to an invalid filepath."""
assert self.library.library_dir
logger.info("[refresh_missing_files] Refreshing missing files...")

self.missing_file_entries = []
for i, entry in enumerate(self.library.get_entries()):
full_path = self.library.library_dir / entry.path
Expand All @@ -38,16 +41,15 @@ def match_missing_file_entry(self, match_entry: Entry) -> list[Path]:

Works if files were just moved to different subfolders and don't have duplicate names.
"""
matches = []
for path in self.library.library_dir.glob(f"**/{match_entry.path.name}"):
# Ensure matched file isn't in a globally ignored folder
skip: bool = False
for part in path.parts:
if part in GLOBAL_IGNORE_SET:
skip = True
break
if skip:
continue
assert self.library.library_dir
matches: list[Path] = []

ignore_patterns = Ignore.get_patterns(self.library.library_dir)
for path in pathlib.Path(str(self.library.library_dir)).glob(
f"***/{match_entry.path.name}",
flags=PATH_GLOB_FLAGS,
exclude=ignore_patterns,
):
if path.name == match_entry.path.name:
new_path = Path(path).relative_to(self.library.library_dir)
matches.append(new_path)
Expand Down
46 changes: 14 additions & 32 deletions src/tagstudio/core/utils/refresh_dir.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,14 @@
from time import time

import structlog
from wcmatch import pathlib

from tagstudio.core.constants import TS_FOLDER_NAME
from tagstudio.core.library.alchemy.library import Library
from tagstudio.core.library.alchemy.models import Entry
from tagstudio.core.library.ignore import PATH_GLOB_FLAGS, Ignore

logger = structlog.get_logger(__name__)

GLOBAL_IGNORE_SET: set[str] = set(
[
TS_FOLDER_NAME,
"$RECYCLE.BIN",
".Trashes",
".Trash",
"tagstudio_thumbs",
".fseventsd",
".Spotlight-V100",
"System Volume Information",
".DS_Store",
]
)


@dataclass
class RefreshDirTracker:
Expand All @@ -42,7 +29,7 @@ def save_new_files(self):
entries = [
Entry(
path=entry_path,
folder=self.library.folder,
folder=self.library.folder, # pyright: ignore[reportArgumentType]
fields=[],
date_added=dt.now(),
)
Expand All @@ -54,7 +41,7 @@ def save_new_files(self):

yield

def refresh_dir(self, lib_path: Path) -> Iterator[int]:
def refresh_dir(self, library_dir: Path) -> Iterator[int]:
"""Scan a directory for files, and add those relative filenames to internal variables."""
if self.library.library_dir is None:
raise ValueError("No library directory set.")
Expand All @@ -65,13 +52,19 @@ def refresh_dir(self, lib_path: Path) -> Iterator[int]:
self.files_not_in_library = []
dir_file_count = 0

for f in lib_path.glob("**/*"):
ignore_patterns = Ignore.get_patterns(library_dir)
logger.info(ignore_patterns)
for f in pathlib.Path(str(library_dir)).glob(
"***/*", flags=PATH_GLOB_FLAGS, exclude=ignore_patterns
):
end_time_loop = time()
# Yield output every 1/30 of a second
if (end_time_loop - start_time_loop) > 0.034:
yield dir_file_count
start_time_loop = time()

logger.info(f)

# Skip if the file/path is already mapped in the Library
if f in self.library.included_files:
dir_file_count += 1
Expand All @@ -81,21 +74,10 @@ def refresh_dir(self, lib_path: Path) -> Iterator[int]:
if f.is_dir():
continue

# Ensure new file isn't in a globally ignored folder
skip: bool = False
for part in f.parts:
# NOTE: Files starting with "._" are sometimes generated by macOS Finder.
# More info: https://lists.apple.com/archives/applescript-users/2006/Jun/msg00180.html
if part.startswith("._") or part in GLOBAL_IGNORE_SET:
skip = True
break
if skip:
continue

dir_file_count += 1
self.library.included_files.add(f)

relative_path = f.relative_to(lib_path)
relative_path = f.relative_to(library_dir)
# TODO - load these in batch somehow
if not self.library.has_path_entry(relative_path):
self.files_not_in_library.append(relative_path)
Expand All @@ -104,8 +86,8 @@ def refresh_dir(self, lib_path: Path) -> Iterator[int]:
yield dir_file_count
logger.info(
"Directory scan time",
path=lib_path,
path=library_dir,
duration=(end_time_total - start_time_total),
files_not_in_lib=self.files_not_in_library,
files_scanned=dir_file_count,
ignore_patterns=ignore_patterns,
)
Loading