Skip to content

Commit 1e4b0c5

Browse files
committed
feat: use ripgrep for scanning if available
1 parent 8344dfd commit 1e4b0c5

File tree

3 files changed

+113
-10
lines changed

3 files changed

+113
-10
lines changed

src/tagstudio/core/utils/refresh_dir.py

Lines changed: 111 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import shutil
12
from collections.abc import Iterator
23
from dataclasses import dataclass, field
34
from datetime import datetime as dt
@@ -10,6 +11,7 @@
1011
from tagstudio.core.library.alchemy.library import Library
1112
from tagstudio.core.library.alchemy.models import Entry
1213
from tagstudio.core.library.ignore import PATH_GLOB_FLAGS, Ignore
14+
from tagstudio.qt.helpers.silent_popen import silent_run
1315

1416
logger = structlog.get_logger(__name__)
1517

@@ -41,19 +43,120 @@ def save_new_files(self):
4143

4244
yield
4345

44-
def refresh_dir(self, library_dir: Path) -> Iterator[int]:
45-
"""Scan a directory for files, and add those relative filenames to internal variables."""
46+
def refresh_dir(self, library_dir: Path, force_internal_tools: bool = False) -> Iterator[int]:
    """Scan a directory for files, and add those relative filenames to internal variables.

    Args:
        library_dir (Path): The library directory.
        force_internal_tools (bool): Option to force the use of internal tools for scanning
            (i.e. wcmatch) instead of using tools found on the system (i.e. ripgrep).

    Returns:
        Iterator[int]: The iterator produced by whichever scanning backend was selected.

    Raises:
        ValueError: If the library has no directory set.
    """
    if self.library.library_dir is None:
        raise ValueError("No library directory set.")

    patterns = Ignore.get_patterns(library_dir)

    # ripgrep is only consulted when the caller has not opted out of system tools.
    rg_listing: list[str] | None = (
        None if force_internal_tools else self.__get_dir_list(library_dir, patterns)
    )

    # No listing means ripgrep was bypassed or unavailable: fall back to wcmatch.
    if rg_listing is None:
        return self.__wc_add(library_dir, patterns)
    return self.__rg_add(library_dir, rg_listing)
69+
70+
def __get_dir_list(self, library_dir: Path, ignore_patterns: list[str]) -> list[str] | None:
    """Use ripgrep to return a list of matched files.

    The ignore patterns are written to a temporary file inside the library's
    ".TagStudio" folder, handed to ripgrep via "--ignore-file", and removed again
    once ripgrep has finished, even if writing the file or running ripgrep fails.

    Args:
        library_dir (Path): The directory ripgrep is run in.
        ignore_patterns (list[str]): Patterns written, one per line, to the ignore file.

    Returns:
        list[str] | None: One entry per line of ripgrep's output, or `None` if ripgrep
            was not found on the system or could not produce a usable listing.
    """
    rg_path = shutil.which("rg")
    if rg_path is None:
        logger.warning("[Refresh: ripgrep not found on system]")
        return None

    logger.info("[Refresh: Using ripgrep for scanning]")

    compiled_ignore_path = library_dir / ".TagStudio" / ".compiled_ignore"

    try:
        # Write compiled ignore patterns (built-in + user) to a temp file to pass to ripgrep.
        # Explicit UTF-8 keeps non-ASCII patterns intact regardless of the platform locale.
        with open(compiled_ignore_path, "w", encoding="utf-8") as pattern_file:
            pattern_file.write("\n".join(ignore_patterns))

        # Pass the arguments as a list and avoid the shell entirely, so paths containing
        # spaces, quotes or shell metacharacters need no manual quoting and cannot alter
        # the command. The executable resolved by shutil.which is the one that is run.
        result = silent_run(
            [
                rg_path,
                "--files",
                "--follow",
                "--hidden",
                "--ignore-file",
                str(compiled_ignore_path),
            ],
            cwd=library_dir,
            capture_output=True,
            text=True,
            shell=False,
        )
    except OSError as e:
        # The ignore file could not be written or ripgrep could not be launched;
        # returning None lets the caller fall back to the internal scanner.
        logger.error("[Refresh: ripgrep could not be run]", error=str(e))
        return None
    finally:
        # Always remove the temp file, including on failure paths.
        compiled_ignore_path.unlink(missing_ok=True)

    if result.stderr:
        logger.error(result.stderr)

    # ripgrep exits with 0 (files listed) or 1 (nothing listed) on success. Any other
    # status together with empty output means there is no usable listing.
    if result.returncode not in (0, 1) and not result.stdout:
        return None

    return result.stdout.splitlines()  # pyright: ignore [reportReturnType]
111+
112+
def __rg_add(self, library_dir: Path, dir_list: list[str]) -> Iterator[int]:
    """Register the paths listed by ripgrep and yield the running file count.

    Resets `self.files_not_in_library`, adds every new path to
    `self.library.included_files`, and collects paths for which
    `self.library.has_path_entry` is false.

    Args:
        library_dir (Path): The library directory the listing belongs to.
        dir_list (list[str]): Paths to process. They are expected to be relative to
            `library_dir`, as produced by running ripgrep inside that directory.

    Yields:
        int: The number of files counted so far, roughly every 1/30 of a second
            while looping, and once more with the final total.
    """
    start_time_total = time()
    start_time_loop = time()
    dir_file_count = 0
    self.files_not_in_library = []

    for r in dir_list:
        f = pathlib.Path(r)

        end_time_loop = time()
        # Yield output every 1/30 of a second
        if (end_time_loop - start_time_loop) > 0.034:
            yield dir_file_count
            start_time_loop = time()

        # Skip if the file/path is already mapped in the Library
        if f in self.library.included_files:
            dir_file_count += 1
            continue

        # Ignore if the file is a directory. The entry is relative to library_dir, so it
        # must be resolved against it; a bare relative path would be checked against the
        # process's current working directory instead.
        if (library_dir / f).is_dir():
            continue

        dir_file_count += 1
        self.library.included_files.add(f)

        if not self.library.has_path_entry(f):
            self.files_not_in_library.append(f)

    end_time_total = time()
    yield dir_file_count
    logger.info(
        "[Refresh]: Directory scan time",
        path=library_dir,
        duration=(end_time_total - start_time_total),
        files_scanned=dir_file_count,
        tool_used="ripgrep (system)",
    )
151+
152+
def __wc_add(self, library_dir: Path, ignore_patterns: list[str]) -> Iterator[int]:
153+
start_time_total = time()
154+
start_time_loop = time()
53155
dir_file_count = 0
156+
self.files_not_in_library = []
157+
158+
logger.info("[Refresh]: Falling back to wcmatch for scanning")
54159

55-
ignore_patterns = Ignore.get_patterns(library_dir)
56-
logger.info(ignore_patterns)
57160
for f in pathlib.Path(str(library_dir)).glob(
58161
"***/*", flags=PATH_GLOB_FLAGS, exclude=ignore_patterns
59162
):
@@ -76,16 +179,16 @@ def refresh_dir(self, library_dir: Path) -> Iterator[int]:
76179
self.library.included_files.add(f)
77180

78181
relative_path = f.relative_to(library_dir)
79-
# TODO - load these in batch somehow
182+
80183
if not self.library.has_path_entry(relative_path):
81184
self.files_not_in_library.append(relative_path)
82185

83186
end_time_total = time()
84187
yield dir_file_count
85188
logger.info(
86-
"Directory scan time",
189+
"[Refresh]: Directory scan time",
87190
path=library_dir,
88191
duration=(end_time_total - start_time_total),
89192
files_scanned=dir_file_count,
90-
ignore_patterns=ignore_patterns,
193+
tool_used="wcmatch (internal)",
91194
)

src/tagstudio/qt/helpers/silent_popen.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def silent_Popen( # noqa: N802
8686
)
8787

8888

89-
def silent_run( # noqa: N802
89+
def silent_run(
9090
args,
9191
bufsize=-1,
9292
executable=None,

tests/macros/test_refresh_dir.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def test_refresh_new_files(library, exclude_mode):
2020
(library.library_dir / "FOO.MD").touch()
2121

2222
# When
23-
assert len(list(registry.refresh_dir(library.library_dir))) == 1
23+
assert len(list(registry.refresh_dir(library.library_dir, force_internal_tools=True))) == 1
2424

2525
# Then
2626
assert registry.files_not_in_library == [Path("FOO.MD")]

0 commit comments

Comments
 (0)