Skip to content

Commit 53bca7f

Browse files
SeregaCoditSerhii Naumenko
andauthored
Naumenko/clean_annotations (#2)
* feat(commands): initial commit for clean_annotations * chore(deps): update requirements.txt * fix(defaults): fix warnings with config.json while testing * feat(annotations): implement operation for removing orphan annotations * fix(annotations): make a_suffix Tuple type; make a_source argument setter, if a_source is None - a_source = source_directory * docs(annotations): add clean-annotations command description to readme --------- Co-authored-by: Serhii Naumenko <naumenko.s.mail@gmail.com>
1 parent 2d6b6b5 commit 53bca7f

12 files changed

Lines changed: 111 additions & 21 deletions

README.MD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ if you don’t want the command works in a cycle, just don't use "-r" argument.
1414
- **delete** - delete files that match patterns from source directory
1515
- **dedup** - find duplicates in the source directory that match a pattern. An image is considered a duplicate if its hash has a lower
1616
Hamming distance to the compared image's hash than the threshold value. The threshold is set as a percentage and must be in the range [0, 100]. Pay attention to the core_size parameter: a lower value makes details in the photo less important, and a higher value makes details more important when comparing images. Only the dHash comparison method is implemented for now.
17+
- **clean-annotations** - find and remove annotation files in a directory that don't have corresponding image files
1718
## How to use:
1819
clone git repository:
1920

config.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,7 @@
1414
"core_size": 16,
1515
"n_jobs": 20,
1616
"cache_file_path": "./cache",
17-
"cache_name": null
17+
"cache_name": null,
18+
"a_suffix": [".xml"],
19+
"a_source": null
1820
}

const_utils/arguments.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,5 @@ class Arguments:
2828
core_size: str = "--core_size"
2929
n_jobs: str = "--n_jobs"
3030
cache_name: str = "--cache_name"
31+
a_suffix: str = "--a_suffix"
32+
a_source: str = "--a_source"

const_utils/commands.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,5 @@ class Commands:
66
move: str = "move"
77
slice: str = "slice"
88
delete: str = "delete"
9-
dedup: str = "dedup"
9+
dedup: str = "dedup"
10+
clean_annotations: str = "clean-annotations"

const_utils/default_values.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@ class AppSettings(BaseSettings):
1717

1818
model_config = SettingsConfigDict(
1919
env_prefix="APP_",
20-
json_file=Constants.config_file,
21-
extra="ignore"
20+
# json_file=Constants.config_file,
21+
extra="ignore",
22+
validate_assignment=True
2223
)
2324

2425
remove: bool = Field(default=False)
@@ -37,6 +38,8 @@ class AppSettings(BaseSettings):
3738
n_jobs: int = Field(default=2, ge=1, le=multiprocessing.cpu_count())
3839
cache_file_path: Path = Field(default=Path("./cache"))
3940
cache_name: Optional[Path] = Field(default=None)
41+
a_suffix: Tuple[str, ...] = Field(default_factory=tuple)
42+
a_source: Optional[Path] = Field(default=None)
4043

4144
@field_validator('core_size')
4245
@classmethod
@@ -45,13 +48,15 @@ def check_power_of_two(cls, value: int) -> int:
4548
raise ValueError(f"core_size must be a power of 2 (e.g., 8, 16, 32, 64...), got {value}")
4649
return value
4750

48-
@field_validator("log_path", "cache_file_path", "a_source", mode='before')
@classmethod
def ensure_path(cls, value: Union[str, Path, None]) -> Optional[Path]:
    """Convert string input for the path fields into a ``Path``.

    Args:
        value: raw value supplied for ``log_path``, ``cache_file_path`` or
            ``a_source``. ``a_source`` is optional, so ``None`` can arrive here.

    Returns:
        ``Path(value)`` when ``value`` is a string; any other value
        (including ``None``) is returned unchanged.
    """
    return Path(value) if isinstance(value, str) else value
5457

58+
59+
5560
@classmethod
5661
def load_config(cls, config_path: Path = Constants.config_file) -> "AppSettings":
5762

const_utils/parser_help.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,7 @@ class HelpStrings:
2929
)
3030
n_jobs: str = "A count of workers for CPU Bound tasks like a hashmap building"
3131
cache_name: str = ("A cache file name. If you don't set this parameter cache name will be generated automatically "
32-
"with next signature: <cache_{path_hash}_d{folder_name}{hash_type}s{core_size}.pkl>")
32+
"with next signature: <cache_{path_hash}_d{folder_name}{hash_type}s{core_size}.pkl>")
33+
a_suffix: str = "A suffix pattern for annotations"
34+
a_source: str = ("A source directory to annotations. If None - that means annotations are in the same folder with"
35+
" images")

fileManager.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from file_operations.delete import DeleteOperation
1111
from file_operations.move import MoveOperation
1212
from file_operations.slice import SliceOperation
13-
from logger.log_level_mapping import LevelMapping
13+
from file_operations.clean_annotations import CleanAnnotationsOperation
1414

1515

1616
class FileManager:
@@ -22,7 +22,8 @@ def __init__(self):
2222
Commands.move: MoveOperation,
2323
Commands.slice: SliceOperation,
2424
Commands.delete: DeleteOperation,
25-
Commands.dedup: DedupOperation
25+
Commands.dedup: DedupOperation,
26+
Commands.clean_annotations: CleanAnnotationsOperation
2627
}
2728
self.settings = AppSettings.load_config(Constants.config_file)
2829
self._setup_commands()
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import argparse
2+
from pathlib import Path
3+
from typing import Union
4+
5+
from const_utils.arguments import Arguments
6+
from const_utils.default_values import AppSettings
7+
from const_utils.parser_help import HelpStrings
8+
from file_operations.file_operation import FileOperation
9+
from file_operations.file_remover import FileRemoverMixin
10+
11+
12+
13+
class CleanAnnotationsOperation(FileOperation, FileRemoverMixin):
    """Remove orphan annotation files.

    An annotation counts as an orphan when its stem matches the stem of none
    of the files in ``files_for_task``.

    Settings used:
        a_source: directory that is searched for annotations; when it is None
            the operation's ``source_directory`` is used instead.
        a_suffix: patterns used to select annotation files.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Assign through the property so that None falls back to
        # source_directory and plain strings are converted to Path.
        self.a_source = self.settings.a_source

    @staticmethod
    def add_arguments(settings: AppSettings, parser: argparse.ArgumentParser) -> None:
        """Register the annotation suffix and annotation source options on ``parser``.

        Defaults for both options are taken from ``settings``.
        """
        parser.add_argument(
            Arguments.a_suffix,
            nargs="+",
            help=HelpStrings.a_suffix,
            default=settings.a_suffix,
        )
        parser.add_argument(
            Arguments.a_source,
            help=HelpStrings.a_source,
            default=settings.a_source,
        )

    def do_task(self) -> None:
        """Delete every annotation whose stem has no counterpart in ``files_for_task``."""
        # Log the resolved directory: settings.a_source may be None while the
        # property has already fallen back to source_directory.
        self.logger.info(f"Checking for orphan annotations in {self.a_source}")
        annotation_paths = self.get_files(
            source_directory=self.a_source,
            pattern=self.settings.a_suffix,
        )

        image_stems = {image.stem for image in self.files_for_task}

        orphans_removed = 0
        for a_path in annotation_paths:
            if a_path.stem in image_stems:
                continue
            # Count only the files that _remove_file reports as removed.
            if self._remove_file(a_path):
                orphans_removed += 1
                self.logger.info(f"Removed {a_path.stem}")

        self.logger.info(f"Removed {orphans_removed} orphan annotations")

    @property
    def a_source(self) -> Path:
        """Directory that is searched for annotation files."""
        return self._a_source

    @a_source.setter
    def a_source(self, value: Union[Path, str, None]) -> None:
        """Set the annotations directory.

        Args:
            value: a ``Path`` is stored as is, a ``str`` is converted to
                ``Path``, and ``None`` selects ``source_directory``.

        Raises:
            TypeError: if ``value`` is of any other type.
        """
        if isinstance(value, Path):
            self._a_source = value
        elif isinstance(value, str):
            self._a_source = Path(value)
        elif value is None:
            self._a_source = self.source_directory
        else:
            self.logger.error(f"Invalid value for a_source: {value}")
            raise TypeError(f"Invalid value for a_source, can be Union[Path, str, None], got {type(value)}")

file_operations/file_operation.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,17 @@ def __init__(self, settings: AppSettings, **kwargs):
3737
self.logger.info(f"Started with parameters: {kwargs}")
3838

3939

40-
def get_files(self, source_directory: Path, pattern: Union[str, Tuple[str, ...]]) -> Tuple[Path, ...]:
    """Get files from a directory that match a set of patterns.

    Each pattern ``p`` is searched with the glob ``*{p}*``, so it may occur
    anywhere in the file name. A file matching several patterns is returned
    once. The order of the result is not defined.

    Args:
        source_directory: directory whose direct children are searched.
        pattern: patterns to look for. A single string is treated as one
            pattern rather than being iterated character by character.

    Returns:
        A tuple with the matching paths.
    """
    if isinstance(pattern, str):
        pattern = (pattern,)

    files = set()
    for p in pattern:
        files.update(source_directory.glob(f"*{p}*"))

    files_for_task = tuple(files)
    self.logger.debug(f"Total files_for_task: {len(files_for_task)}")
    return files_for_task
5051

5152
def check_source_directory(self) -> None:
5253
"""Check if source directory is valid"""
@@ -67,7 +68,7 @@ def run(self) -> None:
6768
self.check_directories()
6869
while True:
6970
try:
70-
self.get_files()
71+
self.files_for_task = self.get_files(source_directory=self.source_directory, pattern=self.pattern)
7172

7273
if len(self.files_for_task) == 0 and self.repeat:
7374
self.logger.info(f"No files found for task'{self.pattern}'. Wait for {self.sleep} seconds...")

file_operations/file_remover.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@ def _remove_all(self, filepaths: Union[List[Path], Tuple[Path], Path]) -> None:
1919
raise TypeError(f'filepaths should be a list or a tuple or a Path, not {type(filepaths)}')
2020

2121

22-
def _remove_file(self: LoggerProtocol, path: Path) -> None:
23-
"""deletes one received file"""
22+
def _remove_file(self: LoggerProtocol, path: Path) -> bool:
23+
"""deletes one received file. Returns True or False if file was successfully removed or not"""
2424
if not path.is_file():
2525
self.logger.warning(f"{path} is not a file")
2626
try:
2727
path.unlink(missing_ok=True)
2828
self.logger.info(f"{path} removed")
29+
return True
2930
except FileNotFoundError:
3031
self.logger.warning(f"{path} file not exists, skipping")
32+
return False

0 commit comments

Comments
 (0)