Description
Feature Name
Type Checking & Error Reporting
Feature Description
Type checking the `annotations` dataframe passed to `run` would have saved me a lot of time today:
# Minimal reproduction from the report: create a fastdup instance over the image folder ...
fd = fastdup.create(input_dir=my_path)
# ... and run it with an annotations DataFrame. `df` is the reporter's DataFrame, in which
# one column held values that were not valid numbers.
fd.run(
annotations=df,
verbose=True,
)
It turns out that some of the values in my `col_x` column had two decimal points in them, so they were not valid numbers.
Another thing that would have saved me a lot of time is actual error reporting, instead of catching the error and reporting something arbitrary. The above, even with verbose set, just simply returned:
Reading regular file. Please make sure the command works in terminal.
If this error message was even in reference to my mistake (which I don't believe it is), what command am I ensuring works in the terminal? I don't know... The console doesn't know... Does anyone know?
This error is unacceptably ambiguous: to get past it, I had to copy and paste your source code and then delete the part that silently caught the error:
import json
import os
import tempfile
import warnings
from typing import List, Union, Tuple
import numpy as np
import pandas as pd
from pathlib import Path
import fastdup
from pandas.errors import EmptyDataError
import shutil
import fastdup.definitions as FD
#import boto3
from fastdup.sentry import v1_sentry_handler, fastdup_capture_exception, fastdup_capture_log_debug_state
from fastdup.definitions import FOLDER_FULL_IMAGE_RUN
import pathlib
import re
import fastdup
# Module-level fastdup instance. The standalone `run` function defined below reads and
# updates this object directly (fd._work_dir, fd._fastdup_applied, ...).
fd = fastdup.create(input_dir=my_path)
def set_fastdup_kwargs(input_kwargs: dict) -> dict:
    """
    Override the default fastdup arguments with the user's input.

    Keys listed in ``fastdup_params`` are passed through unchanged. Every key of
    ``turi_params`` is always emitted into one comma-separated ``name=value`` string
    stored under ``'turi_param'``, using the user's value when given and the default
    otherwise. Parameters that define a ``map`` have their value translated through it
    (e.g. ``True`` -> ``1``, ``'lsh'`` -> ``2``).

    :param input_kwargs: user keyword arguments to override the defaults with
    :return: dict with the pass-through fastdup arguments plus the 'turi_param' string
    :raises ValueError: if a key is neither a fastdup param nor a turi param
    :raises KeyError: if a mapped turi param receives a value that is not in its map
    """
    fastdup_params = {
        'input_dir', 'work_dir', 'test_dir', 'compute', 'verbose', 'num_threads', 'num_images', 'distance',
        'threshold', 'lower_threshold', 'model_path', 'license', 'version', 'nearest_neighbors_k', 'd', 'run_mode',
        'nn_provider', 'min_offset', 'max_offset', 'nnf_mode', 'nnf_param', 'bounding_box', 'batch_size', 'resume',
        'high_accuracy'
    }
    # 'map' translates the user-facing value to the value written into the turi_param
    # string; None means the value is written as-is.
    turi_params = {
        'nnmodel': {'map': {'brute_force': 0, 'ball_tree': 1, 'lsh': 2}, 'default': 'brute_force'},
        'ccthreshold': {'map': None, 'default': 0.96},
        'run_cc': {'map': {True: 1, False: 0}, 'default': True},
        'run_sentry': {'map': {True: 1, False: 0}, 'default': True},
        'delete_tar': {'map': {True: 1, False: 0}, 'default': False},
        'delete_img': {'map': {True: 1, False: 0}, 'default': False},
        'tar_only': {'map': {True: 1, False: 0}, 'default': False},
        'run_stats': {'map': {True: 1, False: 0}, 'default': True},
        'run_stats_only': {'map': {True: 1, False: 0}, 'default': False},
        'run_advanced_stats': {'map': {True: 1, False: 0}, 'default': False},
        'sync_s3_to_local': {'map': {True: 1, False: 0}, 'default': False},
        'store_int': {'map': {True: 1, False: 0}, 'default': True},
        'shorten_filenames': {'map': {True: 1, False: 0}, 'default': False},
        'save_crops': {'map': {True: 1, False: 0}, 'default': False},
        'augmentation_horiz': {'map': None, 'default': 0.2},
        'augmentation_vert': {'map': None, 'default': 0.2},
        'augmentation_additive_margin': {'map': None, 'default': 0},
        'num_onnx_inter_threads': {'map': None, 'default': 0},
        'num_onnx_intra_threads': {'map': None, 'default': 0},
        'is_clip14_model': {'map': {True: 1, False: 0}, 'default': False},
        #'run_labels': {'map': {True: 1, False: 0}, 'default': True},
        #'run_read_filenames': {'map': {True: 1, False: 0}, 'default': True},
        #'min_file_size': {'map': None, 'default': 0},
        #'read_features_parallel': {'map': None, 'default': 0},
        #'test_correction_offset': {'map': None, 'default': 0},
        #'max_augmentations': {'map': None, 'default': 1},
        #'augmentation_type': {'map': None, 'default': 0},
        #'is_ultraface_model': {'map': {True: 1, False: 0}, 'default': False},
        #'is_yolo_model': {'map': {True: 1, False: 0}, 'default': False},
        'min_input_image_height': {'map': None, 'default': 10},
        'min_input_image_width': {'map': None, 'default': 10},
        'save_thumbnails': {'map': {True: 1, False: 0}, 'default': False},
        'find_regex': {'map': None, 'default': ""},
        'no_sort': {'map': {True: 1, False: 0}, 'default': False},
        'quiet': {'map': {True: 1, False: 0}, 'default': False},
        'fastdup_ocr_lang': {'map': None, 'default': "en"},
        'fastdup_ocr_no_crop': {'map': {True: 1, False: 0}, 'default': False}
    }

    # Reject unknown argument names up front.
    for key in input_kwargs:
        if key not in fastdup_params and key not in turi_params:
            raise ValueError(f'invalid argument {key}, allowed fastdup params are {fastdup_params}, allowed turi_param values are {turi_params}')

    turi_kwargs = []
    for arg_name, param in turi_params.items():
        value = input_kwargs.get(arg_name, param['default'])
        map_dict = param['map']
        if map_dict is not None:
            try:
                value = map_dict[value]
            except KeyError:
                # Keep the KeyError type callers may already rely on, but say which
                # argument was wrong and what it accepts instead of echoing only the value.
                raise KeyError(
                    f"invalid value {value!r} for argument '{arg_name}', "
                    f"allowed values are {list(map_dict)}"
                ) from None
        turi_kwargs.append(f'{arg_name}={value}')

    fastdup_kwargs = {key: value for key, value in input_kwargs.items() if key in fastdup_params}
    fastdup_kwargs['turi_param'] = ','.join(turi_kwargs)
    return fastdup_kwargs
def run(
    input_dir: Union[str, Path] = None,
    annotations: Union[pd.DataFrame, list, dict, str, Path] = None,
    subset: list = None,
    embeddings=None,
    data_type: str = FD.IMG,
    overwrite: bool = False,
    print_summary: bool = True,
    print_vl_datasets_ref: bool = True,
    **fastdup_kwargs
):
    """
    Run fastdup through the module-level ``fd`` instance.

    This function
    1. calculates the subset of images to analyze
    2. runs fastdup
    3. maps images/bboxes to the fastdup index (for bbox this is done in fd._set_fastdup_input)
    4. expands the annotation csv to include files that are not in the annotations but are in the subset
    5. creates a version of the annotations that is grouped by image

    :param input_dir: input directory containing images
    :param annotations: (Optional) annotations dataframe, the expected column convention is:
         - filename: input_dir-relative filenames
         - img_h, img_w (Optional): image height and width
         - bbox_x, bbox_y, bbox_h, bbox_w (Optional): bounding box arguments. Alternatively x1,y1,x2,y2,x3,y3,x4,y4 for rotated bounding box.
         - split (Optional): data split, e.g. train, test, etc ...
        Alternatively, a list of filenames
        Alternatively, the path (str or pathlib.Path) of a .csv file, or of a .json file in coco format containing bounding box annotations
        Alternatively, a dictionary containing coco format annotations
    :param subset: (Optional) subset of images to analyze
    :param embeddings: (Optional) pre-calculated feature embeddings. Data type of np.ndarray of size n x d, n is the number of data points, d is the feature vector length.
        data type must be 'float32'.
    :param data_type: (Optional) data type, one of 'image', 'bbox'
    :param overwrite: (Optional) overwrite existing files
    :param print_summary: print a summary report of the fastdup run results (calls fd.summary())
    :param print_vl_datasets_ref: when true, calls fd.vl_datasets_ref_printout() at the end of the run
    :param fastdup_kwargs: (Optional) fastdup run arguments, see fastdup.run() documentation
    :return: None (after a warning) when fastdup was already applied and overwrite is False, otherwise 0
    """
    fastdup_capture_log_debug_state(locals())

    if fd._fastdup_applied and not overwrite:
        warnings.warn('Fastdup was already applied, use overwrite=True to re-run')
        return

    # Normalize every supported annotations input into a DataFrame with a 'filename' column.
    if annotations is not None:
        if isinstance(annotations, list):
            annotations = pd.DataFrame({'filename': annotations})
        elif isinstance(annotations, dict):
            assert isinstance(fd.input_dir, str), f"When working with COCO annotations need to provide fastdup.create(input_dur=...) with input_dir which is a single assolute path pointing to root folder with all images, got {fd._input_dir}"
            annotations = convert_coco_dict_to_df(annotations, fd._input_dir)
        elif isinstance(annotations, (str, pathlib.Path)):
            if isinstance(annotations, str):
                annotations = shorten_path(annotations)
            else:
                # pathlib.Path has no .endswith(); convert it so the extension checks
                # below work instead of failing with AttributeError.
                annotations = str(annotations)
            assert os.path.isfile(annotations), f"Failed to find annotations file {annotations}"
            # NOTE(review): assumes shorten_path returns a str -- confirm against its definition.
            if annotations.endswith('.csv'):
                annotations = pd.read_csv(annotations)
            elif annotations.endswith('.json'):
                import json
                # Context manager closes the file handle even if parsing fails.
                with open(annotations, 'r') as annotations_file:
                    label = json.load(annotations_file)
                annotations = convert_coco_dict_to_df(label, fd._input_dir)
            else:
                assert False, "Unknown annotation file format, should end with .csv or .json"
        assert isinstance(annotations, pd.DataFrame) and not annotations.empty and "filename" in annotations.columns, f"Got wrong annotation parameter, should be pd.DataFrame with the mandatory columns: filename {annotations}"

    fd._init_run(input_dir, annotations, subset, embeddings, data_type, overwrite, fastdup_kwargs)

    # Pre-calculated embeddings force run_mode and feature dimension, overriding user input.
    if fd._pre_calc_features is not None:
        fastdup_kwargs['run_mode'] = 2
        fastdup_kwargs['d'] = fd._embeddings_dim
    # Validate the user's fastdup kwargs and fill in defaults.
    fastdup_kwargs = set_fastdup_kwargs(fastdup_kwargs)
    if 'run_stats' in fastdup_kwargs and not fastdup_kwargs['run_stats']:
        fd._run_stats = False

    os.makedirs(fd._work_dir, exist_ok=True)
    assert os.path.exists(fd._work_dir), "Failed to create folder " + str(fd._work_dir)

    # On overwrite, remove the features csv left behind by a previous run.
    features_csv = os.path.join(fd._work_dir, 'atrain_features.dat.csv')
    if overwrite and os.path.isfile(features_csv):
        os.unlink(features_csv)

    # run fastdup - create embeddings
    fastdup.run(fd._set_fastdup_input(), work_dir=str(fd._work_dir), **fastdup_kwargs)

    # post process - map fastdup-id to image (for bbox this is done in fd._set_fastdup_input)
    if fd._dtype == FD.IMG or fd._run_mode == FD.MODE_CROP:
        fd._create_img_mapping()

    # expand annotation csv to include files that are not in annotation but is in subset
    fd._expand_annot_df()
    if fd._dtype != FD.BBOX:
        fd._index_annot_df()

    fd._save_artifacts(fastdup_kwargs)
    fd._fastdup_applied = True
    if print_summary:
        fd.summary()
    if print_vl_datasets_ref:
        fd.vl_datasets_ref_printout()
    return 0
# Call the local copy of `run` defined above instead of fd.run(). run_stats,
# run_advanced_stats and verbose are not named parameters of `run`, so they are
# collected into **fastdup_kwargs.
run(
input_dir=fd.input_dir,
annotations=df,
run_stats=True,
run_advanced_stats=True,
overwrite=True,
verbose=True,
)
Finally, after all of this, I noticed my error in the verbose logs. This error was completely my fault, and I probably made very wrong / dumb assumptions. I know for sure that I made a dumb mistake... but I cannot help but feel that this really cool utility library failed me in some ways.
Contact Information [Optional]
No response