[Feature Request]: Type Checking & Error Reporting

### Feature Name

Type Checking & Error Reporting

### Feature Description

Type checking the `annotations` dataframe passed to `run` would have saved me a lot of time today:

```python
# Create the fastdup handle for the image folder.
fd = fastdup.create(input_dir=my_path)

# Run the analysis with the annotations dataframe and verbose logging enabled.
fd.run(annotations=df, verbose=True)
```

It turned out that some of the values in my `col_x` column contained two decimal points, so they were not valid numbers.

Another thing that would have saved me a lot of time is actual error reporting, instead of catching the error and reporting something arbitrary. The call above, even with `verbose` set, simply returned:

```
Reading regular file. Please make sure the command works in terminal.
```

If this error message was even in reference to my mistake (which I don't believe it is), what command am I ensuring works in the terminal? I don't know... The console doesn't know... Does anyone know?

This is a rather unacceptable and ambiguous error: to bypass it, I had to copy and paste your source code and then delete the part that silently caught the error:

```python
import json
import os
import tempfile
import warnings
from typing import List, Union, Tuple
import numpy as np
import pandas as pd
from pathlib import Path
import fastdup
from pandas.errors import EmptyDataError
import shutil
import fastdup.definitions as FD
#import boto3
from fastdup.sentry import v1_sentry_handler, fastdup_capture_exception, fastdup_capture_log_debug_state
from fastdup.definitions import FOLDER_FULL_IMAGE_RUN
import pathlib
import re

import fastdup

# Module-level fastdup handle; the standalone run() defined below reads and
# updates it as a global (it stands in for `self` in the copied method).
fd = fastdup.create(input_dir=my_path)

def set_fastdup_kwargs(input_kwargs: dict) -> dict:
    """
    Merge user-supplied arguments over the fastdup defaults.

    Arguments that fastdup accepts directly are passed through untouched.
    Every turi parameter (the user's value, or its default when absent) is
    encoded as ``name=value`` and the pieces are comma-joined under the
    ``turi_param`` key of the result.

    :param input_kwargs: keyword arguments supplied by the user
    :return: dict holding the pass-through fastdup arguments plus ``turi_param``
    :raises ValueError: if a key is neither a fastdup nor a turi parameter
    :raises KeyError: if a turi parameter with a value map receives a value
        that is not one of the map's keys
    """
    bool_to_int = {True: 1, False: 0}

    def flag(default):
        # Boolean switch, encoded as 1/0 in the turi string.
        return {'map': bool_to_int, 'default': default}

    def raw(default):
        # Value written into the turi string as-is.
        return {'map': None, 'default': default}

    # Arguments forwarded to fastdup unchanged.
    fastdup_params = {
        'input_dir', 'work_dir', 'test_dir', 'compute', 'verbose', 'num_threads', 'num_images', 'distance',
        'threshold', 'lower_threshold', 'model_path', 'license', 'version', 'nearest_neighbors_k', 'd', 'run_mode',
        'nn_provider', 'min_offset', 'max_offset', 'nnf_mode', 'nnf_param', 'bounding_box', 'batch_size', 'resume',
        'high_accuracy'
    }

    # Arguments folded into the turi string; the insertion order below is the
    # order in which they appear in that string.
    turi_params = {
        'nnmodel': {'map': {'brute_force': 0, 'ball_tree': 1, 'lsh': 2}, 'default': 'brute_force'},
        'ccthreshold': raw(0.96),
        'run_cc': flag(True),
        'run_sentry': flag(True),
        'delete_tar': flag(False),
        'delete_img': flag(False),
        'tar_only': flag(False),
        'run_stats': flag(True),
        'run_stats_only': flag(False),
        'run_advanced_stats': flag(False),
        'sync_s3_to_local': flag(False),
        'store_int': flag(True),
        'shorten_filenames': flag(False),
        'save_crops': flag(False),
        'augmentation_horiz': raw(0.2),
        'augmentation_vert': raw(0.2),
        'augmentation_additive_margin': raw(0),
        'num_onnx_inter_threads': raw(0),
        'num_onnx_intra_threads': raw(0),
        'is_clip14_model': flag(False),
        # Currently disabled turi parameters: run_labels, run_read_filenames,
        # min_file_size, read_features_parallel, test_correction_offset,
        # max_augmentations, augmentation_type, is_ultraface_model, is_yolo_model.
        'min_input_image_height': raw(10),
        'min_input_image_width': raw(10),
        'save_thumbnails': flag(False),
        'find_regex': raw(""),
        'no_sort': flag(False),
        'quiet': flag(False),
        'fastdup_ocr_lang': raw("en"),
        'fastdup_ocr_no_crop': flag(False),
    }

    # Reject the first key that belongs to neither table.
    for key in input_kwargs:
        if not (key in fastdup_params or key in turi_params):
            raise ValueError(f'invalid argument {key}, allowed fastdup params are {fastdup_params}, allowed turi_param values are {turi_params}')

    encoded = []
    for arg_name, spec in turi_params.items():
        chosen = input_kwargs.get(arg_name, spec['default'])
        lookup = spec['map']
        if lookup is not None:
            chosen = lookup[chosen]
        encoded.append(f'{arg_name}={chosen}')

    fastdup_kwargs = {}
    for key, value in input_kwargs.items():
        if key in fastdup_params:
            fastdup_kwargs[key] = value
    fastdup_kwargs['turi_param'] = ','.join(encoded)
    return fastdup_kwargs

def run(
        input_dir: Union[str, Path] = None,
        annotations: Union[pd.DataFrame,list] = None,
        subset: list = None,
        embeddings=None,
        data_type: str = FD.IMG,
        overwrite: bool = False,
        print_summary: bool = True,
        print_vl_datasets_ref: bool = True,
        **fastdup_kwargs
    ):
    """
    This function
        1. calculate subset of images to analyze
        2. run fastdup
        3. map images/bboxes to fastdup index (on bbox this is done in fd._set_fastdup_input)
        4. expand annotation csv to include files that are not in annotation but is in subset
        5. create a version of annotation that is grouped by image

    It operates on the module-level ``fd`` handle rather than on ``self``.

    :param input_dir: input directory containing images
    :param annotations: (Optional) annotations dataframe, the expected column convention is:
            - filename: input_dir-relative filenames
            - img_h, img_w (Optional): image height and width
            - bbox_x, bbox_y, bbox_h, bbox_w (Optional): bounding box arguments. Alternatively
              x1,y1,x2,y2,x3,y3,x4,y4 for rotated bounding box (TODO confirm the column names;
              the earlier wording of this docstring listed x1,y2,x2,y2,x3,y3,x4,x4).
            - split (Optional): data split, e.g. train, test, etc ...
            Alternatively, a list of filenames
            Alternatively, the path (str or pathlib.Path) of a .csv file, or of a .json file in
            coco format containing bounding box annotations
            Alternatively, a dictionary containing coco format annotations
    :param subset: (Optional) subset of images to analyze
    :param embeddings: (Optional) pre-calculated feature embeddings. Data type of np.ndarray of size n x d, n is the number of data points, d is the feature vector length.
        data type must be 'float32'.
    :param data_type: (Optional) data type, one of 'image', 'bbox'
    :param overwrite: (Optional) overwrite existing files
    :param print_summary: Print summary report of fastdup run results
    :param print_vl_datasets_ref: when true, call fd.vl_datasets_ref_printout() after the run
    :param fastdup_kwargs: (Optional) fastdup run arguments, see fastdup.run() documentation
    :return: 0 once the run completed; None (after a warning) when fastdup was already
        applied and overwrite is False
    :raises AssertionError: when the annotations argument cannot be turned into a non-empty
        dataframe with a ``filename`` column, or the work dir cannot be created
    """
    fastdup_capture_log_debug_state(locals())

    if fd._fastdup_applied and not overwrite:
        warnings.warn('Fastdup was already applied, use overwrite=True to re-run')
        return

    if annotations is not None:
        if isinstance(annotations, list):
            annotations = pd.DataFrame({'filename': annotations})
        elif isinstance(annotations, dict):
            assert isinstance(fd.input_dir, str), f"When working with COCO annotations need to provide fastdup.create(input_dir=...) with input_dir which is a single absolute path pointing to root folder with all images, got {fd._input_dir}"
            annotations = convert_coco_dict_to_df(annotations, fd._input_dir)
        elif isinstance(annotations, (str, pathlib.Path)):
            if isinstance(annotations, str):
                annotations = shorten_path(annotations)
            assert os.path.isfile(annotations), f"Failed to find annotations file {annotations}"
            # pathlib.Path has no .endswith(); compare on the string form so
            # that Path inputs, which this branch accepts, do not crash.
            annotations_path = str(annotations)
            if annotations_path.endswith('.csv'):
                annotations = pd.read_csv(annotations_path)
            elif annotations_path.endswith('.json'):
                import json
                # Context manager so the file handle is closed deterministically.
                with open(annotations_path, 'r') as annotations_file:
                    label = json.load(annotations_file)
                annotations = convert_coco_dict_to_df(label, fd._input_dir)
            else:
                assert False, "Unknown annotation file format, should end with .csv or .json"

        assert isinstance(annotations, pd.DataFrame) and not annotations.empty and "filename" in annotations.columns, f"Got wrong annotation parameter, should be pd.DataFrame with the mandatory columns: filename {annotations}"

    fd._init_run(input_dir, annotations, subset, embeddings, data_type, overwrite, fastdup_kwargs)

    # Pre-calculated embeddings force run_mode 2 and fix the feature dimension.
    if fd._pre_calc_features is not None:
        fastdup_kwargs['run_mode'] = 2
        fastdup_kwargs['d'] = fd._embeddings_dim

    # set_fastdup_kwargs() folds 'run_stats' into the 'turi_param' string, so
    # the user's choice has to be read BEFORE the conversion; checking the
    # converted dict can never find the key.
    stats_disabled = 'run_stats' in fastdup_kwargs and not fastdup_kwargs['run_stats']
    fastdup_kwargs = set_fastdup_kwargs(fastdup_kwargs)
    if stats_disabled:
        fd._run_stats = False

    os.makedirs(fd._work_dir, exist_ok=True)
    assert os.path.exists(fd._work_dir), "Failed to create folder " + str(fd._work_dir)

    # On overwrite, remove the features csv left in the work dir by an earlier run.
    features_csv = os.path.join(fd._work_dir, 'atrain_features.dat.csv')
    if overwrite and os.path.isfile(features_csv):
        os.unlink(features_csv)

    # run fastdup - create embeddings
    fastdup.run(fd._set_fastdup_input(), work_dir=str(fd._work_dir), **fastdup_kwargs)

    # post process - map fastdup-id to image (for bbox this is done in fd._set_fastdup_input)
    if fd._dtype == FD.IMG or fd._run_mode == FD.MODE_CROP:
        fd._create_img_mapping()

    # expand annotation csv to include files that are not in annotation but is in subset
    fd._expand_annot_df()
    if fd._dtype != FD.BBOX:
        fd._index_annot_df()

    fd._save_artifacts(fastdup_kwargs)
    fd._fastdup_applied = True
    if print_summary:
        fd.summary()
    if print_vl_datasets_ref:
        fd.vl_datasets_ref_printout()

    return 0

# Call the standalone run() defined above on the same dataframe, with stats
# and advanced stats enabled, forcing a re-run and verbose logging.
run(
    input_dir=fd.input_dir, annotations=df,
    run_stats=True, run_advanced_stats=True,
    overwrite=True, verbose=True,
)
```

Finally, after all of this, I noticed my error in the verbose logs. This error was completely my fault, and I probably made very wrong / dumb assumptions. I know for sure that I made a dumb mistake... but I cannot help but feel that this really cool utility library failed me in some ways.

### Contact Information [Optional]

_No response_

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[Feature Request]: Type Checking & Error Reporting #344

Feature Name

Feature Description

Contact Information [Optional]

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

[Feature Request]: Type Checking & Error Reporting #344

Description

Feature Name

Feature Description

Contact Information [Optional]

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions